Plotting gradient descent curves with Keras
I implemented the following code in Keras on the California housing dataset. I am trying to plot the values of theta 1 and theta 2 and how the choice of stochastic, mini-batch, or batch gradient descent affects the result:
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
# Load California housing dataset
california_housing = fetch_california_housing()
X, y = california_housing.data, california_housing.target
# Normalize features
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)
def build_model():
    model = Sequential([
        Dense(64, activation="relu", input_shape=(8,)),
        Dense(64, activation="relu"),
        Dense(1)
    ])
    # model.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])
    return model
sgd_optimizer = SGD(learning_rate=0.001)  # Stochastic Gradient Descent
minibatch_sgd_optimizer = SGD(learning_rate=0.001)  # Mini-batch Gradient Descent
batch_sgd_optimizer = SGD(learning_rate=0.001)  # Batch Gradient Descent
# Compile the model
model = build_model()
model.compile(loss='mse', optimizer=sgd_optimizer)
theta1_sgd, theta2_sgd = [], []
theta1_minibatch_sgd, theta2_minibatch_sgd = [], []
theta1_batch_sgd, theta2_batch_sgd = [], []
# Function to perform gradient descent and store theta values
def perform_gradient_descent(optimizer, batch_size=None):
    theta1_list, theta2_list = [], []
    loss_history = []
    for _ in range(5):  # Number of epochs
        history = model.fit(X_normalized, y, epochs=1, batch_size=batch_size, verbose=0)
        loss_history.append(history.history['loss'][0])
        weights = model.layers[0].get_weights()[0].flatten()  # Get current theta values
        theta1_list.append(weights[0])
        theta2_list.append(weights[1])
        print(theta1_list, " ", theta2_list)
    return loss_history, theta1_list, theta2_list
# Perform gradient descent with different optimizers
loss_sgd, theta1_sgd, theta2_sgd = perform_gradient_descent(sgd_optimizer, batch_size=1) # Stochastic Gradient Descent
loss_minibatch_sgd, theta1_minibatch_sgd, theta2_minibatch_sgd = perform_gradient_descent(minibatch_sgd_optimizer, batch_size=32) # Mini-batch Gradient Descent
loss_batch_sgd, theta1_batch_sgd, theta2_batch_sgd = perform_gradient_descent(batch_sgd_optimizer, batch_size=len(X_normalized)) # Batch Gradient Descent
# Plotting the loss versus number of epochs
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(loss_sgd) + 1), loss_sgd, label='Stochastic Gradient Descent')
plt.plot(range(1, len(loss_minibatch_sgd) + 1), loss_minibatch_sgd, label='Mini-batch Gradient Descent')
plt.plot(range(1, len(loss_batch_sgd) + 1), loss_batch_sgd, label='Batch Gradient Descent')
plt.xlabel('Number of Epochs')
plt.ylabel('Loss')
plt.title('Loss vs. Number of Epochs')
plt.legend()
plt.grid(True)
plt.show()
# Plotting the gradient descent trajectories
plt.figure(figsize=(10, 6))
plt.plot(theta1_sgd, theta2_sgd, label='Stochastic Gradient Descent', marker='o')
plt.plot(theta1_minibatch_sgd, theta2_minibatch_sgd, label='Mini-batch Gradient Descent', marker='s')
plt.plot(theta1_batch_sgd, theta2_batch_sgd, label='Batch Gradient Descent', marker='x')
plt.xlabel('Theta 1')
plt.ylabel('Theta 2')
plt.title('Gradient Descent Trajectories')
#plt.xlim(-0.08, -0.05) # Set limit for Theta 1
#plt.ylim(0.02, 0.03) # Set limit for Theta 2
plt.legend()
plt.grid(True)
plt.show()
However, I have run into a problem: sometimes the lists holding the theta values contain NaN (not a number), while other times they hold normal values. I have noticed this happens when the number of training epochs exceeds 10. What is causing this?
1 Answer
It looks like there is only one model here, and it keeps getting updated across the runs with the different optimizers, even though it was compiled with only one of them.
You could try creating three models, each registered with its own optimizer, along these lines:
.
.
.
# Compile the models, one per optimiser
model_1 = build_model()
model_2 = build_model()
model_3 = build_model()
model_1.compile(loss='mse', optimizer=sgd_optimizer)
model_2.compile(loss='mse', optimizer=minibatch_sgd_optimizer)
model_3.compile(loss='mse', optimizer=batch_sgd_optimizer)
theta1_sgd, theta2_sgd = [], []
theta1_minibatch_sgd, theta2_minibatch_sgd = [], []
theta1_batch_sgd, theta2_batch_sgd = [], []
# Function to perform gradient descent and store theta values
# Supply the model and its optimiser
def perform_gradient_descent(model, optimizer, batch_size=None):  # <- now accepts "model" as well
    ...  # code is exactly the same as before
# Perform gradient descent with models and their optimizers
# Stochastic Gradient Descent on model_1
loss_sgd, theta1_sgd, theta2_sgd = perform_gradient_descent(
    model_1, sgd_optimizer, batch_size=1
)
# Mini-batch Gradient Descent on model_2
loss_minibatch_sgd, theta1_minibatch_sgd, theta2_minibatch_sgd = perform_gradient_descent(
    model_2, minibatch_sgd_optimizer, batch_size=32
)
# Batch Gradient Descent on model_3
loss_batch_sgd, theta1_batch_sgd, theta2_batch_sgd = perform_gradient_descent(
    model_3, batch_sgd_optimizer, batch_size=len(X_normalized)
)
.
.
.
Because of random initialization, the three models will start from different points. You can control this with a little extra code by setting the random seed before each model is created.
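For example, a minimal sketch of that seeding step, assuming a reasonably recent Keras/TensorFlow where keras.utils.set_random_seed is available (on older versions you can call np.random.seed and tf.random.set_seed directly); the seed value 42 is arbitrary:
from keras.utils import set_random_seed

SEED = 42  # arbitrary fixed seed; any value works

# Re-seed Python, NumPy and the backend RNG before building each model,
# so all three models start from identical initial weights
set_random_seed(SEED)
model_1 = build_model()

set_random_seed(SEED)
model_2 = build_model()

set_random_seed(SEED)
model_3 = build_model()
With identical starting weights, any differences between the three plotted trajectories then come from the training procedure rather than from the initialization.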