Plotting gradient descent curves with Keras

0 votes
1 answer
31 views
Asked 2025-04-14 17:02

I implemented the following code in Keras, using the California housing dataset, to plot the values of theta 1 and theta 2 and see how the choice of stochastic, batch, or mini-batch gradient descent affects the result:

import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

# Load California housing dataset
california_housing = fetch_california_housing()
X, y = california_housing.data, california_housing.target

# Normalize features
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

def build_model():
    model = Sequential([
        Dense(64, activation="relu", input_shape=(8,)),
        Dense(64, activation="relu"),
        Dense(1),
    ])
    # model.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])
    return model


sgd_optimizer = SGD(learning_rate=0.001)  # Stochastic Gradient Descent
minibatch_sgd_optimizer = SGD(learning_rate=0.001)  # Mini-batch Gradient Descent
batch_sgd_optimizer = SGD(learning_rate=0.001)  # Batch Gradient Descent


# Compile the model
model = build_model()
model.compile(loss='mse', optimizer=sgd_optimizer)

theta1_sgd, theta2_sgd = [], []
theta1_minibatch_sgd, theta2_minibatch_sgd = [], []
theta1_batch_sgd, theta2_batch_sgd = [], []


# Function to perform gradient descent and store theta values
def perform_gradient_descent(optimizer, batch_size=None):
    theta1_list, theta2_list = [], []
    loss_history = []
    for _ in range(5):  # Number of epochs
        history = model.fit(X_normalized, y, epochs=1, batch_size=batch_size, verbose=0)
        loss_history.append(history.history['loss'][0])
        weights = model.layers[0].get_weights()[0].flatten()  # Get current theta values
        theta1_list.append(weights[0])
        theta2_list.append(weights[1])
    print(theta1_list, " ", theta2_list)
    return loss_history, theta1_list, theta2_list

# Perform gradient descent with different optimizers
loss_sgd, theta1_sgd, theta2_sgd = perform_gradient_descent(sgd_optimizer, batch_size=1)  # Stochastic Gradient Descent
loss_minibatch_sgd, theta1_minibatch_sgd, theta2_minibatch_sgd = perform_gradient_descent(minibatch_sgd_optimizer, batch_size=32)  # Mini-batch Gradient Descent
loss_batch_sgd, theta1_batch_sgd, theta2_batch_sgd = perform_gradient_descent(batch_sgd_optimizer, batch_size=len(X_normalized))  # Batch Gradient Descent

# Plotting the loss versus number of epochs
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(loss_sgd) + 1), loss_sgd, label='Stochastic Gradient Descent')
plt.plot(range(1, len(loss_minibatch_sgd) + 1), loss_minibatch_sgd, label='Mini-batch Gradient Descent')
plt.plot(range(1, len(loss_batch_sgd) + 1), loss_batch_sgd, label='Batch Gradient Descent')
plt.xlabel('Number of Epochs')
plt.ylabel('Loss')
plt.title('Loss vs. Number of Epochs')
plt.legend()
plt.grid(True)
plt.show()

# Plotting the gradient descent trajectories
plt.figure(figsize=(10, 6))
plt.plot(theta1_sgd, theta2_sgd, label='Stochastic Gradient Descent', marker='o')
plt.plot(theta1_minibatch_sgd, theta2_minibatch_sgd, label='Mini-batch Gradient Descent', marker='s')
plt.plot(theta1_batch_sgd, theta2_batch_sgd, label='Batch Gradient Descent', marker='x')
plt.xlabel('Theta 1')
plt.ylabel('Theta 2')
plt.title('Gradient Descent Trajectories')
#plt.xlim(-0.08, -0.05)  # Set limit for Theta 1
#plt.ylim(0.02, 0.03)  # Set limit for Theta 2
plt.legend()
plt.grid(True)
plt.show()

However, I ran into a problem: sometimes the lists holding the theta values contain NaN (not a number), and other times they contain normal values. I noticed this happens when the number of training epochs exceeds 10. What is the reason for this?

1 Answer

0

It looks like there is only one model here, and it is being updated by the different optimizers even though it was compiled with only one of them.

You could try creating three models, each registered with its own optimizer, along the lines of the following:

.
.
.

# Compile the models, one per optimiser
model_1 = build_model()
model_2 = build_model()
model_3 = build_model()

model_1.compile(loss='mse', optimizer=sgd_optimizer)
model_2.compile(loss='mse', optimizer=minibatch_sgd_optimizer)
model_3.compile(loss='mse', optimizer=batch_sgd_optimizer)

theta1_sgd, theta2_sgd = [], []
theta1_minibatch_sgd, theta2_minibatch_sgd = [], []
theta1_batch_sgd, theta2_batch_sgd = [], []


# Function to perform gradient descent and store theta values
# Supply the model and its optimiser
def perform_gradient_descent(model, optimizer, batch_size=None): #<- now accepts "model" as well
    ...  # body is exactly the same as before, now training the "model" that is passed in

# Perform gradient descent with models and their optimizers

# Stochastic Gradient Descent on model_1
loss_sgd, theta1_sgd, theta2_sgd = perform_gradient_descent(
    model_1, sgd_optimizer, batch_size=1
)

# Mini-batch Gradient Descent on model_2
loss_minibatch_sgd, theta1_minibatch_sgd, theta2_minibatch_sgd = perform_gradient_descent(
    model_2, minibatch_sgd_optimizer, batch_size=32
)

# Batch Gradient Descent on model_3
loss_batch_sgd, theta1_batch_sgd, theta2_batch_sgd = perform_gradient_descent(
    model_3, batch_sgd_optimizer, batch_size=len(X_normalized)
)

.
.
.
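For completeness, a minimal sketch of what the adapted function could look like. It reuses the body from the question unchanged, except that it trains and reads the weights of whichever model is passed in (the optimizer argument is kept only to mirror the original signature):

# Sketch of the adapted function: same body as in the question,
# but it trains and inspects the model that is passed in.
def perform_gradient_descent(model, optimizer, batch_size=None):
    theta1_list, theta2_list = [], []
    loss_history = []
    for _ in range(5):  # Number of epochs
        history = model.fit(X_normalized, y, epochs=1, batch_size=batch_size, verbose=0)
        loss_history.append(history.history['loss'][0])
        weights = model.layers[0].get_weights()[0].flatten()  # Get current theta values
        theta1_list.append(weights[0])
        theta2_list.append(weights[1])
    return loss_history, theta1_list, theta2_list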

Because of random initialization, the three models will start from different points. You can control this with a little extra code by setting a random seed before each model is created, as in the sketch below.
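A minimal sketch of that seeding step, assuming a TensorFlow backend; the seed value 42 is arbitrary and the helper name build_seeded_model is just for illustration:

import numpy as np
import tensorflow as tf

SEED = 42  # arbitrary fixed seed

def build_seeded_model():
    # Reset the NumPy and TensorFlow seeds before building, so that
    # every model is created with identical initial weights.
    np.random.seed(SEED)
    tf.random.set_seed(SEED)
    return build_model()

model_1 = build_seeded_model()
model_2 = build_seeded_model()
model_3 = build_seeded_model()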
