在过去的两周里,我一直在尝试使用MNIST数据库实现一个带有前馈神经网络的手写数字分类器。 神经网络采用交叉熵损失,输出层采用Softmax,其余节点采用Sigmoid函数激活。你知道吗
我在训练网络时遇到的主要问题是,损失从未收敛,而且非常痉挛。你知道吗
我已经试着重新编写我的脚本,检查我是否正确地实现了数学,但我没有成功。我也读过其他有同样问题的帖子,但没有一个能给我一个我想要的答案。此外,我尝试用一个更简单的神经网络进行检查,但也没有成功。 我在代码中使用的源代码: Backpropagation By Ken-ChenDerivative of Softmax With Cross-EntropyExample to Feed-Forward and Backpropagation
注意我很确定在更新权重时会出现问题,但我无法判断。
注意下面的代码是重新编写的脚本。你知道吗
import numpy as np
import matplotlib.pyplot as plt
import pickle
from mnist import MNIST
class Neural_Re:
def __init__(self, layers):
self.layers = layers
self.weights = []
self.bias_weights = [0 for i in range(len(layers) - 1)]
self.activated_sums = []
self.derivative_sums = []
self.learning_rate = 0.1
self.error = []
# Initialize Random Weights
for i in range(len(layers) - 1):
weights_matrix = np.random.rand(self.layers[i + 1], self.layers[i]).dot(
np.sqrt(2 / (self.layers[i] + self.layers[i + 1])))
self.weights.append(weights_matrix)
def add_bias(self, layer):
# e.g: if layer == 0, layer in bias_weights is 0
layer -= 1
weights_matrix = np.random.rand(self.layers[layer + 1]).dot(
np.sqrt(2 / (self.layers[layer] + self.layers[layer + 1])))
self.bias_weights.insert(layer, weights_matrix)
def set_input(self, inputs):
self.activated_sums.append(inputs)
@staticmethod
def activation_sigmoid(sums, derivative=False):
for i in range(len(sums)):
if sums[i] > 37:
sums[i] = 37
elif sums[i] < -37:
sums[i] = -37
s = 1 / (1 + np.exp(-sums))
if derivative:
return s * (1 - s)
else:
return s
@staticmethod
def activation_softmax(sums):
exp = np.exp(sums)
return exp / exp.sum()
def propagate(self):
for layer in range(len(self.weights)):
# Calculate Sum w*x + b
zl = self.weights[layer].dot(self.activated_sums[layer])
np.add(zl, self.bias_weights[layer], out=zl)
# Saving Sums of (w*x + b) for use in calculating the error of
each node in backprop
self.derivative_sums.append(zl)
if layer == len(self.weights) - 1:
al = self.activation_softmax(zl)
else:
al = self.activation_sigmoid(zl)
self.activated_sums.append(al)
def backprop(self, target_vector):
for layer in range(len(self.weights) - 1, -1, -1):
if layer == len(self.weights) - 1: # If layer is output layer:
# calculate derivative w.r.t Output sum vector (∂J/∂z)
# [J = Loss function , z = Sum vector before activation]
self.derivative_sums[layer] = np.subtract(self.activated_sums[len(self.activated_sums)-1], target_vector)
else:
# Calculate Error of each Node in layer
derivative_sigmoid = self.activation_sigmoid(self.derivative_sums[layer], derivative=True)
sum_errors_in_next_layer = np.sum(self.derivative_sums[layer + 1].dot(self.weights[layer + 1]))
self.derivative_sums[layer] = np.multiply(sum_errors_in_next_layer, derivative_sigmoid)
for layer in range(0, len(self.weights) - 1):
# Stochastic Gradient Descent, Update weights.
self.SGD(layer)
# Calculate Error of model in iteration n
self.calc_J(target_vector)
# Reset Activated_sums, derivative_sums for next iteration.
self.activated_sums = []
self.derivative_sums = []
def SGD(self, layer):
gradient_error = np.multiply(np.outer(self.derivative_sums[layer], self.activated_sums[layer].T), self.learning_rate)
self.weights[layer] = np.subtract(self.weights[layer], gradient_error)
if self.bias_weights[layer] is not 0:
self.bias_weights[layer] = np.subtract(self.bias_weights[layer], np.multiply(self.derivative_sums[layer], self.learning_rate))
def calc_J(self, hot_vector):
x = -np.sum(hot_vector * np.log(self.activated_sums[len(self.activated_sums) - 1]))
print(x)
self.error.append(x)
def graph_loss(self):
x = self.error
y = [i for i in range(len(self.error))]
plt.plot(y, x) # x axis -> Loss, y axis -> epochs
plt.show()
if __name__ == "__main__":
mndata = MNIST('samples', gz=True)
images, labels = mndata.load_training()
NN = Neural_Re([784, 200, 200, 10])
NN.learning_rate = 0.001
iter = 2000
for i in range(iter):
image = np.multiply(mndata.process_images_to_numpy(images[i]), 1 / 256)
# label vector
label = [0 for j in range(10)]
label[labels[i]] = 1
print(i)
NN.set_input(image)
NN.propagate()
NN.backprop(label)
NN.graph_loss()
最初,我希望得到这样的结果(图形信息无关紧要,只针对上下文):
(来源:github.io)
但是,当我让我的神经网络运行时(2000个训练样本):
为什么神经网络不能收敛到任何东西? 任何答案都将不胜感激!你知道吗
编辑1:更改Backprop“计算导数w.r.t”
编辑2:增加了学习率[不是全部问题,仍然不起作用]
编辑3:为激活的和和和导数和数组添加了重置语句[导致了叠加输入的问题,但没有解决它]
目前没有回答
相关问题 更多 >
编程相关推荐