Neural network does not converge to "anywhere"

Posted 2024-04-26 21:39:13


For the past two weeks I have been trying to implement a handwritten-digit classifier using a feed-forward neural network on the MNIST database. The network uses a cross-entropy loss, a Softmax output layer, and Sigmoid activations for the remaining nodes.
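For reference, this is the output-layer identity I am relying on: with a Softmax output and a cross-entropy loss, the derivative of the loss with respect to the pre-activation sums of the output layer simplifies to (output - target). A minimal NumPy sketch of just that identity (standalone, for illustration only):

import numpy as np

def softmax(z):
    e = np.exp(z - z.max())      # shift for numerical stability
    return e / e.sum()

def cross_entropy(a, y):
    # y is a one-hot target vector, a is the softmax output
    return -np.sum(y * np.log(a))

z = np.array([0.2, -1.3, 0.5])   # pre-activation sums of the output layer
y = np.array([0.0, 0.0, 1.0])    # one-hot target
a = softmax(z)
grad_z = a - y                   # dJ/dz for softmax + cross-entropy

This (a - y) term is what my backprop uses for the output layer.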

The main problem I run into when training the network is that the loss never converges and is very erratic.

I have already tried rewriting my script and checking whether I implemented the math correctly, but without success. I have also read other posts with the same problem, but none of them gave me the answer I was looking for. I also tried checking with a simpler neural network, again without success. The sources I used for the code:

Backpropagation By Ken-Chen
Derivative of Softmax With Cross-Entropy
Example to Feed-Forward and Backpropagation

Note: I am pretty sure the problem occurs when updating the weights, but I cannot tell for certain (see the gradient-check sketch after the code).
Note: the code below is the rewritten script.

import numpy as np
import matplotlib.pyplot as plt
import pickle
from mnist import MNIST


class Neural_Re:
    def __init__(self, layers):
        self.layers = layers
        self.weights = []
        self.bias_weights = [0 for i in range(len(layers) - 1)]
        self.activated_sums = []
        self.derivative_sums = []
        self.learning_rate = 0.1

        self.error = []

        # Initialize Random Weights
        for i in range(len(layers) - 1):
            weights_matrix = np.random.rand(self.layers[i + 1], self.layers[i]).dot(
                np.sqrt(2 / (self.layers[i] + self.layers[i + 1])))
            self.weights.append(weights_matrix)

    def add_bias(self, layer):
        # e.g: if layer == 0, layer in bias_weights is 0
        layer -= 1
        weights_matrix = np.random.rand(self.layers[layer + 1]).dot(
            np.sqrt(2 / (self.layers[layer] + self.layers[layer + 1])))
        self.bias_weights.insert(layer, weights_matrix)

    def set_input(self, inputs):
        self.activated_sums.append(inputs)

    @staticmethod
    def activation_sigmoid(sums, derivative=False):
        for i in range(len(sums)):
            if sums[i] > 37:
                sums[i] = 37
            elif sums[i] < -37:
                sums[i] = -37

        s = 1 / (1 + np.exp(-sums))
        if derivative:
            return s * (1 - s)
        else:
            return s

    @staticmethod
    def activation_softmax(sums):
        exp = np.exp(sums)
        return exp / exp.sum()

    def propagate(self):
        for layer in range(len(self.weights)):
            # Calculate Sum w*x + b
            zl = self.weights[layer].dot(self.activated_sums[layer])
            np.add(zl, self.bias_weights[layer], out=zl)
            # Saving Sums of (w*x + b) for use in calculating the error of
            # each node in backprop

            self.derivative_sums.append(zl)

            if layer == len(self.weights) - 1:
                al = self.activation_softmax(zl)
            else:
                al = self.activation_sigmoid(zl)

            self.activated_sums.append(al)

    def backprop(self, target_vector):
        for layer in range(len(self.weights) - 1, -1, -1):
            if layer == len(self.weights) - 1: # If layer is output layer: 
                 # calculate derivative w.r.t Output sum vector (∂J/∂z)
                 # [J = Loss function , z = Sum vector before activation]
                self.derivative_sums[layer] = np.subtract(self.activated_sums[len(self.activated_sums)-1], target_vector)
            else:
                # Calculate Error of each Node in layer
                derivative_sigmoid = self.activation_sigmoid(self.derivative_sums[layer], derivative=True)
                sum_errors_in_next_layer = np.sum(self.derivative_sums[layer + 1].dot(self.weights[layer + 1]))
                self.derivative_sums[layer] = np.multiply(sum_errors_in_next_layer, derivative_sigmoid)

        for layer in range(0, len(self.weights) - 1):
            # Stochastic Gradient Descent, Update weights.
            self.SGD(layer)

        # Calculate Error of model in iteration n
        self.calc_J(target_vector)
        # Reset Activated_sums, derivative_sums for next iteration.
        self.activated_sums = []
        self.derivative_sums = []



    def SGD(self, layer):
        gradient_error = np.multiply(np.outer(self.derivative_sums[layer], self.activated_sums[layer].T), self.learning_rate)
        self.weights[layer] = np.subtract(self.weights[layer], gradient_error)
        if self.bias_weights[layer] is not 0:
            self.bias_weights[layer] = np.subtract(self.bias_weights[layer], np.multiply(self.derivative_sums[layer], self.learning_rate))

    def calc_J(self, hot_vector):
        x = -np.sum(hot_vector * np.log(self.activated_sums[len(self.activated_sums) - 1]))
        print(x)
        self.error.append(x)

    def graph_loss(self):
        x = self.error
        y = [i for i in range(len(self.error))]
        plt.plot(y, x)  # x axis -> epoch, y axis -> loss
        plt.show()

if __name__ == "__main__":
    mndata = MNIST('samples', gz=True)
    images, labels = mndata.load_training()

    NN = Neural_Re([784, 200, 200, 10])
    NN.learning_rate = 0.001
    iter = 2000

    for i in range(iter):
        image = np.multiply(mndata.process_images_to_numpy(images[i]), 1 / 256)
        # label vector
        label = [0 for j in range(10)]
        label[labels[i]] = 1

        print(i)
        NN.set_input(image)
        NN.propagate()
        NN.backprop(label)
    NN.graph_loss()
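Since I suspect the weight update (see the note above), a finite-difference gradient check is one way to isolate it. Below is a minimal standalone sketch of such a check on a single linear layer with Softmax and cross-entropy; the helper names are only for illustration and are not part of my class:

import numpy as np

def numerical_gradient(loss_fn, w, eps=1e-5):
    # Finite-difference estimate of d loss / d w (w is a flat 1-D array).
    grad = np.zeros_like(w)
    for i in range(w.size):
        w_plus, w_minus = w.copy(), w.copy()
        w_plus[i] += eps
        w_minus[i] -= eps
        grad[i] = (loss_fn(w_plus) - loss_fn(w_minus)) / (2 * eps)
    return grad

# Toy setup: one linear layer + Softmax + cross-entropy on one sample.
x = np.array([0.5, -0.2, 0.1])           # input
y = np.array([0.0, 1.0])                 # one-hot target
W = np.random.rand(2, 3) * 0.1           # weights to check

def loss_of(flat_w):
    z = flat_w.reshape(2, 3).dot(x)
    e = np.exp(z - z.max())
    a = e / e.sum()
    return -np.sum(y * np.log(a))

# Analytic gradient: outer(softmax(W x) - y, x), same shape as W.
z = W.dot(x)
e = np.exp(z - z.max())
a = e / e.sum()
analytic = np.outer(a - y, x)

numeric = numerical_gradient(loss_of, W.ravel()).reshape(W.shape)
print(np.max(np.abs(analytic - numeric)))  # should be around 1e-9 or smaller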

Initially, I expected a result like this (the information in the graph does not matter, it is only for context):

Expected: [image of a loss curve; source: github.io]

However, this is what I get when I let my neural network run (2000 training samples):

Result: [image of my loss plot]

Why does the neural network not converge to anything? Any answer would be greatly appreciated!

Edit 1: Changed the "derivative w.r.t." calculation in backprop.

Edit 2: Increased the learning rate [not the whole problem, still does not work].

Edit 3: Added reset statements for the activated_sums and derivative_sums arrays [the stacking inputs were causing a problem, but this did not solve it].
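For context on Edit 3: without the resets at the end of backprop, self.activated_sums and self.derivative_sums keep growing across training samples, so the per-layer indices start pointing at entries from earlier samples. A tiny standalone illustration of the stacking effect (just list behaviour, not my class):

activated_sums = []
for sample in ([1, 2], [3, 4]):
    activated_sums.append(sample)   # what set_input() does every iteration
    # propagate() would then append one entry per layer on top of these
# without a reset between samples, activated_sums[0] still holds the FIRST
# sample while the SECOND one is being trained, so the indexing goes stale
print(activated_sums)               # [[1, 2], [3, 4]]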

