Python backpropagation: gradients get smaller and smaller as the batch size increases


I am training a neural network with the following dimensions: 784 (input layer), 45 (hidden layer), 16 (output layer).

It is trained with backpropagation (stochastic gradient descent) to classify digits and a few math symbols (0-9, +, -, *, /, [, ]).

While choosing a mini-batch size I ran some tests and noticed the following problems:

1. With a mini-batch size of 20 data points the backpropagation algorithm "seems" to work, but even after training for 50+ epochs the accuracy only fluctuates and gets worse (Figure 1).

[Figure 1: Epoch vs. Accuracy (accuracy of 1 = 100%)]

2. With a mini-batch size of 2000 data points, the gradients with respect to the weights become so small that the update barely changes the actual weights (Figure 2).

[Figure 2]
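A rough way to quantify "barely changes" is to compare the size of the applied update with the size of the weights themselves. The sketch below is only a diagnostic, not part of my trainer; it reuses the attribute names eta, gradientToWeights and network.weights from the class shown further down, and would be called right after batchBackPropagation(...):

import numpy as np

def relative_update_size(trainer, layer):
    # The step that update() actually applies: sum of the per-data-point gradients, scaled by eta
    step = trainer.eta * np.sum(trainer.gradientToWeights[layer], axis=0)
    # Compare it to the magnitude of that layer's weight matrix (Frobenius norms)
    return np.linalg.norm(step) / np.linalg.norm(trainer.network.weights[layer])

# e.g. relative_update_size(trainer, -1) for the output layer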

Below is the relevant code of the class I use to train the neural network object. Not everything is shown, but the names should be self-explanatory.

Some relevant data:

  1. The training data set has ~200k data points (tuples of a 28x28 numpy array and the corresponding symbol).
  2. The validation data set has ~50k data points.
  3. The algorithm uses the mean squared error as cost function.
  4. The backpropagation algorithm I use is given by the formulas shown below:

[Image: backpropagation algorithm equations]
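In case the image does not load: assuming sigmoid activations and the MSE cost (which is what the code below implements), these should be the standard per-layer equations, with $\odot$ the element-wise product:

$$\delta^{L} = (a^{L} - y) \odot a^{L} \odot (1 - a^{L})$$
$$\delta^{l} = \left((W^{l+1})^{\top}\,\delta^{l+1}\right) \odot a^{l} \odot (1 - a^{l})$$
$$\frac{\partial C}{\partial W^{l}} = \delta^{l}\,(a^{l-1})^{\top}, \qquad \frac{\partial C}{\partial b^{l}} = \delta^{l}$$

(In the code, $a - a^2$ stands for the sigmoid derivative $a \odot (1-a)$, and $\delta^{l}$ corresponds to gradientToBias[l].)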

Important note: to make the backpropagation calculations more efficient, I perform them as batched tensor operations, where each gradient is a tensor whose first axis corresponds to the index of the data point within the mini-batch and the remaining axes act as ordinary matrices/vectors. More on this in a previous question of mine: Python: numpy.dot / numpy.tensordot for multidimensional arrays

Example (mini-batch size: 20)

Activations in the last layer: for one data point: (16x1); for the batched backpropagation: (20x16x1)

Gradient of the last layer with respect to the weights: for one data point: (16x45); for the batched backpropagation: (20x16x45)
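To illustrate those shapes, here is a minimal standalone sketch with random numbers (not part of my trainer class, which follows right after):

import numpy as np

batch, n_out, n_hidden = 20, 16, 45
delta_last = np.random.rand(batch, n_out, 1)     # per-data-point gradient w.r.t. the last-layer bias
a_hidden   = np.random.rand(batch, n_hidden, 1)  # per-data-point activations of the hidden layer
W_last     = np.random.rand(n_out, n_hidden)     # weights between hidden and output layer

# Gradient w.r.t. the last-layer weights: one (16x45) matrix per data point
grad_W = np.einsum('ijk,ilm->ijl', delta_last, a_hidden)
print(grad_W.shape)  # (20, 16, 45)

# Propagating delta one layer back (before multiplying by the activation derivative)
delta_hidden = np.tensordot(delta_last, W_last, axes=((1,), (0,))).transpose(0, 2, 1)
print(delta_hidden.shape)  # (20, 45, 1)

The full trainer class: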

import numpy as np
import random as rd
import time
import matplotlib.pyplot as plt

class NeuralNetworkTrainer:
  def __init__(self, neuralNetwork, validator):
    # Neural network object: holds weights and biases as lists of numpy arrays, one per layer
    # (first element 'None' to keep the indices consistent), plus activations and layer sizes
    self.network = neuralNetwork
    self.eta = 0
    # Another class loads the data set here: tuples of the input (2D numpy array of the image)
    # and the output (the corresponding symbol)
    self.dataSet = []

    self.initializeWeightsBias()
    self.validator = validator # validator object
    self.validationAccuracy = [] # list of accuracies per epoch
    
  def initializeWeightsBias(self): #gradients initialization
    self.gradientToBias = [None]*len(self.network.layers)
    self.gradientToWeights = [None]*len(self.network.layers)

  def train(self,epochs,miniBatchSize,eta): #train algorithm
    self.eta = eta
    for i in range(0,epochs):
      self.shuffleData()
      for j in range(0,len(self.dataSet)//miniBatchSize):
        self.batchBackPropagation(self.createMiniBatch(miniBatchSize,j))
        self.update()

      correctOutputs, dataSetLength = self.validator.validate()
      self.validationAccuracy.append(round(correctOutputs/dataSetLength,4))
    
    return self.network

# ***************************
# BACKPROPAGATION ALGORITHM

  def batchBackPropagation(self,inputOutputBatch):
    self.initializeWeightsBias()

    activations = [None]*len(self.network.activations)
    for i in range(0,len(activations)): # Initialize the activation tensors, one per layer
      activations[i] = np.empty((len(inputOutputBatch),
                                 self.network.activations[i].shape[0],
                                 self.network.activations[i].shape[1]))

    # Output tensor: one correctly formatted output vector per data point
    # (a vector of 0's with a 1 at the index of the corresponding symbol)
    output = np.empty((len(inputOutputBatch),
                       self.network.activations[-1].shape[0],
                       self.network.activations[-1].shape[1]))
    for i in range(0,len(inputOutputBatch)):
      inputVector, outputVector = self.vectorizeInputOuput(inputOutputBatch[i])
      self.network.loadInput(inputVector)
      self.network.activate() #feedforward of input through the network with current weights/bias
      output[i] = outputVector
      for l in range(1,len(activations)): #creation of activation tensor as explained before
        activations[l][i] = self.network.activations[l]
    
    # Gradient w.r.t. the last-layer bias for the whole mini-batch as one 3D-tensor
    # operation (see algorithm image): delta = (a - y) * a * (1 - a)
    self.gradientToBias[-1] = (activations[-1]-output)*(activations[-1]-np.square(activations[-1]))
    # Gradient w.r.t. the bias of the remaining layers, again as 3D-tensor operations where
    # the first axis is the index of the data point within the mini-batch
    for i in range(2,len(self.network.layers)):
      self.gradientToBias[-i] = np.tensordot(self.gradientToBias[-i+1],self.network.weights[-i+1],axes=((1),(0))).transpose(0,2,1)*(activations[-i]-np.square(activations[-i]))
    # Analogous 3D-tensor calculation of gradientToWeights: one (layer x previous layer)
    # matrix per data point in the mini-batch, for every layer
    for i in range(1,len(self.network.layers)):
      self.gradientToWeights[i] = np.einsum('ijk,ilm->ijl',self.gradientToBias[i],activations[i-1])
    return self.network

# *****************************

  def update(self): # reduce the per-data-point gradients to one final gradient per parameter by summing over axis=0
    for i in range(1,len(self.network.layers)):
      self.network.weights[i] -= self.eta*np.sum(self.gradientToWeights[i],axis =0)
      self.network.bias[i] -= self.eta*np.sum(self.gradientToBias[i], axis = 0)
    return self.network

  def shuffleData(self): #self explanatory
    rd.shuffle(self.dataSet)
    return self.network 

  def createMiniBatch(self, miniBatchSize, index): #self explanatory
    return self.dataSet[index*miniBatchSize:(index+1)*miniBatchSize] 

  def mapOutputToVector(self,output): #self explanatory
      outputVector = np.zeros((len(self.network.outputMap),1))
      outputVector[self.network.outputMap.index(output)] = 1
      return outputVector

  def vectorizeInputOuput(self,inputOutputData): #selfexplanatory
    return inputOutputData.input.flatten().reshape((-1,1)), self.mapOutputToVector(inputOutputData.output)
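For reference, this is what the one-hot output vectors produced by mapOutputToVector look like. This is a standalone sketch; the exact contents and ordering of outputMap are an assumption (the real one lives in the network object), only the 16 symbols come from the description above:

import numpy as np

# Assumed content of network.outputMap: the ten digits plus the six math symbols
outputMap = ['0','1','2','3','4','5','6','7','8','9','+','-','*','/','[',']']

def map_output_to_vector(output):
    vec = np.zeros((len(outputMap), 1))  # (16x1) column vector of zeros
    vec[outputMap.index(output)] = 1     # 1 at the index of the symbol
    return vec

print(map_output_to_vector('+').T)  # 1 at index 10, 0 elsewhere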
  

Thanks a lot for any help.


1 Answer

First of all, a mini-batch size that is too large usually leads to lower accuracy.

The problem you are facing in the first plot is overfitting, so you should reduce the number of epochs.

For the second plot: you have 200,000 samples and a batch size of 2000, so one epoch contains 200000/2000 = 100 steps, which is a small number of gradient steps per epoch.

In general you have to pick suitable numbers for the number of epochs and the batch size to get the best results, for example around 1000 steps per epoch, and not too many epochs, to avoid overfitting.
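As a rough back-of-the-envelope check, assuming the ~200k training samples mentioned in the question:

n_samples = 200_000
for batch_size in (20, 200, 2000):
    print(batch_size, n_samples // batch_size)
# 20   -> 10000 steps per epoch
# 200  ->  1000 steps per epoch (roughly the ~1000 steps suggested above)
# 2000 ->   100 steps per epoch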
