Trying to write my own neural network in Python
Last semester I took an online machine learning class from Stanford taught by Professor Andrew Ng (http://www.ml-class.org/course/auth/welcome), and I found it very useful. To understand neural networks better, I tried to write my own in Python. Here is the code:
import numpy

class NN:
    def __init__(self, sl):
        #sl = number of units (not counting bias unit) in layer l
        self.sl = sl
        self.layers = len(sl)
        #Create weights
        self.weights = []
        for idx in range(1, self.layers):
            self.weights.append(numpy.matrix(numpy.random.rand(self.sl[idx-1]+1, self.sl[idx])/5))
        self.cost = []

    def update(self, input):
        if input.shape[1] != self.sl[0]:
            raise ValueError('The first layer must have a node for every feature')
        self.z = []
        self.a = []
        #Input activations. I'm expecting inputs as numpy matrix (Examples x Features)
        self.a.append(numpy.hstack((numpy.ones((input.shape[0], 1)), input))) #Set inputs ai + bias unit
        #Hidden activations
        for weight in self.weights:
            self.z.append(self.a[-1]*weight)
            self.a.append(numpy.hstack((numpy.ones((self.z[-1].shape[0], 1)), numpy.tanh(self.z[-1])))) #tanh is a fancy sigmoid
        #Output activation
        self.a[-1] = self.z[-1] #Not logistic regression, thus no sigmoid function
        del self.z[-1]

    def backPropagate(self, targets, lamda):
        m = float(targets.shape[0]) #m is number of examples
        #Calculate cost
        Cost = -1/m*sum(numpy.power(self.a[-1] - targets, 2))
        for weight in self.weights:
            Cost = Cost + lamda/(2*m)*numpy.power(weight[1:, :], 2).sum()
        self.cost.append(abs(float(Cost)))
        #Calculate error for each layer
        delta = []
        delta.append(self.a[-1] - targets)
        for idx in range(1, self.layers-1): #No delta for the input layer because it is the input
            weight = self.weights[-idx][1:, :] #Ignore bias unit
            dsigmoid = numpy.multiply(self.a[-(idx+1)][:, 1:], 1-self.a[-(idx+1)][:, 1:]) #dsigmoid is a(l).*(1-a(l))
            delta.append(numpy.multiply(delta[-1]*weight.T, dsigmoid)) #Ignore regularization
        Delta = []
        for idx in range(self.layers-1):
            Delta.append(self.a[idx].T*delta[-(idx+1)])
        self.weight_gradient = []
        for idx in range(len(Delta)):
            self.weight_gradient.append(numpy.nan_to_num(1/m*Delta[idx] + numpy.vstack((numpy.zeros((1, self.weights[idx].shape[1])), lamda/m*self.weights[idx][1:, :]))))

    def train(self, input, targets, alpha, lamda, iterations = 1000):
        #alpha: learning rate
        #lamda: regularization term
        for i in range(iterations):
            self.update(input)
            self.backPropagate(targets, lamda)
            self.weights = [self.weights[idx] - alpha*self.weight_gradient[idx] for idx in range(len(self.weights))]

    def predict(self, input):
        self.update(input)
        return self.a[-1]
But it does not work =(. Looking at cost vs. iteration, I can see a slight blip in the cost, and the predictions for A are all identical. Can someone help me understand why my neural network is not converging?
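For context, this is roughly the kind of random-data test I was running (a reconstructed sketch - the exact snippet isn't shown above, so the shapes and the names A and X here are stand-ins):

import numpy
#Hypothetical reconstruction: random features A and a random linear map X
A = numpy.matrix(numpy.random.rand(100, 5)) #100 examples x 5 features
X = numpy.matrix(numpy.random.rand(5, 1)) #random "true" weights
Targets = A*X #linear targets
n = NN([5, 4, 1]) #5 inputs, one hidden layer, 1 output
n.train(A, Targets, 0.001, 0.0, 1000)
print(n.predict(A)) #symptom: every row of the prediction comes out (nearly) the same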
Thanks, and sorry for the amount of code (maybe someone will find it useful).
Update:
Instead of random data, I've gotten some structured data from the UCI Machine Learning Repository. The dataset is the burned area of forest fires in the northeast region of Portugal, using meteorological and other data: http://archive.ics.uci.edu/ml/datasets/Forest+Fires I modified the data so that days and months are numbers: https://docs.google.com/spreadsheet/ccc?key=0Am3oTptaLsExdC1PeXl1eTczRnRNejl3QUo5RjNLVVE
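For reference, the day/month conversion was along these lines (a minimal sketch; I'm assuming the raw forestfires.csv from UCI as input, and FF-data.csv is the converted output used below):

#Hypothetical preprocessing sketch: encode the month/day strings as integers
months = {m: i+1 for i, m in enumerate(['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'])}
days = {d: i+1 for i, d in enumerate(['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'])}
with open('forestfires.csv') as fin, open('FF-data.csv', 'w') as fout:
    fout.write(next(fin)) #keep the header row (loadtxt skips it)
    for line in fin:
        row = line.strip().split(',')
        row[2] = str(months[row[2]]) #month is the 3rd column in the UCI file
        row[3] = str(days[row[3]]) #day is the 4th column
        fout.write(','.join(row) + '\n')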
data = numpy.loadtxt(open('FF-data.csv', 'rb'), delimiter = ',', skiprows = 1)
features = data[:,0:11]
targets = numpy.matrix(data[:,12]).T
nfeatures = (features-features.mean(axis=0))/features.std(axis=0)
n = NN([11, 10, 1]) #The class takes the list of how many nodes in each layer
n.train(nfeatures, targets, 0.003, 0.0)
import matplotlib.pyplot
matplotlib.pyplot.subplot(221)
matplotlib.pyplot.plot(n.cost)
matplotlib.pyplot.title('Cost vs. Iteration')
matplotlib.pyplot.subplot(222)
matplotlib.pyplot.scatter(n.predict(nfeatures), targets)
matplotlib.pyplot.title('Data vs. Predicted')
matplotlib.pyplot.savefig('Report.png', format = 'png')
matplotlib.pyplot.close()
Why does the cost plateau around 4000, and why does Data vs. Predicted not show any trend? You can see the graphs here: https://docs.google.com/open?id=0B23oTptaLsExMTQ0OTAxNWEtYjE2NS00MjA5LTg1MjMtNDBhYjVmMTFhZDhm
3 Answers
I think that your biases should be subtracted somewhere from the weighted inputs (or set to -1). From what I see in your code, the neurons add up all the inputs, including the bias (which is set to +1).
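A minimal sketch of what I mean, pointing at the first hstack line in update() (whether this alone fixes convergence is untested):

#Current code: the bias column is +1 for every layer
self.a.append(numpy.hstack((numpy.ones((input.shape[0], 1)), input)))
#Setting the bias unit to -1 instead would look like this
self.a.append(numpy.hstack((-numpy.ones((input.shape[0], 1)), input)))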
(Sorry, I don't have enough reputation to add comments, so I'll just keep posting answers instead.)
Yes, that does sound odd. If, however, after training you generate a new matrix B:
B = numpy.random.rand(5, 4)/5
Targets = B*X #X is the weight matrix from earlier (not shown here)
print(n.predict(B))
print(B*X)
it will usually work fine (most of the time - sometimes it will still give the average(Targets) as the answer). Note: in my example I reduced the number of features from 100 to 4.
Also, I don't think that running 5000 iterations on 50 elements of the data set will do you any good. You should generally try to use as much training data as you can - and here you can use as much as you want, yet you use even fewer examples than you have features.
This is fun, I'll think about it some more :) I tried your network on a simpler example - as input I provided two numbers and expected their sum as output. It worked more or less okay.
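Roughly what that test looked like (a sketch from memory - the sizes, learning rate, and iteration count here are stand-ins):

import numpy
#Two random inputs per example; the target is their sum
inputs = numpy.matrix(numpy.random.rand(200, 2))
targets = inputs.sum(axis=1) #(200 x 1) column of row sums
n = NN([2, 5, 1]) #2 inputs, one small hidden layer, 1 output
n.train(inputs, targets, 0.01, 0.0, 2000)
print(n.predict(inputs[:5])) #should be close to the first five sums
print(targets[:5])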
The neural network was unable to train on the forest fires data for a few reasons.
First, numpy.tanh() was not behaving as expected - most likely because backPropagate() computes the activation derivative as a(l).*(1-a(l)), which is the logistic-sigmoid derivative, not tanh's 1-a(l).^2. The code had to be changed from:
self.a.append(numpy.hstack((numpy.ones((self.z[-1].shape[0], 1)),numpy.tanh(self.z[-1])))) #tanh is a fancy sigmoid
to:
self.a.append(numpy.hstack((numpy.ones((self.z[-1].shape[0], 1)), 1/(1+numpy.exp(-self.z[-1])))))
Second, numpy and matplotlib were not playing nicely together. The numpy matrices seem to be plotted in reverse. This can be fixed by using matrix.tolist(). The code had to be changed from:
matplotlib.pyplot.scatter(n.predict(nfeatures), targets)
to:
matplotlib.pyplot.scatter(n.predict(nfeatures).tolist(), targets.tolist())
Lastly, the number of nodes should be approximately 10% of the example size. Instead of 10 nodes, 50 nodes worked better here.
The working neural network code is below, with a new function autoparam that tries to find the best learning rate and regularization constant. You can see the graphs of cost vs. iteration and data vs. predicted for the forest fires data here: https://docs.google.com/open?id=0B23oTptaLsExMWQ4ZWM1ODYtZDMzMC00M2VkLWI1OWUtYzg3NzgxNWYyMTIy
Thanks for reading! I hope my neural network can help people.
import numpy

class NN:
    def __init__(self, sl):
        #sl = number of units (not counting bias unit) in layer l
        self.sl = sl
        self.layers = len(sl)
        #Create weights
        self.weights = []
        for idx in range(1, self.layers):
            self.weights.append(numpy.matrix(numpy.random.rand(self.sl[idx-1]+1, self.sl[idx]))/5)
        self.cost = []

    def update(self, input):
        if input.shape[1] != self.sl[0]:
            raise ValueError('The first layer must have a node for every feature')
        self.z = []
        self.a = []
        #Input activations. Expecting inputs as numpy matrix (Examples x Features)
        self.a.append(numpy.hstack((numpy.ones((input.shape[0], 1)), input))) #Set inputs ai + bias unit
        #Hidden activations
        for weight in self.weights:
            self.z.append(self.a[-1]*weight)
            self.a.append(numpy.hstack((numpy.ones((self.z[-1].shape[0], 1)), 1/(1+numpy.exp(-self.z[-1]))))) #sigmoid
        #Output activation
        self.a[-1] = self.z[-1] #Not logistic regression, thus no sigmoid function
        del self.z[-1]

    def backPropagate(self, targets, lamda):
        m = float(targets.shape[0]) #m is number of examples
        #Calculate cost
        Cost = -1/m*sum(numpy.power(self.a[-1] - targets, 2))
        for weight in self.weights:
            Cost = Cost + lamda/(2*m)*numpy.power(weight[1:, :], 2).sum()
        self.cost.append(abs(float(Cost)))
        #Calculate error for each layer
        delta = []
        delta.append(self.a[-1] - targets)
        for idx in range(1, self.layers-1): #No delta for the input layer because it is the input
            weight = self.weights[-idx][1:, :] #Ignore bias unit
            dsigmoid = numpy.multiply(self.a[-(idx+1)][:, 1:], 1-self.a[-(idx+1)][:, 1:]) #dsigmoid is a(l).*(1-a(l))
            delta.append(numpy.multiply(delta[-1]*weight.T, dsigmoid)) #Ignore regularization
        Delta = []
        for idx in range(self.layers-1):
            Delta.append(self.a[idx].T*delta[-(idx+1)])
        self.weight_gradient = []
        for idx in range(len(Delta)):
            self.weight_gradient.append(numpy.nan_to_num(1/m*Delta[idx] + numpy.vstack((numpy.zeros((1, self.weights[idx].shape[1])), lamda/m*self.weights[idx][1:, :]))))

    def train(self, input, targets, alpha, lamda, iterations = 1000):
        #alpha: learning rate
        #lamda: regularization term
        for i in range(iterations):
            self.update(input)
            self.backPropagate(targets, lamda)
            self.weights = [self.weights[idx] - alpha*self.weight_gradient[idx] for idx in range(len(self.weights))]

    def autoparam(self, data, alpha = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3], lamda = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]):
        #data: numpy matrix with targets in last column
        #alpha: learning rates to try
        #lamda: regularization terms to try
        #Create training, cross validation, and test sets (60/20/20 split)
        while 1:
            try:
                numpy.seterr(invalid = 'raise')
                numpy.random.shuffle(data) #Shuffle data
                training_set = data[0:data.shape[0]//10*6, 0:-1]
                self.ntraining_set = (training_set-training_set.mean(axis=0))/training_set.std(axis=0)
                self.training_tgt = numpy.matrix(data[0:data.shape[0]//10*6, -1]).T
                cv_set = data[data.shape[0]//10*6:data.shape[0]//10*8, 0:-1]
                self.ncv_set = (cv_set-cv_set.mean(axis=0))/cv_set.std(axis=0)
                self.cv_tgt = numpy.matrix(data[data.shape[0]//10*6:data.shape[0]//10*8, -1]).T
                test_set = data[data.shape[0]//10*8:, 0:-1]
                self.ntest_set = (test_set-test_set.mean(axis=0))/test_set.std(axis=0)
                self.test_tgt = numpy.matrix(data[data.shape[0]//10*8:, -1]).T
                break
            except FloatingPointError:
                pass #Reshuffle if normalization divided by a zero std in some split
        numpy.seterr(invalid = 'warn')
        cost = 999999
        for i in alpha:
            for j in lamda:
                self.__init__(self.sl) #Reset weights for each (alpha, lamda) pair
                self.train(self.ntraining_set, self.training_tgt, i, j, 2000)
                current_cost = 1/float(cv_set.shape[0])*sum(numpy.square(self.predict(self.ncv_set) - self.cv_tgt)).tolist()[0][0]
                print(current_cost)
                if current_cost < cost:
                    cost = current_cost
                    self.learning_rate = i
                    self.regularization = j
        self.__init__(self.sl)

    def predict(self, input):
        self.update(input)
        return self.a[-1]
Loading the data, plotting, etc....
data = numpy.loadtxt(open('FF-data.csv', 'rb'), delimiter = ',', skiprows = 1)#Load
numpy.random.shuffle(data)
features = data[:,0:11]
nfeatures = (features-features.mean(axis=0))/features.std(axis=0)
targets = numpy.matrix(data[:, 12]).T
n = NN([11, 50, 1])
n.train(nfeatures, targets, 0.07, 0.0, 2000)
import matplotlib.pyplot
matplotlib.pyplot.subplot(221)
matplotlib.pyplot.plot(n.cost)
matplotlib.pyplot.title('Cost vs. Iteration')
matplotlib.pyplot.subplot(222)
matplotlib.pyplot.scatter(n.predict(nfeatures).tolist(), targets.tolist())
matplotlib.pyplot.plot(targets.tolist(), targets.tolist(), c = 'r')
matplotlib.pyplot.title('Data vs. Predicted')
matplotlib.pyplot.savefig('Report.png', format = 'png')
matplotlib.pyplot.close()
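For completeness, autoparam can be used to pick alpha and lamda before the final training run. A minimal usage sketch (note the grid search trains 6x9 networks for 2000 iterations each, so it takes a while):

#Build a matrix with the same 11 features plus the area target in the last column
data = numpy.loadtxt(open('FF-data.csv', 'rb'), delimiter = ',', skiprows = 1)
data = numpy.hstack((data[:, 0:11], data[:, 12:13]))
n = NN([11, 50, 1])
n.autoparam(data) #stores n.learning_rate and n.regularization, plus the normalized splits
print(n.learning_rate, n.regularization)
n.train(n.ntraining_set, n.training_tgt, n.learning_rate, n.regularization, 2000)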