Python: MLP classifier in Theano gets stuck at a local minimum

I have written an MLP classifier in Theano. The training function, which uses the back-propagation algorithm, is as follows:

self.weights=[theano.shared(numpy.random.random((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]
self.bias=[theano.shared(numpy.random.random(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
self.layers=network.layers
self.prev_rate=[theano.shared(numpy.zeros((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]+[theano.shared(numpy.zeros(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
prediction=T.dmatrix()
output=T.dmatrix()
reg_lambda=T.dscalar()
alpha=T.dscalar()
momentum=T.dscalar()
cost=T.nnet.categorical_crossentropy(prediction,output).mean()
for i,j in zip(self.weights,self.bias):
    cost+=T.sum(i**2)*reg_lambda
    cost+=T.sum(j**2)*reg_lambda
parameters=self.weights+self.bias
rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
self.backprop=theano.function([prediction,output,reg_lambda,alpha,momentum],cost,updates=updates)
I am trying to train the classifier on the XOR problem. The setup is quite straightforward:

network=FeedForwardNetwork([2,2,2])
network.initialize()
network.train(numpy.array([[0.,0.],[0.,1.],[1.,0.],[1.,1.],[0.,0.],[0.,1.],[1.,0.],[1.,1.]]),numpy.array([[0.,1.],[1.,0.],[1.,0.],[0.,1.],[0.,1.],[1.,0.],[1.,0.],[0.,1.]]),alpha=0.01,epochs=1000000000000000,momentum=0.9)
print network.predict(numpy.array([[1.,0.]]))
print network.predict(numpy.array([[0.,0.]]))
The initialize() method just compiles all the functions in the backend, i.e. the back-propagation function, a forward-pass function used to compute predictions, and a few other Theano functions. Now, when I run this code, training gets stuck at a local minimum:

0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
0.69314718056
When training starts, the loss is around 0.92. It decreases steadily to the value above and stops there. I have tried varying the values of alpha and momentum. What am I doing wrong?

P.S. Here is the entire code:

network.py

import theano
import theano.tensor as T
import numpy
from layers import *
from backend import NetworkBackend

class Network:

    def __init__(self,architecture):
        self.architecture=architecture
        self.layers=[]
        self.weights=[]
        self.bias=[]

    def __str__(self):
        banner=''
        for i in range(len(self.weights)):
            banner+=str(self.weights[i])+'\n'
            banner+=str(self.bias[i])+'\n'
        return banner

class FeedForwardNetwork(Network):

    def initialize(self):
        self.layers.append(InputLayer(units=self.architecture[0]))
        for i in range(1,len(self.architecture[:-1])):
            self.layers.append(SigmoidLayer(units=self.architecture[i]))
        self.layers.append(SoftmaxLayer(units=self.architecture[-1]))
        self.backend=NetworkBackend(self)

    def predict(self,inputs):
        return self.backend.activate(inputs)

    def train(self,X,y,alpha=100,reg_lambda=0.0001,epochs=10000,momentum=0.9):
        cost=1
        while cost>0.01 and epochs:
            prediction=self.predict(X)
            cost=self.backend.backprop(prediction,y,reg_lambda,alpha,momentum)
            print cost
            epochs-=1


if __name__=='__main__':
    network=FeedForwardNetwork([2,2,2])
    network.initialize()
    network.train(numpy.array([[0.,0.],[0.,1.],[1.,0.],[1.,1.],[0.,0.],[0.,1.],[1.,0.],[1.,1.]]),numpy.array([[0.,1.],[1.,0.],[1.,0.],[0.,1.],[0.,1.],[1.,0.],[1.,0.],[0.,1.]]),alpha=0.01,epochs=1000000000000000,momentum=0.9)
    print network.predict(numpy.array([[1.,0.]]))
    print network.predict(numpy.array([[0.,0.]]))
layers.py

import theano
import theano.tensor as T
import scipy
from backend import ComputationBackend

class Layer:

    def __init__(self,units):
        self.units=units
        self.backend=ComputationBackend()

    def __str__(self):
        banner=self.__class__.__name__
        banner+=" Units:%d"%self.units
        return banner

class SigmoidLayer(Layer):

    def forwardPass(self,inputs):
        return self.backend.sigmoid(inputs)


class InputLayer(Layer):

    def forwardPass(self,inputs):
        return inputs

class SoftmaxLayer(Layer):

    def forwardPass(self,inputs):
        return self.backend.softmax(inputs)
backend.py

import theano
import theano.tensor as T
import numpy

class NetworkBackend:

    def __init__(self,network):

        # initialize shared variables
        self.weights=[theano.shared(numpy.random.random((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]
        self.bias=[theano.shared(numpy.random.random(network.architecture[i+1])) for i in range(len(network.architecture)-1)]
        self.layers=network.layers
        self.prev_rate=[theano.shared(numpy.zeros((network.architecture[i+1],network.architecture[i]))) for i in range(len(network.architecture)-1)]+[theano.shared(numpy.zeros(network.architecture[i+1])) for i in range(len(network.architecture)-1)]

        # activation for network layers
        inputs=T.dmatrix()
        temp=self.layers[0].forwardPass(inputs)
        for i in range(1,len(self.layers[:-1])):
            temp=self.layers[i].forwardPass(T.dot(temp,self.weights[i-1].transpose())+self.bias[i-1])
        output=self.layers[-1].forwardPass(T.dot(temp,self.weights[-1].transpose())+self.bias[-1])
        self.activate=theano.function([inputs],output)

        prediction=T.dmatrix()
        output=T.dmatrix()
        reg_lambda=T.dscalar()
        alpha=T.dscalar()
        momentum=T.dscalar()
        cost=T.nnet.categorical_crossentropy(prediction,output).mean()
        for i,j in zip(self.weights,self.bias):
            cost+=T.sum(i**2)*reg_lambda
            cost+=T.sum(j**2)*reg_lambda
        parameters=self.weights+self.bias
        rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
        updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
        self.backprop=theano.function([prediction,output,reg_lambda,alpha,momentum],cost,updates=updates)


class ComputationBackend:

    def __init__(self):

        # sigmoid activation
        self.sigmoid=T.nnet.sigmoid

        # softmax activation
        self.softmax=T.nnet.softmax

This is probably caused by the parameter initialization. The following code sample implements a basic XOR learner using a neural network with a single hidden layer:

import numpy
import theano
import theano.tensor as tt


def compile(input_size, hidden_size):
    w_h = theano.shared(numpy.random.standard_normal(size=(input_size, hidden_size)).astype(theano.config.floatX))
    b_h = theano.shared(numpy.zeros((hidden_size,), dtype=theano.config.floatX))
    w_y = theano.shared(numpy.zeros((hidden_size,), dtype=theano.config.floatX))
    b_y = theano.shared(numpy.zeros(1, dtype=theano.config.floatX), broadcastable=(True,))
    x = tt.matrix()
    z = tt.ivector()
    learning_rate = tt.scalar()
    h = tt.tanh(tt.dot(x, w_h) + b_h)
    y = tt.nnet.sigmoid(tt.dot(h, w_y) + b_y)
    cost = tt.nnet.binary_crossentropy(y, z).mean()
    updates = [(p, p - learning_rate * tt.grad(cost, p)) for p in [w_h, b_h, w_y, b_y]]
    return theano.function([x, z, learning_rate], outputs=cost, updates=updates), theano.function([x], outputs=y)


def main():
    numpy.random.seed(5)
    train, test = compile(2, 2)
    for _ in xrange(100000):
        print train([[1, 1], [1, 0], [0, 1], [0, 0]], [0, 1, 1, 0], 0.1)
    print test([[1, 1], [1, 0], [0, 1], [0, 0]])


main()
Note the random number generator seed value. With a seed of 5, the learner converges on a good solution and looks like it is heading towards a perfect solution given enough time. But if the seed is changed to 1, the network gets stuck in a local optimum; it can discriminate the second dimension but not the first.


A different random initialization method might give better results, i.e. be less sensitive to the RNG seed.
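For instance, something along the lines of a Glorot/Xavier-style uniform initialization could be tried for the shared weights. This is only a minimal sketch, not part of the code above, and the init_weights helper name is just illustrative:

import numpy
import theano

def init_weights(n_in, n_out, rng=numpy.random):
    # Glorot/Xavier-style uniform initialization: sample from
    # [-sqrt(6/(n_in+n_out)), +sqrt(6/(n_in+n_out))] instead of
    # numpy.random.random, which is uniform on [0, 1) and can make
    # convergence more sensitive to the seed.
    bound = numpy.sqrt(6.0 / (n_in + n_out))
    values = rng.uniform(low=-bound, high=bound, size=(n_in, n_out))
    return theano.shared(values.astype(theano.config.floatX))

# Usage sketch for the hidden layer of the XOR example above:
# w_h = init_weights(2, 2)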

Finally figured it out! In NetworkBackend, when computing the cost, I was computing the cross-entropy between the expected output and a prediction passed in as an argument to the Theano function, instead of the prediction computed with the activate function. The Theano graph therefore did not contain the forward pass, so theano.tensor.grad only found the gradient of the regularization term, not of the actual cost function! The correct implementation is therefore:

inputs=T.dmatrix()
temp=self.layers[0].forwardPass(inputs)
for i in range(1,len(self.layers[:-1])):
    temp=self.layers[i].forwardPass(T.dot(temp,self.weights[i-1].transpose())+self.bias[i-1])
output=self.layers[-1].forwardPass(T.dot(temp,self.weights[-1].transpose())+self.bias[-1])
self.activate=theano.function([inputs],output)

label=T.dmatrix()
reg_lambda=T.dscalar()
alpha=T.dscalar()
momentum=T.dscalar()
cost=T.nnet.categorical_crossentropy(output,label).mean()
for i,j in zip(self.weights,self.bias):
    cost+=T.sum(i**2)*reg_lambda
    cost+=T.sum(j**2)*reg_lambda
parameters=self.weights+self.bias
rates=[(alpha*T.grad(cost,parameter)+momentum*prev_rate) for parameter,prev_rate in zip(parameters,self.prev_rate)]
updates=[(weight,weight-rate) for weight,rate in zip(parameters,rates)]+[(prev_rate,rate) for prev_rate,rate in zip(self.prev_rate,rates)]
self.backprop=theano.function([inputs,label,reg_lambda,alpha,momentum],cost,updates=updates)

So instead of declaring a new matrix for the prediction, the training function now takes the inputs and computes the prediction with the same equations used in the activate function. This completes the Theano graph, and theano.tensor.grad() now computes the gradient of the cost function along with the regularization.

In case it helps, that value is the natural log of 2.

For readability you might want to wrap some of the long code lines; the nested calls and comprehensions in particular are hard to decipher without any whitespace or accompanying comments. Does the value ever change after a certain number of iterations, or is it sitting at a local minimum?

Yes, it does change; it reaches that exact value after every iteration. I have tried different learning rates, from 0.0001 to 100; that only changes the number of iterations needed to reach the local minimum. I have also tried changing the seed and the distribution of the random numbers, and it still stops at the same value. Your version of a 3-layer network works perfectly well, but I still don't know what I was doing wrong.
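A quick standalone check (not from the original post) of where that number comes from: when the softmax output collapses to the uniform distribution [0.5, 0.5], the categorical cross-entropy against any one-hot target is -log(0.5) = ln(2) ≈ 0.6931, which matches the value the training loss gets stuck at. That is consistent with the weights being driven toward zero by a gradient that contains only the regularization term, leaving a uniform softmax output.

import numpy

# With a uniform 2-class prediction and a one-hot target,
# the cross-entropy is -log(0.5) = ln(2).
uniform_prediction = numpy.array([0.5, 0.5])
one_hot_target = numpy.array([0.0, 1.0])
loss = -numpy.sum(one_hot_target * numpy.log(uniform_prediction))
print(loss)          # 0.69314718056
print(numpy.log(2))  # 0.69314718056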