Python: How to avoid NaN's in my TensorFlow neural network layers?


Code:

image_row = 640
image_col = 480
num_labels = 17
num_channels = 3 # RGB

import numpy as np
import scipy.misc
import tensorflow as tf
from IPython import embed  # used below for interactive debugging
import load_file           # the user's own data-loading module

#Load data
train_dataset, train_labels = load_file.load_data()
test_dataset = scipy.misc.imread("1501005004.548261985.png")
test_labels =  np.loadtxt("1501005004.493062654.txt", comments="#", delimiter=",", unpack=False)

batch_labels = train_labels


print('Training set', train_dataset.shape, train_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)


def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_row, image_col, num_channels)).astype(np.float32)
  #labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  labels = labels.reshape((-1,num_labels)).astype(np.float32)
  return dataset, labels

train_dataset, train_labels = reformat(train_dataset, train_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)

print('Training set', train_dataset.shape, train_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)


def accuracy(labels, predictions):
    # sum of squared errors, computed on numpy arrays outside the graph
    return 100.0 * np.sum(np.power(predictions - labels, 2))


batch_size = 1
kernel_size = patch_size = 5
depth = 16
num_hidden1 = 64
num_hidden2 = 32

graph = tf.Graph()

with graph.as_default():
    # Input data
    tf_train_dataset = tf.placeholder(tf.float32, shape=[batch_size, image_row, image_col, num_channels])

    tf_train_labels = tf.placeholder(tf.float32, shape=[batch_size, num_labels])

    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    layer1_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, num_channels, depth], stddev=0.1))
    layer1_biases = tf.Variable(tf.zeros([depth]))

    # dropout
    keep_prob = tf.placeholder("float")

    layer2_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, depth, depth], stddev=0.1))
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))

    layer3_weights = tf.Variable(tf.truncated_normal([image_row // 4 * image_col // 4 * depth, num_hidden1], stddev=0.1))
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden1]))

    layer4_weights = tf.Variable(tf.truncated_normal([num_hidden1, num_hidden2], stddev=0.1))
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden2]))

    layer5_weights = tf.Variable(tf.truncated_normal([num_hidden2, num_labels], stddev=0.1))
    layer5_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

    def model(data):

        # layer 1
        conv = tf.nn.conv2d(data, layer1_weights, [1, 1, 1, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer1_biases)

        # pooling 1
        pool1 = tf.nn.max_pool(hidden, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool1')
        norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm1')

        # layer 2
        conv = tf.nn.conv2d(norm1, layer2_weights, [1, 1, 1, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer2_biases)

        # pooling 2
        pool2 = tf.nn.max_pool(hidden, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool2')
        norm2 = tf.nn.lrn(pool2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm2')

        # layer 3
        conv = tf.nn.conv2d(norm2, layer2_weights, [1, 1, 1, 1], padding='SAME')
        hidden = tf.nn.relu(conv + layer2_biases)

        shape = hidden.get_shape().as_list()
        reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])

        # RELU - 1e-9
        hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
        hidden = tf.matmul(hidden, layer4_weights) + layer4_biases

        # # add a dropout
        # hidden = tf.nn.dropout(hidden, keep_prob)

        result = tf.matmul(hidden, layer5_weights) + layer5_biases

        return result


    logits = model(tf_train_dataset)
    print('AFTER LOGITS')
    embed()
    loss = tf.reduce_sum(tf.pow(logits-tf_train_labels,2))/(2*batch_size)
    #loss = tf.reduce_sum(tf.pow(logits-batch_labels,2))/(2*batch_size)

    global_step = tf.Variable(0, trainable = False)
    start_learning_rate = 0.001
    learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, 100000, 0.96,staircase = True)     
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)


    #Prediction
    train_prediction = logits
    test_prediction = tf_test_dataset

num_steps  = 10000001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('----------------INITIALIZED-----------------')
    for step in range(num_steps):
        print(step)
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        print('after offset')
        embed()
        batch_data = train_dataset[offset: (offset+batch_size), :,:,:]
        batch_labels = train_labels[offset: (offset + batch_size),:]
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels, keep_prob:1.0}

        _,l,prediction = session.run([optimizer, loss,train_prediction], feed_dict= feed_dict)
        print('after _,l,prediction')
        embed()
        if step % 50 == 0:
            print("Minibatch loss %d: %f" % (step, l))
            print('Minibatch accuracy: %f' % accuracy(prediction, batch_labels))
In the code above I get a large number of Inf values in the last hidden layer; its output is pasted below:

In [93]: session.run(hidden)
Out[93]: 
array([[  9.99999972e-10,   9.99999972e-10,   9.99999972e-10,
                     inf,   9.99999972e-10,   5.50044295e+28,
          9.99999972e-10,   9.99999972e-10,   3.21215463e+28,
          9.99999972e-10,   1.24344986e+28,   9.99999972e-10,
          9.99999972e-10,   2.52180816e+28,   9.99999972e-10,
          9.99999972e-10,   9.99999972e-10,   9.99999972e-10,
          1.41978562e+28,              inf,   9.99999972e-10,
How can I avoid these Inf values? I am a beginner in deep learning and TensorFlow, so I do not know how to proceed.

I tried adding a constant inside the ReLU layer:

hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases + 1e-9)

but it did not help.


How should I go about fixing this?

In general, this problem appears when the gradients explode; you need to clip the gradients.

# Replace this line:
# optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
# with the following:
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
grads_vars = optimizer.compute_gradients(loss, tf.trainable_variables())
grads_vars = clip_grad_norms(grads_vars, max_norm=10)
train_op = optimizer.apply_gradients(grads_vars)

# Finally, replace this line:
# _, l, prediction = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
# with:
_, l, prediction = session.run([train_op, loss, train_prediction], feed_dict=feed_dict)

# clip_grad_norms function:
# https://github.com/n3011/tefla/blob/master/tefla/core/base.py#L253
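
If you prefer not to pull in the external clip_grad_norms helper, here is a minimal sketch of the same idea using TensorFlow's built-in tf.clip_by_global_norm (max_norm=10 mirrors the value above):

# Sketch: clip gradients with the built-in tf.clip_by_global_norm
# instead of the external clip_grad_norms helper.
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
grads_vars = optimizer.compute_gradients(loss, tf.trainable_variables())
grads, variables = zip(*grads_vars)               # split the (gradient, variable) pairs
clipped, _ = tf.clip_by_global_norm(grads, 10.0)  # rescale when the global norm exceeds 10
train_op = optimizer.apply_gradients(zip(clipped, variables))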


From the code above, it looks like you do not have any activation function between layers 4 and 5:

 hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)    
 hidden = tf.matmul(hidden, layer4_weights) + layer4_biases
 result = tf.matmul(hidden, layer5_weights) + layer5_biases

 return result

Depending on how you initialize the weights and biases, this could be a reason for the weights over- or underflowing.
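
A minimal sketch of that change, assuming a ReLU nonlinearity between the fully connected layers is what you want (the output layer is typically left linear):

 hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
 # added nonlinearity between layers 4 and 5
 hidden = tf.nn.relu(tf.matmul(hidden, layer4_weights) + layer4_biases)
 result = tf.matmul(hidden, layer5_weights) + layer5_biases  # linear output layer

 return result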


It is hard to answer without seeing the full code. A learning rate that is too high can cause this behavior, but there could be many other reasons. I suggest posting the complete code.

Thanks @MiriamFarber! I have updated the post with the complete code.

When computing the loss, can you try reduce_mean instead of reduce_sum? Also, lrn is not recommended. If you are trying to do classification, your targets should (preferably) be one-hot encoded, and the loss function I would suggest is cross-entropy (with a softmax activation on the output layer). See the tutorial here:

Why are you initializing the weights to such large numbers? That could be one reason for the NaNs.
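
A minimal sketch of the loss those comments suggest, assuming the targets in tf_train_labels are one-hot rows of shape [batch_size, num_labels] (softmax_cross_entropy_with_logits applies the softmax internally, and reduce_mean keeps the loss scale independent of the batch size):

# Sketch: cross-entropy loss with softmax on the output layer,
# averaged (not summed) over the batch.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))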