Python 2.7 Tensorflow估计模型发散，损失=NaN_Python 2.7_Tensorflow_Nan_Convolutional Neural Network_Tensorflow Estimator

Python 2.7 TensorFlow Estimator 模型发散，损失=NaN

python-2.7 tensorflow

Python 2.7 Tensorflow估计模型发散，损失=NaN,python-2.7,tensorflow,nan,convolutional-neural-network,tensorflow-estimator,Python 2.7,Tensorflow,Nan,Convolutional Neural Network,Tensorflow Estimator,我使用的是一个设置为CNN的tensorflow估计器，每次运行代码时都会出现以下错误： ERROR:tensorflow:Model diverged with loss = NaN. Traceback (most recent call last): File "cnn_training_v3.py", line 108, in <module> classifier.train(input_fn=train_input_fn, steps=200, hooks=[

我正在使用一个配置为CNN的TensorFlow Estimator，每次运行代码时都会出现以下错误：

ERROR:tensorflow:Model diverged with loss = NaN.
Traceback (most recent call last):
  File "cnn_training_v3.py", line 108, in <module>
    classifier.train(input_fn=train_input_fn, steps=200, hooks=[logging_hook])
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 363, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 843, in _train_model
    return self._train_model_default(input_fn, hooks, saving_listeners)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 859, in _train_model_default
    saving_listeners)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1059, in _train_with_estimator_spec
    _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 567, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1043, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1134, in run
    raise six.reraise(*original_exc_info)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1119, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1199, in run
    run_metadata=run_metadata))
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/basic_session_run_hooks.py", line 623, in after_run
    raise NanLossDuringTrainingError
tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError: NaN loss during training.

这是我的主要代码。我的目标是训练一个CNN，让它观察积木塔的图像，并预测图像中有多少块积木。

# Load and process dataset
#
# Walks the "images" folder, pairs every '*before*' image with the text
# file(s) whose name contains the same id, and fills `images` (decoded pixel
# arrays) and `labels` (class indices) in matching order.

image_files = []
text_files = []
images = []
labels = []

# note to self: label 0 means 2 blocks, label 1 means 3 blocks,
# label 2 means 4 blocks, label 3 means 5 blocks
label_by_nblocks = {2: 0, 3: 1, 4: 2, 5: 3}

# load files from folder
# NOTE(review): os.walk recurses into sub-folders, but the files are later
# opened as 'images/<name>' - this assumes a flat folder; confirm.
for root, dirs, files in os.walk("images"):
    for filename in files:
        if 'before' in filename:
            image_files.append(filename)
        elif 'text' in filename:
            text_files.append(filename)

# for each pair of files, append relevant data to image and label lists
for imagename in image_files:
    # Read the image that belongs to THIS iteration. The previous code passed
    # `filename`, the variable left over from the os.walk loop above, so every
    # entry in `images` was loaded from the same (last-walked) file.
    image = cv2.imread('images/' + imagename)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail loudly instead of letting a None entry corrupt the dataset.
        raise IOError("could not read image 'images/%s'" % imagename)
    images.append(image)

    # id = file name without its first 7 and last 4 characters
    # (assumes a fixed-length prefix and a 3-letter extension - confirm)
    num = imagename[7:len(imagename) - 4]
    for textname in text_files:
        if ('_' + num + '.') in textname:
            # Start from "no label" for every text file so a file without a
            # usable 'Number of blocks' line cannot silently reuse the label
            # parsed for a previous image.
            label = None
            # `with` closes the file even if parsing raises.
            with open('images/' + textname, 'r') as textfile:
                for line in textfile:
                    if 'Number of blocks' in line:
                        nblocks = int(line[18:].strip('\n'))
                        # Unsupported counts leave the current label untouched,
                        # as the original if/elif chain did.
                        label = label_by_nblocks.get(nblocks, label)
            if label is None:
                raise ValueError(
                    "no supported 'Number of blocks' entry in 'images/%s'"
                    % textname)
            labels.append(label)

# separate images and labels into train and test sets - 50% train, 50% evaluate
# Floor division (//) keeps the split index an integer under both Python 2 and
# Python 3; with true division `len(x)/2` is a float, which is not a valid
# slice index. On Python 2.7 the result is identical to the old `/`.
train_images = images[0:len(images) // 2]
train_labels = labels[0:len(labels) // 2]
test_images = images[len(images) // 2:]
test_labels = labels[len(labels) // 2:]

# convert dataset into numpy arrays: float32 pixel data, int32 class indices
train_data_numpy = np.array(train_images, np.float32)
train_labels_numpy = np.array(train_labels, np.int32)
test_data_numpy = np.array(test_images, np.float32)
test_labels_numpy = np.array(test_labels, np.int32)



# Put images through CNN

# Build the Estimator around the CNN model function; checkpoints are written
# to (and looked up in) models/cnn.
# NOTE(review): the author reported that stale checkpoints left in this folder
# by an earlier run caused the "Model diverged with loss = NaN" error; clearing
# the folder resolved it.
classifier = tf.estimator.Estimator(
    model_fn=cnn_model_fn,
    model_dir="models/cnn",
)

# Log the tensor named "softmax_tensor" under the key "probabilities" on every
# training iteration.
tensors_to_log = {"probabilities": "softmax_tensor"}
logging_hook = tf.train.LoggingTensorHook(
    tensors=tensors_to_log,
    every_n_iter=1,
)

# Training: one shuffled example per batch, repeated indefinitely
# (num_epochs=None), stopped after 200 steps.
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"images": train_data_numpy},
    y=train_labels_numpy,
    batch_size=1,
    num_epochs=None,
    shuffle=True,
)
classifier.train(
    input_fn=train_input_fn,
    steps=200,
    hooks=[logging_hook],
)

# Evaluation: a single, unshuffled pass over the held-out half, then print
# the resulting metrics.
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"images": test_data_numpy},
    y=test_labels_numpy,
    num_epochs=1,
    shuffle=False,
)
eval_results = classifier.evaluate(input_fn=eval_input_fn)
print(eval_results)

我正在Ubuntu 16.04上使用Python 2.7.12。如果有人能指出出现NaN损失的原因，我将不胜感激。

找到了解决方案！原来模型以前保存的检查点与当前的训练会话冲突，因此我删除了模型保存检查点所在文件夹中的所有内容，现在模型可以正常训练，不再出现NaN损失错误。

找到了解决方案！原来模型以前保存的检查点与当前的训练会话冲突，所以我删除了模型保存检查点的文件夹中的所有内容，现在模型可以正常训练，没有任何错误。

# Load and process dataset
#
# Walks the "images" folder, pairs every '*before*' image with the text
# file(s) whose name contains the same id, and fills `images` (decoded pixel
# arrays) and `labels` (class indices) in matching order.

image_files = []
text_files = []
images = []
labels = []

# note to self: label 0 means 2 blocks, label 1 means 3 blocks,
# label 2 means 4 blocks, label 3 means 5 blocks
label_by_nblocks = {2: 0, 3: 1, 4: 2, 5: 3}

# load files from folder
# NOTE(review): os.walk recurses into sub-folders, but the files are later
# opened as 'images/<name>' - this assumes a flat folder; confirm.
for root, dirs, files in os.walk("images"):
    for filename in files:
        if 'before' in filename:
            image_files.append(filename)
        elif 'text' in filename:
            text_files.append(filename)

# for each pair of files, append relevant data to image and label lists
for imagename in image_files:
    # Read the image that belongs to THIS iteration. The previous code passed
    # `filename`, the variable left over from the os.walk loop above, so every
    # entry in `images` was loaded from the same (last-walked) file.
    image = cv2.imread('images/' + imagename)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail loudly instead of letting a None entry corrupt the dataset.
        raise IOError("could not read image 'images/%s'" % imagename)
    images.append(image)

    # id = file name without its first 7 and last 4 characters
    # (assumes a fixed-length prefix and a 3-letter extension - confirm)
    num = imagename[7:len(imagename) - 4]
    for textname in text_files:
        if ('_' + num + '.') in textname:
            # Start from "no label" for every text file so a file without a
            # usable 'Number of blocks' line cannot silently reuse the label
            # parsed for a previous image.
            label = None
            # `with` closes the file even if parsing raises.
            with open('images/' + textname, 'r') as textfile:
                for line in textfile:
                    if 'Number of blocks' in line:
                        nblocks = int(line[18:].strip('\n'))
                        # Unsupported counts leave the current label untouched,
                        # as the original if/elif chain did.
                        label = label_by_nblocks.get(nblocks, label)
            if label is None:
                raise ValueError(
                    "no supported 'Number of blocks' entry in 'images/%s'"
                    % textname)
            labels.append(label)

# separate images and labels into train and test sets - 50% train, 50% evaluate
# Floor division (//) keeps the split index an integer under both Python 2 and
# Python 3; with true division `len(x)/2` is a float, which is not a valid
# slice index. On Python 2.7 the result is identical to the old `/`.
train_images = images[0:len(images) // 2]
train_labels = labels[0:len(labels) // 2]
test_images = images[len(images) // 2:]
test_labels = labels[len(labels) // 2:]

# convert dataset into numpy arrays: float32 pixel data, int32 class indices
train_data_numpy = np.array(train_images, np.float32)
train_labels_numpy = np.array(train_labels, np.int32)
test_data_numpy = np.array(test_images, np.float32)
test_labels_numpy = np.array(test_labels, np.int32)



# Put images through CNN

# Build the Estimator around the CNN model function; checkpoints are written
# to (and looked up in) models/cnn.
# NOTE(review): the author reported that stale checkpoints left in this folder
# by an earlier run caused the "Model diverged with loss = NaN" error; clearing
# the folder resolved it.
classifier = tf.estimator.Estimator(
    model_fn=cnn_model_fn,
    model_dir="models/cnn",
)

# Log the tensor named "softmax_tensor" under the key "probabilities" on every
# training iteration.
tensors_to_log = {"probabilities": "softmax_tensor"}
logging_hook = tf.train.LoggingTensorHook(
    tensors=tensors_to_log,
    every_n_iter=1,
)

# Training: one shuffled example per batch, repeated indefinitely
# (num_epochs=None), stopped after 200 steps.
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"images": train_data_numpy},
    y=train_labels_numpy,
    batch_size=1,
    num_epochs=None,
    shuffle=True,
)
classifier.train(
    input_fn=train_input_fn,
    steps=200,
    hooks=[logging_hook],
)

# Evaluation: a single, unshuffled pass over the held-out half, then print
# the resulting metrics.
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"images": test_data_numpy},
    y=test_labels_numpy,
    num_epochs=1,
    shuffle=False,
)
eval_results = classifier.evaluate(input_fn=eval_input_fn)
print(eval_results)