Search code examples
python-2.7tensorflownanconv-neural-networktensorflow-estimator

Tensorflow estimator model diverged with loss = NaN


I am using a tensorflow estimator set up as a CNN, and every time I run my code I get this error:

ERROR:tensorflow:Model diverged with loss = NaN.
Traceback (most recent call last):
  File "cnn_training_v3.py", line 108, in <module>
    classifier.train(input_fn=train_input_fn, steps=200, hooks=[logging_hook])
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 363, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 843, in _train_model
    return self._train_model_default(input_fn, hooks, saving_listeners)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 859, in _train_model_default
    saving_listeners)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1059, in _train_with_estimator_spec
    _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 567, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1043, in run
    run_metadata=run_metadata)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1134, in run
    raise six.reraise(*original_exc_info)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1119, in run
    return self._sess.run(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1199, in run
    run_metadata=run_metadata))
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/basic_session_run_hooks.py", line 623, in after_run
    raise NanLossDuringTrainingError
tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError: NaN loss during training.

I know there have been similar questions already asked on this site, but their answers didn't help me. I've tried decreasing the learning rate, adding epsilon to my logits probabilities and changing the loss function but still got errors.

Here is my CNN function:

# CNN function
def cnn_model_fn(features, labels, mode):

    # Define the layers of the cnn
    input_layer = tf.reshape(features["images"], [-1, 200, 200, 3])
    conv_layer = tf.layers.conv2d(inputs=input_layer, filters=32, kernel_size=[5, 5], padding="same", activation=tf.nn.relu)
    pool_layer = tf.layers.max_pooling2d(inputs=conv_layer, pool_size=[2, 2], strides=2)
    conv_layer_two = tf.layers.conv2d(inputs=pool_layer, filters=64, kernel_size=[5, 5], padding="same", activation=tf.nn.relu)
    pool_layer_two = tf.layers.max_pooling2d(inputs=conv_layer_two, pool_size=[2, 2], strides=2)
    flat_pool_two = tf.reshape(pool_layer_two, [-1, 50 * 50 * 64])
    dense_layer = tf.layers.dense(inputs=flat_pool_two, units=1024, activation=tf.nn.relu)
    logits = tf.layers.dense(inputs=dense_layer, units=4)

    # Add epsilon to logits
    epsilon = tf.constant(value=0.00001, shape=(1,4))
    logits = logits + epsilon

    # Generate predictions (for PREDICT and EVAL mode)
    blocknum_prediction = tf.argmax(input=logits, axis=1)
    blocknum_probabilities = tf.nn.softmax(logits, name="softmax_tensor")
    predictions = {"blocknum_classes": blocknum_prediction}

    # Return predictions when in PREDICT mode
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # Configure the Training Operation (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.0001)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {"blocknum_accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["blocknum_classes"])}
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

Here is my main code. My goal is to train the CNN to look at an image of a tower of blocks, and predict how many blocks there are in the image.

# Load and process dataset

image_files = []
text_files = []
images = []
labels = []

# load files from folder
for root, dirs, files in os.walk("images"):  
    for filename in files:
        if 'before' in filename:
            image_files.append(filename)
        elif 'text' in filename:
            text_files.append(filename)

# for each pair of files, append relevant data to image and label lists
# note to self: label 0 means 2 blocks, label 1 means 3 blocks, label 2 means 4 blocks, label 3 means 5 blocks
for imagename in image_files:
    images.append(cv2.imread('images/'+filename))
    num = imagename[7:len(imagename)-4]
    for textname in text_files:
        if ('_'+num+'.') in textname:
            textfile = open('images/'+textname, 'r')
            for line in textfile:
                if 'Number of blocks' in line:
                    nblocks = int(line[18:].strip('\n'))
                    if nblocks == 2:
                        label = 0
                    elif nblocks == 3:
                        label = 1
                    elif nblocks == 4:
                        label = 2
                    elif nblocks == 5:
                        label = 3
            labels.append(label)

# separate images and labels into train and test sets - 50% train, 50% evaluate
train_images = images[0:len(images)/2]
train_labels = labels[0:len(labels)/2]
test_images = images[len(images)/2:]
test_labels = labels[len(labels)/2:]

# convert dataset into numpy arrays
train_data_numpy = np.array(train_images, np.float32)
train_labels_numpy = np.array(train_labels, np.int32)
test_data_numpy = np.array(test_images, np.float32)
test_labels_numpy = np.array(test_labels, np.int32)



# Put images through CNN

# Create the Estimator
classifier = tf.estimator.Estimator(model_fn=cnn_model_fn, model_dir="models/cnn")

# Set up logging for predictions
tensors_to_log = {"probabilities": "softmax_tensor"}
logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=1)

# Train the model
train_input_fn = tf.estimator.inputs.numpy_input_fn(x={"images":train_data_numpy}, y=train_labels_numpy, batch_size=1, num_epochs=None, shuffle=True)
classifier.train(input_fn=train_input_fn, steps=200, hooks=[logging_hook])

# Evaluate the model and print results
eval_input_fn = tf.estimator.inputs.numpy_input_fn(x={"images":test_data_numpy}, y=test_labels_numpy, num_epochs=1, shuffle=False)
eval_results = classifier.evaluate(input_fn=eval_input_fn)
print(eval_results)

I am using Python 2.7.12 on Ubuntu 16.04. Any insight into why this NaN loss is occurring would be greatly appreciated.


Solution

  • Found the solution! Turns out previous checkpoints of the model were conflicting with the current training session, so I deleted everything in the folder my model saves checkpoints to and now it's training without any NaN loss errors.