I made a model that predicts the MNIST digits using tf.nn.softmax, but it didn't work: the cost was printed as nan.

I know it works if I use tf.nn.softmax_cross_entropy_with_logits, like this:

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hypothesis, labels=Y)) + (0.01 * l2reg)

So I think my cost code has a problem. Why doesn't the approach of applying tf.nn.softmax and computing the cross entropy myself work in a neural net? And is my use of l2reg correct?

Thanks!
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import matplotlib.pyplot as plt
import random
import numpy as np
mnist = input_data.read_data_sets('MNIST_data/',one_hot=True)
nb_classes = 10
X = tf.placeholder(tf.float32,[None,28*28], name='x-input')
Y = tf.placeholder(tf.float32,[None,nb_classes], name='y-input')
with tf.name_scope('layer1') as scope:
    W1 = tf.Variable(tf.random_normal([28*28, 28*28]), name='weight1')
    b1 = tf.Variable(tf.random_normal([28*28]), name='bias1')
    layer1 = tf.nn.relu(tf.matmul(X, W1) + b1)
    w1_hist = tf.summary.histogram('weight1', W1)
    layer1_hist = tf.summary.histogram('layer1', layer1)
with tf.name_scope('layer2') as scope:
    W2 = tf.Variable(tf.random_normal([28*28, 28*28]), name='weight2')
    b2 = tf.Variable(tf.random_normal([28*28]), name='bias2')
    layer2 = tf.nn.relu(tf.matmul(layer1, W2) + b2)
    # 1. From the TF graph, decide which tensors you want to log
    w2_hist = tf.summary.histogram('weight2', W2)
    layer2_hist = tf.summary.histogram('layer2', layer2)
with tf.name_scope('layer3') as scope:
    W3 = tf.Variable(tf.random_normal([28*28, nb_classes]), name='weight3')
    b3 = tf.Variable(tf.random_normal([nb_classes]), name='bias3')
    logits = tf.matmul(layer2, W3) + b3
    # hypothesis = tf.div(tf.exp(logits), tf.reduce_sum(tf.exp(logits), 1, keep_dims=True))  # manual softmax, unused
    hypothesis = tf.nn.softmax(logits)
    w3_hist = tf.summary.histogram('weight3', W3)
    hypothesis_hist = tf.summary.histogram('hypothesis', hypothesis)
with tf.name_scope('cost') as scope:
    # L2 regularization for a deep net: sum a term for every weight matrix
    # (note: tf.nn.l2_loss(W) is sum(W ** 2) / 2, so the W1 term is scaled differently here)
    l2reg = tf.reduce_sum(tf.square(W1)) + tf.nn.l2_loss(W2) + tf.nn.l2_loss(W3)
    cost = tf.reduce_mean(-tf.reduce_sum(Y * tf.log(hypothesis), reduction_indices=1)) + (0.01 * l2reg)
    # cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hypothesis, labels=Y)) + (0.01 * l2reg)  # => this worked very well
    cost_summ = tf.summary.scalar('cost', cost)
with tf.name_scope('train') as scope:
    train = tf.train.AdamOptimizer(learning_rate=1e-2).minimize(cost)
predicted = tf.argmax(hypothesis,1)
correction = tf.cast(tf.equal(predicted,tf.argmax(Y,1)),dtype=tf.float32)
Accuracy = tf.reduce_mean(correction)
# parameters
training_epochs = 15
batch_size = 100
with tf.Session() as sess:
    # 2. merge all summaries
    summary = tf.summary.merge_all()
    # 3. Create writer and add graph
    writer = tf.summary.FileWriter('./logs/mnist_l2reg_1e-2', sess.graph)
    # writer.add_graph(sess.graph)
    # 4. Run summary merge and add_summary
    sess.run(tf.global_variables_initializer())
    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(mnist.train.num_examples / batch_size)
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            s, cost_val, _ = sess.run([summary, cost, train], feed_dict={X: batch_xs, Y: batch_ys})
            writer.add_summary(s, global_step=i)
            avg_cost += cost_val / total_batch
        print('{:5} cost: {:.2f}'.format(epoch + 1, avg_cost))
    print('Accuracy: ', Accuracy.eval(session=sess, feed_dict={X: mnist.test.images, Y: mnist.test.labels}))

    # Pick one test example and predict it
    r = random.randint(0, mnist.test.num_examples - 1)
    print(r, 'test_num: {}, train_num: {}'.format(mnist.test.num_examples, mnist.train.num_examples))
    # labels[r] indexes a single row (a 1-D one-hot vector);
    # slicing like labels[r:r+1] keeps the leading dimension, giving shape (1, 10)
    print(mnist.test.labels[r], np.shape(mnist.test.labels))
    print('argmax of the 1-D label vector (no axis needed): {}'.format(tf.argmax(mnist.test.labels[r]).eval(session=sess)))
    print('Label:', sess.run(tf.argmax(mnist.test.labels[r:r + 1], 1)))
    print('Prediction:', sess.run(tf.argmax(hypothesis, 1), feed_dict={X: mnist.test.images[r:r + 1]}))
    plt.imshow(mnist.test.images[r:r + 1].reshape(28, 28), cmap='Greys', interpolation='nearest')
    plt.show()
The issue you are experiencing is exactly why tf.nn.softmax_cross_entropy_with_logits is so important to use: the numerical instability of the log operation itself.
Explanation: you have a rather large network that will end up being very confident in some classifications. In particular, it will end up assigning extremely low probability to some images (say, a picture of a 1) being of a particular class (say, class 5). The logit will then be quite negative, and the tf.nn.softmax entry for that highly negative logit can be numerically zero (not exactly zero, but with finite precision it will be represented as zero). When you then take the log to calculate the cross entropy yourself, you get numerical issues, resulting in nan values for your cost. The function tf.nn.softmax_cross_entropy_with_logits deals with this by using a trick to avoid log and exp under/overflow. That trick is sometimes called the exp-normalize trick; see this blog post (not written by me; I just think it's a clear explanation) for more detail.
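To make this concrete, here is a minimal NumPy sketch (not taken from your code; the logit and label values are invented for illustration) showing the naive log(softmax) producing nan, and the same quantity computed stably with the exp-normalize trick:

import numpy as np

# Logits from a very confident network, in float32 (TensorFlow's default dtype)
logits = np.array([10.0, 40.0, -80.0], dtype=np.float32)
labels = np.array([0.0, 1.0, 0.0], dtype=np.float32)   # one-hot; the true class is the confident one

# Naive route (what the manual cost does): softmax first, then log
probs = np.exp(logits) / np.sum(np.exp(logits))
print(probs)                            # [9.4e-14, 1.0, 0.0] -- the last entry underflows to exactly 0
print(-np.sum(labels * np.log(probs)))  # nan, because 0 * log(0) = 0 * (-inf) = nan

# Exp-normalize trick: stay in log space and subtract the max logit
# before exponentiating, so nothing under- or overflows
m = np.max(logits)
log_softmax = logits - (m + np.log(np.sum(np.exp(logits - m))))
print(-np.sum(labels * log_softmax))    # ~0.0, finite -- no nan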
In short: use tf.nn.softmax_cross_entropy_with_logits and don't try to calculate the cross entropy yourself.
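Applied to the cost block in your code, that looks roughly like the sketch below (a sketch, not a tested drop-in). One extra detail: the op wants the raw logits tensor, not hypothesis = tf.nn.softmax(logits); feeding it hypothesis, as in your commented-out line, applies softmax twice.

with tf.name_scope('cost') as scope:
    # tf.nn.l2_loss(W) computes sum(W ** 2) / 2; use it for every weight matrix
    l2reg = tf.nn.l2_loss(W1) + tf.nn.l2_loss(W2) + tf.nn.l2_loss(W3)
    # softmax + cross entropy in one numerically stable op, fed with the raw logits
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y)
    ) + 0.01 * l2reg
    cost_summ = tf.summary.scalar('cost', cost)

# hypothesis = tf.nn.softmax(logits) is still fine for predictions and accuracy;
# it just should not be fed into the loss.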