Tags: python, tensorflow, batch-normalization

How to use BatchNormalization with tensorflow?


I am having trouble using batch normalization with TensorFlow. I have built the following model:

import os
import numpy as np
import tensorflow as tf

def weight_variable(kernal_shape):
    weights = tf.get_variable(name='weights', shape=kernal_shape, dtype=tf.float32, trainable=True,
                        initializer=tf.truncated_normal_initializer(stddev=0.02))
    return weights
def bias_variable(shape):
    initial = tf.constant(0.0, shape=shape)
    return tf.Variable(initial)

# return 1 conv layer
def conv_layer(x, w_shape, b_shape, is_training, padding='SAME'):
    W = weight_variable(w_shape)
    tf.summary.histogram("weights", W)

    b = bias_variable(b_shape)
    tf.summary.histogram("biases", b)

    # Note that a stride of 2 is used on purpose so that no max-pooling layer is needed.
    conv = tf.nn.conv2d(x, W, strides=[1, 2, 2, 1], padding=padding) + b
    conv = tf.contrib.layers.batch_norm(conv, scale=True, is_training=is_training)

    activations = tf.nn.relu(conv)

    tf.summary.histogram("activations", activations)

    return activations

# return deconv layer
def deconv_layer(x, w_shape, b_shape, is_training, padding="SAME", activation='relu'):
    W = weight_variable(w_shape)
    tf.summary.histogram("weights", W)

    b = bias_variable(b_shape)
    tf.summary.histogram('biases', b)

    x_shape = tf.shape(x)
    # Output shape: [batch_size, h * 2, w * 2, out_channels], where out_channels = w_shape[2]
    # (conv2d_transpose filters are shaped [height, width, out_channels, in_channels]).
    out_shape = tf.stack([x_shape[0], x_shape[1] * 2, x_shape[2] * 2, w_shape[2]])
    # Note that a stride of 2 is used here to mirror the stride of 2 in the conv layers.

    conv_trans = tf.nn.conv2d_transpose(x, W, out_shape, [1, 2, 2, 1], padding=padding) + b
    conv_trans = tf.contrib.layers.batch_norm(conv_trans, scale=True, is_training=is_training)

    if activation == 'relu':
        transposed_activations = tf.nn.relu(conv_trans)
    else:
        transposed_activations = tf.nn.sigmoid(conv_trans)

    tf.summary.histogram("transpose_activation", transposed_activations)
    return transposed_activations

def model(input):
    with tf.variable_scope('conv1'):
        conv1 = conv_layer(input, [4, 4, 3, 32], [32], is_training=phase_train)  # image size: [56, 56]
    with tf.variable_scope('conv2'):
        conv2 = conv_layer(conv1, [4, 4, 32, 64], [64], is_training=phase_train)  # image size: [28, 28]
    with tf.variable_scope('conv3'):
        conv3 = conv_layer(conv2, [4, 4, 64, 128], [128], is_training=phase_train)  # image size: [14, 14]
    with tf.variable_scope('conv4'):
        conv4 = conv_layer(conv3, [4, 4, 128, 256], [256], is_training=phase_train)  # image size: [7, 7]
        conv4_reshaped = tf.reshape(conv4, [batch_size * num_participants, 7 * 7 * 256], name='conv4_reshaped')

    w_c_mu = tf.Variable(tf.truncated_normal([7 * 7 * 256, latent_dim], stddev=0.1), name='weight_fc_mu')
    b_c_mu = tf.Variable(tf.constant(0.1, shape=[latent_dim]), name='biases_fc_mu')
    w_c_sig = tf.Variable(tf.truncated_normal([7 * 7 * 256, latent_dim], stddev=0.1), name='weight_fc_sig')
    b_c_sig = tf.Variable(tf.constant(0.1, shape=[latent_dim]), name='biases_fc_sig')
    epsilon = tf.random_normal([1, latent_dim])

    tf.summary.histogram('weights_c_mu', w_c_mu)
    tf.summary.histogram('biases_c_mu', b_c_mu)
    tf.summary.histogram('weights_c_sig', w_c_sig)
    tf.summary.histogram('biases_c_sig', b_c_sig)

    with tf.variable_scope('mu'):
        mu = tf.nn.bias_add(tf.matmul(conv4_reshaped, w_c_mu), b_c_mu)
        tf.summary.histogram('mu', mu)

    with tf.variable_scope('stddev'):
        stddev = tf.nn.bias_add(tf.matmul(conv4_reshaped, w_c_sig), b_c_sig)
        tf.summary.histogram('stddev', stddev)

    with tf.variable_scope('z'):
        # This formula was adopted from the following paper: http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7979344
        latent_var = mu + tf.multiply(tf.sqrt(tf.exp(stddev)), epsilon)
        tf.summary.histogram('features_sig', stddev)

    with tf.variable_scope('GRU'):
        print(latent_var.get_shape().as_list())
        latent_var = tf.reshape(latent_var, shape=[int(batch_size / 100)* num_participants, time_steps, latent_dim])

        cell = tf.nn.rnn_cell.GRUCell(cell_size)   # state_size of cell_size.
        H, C = tf.nn.dynamic_rnn(cell, latent_var, dtype=tf.float32)  # H size: [batch_size * num_participants, SEQLEN, cell_size]
        H = tf.reshape(H, [batch_size * num_participants, cell_size])

    with tf.variable_scope('output'):
        # output layer.
        w_output = tf.Variable(tf.truncated_normal([cell_size, 1], mean=0, stddev=0.01, dtype=tf.float32), name='w_output')
        tf.summary.histogram('w_output', w_output)
        b_output = tf.get_variable('b_output', shape=[1], dtype=tf.float32,
                                   initializer=tf.constant_initializer(0.0))
        predictions = tf.add(tf.matmul(H, w_output), b_output, name='softmax_output')
        tf.summary.histogram('output', predictions)

        var_list = [v for v in tf.global_variables() if 'GRU' in v.name]
        var_list.extend([w_output, b_output])  # extend (not append) so the list stays flat

    return predictions, var_list

In addition, I am restoring the model parameters as follows:

saver_torestore = tf.train.Saver()

with tf.Session() as sess:
    train_writer = tf.summary.FileWriter(events_path, sess.graph)
    merged = tf.summary.merge_all()

    to_run_list = [merged, RMSE]

    # Initialize `iterator` with training data.
    sess.run(init_op)

    # Note that the last name "Graph_model" is the name of the saved checkpoints file => the ckpt is saved
    # under tensorboard_logs.
    ckpt = tf.train.get_checkpoint_state(
        os.path.dirname(model_path))
    if ckpt and ckpt.model_checkpoint_path:
        saver_torestore.restore(sess, ckpt.model_checkpoint_path)
        print('checkpoints restored!')
    else:
        print('No stored checkpoints')

    counter = 0
    for _ in range(num_epoch):
        sess.run(iterator.initializer)
        print('epoch:', _)

        # This while loop runs until the iterator is exhausted at the end of the current epoch.
        while True:
            try:
                summary, loss_ = sess.run(to_run_list, feed_dict={phase_train: False})

                print('loss: ' + str(loss_))

                losses.append(loss_)
                counter += 1

                train_writer.add_summary(summary, counter)

            except tf.errors.OutOfRangeError:
                print('error, ignore ;) ')
                break

    print('average losses:', np.average(losses))
    train_writer.close()

To make sure that the variables were actually saved, I ran the following:

def assign_values_to_batchNorm():
    vars = [v for v in tf.global_variables() if "BatchNorm" in v.name and "Adam" not in v.name]
    file_names = [(v.name[:-2].replace("/", "_") + ".txt") for v in vars]
    for var, file_name in zip(vars, file_names):
        lst = open(file_name).read().split(";")[:-1]
        print(lst)
        values = list(map(np.float32, lst))
        tf.assign(var, values)

Please note that I used this method to restore the values of the moving mean and moving variance manually, but I got the same result.

I called assign_values_to_batchNorm() inside the session and got some values back, so it seems that the moving mean, moving variance, gamma and beta were all saved.
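For reference, since tf.assign only adds an assignment op to the graph, the values are only actually written once that op is run in the session. A minimal sketch of executing the assignments explicitly (same variable selection and file format as above; the helper name is just illustrative) would be:

def run_batchnorm_assignments(sess):
    # Same selection of batch-norm variables as assign_values_to_batchNorm(),
    # but each assign op is explicitly executed in the session.
    bn_vars = [v for v in tf.global_variables() if "BatchNorm" in v.name and "Adam" not in v.name]
    file_names = [(v.name[:-2].replace("/", "_") + ".txt") for v in bn_vars]
    for var, file_name in zip(bn_vars, file_names):
        lst = open(file_name).read().split(";")[:-1]
        values = list(map(np.float32, lst))
        sess.run(tf.assign(var, values))  # running the op is what writes the values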

Please note that I am working on Windows 10 with TensorFlow version 1.3.

So, whenever I run summary, loss_ = sess.run(to_run_list, feed_dict={phase_train: True}) inside the session, after initializing/restoring all variables, I get an RMSE of 0.022, which is the same error achieved at the end of training the model. If I set phase_train to False instead, I get an RMSE of 0.038. Note that I am only testing the network at the moment; even though I am evaluating on the training dataset, my purpose is just to compare the network's behavior in training mode versus testing mode. This difference seems very weird to me. Also note that the phase is a placeholder, declared in the code as follows:

phase_train = tf.placeholder(dtype=tf.bool, name='phase')

In addition, here is the code snippet for the optimizer:

with tf.name_scope('optimizer'):
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdamOptimizer(0.00001).minimize(RMSE) 
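For what it's worth, tf.contrib.layers.batch_norm also has an updates_collections argument; setting it to None makes the layer update its moving mean/variance in place, so no explicit control dependency on UPDATE_OPS is needed. This is only a sketch of that variant, not the setup used above:

# Alternative (sketch): let batch_norm apply its moving-average updates in place.
conv = tf.contrib.layers.batch_norm(conv, scale=True, is_training=is_training,
                                    updates_collections=None)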

Main Problem: RMSE = 0.038 when phase = False and 0.022 when phase = True.

Any help is much appreciated!!


Solution

  • So I thought that there might be a problem with how the batch normalization layer is used. I therefore created a simple model and trained it on the MNIST dataset under two scenarios: in the first case the model is trained with batch norm, in the second it is trained without batch norm.

    Now, if we compare the test results with and without batch norm, we get higher accuracy (or lower loss) when using BN. Remember that for the model that includes BN, the phase is set to False during testing. Therefore, we can conclude that a model with BN performs better than one without BN.

    Second, consider only the model that was trained with batch normalization. If we compare the loss on the test set while setting the phase to True on one hand and False on the other, we achieve better results when setting the phase to True, because, intuitively, the statistics of the current batch are more accurate than the moving statistics accumulated over the training dataset.

    In conclusion, the behavior arises after training the model with batch normalization and then testing it with the phase set to True versus False: we should expect a better (lower) loss when the phase is set to True rather than False. A minimal sketch of the kind of experiment described here is given below.
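
    The following is only a rough sketch of such a comparison on MNIST (layer sizes, learning rate and number of steps are illustrative, not the exact model used); it trains one small network with a batch-norm layer and then evaluates the same test set with the phase set to False and to True:

    import tensorflow as tf
    from tensorflow.examples.tutorials.mnist import input_data

    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

    x = tf.placeholder(tf.float32, [None, 784])
    y = tf.placeholder(tf.float32, [None, 10])
    phase_train = tf.placeholder(tf.bool, name='phase')

    # One hidden layer with batch norm. is_training switches between the
    # statistics of the current batch (True) and the stored moving averages (False).
    hidden = tf.layers.dense(x, 128, use_bias=False)
    hidden = tf.contrib.layers.batch_norm(hidden, scale=True, is_training=phase_train)
    hidden = tf.nn.relu(hidden)
    logits = tf.layers.dense(hidden, 10)

    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits))
    correct = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    # Make sure the moving mean/variance updates run during training.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(1000):
            batch_x, batch_y = mnist.train.next_batch(64)
            sess.run(train_op, feed_dict={x: batch_x, y: batch_y, phase_train: True})

        # Evaluate the same test set twice: once with the moving averages
        # (phase False) and once with per-batch statistics (phase True).
        acc_false = sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels, phase_train: False})
        acc_true = sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels, phase_train: True})
        print('accuracy with phase=False:', acc_false)
        print('accuracy with phase=True :', acc_true)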