Tags: tensorflow, conv-neural-network, tensorboard, loss-function

Why does my loss function oscillate only in the middle?


I'm building a convolutional neural network, and the loss function always oscillates more in the middle of training. How can I fix it?

[Loss plot]

My data set is only 100 images; I want to overfit it to check that everything works. But the loss always shows a larger oscillation in the central part of the curve. I have tried lowering the learning rate, but the result is always the same: there is always more oscillation in the middle of the curve. I even used all 100 images as the batch size, but the oscillation does not decrease. My code is below. Why can this happen, and how can I fix it?

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import os

train_path='/Users/David/Deskt...'

batch_size_train=100
num_epochs=1


tf.logging.set_verbosity(tf.logging.INFO)
sess=tf.Session()

#Convolutional Model

def cnn_model(features,labels,mode):


    #Input layer
    input_layer=tf.reshape(features["x"],[-1,224,224,3])


    #Convolutional layer 1
    conv1=tf.layers.conv2d(
        inputs=input_layer,
        filters=64,
        kernel_size=[10,10],
        padding="same",
        activation=tf.nn.relu,
        name="Convolucion_1")

    #Pooling 1
    pool1=tf.layers.max_pooling2d(inputs=conv1,pool_size=[2,2],strides=2,name="Pool_1")

    conv2=tf.layers.conv2d(
        inputs=pool1,
        filters=128,
        kernel_size=[10,10],
        padding="same",
        activation=tf.nn.relu,
        name="Convolucion_2")

    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2,name="Pool_2")

    conv3=tf.layers.conv2d(
        inputs=pool2,
        filters=192,
        kernel_size=[10,10],
        padding="same",
        activation=tf.nn.relu,
        name="Convolucion_3")

    pool3 = tf.layers.max_pooling2d(inputs=conv3, pool_size=[2, 2], strides=2,name="Pool_3")

    conv4=tf.layers.conv2d(
        inputs=pool3,
        filters=256,
        kernel_size=[10,10],
        padding="same",
        activation=tf.nn.relu,
        name="Convolucion_4")

    pool4 = tf.layers.max_pooling2d(inputs=conv4, pool_size=[2, 2], strides=2,name="Pool_4")

    conv5=tf.layers.conv2d(
        inputs=pool4,
        filters=320,
        kernel_size=[10,10],
        padding="same",
        activation=tf.nn.relu,
        name="Convolucion_5")

    pool5 = tf.layers.max_pooling2d(inputs=conv5, pool_size=[2, 2], strides=2,name="Pool_5")

    #224 is halved by five stride-2 pools (224->112->56->28->14->7), so pool5 is 7x7x320
    pool5_flat=tf.reshape(pool5,[-1,7*7*320],name="Flat_Pool")

    #Deep neural network
    dense=tf.layers.dense(inputs=pool5_flat,units=10000,activation=tf.nn.relu,name="Capa_1")

    dense1=tf.layers.dense(inputs=dense,units=7000,activation=tf.nn.relu,name="Capa_2")

    dense2=tf.layers.dense(inputs=dense1,units=4000,activation=tf.nn.relu,name="Capa_3")

    dense3=tf.layers.dense(inputs=dense2,units=1000,activation=tf.nn.relu,name="Capa_4")

    dense4=tf.layers.dense(inputs=dense3,units=500,activation=tf.nn.relu,name="Capa_5")

    logits=tf.layers.dense(inputs=dense4,units=2,name="Capa_final")

    onehot_labels = tf.one_hot(indices=labels, depth=2)

    t=tf.nn.softmax(logits, name="softmax_tensor")

    loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)

    tf.summary.scalar('loss',loss)

    ds=tf.train.SummarySaverHook(save_steps=1,output_dir="/Users/David/Desktop/David/Tesis/Practica/Programas/CNN/Model_Chekpoint",summary_op=tf.summary.merge_all())

    loss_hook = tf.train.LoggingTensorHook(tensors={"loss":loss}, every_n_iter=1)


    if mode==tf.estimator.ModeKeys.TRAIN:

        optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op=optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode,loss=loss,train_op=train_op,training_hooks=[ds,loss_hook])


def read_file(filename_queue):

    #Read the tf.record file and return the next record
    reader=tf.TFRecordReader()
    _,serialized_example=reader.read(filename_queue)

    #Decode the tf.record, returning a dictionary of features
    feature={'train/image':tf.FixedLenFeature([],tf.string),
             'train/label':tf.FixedLenFeature([],tf.int64)}
    features=tf.parse_single_example(serialized_example,features=feature,name="Decodificacion_Parse")

    #Convert the decoded image string to numbers and scale to [0, 1]
    image=tf.decode_raw(features['train/image'],tf.float32,name="imagenes_decod")* (1 / 255.0)

    #Cast the label to int32
    label=tf.cast(features['train/label'],dtype=tf.int32,name="label_decod")

    #Reshape the data
    image=tf.reshape(image,[224,224,3])


    return image,label


def input_pipeline(filenames,batch_size):


    #Create a queue with the list of input files
    filename_queue=tf.train.string_input_producer([filenames],num_epochs=1,shuffle=True,name="Creacion_lista_archiv")
    images,labels=read_file(filename_queue)

    #Shuffle the input data
    min_after_dequeue=100
    capacity=min_after_dequeue+3*batch_size
    images,labels=tf.train.shuffle_batch([images,labels],batch_size=batch_size,capacity=capacity,num_threads=2,min_after_dequeue=min_after_dequeue,name="Shuffle_data_in")

    return images,labels


def main(unused_argv):


    #Read and decode the data
    img_train,lbl_train=input_pipeline(train_path,batch_size_train)

    #Estimator - model
    gun_detector=tf.estimator.Estimator(model_fn=cnn_model,model_dir="/Users/David/Desktop/David/Tesis/Practica/Programas/CNN/Model_Chekpoint")

    #Initialize the variables and run the session
    init_op=tf.group(tf.global_variables_initializer(),tf.local_variables_initializer())
    sess.run(init_op)

    #Run the queues that were created in the computational graph
    coord = tf.train.Coordinator()
    threads=tf.train.start_queue_runners(sess=sess,coord=coord)



    try:
        while not coord.should_stop():

            img,lbl=sess.run([img_train,lbl_train])

            train_input_fn = tf.estimator.inputs.numpy_input_fn(
                x={"x": img},
                y=lbl,
                batch_size=70,
                num_epochs=None,
                shuffle=True)

            gun_detector.train(
                input_fn=train_input_fn,
                steps=5000)


    except tf.errors.OutOfRangeError:
        print('Done training -- epoch limit reached')
    finally:
        coord.request_stop()

    coord.join(threads)
    sess.close()


if __name__ == '__main__':
    tf.app.run()

Solution

  • The oscillations occur because of the learning rate. If you learn too fast, you skip over the local minima and your loss function diverges. If you make your learning rate too small, you will never converge, or converge very slowly. You can fiddle with your learning rate to get rid of those oscillations, but then you run the risk of overtraining your model. Your graph looks fine to me; as long as you converge in a reasonable amount of time, you shouldn't care what happens in the middle.
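
    If you still want to damp the mid-run oscillations, one common option is to decay the learning rate over time instead of keeping it fixed at 0.001. Below is a minimal sketch of how the TRAIN branch of cnn_model could do this with tf.train.exponential_decay (TF 1.x, matching the code in the question); the decay_steps and decay_rate values are illustrative assumptions, not tuned settings.

    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_global_step()
        # Start at the original rate and shrink it as training progresses.
        learning_rate = tf.train.exponential_decay(
            learning_rate=0.001,    # initial rate, as in the question
            global_step=global_step,
            decay_steps=500,        # assumption: decay every 500 steps
            decay_rate=0.9,         # assumption: multiply the rate by 0.9
            staircase=True)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
        train_op = optimizer.minimize(loss=loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op,
            training_hooks=[ds, loss_hook])

    With staircase=True the rate drops in discrete steps, so you can see on the TensorBoard loss curve exactly when each decay kicks in.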