Search code examples
keras, model, load, loss

Huge increase in loss function after loading model in Keras, custom data, heavy augmentation


I have a fairly simple problem. After training my model in Keras, I use the save(filepath) method to save it. Afterward, when I want to continue training, I load the model and start fitting, and the loss jumps to 420 (from about 5); I can't work out why. According to the Keras documentation, the save() method should save everything: the architecture, the optimizer state, and the weights.

#preprocessing function
def get_random_eraser(p=0.5, s_l=0.02, s_h=0.4, r_1=0.3, r_2=1/0.3, v_l=0, v_h=255, pixel_level=False):
    """Build a random-erasing preprocessing function.

    Args:
        p: probability that a rectangle is erased from a given image.
        s_l, s_h: lower/upper bound of the erased area, as a fraction of the
            image area.
        r_1, r_2: lower/upper bound of the rectangle's height/width ratio.
        v_l, v_h: lower/upper bound of the random fill value(s).
        pixel_level: if true, every erased pixel gets its own random value;
            otherwise the whole rectangle is filled with one random value.

    Returns:
        A function taking an image indexed as (height, width, channel) and
        returning the processed image.
    """
    def eraser(input_img):
        """Randomly erase a rectangle, then normalize and random-crop.

        The erased rectangle is written into ``input_img`` in place. When no
        erasing happens the image is only normalized (no random crop).
        """
        img_h, img_w, img_c = input_img.shape

        # Erase only with probability p; otherwise just normalize.
        if np.random.rand() > p:
            return norm(input_img)

        # Rejection-sample a rectangle until it lies fully inside the image.
        fits = False
        while not fits:
            area = np.random.uniform(s_l, s_h) * img_h * img_w
            aspect = np.random.uniform(r_1, r_2)
            w = int(np.sqrt(area / aspect))
            h = int(np.sqrt(area * aspect))
            left = np.random.randint(0, img_w)
            top = np.random.randint(0, img_h)
            fits = left + w <= img_w and top + h <= img_h

        # size=None yields a single scalar fill value for the whole rectangle.
        fill_shape = (h, w, img_c) if pixel_level else None
        fill = np.random.uniform(v_l, v_h, fill_shape)

        bottom, right = top + h, left + w
        input_img[top:bottom, left:right, :] = fill
        return random_crop(norm(input_img), (50, 50))

    return eraser

def norm(img):
    """Map pixel values from the [0, 255] range onto [-1, 1]."""
    scaled = img / 127.5
    return scaled - 1.

def random_crop(img, random_crop_size):
    """Cut a random window out of ``img`` and resize it back to the image size.

    Args:
        img: image array indexed as (height, width, channel) with exactly
            three channels.
        random_crop_size: ``(dy, dx)`` height and width of the window to cut.

    Returns:
        The cropped window, resized to the height and width of ``img``.
    """
    # Note: image_data_format is 'channel_last'
    assert img.shape[2] == 3
    height, width = img.shape[0], img.shape[1]
    dy, dx = random_crop_size
    x = np.random.randint(0, width - dx + 1)
    y = np.random.randint(0, height - dy + 1)
    crop = img[y:(y+dy), x:(x+dx), :]
    # cv2.resize expects dsize as (width, height), and its third positional
    # parameter is ``dst`` -- the interpolation flag must be given by keyword,
    # otherwise it is not used as the interpolation mode.
    return cv2.resize(crop, (width, height), interpolation=cv2.INTER_LANCZOS4)

# Backbone: MobileNetV2 without its top classifier, initialised from ImageNet weights.
model = mn.MobileNetV2(input_shape=None, alpha=1.0, include_top=False, weights='imagenet', classes=179)
model.summary()
l = model.layers

# Attach an L2 penalty to every layer that exposes kernel/bias regularizers.
# NOTE(review): these attributes are assigned on layers that are already built.
# Presumably Keras only registers regularization losses when a layer's weights
# are created, so the penalty may be absent from the loss of this in-memory
# model, yet it is part of each layer's config and would become active once
# the model is saved and rebuilt by load_model(). That would explain a loss
# that jumps after reloading while accuracy is unchanged -- confirm against the
# Keras version in use (e.g. compare model.losses before and after reloading).
for layer in l:
    layer_config = layer.get_config()
    print(layer_config, '\n')
    if 'kernel_regularizer' in layer_config:
        print('found kernel regularizer')
        layer.kernel_regularizer = l2(l=0.1)
        print('kernel regularizer', layer.kernel_regularizer)
    if 'bias_regularizer' in layer_config:
        print('found bias regularizer')
        layer.bias_regularizer = l2(l=0.1)
        print('bias regularizer', layer.bias_regularizer)

# Classification head: dropout, 1x1 convolution with one filter per class,
# ReLU, global average pooling over the spatial axes, then softmax.
x = Dropout(0.7)(l[-1].output)
x = Conv2D(179, (1, 1), activation='linear')(x)
x = ReLU()(x)
x = GlobalAveragePooling2D()(x)
x = Softmax()(x)
model_mod = Model(inputs=model.input, outputs=x)

# Training generator: geometric augmentation plus random erasing; the
# preprocessing function also normalizes the pixel values.
gen_t = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=45,
    width_shift_range=0.3,
    height_shift_range=0.3,
    shear_range=0.3,
    zoom_range=0.3,
    preprocessing_function=get_random_eraser(s_l=0, s_h=0.8),
    validation_split=0.1,
)
# Validation generator: normalization only, same 10% split.
gen_v = ImageDataGenerator(
    preprocessing_function=norm,
    validation_split=0.1,
)

# Both subsets are read from the same directory and separated by validation_split.
train_generator = gen_t.flow_from_directory(
    os.path.join(DATA_FOLDER_PATH, 'data_mod', 'train'),
    target_size=(256, 256),
    batch_size=32,
    subset="training",
)
validation_generator = gen_v.flow_from_directory(
    os.path.join(DATA_FOLDER_PATH, 'data_mod', 'train'),
    target_size=(256, 256),
    batch_size=32,
    subset="validation",
)

# Callbacks: early stopping, TensorBoard logging, best-val_loss checkpointing.
early_stop = EarlyStopping(patience=10, restore_best_weights=True, verbose=True)
tb = TensorBoard(batch_size=32)
mc = ModelCheckpoint(
    'mobilenetv2_combined.hdf5',
    monitor='val_loss',
    save_best_only=True,
    verbose=True,
)

# Weight classes inversely to their frequency in the training subset.
class_weights = class_weight.compute_class_weight(
    'balanced',
    np.unique(train_generator.classes),
    train_generator.classes,
)

model_mod.compile(
    k.optimizers.sgd(lr=0.001, momentum=0.9, nesterov=True),
    loss='categorical_crossentropy',
    metrics=['accuracy', 'top_k_categorical_accuracy'],
)

# Train for the first epoch, then persist the full model.
hist = model_mod.fit_generator(
    train_generator,
    validation_data=validation_generator,
    epochs=1,
    initial_epoch=0,
    callbacks=[early_stop, tb, mc],
    class_weight=class_weights,
)
model_mod.save('mobilenet_model_save.h5')

Found 17924 images belonging to 179 classes.
Found 1910 images belonging to 179 classes.
Epoch 1/1
561/561 [==============================] - 415s 741ms/step - loss: 4.9594 - acc: 0.0322 - top_k_categorical_accuracy: 0.1134 - val_loss: 4.4137 - val_acc: 0.0921 - val_top_k_categorical_accuracy: 0.2644

Epoch 00001: val_loss improved from inf to 4.41366, saving model to mobilenetv2_combined.hdf5

So this is the code I'm running for training. Now, basically the same code for continuing training (this is just for illustration):

# Training generator: geometric augmentation plus random erasing; the
# preprocessing function also normalizes the pixel values.
gen_t = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=45,
    width_shift_range=0.3,
    height_shift_range=0.3,
    shear_range=0.3,
    zoom_range=0.3,
    preprocessing_function=get_random_eraser(s_l=0, s_h=0.8),
    validation_split=0.1,
)
# Validation generator: normalization only, same 10% split.
gen_v = ImageDataGenerator(
    preprocessing_function=norm,
    validation_split=0.1,
)
early_stop = EarlyStopping(patience=10, restore_best_weights=True, verbose=True)
tb = TensorBoard(batch_size=32)
mc = ModelCheckpoint('mobilenetv2_combined.hdf5', monitor='val_loss', save_best_only=True, verbose=True)

train_generator = gen_t.flow_from_directory(
    os.path.join(DATA_FOLDER_PATH, 'data_mod', 'train'),
    target_size=(256, 256),
    batch_size=32,
    subset="training",
)

validation_generator = gen_v.flow_from_directory(
    os.path.join(DATA_FOLDER_PATH, 'data_mod', 'train'),
    target_size=(256, 256),
    batch_size=32,
    subset="validation",
)

# load_model() restores the architecture, the weights and the compiled
# training configuration, including the optimizer state. The model is
# deliberately NOT compiled again here: compile() would install a fresh
# optimizer and throw away the restored SGD momentum state.
# NOTE(review): load_model() rebuilds the layers from their saved config, so
# any regularizers recorded in that config presumably contribute to the loss
# from here on -- check model_mod.losses if the loss differs from the
# previous session.
model_mod = load_model('mobilenet_model_save.h5')

# Weight classes inversely to their frequency in the training subset.
class_weights = class_weight.compute_class_weight('balanced', np.unique(train_generator.classes), train_generator.classes)

# Continue with the second epoch and overwrite the saved model.
hist = model_mod.fit_generator(
    train_generator,
    validation_data=validation_generator,
    epochs=2,
    initial_epoch=1,
    callbacks=[early_stop, tb, mc],
    class_weight=class_weights,
)
model_mod.save('mobilenet_model_save.h5')
Found 17924 images belonging to 179 classes.
Found 1910 images belonging to 179 classes.
Epoch 2/2
561/561 [==============================] - 373s 665ms/step - loss: 174.3220 - acc: 0.0815 - top_k_categorical_accuracy: 0.2320 - val_loss: 49.8441 - val_acc: 0.0110 - val_top_k_categorical_accuracy: 0.0455

Epoch 00002: val_loss improved from inf to 49.84411, saving model to mobilenetv2_combined.hdf5

Does anybody have any idea what's going on? I tried a very simple toy example with MNIST and everything seems to work fine. I'd be happy for any suggestion. One more interesting thing: only the value of the loss function changes. The accuracy of the network stays the same as after training; e.g. if the network finishes training with an accuracy of 40%, then when I resume training (with the huge loss jump) the accuracy is still 40%.


Solution

  • So I haven't figured this out, but my guess is that it is either a problem with saving a sort of "custom" network (from the applications module) or a consequence of using the older version 2.2.0 (which I use because of a squeezenet bug).

    I doubt that this question is going to get more attention than it has in the last 10 days, so I'm closing the question.

    My "solution" was to train the network in a single go, without interruption.