I have a fairly simple problem. After I train my model in Keras, I use the save(filepath) method to save it. Afterwards, when I want to continue training, I load the model and start fitting it again, and the loss jumps to ~420 (from about 5), and I can't figure out why. According to the Keras docs, save() should store everything: the architecture, the optimizer state, and the weights.
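For reference, here is the documented round trip I'm relying on, as a minimal toy sketch (this is not my actual code; the model, data, and file name are just placeholders):

import numpy as np
from keras.layers import Dense
from keras.models import Sequential, load_model

x = np.random.rand(64, 10)
y = np.random.rand(64, 1)

toy = Sequential([Dense(1, input_shape=(10,))])
toy.compile('sgd', 'mse')
toy.fit(x, y, epochs=1, verbose=0)
toy.save('toy_checkpoint.h5')           # should store architecture, weights and optimizer state

toy = load_model('toy_checkpoint.h5')   # no re-compile needed if everything really is restored
toy.fit(x, y, epochs=2, initial_epoch=1, verbose=0)   # resume training

My actual training code follows.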
# imports (not shown in the original snippet; exact module paths assumed for standalone Keras 2.x)
import os
import cv2
import numpy as np
import keras as k
from keras.applications import mobilenet_v2 as mn
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.layers import Conv2D, Dropout, GlobalAveragePooling2D, ReLU, Softmax
from keras.models import Model, load_model
from keras.preprocessing.image import ImageDataGenerator
from keras.regularizers import l2
from sklearn.utils import class_weight

# preprocessing function
def get_random_eraser(p=0.5, s_l=0.02, s_h=0.4, r_1=0.3, r_2=1/0.3, v_l=0, v_h=255, pixel_level=False):
    def eraser(input_img):
        img_h, img_w, img_c = input_img.shape
        p_1 = np.random.rand()

        if p_1 > p:
            return norm(input_img)

        while True:
            s = np.random.uniform(s_l, s_h) * img_h * img_w
            r = np.random.uniform(r_1, r_2)
            w = int(np.sqrt(s / r))
            h = int(np.sqrt(s * r))
            left = np.random.randint(0, img_w)
            top = np.random.randint(0, img_h)
            if left + w <= img_w and top + h <= img_h:
                break

        if pixel_level:
            c = np.random.uniform(v_l, v_h, (h, w, img_c))
        else:
            c = np.random.uniform(v_l, v_h)

        input_img[top:top + h, left:left + w, :] = c
        input_img = norm(input_img)
        input_img = random_crop(input_img, (50, 50))
        return input_img

    return eraser
def norm(img):
    return img / 127.5 - 1.
def random_crop(img, random_crop_size):
    # Note: image_data_format is 'channels_last'
    assert img.shape[2] == 3
    height, width = img.shape[0], img.shape[1]
    dy, dx = random_crop_size
    x = np.random.randint(0, width - dx + 1)
    y = np.random.randint(0, height - dy + 1)
    crop = img[y:(y+dy), x:(x+dx), :]
    return cv2.resize(crop, (height, width), interpolation=cv2.INTER_LANCZOS4)
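
# (Hypothetical sanity check, not part of the original run.) The eraser should
# return an image of unchanged shape with values roughly in [-1, 1], because
# norm() is applied and random_crop() resizes the 50x50 crop back to the input size.
_eraser = get_random_eraser(s_l=0, s_h=0.8)
_sample = np.random.randint(0, 256, (256, 256, 3)).astype(np.float32)
_out = _eraser(_sample)
print(_out.shape, _out.min(), _out.max())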
model = mn.MobileNetV2(input_shape=None, alpha=1.0, include_top=False, weights='imagenet', classes=179)
model.summary()
l = model.layers
for layer in l:
    print(layer.get_config(), '\n')
    if 'kernel_regularizer' in layer.get_config():
        print('found kernel regularizer')
        layer.kernel_regularizer = l2(l=0.1)
        print('kernel regularizer', layer.kernel_regularizer)
    if 'bias_regularizer' in layer.get_config():
        print('found bias regularizer')
        layer.bias_regularizer = l2(l=0.1)
        print('bias regularizer', layer.bias_regularizer)
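
# (Hypothetical check, not in the original script.) As far as I know, assigning
# layer.kernel_regularizer after a layer has already been built does not by itself
# register any regularization loss, so this prints how many losses the loop above
# actually added (0 would mean the assignments had no effect on the training loss):
print('registered regularization losses:', len(model.losses))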
x = Dropout(0.7)(l[-1].output)
x = Conv2D(179, (1,1), activation='linear')(x)
x = ReLU()(x)
x = GlobalAveragePooling2D()(x)
x = Softmax()(x)
model_mod = Model(inputs=model.input, outputs=x)
gen_t = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=45,
    width_shift_range=0.3,
    height_shift_range=0.3,
    shear_range=0.3,
    zoom_range=0.3,
    preprocessing_function=get_random_eraser(s_l=0, s_h=0.8),
    validation_split=0.1
)
gen_v = ImageDataGenerator(
    preprocessing_function=norm,
    validation_split=0.1
)
early_stop = EarlyStopping(patience=10, restore_best_weights=True, verbose=True)
tb = TensorBoard(batch_size=32)
mc = ModelCheckpoint('mobilenetv2_combined.hdf5', monitor='val_loss', save_best_only=True, verbose=True)
train_generator = gen_t.flow_from_directory(os.path.join(DATA_FOLDER_PATH, 'data_mod', 'train'), target_size=(256, 256), batch_size=32, subset="training")
validation_generator = gen_v.flow_from_directory(os.path.join(DATA_FOLDER_PATH, 'data_mod', 'train'), target_size=(256, 256), batch_size=32, subset="validation")
class_weights = class_weight.compute_class_weight('balanced', np.unique(train_generator.classes), train_generator.classes)
model_mod.compile(k.optimizers.sgd(lr=0.001, momentum=0.9, nesterov=True), loss='categorical_crossentropy', metrics=['accuracy', 'top_k_categorical_accuracy'])
hist = model_mod.fit_generator(train_generator, validation_data=validation_generator,
                               epochs=1, initial_epoch=0,
                               callbacks=[early_stop, tb, mc], class_weight=class_weights)
model_mod.save('mobilenet_model_save.h5')
Found 17924 images belonging to 179 classes.
Found 1910 images belonging to 179 classes.
Epoch 1/1
561/561 [==============================] - 415s 741ms/step - loss: 4.9594 - acc: 0.0322 - top_k_categorical_accuracy: 0.1134 - val_loss: 4.4137 - val_acc: 0.0921 - val_top_k_categorical_accuracy: 0.2644
Epoch 00001: val_loss improved from inf to 4.41366, saving model to mobilenetv2_combined.hdf5
So this is the code I'm running for training. Below is essentially the same code for continuing training (included here just for illustration):
gen_t = ImageDataGenerator(
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=45,
    width_shift_range=0.3,
    height_shift_range=0.3,
    shear_range=0.3,
    zoom_range=0.3,
    preprocessing_function=get_random_eraser(s_l=0, s_h=0.8),
    validation_split=0.1
)
gen_v = ImageDataGenerator(
    preprocessing_function=norm,
    validation_split=0.1
)
early_stop = EarlyStopping(patience=10, restore_best_weights=True, verbose=True)
tb = TensorBoard(batch_size=32)
mc = ModelCheckpoint('mobilenetv2_combined.hdf5', monitor='val_loss', save_best_only=True, verbose=True)
train_generator = gen_t.flow_from_directory(os.path.join(DATA_FOLDER_PATH, 'data_mod', 'train'), target_size=(256, 256), batch_size=32, subset="training")
validation_generator = gen_v.flow_from_directory(os.path.join(DATA_FOLDER_PATH, 'data_mod', 'train'), target_size=(256, 256), batch_size=32, subset="validation")
model_mod = load_model('mobilenet_model_save.h5')
class_weights = class_weight.compute_class_weight('balanced', np.unique(train_generator.classes), train_generator.classes)
#model_mod.compile(adam(lr=0.0001, decay=1e-6), loss='categorical_crossentropy', metrics=['accuracy', 'top_k_categorical_accuracy'])
model_mod.compile(k.optimizers.sgd(lr=0.001, momentum=0.9, nesterov=True), loss='categorical_crossentropy', metrics=['accuracy', 'top_k_categorical_accuracy'])
hist = model_mod.fit_generator(train_generator, validation_data=validation_generator,
                               epochs=2, initial_epoch=1,
                               callbacks=[early_stop, tb, mc], class_weight=class_weights)
model_mod.save('mobilenet_model_save.h5')
Found 17924 images belonging to 179 classes.
Found 1910 images belonging to 179 classes.
Epoch 2/2
561/561 [==============================] - 373s 665ms/step - loss: 174.3220 - acc: 0.0815 - top_k_categorical_accuracy: 0.2320 - val_loss: 49.8441 - val_acc: 0.0110 - val_top_k_categorical_accuracy: 0.0455
Epoch 00002: val_loss improved from inf to 49.84411, saving model to mobilenetv2_combined.hdf5
Does anybody have any idea what's going on? I tried a very simple toy example with MNIST and everything seemed to work fine there. I'd be happy for any suggestion. One more interesting thing: it's only the value of the loss function that jumps. The accuracy of the network stays the same as it was after training; e.g., if the network finishes training with an accuracy of 40%, then when I resume training (with the huge loss jump), the accuracy is still 40%.
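One way to narrow this down (a diagnostic sketch I haven't actually run, reusing the names from the scripts above) would be to evaluate the same validation generator right before saving and right after loading, with no training in between; if the reloaded model already reports the huge loss on plain evaluation, the problem is in the saved/loaded model itself rather than in the training loop:

print('before save:', model_mod.evaluate_generator(validation_generator))
model_mod.save('mobilenet_model_save.h5')
reloaded = load_model('mobilenet_model_save.h5')
print('after load:', reloaded.evaluate_generator(validation_generator))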
So I haven't figured this out, but my guess is that it is either a problem with saving this sort of "custom" network (one built on a model from the applications module) or with the older Keras version 2.2.0 I'm stuck on (because of a SqueezeNet bug).
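If the culprit really is serializing this kind of modified applications model, one possible workaround (an untested sketch; the weights file name is made up) would be to checkpoint the weights only and rebuild the architecture in code before restoring them:

model_mod.save_weights('mobilenet_weights.h5')

# ... later: rebuild model_mod with exactly the same code as in the training script,
# then restore the weights and re-compile before resuming fit_generator.
model_mod.load_weights('mobilenet_weights.h5')
model_mod.compile(k.optimizers.sgd(lr=0.001, momentum=0.9, nesterov=True),
                  loss='categorical_crossentropy',
                  metrics=['accuracy', 'top_k_categorical_accuracy'])

This loses the optimizer state, but the continue-training script above re-compiles anyway, so nothing extra would be lost.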
I doubt this question is going to get more attention than it has in the last 10 days, so I'm closing it.
My "solution" was to train the network in a single go, without interruption.