python tensorflow machine-learning keras neural-network

Why does my neural network predict the incorrect class label for test images belonging to one class, despite having a high validation accuracy?

I am using the Inception v4 model to train a classifier on 3 classes A, B and C, each having roughly 900 images in the training dataset and 80 images in the validation set. I ran my training code for 200 epochs with a batch size of 8. I was getting an average validation accuracy of more than 99% with a very low loss:

Epoch 199/200
303/303 [==============================] - 53s 174ms/step - loss: 0.0026 - accuracy: 0.9996 - val_loss: 5.1226e-04 - val_accuracy: 1.0000
Epoch 200/200
303/303 [==============================] - 53s 176ms/step - loss: 0.0019 - accuracy: 1.0000 - val_loss: 0.1079 - val_accuracy: 0.9750

When I run my test code on the images in directory A of the validation set, it predicts 80% of the images as class A, 20% as class C and none at all as class B. The same happens with directory C (80% as C, 20% as A). On directory B, every image is predicted as either class A or class C. In all three test cases, not a single image is classified as class B by the test program, despite the high validation accuracy and despite using the exact same directory that was used for validation at training time (the latter also leads me to believe that it's not caused primarily by overfitting).

This was the output of the test program on directory B:

25/25 [==============================] - 8s 186ms/step - loss: 0.0212 - accuracy: 0.9963
['loss', 'accuracy']
[0.02124088630080223, 0.9963099360466003]
Testing images located in val/B/
[[6.2504888e-01 8.8258091e-08 3.7495103e-01]]
A:62.5%
[[8.8602149e-01 1.3459101e-05 1.1396510e-01]]
A:88.6%
[[4.7189465e-01 4.0863368e-05 5.2806443e-01]]
C:52.81%
[[1.0370950e-01 2.7608112e-07 8.9629024e-01]]
C:89.63%
[[7.1212035e-01 3.3269991e-06 2.8787634e-01]]
A:71.21%

and so on.

I even tried dividing the result of img = np.expand_dims(test_image, axis=0) by 255, as suggested in an answer to another question I had asked elsewhere. That fixed the problem in that case, but not here.

Here is my training code:

def create_inception_v4(nb_classes, load_weights, checkpoint_path):
    """Assemble the Inception-v4 classifier and optionally restore weights.

    Args:
        nb_classes: Number of units in the final softmax layer.
        load_weights: When truthy, weights are loaded from checkpoint_path.
        checkpoint_path: Weights file handed to model.load_weights
            (matched by layer name); only read when load_weights is truthy.

    Returns:
        The Model named 'Inception-v4' mapping a 299x299x3 input to
        nb_classes softmax outputs.
    """
    inputs = Input((299, 299, 3))
    features = inception_stem(inputs)

    # Stage layout, applied in order: 4 x Inception A, Reduction A,
    # 7 x Inception B, Reduction B, 3 x Inception C.
    stage_plan = (
        (inception_A, 4),
        (reduction_A, 1),
        (inception_B, 7),
        (reduction_B, 1),
        (inception_C, 3),
    )
    for build_block, repeats in stage_plan:
        for _ in range(repeats):
            features = build_block(features)

    # Average pooling, then dropout of 0.2 (the value the original comment
    # attributes to the official paper), then flatten for the dense head.
    features = AveragePooling2D((8, 8))(features)
    features = Dropout(0.2)(features)
    features = Flatten()(features)

    # Classification head.
    predictions = Dense(nb_classes, activation='softmax')(features)
    model = Model(inputs, predictions, name='Inception-v4')

    if not load_weights:
        return model

    model.load_weights(checkpoint_path, by_name=True)
    print("Model weights loaded.")
    return model





def train(args,check,checkpoint_path,network_name="inceptionv4"):
    """Train the Inception-v4 classifier from directory generators and save it.

    Args:
        args: Mapping read for the keys 'gpus', 'batch_size', 'log_dir',
            'num_classes' and 'learning_rate'.
        check: Forwarded to create_inception_v4 as its load_weights flag.
        checkpoint_path: Forwarded to create_inception_v4 as the weights path.
        network_name: Prefix used for the checkpoint and saved-model filenames.

    train_dir, val_dir and num_epochs are not parameters; they are read from
    the enclosing scope.
    """
    # Parsed from args but not used anywhere else in this function.
    n_gpus = int(args['gpus'])

    # Session created with device-placement logging switched on; the handle
    # itself is not used again below.
    session_config = tf.compat.v1.ConfigProto(log_device_placement=True)
    sess = tf.compat.v1.Session(config=session_config)

    generator_settings = dict(
        rescale=1/255,
        rotation_range=40,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        fill_mode='nearest',
        samplewise_std_normalization=True,
    )
    datagen = ImageDataGenerator(**generator_settings)

    # Rescale-only generator: constructed here but never used below.
    val_datagen = ImageDataGenerator(rescale=1/255)

    batch_size = int(args["batch_size"])
    flow_settings = dict(
        target_size=(299, 299),
        class_mode="categorical",
        batch_size=batch_size,
    )
    train_generator = datagen.flow_from_directory(train_dir, **flow_settings)
    # Validation batches come from the same `datagen` as the training batches.
    val_gen = datagen.flow_from_directory(val_dir, **flow_settings)

    checkpoint_cb = keras.callbacks.ModelCheckpoint(
        f"{network_name}_checkpoints/{network_name}.h5",
        save_weights_only=True,
        save_best_only=True,
    )
    tensorboard_cb = TensorBoard(log_dir="{}/{}".format(args["log_dir"], time()))

    model = create_inception_v4(int(args["num_classes"]), check, checkpoint_path)
    optimizer = tf.keras.optimizers.SGD(
        learning_rate=float(args['learning_rate']),
        decay=1e-6,
        momentum=0.9,
        nesterov=True,
    )
    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=["accuracy"],
    )

    # Weight each class by (largest class size / its own size), so the
    # largest class gets weight 1.0 and smaller classes get more.
    images_per_class = Counter(train_generator.classes)
    largest_class = float(max(images_per_class.values()))
    class_weights = {}
    for class_id, num_images in images_per_class.items():
        class_weights[class_id] = largest_class / num_images

    hist = model.fit(
        train_generator,
        epochs=num_epochs,
        verbose=True,
        validation_data=val_gen,
        validation_steps=10,
        callbacks=[checkpoint_cb, tensorboard_cb],
        class_weight=class_weights,
    )
    model.save(f"checkpoints/{network_name}_{num_epochs}epochs.h5")

And here is my test code:

def test_model(test_dir, num_epochs,class_names, network_name="inceptionv4",):
    """Evaluate the saved model on "val/", then classify each file in test_dir.

    Args:
        test_dir: Directory whose entries are each loaded as an image and
            passed to model.predict individually.
        num_epochs: Used only to build the saved-model filename.
        class_names: Sequence indexed by the argmax of the model output to
            obtain the printed label.
        network_name: Prefix of the saved-model filename.

    Prints the evaluation metrics, the raw prediction and chosen label for
    every file, throughput, and a per-label tally. Returns nothing.
    """

    model=load_model(f'checkpoints/{network_name}_{num_epochs}epochs.h5')


    # Generator configured with random rotation/shift/shear/zoom/flip settings
    # plus samplewise_std_normalization, on top of the 1/255 rescale.
    datagen=ImageDataGenerator(rescale=1/255,
                rotation_range=40,
                width_shift_range=0.1,
                height_shift_range=0.1,
                shear_range=0.1,
                zoom_range=0.1,
                horizontal_flip=True,
                fill_mode='nearest',         
                samplewise_std_normalization=True)

    # Rescale-only generator: constructed here but not used below.
    val_datagen = ImageDataGenerator(rescale=1/255)
    val_dir = "val/"
    # The evaluation batches are drawn through `datagen`, not `val_datagen`.
    val_gen = datagen.flow_from_directory(val_dir,target_size=(299,299),class_mode="categorical")


    test_accuracy=model.evaluate(val_gen,steps=25)
    print(model.metrics_names)
    print(test_accuracy)

    
    img_width, img_height = 299, 299

    print(f"Testing images located in {test_dir}")
    counter = 0
    results_dict = {}
    start_time = time.time()
    
    for filename_img in os.listdir(test_dir):
        counter += 1
        filename = os.path.join(test_dir,filename_img)
        img = image.load_img(filename, target_size=(img_width, img_height))
        test_image = image.img_to_array(img)
        # Bare expression: its value is discarded, so this line has no effect.
        test_image.shape
        # NOTE: on this path the image is only divided by 255; it does not go
        # through `datagen`, so datagen's samplewise_std_normalization setting
        # is not applied to it (unlike the batches fed to model.evaluate above).
        img = np.expand_dims(test_image, axis=0)/255
        classes = model.predict(img, batch_size=10)

        print(classes)
        # The label is looked up by position of the highest score.
        predicted_class = class_names[np.argmax(classes)]

        # Tally how many files were assigned to each label.
        if predicted_class not in results_dict.keys():
            results_dict[predicted_class] = 1
        else:
            results_dict[predicted_class] += 1

        print(f"{predicted_class}:{round(np.amax(classes)*100,2)}%")
        if counter % 100 == 0:
            print(f"{counter} files processed!")

    time_taken = time.time() - start_time
    time_taken = round(time_taken,2)
    # NOTE(review): the divisor is the value already rounded to 2 decimals;
    # if that is 0.0 the division below raises ZeroDivisionError.
    print(f"{counter} images processed in {time_taken} seconds, at a rate of {round(counter/time_taken,2)} images per second.")
    
    for predicted_class in results_dict.keys():
        print(f"{predicted_class} = {results_dict[predicted_class]} predictions")

What am I doing wrong?

Edit 1: I tried to account for imbalanced classes by adding the class_weight parameter, as shown in the edited code. The model is still not able to predict class B. I even tried using val_datagen instead of datagen, which led to even worse results.

Edit 2: I copied my entire folder elsewhere, then deleted class B and retained classes A and C. I trained the model, once again got a very high training accuracy, and now my test program only ever predicts class C and never class A. I have a feeling that I have made a really silly mistake in my test.py code.

Solution

This was a very frustrating error. I realised that I was getting a high validation accuracy from model.evaluate() on an entire directory, but not from model.predict() on an individual image. This was because the image augmentation and preprocessing settings that were applied for training were also applied to the validation images (both were read through the same ImageDataGenerator), but not to the individual images that I fed directly to the model.

Specifically, I realised that samplewise_std_normalization wasn't being applied to the individual test images. So, inspired by this answer, I used the generator's standardising function, test_image = datagen.standardize(test_image), and now my model works perfectly. The full test.py code can be seen below:

def test_model(test_dir, num_epochs, class_names, network_name="inceptionv4"):
    """Evaluate the saved model on "val/", then classify each file in test_dir.

    Every image, whether it reaches the model through the evaluation
    generator or through the per-file loop, gets the same preprocessing:
    a 1/255 rescale followed by sample-wise std normalisation.

    Args:
        test_dir: Directory of image files to classify one by one.
            Sub-directories are skipped.
        num_epochs: Used only to build the saved-model filename.
        class_names: Sequence of labels indexed by the argmax of the model
            output, so its order must match the model's output indices.
        network_name: Prefix of the saved-model filename.

    Prints the evaluation metrics, the raw prediction and chosen label for
    every file, throughput, and a per-label tally. Returns nothing.
    """
    model = load_model(f'checkpoints/{network_name}_{num_epochs}epochs.h5')

    # Inference-time preprocessing only. The random rotation/shift/shear/
    # zoom/flip settings are training-time augmentation; leaving them out
    # here makes the evaluation metric reproducible. standardize() does not
    # depend on them, so the per-image path below is unchanged.
    datagen = ImageDataGenerator(rescale=1/255,
                                 samplewise_std_normalization=True)

    val_dir = "val/"
    val_gen = datagen.flow_from_directory(val_dir,
                                          target_size=(299, 299),
                                          class_mode="categorical",
                                          shuffle=False)

    # No `steps` argument: one full pass, so each validation image is scored
    # exactly once instead of a fixed number of batches.
    test_accuracy = model.evaluate(val_gen)
    print(model.metrics_names)
    print(test_accuracy)

    img_width, img_height = 299, 299

    print(f"Testing images located in {test_dir}")
    counter = 0
    results_dict = {}
    start_time = time.time()

    # Sorted so the printed output is in a stable order across runs.
    for filename_img in sorted(os.listdir(test_dir)):
        filename = os.path.join(test_dir, filename_img)
        if not os.path.isfile(filename):
            # A nested directory is not an image; skip it rather than crash.
            continue
        counter += 1

        img = image.load_img(filename, target_size=(img_width, img_height))
        test_image = image.img_to_array(img)
        test_image = np.expand_dims(test_image, axis=0)
        # Don't divide by 255, this is taken care of by the standardize function
        test_image = datagen.standardize(test_image)
        classes = model.predict(test_image, batch_size=10)

        print(classes)
        predicted_class = class_names[np.argmax(classes)]
        results_dict[predicted_class] = results_dict.get(predicted_class, 0) + 1

        print(f"{predicted_class}:{round(np.amax(classes)*100,2)}%")
        if counter % 100 == 0:
            print(f"{counter} files processed!")

    elapsed = time.time() - start_time
    time_taken = round(elapsed, 2)
    # Guard the division: an empty directory (or a very fast run) would
    # otherwise divide by zero.
    rate = round(counter / elapsed, 2) if elapsed > 0 else 0.0
    print(f"{counter} images processed in {time_taken} seconds, at a rate of {rate} images per second.")

    for predicted_class, num_predictions in results_dict.items():
        print(f"{predicted_class} = {num_predictions} predictions")