python validation machine-learning data-science classification

High accuracy training but low accuracy test/prediction

I am using CNN to classify apple type. I achieved high accuracy on train data but really low accuracy on test data. Data is split into 80:20. I am not sure if my data is overfitting or not.

I have 2 folders that contain TraningData and TestData, and each folder has 4 subfolders braeburn, red_apple, red_delicious, rotten (containing corresponding pictures).

TRAIN_DIR = 'apple_fruit'
TEST_DIR = 'apple_fruit'
classes = ['braeburn','red_apples','red_delicious','rotten'] train_datagen = ImageDataGenerator(rescale = 1./255, shear_range=0.2, zoom_range=0.2, horizontal_flip=True, fill_mode='nearest')
 
test_datagen = ImageDataGenerator(rescale = 1./255) 

training_set = train_datagen.flow_from_directory(TRAIN_DIR,
shuffle=True,
target_size = (100,100),
batch_size = 25,
classes =['braeburn','red_apples','red_delicious','rotten'])

test_set= test_datagen.flow_from_directory(TEST_DIR,
target_size = (100, 100),
shuffle=True,
 batch_size = 25,classes = classes)

model =Sequential()
model.add(Conv2D(filters=128, kernel_size=(3,3),input_shape=(100,100,3), activation='relu', padding
= 'same'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(filters=16, kernel_size=(3,3), activation='relu', padding = 'same'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dropout(0.6))
model.add(Dense(4,activation='softmax'))
model.compile(optimizer ='adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

history = model.fit(x=training_set,#y=training_set.labels,
steps_per_epoch=len(training_set),
epochs =10)

model.save('Ripe2_model6.h5')  # creates a HDF5 file 'my_model.h5'

model_path = "Ripe2_model6.h5"
loaded_model = keras.models.load_model(model_path)
classes = ['braeburn','red_apples','red_delicious','rotten']
predictions = model.predict(x=test_set, steps=len(test_set), verbose=True)
pred = np.round(predictions)

y_true=test_set.classes
y_pred=np.argmax(pred, axis=-1)
    > cm = confusion_matrix(y_true=test_set.classes, y_pred=np.argmax(pred, axis=-1))
test_set.classes
np.argmax(pred, axis=-1)
def plot_confusion_matrix(cm, classes,
normalize=False,
title='Confusion matrix',
cmap=plt.cm.Blues):

accuracy = np.trace(cm) / float(np.sum(cm))
misclass = 1 - accuracy

  """
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
    """
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title,color = 'white')
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45,color = 'white')
plt.yticks(tick_marks, classes,color = 'white')
target_names = ['braeburn','red_apples','red_delicious','rotten']

if target_names is not None:
 tick_marks = np.arange(len(target_names))
 plt.xticks(tick_marks, target_names, rotation=45)
 plt.yticks(tick_marks, target_names)

if normalize:
 cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
 thresh = cm.max() / 1.5 if normalize else cm.max() / 2
 for i, j in itertools.product(range(cm.shape[0]), 
  range(cm.shape[1])):
  if normalize:
   plt.text(j, i, "{:0.4f}".format(cm[i, j]),
   horizontalalignment="center",
   color="white" if cm[i, j] > thresh else "black")
  else:
   plt.text(j, i, "{:,}".format(cm[i, j]),
   horizontalalignment="center",
   color="white" if cm[i, j] > thresh else "black")

plt.tight_layout()
plt.ylabel('True label',color = 'white')
plt.xlabel('Predicted label',color = 'white')

cm_plot_labels = ['braeburn','red_apples','red_delicious','rotten']
plot_confusion_matrix(cm=cm, classes=cm_plot_labels, title='Confusion Matrix')

print(accuracy_score(y_true, y_pred))
print(recall_score(y_true, y_pred, average=None))
print(precision_score(y_true, y_pred, average=None))

The confusion matrix:

accuracy - 0.2909090909090909
recall - [0.23484848 0.32319392 0.15151515 0.36213992]
precision - [0.23308271 0.32319392 0.15151515 0.36363636]

I have tried changing many features but still no progress.

Solution

It indicates that the data in the test set is quite different from what the model learned. To understand if it is overfitting or a single unfortunate split:

Check if your results are dependent on the initial Train/Test split. To achieve that you can:

[optional] Merge all the pictures into the whole dataset (train+test) folder.
Split images into train/test randomly (rather than using initial splitting)
Implement cross-validation (e.g. K-Fold)

Do you have a sufficient number of samples? Try to add more samples and check how it affects performance. You can also apply data augmentation technics.