I am wondering if it is an issue (possible data leakage?) when implementing leave one out cross validation by hand if the model is fit each iteration after testing on each fold? It seems like if the model is trained on all data except for "X" and after testing on "X" the model is trained on all data other than "Y" and tested on "Y" it has seen "Y" on the first iteration. Is this actually a problem, and does my implementation of LOOCV by hand appear to be correct?
Thanks for your time!
i = 0
j = 0
for i in range(0, 41):
X_copy = X_orig[(i):(i+1)] #Slice the ith element from the numpy array
y_copy = y_orig[(i):(i+1)]
X_model = X_orig
y_model = y_orig
X_model = np.delete(X_model, i, axis = 0)
y_model = np.delete(y_model, i, axis = 0)
model.fit(X_model, y_model, epochs=115, batch_size=28, verbose = 0) #verbose = 0 removes learning info
prediction = model.predict(X_copy)
prediction[prediction>=0.5] = 1
prediction[prediction<0.5] = 0
print(prediction, y_copy)
if np.array_equal(y_copy, prediction):
j = j + 1
#print(y_copy, prediction)
if np.not_equal:
#print(y_copy, prediction)
pass
print(j/41) #For 41 samples in dataset
Why don't you use this?
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
model =...
test_fold_predictions = []
for train_index, test_index in loo.split(X):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
model.fit(X_train, y_train)
test_fold_predictions.append(model.predict(X_test))
EDIT
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
model = Sequential()
model.add(Dense(5000, activation='relu', input_dim=X_train.shape[1]))
model.add(Dropout(0.1))
model.add(Dense(600, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(y_train.shape[1], activation='sigmoid'))
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='binary_crossentropy',
optimizer=sgd)
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
test_fold_predictions = []
for train_index, test_index in loo.split(X):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
model.fit(X_train, y_train, epochs=5, batch_size=2000)
test_fold_predictions.append(model.predict(X_test))