I am trying to calculate some average metrics after performing cross validation. The function that do that is the following one:
from sklearn.model_selection import KFold
from numpy import mean
from numpy import std
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
# Returns average confusion matrix, average accuracy
# and average standard deviation after all the cross-validation runs
def get_average_metrics(model,cv,X_fss,y):
conf_matrix_list_of_arrays = []
scores = []
for train_index, test_index in cv.split(X_fss):
X_train, X_test = X_fss[train_index], X_fss[test_index]
y_train, y_test = y[train_index], y[test_index]
score = model.fit(X_train, y_train).score(X_test, y_test)
conf_matrix = confusion_matrix(y_test, model.predict(X_test))
scores.append(score)
conf_matrix_list_of_arrays.append(conf_matrix)
# Average confusion matrix
mean_of_conf_matrix_arrays = mean(conf_matrix_list_of_arrays, axis=0)
# Average accuracy
avg_score = mean(scores)
# Average standard deviation
std_score = std(scores)
return avg_score,std_score,mean_of_conf_matrix_arrays
However, I get this error in X_train, X_test = X_fss[train_index], X_fss[test_index]
line:
KeyError: "None of [Int64Index([ 1, 2, 4, 5, 6, 7,
9, 10, 11, 12,\n ...\n 1620, 1621, 1622, 1623, 1624, 1625, 1626, 1627, 1629, 1630],\n dtype='int64', length=1467)] are in the [columns]"
Received funcion parameters:
logistic = LogisticRegression()
cv = KFold(n_splits=10,shuffle=True, random_state=1)
X_fss
sample:
y
sample:
I have solved it converting X_fss
Dataframe into a numpy array:
X_fss = X_fss.to_numpy()