I'm getting the error,
ValueError: Cannot have number of splits n_splits=7 greater than the number of samples: n_samples=0.
But my csv has data. What's wrong?
My csv: here
Another (extended) csv: here
I need to get the values of models from the data and print it to the screen. I also need to answer the question of which characteristics of the schools exhibit a significant difference corresponding to -1.
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
# Paso (a): Cargar los datos y realizar el preprocesamiento necesario
datos = pd.read_csv('datoscorto.csv', encoding='latin-1')
#datos = datos_old[np.isfinite(datos_old).all(1)]
# Check for missing values
print(datos.isnull().sum())
# Remove rows with missing values
datos = datos.dropna()
# Paso (b): Crear etiqueta
datos['etiqueta'] = datos['sigdif_lect8b_rbd'].apply(lambda x: 1 if x == -1 else 0)
# Paso (c): Selección de variables
columnas_caracteristicas = ['nalu_lect8b_rbd', 'prom_lect8b_rbd', 'dif_lect8b_rbd']
# Paso (d): Entrenar distintos modelos
X = datos[columnas_caracteristicas]
y = datos['etiqueta']
modelos = [
DummyClassifier(strategy='most_frequent'),
DecisionTreeClassifier(),
LogisticRegression(),
RandomForestClassifier(),
KNeighborsClassifier(),
SVC()
]
kf = KFold(n_splits=7, shuffle=True, random_state=42)
for modelo in modelos:
scores = cross_val_score(modelo, X, y, scoring='accuracy', cv=kf)
print(f"Modelo: {modelo.__class__.__name__}")
print(f"Accuracy: {scores.mean():.4f}")
precision_scores = cross_val_score(modelo, X, y, scoring='precision', cv=kf)
print(f"Precision: {precision_scores.mean():.4f}")
recall_scores = cross_val_score(modelo, X, y, scoring='recall', cv=kf)
print(f"Recall: {recall_scores.mean():.4f}")
You've lost all the rows when dropping NA values. I've printed out the shape of datos
before and after this action:
# Paso (a): Cargar los datos y realizar el preprocesamiento necesario
datos = pd.read_csv('datoscorto.csv', encoding='latin-1')
#datos = datos_old[np.isfinite(datos_old).all(1)]
# Check for missing values
# print(datos.isnull().sum())
print(datos.shape)
# Remove rows with missing values
datos = datos.dropna()
print(datos.shape)
Output:
(49, 44)
(0, 44)
There are columns in your first example csv which are always empty, for example marca_lect8b_rbd
. It makes sence to drop NAs after you select only columnas_caracteristicas
columns.