I have been training a text classifier to then later use to predict characters of a TV show. So far, my code looks like:
vectorizer = TfidfVectorizer(ngram_range=(1,2),min_df=0.001, max_df=0.75,stop_words='English')
X = vectorizer.fit_transform(data['text'])
y = data['character']
print(X.shape, y.shape) #prints (5999, 1429) (5999,)
# get baseline performance
kf = KFold(n_splits=5)
most_frequent = DummyClassifier(strategy='most_frequent')
print(cross_val_score(most_frequent , X, y=y, cv=kf, n_jobs= -1, scoring="accuracy").mean())
# fine-tune classifier
base_clf = CalibratedClassifierCV(cv=kf, base_estimator=LogisticRegression(n_jobs= -1, solver='lbfgs' ))
param_grid = {'base_estimator__C': [0.01, 0.05, 0.1, 0.5, 1.0, 10, 20, 50],
'base_estimator__class_weight': ['balanced', 'auto']}
search = GridSearchCV(base_clf, param_grid, cv=kf, scoring='f1_micro')
search.fit(X, y)
# use best classifier to get performance estimate
clf = search.best_estimator_.base_estimator
print(cross_val_score(clf, X, y=y, cv=kf, n_jobs= -1, scoring='f1_micro').mean())
However, I keep getting the following error:
ValueError Traceback (most recent call last)
/var/folders/fv/h7n33cb5227g4t5lxym8g_800000gn/T/ipykernel_2208/2611717736.py in <module>
6
7 search = GridSearchCV(base_clf, param_grid, cv=kf, scoring='f1_micro')
----> 8 search.fit(X, y)
9
10 # use best classifier to get performance estimate
~/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
878 refit_start_time = time.time()
879 if y is not None:
--> 880 self.best_estimator_.fit(X, y, **fit_params)
881 else:
882 self.best_estimator_.fit(X, **fit_params)
~/opt/anaconda3/lib/python3.9/site-packages/sklearn/calibration.py in fit(self, X, y, sample_weight)
301 if n_folds and np.any([np.sum(y == class_) < n_folds
302 for class_ in self.classes_]):
--> 303 raise ValueError(f"Requesting {n_folds}-fold "
304 "cross-validation but provided less than "
305 f"{n_folds} examples for at least one class.")
ValueError: Requesting 5-fold cross-validation but provided less than 5 examples for at least one class.
I am not quite sure how to resolve this error and would truly appreciate any advice.
Thank you in advance!
You need to check the distribution of your target value data['character']
: it seems that the number of values in one of the classes in the target column is too small. To do it you can use : data['character'].value_counts()