Here is the full error:
`---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[33], line 2
1 gnb = GaussianNB()
----> 2 cv = cross_val_score(gnb,X_train,y_train,cv=5, error_score = 'raise')
3 print(cv)
4 print(cv.mean())
File /opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py:515, in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
512 # To ensure multimetric format is not supported
513 scorer = check_scoring(estimator, scoring=scoring)
--> 515 cv_results = cross_validate(
516 estimator=estimator,
517 X=X,
518 y=y,
519 groups=groups,
520 scoring={"score": scorer},
521 cv=cv,
522 n_jobs=n_jobs,
523 verbose=verbose,
524 fit_params=fit_params,
525 pre_dispatch=pre_dispatch,
526 error_score=error_score,
527 )
528 return cv_results["test_score"]
File /opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py:266, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
263 # We clone the estimator to make sure that all the folds are
264 # independent, and that it is pickle-able.
265 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
--> 266 results = parallel(
267 delayed(_fit_and_score)(
268 clone(estimator),
269 X,
270 y,
271 scorers,
272 train,
273 test,
274 verbose,
275 None,
276 fit_params,
277 return_train_score=return_train_score,
278 return_times=True,
279 return_estimator=return_estimator,
280 error_score=error_score,
281 )
282 for train, test in cv.split(X, y, groups)
283 )
285 _warn_or_raise_about_fit_failures(results, error_score)
287 # For callabe scoring, the return type is only know after calling. If the
288 # return type is a dictionary, the error scores can now be inserted with
289 # the correct key.
File /opt/conda/lib/python3.10/site-packages/sklearn/utils/parallel.py:63, in Parallel.__call__(self, iterable)
58 config = get_config()
59 iterable_with_config = (
60 (_with_config(delayed_func, config), args, kwargs)
61 for delayed_func, args, kwargs in iterable
62 )
---> 63 return super().__call__(iterable_with_config)
File /opt/conda/lib/python3.10/site-packages/joblib/parallel.py:1918, in Parallel.__call__(self, iterable)
1916 output = self._get_sequential_output(iterable)
1917 next(output)
-> 1918 return output if self.return_generator else list(output)
1920 # Let's create an ID that uniquely identifies the current call. If the
1921 # call is interrupted early and that the same instance is immediately
1922 # re-used, this id will be used to prevent workers that were
1923 # concurrently finalizing a task from the previous call to run the
1924 # callback.
1925 with self._lock:
File /opt/conda/lib/python3.10/site-packages/joblib/parallel.py:1847, in Parallel._get_sequential_output(self, iterable)
1845 self.n_dispatched_batches += 1
1846 self.n_dispatched_tasks += 1
-> 1847 res = func(*args, **kwargs)
1848 self.n_completed_tasks += 1
1849 self.print_progress()
File /opt/conda/lib/python3.10/site-packages/sklearn/utils/parallel.py:123, in _FuncWrapper.__call__(self, *args, **kwargs)
121 config = {}
122 with config_context(**config):
--> 123 return self.function(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py:686, in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
684 estimator.fit(X_train, **fit_params)
685 else:
--> 686 estimator.fit(X_train, y_train, **fit_params)
688 except Exception:
689 # Note fit time as time until error
690 fit_time = time.time() - start_time
File /opt/conda/lib/python3.10/site-packages/sklearn/naive_bayes.py:267, in GaussianNB.fit(self, X, y, sample_weight)
265 self._validate_params()
266 y = self._validate_data(y=y)
--> 267 return self._partial_fit(
268 X, y, np.unique(y), _refit=True, sample_weight=sample_weight
269 )
File /opt/conda/lib/python3.10/site-packages/sklearn/naive_bayes.py:427, in GaussianNB._partial_fit(self, X, y, classes, _refit, sample_weight)
424 if _refit:
425 self.classes_ = None
--> 427 first_call = _check_partial_fit_first_call(self, classes)
428 X, y = self._validate_data(X, y, reset=first_call)
429 if sample_weight is not None:
File /opt/conda/lib/python3.10/site-packages/sklearn/utils/multiclass.py:420, in _check_partial_fit_first_call(clf, classes)
413 raise ValueError(
414 "`classes=%r` is not the same as on last call "
415 "to partial_fit, was: %r" % (classes, clf.classes_)
416 )
418 else:
419 # This is the first call to partial_fit
--> 420 clf.classes_ = unique_labels(classes)
421 return True
423 # classes is None and clf.classes_ has already previously been set:
424 # nothing to do
File /opt/conda/lib/python3.10/site-packages/sklearn/utils/multiclass.py:107, in unique_labels(*ys)
105 _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None)
106 if not _unique_labels:
--> 107 raise ValueError("Unknown label type: %s" % repr(ys))
109 if is_array_api:
110 # array_api does not allow for mixed dtypes
111 unique_ys = xp.concat([_unique_labels(y) for y in ys])
ValueError: Unknown label type: (array([0.0, 1.0], dtype=object),)`
I am trying to implement cross-validation in the Spaceship Titanic Project on Kaggle; you can find my full notebook here. I have not been able to get the cross-validation function to run. I have been fiddling with it for hours, and I have not been able to find the solution.
I followed Ken Jee's Titanic Project Example to complete my own Titanic Prediction submission. In this code, the cross-validation function can successfully run, and I tried to re-purpose this to the Spaceship Titanic project.
Please let me know if you need any other details. I am quite new to Machine Learning, so I thank you in advance for your patience.
I've tried getting rid of null values, verifying that the data types for my training and test lists are the same, and mapping categorical variables to numerical variables.
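Your target (y_train) has dtype object (note the `dtype=object` in the last line of the error message), which scikit-learn does not recognize as a valid label type.
Casting y_train to int32 before running cross-validation fixes the error: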
y_train = y_train.astype("int32")