pandas scikit-learn sampling multilabel-classification multiclass-classification

Inconsistent numbers of samples error in multi-class / multi-label machine learning model

I have a OneVsOne model running fine with textual feature and target fields. To progress to a multi-class model (i.e. with multiple textual feature fields), I believe OneVsRest with Logistic Regression is suitable.

However, when I use the following pipeline:

# Text-classification pipeline: raw text -> token counts -> TF-IDF weights ->
# one-vs-rest logistic regression (one binary classifier per target class).
model = Pipeline([
    ('vect', CountVectorizer()),     # turns raw text documents into token counts
    ('tfidfT', TfidfTransformer()),  # re-weights the count matrix by TF-IDF
    ('clf', OneVsRestClassifier(LogisticRegression(), n_jobs = 1))  # n_jobs=1: per-class fits run sequentially
    ])

I am getting the following error when trying to run a OneVsRest classifier with Logistic Regression:

ValueError: Found input variables with inconsistent numbers of samples: [3, 224]

The feature fields are in a pandas dataframe of 224 rows and the target field is a pandas series of length 224. There are no nulls in the data.

Here is the full traceback:

ValueError                                Traceback (most recent call last)
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\defect_autocategorisation_main9.py:127
    119 model = Pipeline([
    120     ('vect', CountVectorizer()),
    121     ('tfidfT', TfidfTransformer()),
    122     ('clf', OneVsRestClassifier(LogisticRegression(), n_jobs = 1))
    123     ])
    124 #model = OneVsRestClassifier(LogisticRegression())
    125 
    126 # Initialize the classifier
--> 127 model.fit(X,y)
    128 predicted = model.predict(X_test)
    129 #predicted = model.predict(X_test, Y_test)
    130    
    131 # creating a confusion matrix  
   (...)
    140 
    141 # Generate classification report

File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1466     estimator._validate_params()
   1468 with config_context(
   1469     skip_parameter_validation=(
   1470         prefer_skip_nested_validation or global_skip_validation
   1471     )
   1472 ):
-> 1473     return fit_method(estimator, *args, **kwargs)

File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\pipeline.py:473, in Pipeline.fit(self, X, y, **params)
    471     if self._final_estimator != "passthrough":
    472         last_step_params = routed_params[self.steps[-1][0]]
--> 473         self._final_estimator.fit(Xt, y, **last_step_params["fit"])
    475 return self

File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1466     estimator._validate_params()
   1468 with config_context(
   1469     skip_parameter_validation=(
   1470         prefer_skip_nested_validation or global_skip_validation
   1471     )
   1472 ):
-> 1473     return fit_method(estimator, *args, **kwargs)

File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\multiclass.py:370, in OneVsRestClassifier.fit(self, X, y, **fit_params)
    366 columns = (col.toarray().ravel() for col in Y.T)
    367 # In cases where individual estimators are very fast to train setting
    368 # n_jobs > 1 in can results in slower performance due to the overhead
    369 # of spawning threads.  See joblib issue #112.
--> 370 self.estimators_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
    371     delayed(_fit_binary)(
    372         self.estimator,
    373         X,
    374         column,
    375         fit_params=routed_params.estimator.fit,
    376         classes=[
    377             "not %s" % self.label_binarizer_.classes_[i],
    378             self.label_binarizer_.classes_[i],
    379         ],
    380     )
    381     for i, column in enumerate(columns)
    382 )
    384 if hasattr(self.estimators_[0], "n_features_in_"):
    385     self.n_features_in_ = self.estimators_[0].n_features_in_

File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\utils\parallel.py:74, in Parallel.__call__(self, iterable)
     69 config = get_config()
     70 iterable_with_config = (
     71     (_with_config(delayed_func, config), args, kwargs)
     72     for delayed_func, args, kwargs in iterable
     73 )
---> 74 return super().__call__(iterable_with_config)

File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\joblib\parallel.py:1918, in Parallel.__call__(self, iterable)
   1916     output = self._get_sequential_output(iterable)
   1917     next(output)
-> 1918     return output if self.return_generator else list(output)
   1920 # Let's create an ID that uniquely identifies the current call. If the
   1921 # call is interrupted early and that the same instance is immediately
   1922 # re-used, this id will be used to prevent workers that were
   1923 # concurrently finalizing a task from the previous call to run the
   1924 # callback.
   1925 with self._lock:

File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\joblib\parallel.py:1847, in Parallel._get_sequential_output(self, iterable)
   1845 self.n_dispatched_batches += 1
   1846 self.n_dispatched_tasks += 1
-> 1847 res = func(*args, **kwargs)
   1848 self.n_completed_tasks += 1
   1849 self.print_progress()

File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\utils\parallel.py:136, in _FuncWrapper.__call__(self, *args, **kwargs)
    134     config = {}
    135 with config_context(**config):
--> 136     return self.function(*args, **kwargs)

File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\multiclass.py:93, in _fit_binary(estimator, X, y, fit_params, classes)
     91 else:
     92     estimator = clone(estimator)
---> 93     estimator.fit(X, y, **fit_params)
     94 return estimator

File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1466     estimator._validate_params()
   1468 with config_context(
   1469     skip_parameter_validation=(
   1470         prefer_skip_nested_validation or global_skip_validation
   1471     )
   1472 ):
-> 1473     return fit_method(estimator, *args, **kwargs)

File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\linear_model\_logistic.py:1223, in LogisticRegression.fit(self, X, y, sample_weight)
   1220 else:
   1221     _dtype = [np.float64, np.float32]
-> 1223 X, y = self._validate_data(
   1224     X,
   1225     y,
   1226     accept_sparse="csr",
   1227     dtype=_dtype,
   1228     order="C",
   1229     accept_large_sparse=solver not in ["liblinear", "sag", "saga"],
   1230 )
   1231 check_classification_targets(y)
   1232 self.classes_ = np.unique(y)

File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\base.py:650, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
    648         y = check_array(y, input_name="y", **check_y_params)
    649     else:
--> 650         X, y = check_X_y(X, y, **check_params)
    651     out = X, y
    653 if not no_val_X and check_params.get("ensure_2d", True):

File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\utils\validation.py:1320, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
   1301 X = check_array(
   1302     X,
   1303     accept_sparse=accept_sparse,
   (...)
   1315     input_name="X",
   1316 )
   1318 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
-> 1320 check_consistent_length(X, y)
   1322 return X, y

File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\utils\validation.py:457, in check_consistent_length(*arrays)
    455 uniques = np.unique(lengths)
    456 if len(uniques) > 1:
--> 457     raise ValueError(
    458         "Found input variables with inconsistent numbers of samples: %r"
    459         % [int(l) for l in lengths]
    460     )

ValueError: Found input variables with inconsistent numbers of samples: [3, 224]

There is a similar Stack Overflow question, "ValueError: Number of features of the model must match the input", but neither the suggestions there nor those in the few other similar questions work for me.

Although my data is textual, for information: the above pipeline raises the same error with the Iris dataset, but fitting completes successfully when I run just the classifier (i.e. omitting the vectorizer and transformer). However, running just the classifier on my textual data doesn't work and gives the following expected error:

ValueError: could not convert string to float: 'Jacket'

I am aware of one-hot encoding, but this 'inconsistent numbers of samples' problem seems to be independent of any encoding issue, and I would like to solve it before tackling any other issues.

Edit 22/10/24: Here is a Minimal Reproducible Example geared to use the iris dataset:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# Read in the dataset to train the model (expects 'iris_dataset.csv' in the
# current working directory)
training_data = pd.read_csv('iris_dataset.csv')   
print(training_data) 

# Load feature data: the double brackets select a 2-D DataFrame of four columns
X = training_data[['sepal.length', 'sepal.width','petal.length','petal.width']]
    
# Load target data: a single column, i.e. a 1-D Series
y = training_data['variety']

# Split training data into training and test portions (50/50, fixed seed so
# the split is repeatable)
X_train, X_test, y_train, y_test \
    = train_test_split(X, y, test_size=0.5, random_state=42)
    
# Create the pipeline composed of vectoriser, transformer and classifier
model = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidfT', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LogisticRegression(), n_jobs = 1))
    ])

# Fit the pipeline. NOTE: this fits on the full X / y rather than on the
# X_train / y_train portions created above. This is the call reported to raise
# the "inconsistent numbers of samples" ValueError.
model.fit(X,y)
# Predict on the held-out portion (not reached while fit() raises)
predicted = model.predict(X_test)

Edit 23/10/24: Here is the MRE again, self-contained with textual data inputs:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# Build a small in-memory training set so the example is genuinely
# self-contained: no external CSV file is needed. Each row is one defect
# record and every field is free text.
training_data = pd.DataFrame({
    'Location': ['Structure', 'Stucture', 'Structure', 'Access systems'],
    'Component': ['Mid bay brace12', 'Mid bay brace10', 'Mid bay brace07', 'First stage ladder'],
    'Defect Description': ['Surface corrosion', 'Coating delamination with minor surface corrosion', 'Corrosion', 'Entangled rope'],
    'Failure Mode': ['Corrosion', 'Corrosion', 'Corrosion', 'Debris'],
})
print(training_data)

# Load feature data: the double brackets select a 2-D DataFrame of three
# text columns
X = training_data[['Location', 'Component', 'Defect Description']]

# Load target data: a single column, i.e. a 1-D Series
y = training_data['Failure Mode']

# Split training data into training and test portions (50/50, fixed seed so
# the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Create the pipeline composed of vectoriser, transformer and classifier
model = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidfT', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LogisticRegression(), n_jobs=1)),
])

# Fit the pipeline. The multi-column DataFrame X is passed straight to
# CountVectorizer here, exactly as in the original code; this is the call that
# reproduces the "inconsistent numbers of samples" ValueError.
model.fit(X, y)
# Predict on the held-out portion (not reached while fit() raises)
predicted = model.predict(X_test)

Solution

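I'll expand this later, but the cause of the error is that CountVectorizer expects a one-dimensional iterable of strings (one string per sample). Iterating over a DataFrame yields its column labels, so the vectorizer sees only 3 "documents" (the three column names) instead of 224 rows, hence the mismatch [3, 224]. The main solution is to use a ColumnTransformer to run a separate text transformer per column (I've also replaced CountVectorizer + TfidfTransformer with the equivalent TfidfVectorizer):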

# These two names are not imported by the question's code, so import them here
# to keep the snippet runnable when pasted after it.
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorise every text column on its own and concatenate the resulting
# feature matrices. Each column is given as a plain string (not a list), so
# ColumnTransformer hands the vectorizer a 1-D column of strings with one
# entry per row, which is the input shape a text vectorizer expects.
preproc = ColumnTransformer([
    (col + "_tfidf", TfidfVectorizer(), col)
    for col in X.columns
])

# Preprocessing followed by the classifier; X stays a DataFrame on the way in
# so the ColumnTransformer can select columns by name.
model = Pipeline([
    ("preproc", preproc),
    ("clf", LogisticRegression()),
])

model.fit(X, y)