I have a OneVsOne model running fine with textual feature and target fields. To progress to a model that uses multiple textual feature fields, I believe OneVsRest with Logistic Regression is suitable.
However, when I use the following pipeline:
# Text-classification pipeline: token counts -> TF-IDF weights -> one-vs-rest
# logistic regression.
model = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidfT", TfidfTransformer()),
        ("clf", OneVsRestClassifier(estimator=LogisticRegression(), n_jobs=1)),
    ]
)
I am getting the following error when trying to run a OneVsRest classifier with Logistic Regression:
ValueError: Found input variables with inconsistent numbers of samples: [3, 224]
The feature fields are in a pandas dataframe of 224 rows and the target field is a pandas series of length 224. There are no nulls in the data.
Here is the full traceback:
ValueError Traceback (most recent call last)
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\defect_autocategorisation_main9.py:127
119 model = Pipeline([
120 ('vect', CountVectorizer()),
121 ('tfidfT', TfidfTransformer()),
122 ('clf', OneVsRestClassifier(LogisticRegression(), n_jobs = 1))
123 ])
124 #model = OneVsRestClassifier(LogisticRegression())
125
126 # Initialize the classifier
--> 127 model.fit(X,y)
128 predicted = model.predict(X_test)
129 #predicted = model.predict(X_test, Y_test)
130
131 # creating a confusion matrix
(...)
140
141 # Generate classification report
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1466 estimator._validate_params()
1468 with config_context(
1469 skip_parameter_validation=(
1470 prefer_skip_nested_validation or global_skip_validation
1471 )
1472 ):
-> 1473 return fit_method(estimator, *args, **kwargs)
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\pipeline.py:473, in Pipeline.fit(self, X, y, **params)
471 if self._final_estimator != "passthrough":
472 last_step_params = routed_params[self.steps[-1][0]]
--> 473 self._final_estimator.fit(Xt, y, **last_step_params["fit"])
475 return self
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1466 estimator._validate_params()
1468 with config_context(
1469 skip_parameter_validation=(
1470 prefer_skip_nested_validation or global_skip_validation
1471 )
1472 ):
-> 1473 return fit_method(estimator, *args, **kwargs)
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\multiclass.py:370, in OneVsRestClassifier.fit(self, X, y, **fit_params)
366 columns = (col.toarray().ravel() for col in Y.T)
367 # In cases where individual estimators are very fast to train setting
368 # n_jobs > 1 in can results in slower performance due to the overhead
369 # of spawning threads. See joblib issue #112.
--> 370 self.estimators_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
371 delayed(_fit_binary)(
372 self.estimator,
373 X,
374 column,
375 fit_params=routed_params.estimator.fit,
376 classes=[
377 "not %s" % self.label_binarizer_.classes_[i],
378 self.label_binarizer_.classes_[i],
379 ],
380 )
381 for i, column in enumerate(columns)
382 )
384 if hasattr(self.estimators_[0], "n_features_in_"):
385 self.n_features_in_ = self.estimators_[0].n_features_in_
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\utils\parallel.py:74, in Parallel.__call__(self, iterable)
69 config = get_config()
70 iterable_with_config = (
71 (_with_config(delayed_func, config), args, kwargs)
72 for delayed_func, args, kwargs in iterable
73 )
---> 74 return super().__call__(iterable_with_config)
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\joblib\parallel.py:1918, in Parallel.__call__(self, iterable)
1916 output = self._get_sequential_output(iterable)
1917 next(output)
-> 1918 return output if self.return_generator else list(output)
1920 # Let's create an ID that uniquely identifies the current call. If the
1921 # call is interrupted early and that the same instance is immediately
1922 # re-used, this id will be used to prevent workers that were
1923 # concurrently finalizing a task from the previous call to run the
1924 # callback.
1925 with self._lock:
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\joblib\parallel.py:1847, in Parallel._get_sequential_output(self, iterable)
1845 self.n_dispatched_batches += 1
1846 self.n_dispatched_tasks += 1
-> 1847 res = func(*args, **kwargs)
1848 self.n_completed_tasks += 1
1849 self.print_progress()
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\utils\parallel.py:136, in _FuncWrapper.__call__(self, *args, **kwargs)
134 config = {}
135 with config_context(**config):
--> 136 return self.function(*args, **kwargs)
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\multiclass.py:93, in _fit_binary(estimator, X, y, fit_params, classes)
91 else:
92 estimator = clone(estimator)
---> 93 estimator.fit(X, y, **fit_params)
94 return estimator
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\base.py:1473, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
1466 estimator._validate_params()
1468 with config_context(
1469 skip_parameter_validation=(
1470 prefer_skip_nested_validation or global_skip_validation
1471 )
1472 ):
-> 1473 return fit_method(estimator, *args, **kwargs)
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\linear_model\_logistic.py:1223, in LogisticRegression.fit(self, X, y, sample_weight)
1220 else:
1221 _dtype = [np.float64, np.float32]
-> 1223 X, y = self._validate_data(
1224 X,
1225 y,
1226 accept_sparse="csr",
1227 dtype=_dtype,
1228 order="C",
1229 accept_large_sparse=solver not in ["liblinear", "sag", "saga"],
1230 )
1231 check_classification_targets(y)
1232 self.classes_ = np.unique(y)
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\base.py:650, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
648 y = check_array(y, input_name="y", **check_y_params)
649 else:
--> 650 X, y = check_X_y(X, y, **check_params)
651 out = X, y
653 if not no_val_X and check_params.get("ensure_2d", True):
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\utils\validation.py:1320, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1301 X = check_array(
1302 X,
1303 accept_sparse=accept_sparse,
(...)
1315 input_name="X",
1316 )
1318 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
-> 1320 check_consistent_length(X, y)
1322 return X, y
File c:\Users\u363028\OneDrive - IBERDROLA S.A\Tools\sklearn\sk_learn\Lib\site-packages\sklearn\utils\validation.py:457, in check_consistent_length(*arrays)
455 uniques = np.unique(lengths)
456 if len(uniques) > 1:
--> 457 raise ValueError(
458 "Found input variables with inconsistent numbers of samples: %r"
459 % [int(l) for l in lengths]
460 )
ValueError: Found input variables with inconsistent numbers of samples: [3, 224]
There is a similar Stack Overflow question, "ValueError: Number of features of the model must match the input", but neither the suggestions there nor those in the few other similar questions work for me.
Although my data is textual, note that the above pipeline causes the same error with the Iris dataset, whereas it completes successfully when running just the classifier (i.e. omitting the vectorizer and transformer). However, running just the classifier on my textual data doesn't work; it gives the following expected error:
ValueError: could not convert string to float: 'Jacket'
I am aware of OneHot encoding but this 'inconsistent numbers of samples' problem seems irrespective of any encoding issue and I would like to solve this before tackling any other issues.
Edit 22/10/24: Here is a Minimal Reproducible Example geared to use the iris dataset:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
# Load the iris data used to train the model and show it.
training_data = pd.read_csv("iris_dataset.csv")
print(training_data)

# Features: the four sepal/petal measurement columns.
X = training_data[["sepal.length", "sepal.width", "petal.length", "petal.width"]]

# Target: the variety label column.
y = training_data["variety"]

# Hold out half of the rows as a test portion (fixed seed for repeatability).
(X_train, X_test, y_train, y_test) = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Chain the vectoriser, TF-IDF transformer and classifier into one pipeline.
model = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidfT", TfidfTransformer()),
        ("clf", OneVsRestClassifier(estimator=LogisticRegression(), n_jobs=1)),
    ]
)

# Fit on the full feature frame and target, then predict the held-out rows.
# NOTE: this fit call is where the "inconsistent numbers of samples"
# ValueError described in the question is raised.
model.fit(X, y)
predicted = model.predict(X_test)
Edit 23/10/24: Here is the MRE again, self-contained with textual data inputs:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
# Build the training set in memory so the example is genuinely self-contained
# (no CSV file is read; the frame below is the only input).
training_data = pd.DataFrame(
    {
        "Location": ["Structure", "Stucture", "Structure", "Access systems"],
        "Component": [
            "Mid bay brace12",
            "Mid bay brace10",
            "Mid bay brace07",
            "First stage ladder",
        ],
        "Defect Description": [
            "Surface corrosion",
            "Coating delamination with minor surface corrosion",
            "Corrosion",
            "Entangled rope",
        ],
        "Failure Mode": ["Corrosion", "Corrosion", "Corrosion", "Debris"],
    }
)
print(training_data)

# Load feature data: three free-text columns.
X = training_data[["Location", "Component", "Defect Description"]]

# Load target data.
y = training_data["Failure Mode"]

# Split training data into training and test portions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Create the pipeline composed of vectoriser, transformer and classifier.
model = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidfT", TfidfTransformer()),
        ("clf", OneVsRestClassifier(LogisticRegression(), n_jobs=1)),
    ]
)

# Fit the classifier. Passing the multi-column DataFrame straight into
# CountVectorizer is what reproduces the reported
# "inconsistent numbers of samples" ValueError on this call.
model.fit(X, y)
predicted = model.predict(X_test)
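I'll expand this later, but the main solution is to use a ColumnTransformer to run a separate text transformer per column (I've also consolidated CountVectorizer + TfidfTransformer into the equivalent TfidfVectorizer).
CountVectorizer expects a one-dimensional iterable of text documents; iterating over a DataFrame yields its column names, so your three feature columns were treated as three documents, hence the [3, 224] mismatch.
Here is the corrected pipeline: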
# Imports this snippet needs beyond those already in the question's script.
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# One TF-IDF vectoriser per text column. Each column is selected by its
# scalar name (not a one-element list), so the vectoriser receives a 1-D
# column of strings, which is the input a text vectoriser expects. The
# per-column TF-IDF matrices are stacked side by side into one feature matrix.
preproc = ColumnTransformer(
    [(f"{col}_tfidf", TfidfVectorizer(), col) for col in X.columns]
)

# Preprocess every text column, then classify with logistic regression.
model = Pipeline(
    [
        ("preproc", preproc),
        ("clf", LogisticRegression()),
    ]
)
model.fit(X, y)