I am trying to use sklearn and LightGBM with both numerical and categorical features. I created a Pipeline that chains a ColumnTransformer preprocessing step with an LGBMClassifier.
The model trains just fine, but I get an error message when I use it for prediction on a test dataset. It looks like the preprocessing is not applied to the test dataset, but I don't understand why. In the tutorials I've found online it seems to work, though with sklearn classifiers.
Here is my code:
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
# Numerical features
numerical_features = ['Distance']
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
# Categorical features
categorical_features = ['Travel', 'Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder())])
# Build the preprocessor with ColumnTransformer
preprocess = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)])
# Build a pipeline
clf = Pipeline(steps=[('preprocess', preprocess),
                      ('classifier', LGBMClassifier(random_state=17))])
# Fit
clf.fit(X_build, y_build)
# Scores
print("model training score (clf internal scoring function with standards parameters): {0}".format(clf.score(X_build, y_build))) # returns a score
print("Score: %f" % clf.score(X_valid, y_valid)) # Here is the problem
And here is the error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-116-70bf0e236540> in <module>()
----> 1 print("Score: %f" % clf.predict(X_valid))
~/anaconda3/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
116
117 # lambda, but not partial, allows help() to work with update_wrapper
--> 118 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
119 # update the docstring of the returned function
120 update_wrapper(out, self.fn)
~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in predict(self, X, **predict_params)
329 for name, transform in self.steps[:-1]:
330 if transform is not None:
--> 331 Xt = transform.transform(Xt)
332 return self.steps[-1][-1].predict(Xt, **predict_params)
333
~/anaconda3/lib/python3.6/site-packages/sklearn/compose/_column_transformer.py in transform(self, X)
491
492 X = _check_X(X)
--> 493 Xs = self._fit_transform(X, None, _transform_one, fitted=True)
494 self._validate_output(Xs)
495
~/anaconda3/lib/python3.6/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
391 _get_column(X, column), y, weight)
392 for _, trans, column, weight in self._iter(
--> 393 fitted=fitted, replace_strings=True))
394 except ValueError as e:
395 if "Expected 2D array, got 1D array instead" in str(e):
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
918 self._iterating = self._original_iterator is not None
919
--> 920 while self.dispatch_one_batch(iterator):
921 pass
922
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in _transform_one(transformer, X, y, weight, **fit_params)
603
604 def _transform_one(transformer, X, y, weight, **fit_params):
--> 605 res = transformer.transform(X)
606 # if we have a weight for this transformer, multiply output
607 if weight is None:
~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in _transform(self, X)
449 for name, transform in self.steps:
450 if transform is not None:
--> 451 Xt = transform.transform(Xt)
452 return Xt
453
~/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in transform(self, X)
611 copy=True)
612 else:
--> 613 return self._transform_new(X)
614
615 def inverse_transform(self, X):
~/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in _transform_new(self, X)
572 n_samples, n_features = X.shape
573
--> 574 X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
575
576 mask = X_mask.ravel()
~/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in _transform(self, X, handle_unknown)
105 msg = ("Found unknown categories {0} in column {1}"
106 " during transform".format(diff, i))
--> 107 raise ValueError(msg)
108 else:
109 # Set the problematic rows to an acceptable value and
ValueError: Found unknown categories ['BOS-CHS', 'ORD-JAC', 'LAS-OKC', 'VCT-IAH', 'CVG-EGE', 'PIT-PVD', 'BDL-SLC', 'TEX-PHX', 'LAX-LGA', 'LEX-LGA', 'CLE-SLC', 'KOA-SNA', 'SNA-HNL', 'MDW-SNA', 'MIA-SEA', 'MEM-RDU', 'YUM-IPL', 'SLC-KOA', 'EGE-EWR', 'MTJ-DFW', 'TPA-CHS', 'FLL-OAK', 'PVD-MCI', 'SLC-DSM', 'RSW-DEN', 'ORD-JAN', 'ATL-FSD', 'CHS-JAX', 'MCO-MLI', 'FSD-SLC', 'SLC-LGA', 'GRB-DFW', 'PNS-JAX', 'BDL-LAX', 'ATL-SOP', 'MSP-FAI', 'CLT-CAE', 'PIT-SEA', 'SRQ-IND', 'PHF-CLT', 'MIA-CMH', 'FAR-SLC', 'TUL-LAS', 'EWR-TUS', 'ORD-STT', 'CLT-TRI', 'BHM-CLE', 'ORD-PWM', 'SRQ-IAH', 'BOI-ORD', 'ATL-EGE', 'ATL-CID', 'IND-MSY', 'EGE-LAX', 'BUR-PDX', 'BTR-LGA', 'MIA-SLC', 'ONT-PDX', 'CLE-SBN', 'MSP-JAC', 'CMH-FLL', 'MEM-AUS', 'PHX-MFR', 'SJU-STL', 'ASE-SLC', 'CID-ATL', 'DFW-MLI', 'SCC-BRW', 'LGA-MSN', 'MCO-PFN', 'MDW-SJU', 'SEA-SIT', 'DTW-OMA', 'GRR-TPA', 'EGE-SFO', 'DFW-RST', 'GRR-LAS', 'TPA-TLH', 'PWM-CLT', 'TLH-MIA', 'PHF-FLL', 'SFO-EGE', 'SAT-STL', 'RSW-MKE', 'DTW-MSY', 'IAH-TXK', 'TLH-JFK', 'ATL-GUC', 'IAH-VCT', 'DEN-GRR', 'IND-SEA', 'PIE-MDW', 'BHM-IAD', 'IAD-BHM', 'BUR-MCO', 'MTJ-EWR', 'CLE-HOU', 'MSY-STL', 'DFW-SYR', 'BUF-LAS', 'LEX-EWR'] in column 0 during transform
Do you know what the problem is?
Thanks
The problem seems to be that the OneHotEncoder finds new categories in the validation sample that were not there in the training sample.
With its default handle_unknown='error', sklearn's implementation refuses to transform such rows, so you have to decide how to treat categories in the new data that are not in the training set. There are different strategies; experiment with what fits your data. Examples: make the OneHotEncoder aware of all possible categories, including those in the new data (using the categories argument of the constructor); drop the rows with new categories from the new data (by comparing them against the automatically learned categories_ attribute); or pass handle_unknown='ignore' so that unseen categories are encoded as all zeros. Of course, the first option does not make sense in production, but the others can always be implemented.
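For example, a minimal sketch of the handle_unknown approach, changing only the encoder line in the pipeline from the question:

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # rows with unseen categories are encoded as all zeros instead of raising
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

And a rough sketch of the dropping strategy, assuming X_valid and y_valid are pandas objects sharing an index, as in the question (X_valid_known is a name I made up; note this compares raw values, i.e. before the imputation step):

import pandas as pd

# reach into the fitted pipeline for the fitted OneHotEncoder
ohe = clf.named_steps['preprocess'].named_transformers_['cat'].named_steps['onehot']

# categories_ holds one array of learned categories per column,
# in the same order as categorical_features
mask = pd.Series(True, index=X_valid.index)
for col, cats in zip(categorical_features, ohe.categories_):
    mask &= X_valid[col].isin(cats)

X_valid_known = X_valid[mask]  # keep only rows whose categories were seen in training
print("Score: %f" % clf.score(X_valid_known, y_valid[mask]))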