I am trying to perform multiple column transformations using OneHotEncoder()
and TfidfVectorizer()
on my training data, which is a NumPy array. I am using make_column_transformer()
to perform all transformations at once. X_train is my input data.
Input Data
print(X_train.shape)
>>> (75117, 6)
Sample instance
print(X_train[5,:])
>>> ['electrical_contractor_license-electrical_contractor_license-general_contractor_license-refrigeration_contractor_lic.'
'brennan_heating_company_inc' 'instal new electr boiler'
'single_family_/_duplex' 0.0 0]
Column Transformation code
column_trans = make_column_transformer(
(OneHotEncoder(sparse=False, handle_unknown='ignore'), [0, 1, 3]),
(TfidfVectorizer(min_df=1, stop_words='english', lowercase=False), [2]),
remainder='passthrough')
z = column_trans.fit_transform(X_train)
Using the above code, OneHotEncoder()
works fine on columns (0, 1, 3),
but when I add TfidfVectorizer()
for column 2,
it throws the following error.
TypeError: cannot use a string pattern on a bytes-like object
Full Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-1167-68498e1c856a> in <module>
4 remainder='passthrough')
5
----> 6 z = column_trans.fit_transform(X_train)
7 print(z[0,:].shape)
8 print(z[0,:])
/opt/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
516 self._validate_remainder(X)
517
--> 518 result = self._fit_transform(X, y, _fit_transform_one)
519
520 if not result:
/opt/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
455 message=self._log_message(name, idx, len(transformers)))
456 for idx, (name, trans, column, weight) in enumerate(
--> 457 self._iter(fitted=fitted, replace_strings=True), 1))
458 except ValueError as e:
459 if "Expected 2D array, got 1D array instead" in str(e):
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1005 self._iterating = self._original_iterator is not None
1006
-> 1007 while self.dispatch_one_batch(iterator):
1008 pass
1009
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
726 with _print_elapsed_time(message_clsname, message):
727 if hasattr(transformer, 'fit_transform'):
--> 728 res = transformer.fit_transform(X, y, **fit_params)
729 else:
730 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
1857 """
1858 self._check_params()
-> 1859 X = super().fit_transform(raw_documents)
1860 self._tfidf.fit(X)
1861 # X is already a transformed view of raw_documents so
/opt/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
1218
1219 vocabulary, X = self._count_vocab(raw_documents,
-> 1220 self.fixed_vocabulary_)
1221
1222 if self.binary:
/opt/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
1129 for doc in raw_documents:
1130 feature_counter = {}
-> 1131 for feature in analyze(doc):
1132 try:
1133 feature_idx = vocabulary[feature]
/opt/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
103 doc = preprocessor(doc)
104 if tokenizer is not None:
--> 105 doc = tokenizer(doc)
106 if ngrams is not None:
107 if stop_words is not None:
TypeError: cannot use a string pattern on a bytes-like object
It does work when I use it outside of make_column_transformer()
. The reason I am using make_column_transformer()
rather than applying the transformations separately is that, if I do one-hot encoding
first and then tf-idf
, the number of features generated by the one-hot encoder might vary, so hard-coding the column index for tf-idf would not be a good idea.
tf = TfidfVectorizer(min_df=1, stop_words='english')
n = tf.fit_transform(X_train[:,2])
n.toarray()
>>> array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
I had the same problem. The solution is to replace:
(TfidfVectorizer(min_df=1, stop_words='english', lowercase=False), [2]),
by
(TfidfVectorizer(min_df=1, stop_words='english', lowercase=False), 2),
Otherwise the TfidfVectorizer, upon calling fit
, receives 2D data (of shape (#documents, 1)
), which it can't handle.
It is explained in ColumnTransformer
documentation:
columns: str, array-like of str, int, array-like of int, array-like of bool, slice or callable
Indexes the data on its second axis. Integers are interpreted as positional columns, while strings can reference DataFrame columns by name. A scalar string or int should be used where transformer expects X to be a 1d array-like (vector), otherwise a 2d array will be passed to the transformer. [...]