Handling n-gram variables (such as SUBSTRING_4L_V3) in the ML pre-processing step has been giving me some issues. I'm able to transform and standardize the numerical, categorical, and n-gram variables separately:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
data = {
    'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
    'URBAN': ['urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban'],
    'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'justo'],
    'SUBSTRING_4L': [['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'], ['just', 'usto']],
    'SUBSTRING_4L_V2': [['jack'], ['just, uste'], [], [], ['jack'], [], ['phil'], ['just, usto']],
    'SUBSTRING_4L_V3': ['jack', 'just, uste', '', '', 'jack', '', 'phil', 'just, usto'],
    'SUBSTRING_5L': [[], ['juste'], [], [], [], [], [], ['justo']],
    'DISEASE': ['healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart', 'healthy', 'cancer'],
}
df = pd.DataFrame(data)
# Numerical variable: standardize AGE
def transform_numerical():
    x_train, x_test, y_train, y_test = train_test_split(
        df[['AGE']], df['DISEASE'], test_size=0.5, random_state=3)
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_trainT = scaler.transform(x_train)
    x_testT = scaler.transform(x_test)
    print(x_train)
    print(x_trainT)
    print()
    print(x_test)
    print(x_testT)
    print('/////////////////////////', '\n')

transform_numerical()
# Categorical variables: impute missing values, then one-hot encode
def transform_categorical():
    x_train, x_test, y_train, y_test = train_test_split(
        df[['URBAN', 'NAME']], df['DISEASE'], test_size=0.5, random_state=3)
    cat_imputer = SimpleImputer(strategy='constant', fill_value='')
    cat_imputer.fit(x_train)
    x_trainT = cat_imputer.transform(x_train)
    x_testT = cat_imputer.transform(x_test)
    encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
    encoder.fit(x_trainT)
    x_trainT = encoder.transform(x_trainT)
    x_testT = encoder.transform(x_testT)
    print(x_trainT.toarray())
    print(x_train)
    print()
    print(x_testT.toarray())
    print(x_test)
    print('/////////////////////////', '\n')

transform_categorical()
# N-gram variable: impute, flatten to 1-D with ravel(), then vectorize
def transform_list():
    x_train, x_test, y_train, y_test = train_test_split(
        df[['SUBSTRING_4L_V3']], df['DISEASE'], test_size=0.5, random_state=3)
    cat_imputer = SimpleImputer(strategy='constant', fill_value='')
    cat_imputer.fit(x_train)
    x_trainT = cat_imputer.transform(x_train)
    x_testT = cat_imputer.transform(x_test)
    x_trainT = x_trainT.ravel()
    x_testT = x_testT.ravel()
    count_vect = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None,
                                 stop_words=None, max_features=5000)
    x_trainT = count_vect.fit_transform(x_trainT)
    print(x_trainT.toarray())
    print('/////////////////////////', '\n')

transform_list()
For SUBSTRING_4L_V3, I need to flatten it via ravel() before applying the CountVectorizer().
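The ravel() is needed because SimpleImputer returns a 2-D array of shape (n_samples, 1), while CountVectorizer expects a 1-D iterable of strings. A minimal sketch of the mismatch, using a few values from the column above:

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer

docs = np.array([['jack'], ['just, uste'], ['']], dtype=object)

imputed = SimpleImputer(strategy='constant', fill_value='').fit_transform(docs)
print(imputed.shape)    # (3, 1) -- still a column; CountVectorizer would try to lowercase each row (an array) and fail

flat = imputed.ravel()  # shape (3,) -- one string per document
print(CountVectorizer().fit_transform(flat).toarray())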
However, I am not sure how to implement these steps sequentially in an ML pipeline; my attempt is below:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
class RavelTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self.ravel()
data = {
    'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
    'URBAN': ['urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban'],
    'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'justo'],
    'SUBSTRING_4L': [['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'], ['just', 'usto']],
    'SUBSTRING_4L_V2': [['jack'], ['just, uste'], [], [], ['jack'], [], ['phil'], ['just, usto']],
    'SUBSTRING_4L_V3': ['jack', 'just, uste', '', '', 'jack', '', 'phil', 'just, usto'],
    'SUBSTRING_5L': [[], ['juste'], [], [], [], [], [], ['justo']],
    'DISEASE': ['healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart', 'healthy', 'cancer'],
}
df = pd.DataFrame(data)
x_train, x_test, y_train, y_test = train_test_split(
    df[['AGE', 'NAME', 'URBAN', 'SUBSTRING_4L_V3']], df['DISEASE'], test_size=0.5, random_state=3)

transformer_num = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

transformer_cat = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])

transformer_ngram = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('ravel', RavelTransformer()),
    ('countvectorizer', CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None,
                                        stop_words=None, max_features=5000))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_num, ['AGE']),
        ('cat', transformer_cat, ['NAME', 'URBAN']),
        ('ngram', transformer_ngram, ['SUBSTRING_4L_V3']),
    ])

ml_algo = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=4000)
model = make_pipeline(preprocessor, ml_algo)
model.fit(x_train, y_train)
#print('Model score: %.3f' % model.score(x_test, y_test))
Error:
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'RavelTransformer()' (type <class '__main__.RavelTransformer'>) doesn't
What the error message is telling you is that there is no transform method in your RavelTransformer class.
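You can verify this directly: TransformerMixin supplies fit_transform (by chaining fit and transform), but the transform method itself must be defined by your class. A quick check against your original class:

rt = RavelTransformer()
print(hasattr(rt, 'fit'))        # True
print(hasattr(rt, 'transform'))  # False -- exactly what Pipeline is complaining about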
My assumption is that you want to do something like this:
class RavelTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.ravel()
Here, your RavelTransformer does nothing in the fit step, but transforms your data by raveling it, as expected.
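With transform defined, you can sanity-check the n-gram branch on its own and then fit the full model. A minimal check, assuming the df, transformer_ngram, preprocessor, ml_algo, and split from your question (note that CountVectorizer will still raise an empty-vocabulary error if every SUBSTRING_4L_V3 string in the training split happens to be empty):

ngrams = transformer_ngram.fit_transform(x_train[['SUBSTRING_4L_V3']])
print(ngrams.toarray())  # one row per sample, one column per 4-gram token

model = make_pipeline(preprocessor, ml_algo)
model.fit(x_train, y_train)  # no longer raises the TypeError
print('Model score: %.3f' % model.score(x_test, y_test))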