Search code examples
python-3.x machine-learning scikit-learn countvectorizer

How do I add a custom intermediate preprocessor to a machine learning pipeline that handles n-gram columns in scikit-learn?


Handling n-gram variables (such as SUBSTRING_4L_V3) in the ML pre-processing step has been giving me some issues.

I'm able to transform and standardize numerical, categorical, and n-gram variables separately:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer

# Toy dataset, one row per person. DISEASE is used as the target by the
# functions below; the other columns are candidate features.
data = {
    # Numeric feature with missing values (np.nan).
    'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
    # Categorical features, each containing one missing entry.
    'URBAN': ['urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban'],
    'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'justo'],
    # Three encodings of what appear to be the 4-letter substrings of NAME:
    #   SUBSTRING_4L    - a list with one string per substring
    #   SUBSTRING_4L_V2 - a list holding a single comma-separated string
    #   SUBSTRING_4L_V3 - a plain comma-separated string ('' when there are none)
    'SUBSTRING_4L': [['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'], ['just', 'usto']],
    'SUBSTRING_4L_V2': [['jack'], ['just, uste'], [], [], ['jack'], [], ['phil'], ['just, usto']],
    'SUBSTRING_4L_V3': ['jack', 'just, uste', '', '', 'jack', '', 'phil', 'just, usto'],
    # 5-letter counterpart, stored as a list per row.
    'SUBSTRING_5L': [[], ['juste'], [], [], [], [], [], ['justo']],
    # Target label.
    'DISEASE': ['healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart', 'healthy', 'cancer'],
    }

# Tabular view of the toy data; the transform_* functions below read this frame.
df = pd.DataFrame(data)

def transform_numerical():
    """Standardize the AGE column and print each split before and after scaling.

    Splits ``df`` in half (``random_state=3``), fits a ``StandardScaler`` on
    the training half only, and applies the fitted scaler to both halves.
    """
    features = df[['AGE']]
    target = df['DISEASE']
    x_train, x_test, _, _ = train_test_split(
        features, target, test_size=0.5, random_state=3)

    # Fit on the training rows only, then reuse the fitted scaler for the test rows.
    scaler = preprocessing.StandardScaler()
    scaler.fit(x_train)
    scaled_train = scaler.transform(x_train)
    scaled_test = scaler.transform(x_test)

    # Raw frame first, scaled array second, for each split.
    print(x_train, scaled_train, sep='\n')
    print()
    print(x_test, scaled_test, sep='\n')
    print('/////////////////////////', '\n')


transform_numerical()

def transform_categorical():
    """One-hot encode URBAN and NAME and print the encoded and raw splits.

    Splits ``df`` in half (``random_state=3``), replaces missing categories
    with the empty string, then fits a ``OneHotEncoder`` on the imputed
    training half and applies it to both halves.
    """
    x_train, x_test, _, _ = train_test_split(
        df[['URBAN', 'NAME']], df['DISEASE'], test_size=0.5, random_state=3)

    # Missing categories become '' before encoding.
    imputer = SimpleImputer(strategy='constant', fill_value='').fit(x_train)
    filled_train = imputer.transform(x_train)
    filled_test = imputer.transform(x_test)

    # handle_unknown='ignore': categories not seen during fit do not raise.
    encoder = preprocessing.OneHotEncoder(handle_unknown='ignore').fit(filled_train)
    encoded_train = encoder.transform(filled_train)
    encoded_test = encoder.transform(filled_test)

    # Dense encoded matrix first, raw frame second, for each split.
    print(encoded_train.toarray(), x_train, sep='\n')
    print()
    print(encoded_test.toarray(), x_test, sep='\n')
    print('/////////////////////////', '\n')


transform_categorical()

def transform_list():
    """Count-vectorize SUBSTRING_4L_V3 for the training split and print the counts.

    Splits ``df`` in half (``random_state=3``), imputes missing values with
    the empty string, flattens each split to 1-D, and fits a
    ``CountVectorizer`` on the training half.
    """
    x_train, x_test, _, _ = train_test_split(
        df[['SUBSTRING_4L_V3']], df['DISEASE'], test_size=0.5, random_state=3)

    imputer = SimpleImputer(strategy='constant', fill_value='').fit(x_train)
    # Each imputed split is flattened to 1-D before it is handed to the vectorizer.
    docs_train = imputer.transform(x_train).ravel()
    docs_test = imputer.transform(x_test).ravel()  # prepared, but not vectorized here

    vectorizer = CountVectorizer(
        analyzer="word",
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        max_features=5000,
    )
    counts_train = vectorizer.fit_transform(docs_train)

    print(counts_train.toarray())
    print('/////////////////////////', '\n')


transform_list()

For SUBSTRING_4L_V3, I need to flatten it via ravel() before applying the CountVectorizer().

However, I am not sure how to chain these steps sequentially in an ML pipeline. My attempt is below:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression

class RavelTransformer(BaseEstimator, TransformerMixin):
    """Custom pipeline step meant to flatten the imputed n-gram column.

    NOTE(review): as written, this class body defines only ``__init__`` and
    ``fit``; it contains no ``transform`` method.
    """

    def __init__(self):
        # No hyperparameters to store.
        pass

    def fit(self, X, y=None):
        # NOTE(review): this returns ``self.ravel()`` rather than ``self``, and
        # no ``ravel`` method is defined in this class. Presumably the intent
        # was to call ``X.ravel()`` from a ``transform`` method -- TODO confirm.
        return self.ravel()

# Toy dataset, one row per person. DISEASE is used as the target by the
# train/test split below; the other columns are candidate features.
data = {
    # Numeric feature with missing values (np.nan).
    'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
    # Categorical features, each containing one missing entry.
    'URBAN': ['urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban'],
    'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'justo'],
    # Three encodings of what appear to be the 4-letter substrings of NAME:
    #   SUBSTRING_4L    - a list with one string per substring
    #   SUBSTRING_4L_V2 - a list holding a single comma-separated string
    #   SUBSTRING_4L_V3 - a plain comma-separated string ('' when there are none)
    'SUBSTRING_4L': [['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'], ['just', 'usto']],
    'SUBSTRING_4L_V2': [['jack'], ['just, uste'], [], [], ['jack'], [], ['phil'], ['just, usto']],
    'SUBSTRING_4L_V3': ['jack', 'just, uste', '', '', 'jack', '', 'phil', 'just, usto'],
    # 5-letter counterpart, stored as a list per row.
    'SUBSTRING_5L': [[], ['juste'], [], [], [], [], [], ['justo']],
    # Target label.
    'DISEASE': ['healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart', 'healthy', 'cancer'],
    }

# Tabular view of the toy data; the train/test split below reads this frame.
df = pd.DataFrame(data)

# Hold out half of the rows for testing; DISEASE is the target column.
x_train, x_test, y_train, y_test = train_test_split(
    df[['AGE', 'NAME', 'URBAN', 'SUBSTRING_4L_V3']], df['DISEASE'], test_size=0.5, random_state=3)

# Numeric branch: fill missing values with the median, then standardize.
transformer_num = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical branch: missing values become '', then the columns are one-hot encoded.
transformer_cat = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])

# N-gram branch: impute, pass through the custom RavelTransformer step,
# then count word tokens with CountVectorizer.
transformer_ngram = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('ravel', RavelTransformer()),
    ('countvectorizer', CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None, stop_words=None, 
        max_features=5000))])

# Route each group of columns to its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_num, ['AGE']),
        ('cat', transformer_cat, ['NAME', 'URBAN']),
        ('ngram', transformer_ngram, ['SUBSTRING_4L_V3']),
        ])

# Full model: the preprocessing above followed by logistic regression.
ml_algo = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=4000)
model = make_pipeline(preprocessor, ml_algo)
model.fit(x_train, y_train)
#print('Model score: %.3f' % model.score(x_test, y_test))

Error:

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'RavelTransformer()' (type <class '__main__.RavelTransformer'>) doesn't

Solution

  • What the error message is telling you is that your RavelTransformer class has no transform method. (Its fit method should also return self rather than self.ravel().)

    My assumption is that you want to do something like this:

    class RavelTransformer(BaseEstimator, TransformerMixin):
        """Stateless pipeline step that flattens its input to a 1-D array.

        Useful between a transformer that outputs a 2-D, single-column array
        and one that expects a flat sequence of strings (such as
        ``CountVectorizer``). There are no hyperparameters, so no
        ``__init__`` is needed.
        """

        def fit(self, X, y=None):
            """Learn nothing; return ``self`` so the step can be used in a Pipeline."""
            return self

        def transform(self, X, y=None):
            """Return ``X`` flattened to one dimension.

            ``np.asarray`` is applied first so that inputs without a
            ``ravel`` method (for example a pandas DataFrame) are accepted
            as well as NumPy arrays.
            """
            return np.asarray(X).ravel()
    

    Here, your RavelTransformer does nothing in the fit step, but transforms your data by raveling it, as expected.