Search code examples
pythonpipelinecategorical-datadictvectorizer

Categorical variables in sklearn pipeline with DictVectorizer


I want to apply a pipeline with numeric & categorical variables as below

import numpy as np
import pandas as pd
from sklearn import linear_model,  pipeline, preprocessing
from sklearn.feature_extraction import DictVectorizer 

# Toy dataset: 'a' is the regression target, 'b' is numeric,
# 'c' and 'd' hold string categories.
df = pd.DataFrame(
    {
        'a': range(12),
        'b': [1, 2, 3, 1, 2, 3, 1, 2, 3, 3, 1, 2],
        'c': ['a', 'b', 'c'] * 4,
        'd': ['m', 'f'] * 6,
    }
)
# Target column and the feature frame (everything except the target).
y = df.a
X = df.loc[:, ['b', 'c', 'd']]

I create a boolean mask that marks the numeric columns:

# Names of the numeric feature columns.
numeric = ['b']
# Boolean mask over X's columns: True where the column is numeric.
numeric_indices = X.columns.isin(numeric)

and likewise for the categorical variables:

# Names of the categorical (string-valued) feature columns.
categorical = ['c', 'd']
# Boolean mask over X's columns: True where the column is categorical.
categorical_indices = X.columns.isin(categorical)

Then I create a pipeline:

# Final estimator, and the dict-based encoder used by the categorical branch below.
regressor = linear_model.SGDRegressor()
encoder = DictVectorizer(sparse = False)

# Two column-specific sub-pipelines are combined by a FeatureUnion and the
# combined features are then handed to the regressor.
# NOTE(review): both lambdas index with data[:, mask], which presumably assumes
# `data` arrives as a NumPy array rather than a DataFrame -- confirm.
estimator = pipeline.Pipeline(steps = [       
    ('feature_processing', pipeline.FeatureUnion(transformer_list = [        

            # numeric branch: keep the columns flagged in numeric_indices, then scale them
            ('numeric_variables_processing', pipeline.Pipeline(steps = [
                ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, numeric_indices])),
                # NOTE(review): with_mean is given the float 0. -- presumably meant as
                # with_mean=False (scale without centering); confirm.
                ('scaling', preprocessing.StandardScaler(with_mean = 0.))            
                        ])),

            # categorical branch: keep the columns flagged in categorical_indices,
            # then pass them to the DictVectorizer instance created above
            # NOTE(review): the selected slice holds raw strings, while DictVectorizer
            # is documented to take dict-like samples -- verify this step's input.
            ('categorical_variables_processing', pipeline.Pipeline(steps = [
                ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, categorical_indices])),
                ('DictVectorizer', encoder )           
                        ])),
        ])),
    ('model_fitting', regressor)
    ]
)

and when I fit it I get:

estimator.fit(X, y)
ValueError: could not convert string to float: 'f'

I know I have to apply encoder.fit() in the pipeline, but I don't understand how to do it. Or do we have to use preprocessing.OneHotEncoder()? But then we would again need to convert the strings to floats.

How to improve it?


Solution

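  • The only way I see is to encode the categorical columns with DictVectorizer first and then build the pipeline on the encoded frame: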

    import numpy as np
    import pandas as pd
    from sklearn import linear_model, metrics, pipeline, preprocessing
    # Toy data: 'a' is the target, 'b' is numeric, 'c' and 'd' are string categories.
    df = pd.DataFrame({'a':range(12), 'b':[1,2,3,1,2,3,1,2,3,3,1,2], 'c':['a', 'b', 'c']*4, 'd': ['m', 'f']*6})
    y = df.a
    num = df[['b']]
    cat = df[['c', 'd']]
    from sklearn.feature_extraction import DictVectorizer

    # One-hot encode the categorical columns *before* the pipeline: DictVectorizer
    # consumes one {column: value} mapping per row and emits a 'column=value'
    # indicator feature for every string value it sees.
    enc = DictVectorizer(sparse=False)
    enc_data = enc.fit_transform(cat.to_dict(orient='records'))

    # Use the fitted `feature_names_` attribute for the column labels;
    # the older get_feature_names() method was removed in scikit-learn 1.2.
    crat = pd.DataFrame(enc_data, columns=list(enc.feature_names_))
    X = pd.concat([crat, num], axis=1)

    # Take the encoded column names from the encoder output instead of hard-coding
    # them, so the masks stay correct if the categories in the data change.
    cat_columns = list(crat.columns)
    cat_indices = X.columns.isin(cat_columns)
    numeric_col = ['b']
    num_indices = X.columns.isin(numeric_col)

    reg = linear_model.SGDRegressor()
    # FunctionTransformer may hand the DataFrame through unchanged, and a DataFrame
    # does not support data[:, mask]; converting with np.asarray first makes the
    # boolean-mask column selection work for both DataFrame and ndarray input.
    estimator = pipeline.Pipeline(steps=[
        ('feature_processing', pipeline.FeatureUnion(transformer_list=[
                # already-encoded categorical indicators: passed through unchanged
                ('categorical', preprocessing.FunctionTransformer(lambda data: np.asarray(data)[:, cat_indices])),

                # numeric columns: selected, then standardized
                ('numeric', pipeline.Pipeline(steps=[
                    ('select', preprocessing.FunctionTransformer(lambda data: np.asarray(data)[:, num_indices])),
                    ('scale', preprocessing.StandardScaler())
                            ]))
            ])),
        ('model', reg)
        ]
    )
    estimator.fit(X, y)