python pipeline categorical-data dictvectorizer

Categorical variables in sklearn pipeline with DictVectorizer

I want to build a pipeline that handles both numeric and categorical variables, as below:

import numpy as np
import pandas as pd
from sklearn import linear_model,  pipeline, preprocessing
from sklearn.feature_extraction import DictVectorizer 

# Toy data set: 'a' is the regression target, 'b' is numeric,
# 'c' and 'd' hold string-valued categories.
df = pd.DataFrame({
    'a': range(12),
    'b': [1, 2, 3, 1, 2, 3, 1, 2, 3, 3, 1, 2],
    'c': ['a', 'b', 'c'] * 4,
    'd': ['m', 'f'] * 6,
})

# Split into the feature frame and the target series.
feature_names = ['b', 'c', 'd']
X = df[feature_names]
y = df['a']

I create a boolean index mask for the numeric variables

# Names of the numeric feature columns.
numeric = ['b']
# Boolean mask over X's columns: True where the column is numeric.
numeric_indices = X.columns.isin(numeric)

and another one for the categorical variables

# Names of the string-valued (categorical) feature columns.
categorical = ['c', 'd']
# Boolean mask over X's columns: True where the column is categorical.
categorical_indices = X.columns.isin(categorical)

Then I create a pipeline

# Encoder for the categorical branch and the final regression model.
encoder = DictVectorizer(sparse = False)
regressor = linear_model.SGDRegressor()


def _select_numeric(data):
    """Keep only the columns flagged in ``numeric_indices``."""
    return data[:, numeric_indices]


def _select_categorical(data):
    """Keep only the columns flagged in ``categorical_indices``."""
    return data[:, categorical_indices]


# Numeric branch: pick the numeric columns, then scale them (no centering).
_numeric_branch = pipeline.Pipeline(steps = [
    ('selecting', preprocessing.FunctionTransformer(_select_numeric)),
    ('scaling', preprocessing.StandardScaler(with_mean = 0.)),
])

# Categorical branch: pick the categorical columns, then hand them to the
# DictVectorizer.
_categorical_branch = pipeline.Pipeline(steps = [
    ('selecting', preprocessing.FunctionTransformer(_select_categorical)),
    ('DictVectorizer', encoder),
])

# Both branches run on the same input; their outputs are concatenated
# column-wise and passed to the regressor.
_feature_processing = pipeline.FeatureUnion(transformer_list = [
    ('numeric_variables_processing', _numeric_branch),
    ('categorical_variables_processing', _categorical_branch),
])

estimator = pipeline.Pipeline(steps = [
    ('feature_processing', _feature_processing),
    ('model_fitting', regressor),
])

and when I fit it I get

estimator.fit(X, y)
ValueError: could not convert string to float: 'f'

I know I have to apply encoder.fit() in the pipeline, but I don't understand how to do that. Alternatively, we have to use preprocessing.OneHotEncoder(), but then again we need to convert the strings to floats.

How to improve it?

Solution

I see only this way:

import numpy as np
import pandas as pd
from sklearn import linear_model, metrics, pipeline, preprocessing
from sklearn.feature_extraction import DictVectorizer

# Toy data set: 'a' is the regression target, 'b' is numeric,
# 'c' and 'd' hold string-valued categories.
df = pd.DataFrame({
    'a': range(12),
    'b': [1, 2, 3, 1, 2, 3, 1, 2, 3, 3, 1, 2],
    'c': ['a', 'b', 'c'] * 4,
    'd': ['m', 'f'] * 6,
})
y = df.a
num = df[['b']]
cat = df[['c', 'd']]

# One-hot encode the string columns *before* building the pipeline.
# DictVectorizer expects one {column: value} mapping per row, which is
# exactly what orient='records' produces.
enc = DictVectorizer(sparse = False)
enc_data = enc.fit_transform(cat.to_dict(orient='records'))

# Take the encoded column names from the fitted encoder instead of
# hard-coding them. get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); fall back for older releases.
if hasattr(enc, 'get_feature_names_out'):
    cat_columns = list(enc.get_feature_names_out())
else:
    cat_columns = list(enc.get_feature_names())

crat = pd.DataFrame(enc_data, columns=cat_columns)
X = pd.concat([crat, num], axis=1)

# Boolean masks over X's columns for the encoded-categorical and the
# numeric features.
cat_indices = X.columns.isin(cat_columns)
numeric_col = ['b']
num_indices = X.columns.isin(numeric_col)

reg = linear_model.SGDRegressor()

# np.asarray() in the selectors: depending on the scikit-learn version,
# FunctionTransformer may pass the DataFrame through unvalidated, and a
# DataFrame does not support positional ``data[:, mask]`` indexing.
estimator = pipeline.Pipeline(steps = [
    ('feature_processing', pipeline.FeatureUnion(transformer_list = [
            # categorical: already one-hot encoded above, only selected here
            ('categorical', preprocessing.FunctionTransformer(
                lambda data: np.asarray(data)[:, cat_indices])),

            # numeric: select, then standardise
            ('numeric', pipeline.Pipeline(steps = [
                ('select', preprocessing.FunctionTransformer(
                    lambda data: np.asarray(data)[:, num_indices])),
                ('scale', preprocessing.StandardScaler()),
            ])),
        ])),
    ('model', reg),
    ]
)
estimator.fit(X, y)