I want to build a scikit-learn pipeline that handles both numeric and categorical variables, as shown below:
import numpy as np
import pandas as pd
from sklearn import linear_model, pipeline, preprocessing
from sklearn.feature_extraction import DictVectorizer
# Toy dataset: 'a' is the regression target, 'b' is numeric, and 'c'/'d' hold
# string categories.
df = pd.DataFrame({
    'a': range(12),
    'b': [1, 2, 3, 1, 2, 3, 1, 2, 3, 3, 1, 2],
    'c': ['a', 'b', 'c'] * 4,
    'd': ['m', 'f'] * 6,
})
# Feature matrix first, then the target series.
feature_columns = ['b', 'c', 'd']
X = df[feature_columns]
y = df['a']
I create a boolean mask that marks the numeric columns:
# Boolean mask over X's columns: True where the column name is listed as numeric.
numeric = ['b']
numeric_indices = X.columns.isin(numeric)
and another one for the categorical columns:
# Boolean mask over X's columns: True where the column name is listed as categorical.
categorical = ['c', 'd']
categorical_indices = X.columns.isin(categorical)
Then I create a pipeline:
# Encoder for the categorical branch and the final regression model.
encoder = DictVectorizer(sparse=False)
regressor = linear_model.SGDRegressor()

# Numeric branch: keep the columns flagged by numeric_indices, then scale them.
# with_mean is given a falsy 0., so presumably centering is disabled -- confirm
# against the StandardScaler documentation.
_numeric_branch = pipeline.Pipeline(steps=[
    ('selecting', preprocessing.FunctionTransformer(
        lambda data: data[:, numeric_indices])),
    ('scaling', preprocessing.StandardScaler(with_mean=0.)),
])

# Categorical branch: keep the columns flagged by categorical_indices, then
# hand them to the DictVectorizer.
# NOTE(review): the selector uses NumPy-style `data[:, mask]` indexing and the
# vectorizer receives that slice directly, whereas DictVectorizer is documented
# to consume mappings -- likely related to the failure on fit; confirm.
_categorical_branch = pipeline.Pipeline(steps=[
    ('selecting', preprocessing.FunctionTransformer(
        lambda data: data[:, categorical_indices])),
    ('DictVectorizer', encoder),
])

# Both branches see the same input; their outputs are combined by FeatureUnion
# and fed to the regressor.
_feature_processing = pipeline.FeatureUnion(transformer_list=[
    # numeric
    ('numeric_variables_processing', _numeric_branch),
    # categorical
    ('categorical_variables_processing', _categorical_branch),
])

estimator = pipeline.Pipeline(steps=[
    ('feature_processing', _feature_processing),
    ('model_fitting', regressor),
])
When I fit it, I get the following error:
# Fit the whole pipeline on the feature frame X (columns 'b', 'c', 'd') and the
# target y; this is the call that produces the ValueError shown below.
estimator.fit(X, y)
ValueError: could not convert string to float: 'f'
I know I have to call encoder.fit() somewhere in the pipeline, but I don't understand how to apply it. Or do we have to use preprocessing.OneHotEncoder()? But then again we would need to convert the strings to floats first.
How can I improve it?
The only way I can see is this:
import numpy as np
import pandas as pd
from sklearn import linear_model, metrics, pipeline, preprocessing
# Toy dataset: 'a' is the target, 'b' is the numeric feature, 'c' and 'd' are
# string-valued categorical features.
df = pd.DataFrame({
    'a': range(12),
    'b': [1, 2, 3, 1, 2, 3, 1, 2, 3, 3, 1, 2],
    'c': ['a', 'b', 'c'] * 4,
    'd': ['m', 'f'] * 6,
})
# Split the frame into the numeric part, the categorical part and the target.
num = df[['b']]
cat = df[['c', 'd']]
y = df['a']
from sklearn.feature_extraction import DictVectorizer
# Encode the categorical frame up front: each row is turned into a
# {column: value} dict and vectorized into a dense array.
enc = DictVectorizer(sparse=False)
enc_data = enc.fit_transform(cat.T.to_dict().values())
# Use the fitted `feature_names_` attribute for the column labels;
# `get_feature_names()` was deprecated and later removed from scikit-learn,
# while `feature_names_` is available across versions.
crat = pd.DataFrame(enc_data, columns=enc.feature_names_)
# Final feature frame: encoded categorical columns followed by the numeric ones.
X = pd.concat([crat, num], axis=1)
# Take the encoded column names from the frame the vectorizer output was put
# into, rather than hard-coding them, so the mask stays correct if the set of
# categories in the data changes.
cat_columns = list(crat.columns)
# Boolean mask over X's columns: True for the encoded categorical columns.
cat_indices = X.columns.isin(cat_columns)
# Boolean mask over X's columns: True for the numeric columns.
numeric_col = ['b']
num_indices = X.columns.isin(numeric_col)
# Regression model placed at the end of the pipeline.
reg = linear_model.SGDRegressor()

# The selectors below use NumPy-style `data[:, mask]` indexing, which does not
# work on a DataFrame. `validate=True` makes FunctionTransformer convert its
# input to a 2-D array first, so the pipeline behaves the same whether or not
# the installed scikit-learn validates by default.
estimator = pipeline.Pipeline(steps=[
    ('feature_processing', pipeline.FeatureUnion(transformer_list=[
        # categorical: the already-encoded columns are passed through unscaled
        ('categorical', preprocessing.FunctionTransformer(
            lambda data: data[:, cat_indices], validate=True)),
        # numeric: select the numeric columns, then standardize them
        ('numeric', pipeline.Pipeline(steps=[
            ('select', preprocessing.FunctionTransformer(
                lambda data: data[:, num_indices], validate=True)),
            ('scale', preprocessing.StandardScaler()),
        ])),
    ])),
    ('model', reg),
])

# Fit feature processing and the regressor in one call.
estimator.fit(X, y)