Search code examples
Tags: class, python-3.x, pipeline

Error in the class to create pipeline


I am trying to create a new variable 'age' from two columns, 'birth' and 'date_survey'.

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import linear_model, pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

My DataFrame:

# Toy survey data: a numeric target 'a', the survey date as a
# day.month.year string, and the year of birth as a string.
df = pd.DataFrame(
    {
        'a': [1, 2, 3],
        'date_survey': ['10.01.2013', '20.02.2014', '30.03.2015'],
        'birth': ['1985', '1984', '1986'],
    }
)

The code for the pipeline:

# Feature columns handed to the pipeline, and the regression target.
# NOTE: X is a column selection taken from df; Age.transform below
# assigns back into X['date_survey'].
X = df[['date_survey', 'birth']]
y = df['a']
class MultiColumn:
    """Pipeline step that keeps only a chosen set of columns.

    ``columns`` holds the labels that ``transform`` uses to index ``X``.
    """

    def __init__(self, columns=None):
        # Labels of the columns to pick out of the input.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return the instance itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured column labels."""
        wanted = self.columns
        return X[wanted]
class Age(TransformerMixin):
    """Transformer meant to derive an age from 'date_survey' and 'birth'.

    NOTE(review): as written, ``transform`` subtracts a one-column
    DataFrame from a Series. pandas aligns the Series index with the
    DataFrame's columns in that case, so the result is presumably all
    NaN, which matches the ValueError reported for this pipeline.
    """

    def transform(self, X, y=None, **fit_params): 
        # Overwrites the 'date_survey' column of the caller's frame in place.
        # No dayfirst/format is given, so the day.month.year strings are
        # left to pandas' default parsing.
        X['date_survey'] = pd.to_datetime(X['date_survey'])
        # Wrapping the result in pd.DataFrame makes `year` a one-column
        # frame (column 'date_survey') rather than a Series.
        year = pd.DataFrame(X['date_survey'].apply(lambda x: x.year))
        # Series minus DataFrame, and in the order birth - year.
        # NOTE(review): convert_objects is deprecated and absent from
        # recent pandas releases; confirm against the installed version.
        age = X['birth'].convert_objects(convert_numeric=True) - year
        return age

    def fit(self, X, y=None, **fit_params):
        # Stateless: nothing to fit.
        return self
# Final estimator of the pipeline.
regressor = linear_model.SGDRegressor()
# NOTE: this assignment rebinds the name `pipeline`, which the imports
# at the top also bind to the sklearn.pipeline module.
pipeline = Pipeline([
          ('union', FeatureUnion(
        transformer_list=[    
             # 'age' feature: select the two source columns, then run Age on them
            ('age', Pipeline([
                ('selector', MultiColumn(columns=['date_survey', 'birth'])),
                ('date', Age())

            ])),
        ])),
    # Regressor fitted on the output of the feature union
    ('model_fitting', regressor),
])
# This call is where the ValueError quoted below is raised.
pipeline.fit(X, y)

and I get an error:

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

I guess the error is in the class Age, but I can't work out how to fix it.


Solution

  •   date_survey birth date_survey_in_transform  year
    0  10.01.2013  1985               2013-10-01  2013
    1  20.02.2014  1984               2014-02-20  2014
    2  30.03.2015  1986               2015-03-30  2015
    

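    `birth - year` is negative (e.g. 1985 - 2013 = -28); the age should be `year - birth`. In addition, `year` is a DataFrame while `X['birth']` is a Series, so the subtraction aligns the Series index with the DataFrame's columns and produces only NaN values, which is what triggers the ValueError.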

    age = X['birth'].convert_objects(convert_numeric=True) - year
    

    I modified some of your code to get it to run without errors.

    import numpy as np
    import pandas as pd
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn import linear_model, pipeline
    from sklearn.pipeline import FeatureUnion
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import SGDRegressor
    
    # Toy survey data: a numeric target 'a', the survey date as a
    # day.month.year string, and the year of birth as a string.
    df = pd.DataFrame(
        {
            'a': [1, 2, 3],
            'date_survey': ['10.01.2013', '20.02.2014', '30.03.2015'],
            'birth': ['1985', '1984', '1986'],
        }
    )

    # Feature columns for the pipeline and the regression target.
    X = df[['date_survey', 'birth']]
    y = df['a']
    class MultiColumn:
        """Column selector usable as a pipeline step.

        ``columns`` holds the labels that ``transform`` uses to index ``X``.
        """

        def __init__(self, columns=None):
            # Labels of the columns to pick out of the input.
            self.columns = columns

        def fit(self, X, y=None):
            """Stateless step: return the instance unchanged."""
            return self

        def transform(self, X):
            """Return ``X`` restricted to the configured column labels."""
            selected = X[self.columns]
            return selected
    
    class Age(TransformerMixin):
        """Derive age at survey time from 'date_survey' and 'birth'.

        Expects ``X`` to provide a 'date_survey' column of day-first date
        strings (e.g. '20.02.2014') and a 'birth' column holding the year
        of birth. The input frame is not modified.
        """

        def fit(self, X, y=None, **fit_params):
            """Nothing to learn; return the transformer itself."""
            return self

        def transform(self, X, y=None, **fit_params):
            """Return the ages as a 2-D array of shape (n_samples, 1)."""
            # The survey dates are day.month.year; without dayfirst=True,
            # '10.01.2013' is read month-first and '20.02.2014' cannot be
            # parsed consistently with it.
            survey_year = pd.to_datetime(X['date_survey'], dayfirst=True).dt.year
            birth_year = X['birth'].astype('int64')
            # Work on local Series instead of adding helper columns to X,
            # so the caller's (possibly sliced) frame stays untouched.
            age = survey_year - birth_year
            # Series has no reshape(); go through the underlying ndarray to
            # get the column vector shape that scikit-learn expects.
            return age.values.reshape(-1, 1)
    
    # Feature extraction followed by a regressor. The union currently
    # holds a single branch, which produces the 'age' feature.
    pipeline = Pipeline(steps=[
        (
            'union',
            FeatureUnion(transformer_list=[
                # age: pick the two source columns, then compute the age
                (
                    'age',
                    Pipeline(steps=[
                        ('selector', MultiColumn(columns=['date_survey', 'birth'])),
                        ('date', Age()),
                    ]),
                ),
            ]),
        ),
        # Regression model fitted on the extracted features
        ('model_fitting', SGDRegressor()),
    ])

    pipeline.fit(X, y)