Search code examples
pythonpandasscikit-learnpipeline

Custom function transformer not performing as expected - sklearn pipeline


I'm writing a custom transformer for a scikit-learn Pipeline. The transformer seems to work on it's own, and the fit() and transform() methods work individually, but when I include it in a pipeline, it raises an error stating:

AttributeError: 'NoneType' object has no attribute 'transform'

For reference, here is the code for my custom transformer:

class feature_union(TransformerMixin, BaseEstimator):
    
    def __init__(self):
        self.Xt = None
        self.PI2_categories = ['D3', 'D4', 'A6', 'A5', 'D1', 'D2', 'A8', 'B2', 'E1', 
                               'A1', 'A2', 'C1', 'C4', 'A7', 'C2', 'C3', 'A4', 'A3', 'B1']     
        
    def fit(self, X, y=None):
        
        product_columns = ['Product_Info_1', 'Product_Info_3', 'Product_Info_5', 'Product_Info_6', 'Product_Info_7'] + self.PI2_categories
        product_idx = [col for col in range(X.shape[1]) if X.columns[col] in product_columns]

        personal_columns = ['Ins_Age', 'Ht', 'Wt', 'BMI']
        personal_idx = [col for col in range(X.shape[1]) if X.columns[col] in personal_columns]

        medical_hist_columns = ["Medical_History_{}".format(x) for x in range(1, 42, 1)]
        medical_hist_idx = [col for col in range(X.shape[1]) if X.columns[col] in medical_hist_columns]

        family_hist_columns = ["Family_Hist_{}".format(x) for x in range(1, 6, 1)]
        family_hist_idx = [col for col in range(X.shape[1]) if X.columns[col] in family_hist_columns]

        insured_info_columns = ["InsuredInfo_{}".format(x) for x in range(1, 8, 1)]
        insured_info_idx = [col for col in range(X.shape[1]) if X.columns[col] in insured_info_columns]

        insurance_hist_columns = ["Insurance_History_{}".format(x) for x in range(1, 10, 1)]
        insurance_hist_idx = [col for col in range(X.shape[1]) if X.columns[col] in insurance_hist_columns]

        employment_info_columns = ["Employment_Info_{}".format(x) for x in range(1, 7, 1)]
        employment_info_idx = [col for col in range(X.shape[1]) if X.columns[col] in employment_info_columns]

        medical_keyword_columns = ["Medical_Keyword_{}".format(x) for x in range(1, 49, 1)]
        medical_keyword_idx = [col for col in range(X.shape[1]) if X.columns[col] in medical_keyword_columns]

        medical_keyword_columns = ["Medical_Keyword_{}".format(x) for x in range(1, 49, 1)]
        medical_keyword_idx = [col for col in range(X.shape[1]) if X.columns[col] in medical_keyword_columns]

        get_original_features = lambda X: X
        get_product_columns  = lambda X: X[:, product_idx]
        get_personal_columns = lambda X: X[:, personal_idx]
        get_medical_hist_columns = lambda X: X[:, medical_hist_idx]
        get_family_hist_columns = lambda X: X[:, family_hist_idx]
        get_insured_info_columns = lambda X: X[:, insured_info_idx]
        get_insurance_hist_columns = lambda X: X[:, insurance_hist_idx]
        get_employment_info_columns = lambda X: X[:, employment_info_idx]
        get_medical_keyword_columns = lambda X: X[:, medical_keyword_idx]


        get_medical_and_family = lambda X: X[:, medical_keyword_idx + medical_hist_idx + family_hist_idx]


        union = FeatureUnion([
            ("original_features", FunctionTransformer(get_original_features)),

            ("product_interaction", Pipeline([('select_product', FunctionTransformer(get_product_columns)),
                                              ('product_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
                                            ])),

            ("personal_interaction", Pipeline([('select_personal', FunctionTransformer(get_personal_columns)),
                                              ('personal_interaction', PolynomialFeatures(4, include_bias=False, interaction_only=True))
                                            ])),

            ("medical_hist_interaction", Pipeline([('select_medical', FunctionTransformer(get_medical_hist_columns)),
                                                   ('medical_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
                                                 ])),

            ("family_hist_interaction", Pipeline([('select_family_hist', FunctionTransformer(get_family_hist_columns)),
                                                  ('family_hist_interaction', PolynomialFeatures(5, include_bias=False, interaction_only=True))
                                                ])),

            ("insured_info_interaction", Pipeline([('select_insured_info', FunctionTransformer(get_insured_info_columns)),
                                                   ('insured_info_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
                                                 ])),

            ("insurance_hist_interaction", Pipeline([('select_insurance_hist', FunctionTransformer(get_insurance_hist_columns)),
                                                   ('insurance_hist_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
                                                   ])),

            ("employment_info_interaction", Pipeline([('select_employment_info', FunctionTransformer(get_employment_info_columns)),
                                                    ('employment_info_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
                                                    ])),

            ("medical_keyword_interaction", Pipeline([('select_medical_keyword', FunctionTransformer(get_medical_keyword_columns)),
                                                    ('medical_keyword_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
                                                    ])),

            ])
        
        Xt = union.fit_transform(X)
        
        return self.Xt
    
    def transform(self, X, y=None):
        Xt = self.Xt
        return Xt

And when I use it in a pipeline like this:

pipeline_feat_union = Pipeline([('preprocess', preprocess()),
                                ('feat_union', feature_union()),
                                ('classifier', GaussianNB())])

It raises the following error:

AttributeError: 'NoneType' object has no attribute 'transform'

Solution

  • I ran into the same problem. The GuassianNB() class doesn't have a transform method defined.

    But you don't need to use the transform method at all if you are including your classifier in the pipeline. The only two methods that you need are the fit method and the predict method.

    
    pipeline_feat_union.fit(X_train, y_train)
    pipeline_feat_union.predict(X_train)