Search code examples
Tags: scikit-learn, data-science, transform, pipeline

FeatureTransform from sklearn properly used


I am trying to pass an email to my Pipeline and output a probability based on the training. To do that, I wrote a set of transformers that each extract one feature from the email, like this:

from collections import Counter

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class EmailLengthTransformer(BaseEstimator, TransformerMixin):
    """Feature: length of the part of each email before the first "@".

    Every element of ``X`` is indexed with ``[0]`` to obtain the email
    string, so each sample is expected to be a row whose first entry is the
    email (a 2-D input).
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return an ``(n_samples, 1)`` array of local-part lengths."""
        lengths = []
        for row in X:
            local_part = row[0].split("@")[0]
            lengths.append(len(local_part))
        return np.array(lengths).reshape(-1, 1)


class DomainLengthTransformer(BaseEstimator, TransformerMixin):
    """Feature: length of the part of each email after the last "@".

    Every element of ``X`` is indexed with ``[0]`` to obtain the email
    string, so each sample is expected to be a row whose first entry is the
    email (a 2-D input).
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return an ``(n_samples, 1)`` array of domain lengths."""
        lengths = []
        for row in X:
            domain = row[0].split("@")[-1]
            lengths.append(len(domain))
        return np.array(lengths).reshape(-1, 1)


class NumberOfVoulsTransfomer(BaseEstimator, TransformerMixin):
    """Feature: number of vowels in the part of each email before the "@".

    Every element of ``X`` is indexed with ``[0]`` to obtain the email
    string, so each sample is expected to be a row whose first entry is the
    email (a 2-D input).
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return an ``(n_samples, 1)`` array of vowel counts."""
        vowel_chars = "aeiouAEIOU"
        # Extract every local part first, then count, as two separate passes.
        local_parts = [row[0].split("@")[0] for row in X]
        counts = []
        for local_part in local_parts:
            counts.append(sum(1 for ch in local_part if ch in vowel_chars))
        return np.array(counts).reshape(-1, 1)


class NumberOfCapitalsTransfomer(BaseEstimator, TransformerMixin):
    """Feature: number of uppercase characters in each email.

    Every element of ``X`` is indexed with ``[0]`` to obtain the email
    string, so each sample is expected to be a row whose first entry is the
    email (a 2-D input). The whole address is scanned, not only the local
    part.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return an ``(n_samples, 1)`` array of uppercase-character counts."""
        counts = []
        for row in X:
            uppercase_total = 0
            for ch in row[0]:
                if ch.isupper():
                    uppercase_total += 1
            counts.append(uppercase_total)
        return np.array(counts).reshape(-1, 1)


class NumberOfDigitsTransfomer(BaseEstimator, TransformerMixin):
    """Feature: number of ASCII digit characters in each email.

    Every element of ``X`` is indexed with ``[0]`` to obtain the email
    string, so each sample is expected to be a row whose first entry is the
    email (a 2-D input). The whole address is scanned, not only the local
    part.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return an ``(n_samples, 1)`` array of digit counts."""
        digit_chars = "0123456789"
        counts = []
        for row in X:
            digit_total = 0
            for ch in row[0]:
                if ch in digit_chars:
                    digit_total += 1
            counts.append(digit_total)
        return np.array(counts).reshape(-1, 1)

After this, I package them inside another class and pass them to a Pipeline like this:

class EmailsSuspicionModel:
    """Bundle the email feature extractors with an XGBoost classifier.

    The instance keeps the train/validation split and a ``FeatureUnion`` of
    the per-email transformers; that union is the preprocessing step of the
    pipeline built by :meth:`pipeline`.
    """

    def __init__(self, X_train, X_valid, y_train, y_valid, model_params):
        """Store the data split and build the feature-extraction union.

        Args:
            X_train: Training samples, passed unchanged to the transformers.
            X_valid: Validation samples, transformed by :meth:`transform`.
            y_train: Training labels.
            y_valid: Validation labels.
            model_params: Keyword arguments for ``XGBClassifier``. A falsy
                value (``None``, ``{}``) selects the defaults used in
                :meth:`pipeline`.
        """
        self.X_train = X_train
        self.X_valid = X_valid
        self.y_train = y_train
        self.y_valid = y_valid
        self.model_params = model_params
        # NOTE(review): the last three transformers are not defined in this
        # excerpt; presumably they follow the same fit/transform contract as
        # the ones above -- confirm.
        self.preprocesser = FeatureUnion(
            [
                ("email_length", EmailLengthTransformer()),
                ("domain_length", DomainLengthTransformer()),
                ("number_of_vouls", NumberOfVoulsTransfomer()),
                ("number_of_capitals", NumberOfCapitalsTransfomer()),
                ("number_of_digits", NumberOfDigitsTransfomer()),
                ("highest_char_frequency", HighestCharFrequencyTransfomer()),
                ("number_of_different_chars", NumberOfDifferentChars()),
                (
                    "number_of_consecutive_or_identical_chars",
                    NumberOfConsecutiveOrIdenticalCharsTransfomer(),
                ),
            ]
        )

    def transform(self):
        """Return the validation samples as a feature matrix.

        The preprocessing union is fitted on ``X_train`` and then applied to
        ``X_valid``; the result is what ``fit`` passes as the classifier's
        evaluation set.
        """
        logging.info("Transform validation data - Required for evaluation")
        valid_preprocesser = self.preprocesser.fit(self.X_train)
        return valid_preprocesser.transform(self.X_valid)

    def pipeline(self):
        """Build a new, unfitted ``Pipeline`` of preprocessing + classifier.

        When ``model_params`` is truthy the classifier is built from those
        parameters only; otherwise the default configuration is used.
        """
        logging.info("Build sklearn pipeline with XGBoost model")
        if self.model_params:
            logging.info("XGBoost model params: %s", self.model_params)
            xgb_model = XGBClassifier(**self.model_params)
        else:
            xgb_model = XGBClassifier(eval_metric="logloss", use_label_encoder=False)

        return Pipeline([("preproc", self.preprocesser), ("classifier", xgb_model)])

    def fit(self):
        """Fit a freshly built pipeline on the training data and return it.

        The transformed validation set is forwarded to the classifier step
        as ``eval_set``. The fitted pipeline is returned so the caller can
        use it for prediction; it was previously discarded.
        """
        return self.pipeline().fit(
            self.X_train,
            self.y_train,
            classifier__eval_set=[(self.transform(), self.y_valid)],
        )

Then, whenever I use the classes in practice:

# Pre-compute the validation features from the raw ``.values`` arrays so they
# can be handed to the classifier as its evaluation set.
X_valid_transformed = EmailsSuspicionModel(
    X_train.values,
    X_valid.values,
    y_train,
    y_valid,
    model_params=None,
).transform()

# Build the pipeline from a second model instance (this one receives
# X_train / X_valid without ``.values``) and fit it on the training data.
pipeline = EmailsSuspicionModel(
    X_train,
    X_valid,
    y_train,
    y_valid,
    model_params=None,
).pipeline()
pipeline.fit(
    X_train,
    y_train,
    classifier__eval_set=[(X_valid_transformed, y_valid)],
)

My model is not yielding the results I expect (I double-checked it against a notebook in which I don't use a pipeline), and I think this is because X_train is not being transformed into the proper feature set, since whenever I do

# Run only the preprocessing step on a single address, passed as a flat list of strings.
pipeline['preproc'].transform(['[email protected]'])
([1, 1, 0, 0, 0]) 

Clearly that transformation is not being applied correctly, since the result from the functions provided should be ([10, 9, 3, 0, 0]), and I think the model is being trained with this same error.


Solution

  • It's a shape issue. If you instead transform [['[email protected]']], you'll recover the expected values. You've written your transformers to expect 2D array inputs: with a 1D list of strings, the e[0] or email[0] in each of them selects the first character of the email rather than the email itself.