i am trying to pass an email to my Pipeline
and throw some prob based on the training. For doing that i used a bunch of function to take from the email pass like
from collections import Counter
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
class EmailLengthTransformer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
return np.array([len(e[0].split("@")[0]) for e in X]).reshape(-1, 1)
class DomainLengthTransformer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
return np.array([len(e[0].split("@")[-1]) for e in X]).reshape(-1, 1)
class NumberOfVoulsTransfomer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
vouls = "aeiouAEIOU"
name = [e[0].split("@")[0] for e in X]
return np.array(
[sum(1 for char in name if char in vouls) for name in name]
).reshape(-1, 1)
class NumberOfCapitalsTransfomer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
return np.array(
[sum(1 for char in email[0] if char.isupper()) for email in X]
).reshape(-1, 1)
class NumberOfDigitsTransfomer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
digits = "0123456789"
return np.array(
[sum(1 for char in email[0] if char in digits) for email in X]
).reshape(-1, 1)
So after this i package them inside another class and send it to Pipeline
like this
class EmailsSuspicionModel:
def __init__(self, X_train, X_valid, y_train, y_valid, model_params):
self.X_train = X_train
self.X_valid = X_valid
self.y_train = y_train
self.y_valid = y_valid
self.model_params = model_params
self.preprocesser = FeatureUnion(
[
("email_length", EmailLengthTransformer()),
("domain_length", DomainLengthTransformer()),
("number_of_vouls", NumberOfVoulsTransfomer()),
("number_of_capitals", NumberOfCapitalsTransfomer()),
("number_of_digits", NumberOfDigitsTransfomer()),
("highest_char_frequency", HighestCharFrequencyTransfomer()),
("number_of_different_chars", NumberOfDifferentChars()),
(
"number_of_consecutive_or_identical_chars",
NumberOfConsecutiveOrIdenticalCharsTransfomer()
),
]
)
def transform(self):
logging.info("Transform validation data - Required for evaluation")
valid_preprocesser = self.preprocesser.fit(self.X_train)
return valid_preprocesser.transform(self.X_valid)
def pipeline(self):
logging.info("Build sklearn pipeline with XGBoost model")
xgb_model = XGBClassifier(eval_metric="logloss", use_label_encoder=False)
if self.model_params:
logging.info(f"XGBoost model params: {self.model_params}")
xgb_model = XGBClassifier(**self.model_params)
return Pipeline([("preproc", self.preprocesser), ("classifier", xgb_model)])
def fit(self):
self.pipeline().fit(
self.X_train, self.y_train, classifier__eval_set=[(self.transform(), self.y_valid)]
)
So whenver i start using the classes in action
X_valid_transformed = EmailsSuspicionModel(X_train.values, X_valid.values, y_train, y_valid, model_params=None).transform()
pipeline = EmailsSuspicionModel(X_train, X_valid, y_train, y_valid, model_params=None).pipeline()
pipeline.fit(
X_train, y_train, classifier__eval_set=[(X_valid_transformed, y_valid)]
)
My model is not yielding my expecting results ( i double check it against a notebook which i dont use pipeline ) and i think is because X_train is not being trained with the proper feature set since whenever i do
pipeline['preproc'].transform(['[email protected]'])
([1, 1, 0, 0, 0])
Clearly that transformation is not being applied correctly since result is ([10, 9, 3, 0, 0])
from the functions provided and i think the model is being trained with this same error
It's a shape issue. If you transform instead [['[email protected]']]
you'll recover the expected values. You've written your transformers to expect 2D array inputs (the e[0]
or email[0]
in each of them otherwise selects the first character of the email).