I am working on a fairly simple machine learning problem in the form of a practicum. I am using the following code to preprocess the data:
from preprocess.date_converter import DateConverter
from sklearn.pipeline import Pipeline
from preprocess.nan_fixer import CustomImputer
import pandas as pd
from preprocess.encoding import FrecuencyEncoding
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from preprocess.scaler import CustomScaler
def basic_preprocess(df, target : str):
important_features = ['amt', 'category', 'merchant', 'trans_date_trans_time', 'unix_time', 'dob', 'street', 'merch_lat', 'merch_long', 'city', 'merch_zipcode', 'city_pop', 'job', 'last', 'first', 'cc_num', 'long', 'zip', "is_fraud"]
df = df.drop(["trans_num", "Unnamed: 0"], axis=1)
df = df[important_features]
df = df.copy()
# df, df_ = train_test_split(df, test_size=0.5, shuffle=True, random_state=42, stratify=df[target])
df_train, unseen_df = train_test_split(df, test_size=0.2, shuffle=True, random_state=42, stratify=df[target])
df_val, df_test = train_test_split(unseen_df, test_size=0.5, shuffle=True, random_state=42, stratify=unseen_df[target])
pipeline = Pipeline([
("date_converter", DateConverter("trans_date_trans_time")),
("imputer", CustomImputer(strategy="most_frequent")),
("encoding", FrecuencyEncoding()),
("scaler", CustomScaler(df.drop(target, axis=1).columns.tolist())),
])
pipeline.fit(df_train.drop(target, axis=1), df_train[target])
X_train = pd.DataFrame(pipeline.transform(df_train.drop(target, axis=1)), index=df_train.index)
X_test = pd.DataFrame(pipeline.transform(df_test.drop(target, axis=1)), index=df_test.index)
X_val = pd.DataFrame(pipeline.transform(df_val.drop(target, axis=1)), index=df_val.index)
df_train = pd.concat([X_train, df_train[target]], axis=1)
df_test = pd.concat([X_test, df_test[target]], axis=1)
df_val = pd.concat([X_val, df_val[target]], axis=1)
return [df_train, df_test, df_val]
Note: preprocess
is a personal library that I created to make some transformers of my own. Here is the code for each of the transformers:
class CustomScaler(BaseEstimator, TransformerMixin):
"""
Receives the numeric columns from the dataframe. Scales their values using RobustScaler.
"""
def __init__(self, attributes):
self.attributes = attributes
self.scaler = RobustScaler() # Inicializa el escalador
def fit(self, X, y=None):
# Ajusta el escalador solo con los datos de entrenamiento
scale_cols = X[self.attributes]
self.scaler.fit(scale_cols)
return self
def transform(self, X, y=None):
X_copy = X.copy()
scale_attrs = X_copy[self.attributes]
# Usa el escalador ya ajustado para transformar los datos
X_scaled = self.scaler.transform(scale_attrs)
X_scaled = pd.DataFrame(X_scaled, columns=self.attributes, index=X_copy.index)
for attr in self.attributes:
X_copy[attr] = X_scaled[attr]
return X_copy
class CustomImputer(BaseEstimator, TransformerMixin):
"""
It implements SimpleImputer but, unlike SimpleImputer, it returns
dataframes and not numpy arrays.
"""
def __init__(self, strategy : str) -\> None:
self._imputer = SimpleImputer(strategy=strategy)
def fit(self, X, y=None):
self._imputer.fit(X, y)
return self
def transform(self, X, y=None):
transformed_data = self._imputer.transform(X)
return pd.DataFrame(transformed_data, columns=X.columns)
class FrecuencyEncoding(BaseEstimator, TransformerMixin):
"""
Searches for categorical columns and replaces them
using Frequency Encoding
"""
def __init__(self):
self._frequencies = {}
def fit(self, X, y=None):
for col in X.select_dtypes(['object']).columns:
self._frequencies[col] = X[col].value_counts().to_dict()
return self
def transform(self, X, y=None):
df_encoded = X.copy()
for col in df_encoded.select_dtypes(['object']).columns:
df_encoded[col] = df_encoded[col].map(self._frequencies[col])
df_encoded[col] = df_encoded[col].fillna(0)
return df_encoded
class DateConverter(BaseEstimator, TransformerMixin):
"""
Transformer created to convert all date columns to float.
It receives the list of date columns in the constructor.
Consider that in order to convert all elements (including nan) to float,
it was necessary to convert nans to 0 and then convert 0 to nans again.
"""
def __init__(self, attributes):
self.attributes = attributes
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
X = X.copy()
def converter(col):
return pd.to_datetime(col, errors='coerce').apply(lambda x: x.timestamp() if pd.notna(x) else 0).astype(float).replace(0, np.nan) if col.name in self.attributes else col
return X.apply(converter)
The problem is that, running basic_preprocess
I get the following error message:
/home/santiago/.local/lib/python3.10/site-packages/sklearn/pipeline.py:62: FutureWarning: This Pipeline instance is not fitted yet. Call 'fit' with appropriate arguments before using other methods such as transform, predict, etc. This will raise an error in 1.8 instead of the current warning.
warnings.warn(
/home/santiago/.local/lib/python3.10/site-packages/sklearn/pipeline.py:62: FutureWarning: This Pipeline instance is not fitted yet. Call 'fit' with appropriate arguments before using other methods such as transform, predict, etc. This will raise an error in 1.8 instead of the current warning.
warnings.warn(
This is no longer the case if I use PCA at the end of the pipeline:
pipeline = Pipeline([
("date_converter", DateConverter("trans_date_trans_time")),
("imputer", CustomImputer(strategy="most_frequent")),
("encoding", FrecuencyEncoding()),
("scaler", CustomScaler(df.drop(target, axis=1).columns.tolist())),
("PCA", PCA(n_components=0.999))
])
How to resolve it?
I ran the transformers separately expecting to get some unexpected behavior, but they all worked perfectly.
These are the results generated by the transformers:
~~~~~ Train dataframe before pipeline
amt category merchant trans_date_trans_time unix_time ... first cc_num long zip is_fraud
509059 51.71 grocery_net fraud_Stokes, Christiansen and Sipes 2019-08-09 03:13:27 1344482007 ... Destiny 639023984367 -74.9732 13647 0
395295 13.78 entertainment fraud_Effertz LLC 2019-06-29 19:56:48 1340999808 ... Sarah 373905417449658 -97.6443 76665 0
536531 961.26 shopping_pos fraud_Kris-Padberg 2019-08-18 14:42:20 1345300940 ... Sharon 3553629419254918 -122.3456 98238 0
271001 43.68 health_fitness fraud_Ratke and Sons 2019-05-13 21:27:46 1336944466 ... Jeremy 371034293500716 -120.7986 96135 0
532788 33.08 entertainment fraud_Upton PLC 2019-08-17 14:15:41 1345212941 ... Amy 4335531783520911 -91.4867 65066 0
1275175 199.99 kids_pets fraud_Schaefer Ltd 2020-06-13 22:05:09 1371161109 ... Maureen 4306630852918 -90.4504 63131 0
1117784 5.41 misc_pos fraud_Williamson LLC 2020-04-10 12:42:46 1365597766 ... Adam 6011366578560244 -77.7186 17051 0
429225 58.08 kids_pets fraud_Bogisich-Weimann 2019-07-11 18:28:54 1342031334 ... Greg 30428204673351 -76.2963 17088 0
739916 23.70 personal_care fraud_Dickinson Ltd 2019-11-11 23:26:50 1352676410 ... Monica 213161869125933 -70.6993 4226 0
93872 104.69 grocery_pos fraud_Strosin-Cruickshank 2019-02-25 05:16:10 1330146970 ... John 30026790933302 -91.0286 39113 0
[10 rows x 19 columns]
~~~~~ Test dataframe before pipeline
amt category merchant trans_date_trans_time unix_time ... first cc_num long zip is_fraud
734803 6.38 shopping_pos fraud_Quitzon, Green and Bashirian 2019-11-10 09:03:06 1352538186 ... Linda 4433091568498503 -77.1458 20882 0
875327 98.80 grocery_pos fraud_Heidenreich PLC 2019-12-21 13:12:50 1356095570 ... Gina 6538441737335434 -80.1752 16114 0
549897 1.29 food_dining fraud_Lesch, D'Amore and Brown 2019-08-23 17:10:46 1345741846 ... Martin 4990494243023 -78.8031 21524 0
770188 37.27 kids_pets fraud_Ullrich Ltd 2019-11-25 15:17:31 1353856651 ... Stephanie 4502539526809429801 -91.6421 72513 0
698390 8.76 travel fraud_Lynch-Mohr 2019-10-25 15:18:53 1351178333 ... Margaret 2254917871818484 -76.3477 20687 0
557456 14.26 health_fitness fraud_Rippin-VonRueden 2019-08-25 20:47:10 1345927630 ... Jamie 4066595222529 -82.7251 41254 0
1112225 26.69 home fraud_Witting, Beer and Ernser 2020-04-07 13:37:06 1365341826 ... Erika 180046617132290 -88.9655 62939 0
907535 40.11 personal_care fraud_Becker, Harris and Harvey 2019-12-28 16:48:07 1356713287 ... Zachary 374821819075109 -77.2218 14522 0
169363 54.45 food_dining fraud_O'Keefe-Wisoky 2019-03-30 17:59:38 1333130378 ... Brooke 4425161475596168 -100.3900 76905 0
102279 5.98 kids_pets fraud_Waelchi Inc 2019-02-28 22:56:08 1330556168 ... Christopher 4822367783500458 -81.5929 33844 0
[10 rows x 19 columns]
~~~~~ Validation dataframe before pipeline
amt category merchant trans_date_trans_time unix_time ... first cc_num long zip is_fraud
286442 59.54 personal_care fraud_Crooks and Sons 2019-05-20 19:57:21 1337543841 ... Megan 348789608637806 -98.6538 68950 0
259525 154.19 misc_pos fraud_Turcotte-Halvorson 2019-05-09 12:22:48 1336566168 ... Marissa 4400011257587661852 -98.7858 68859 0
706250 2.36 shopping_net fraud_Kozey-Boehm 2019-10-28 09:53:45 1351418025 ... Bradley 3542162746848552 -93.4824 56029 0
557846 65.13 entertainment fraud_Brown-Greenholt 2019-08-25 22:40:54 1345934454 ... Philip 6592243974328236 -86.2715 36111 0
984124 123.34 misc_pos fraud_Hermann-Gaylord 2020-02-04 08:28:45 1359966525 ... Theresa 30199621383748 -96.2238 75452 0
379560 55.73 kids_pets fraud_Larkin Ltd 2019-06-23 19:47:39 1340480859 ... Stacy 4961003488432306 -76.1950 17929 0
645012 66.42 gas_transport fraud_Raynor, Feest and Miller 2019-10-01 08:19:12 1349079552 ... Brandy 676195318214 -96.5249 77412 0
631986 1.30 misc_net fraud_McGlynn-Heathcote 2019-09-26 01:07:43 1348621663 ... Shannon 2269768987945882 -77.8664 14510 0
454841 1.27 personal_care fraud_Zulauf LLC 2019-07-20 23:09:40 1342825780 ... Joshua 4266200684857219 -98.0684 68961 0
599151 6.48 shopping_pos fraud_Kris-Padberg 2019-09-11 13:10:03 1347369003 ... Frank 3501509250702469 -81.7361 34112 0
~~~~~ Train dataframe after pipeline
amt category merchant trans_date_trans_time unix_time dob street ... job last first cc_num long zip is_fraud
509059 0.068966 0.788219 0.654717 0.0 0.0 -0.421986 -0.295909 ... 0.674448 0.512351 -0.526083 -0.295909 -0.418632 -0.415006 0
395295 -0.068966 1.057319 1.098113 0.0 0.0 -0.932624 -0.671889 ... -0.874788 0.923014 -0.001710 -0.671889 -0.928066 -0.921454 0
536531 -0.068966 -0.364541 -0.430189 0.0 0.0 0.888889 0.669278 ... -0.219015 0.644710 0.107754 0.669278 0.889151 0.885111 0
271001 -0.123153 0.788219 0.886792 0.0 0.0 0.033097 0.039164 ... -0.007216 -0.268423 0.879276 0.039164 0.035377 0.036342 0
532788 -0.118227 0.788219 0.650943 0.0 0.0 -0.938534 -0.676240 ... -0.355263 -0.110128 0.104761 -0.676240 -0.933962 -0.927315 0
1275175 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 0
1117784 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 0
429225 -0.266010 0.788219 0.656604 0.0 0.0 -0.921986 -0.664056 ... -0.873514 -0.434747 1.168044 -0.664056 -0.917453 -0.910903 0
739916 1.004926 0.788219 0.758491 0.0 0.0 -0.427896 -0.300261 ... -0.522920 -0.348703 0.412628 -0.300261 -0.424528 -0.420868 0
93872 -0.004926 -0.106084 -0.230189 0.0 0.0 -1.419622 -1.030461 ... -0.176146 -0.521408 -0.524515 -1.030461 -1.413915 -1.404455 0
[10 rows x 19 columns]
~~~~~ Test dataframe after pipeline
amt category merchant trans_date_trans_time unix_time dob street ... job last first cc_num long zip is_fraud
734803 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 0
875327 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 0
549897 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 0
770188 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 0
698390 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 0
557456 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 0
1112225 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 0
907535 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 0
169363 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN 0
102279 -0.330049 0.801726 0.777358 -1.0 -1.0 0.06974 0.066144 ... 0.887097 0.41231 1.181157 0.066144 0.071934 0.072685 0
[10 rows x 19 columns]
~~~~~ Validation dataframe after pipeline
amt category merchant trans_date_trans_time unix_time dob street merch_lat ... city_pop job last first cc_num long zip is_fraud
286442 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN 0
259525 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN 0
706250 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN 0
557846 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN 0
984124 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN 0
379560 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN 0
645012 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN 0
631986 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN 0
454841 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN 0
599151 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN 0
The two issues are separate.
The warning that the pipeline is not fitted is because of how pipelines report themselves as fitted: they just check whether their last (non-passthrough) step is fitted (source code). So your custom scaler isn't reported as having been fit. check_is_fitted
looks for attributes with trailing underscores, or you implement your own __sklearn_is_fitted__
method returning a boolean, see the developer guide. (It's also better to keep the __init__
method to just setting method parameters as attributes; you could instantiate self.scaler_
at fit
time instead and deal with both problems.)
The second issue is actually a problem, the new NaNs. I think we need additional information, but two thoughts come to my mind: