I want to apply log() and MinMaxScaler() together to my DataFrame.
I want the output to be a pandas DataFrame with the index and columns of the original data.
I want the parameters learned in fit_transform() to be reused by inverse_transform(), which should also result in a new DataFrame. So the DataFrame needs to be constructed inside the FunctionTransformer.
What I tried:
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
# Initialize MinMaxScaler with range (0, 1).
# This single module-level instance is the default `scaler` argument of both
# log_and_scale and inv_log_and_scale, so what it learns in fit_transform()
# is what inverse_transform() uses afterwards.
scaler_logMinMax = MinMaxScaler(feature_range=(0, 1))
# Log transformation function
def log_and_scale(X, scaler=scaler_logMinMax, shift=1e-9):
    """Apply a shifted natural log to X, then fit `scaler` on it and scale.

    Args:
        X: Data supporting ``X + shift`` and ``np.log``.
        scaler: Object exposing ``fit_transform``; defaults to the shared
            module-level ``scaler_logMinMax``. It is fitted again on every
            call.
        shift: Constant added to X before the log (presumably to keep the
            log finite when X contains zeros).

    Returns:
        A pandas DataFrame built from the scaled values only. No ``index``
        or ``columns`` are passed, so it carries pandas' default labels
        rather than those of X.
    """
    X_log = np.log(X + shift)  # Apply log transformation with a small shift
    return pd.DataFrame(scaler.fit_transform(X_log))  # Scale the log-transformed data
# Inverse transformation: first unscale, then inverse log transform
def inv_log_and_scale(X, scaler=scaler_logMinMax, shift=1e-9):
    """Undo log_and_scale: inverse-scale X, exponentiate, subtract the shift.

    Args:
        X: Scaled data, as produced by ``log_and_scale``.
        scaler: Object exposing ``inverse_transform``; defaults to the shared
            module-level ``scaler_logMinMax``, so the parameters fitted in
            ``log_and_scale`` are the ones used here.
        shift: The same constant that was added before the log.

    Returns:
        ``np.exp(unscaled) - shift``. The result is not wrapped in a
        DataFrame here.
    """
    X_unscaled = scaler.inverse_transform(X)  # Inverse scaling
    return np.exp(X_unscaled) - shift  # Inverse of log transformation
# Create FunctionTransformer for the log and scale transformation:
# log_and_scale is the forward function, inv_log_and_scale its inverse.
# NOTE(review): validate=True presumably makes scikit-learn convert the input
# to a NumPy array before calling func — confirm against the
# FunctionTransformer documentation.
log_and_scale_transformer = FunctionTransformer(func=log_and_scale, inverse_func=inv_log_and_scale, validate=True)
# Sample frame: one row per "Date" label, one column per integer "id".
df_subset = pd.DataFrame(
    {
        1: [135.2342984, 83.17136704, 23.41329775, 3.574450787],
        2: [59.31328422, 18.15285711, 11.1736562, 4.788951527],
        3: [45.0087282, 4.094515245, 106.536704, 527.0962651],
    },
    index=pd.Index(["201001", "201002", "201003", "201004"], name="Date"),
)
# Column labels and the axis name are set in one step.
df_subset.columns = pd.Index([1, 2, 3], name="id")

# Only these columns are passed through the transformer; column 3 is left alone.
cols_to_apply_scaler = [1, 2]
df_subset
id 1 2 3
Date
201001 135.234298 59.313284 45.008728
201002 83.171367 18.152857 4.094515
201003 23.413298 11.173656 106.536704
201004 3.574451 4.788952 527.096265
# Transforming: fit on the selected columns and assign the result back in place.
# The right-hand side is a brand-new DataFrame; as the output below shows,
# the assigned columns end up NaN.
df_subset[cols_to_apply_scaler] = pd.DataFrame(log_and_scale_transformer.fit_transform(df_subset[cols_to_apply_scaler]))
df_subset
id 1 2 3
Date
201001 NaN NaN 45.008728
201002 NaN NaN 4.094515
201003 NaN NaN 106.536704
201004 NaN NaN 527.096265
# The way that I expect to apply the inverse transformer.
# df_subset[cols_to_apply_scaler] = log_and_scale_transformer.inverse_transform(df_subset[cols_to_apply_scaler])
Questions:
1. pd.DataFrame(log_and_scale_transformer.fit_transform(df_subset[cols_to_apply_scaler]))
works, but the result can't be assigned to the original DataFrame because the column names change. How can I fix it?
2. How can I make sure that the parameters of scaler_logMinMax from fit_transform()
are carried through to inverse_transform()?
I also tried log_and_scale_transformer = log_and_scale_transformer.set_output(transform="pandas")
after creating the DataFrame, but it did not work.
I need to filter the columns before applying the function.
I also want to stick with FunctionTransformer
because I use other transformers with the same structure. For ex:
# Define the inverse transformation function with a shift
def inv_y(X, shift=0.5):
    """Return the shifted reciprocal ``1 / (X + shift)``.

    Args:
        X: Scalar or array supporting ``+`` and ``/``.
        shift: Offset added to X before taking the reciprocal.

    Returns:
        ``1 / (X + shift)``.
    """
    return 1 / (X + shift)
# Define the inverse inverse transformation to revert to original values
def inv_inv_y(X, shift=0.5):
    """Revert ``inv_y``: map ``y = 1 / (x + shift)`` back to ``x``.

    Args:
        X: Scalar or array of transformed values (must be non-zero).
        shift: The same offset that was used in the forward transform.

    Returns:
        ``(1 - X * shift) / X``, which is algebraically ``1 / X - shift``.
    """
    return (1 - X * shift) / X
# Create the FunctionTransformer: inv_y is the forward function and
# inv_inv_y the function that reverts it.
inverse_transformer = FunctionTransformer(func=inv_y, inverse_func=inv_inv_y, validate=False, check_inverse=True)
In summary, I cannot apply a function and a scaler together.
With a different simple example, it works:
# DataFrame Example: 3x3 integer frame; every column except the last is scaled.
cols = ["A", "B", "C"]
cols_to_apply_scaler = cols[:-1]
X = pd.DataFrame(
    np.array([[0, 1, 2], [2, 3, 4], [5, 7, 9]]),
    index=[0, 1, 2],
    columns=cols,
)
X
A B C
0 0 1 2
1 2 3 4
2 5 7 9
# Transforming: same call pattern as for df_subset, applied to the frame whose
# index is [0, 1, 2].
# NOTE(review): this presumably works because that index coincides with the
# default index of the DataFrame returned by log_and_scale — confirm.
X[cols_to_apply_scaler] = pd.DataFrame(log_and_scale_transformer.fit_transform(X[cols_to_apply_scaler]))
A B C
0 0.000000 0.000000 2
1 0.958971 0.564575 4
2 1.000000 1.000000 9
/home/guilherme/anaconda3/envs/time_series/lib/python3.11/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but FunctionTransformer was fitted with feature names
warnings.warn(
# Inverse: undo the scaling and the log on the same columns, relying on the
# parameters the shared scaler learned during fit_transform() above.
X[cols_to_apply_scaler] = log_and_scale_transformer.inverse_transform(X[cols_to_apply_scaler])
X
A B C
0 6.203855e-25 1.0 2
1 2.000000e+00 3.0 4
2 5.000000e+00 7.0 9
But I did not understand the warning. Can I fix it?
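Concerning your first question: to preserve the index and columns, build the returned DataFrame with the index and columns of the input DataFrame (the functions must therefore receive a DataFrame, hence validate=False below).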
Concerning your second question, the values used in fit_transform()
are carried through to inverse_transform()
because the state of the scaler is stored internally within the object instance.
Full example based on your original post:
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
import pandas as pd, numpy as np
# Shared scaler instance: the default `scaler` of both functions below, so the
# state fitted in log_and_scale is the state inv_log_and_scale reads.
scaler_logMinMax = MinMaxScaler(feature_range=(0, 1))
# Log transformation function
def log_and_scale(X, scaler=scaler_logMinMax, shift=1e-9):
    """Log-transform X, min-max scale it, and return a DataFrame labelled like X.

    Args:
        X: pandas DataFrame; its ``index`` and ``columns`` are reused for the
            result, so a plain array will not work here.
        scaler: Object exposing ``fit_transform``; defaults to the shared
            module-level ``scaler_logMinMax``.
        shift: Constant added to X before the log.

    Returns:
        A pandas DataFrame of the scaled values with X's index and columns.
    """
    X_log = np.log(X + shift)
    # NOTE: the scaler is fitted on every call, i.e. each transform() learns
    # new min/max values from whatever data it is given.
    scaled = scaler.fit_transform(X_log)
    return pd.DataFrame(scaled, index=X.index, columns=X.columns)
def inv_log_and_scale(X, scaler=scaler_logMinMax, shift=1e-9):
    """Invert log_and_scale and return a DataFrame labelled like X.

    Args:
        X: pandas DataFrame of scaled values; its ``index`` and ``columns``
            are reused for the result.
        scaler: Object exposing ``inverse_transform``; defaults to the shared
            module-level ``scaler_logMinMax``, which holds the parameters
            fitted by ``log_and_scale``.
        shift: The same constant that was added before the log.

    Returns:
        A pandas DataFrame holding ``np.exp(unscaled) - shift`` with X's index
        and columns.
    """
    unscaled = scaler.inverse_transform(X)
    return pd.DataFrame(np.exp(unscaled) - shift, index=X.index, columns=X.columns)
# Wrap the forward/inverse pair in a FunctionTransformer.
log_and_scale_transformer = FunctionTransformer(
func=log_and_scale,
inverse_func=inv_log_and_scale,
validate=False # Allow pandas: func reads X.index / X.columns, so it must receive the DataFrame itself
)
# Sample data; the dict keys already give the column labels 1, 2, 3, and
# rename_axis names the row axis "Date" and the column axis "id".
df_subset = pd.DataFrame(
    {
        1: [135.2342984, 83.17136704, 23.41329775, 3.574450787],
        2: [59.31328422, 18.15285711, 11.1736562, 4.788951527],
        3: [45.0087282, 4.094515245, 106.536704, 527.0962651],
    },
    index=["201001", "201002", "201003", "201004"],
).rename_axis(index="Date", columns="id")

# Columns that go through the transformer; column 3 stays untouched.
cols_to_apply_scaler = [1, 2]
# fit and transform: the function returns a DataFrame carrying the input's
# index and columns, so it is assigned directly (no extra pd.DataFrame wrapper)
df_subset[cols_to_apply_scaler] = log_and_scale_transformer.fit_transform(df_subset[cols_to_apply_scaler])
print("Transformed DataFrame:")
print(df_subset)
# inverse transform the same columns; this relies on the state the shared
# scaler kept from the fit_transform call above
df_subset[cols_to_apply_scaler] = log_and_scale_transformer.inverse_transform(df_subset[cols_to_apply_scaler])
print("\nInverse Transformed DataFrame:")
print(df_subset)
This prints
id 1 2 3
Date
201001 1.000000 1.000000 45.008728
201002 0.894048 0.789684 4.094515
201003 0.574869 0.649927 106.536704
201004 0.000000 0.000000 527.096265
and
id 1 2 3
Date
201001 135.234298 59.313284 45.008728
201002 83.171367 18.152857 4.094515
201003 23.413298 11.173656 106.536704
201004 3.574451 4.788952 527.096265