Search code examples
Tags: python, pandas, scikit-learn, data-preprocessing

How to create a scaler applying log transformation and MinMaxScaler in sklearn


I want to apply log() and MinMaxScaler() together to my DataFrame. The output should be a pandas DataFrame that keeps the index and columns of the original data. I also want the parameters learned during fit_transform() to be reused by inverse_transform(), which should again return a DataFrame. So, the scaler needs to be built into the FunctionTransformer.

What I tried:

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

# Initialize MinMaxScaler with range (0, 1).
# This single module-level instance is bound as the default `scaler` argument
# of both log_and_scale and inv_log_and_scale below, so both functions operate
# on the same scaler object.
scaler_logMinMax = MinMaxScaler(feature_range=(0, 1))

# Log transformation function
def log_and_scale(X, scaler=scaler_logMinMax, shift=1e-9):
    """Log-transform ``X`` and min-max scale the result.

    ``shift`` is added to ``X`` before the logarithm is taken. ``scaler`` is
    fitted on the logged values on every call (``fit_transform``), and its
    output is wrapped in a new ``pd.DataFrame`` that is built without an
    explicit index or columns.
    """
    logged = np.log(X + shift)
    scaled = scaler.fit_transform(logged)
    return pd.DataFrame(scaled)


# Inverse transformation: first unscale, then inverse log transform
def inv_log_and_scale(X, scaler=scaler_logMinMax, shift=1e-9):
    """Undo ``log_and_scale``: reverse the scaling, then the shifted log.

    ``scaler.inverse_transform`` is applied first, the result is
    exponentiated, and ``shift`` is subtracted. The value is returned as
    produced by NumPy; it is not wrapped in a DataFrame.
    """
    log_values = scaler.inverse_transform(X)
    restored = np.exp(log_values)
    return restored - shift

# Create FunctionTransformer for the log and scale transformation
# NOTE(review): per the scikit-learn docs, validate=True makes the transformer
# convert X to a 2-D NumPy array before calling func / inverse_func, so the
# functions would not see the DataFrame's index and columns -- confirm.
log_and_scale_transformer = FunctionTransformer(func=log_and_scale, inverse_func=inv_log_and_scale, validate=True)
# Sample frame: columns labelled 1-3 (axis name "id"), rows labelled by
# period strings (axis name "Date").
df_subset = pd.DataFrame(
    {
        1: [135.2342984, 83.17136704, 23.41329775, 3.574450787],
        2: [59.31328422, 18.15285711, 11.1736562, 4.788951527],
        3: [45.0087282, 4.094515245, 106.536704, 527.0962651],
    }
)
df_subset.index = pd.Index(["201001", "201002", "201003", "201004"], name="Date")
df_subset.columns = pd.Index([1, 2, 3], name="id")
# Only these columns are passed through the transformer.
cols_to_apply_scaler = [1, 2]
df_subset

id  1           2           3
Date            
201001  135.234298  59.313284   45.008728
201002  83.171367   18.152857   4.094515
201003  23.413298   11.173656   106.536704
201004  3.574451    4.788952    527.096265
# Transforming
# NOTE(review): the frame built on the right-hand side carries default integer
# labels, while df_subset is indexed by date strings; the all-NaN result shown
# below is consistent with pandas aligning the assigned values on the index --
# confirm.
df_subset[cols_to_apply_scaler] = pd.DataFrame(log_and_scale_transformer.fit_transform(df_subset[cols_to_apply_scaler]))
df_subset

id  1   2   3
Date            
201001  NaN NaN 45.008728
201002  NaN NaN 4.094515
201003  NaN NaN 106.536704
201004  NaN NaN 527.096265
# The way that I expect to apply the inverse transformer.
# df_subset[cols_to_apply_scaler] = log_and_scale_transformer.inverse_transform(df_subset[cols_to_apply_scaler])

Questions:

  1. The pd.DataFrame(log_and_scale_transformer.fit_transform(df_subset[cols_to_apply_scaler])) call works, but the result can't be assigned to the original DataFrame because the column names change. How can I fix this?
  2. How are the values of scaler_logMinMax from fit_transform() carried through to inverse_transform()?

I also tried log_and_scale_transformer = log_and_scale_transformer.set_output(transform="pandas") after creating the dataframe, but it did not work.

I need to filter the columns before applying the function. I also want to stick with FunctionTransformer because I use other transformers with the same structure. For ex:

# Define the inverse transformation function with a shift
def inv_y(X, shift=0.5):
    """Return the reciprocal of ``X`` after adding ``shift``: 1 / (X + shift)."""
    shifted = X + shift
    return 1 / shifted

# Define the inverse inverse transformation to revert to original values
def inv_inv_y(X, shift=0.5):
    """Return (1 - X * shift) / X, the algebraic inverse of 1 / (x + shift)."""
    numerator = 1 - X * shift
    return numerator / X

# Create the FunctionTransformer
# validate=False: X is handed to inv_y / inv_inv_y without input validation.
# check_inverse=True: per the scikit-learn docs, fitting checks that
# inverse_func undoes func and warns when it does not.
inverse_transformer = FunctionTransformer(func=inv_y, inverse_func=inv_inv_y, validate=False, check_inverse=True)

In summary, I cannot apply a function and a scaler together.


With a different simple example, it works:

# Example frame: 3x3 integers, columns A-C, rows labelled 0-2.
cols = ["A", "B", "C"]
cols_to_apply_scaler = cols[:-1]  # every column except the last one
X = pd.DataFrame(
    np.array([[0, 1, 2], [2, 3, 4], [5, 7, 9]]),
    columns=cols,
    index=[0, 1, 2],
)
X

    A   B   C
0   0   1   2
1   2   3   4
2   5   7   9

# Transforming
# NOTE(review): here X's index [0, 1, 2] coincides with the default integer
# labels of the frame built on the right-hand side, which would explain why
# this assignment yields values rather than NaN -- confirm.
X[cols_to_apply_scaler] = pd.DataFrame(log_and_scale_transformer.fit_transform(X[cols_to_apply_scaler]))

    A           B           C
0   0.000000    0.000000    2
1   0.958971    0.564575    4
2   1.000000    1.000000    9

/home/guilherme/anaconda3/envs/time_series/lib/python3.11/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but FunctionTransformer was fitted with feature names
  warnings.warn(

# Inverse
# inverse_transform calls inv_log_and_scale, whose default scaler is the same
# scaler_logMinMax instance that the fit_transform call above fitted.
# NOTE(review): the 6.2e-25 shown for the original 0 is presumably
# floating-point residue of exp(log(0 + shift)) - shift -- confirm.
X[cols_to_apply_scaler] = log_and_scale_transformer.inverse_transform(X[cols_to_apply_scaler])
X

    A               B   C
0   6.203855e-25    1.0 2
1   2.000000e+00    3.0 4
2   5.000000e+00    7.0 9

But I did not understand the warning. Can I fix it?


Solution

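  • Concerning your first question: to preserve the index and columns, work with DataFrames throughout. Build the returned DataFrame from the input's index and columns, and set validate=False so that the FunctionTransformer passes the DataFrame to your functions unchanged.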

    Concerning your second question: the values learned in fit_transform() are carried through to inverse_transform() because the fitted state is stored on the scaler_logMinMax instance itself, and both functions use that same instance as their default scaler argument.

    Full example based on your original post:

    from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
    import pandas as pd, numpy as np
    
    
    scaler_logMinMax = MinMaxScaler(feature_range=(0, 1))
    
    # Log transformation function
    def log_and_scale(X, scaler=scaler_logMinMax, shift=1e-9):
        """Log-transform ``X``, min-max scale it, and keep ``X``'s labels.

        ``shift`` is added before the logarithm. ``scaler`` is fitted on the
        logged values on every call (``fit_transform``). The result is
        returned as a DataFrame built with ``X.index`` and ``X.columns``, so
        ``X`` must expose both attributes.
        """
        log_values = np.log(X + shift)
        return pd.DataFrame(
            scaler.fit_transform(log_values),
            index=X.index,
            columns=X.columns,
        )
    
    
    def inv_log_and_scale(X, scaler=scaler_logMinMax, shift=1e-9):
        """Undo ``log_and_scale`` and keep ``X``'s labels.

        The scaling is reversed with ``scaler.inverse_transform``, the result
        is exponentiated, and ``shift`` is subtracted. The output is a
        DataFrame built with ``X.index`` and ``X.columns``.
        """
        restored = np.exp(scaler.inverse_transform(X)) - shift
        return pd.DataFrame(restored, index=X.index, columns=X.columns)
    
    
    # Wrap the two functions so they can be used through the transformer API
    # (fit_transform / inverse_transform).
    log_and_scale_transformer = FunctionTransformer(
        func=log_and_scale, 
        inverse_func=inv_log_and_scale, 
        validate=False  # Allow pandas: X reaches the functions as a DataFrame, so X.index / X.columns exist
    )
    
    
    # Sample frame: columns labelled 1-3 (axis name "id"), rows labelled by
    # period strings (axis name "Date").
    df_subset = pd.DataFrame(
        {
            1: [135.2342984, 83.17136704, 23.41329775, 3.574450787],
            2: [59.31328422, 18.15285711, 11.1736562, 4.788951527],
            3: [45.0087282, 4.094515245, 106.536704, 527.0962651],
        },
        index=pd.Index(["201001", "201002", "201003", "201004"], name="Date"),
    )
    df_subset.columns = pd.Index([1, 2, 3], name="id")
    # Only these columns are passed through the transformer.
    cols_to_apply_scaler = [1, 2]
    
    # fit and transform: log_and_scale returns a DataFrame carrying the index
    # and columns of the selected slice, so the result is assigned straight
    # back without an extra pd.DataFrame(...) wrapper
    df_subset[cols_to_apply_scaler] = log_and_scale_transformer.fit_transform(df_subset[cols_to_apply_scaler])
    
    
    print("Transformed DataFrame:")
    print(df_subset)
    
    # inverse transform the same columns; this must run after the fit above,
    # because inv_log_and_scale reads the state of the shared scaler instance
    df_subset[cols_to_apply_scaler] = log_and_scale_transformer.inverse_transform(df_subset[cols_to_apply_scaler])
    
    
    print("\nInverse Transformed DataFrame:")
    print(df_subset)
    

    This prints

    id           1         2           3
    Date                                
    201001  1.000000  1.000000   45.008728
    201002  0.894048  0.789684    4.094515
    201003  0.574869  0.649927  106.536704
    201004  0.000000  0.000000  527.096265
    

    and

    id            1          2           3
    Date                                 
    201001  135.234298  59.313284   45.008728
    201002   83.171367  18.152857    4.094515
    201003   23.413298  11.173656  106.536704
    201004    3.574451   4.788952  527.096265