Search code examples
python · datetime · scikit-learn · scikit-learn-pipeline

How can I use sklearn's make_column_selector to select all valid datetime columns?


I want to select columns based on their datetime data types. My DataFrame has for example columns with types np.dtype('datetime64[ns]'), np.datetime64 and 'datetime64[ns, UTC]'.

Is there a generic way to select all columns with a datetime datatype?


For instance, this works:

from sklearn.compose import make_column_selector
# A tuple of dtypes works: this matches timezone-naive pandas datetime columns
# ('datetime64[ns]') and plain numpy datetime64 columns -- but NOT
# timezone-aware dtypes such as 'datetime64[ns, UTC]'.
selector = make_column_selector(dtype_include=(np.dtype('datetime64[ns]'),np.datetime64))
selected_columns = selector(df)

But this doesn't work (timezone-aware dtype from a pandas DataFrame with 'UTC'):

from sklearn.compose import make_column_selector
# This fails because np.dtype() cannot parse pandas' timezone-aware dtype
# string: it raises TypeError before the selector is even constructed.
selector = make_column_selector(dtype_include=np.dtype('datetime64[ns, UTC]'))
selected_columns = selector(df)

Compared to numeric data types where you can simply use np.number instead of np.int64 etc.

API reference for make_column_selector: LINK


Solution

  • As far as I know, there is no parent class that catches all datetime columns in a generic way, analogous to np.number for numeric types, as you pointed out.

    My guess is you want to use it in a sklearn pipeline for preprocessing? In that case you can use a custom selector:

    # Custom function to check if dtype is a datetime
    def is_datetime(dtype):
        """Return True for timezone-naive and timezone-aware datetime dtypes.

        Matches pandas' timezone-aware dtypes (e.g. 'datetime64[ns, UTC]')
        as well as numpy datetime64 dtypes of any unit; returns False for
        everything else, including pandas extension dtypes.
        """
        # Timezone-aware pandas dtypes stringify as 'datetime64[ns, <tz>]',
        # so a substring match catches them before the numpy check below.
        if 'datetime64[ns' in str(dtype):
            return True
        # np.issubdtype raises TypeError for pandas extension dtypes that
        # numpy cannot interpret (e.g. CategoricalDtype) -- treat those as
        # non-datetime instead of crashing.
        try:
            return bool(np.issubdtype(dtype, np.datetime64))
        except TypeError:
            return False
    
    # Custom function to mimic make_column_selector with custom type checks
    def custom_column_selector(df, dtype_checker):
        """Return the names of df's columns whose dtype passes dtype_checker."""
        selected = []
        for name, col_dtype in df.dtypes.items():
            if dtype_checker(col_dtype):
                selected.append(name)
        return selected
    

    Test

    This is a simple test of the custom selector on a dummy DataFrame with different types of data types, and for the dates also with timezone-aware and timezone-naive dates.

    # Imports
    import pandas as pd
    import numpy as np
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
    from sklearn.impute import SimpleImputer
    from sklearn import set_config
    from sklearn.compose import make_column_selector
    
    # Config
    # Make all transformers return pandas DataFrames instead of numpy arrays,
    # so column names survive the pipeline.
    set_config(transform_output="pandas")
    
    # Example DataFrame with mixed dtypes and different datetime types
    df = pd.DataFrame({
        'a': range(10),  # integers
        'b': pd.to_datetime(range(10), unit='D', origin=pd.Timestamp('2023-01-01')),  # tz-naive datetimes
        'c': pd.to_datetime(range(10), utc=True),  # tz-aware (UTC); epoch-based, so years are 1970
        'd': pd.Series(pd.date_range("2023-01-01", periods=10)).dt.tz_localize('America/New_York'),  # tz-aware (US)
        'e': pd.Series(pd.date_range("2023-01-01", periods=10)).dt.tz_localize('Europe/London'),  # tz-aware (EU)
        'f': ['cat', 'dog']*5,  # strings (object dtype)
        'g': [1.1, 2.2]*5  # floats
    })
    
    # Custom function to check if dtype is a datetime
    def is_datetime(dtype):
        """Return True for timezone-naive and timezone-aware datetime dtypes.

        Matches pandas' timezone-aware dtypes (e.g. 'datetime64[ns, UTC]')
        as well as numpy datetime64 dtypes of any unit; returns False for
        everything else, including pandas extension dtypes.
        """
        # Timezone-aware pandas dtypes stringify as 'datetime64[ns, <tz>]',
        # so a substring match catches them before the numpy check below.
        if 'datetime64[ns' in str(dtype):
            return True
        # np.issubdtype raises TypeError for pandas extension dtypes that
        # numpy cannot interpret (e.g. CategoricalDtype) -- treat those as
        # non-datetime instead of crashing.
        try:
            return bool(np.issubdtype(dtype, np.datetime64))
        except TypeError:
            return False
    
    # Define the datetime transformer; swap in whichever feature extraction you prefer
    def extract_year(df):
        """Replace every datetime column of df with its year component."""
        years = {name: column.dt.year for name, column in df.items()}
        return pd.DataFrame(years, index=df.index)
    
    # Pipeline with a single step: map each datetime column to its year.
    datetime_transformer = Pipeline(
        steps=[
            ('year', FunctionTransformer(extract_year, validate=False)),
        ]
    )
    
    # Custom column selector function
    def custom_column_selector(dtype_checker):
        """Build a ColumnTransformer-compatible column selector.

        The returned callable accepts a DataFrame and yields the names of
        the columns whose dtype satisfies dtype_checker.
        """
        def selector(df):
            picked = []
            for column, column_dtype in df.dtypes.items():
                if dtype_checker(column_dtype):
                    picked.append(column)
            return picked
        return selector
    
    # Create preprocessor with transformers
    # Each column group is routed to a suitable transformer:
    #  - numeric columns -> standardization (np.number catches ints and floats)
    #  - object (string) columns -> one-hot encoding
    #  - datetime columns (naive AND tz-aware) -> year extraction, selected by
    #    the custom selector since make_column_selector has no generic
    #    datetime catch-all dtype
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), make_column_selector(dtype_include=np.number)),
            ('cat', OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=object)),
            ('datetime', datetime_transformer, custom_column_selector(is_datetime))
        ])
    
    # Usage of the preprocessor
    data_transformed = preprocessor.fit_transform(df)
    

    After preprocessing:

    num__a num__g cat__f_cat cat__f_dog datetime__b datetime__c datetime__d datetime__e
    -1.5667 -1 1 0 2023 1970 2023 2023
    -1.21854 1 0 1 2023 1970 2023 2023
    -0.870388 -1 1 0 2023 1970 2023 2023
    -0.522233 1 0 1 2023 1970 2023 2023
    -0.174078 -1 1 0 2023 1970 2023 2023
    0.174078 1 0 1 2023 1970 2023 2023
    0.522233 -1 1 0 2023 1970 2023 2023
    0.870388 1 0 1 2023 1970 2023 2023
    1.21854 -1 1 0 2023 1970 2023 2023
    1.5667 1 0 1 2023 1970 2023 2023