Search code examples
python · datetime · scikit-learn · scikit-learn-pipeline

How can I use sklearn's make_column_selector to select all valid datetime columns?


I want to select columns based on their datetime data types. My DataFrame has for example columns with types np.dtype('datetime64[ns]'), np.datetime64 and 'datetime64[ns, UTC]'.

Is there a generic way to select all columns with a datetime datatype?


For instance, this works:

from sklearn.compose import make_column_selector
# A tuple of dtypes works: this matches timezone-naive pandas datetime columns
# ('datetime64[ns]') and plain numpy datetime64 columns -- but NOT
# timezone-aware dtypes such as 'datetime64[ns, UTC]'.
selector = make_column_selector(dtype_include=(np.dtype('datetime64[ns]'),np.datetime64))
selected_columns = selector(df)

But this doesn't work (timezone-aware dtype from a pandas DataFrame with 'UTC'):

from sklearn.compose import make_column_selector
# This fails because np.dtype() cannot parse pandas' timezone-aware dtype
# string: it raises TypeError before the selector is even constructed.
selector = make_column_selector(dtype_include=np.dtype('datetime64[ns, UTC]'))
selected_columns = selector(df)

Compared to numeric data types where you can simply use np.number instead of np.int64 etc.

API reference for make_column_selector: LINK


Solution

  • As far as I know, there is no parent class that catches all datetime columns in a generic way, analogous to np.number for numeric types, as you pointed out.

    My guess is you want to use it in a sklearn pipeline for preprocessing? In that case you can use a custom selector:

    # Custom function to check if dtype is a datetime
    def is_datetime(dtype):
        """Return True for timezone-naive and timezone-aware datetime dtypes.

        Matches pandas' timezone-aware dtypes (e.g. 'datetime64[ns, UTC]')
        as well as numpy datetime64 dtypes of any unit; returns False for
        everything else, including pandas extension dtypes.
        """
        # Timezone-aware pandas dtypes stringify as 'datetime64[ns, <tz>]',
        # so a substring match catches them before the numpy check below.
        if 'datetime64[ns' in str(dtype):
            return True
        # np.issubdtype raises TypeError for pandas extension dtypes that
        # numpy cannot interpret (e.g. CategoricalDtype) -- treat those as
        # non-datetime instead of crashing.
        try:
            return bool(np.issubdtype(dtype, np.datetime64))
        except TypeError:
            return False
    
    # Custom function to mimic make_column_selector with custom type checks
    def custom_column_selector(df, dtype_checker):
        """Return the names of df's columns whose dtype passes dtype_checker."""
        selected = []
        for name, col_dtype in df.dtypes.items():
            if dtype_checker(col_dtype):
                selected.append(name)
        return selected
    

    Test

    This is a simple test of the custom selector on a dummy DataFrame with different types of data types, and for the dates also with timezone-aware and timezone-naive dates.

    # Imports
    import pandas as pd
    import numpy as np
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
    from sklearn.impute import SimpleImputer
    from sklearn import set_config
    from sklearn.compose import make_column_selector
    
    # Config
    # Make all transformers return pandas DataFrames instead of numpy arrays,
    # so column names survive the pipeline.
    set_config(transform_output="pandas")
    
    # Example DataFrame with mixed dtypes and different datetime types
    df = pd.DataFrame({
        'a': range(10),  # integers
        'b': pd.to_datetime(range(10), unit='D', origin=pd.Timestamp('2023-01-01')),  # tz-naive datetimes
        'c': pd.to_datetime(range(10), utc=True),  # tz-aware (UTC); epoch-based, so years are 1970
        'd': pd.Series(pd.date_range("2023-01-01", periods=10)).dt.tz_localize('America/New_York'),  # tz-aware (US)
        'e': pd.Series(pd.date_range("2023-01-01", periods=10)).dt.tz_localize('Europe/London'),  # tz-aware (EU)
        'f': ['cat', 'dog']*5,  # strings (object dtype)
        'g': [1.1, 2.2]*5  # floats
    })
    
    # Custom function to check if dtype is a datetime
    def is_datetime(dtype):
        """Return True for timezone-naive and timezone-aware datetime dtypes.

        Matches pandas' timezone-aware dtypes (e.g. 'datetime64[ns, UTC]')
        as well as numpy datetime64 dtypes of any unit; returns False for
        everything else, including pandas extension dtypes.
        """
        # Timezone-aware pandas dtypes stringify as 'datetime64[ns, <tz>]',
        # so a substring match catches them before the numpy check below.
        if 'datetime64[ns' in str(dtype):
            return True
        # np.issubdtype raises TypeError for pandas extension dtypes that
        # numpy cannot interpret (e.g. CategoricalDtype) -- treat those as
        # non-datetime instead of crashing.
        try:
            return bool(np.issubdtype(dtype, np.datetime64))
        except TypeError:
            return False
    
    # Define the datetime transformer; swap in whichever feature extraction you prefer
    def extract_year(df):
        """Replace every datetime column of df with its year component."""
        years = {name: column.dt.year for name, column in df.items()}
        return pd.DataFrame(years, index=df.index)
    
    # Pipeline with a single step: map each datetime column to its year.
    datetime_transformer = Pipeline(
        steps=[
            ('year', FunctionTransformer(extract_year, validate=False)),
        ]
    )
    
    # Custom column selector function
    def custom_column_selector(dtype_checker):
        """Build a ColumnTransformer-compatible column selector.

        The returned callable accepts a DataFrame and yields the names of
        the columns whose dtype satisfies dtype_checker.
        """
        def selector(df):
            picked = []
            for column, column_dtype in df.dtypes.items():
                if dtype_checker(column_dtype):
                    picked.append(column)
            return picked
        return selector
    
    # Create preprocessor with transformers
    # Each column group is routed to a suitable transformer:
    #  - numeric columns -> standardization (np.number catches ints and floats)
    #  - object (string) columns -> one-hot encoding
    #  - datetime columns (naive AND tz-aware) -> year extraction, selected by
    #    the custom selector since make_column_selector has no generic
    #    datetime catch-all dtype
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), make_column_selector(dtype_include=np.number)),
            ('cat', OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=object)),
            ('datetime', datetime_transformer, custom_column_selector(is_datetime))
        ])
    
    # Usage of the preprocessor
    data_transformed = preprocessor.fit_transform(df)
    

    After preprocessing:

    num__a num__g cat__f_cat cat__f_dog datetime__b datetime__c datetime__d datetime__e
    -1.5667 -1 1 0 2023 1970 2023 2023
    -1.21854 1 0 1 2023 1970 2023 2023
    -0.870388 -1 1 0 2023 1970 2023 2023
    -0.522233 1 0 1 2023 1970 2023 2023
    -0.174078 -1 1 0 2023 1970 2023 2023
    0.174078 1 0 1 2023 1970 2023 2023
    0.522233 -1 1 0 2023 1970 2023 2023
    0.870388 1 0 1 2023 1970 2023 2023
    1.21854 -1 1 0 2023 1970 2023 2023
    1.5667 1 0 1 2023 1970 2023 2023