I want to select columns based on their datetime data types. My DataFrame has for example columns with types np.dtype('datetime64[ns]')
, np.datetime64
and 'datetime64[ns, UTC]'
.
Is there a generic way to select all columns with a datetime datatype?
For instance, this works:
from sklearn.compose import make_column_selector
selector = make_column_selector(dtype_include=(np.dtype('datetime64[ns]'),np.datetime64))
selected_columns = selector(df)
But this doesn't (datatype from a pandas df with 'UTC'):
from sklearn.compose import make_column_selector
selector = make_column_selector(dtype_include=np.dtype('datetime64[ns, UTC]')
selected_columns = selector(df)
Compared to numeric data types where you can simply use np.number
instead of np.int64 etc.
API-Reference to make_column_selector
: LINK
As far as I know, there is not parent class to catch all the datetime columns in a generic way as you pointed out with np.number
.
My guess is you want to use it in a sklearn
pipeline
for preprocessing? In that case you can use a custom selector:
# Custom function to check if dtype is a datetime
def is_datetime(dtype):
dtype_str = str(dtype)
# Check for timezone-aware and timezone-naive datetimes
if 'datetime64[ns' in dtype_str or np.issubdtype(dtype, np.datetime64):
return True
return False
# Custom function to mimic make_column_selector with custom type checks
def custom_column_selector(df, dtype_checker):
return [col for col, dtype in df.dtypes.items() if dtype_checker(dtype)]
This is a simple test of the custom selector on a dummy DataFrame
with different types of data types, and for the dates also with timezone-aware and timezone-naive dates.
# Imports
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn import set_config
from sklearn.compose import make_column_selector
# Config
set_config(transform_output="pandas")
# Example DataFrame with mixed dtypes and different datetime types
df = pd.DataFrame({
'a': range(10),
'b': pd.to_datetime(range(10), unit='D', origin=pd.Timestamp('2023-01-01')),
'c': pd.to_datetime(range(10), utc=True),
'd': pd.Series(pd.date_range("2023-01-01", periods=10)).dt.tz_localize('America/New_York'),
'e': pd.Series(pd.date_range("2023-01-01", periods=10)).dt.tz_localize('Europe/London'),
'f': ['cat', 'dog']*5,
'g': [1.1, 2.2]*5
})
# Custom function to check if dtype is a datetime
def is_datetime(dtype):
dtype_str = str(dtype)
# Check for timezone-aware and timezone-naive datetimes
if 'datetime64[ns' in dtype_str or np.issubdtype(dtype, np.datetime64):
return True
return False
# Define the datetime transformer, exchange with whichever you pefer
def extract_year(df):
return df.apply(lambda x: x.dt.year)
datetime_transformer = Pipeline(steps=[
('year', FunctionTransformer(extract_year, validate=False))
])
# Custom column selector function
def custom_column_selector(dtype_checker):
def selector(df):
return [col for col, dtype in df.dtypes.items() if dtype_checker(dtype)]
return selector
# Create preprocessor with transformers
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), make_column_selector(dtype_include=np.number)),
('cat', OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=object)),
('datetime', datetime_transformer, custom_column_selector(is_datetime))
])
# Usage of the preprocessor
data_transformed = preprocessor.fit_transform(df)
After preprocessing:
num__a | num__g | cat__f_cat | cat__f_dog | datetime__b | datetime__c | datetime__d | datetime__e |
---|---|---|---|---|---|---|---|
-1.5667 | -1 | 1 | 0 | 2023 | 1970 | 2023 | 2023 |
-1.21854 | 1 | 0 | 1 | 2023 | 1970 | 2023 | 2023 |
-0.870388 | -1 | 1 | 0 | 2023 | 1970 | 2023 | 2023 |
-0.522233 | 1 | 0 | 1 | 2023 | 1970 | 2023 | 2023 |
-0.174078 | -1 | 1 | 0 | 2023 | 1970 | 2023 | 2023 |
0.174078 | 1 | 0 | 1 | 2023 | 1970 | 2023 | 2023 |
0.522233 | -1 | 1 | 0 | 2023 | 1970 | 2023 | 2023 |
0.870388 | 1 | 0 | 1 | 2023 | 1970 | 2023 | 2023 |
1.21854 | -1 | 1 | 0 | 2023 | 1970 | 2023 | 2023 |
1.5667 | 1 | 0 | 1 | 2023 | 1970 | 2023 | 2023 |