I have a pandas dataframe with both scalar columns and array columns, e.g.
# Example frame: one plain scalar column and one column whose cells are lists.
df = pd.DataFrame(
    {
        "scalar": [1, 2, 3, 4],
        "array": [[10, 20], [30, 40], [50, 60], [70, 80]],
    }
)
I want to write a sklearn transformer to flatten it, so that
transformer = ???
transformer.fit_transform(df)
===>
[[1 10 20]
 [2 30 40]
 [3 50 60]
 [4 70 80]]
How might I achieve that?
Since this transformation is stateless (nothing has to be learned during `fit`),
you can use `FunctionTransformer` to build a transformer from a plain function.
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer
# Sample input: a scalar column next to a column whose cells are lists.
df = pd.DataFrame(
    {
        "scalar": [1, 2, 3, 4],
        "array": [[10, 20], [30, 40], [50, 60], [70, 80]],
    }
)
def flatten_df_rows(df):
    """Flatten every row of *df* into a flat run of scalar values.

    Cells that hold a list, a tuple or a NumPy array (nested to any depth)
    are expanded in place, left to right; every other cell value is kept
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are scalars or (possibly nested) sequences.

    Returns
    -------
    numpy.ndarray
        One entry per row of ``df``, built with ``np.array`` from the
        flattened rows. The result is a regular 2-D array when every row
        flattens to the same number of values.
    """
    def flatten(row):
        # Walk the values depth-first, expanding nested containers as we go.
        for val in row:
            if isinstance(val, np.ndarray):
                # tolist() yields nested lists (or a bare scalar for a 0-d
                # array), so arrays share the list handling below.
                val = val.tolist()
            if isinstance(val, (list, tuple)):
                yield from flatten(val)
            else:
                yield val

    # Flatten each row of the frame independently, then stack the rows.
    return np.array([list(flatten(row)) for row in df.values.tolist()])
# Wrap the row-flattening function so it exposes the scikit-learn
# transformer API (fit / transform / fit_transform).
transform = FunctionTransformer(func=flatten_df_rows)

# Apply the wrapped function to the example frame.
out = transform.fit_transform(df)
Output:
>>> out
array([[ 1, 10, 20],
       [ 2, 30, 40],
       [ 3, 50, 60],
       [ 4, 70, 80]])