In my df
below, I want to :
col_E
using z-scorescol_D
& col_E
See below for the dataset
import pandas as pd
from scipy import stats
# intialise data of lists
df = {
'col_A':['P0', 'P1', 'P2', 'P4', 'P5'],
'col_B':[1,1,1,1,1],
'col_C':[1,2,3,5,9],
'col_D':[120.05, 181.90, 10.34, 153.10, 311.17],
'col_E':[110.21, 191.12, 190.21, 12.00, 245.09 ],
'col_F':[100.22,199.10, 191.13,199.99, 255.19],
'col_G':[140.29, 291.07, 390.22, 245.09, 4122.62],
}
# Create DataFrame
df = pd.DataFrame(df)
# Print the output.
df
Desired: flag all outliers in col_D
first and then col_D
and col_E
secondly (Note: In my image below 10.34
and 12.00
were randomly highlighted)
Q1
Attempt:
#Q1
exclude_cols = ['col_A','col_B','col_C','col_D','col_F','col_G']
include_cols = ['col_E'] # desired column
def flag_outliers(s, exclude_cols):
if s.name in exclude_cols:
print(s.name)
return ''
else:
s=df[(np.abs(stats.zscore(df['col_E'])) > 3)] # not sure of this part of the code
return ['background-color: yellow' if v else '' for v in indexes]
df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1, subset=include_cols)
#Q2
exclude_cols = ['col_A','col_B','col_C','col_F','col_G']
include_cols = ['col_D','col_E'] # desired columns
def flag_outliers(s, exclude_cols):
if s.name in exclude_cols:
print(s.name)
return ''
else:
s=df[(np.abs(stats.zscore(df['col_E'])) > 3)] # not sure of this part of the code
return ['background-color: yellow' if v else '' for v in indexes]
df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1, subset=include_cols)
Thanks!
I assume the following meanings to demonstrate a broader range of usage.
If Q2 is meant to calculated on multiple columns separately, then you can simply loop your Q1 solution over multiple columns, which should be trivial so I will omit such situation here.
axis=None
must be used). According to the official docs, when applying style over a DataFrame, the returning object must also be a DataFrame with the same index and columns as the subset. This is what caused the reshaping and DataFrame construction artifacts.Note that z=3
is lowered to 1.5
for demonstration purpose.
# desired column
include_cols = ['col_E']
# additional control
outlier_threshold = 1.5 # 3 won't work!
ddof = 0 # degree of freedom correction. Sample = 1 and population = 0.
def flag_outliers(s: pd.Series):
outlier_mask = np.abs(stats.zscore(s, ddof=ddof)) > outlier_threshold
# replace boolean values with corresponding strings
return ['background-color: yellow' if val else '' for val in outlier_mask]
df.style.apply(flag_outliers, subset=include_cols)
Result
include_cols = ['col_D', 'col_E'] # desired columns
outlier_threshold = 1.5
ddof = 0
def flag_outliers(s: pd.DataFrame) -> pd.DataFrame:
outlier_mask = np.abs(stats.zscore(s.values.reshape(-1), axis=None, ddof=ddof)) > outlier_threshold
# prepare the array of string to be returned
arr = np.array(['background-color: yellow' if val else '' for val in outlier_mask], dtype=object).reshape(s.shape)
# cast the array into dataframe
return pd.DataFrame(arr, columns=s.columns, index=s.index)
df.style.apply(flag_outliers, axis=None, subset=include_cols)
Result