For example, we have the dataset `tips` with the columns `day`, `total_bill` and `sex`.
I want to visualize boxplots (x=`day`, y=`total_bill`, color=`sex`). After that, I want to run a t-test between female and male participants within each day and get its p-value. If the p-value is < 0.05, I want to add an asterisk. How could I change the code below?
This example compares different days with each other, without splitting by sex:
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
# Load the demo dataset and draw one box trace per day, in the listed order.
tips = px.data.tips()
fig = go.Figure()
for day in ['Thur', 'Fri', 'Sat', 'Sun']:
    # Only the total_bill values of rows recorded on this day.
    bills_for_day = tips.loc[tips['day'] == day, 'total_bill']
    fig.add_trace(go.Box(y=bills_for_day, name=day, boxpoints='outliers'))
def add_pvalue_annotation(days, y_range, symbol=''):
    """Draw a bracket between two day boxes and label it with the test result.

    Runs ``stats.ttest_ind`` on the ``total_bill`` values of the two days,
    read from the module-level ``tips`` frame, then adds three line shapes
    (the bracket) and one text annotation to the module-level ``fig``.

    arguments:
    days --- a list of two different days e.g. ['Thur','Sat']
    y_range --- a list of y_range in the form [y_min, y_max] in paper units
    symbol --- label text; replaced by '*' when the p-value is < 0.05 and by
               'ns' when it is >= 0.05, so it is only kept if neither
               comparison holds
    """
    first_day, second_day = days[0], days[1]
    y_low, y_high = y_range[0], y_range[1]

    first_bills = tips.loc[tips['day'] == first_day, 'total_bill']
    second_bills = tips.loc[tips['day'] == second_day, 'total_bill']
    pvalue = stats.ttest_ind(first_bills, second_bills).pvalue

    if pvalue < 0.05:
        symbol = '*'
    elif pvalue >= 0.05:
        symbol = 'ns'

    # The bracket is three straight segments: up at the first day, across to
    # the second day, and back down. x is in axis units, y in paper units.
    bracket_segments = (
        (first_day, y_low, first_day, y_high),
        (first_day, y_high, second_day, y_high),
        (second_day, y_high, second_day, y_low),
    )
    for seg_x0, seg_y0, seg_x1, seg_y1 in bracket_segments:
        fig.add_shape(
            type="line",
            xref="x",
            yref="paper",
            x0=seg_x0,
            y0=seg_y0,
            x1=seg_x1,
            y1=seg_y1,
            line=dict(color="black", width=2),
        )

    # The label is placed by numeric position: each day name is mapped to its
    # index (0, 1, 2, ...) and the text sits halfway between the two indices.
    position_of = {
        name: pos for pos, name in enumerate(['Thur', 'Fri', 'Sat', 'Sun'])
    }
    label_x = (position_of[first_day] + position_of[second_day]) / 2
    label = {
        "font": {"color": "black", "size": 14},
        "x": label_x,
        "y": y_high * 1.03,  # slightly above the top of the bracket
        "showarrow": False,
        "text": symbol,
        "textangle": 0,
        "xref": "x",
        "yref": "paper",
    }
    fig.add_annotation(label)
# Two brackets, stacked at different heights (paper units) so they do not overlap.
add_pvalue_annotation(days=['Thur', 'Sun'], y_range=[1.01, 1.02])
add_pvalue_annotation(days=['Thur', 'Sat'], y_range=[1.05, 1.06])
fig.show()
I found this useful example here: Plotly box p-value significant annotation
When you are setting up the boxplots, `px.box` from `plotly.express` is useful because you can pass the argument `color="sex"`, which creates one boxplot per sex for every day (two boxes per day). You'll also want to sort the `tips` DataFrame so that the days of the week are plotted in order.
Then the `add_pvalue_annotation` function can be modified so that it calculates the p-value for the t-test between men and women within each day (instead of the t-tests between tips for different days of the week). You'll also want to change the starting and ending points of the annotations so that they run between the Men and Women categories within the same day instead of between different days.
For the tips
dataset, I ran t-tests between the men and women within each day of the week (e.g. men and women on Thur, men and women on Fri...), and none of the p-values are below 0.05.
However, to demonstrate that the `add_pvalue_annotation` function places the annotations correctly, I set the p-value threshold to 0.15 so that the p-value between men and women on Friday (p-value = 0.13) will be annotated on the chart.
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
from pandas.api.types import CategoricalDtype
# Load the demo dataset and give `day` an ordered categorical dtype so that
# sorting follows the order in cat_order rather than alphabetical order.
tips = px.data.tips()
cat_order = ['Thur', 'Fri', 'Sat', 'Sun']
cat_weekdays = CategoricalDtype(categories=cat_order, ordered=True)
tips['day'] = tips['day'].astype(cat_weekdays)
tips = tips.sort_values(by='day')

# color="sex" splits each day's box by the values of the `sex` column.
fig = px.box(tips, x="day", y="total_bill", color="sex")
def add_pvalue_annotation(day, y_range, symbol='', pvalue_th=0.05):
    """Mark one day with a bracket and '*' when Male and Female bills differ.

    Runs ``stats.ttest_ind`` on the ``total_bill`` values of the 'Male' and
    'Female' rows for ``day``, read from the module-level ``tips`` frame. When
    the p-value is below ``pvalue_th``, three line shapes (a bracket) and a
    '*' label are added to the module-level ``fig``; otherwise nothing is
    drawn.

    arguments:
    day --- the day to test between Men and Women (e.g. 'Thur'); it is looked
            up in ``cat_order`` when an annotation is drawn
    y_range --- a list of y_range in the form [y_min, y_max] in paper units
    symbol --- accepted for call compatibility; the drawn label is always '*'
    pvalue_th --- threshold the p-value must be below for the day to be marked
    """
    on_day = tips['day'] == day
    male_bills = tips.loc[on_day & (tips['sex'] == 'Male'), 'total_bill']
    female_bills = tips.loc[on_day & (tips['sex'] == 'Female'), 'total_bill']
    pvalue = stats.ttest_ind(male_bills, female_bills).pvalue

    # Results that are not below the threshold are left unannotated.
    if not pvalue < pvalue_th:
        return

    # Each day name is mapped to its index (0, 1, 2, ...) in cat_order; the
    # bracket spans 0.2 axis units either side of that position.
    # NOTE(review): 0.2 is presumably chosen to reach the two grouped boxes.
    position_of = {name: pos for pos, name in enumerate(cat_order)}
    x_coordinate = position_of[day]
    x_start = x_coordinate - 0.2
    x_end = x_coordinate + 0.2
    y_low, y_high = y_range[0], y_range[1]
    symbol = '*'

    # The bracket is three straight segments: up on the left, across the top,
    # and back down on the right. x is in axis units, y in paper units.
    bracket_segments = (
        (x_start, y_low, x_start, y_high),
        (x_start, y_high, x_end, y_high),
        (x_end, y_high, x_end, y_low),
    )
    for seg_x0, seg_y0, seg_x1, seg_y1 in bracket_segments:
        fig.add_shape(
            type="line",
            xref="x",
            yref="paper",
            x0=seg_x0,
            y0=seg_y0,
            x1=seg_x1,
            y1=seg_y1,
            line=dict(color="black", width=2),
        )

    label = {
        "font": {"color": "black", "size": 14},
        "x": x_coordinate,
        "y": y_high * 1.03,  # slightly above the top of the bracket
        "showarrow": False,
        "text": symbol,
        "textangle": 0,
        "xref": "x",
        "yref": "paper",
    }
    fig.add_annotation(label)
# Test every day; 0.15 is used instead of the default threshold of 0.05.
for day in cat_order:
    add_pvalue_annotation(day=day, y_range=[1.01, 1.02], pvalue_th=0.15)
fig.show()