dataset: https://github.com/rashida048/Datasets/blob/master/StudentsPerformance.csv
from bokeh.models import Range1d #used to set x and y limits #p.y_range=Range1d(120, 230)
def box_plot(df, vals, label, ylabel=None,xlabel=None,title=None):
# Group Data frame
df_gb = df.groupby(label)
# Get the categories
cats = list(df_gb.groups.keys())
# Compute quartiles for each group
q1 = df_gb[vals].quantile(q=0.25)
q2 = df_gb[vals].quantile(q=0.5)
q3 = df_gb[vals].quantile(q=0.75)
# Compute interquartile region and upper and lower bounds for outliers
iqr = q3 - q1
upper_cutoff = q3 + 1.5*iqr
lower_cutoff = q1 - 1.5*iqr
# Find the outliers for each category
def outliers(group):
cat = group.name
outlier_inds = (group[vals] > upper_cutoff[cat]) \
| (group[vals] < lower_cutoff[cat])
return group[vals][outlier_inds]
# Apply outlier finder
out = df_gb.apply(outliers).dropna()
# Points of outliers for plotting
outx = []
outy = []
for cat in cats:
# only add outliers if they exist
if cat in out and not out[cat].empty:
for value in out[cat]:
outx.append(cat)
outy.append(value)
# If outliers, shrink whiskers to smallest and largest non-outlier
qmin = df_gb[vals].min()
qmax = df_gb[vals].max()
upper = [min([x,y]) for (x,y) in zip(qmax, upper_cutoff)]
lower = [max([x,y]) for (x,y) in zip(qmin, lower_cutoff)]
cats = [str(i) for i in cats]
# Build figure
p = figure(sizing_mode='stretch_width', x_range=cats,height=300,toolbar_location=None)
p.xgrid.grid_line_color = None
p.ygrid.grid_line_width = 2
p.yaxis.axis_label = ylabel
p.xaxis.axis_label = xlabel
p.title=title
p.y_range.start=0
p.title.align = 'center'
# stems
p.segment(cats, upper, cats, q3, line_width=2, line_color="black")
p.segment(cats, lower, cats, q1, line_width=2, line_color="black")
# boxes
p.rect(cats, (q3 + q1)/2, 0.5, q3 - q1, fill_color=['#a50f15', '#de2d26', '#fb6a4a', '#fcae91', '#fee5d9'],
alpha=0.7, line_width=2, line_color="black")
# median (almost-0 height rects simpler than segments)
p.rect(cats, q2, 0.5, 0.01, line_color="black", line_width=2)
# whiskers (almost-0 height rects simpler than segments)
p.rect(cats, lower, 0.2, 0.01, line_color="black")
p.rect(cats, upper, 0.2, 0.01, line_color="black")
# outliers
p.circle(outx, outy, size=6, color="black")
return p
p = box_plot(df, 'Total', 'race/ethnicity', ylabel='Total spread',xlabel='',title='BoxPlot')
show(p)
Hi there, from the code and dataset above I am able to produce a boxplot considering I pass through categorical variables. however I am unable to produce anything when I try to produce a boxplot for a single column. for example just checking the spread of the math scores. i tried to do
cats = df['math score']
but it didnt work. any suggestions?
I am not sute if this it is the best to implement this both in one function, but if this is your goal, one solution can be, to add a few if-else
conditions.
Here is a description of the changes:
First give label
a default.
# old
# def box_plot(df, vals, label, ylabel=None,xlabel=None,title=None):
# new
def box_plot(df, vals, label=None, ylabel=None,xlabel=None,title=None):
Then add a if-else
part for the groupby section.
# old
# # Group Data frame
# df_gb = df.groupby(label)
# # Get the categories
# cats = list(df_gb.groups.keys())
# new
if label is not None:
# Group Data frame
df_gb = df.groupby(label)
# Get the categories
cats = list(df_gb.groups.keys())
else:
df_gb = df[[vals]]
cats = [vals]
Now the calculation for the outliners is a bit different, because we don't have to loop over a number of columns. Only onw column is left.
if label is not None:
out = df_gb.apply(outliers).dropna()
else:
out = df[(df[vals] > upper_cutoff) | (df[vals] < lower_cutoff)]
The upper and lower part are now floats
and not a list
.
if label is not None:
upper = [min([x,y]) for (x,y) in zip(qmax, upper_cutoff)]
lower = [max([x,y]) for (x,y) in zip(qmin, lower_cutoff)]
else:
upper =min(qmax, upper_cutoff)
lower =max(qmin, lower_cutoff)
I also added (changed) the line below, to avoid a warning.
colors = ['#a50f15', '#de2d26', '#fb6a4a', '#fcae91', '#fee5d9'][:len(cats)]
p.rect(cats, (q3 + q1)/2, 0.5, q3 - q1, fill_color=colors, alpha=0.7, line_width=2, line_color="black")
With these changes the output for
p = box_plot(df, 'math score', 'race/ethnicity', ylabel='Total spread',xlabel='',title='BoxPlot')
is still the same, but
p = box_plot(df, 'math score', ylabel='Total spread',xlabel='',title='BoxPlot')
gives us now a boxplot.