I have 5 data frames that contain the prediction scores of ML models. The data frames have exactly the same columns, just different samples (each data frame was made with a different seed). Let's call the data frames `seed1`, `seed2`, ..., `seed5`.
There are 3 types of columns that I'm interested in: columns that start with `cr`, with `pd`, and with `st_po`. In each of the 5 data frames there are 4 columns that start with `cr`, 4 that start with `pd`, and 4 that start with `st_po`.
I want to make boxplots arranged in rows. Each row represents one data frame and its 4 columns. Overall I'll have 3 plots: a plot for the `cr` columns, a plot for the `pd` columns, and a plot for the `st_po` columns.
For example, the `cr` plot would look something like this, with 5 rows and 4 boxes in each row (because of the 4 columns; each color represents a column):
Data - this is a subset of one of the five data frames. The other 4 look exactly the same, just with different sample names.
structure(list(st_po_meta = c(0.382992297410965, 0.460950464010239,
0.447804838418961, 0.447804838418961, 0.460950464010239, 0.447804838418961,
0.369836807250977, 0.447804838418961, 0.369836807250977, 0.447804838418961
), st_po_meta_added = c(0.460011065006256, 0.52004611492157, 0.253930300474167,
0.222006008028984, 0.302200853824615, 0.485153168439865, 0.20485857129097,
0.350892871618271, 0.331338971853256, 0.295754462480545), st_po_meta_genes = c(0.277256995439529,
0.425392180681229, 0.182383552193642, 0.253527283668518, 0.329186052083969,
0.305586904287338, 0.188975885510445, 0.238625407218933, 0.497761845588684,
0.342641144990921), st_po_all = c(0.565486133098602,
0.564990341663361, 0.164183273911476, 0.15946152806282, 0.234778091311455,
0.396436214447021, 0.172556579113007, 0.257463246583939, 0.43759897351265,
0.200696632266045), cr_meta = c(0.0779446139931679, 0.274154871702194,
0.0718425810337067, 0.0718425810337067, 0.274154871702194, 0.0718425810337067,
0.160841777920723, 0.0718425810337067, 0.160841777920723, 0.0718425810337067
), cr_meta_added = c(0.0662130266427994, 0.258626192808151, 0.0601647943258286,
0.060332003980875, 0.141631454229355, 0.113691322505474, 0.0786028951406479,
0.0668068528175354, 0.171470999717712, 0.0754544585943222), cr_meta_genes = c(0.144175127148628,
0.145591989159584, 0.0984272509813309, 0.0906868129968643, 0.28544145822525,
0.138114541769028, 0.091837003827095, 0.0904595032334328, 0.211963757872581,
0.163982316851616), cr_all = c(0.118267595767975,
0.199180424213409, 0.0867970511317253, 0.0653180256485939, 0.203389659523964,
0.149213299155235, 0.126333728432655, 0.0975232273340225, 0.256154090166092,
0.146669581532478), pd_meta = c(0.287342727184296, 0.417357206344604,
0.255898356437683, 0.255898356437683, 0.417357206344604, 0.255898356437683,
0.471998482942581, 0.255898356437683, 0.471998482942581, 0.255898356437683
), pd_meta_added = c(0.299634903669357, 0.418549239635468, 0.424916654825211,
0.432371437549591, 0.584436595439911, 0.296403467655182, 0.563782930374146,
0.3688924908638, 0.578023612499237, 0.278784334659576), pd_meta_genes = c(0.314525783061981,
0.343217849731445, 0.315391361713409, 0.353350460529327, 0.562197327613831,
0.292534917593002, 0.616392850875854, 0.284660279750824, 0.532478809356689,
0.341239869594574), pd_all = c(0.216887220740318,
0.283130913972855, 0.248720198869705, 0.421182304620743, 0.564142644405365,
0.204288363456726, 0.778401911258698, 0.272624760866165, 0.511143803596497,
0.186424404382706)), row.names = c("04d83340b8bd", "122T", "1c2a5ac94492",
"1d209304d988", "212T", "24ab7fecc92e", "356T", "379fe8924c51",
"39T", "3ec4d3fc8bd1"), class = "data.frame")
Reformat the data before plotting. First, row-bind all the seeds (as an example I am using the same data frame 3 times, as seeds 1 to 3). Then, for each column type, subset its columns and convert them from wide to long format, and row-bind the results again. Finally, we have long-format data suitable for ggplot with facets:
library(ggplot2)
library(data.table)

# Create three example data frames. `seed1` must already exist (e.g. the
# dput() output assigned as `seed1 <- structure(...)`); the same data frame
# is reused here only to demonstrate the reshaping.
seed2 <- seed3 <- seed1

# Stack the seed data frames; the new `data` column records which object
# each row came from. The pattern only matches names of the form seed<N>, so
# unrelated objects such as `seeds` or `seed_list` are not picked up.
d <- rbindlist(mget(ls(pattern = "^seed[0-9]+$")), idcol = "data")

# For every score type: keep its columns, reshape wide -> long, and strip the
# type prefix so that `grp` holds only the variant name (meta, meta_added,
# ...). The list is named by score type, which rbindlist() turns into the
# `type` column.
score_types <- c("cr", "pd", "st_po")
dplot <- rbindlist(
  lapply(setNames(nm = score_types), function(i) {
    prefix <- paste0(i, "_")
    # Match on "<type>_" rather than "<type>" so that an unrelated column
    # that merely begins with the same letters is not selected.
    cols <- c("data", colnames(d)[startsWith(colnames(d), prefix)])
    x <- melt(
      d[, ..cols],
      id.vars = "data",
      variable.name = "grp",
      value.name = "value"
    )
    # Remove only the leading prefix; an unanchored gsub() would also delete
    # any later occurrence of the same text inside the column name.
    x[, grp := sub(paste0("^", prefix), "", grp)]
    x
  }),
  idcol = "type"
)

# One row of panels per score type, one column per seed data frame.
ggplot(dplot, aes(grp, value, fill = grp)) +
  geom_boxplot() +
  facet_grid(type ~ data)
Or flip the facet:
# Same boxplots with the facets transposed: one row of panels per seed data
# frame and one column per score type.
ggplot(data = dplot, mapping = aes(x = grp, y = value, fill = grp)) +
  geom_boxplot() +
  facet_grid(rows = vars(data), cols = vars(type))