I'll start with my goal which is to generate graphs for each of my variables (magnitude [mag], duration [dura] and distance [dist] but with distinct error bars for train and test. :
I have a data-frame that looks like this: (screenshot + dput below). It shows the responses (magnitude, distance, duration) during train and test of various biological strains along with their standard error (SEM). For example, the duration response at train is in column "train_avg_dura" and at test is "test_avg_dura". The standard error for each of these is in the columns train_duraSEM and test_duraSEM
df_group_sum.wide (data-frame)
dput data:
df_group_sum.wide <-
structure(list(strain = structure(1:8, .Label = c("N2", "acy-1(LOF)",
"acy-1(GOF)", "pde-4", "unc-43", "crh-1", "glr-1", "avr-14"), class = "factor"),
test_avg_dist = c(0.23102447163515, 0.198503787878788, 0.23892936802974,
0.247270588235294, 0.148316666666667, 0.195762711864407,
0.204740740740741, 0.238755154639175), test_avg_dura = c(1.04759733036707,
1.15537878787879, 0.914684014869888, 1.12286274509804, 0.828916666666667,
0.785491525423729, 0.788407407407407, 1.02309278350515),
test_avg_mag = c(0.112163461525871, 0.113447031611172, 0.15930172539742,
0.105397926645665, 0.0370000063024116, 0.0823626968797451,
0.0441620688813484, 0.135786546158742), test_distSEM = c(0.00460504533342531,
0.0050568065734325, 0.00945562739572128, 0.00524044558789062,
0.00882224860763199, 0.00983820301449839, 0.0162322856355826,
0.00738407922404085), test_duraSEM = c(0.0187491841242793,
0.0287113186085301, 0.0283764910080623, 0.0215386973519077,
0.0471018319675206, 0.0341593217329755, 0.0564553992545153,
0.0271939362203803), test_magSEM = c(0.00335619679815181,
0.00443251320170775, 0.00919066553588191, 0.00432150262248429,
0.00400887448034098, 0.00664866437888279, 0.00575860867691942,
0.00524462205156711), train_avg_dist = c(0.337652222222222,
0.294218518518519, 0.338651851851852, 0.311313725490196,
0.254675, 0.2737, 0.390688888888889, 0.314817948717949),
train_avg_dura = c(1.3543, 1.429, 1.19151851851852, 1.37256862745098,
1.236, 1.06376666666667, 1.41396296296296, 1.31512820512821
), train_avg_mag = c(0.1930557426236, 0.19297076970836, 0.212916856705011,
0.127417008935649, 0.0841239843171108, 0.117210954090848,
0.115413610503398, 0.179227387006556)), class = "data.frame", .Names = c("strain",
"test_avg_dist", "test_avg_dura", "test_avg_mag", "test_distSEM",
"test_duraSEM", "test_magSEM", "train_avg_dist", "train_avg_dura",
"train_avg_mag"), row.names = c(NA, -8L))
The problem I am having is how to add error bars using SEM since I need them to be merged into one variable as opposed to two when I map the variable to geom_bar. I think this is a melt issue but I can't figure it out.
Update:
The melted data-frame I used to plot the graph is as follows:
structure(list(strain = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L), .Label = c("N2", "acy-1(LOF)",
"acy-1(GOF)", "pde-4", "unc-43", "crh-1", "glr-1", "avr-14"), class = "factor"),
variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L,
15L, 15L, 15L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L,
18L, 18L, 18L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L), .Label = c("test_avg_dist",
"test_avg_dura", "test_avg_mag", "test_avg_prob", "test_avg_spd",
"test_distSEM", "test_duraSEM", "test_magSEM", "test_probSEM",
"test_spdSEM", "train_avg_dist", "train_avg_dura", "train_avg_mag",
"train_avg_prob", "train_avg_spd", "train_distSEM", "train_duraSEM",
"train_magSEM", "train_probSEM", "train_spdSEM"), class = "factor"),
value = c(0.23102447163515, 0.198503787878788, 0.23892936802974,
0.247270588235294, 0.148316666666667, 0.195762711864407,
0.204740740740741, 0.238755154639175, 1.04759733036707, 1.15537878787879,
0.914684014869888, 1.12286274509804, 0.828916666666667, 0.785491525423729,
0.788407407407407, 1.02309278350515, 0.112163461525871, 0.113447031611172,
0.15930172539742, 0.105397926645665, 0.0370000063024116,
0.0823626968797451, 0.0441620688813484, 0.135786546158742,
0.457040018571118, 0.563727434411572, 0.624264612406578,
0.392625726149316, 0.219488346025285, 0.355836464305103,
0.158243463050796, 0.549997886634136, 0.218104671667048,
0.175578055416405, 0.256197987699313, 0.218534931269605,
0.181253278716812, 0.235434749265196, 0.236043513165036,
0.229165553562148, 0.00460504533342531, 0.0050568065734325,
0.00945562739572128, 0.00524044558789062, 0.00882224860763199,
0.00983820301449839, 0.0162322856355826, 0.00738407922404085,
0.0187491841242793, 0.0287113186085301, 0.0283764910080623,
0.0215386973519077, 0.0471018319675206, 0.0341593217329755,
0.0564553992545153, 0.0271939362203803, 0.00335619679815181,
0.00443251320170775, 0.00919066553588191, 0.00432150262248429,
0.00400887448034098, 0.00664866437888279, 0.00575860867691942,
0.00524462205156711, 0.00460504533342531, 0.0050568065734325,
0.00945562739572128, 0.00524044558789062, 0.00882224860763199,
0.00983820301449839, 0.0162322856355826, 0.00738407922404085,
0.00148090077905166, 0.00224725406956702, 0.00293788372166611,
0.00142518092482957, 0.00475313026432338, 0.00259537819051875,
0.00439432015310276, 0.00179190641262238, 0.337652222222222,
0.294218518518519, 0.338651851851852, 0.311313725490196,
0.254675, 0.2737, 0.390688888888889, 0.314817948717949, 1.3543,
1.429, 1.19151851851852, 1.37256862745098, 1.236, 1.06376666666667,
1.41396296296296, 1.31512820512821, 0.1930557426236, 0.19297076970836,
0.212916856705011, 0.127417008935649, 0.0841239843171108,
0.117210954090848, 0.115413610503398, 0.179227387006556,
0.525206741295172, 0.606796097537911, 0.592920766963248,
0.383218177729097, 0.294853306191478, 0.37983654970313, 0.244065736387288,
0.529995494304863, 0.245519078777542, 0.204069564920836,
0.279438682643543, 0.223741850875084, 0.203505986396722,
0.244494243449087, 0.263225928969608, 0.235094347033923,
0.00509151719343593, 0.00741331297357774, 0.0110354960774679,
0.0058641318136066, 0.0114389388703232, 0.0108143010933781,
0.0182904578688527, 0.00913426247712326, 0.0167858570502119,
0.0279705569908445, 0.030133138276768, 0.0219057666071679,
0.0479637760140276, 0.0332974908188985, 0.0605392786801207,
0.0323033076008837, 0.00498395111761598, 0.0081988397756359,
0.0107052683837969, 0.00442352355941589, 0.00723029142814287,
0.00764631328347674, 0.00980735575566329, 0.00789476278044047,
0.00509151719343593, 0.00741331297357774, 0.0110354960774679,
0.0058641318136066, 0.0114389388703232, 0.0108143010933781,
0.0182904578688527, 0.00913426247712326, 0.00139403793044242,
0.00220415921330836, 0.00299625483623813, 0.00144528089431754,
0.00441088530148196, 0.00248394605240026, 0.00319027562414684,
0.00174638373495128)), row.names = c(NA, -160L), .Names = c("strain",
"variable", "value"), class = "data.frame")
The code I used to plot this (after removing SEM rows) is as follows:
(abs_bar_mag <-
df_group_sum.long %>%
filter(grepl("mag", variable)) %>%
ggplot(aes(x = strain,
y = value,
fill = variable))+
scale_fill_manual(values=c("lightseagreen", "indianred1"))+
geom_bar(stat="identity", position = "dodge") +
#geom_errorbar(aes(ymin=value-1, ymax=value+1), width=.1, position = position_dodge(width=0.9)) +
theme(panel.background = element_blank()) +
theme(text = element_text(size = 20),
axis.line = element_line(colour = "black")) +
ggtitle("") +
theme(plot.title = element_text(size = 30, hjust = 0.5, face = "bold"),
axis.text = element_text(size = 70),
strip.text = element_text(size = 40),
axis.text.x = element_text(angle = 65, hjust = 1, size = 40),
axis.title.y = (element_text(size = 65)))
+
labs(colour = "",
y = "Magnitude",
x = "") +
scale_colour_manual(values = rev())
)
I appreciate any pointers or solutions you may have!
Thanks, Aram
The issue here is that the avg columns and the SEM (standard error) columns need to stay together. This requires to reshape two value columns simultaneously. See section 3.a of Efficient reshaping using data.tables for more details.
Therefore, we start with the data in wide format (df_group_sum.wide
). To be in line with the code provided by the OP, only magnitudes are plotted.
library(data.table)
library(ggplot2)
molten <- melt(
data.table(df_group_sum.wide), id.vars = "strain",
measure.vars = patterns("avg_mag$", "magSEM$"),
value.name = c("avg", "SEM"))[
, variable := forcats::lvls_revalue(variable, c("test_mag", "train_mag"))][]
molten
strain variable avg SEM 1: N2 test_mag 0.11216346 0.003356197 2: acy-1(LOF) test_mag 0.11344703 0.004432513 3: acy-1(GOF) test_mag 0.15930173 0.009190666 4: pde-4 test_mag 0.10539793 0.004321503 5: unc-43 test_mag 0.03700001 0.004008874 6: crh-1 test_mag 0.08236270 0.006648664 7: glr-1 test_mag 0.04416207 0.005758609 8: avr-14 test_mag 0.13578655 0.005244622 9: N2 train_mag 0.19305574 NA 10: acy-1(LOF) train_mag 0.19297077 NA 11: acy-1(GOF) train_mag 0.21291686 NA 12: pde-4 train_mag 0.12741701 NA 13: unc-43 train_mag 0.08412398 NA 14: crh-1 train_mag 0.11721095 NA 15: glr-1 train_mag 0.11541361 NA 16: avr-14 train_mag 0.17922739 NA
ggplot(molten,
aes(strain, avg, ymin = avg - SEM, ymax = avg + SEM, fill = variable)) +
geom_col(position = "dodge") +
geom_errorbar(width=.1, position = position_dodge(width=0.9)) +
scale_fill_manual(values=c("lightseagreen", "indianred1")) +
theme_bw() +
labs(fill = "", y = "Magnitude", x = "")
The OP also has provided a data.frame in long format df_group_sum.long
which does contain more data than df_group_sum.wide
. These should be plotted as well, now.
By looking at the variable names
unique(df_group_sum.long$variable)
[1] test_avg_dist test_avg_dura test_avg_mag test_avg_prob test_avg_spd [6] test_distSEM test_duraSEM test_magSEM test_probSEM test_spdSEM [11] train_avg_dist train_avg_dura train_avg_mag train_avg_prob train_avg_spd [16] train_distSEM train_duraSEM train_magSEM train_probSEM train_spdSEM 20 Levels: test_avg_dist test_avg_dura test_avg_mag test_avg_prob ... train_spdSEM
the data.frame seems to contain aggregated data (avg
and SEM
) of five different variables (dist
, dura
, mag
, prob
, spd
) of two data sets (train
and test
). Again, avg
and SEM
need to stay together on one row for plotting bar charts with error bars.
Unfortunately, the naming scheme is inconsistent. It would have been better if the variables containing standard errors would have been named similar to train_avg_mag
, e.g., train_SEM_mag
instead of train_magSEM
.
So, the first step is to split up the variable
names to get the different groups separately:
library(data.table)
DT <- data.table(df_group_sum.long)
DT[, c("dataset", "measure", "variable") :=
DT[, tstrsplit(variable, "_|SEM$")][is.na(V3), `:=`(V3 = V2, V2 = "SEM")]]
DT
strain variable value dataset measure 1: N2 dist 0.231024472 test avg 2: acy-1(LOF) dist 0.198503788 test avg 3: acy-1(GOF) dist 0.238929368 test avg 4: pde-4 dist 0.247270588 test avg 5: unc-43 dist 0.148316667 test avg --- 156: pde-4 spd 0.001445281 train SEM 157: unc-43 spd 0.004410885 train SEM 158: crh-1 spd 0.002483946 train SEM 159: glr-1 spd 0.003190276 train SEM 160: avr-14 spd 0.001746384 train SEM
unique(DT[, variable])
"dist" "dura" "mag" "prob" "spd"
unique(DT[, dataset])
"test" "train"
unique(DT[, measure])
"avg" "SEM"
Now, the abbreviated variable names are replaced by their full names using an update on join:
abbr2full <- data.table(
variable = c("dist", "dura", "mag"),
full = c("Distance", "Duration", "Magnitude")
)
DT[abbr2full, on = "variable", variable := full][]
Finally, a facetted plot of all five variables is created. dcast()
is used to reshape the data from long to a wide format where each row has two measures avg
and SEM
.
library(ggplot2)
ggplot(dcast(DT, ... ~ measure),
aes(strain, avg, ymin = avg - SEM, ymax = avg + SEM, fill = dataset)) +
geom_col(position = "dodge") +
geom_errorbar(width=.1, position = position_dodge(width=0.9)) +
scale_fill_manual(values=c("lightseagreen", "indianred1")) +
theme_bw() +
labs(fill = "", y = "Average", x = "") +
facet_wrap(~ variable, scales = "free_y") +
theme(axis.text.x = element_text(angle = 65, hjust = 1))