Search code examples
rggplot2meltgeom-bar

How do I melt/gather multiple variables (error bars) into one for mapping to geom_bar?


I'll start with my goal which is to generate graphs for each of my variables (magnitude [mag], duration [dura] and distance [dist] but with distinct error bars for train and test. :

Almost finished graph enter image description here

I have a data-frame that looks like this: (screenshot + dput below). It shows the responses (magnitude, distance, duration) during train and test of various biological strains along with their standard error (SEM). For example, the duration response at train is in column "train_avg_dura" and at test is "test_avg_dura". The standard error for each of these is in the columns train_duraSEM and test_duraSEM

df_group_sum.wide (data-frame)

dput data:
df_group_sum.wide <-
structure(list(strain = structure(1:8, .Label = c("N2", "acy-1(LOF)",
"acy-1(GOF)", "pde-4", "unc-43", "crh-1", "glr-1", "avr-14"), class = "factor"),
test_avg_dist = c(0.23102447163515, 0.198503787878788, 0.23892936802974,
0.247270588235294, 0.148316666666667, 0.195762711864407,
0.204740740740741, 0.238755154639175), test_avg_dura = c(1.04759733036707,
1.15537878787879, 0.914684014869888, 1.12286274509804, 0.828916666666667,
0.785491525423729, 0.788407407407407, 1.02309278350515),
test_avg_mag = c(0.112163461525871, 0.113447031611172, 0.15930172539742,
0.105397926645665, 0.0370000063024116, 0.0823626968797451,
0.0441620688813484, 0.135786546158742), test_distSEM = c(0.00460504533342531,
0.0050568065734325, 0.00945562739572128, 0.00524044558789062,
0.00882224860763199, 0.00983820301449839, 0.0162322856355826,
0.00738407922404085), test_duraSEM = c(0.0187491841242793,
0.0287113186085301, 0.0283764910080623, 0.0215386973519077,
0.0471018319675206, 0.0341593217329755, 0.0564553992545153,
0.0271939362203803), test_magSEM = c(0.00335619679815181,
0.00443251320170775, 0.00919066553588191, 0.00432150262248429,
0.00400887448034098, 0.00664866437888279, 0.00575860867691942,
0.00524462205156711), train_avg_dist = c(0.337652222222222,
0.294218518518519, 0.338651851851852, 0.311313725490196,
0.254675, 0.2737, 0.390688888888889, 0.314817948717949),
train_avg_dura = c(1.3543, 1.429, 1.19151851851852, 1.37256862745098,
1.236, 1.06376666666667, 1.41396296296296, 1.31512820512821
), train_avg_mag = c(0.1930557426236, 0.19297076970836, 0.212916856705011,
0.127417008935649, 0.0841239843171108, 0.117210954090848,
0.115413610503398, 0.179227387006556)), class = "data.frame", .Names = c("strain",
"test_avg_dist", "test_avg_dura", "test_avg_mag", "test_distSEM",
"test_duraSEM", "test_magSEM", "train_avg_dist", "train_avg_dura",
"train_avg_mag"), row.names = c(NA, -8L))

data-frame I start with

The problem I am having is how to add error bars using SEM since I need them to be merged into one variable as opposed to two when I map the variable to geom_bar. I think this is a melt issue but I can't figure it out.

Update:

The melted data-frame I used to plot the graph is as follows:

structure(list(strain = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L), .Label = c("N2", "acy-1(LOF)", 
"acy-1(GOF)", "pde-4", "unc-43", "crh-1", "glr-1", "avr-14"), class = "factor"), 
    variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
    4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
    5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 
    7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 
    9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 
    11L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 
    12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 
    14L, 14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 
    15L, 15L, 15L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 17L, 
    17L, 17L, 17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L, 
    18L, 18L, 18L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 20L, 
    20L, 20L, 20L, 20L, 20L, 20L, 20L), .Label = c("test_avg_dist", 
    "test_avg_dura", "test_avg_mag", "test_avg_prob", "test_avg_spd", 
    "test_distSEM", "test_duraSEM", "test_magSEM", "test_probSEM", 
    "test_spdSEM", "train_avg_dist", "train_avg_dura", "train_avg_mag", 
    "train_avg_prob", "train_avg_spd", "train_distSEM", "train_duraSEM", 
    "train_magSEM", "train_probSEM", "train_spdSEM"), class = "factor"), 
    value = c(0.23102447163515, 0.198503787878788, 0.23892936802974, 
    0.247270588235294, 0.148316666666667, 0.195762711864407, 
    0.204740740740741, 0.238755154639175, 1.04759733036707, 1.15537878787879, 
    0.914684014869888, 1.12286274509804, 0.828916666666667, 0.785491525423729, 
    0.788407407407407, 1.02309278350515, 0.112163461525871, 0.113447031611172, 
    0.15930172539742, 0.105397926645665, 0.0370000063024116, 
    0.0823626968797451, 0.0441620688813484, 0.135786546158742, 
    0.457040018571118, 0.563727434411572, 0.624264612406578, 
    0.392625726149316, 0.219488346025285, 0.355836464305103, 
    0.158243463050796, 0.549997886634136, 0.218104671667048, 
    0.175578055416405, 0.256197987699313, 0.218534931269605, 
    0.181253278716812, 0.235434749265196, 0.236043513165036, 
    0.229165553562148, 0.00460504533342531, 0.0050568065734325, 
    0.00945562739572128, 0.00524044558789062, 0.00882224860763199, 
    0.00983820301449839, 0.0162322856355826, 0.00738407922404085, 
    0.0187491841242793, 0.0287113186085301, 0.0283764910080623, 
    0.0215386973519077, 0.0471018319675206, 0.0341593217329755, 
    0.0564553992545153, 0.0271939362203803, 0.00335619679815181, 
    0.00443251320170775, 0.00919066553588191, 0.00432150262248429, 
    0.00400887448034098, 0.00664866437888279, 0.00575860867691942, 
    0.00524462205156711, 0.00460504533342531, 0.0050568065734325, 
    0.00945562739572128, 0.00524044558789062, 0.00882224860763199, 
    0.00983820301449839, 0.0162322856355826, 0.00738407922404085, 
    0.00148090077905166, 0.00224725406956702, 0.00293788372166611, 
    0.00142518092482957, 0.00475313026432338, 0.00259537819051875, 
    0.00439432015310276, 0.00179190641262238, 0.337652222222222, 
    0.294218518518519, 0.338651851851852, 0.311313725490196, 
    0.254675, 0.2737, 0.390688888888889, 0.314817948717949, 1.3543, 
    1.429, 1.19151851851852, 1.37256862745098, 1.236, 1.06376666666667, 
    1.41396296296296, 1.31512820512821, 0.1930557426236, 0.19297076970836, 
    0.212916856705011, 0.127417008935649, 0.0841239843171108, 
    0.117210954090848, 0.115413610503398, 0.179227387006556, 
    0.525206741295172, 0.606796097537911, 0.592920766963248, 
    0.383218177729097, 0.294853306191478, 0.37983654970313, 0.244065736387288, 
    0.529995494304863, 0.245519078777542, 0.204069564920836, 
    0.279438682643543, 0.223741850875084, 0.203505986396722, 
    0.244494243449087, 0.263225928969608, 0.235094347033923, 
    0.00509151719343593, 0.00741331297357774, 0.0110354960774679, 
    0.0058641318136066, 0.0114389388703232, 0.0108143010933781, 
    0.0182904578688527, 0.00913426247712326, 0.0167858570502119, 
    0.0279705569908445, 0.030133138276768, 0.0219057666071679, 
    0.0479637760140276, 0.0332974908188985, 0.0605392786801207, 
    0.0323033076008837, 0.00498395111761598, 0.0081988397756359, 
    0.0107052683837969, 0.00442352355941589, 0.00723029142814287, 
    0.00764631328347674, 0.00980735575566329, 0.00789476278044047, 
    0.00509151719343593, 0.00741331297357774, 0.0110354960774679, 
    0.0058641318136066, 0.0114389388703232, 0.0108143010933781, 
    0.0182904578688527, 0.00913426247712326, 0.00139403793044242, 
    0.00220415921330836, 0.00299625483623813, 0.00144528089431754, 
    0.00441088530148196, 0.00248394605240026, 0.00319027562414684, 
    0.00174638373495128)), row.names = c(NA, -160L), .Names = c("strain", 
"variable", "value"), class = "data.frame")

The code I used to plot this (after removing SEM rows) is as follows:

    (abs_bar_mag <- 
    df_group_sum.long %>% 
    filter(grepl("mag", variable)) %>% 
    ggplot(aes(x = strain,
               y = value,
               fill = variable))+
    scale_fill_manual(values=c("lightseagreen", "indianred1"))+
    geom_bar(stat="identity", position = "dodge") + 
    #geom_errorbar(aes(ymin=value-1, ymax=value+1), width=.1, position = position_dodge(width=0.9)) +
    theme(panel.background = element_blank()) +
    theme(text = element_text(size = 20),
          axis.line = element_line(colour = "black")) +
    ggtitle("") +
    theme(plot.title = element_text(size = 30, hjust = 0.5, face = "bold"),
          axis.text = element_text(size = 70),
          strip.text = element_text(size = 40),
          axis.text.x = element_text(angle = 65, hjust = 1,  size = 40),
          axis.title.y = (element_text(size = 65)))
  +
    labs(colour = "",
         y = "Magnitude",
         x = "") +
    scale_colour_manual(values = rev())
  )

I appreciate any pointers or solutions you may have!

Thanks, Aram


Solution

  • The issue here is that the avg columns and the SEM (standard error) columns need to stay together. This requires to reshape two value columns simultaneously. See section 3.a of Efficient reshaping using data.tables for more details.

    Therefore, we start with the data in wide format (df_group_sum.wide). To be in line with the code provided by the OP, only magnitudes are plotted.

    library(data.table)
    library(ggplot2)
    
    molten <- melt(
      data.table(df_group_sum.wide), id.vars = "strain", 
      measure.vars = patterns("avg_mag$", "magSEM$"),
      value.name = c("avg", "SEM"))[
        , variable := forcats::lvls_revalue(variable, c("test_mag", "train_mag"))][]
    molten
    
            strain  variable        avg         SEM
     1:         N2  test_mag 0.11216346 0.003356197
     2: acy-1(LOF)  test_mag 0.11344703 0.004432513
     3: acy-1(GOF)  test_mag 0.15930173 0.009190666
     4:      pde-4  test_mag 0.10539793 0.004321503
     5:     unc-43  test_mag 0.03700001 0.004008874
     6:      crh-1  test_mag 0.08236270 0.006648664
     7:      glr-1  test_mag 0.04416207 0.005758609
     8:     avr-14  test_mag 0.13578655 0.005244622
     9:         N2 train_mag 0.19305574          NA
    10: acy-1(LOF) train_mag 0.19297077          NA
    11: acy-1(GOF) train_mag 0.21291686          NA
    12:      pde-4 train_mag 0.12741701          NA
    13:     unc-43 train_mag 0.08412398          NA
    14:      crh-1 train_mag 0.11721095          NA
    15:      glr-1 train_mag 0.11541361          NA
    16:     avr-14 train_mag 0.17922739          NA
    
    ggplot(molten, 
      aes(strain, avg, ymin = avg - SEM, ymax = avg + SEM, fill = variable)) +
    geom_col(position = "dodge") + 
    geom_errorbar(width=.1, position = position_dodge(width=0.9)) +
    scale_fill_manual(values=c("lightseagreen", "indianred1")) +
    theme_bw() +  
    labs(fill = "", y = "Magnitude", x = "")
    

    enter image description here


    The OP also has provided a data.frame in long format df_group_sum.long which does contain more data than df_group_sum.wide. These should be plotted as well, now.

    By looking at the variable names

    unique(df_group_sum.long$variable)
    
     [1] test_avg_dist  test_avg_dura  test_avg_mag   test_avg_prob  test_avg_spd  
     [6] test_distSEM   test_duraSEM   test_magSEM    test_probSEM   test_spdSEM   
    [11] train_avg_dist train_avg_dura train_avg_mag  train_avg_prob train_avg_spd 
    [16] train_distSEM  train_duraSEM  train_magSEM   train_probSEM  train_spdSEM  
    20 Levels: test_avg_dist test_avg_dura test_avg_mag test_avg_prob ... train_spdSEM
    

    the data.frame seems to contain aggregated data (avg and SEM) of five different variables (dist, dura, mag, prob, spd) of two data sets (train and test). Again, avg and SEM need to stay together on one row for plotting bar charts with error bars.

    Unfortunately, the naming scheme is inconsistent. It would have been better if the variables containing standard errors would have been named similar to train_avg_mag, e.g., train_SEM_mag instead of train_magSEM.

    So, the first step is to split up the variable names to get the different groups separately:

    library(data.table)
    DT <- data.table(df_group_sum.long)
    DT[, c("dataset", "measure", "variable") := 
         DT[, tstrsplit(variable, "_|SEM$")][is.na(V3), `:=`(V3 = V2, V2 = "SEM")]]
    DT
    
             strain variable       value dataset measure
      1:         N2     dist 0.231024472    test     avg
      2: acy-1(LOF)     dist 0.198503788    test     avg
      3: acy-1(GOF)     dist 0.238929368    test     avg
      4:      pde-4     dist 0.247270588    test     avg
      5:     unc-43     dist 0.148316667    test     avg
     ---                                                
    156:      pde-4      spd 0.001445281   train     SEM
    157:     unc-43      spd 0.004410885   train     SEM
    158:      crh-1      spd 0.002483946   train     SEM
    159:      glr-1      spd 0.003190276   train     SEM
    160:     avr-14      spd 0.001746384   train     SEM
    
    unique(DT[, variable])
    

    "dist" "dura" "mag" "prob" "spd"

    unique(DT[, dataset])
    

    "test" "train"

    unique(DT[, measure])
    

    "avg" "SEM"

    Now, the abbreviated variable names are replaced by their full names using an update on join:

    abbr2full <- data.table(
      variable = c("dist", "dura", "mag"), 
      full = c("Distance", "Duration", "Magnitude")
    )
    DT[abbr2full, on = "variable", variable := full][]
    

    Finally, a facetted plot of all five variables is created. dcast() is used to reshape the data from long to a wide format where each row has two measures avg and SEM.

    library(ggplot2)
    ggplot(dcast(DT, ... ~ measure), 
           aes(strain, avg, ymin = avg - SEM, ymax = avg + SEM, fill = dataset)) +
      geom_col(position = "dodge") + 
      geom_errorbar(width=.1, position = position_dodge(width=0.9)) +
      scale_fill_manual(values=c("lightseagreen", "indianred1")) +
      theme_bw() +  
      labs(fill = "", y = "Average", x = "") + 
      facet_wrap(~ variable, scales = "free_y") +
      theme(axis.text.x = element_text(angle = 65, hjust = 1))
    

    enter image description here