Search code examples
r ggplot2 nse

Using aes_ instead of aes_string for ggplot


I'm trying to loop through a subset of columns of the data frame btest (data below) and plot some data. I still do not quite understand the tidyeval system, though it seems this is not fully implemented in ggplot2 yet anyway?

I can do this using the code below:

# Names of the cell-type columns to plot; one scatter plot is built per column.
bcells <- LETTERS[1:9]

# aes_string() takes every aesthetic as a string, so the loop variable `cell`
# can be passed straight through as the y mapping.
lapply(bcells, function(cell) {
  point_mapping <- aes_string(x = "response", y = cell, color = "response")
  ggplot(data = btest) + geom_point(point_mapping)
})

However, the documentation for aes_string and aes_ states that aes_ is preferable.

aes_string and aes_ are particularly useful when writing functions that create plots because you can use strings or quoted names/calls to define the aesthetic mappings, rather than having to use substitute to generate a call to aes().

I recommend using aes_(), because creating the equivalents of aes(colour = "my colour") or aes(x = `X$1`) with aes_string() is quite clunky.

MY QUESTION: I do not understand why aes_ is preferable, or how I would implement the same code using aes_ instead of aes_string here.

My data is below (this is the output of dput(btest); assign it to btest to reproduce):

structure(list(A = c(0.982753626864792, 0.490259710510256, 0.454306062926931, 
0.443880090600994, 0.992704372174903, 0.831160693384458, 0.740981059382211, 
0.971324123908582, 0.589614636646592, 0.663016559532728, 0.0442355006911685, 
0.758388810061299, 0.39969185826509, 0.96343122781417, 0.578266180477106, 
0.935289565081073, 0.954414616390872, 0.476708144579943, 0.906912570353835, 
0.860767420084079, 0.878055964477507, 0.717065411183562, 0.626679994549329, 
0.00471648517899614), B = c(0, 0.0359489937221843, 0.0455022610469154, 
0.0245991717594771, 0, 0.0150049906282552, 0.0357444181630181, 
0.00409885247542489, 0.0153491030612378, 0.0116215258999156, 
0.0971266806096337, 0.00523783384210994, 0.000104337240641022, 
0.000475801870965878, 0.0388763858222573, 0, 0.00947456311935685, 
0.091028293882243, 0, 0.00114639793715674, 0.01709111810953, 
0.00642564686487351, 0.0122005596623948, 0.0735538728126912), 
    C = c(0.00396910361917928, 0.3726373775819, 0.389150409858995, 
    0.236004149077653, 0.00654522233764124, 0.0757762646142197, 
    0.064467293054645, 0.021714367242937, 0.0510773710590119, 
    0.279742249706993, 0.228435750343793, 0.0163968987432784, 
    0.0386501968648076, 0.0147706021018908, 0.117796354856799, 
    0.0262705486829676, 0.0116437209145554, 0.249652632820836, 
    0.0708357724821996, 0.115182751748712, 0.0485081551895102, 
    0.0430306406326062, 0.0185687667917195, 0.062321917083855
    ), D = c(0, 0, 0.00728600019514972, 0.00320524248329104, 
    0, 0.0063037888029564, 0.00654538187729239, 0, 0.0176038859003177, 
    0, 0.0181870750390433, 0.00152581718814669, 0.00977725964480791, 
    0, 0, 0.00447626637015039, 0, 0, 0, 0, 0, 0.0453332704320773, 
    0, 0), E = c(0, 0, 0, 0.000475498116547242, 0.000291416366767824, 
    0, 0, 0.000108185143509404, 0, 0, 0.000585484789620521, 0, 
    0.00119989502426795, 0.000562924764494004, 0, 0, 0.000232527879948303, 
    6.96708420418182e-05, 0.000472096790474276, 0.000545274075130702, 
    0.000572161953294472, 0, 0, 0.0111234621378363), F = c(0.0113619316667346, 
    0.0761221446319925, 0.0940043097282167, 0.181463421237771, 
    0.00045898912068803, 0.0379484560273567, 0.130661228056559, 
    0.00273248163097645, 0.27374951093064, 0.0456196648603633, 
    0.311899809955928, 0.200378764906006, 0.483217874497928, 
    0.0162868512293491, 0.187555044444225, 0.0336927109381938, 
    0.0179346325967824, 0.141906152617276, 0.0167439810037839, 
    0.0137012129908311, 0.0297632632518369, 0.170891255992311, 
    0.210301640776889, 0.0808642159093989), G = c(0.00140289433926378, 
    0.00867420181911554, 0, 0.0103474797609997, 0, 0.0217237781037489, 
    0, 2.19895985703425e-05, 0, 0, 0.113543191682212, 0.00442851495302812, 
    0, 0, 0.0176396645397039, 0.000270908927614937, 0.00629993909848545, 
    0.0406351052576609, 0.00503557936970754, 0.00865694316409033, 
    0.0191377054890488, 0, 0.0115149714931613, 0.243819008858108
    ), H = c(0, 0.00293624962565618, 0, 0.0109784338152019, 0, 
    0, 0, 0, 0.0368789010169724, 0, 0, 0.0104630777433289, 0.0618086409502145, 
    0, 0.0252902404777352, 0, 0, 0, 0, 0, 0, 0.0101281816458403, 
    0.051238858176748, 0), I = c(0.000512443510029838, 0.0134213221088962, 
    0.00975095624379213, 0.0890465131480651, 0, 0.0120820284390054, 
    0.0216006194662742, 0, 0.0157265913852278, 0, 0.185986506888601, 
    0.00318028256280335, 0.00554993751224351, 0.00447259221913014, 
    0.0345761293821743, 0, 0, 0, 0, 0, 0.00687163152927302, 0.00712559324872945, 
    0.0694952085497587, 0.523601038019114), `P-value` = c(9999, 
    9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 
    9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 
    9999, 9999, 9999), Correlation = c(0.787379117728473, 0.713767273835577, 
    0.432941448432532, 0.654688521787571, 0.690623129562749, 
    0.72269025999843, 0.535092134674879, 0.795288368310815, 0.754840745986047, 
    0.0872468087627683, 0.760738916041899, 0.875990453791969, 
    0.878637700077733, 0.851326230903871, 0.458259685017224, 
    0.815125101981778, 0.299231595131615, 0.613359452217542, 
    0.424264050686203, 0.691764490900993, 0.806704730396525, 
    0.602426815978143, 0.786361339790331, 0.871574807143838), 
    RMSE = c(0.698736121897212, 0.75020398425833, 0.901716663988092, 
    0.763690363629575, 0.758968447930353, 0.757909848657902, 
    0.8482750320726, 0.695776594753745, 0.668395739137566, 1.04995120161959, 
    0.654740332409367, 0.590052129521314, 0.484783647407576, 
    0.659059332792332, 0.890274043213301, 0.687194392331628, 
    0.966871968720401, 0.807189528281839, 0.906250907041538, 
    0.770762860306121, 0.704446496934398, 0.805908330153981, 
    0.652426738364919, 0.490900219800415), sample_id = c("x6494", 
    "x1867", "x5038", "x5118", "x4631", "x6126", "x2051", "x0346", 
    "x2056", "x4949", "x5784", "x7357", "x1509", "x9449", "x0167", 
    "x9521", "x1494", "x7623", "x9705", "x4810", "x3549", "x6336", 
    "x9699", "x8727"), patient_id = c("x6494", "x1867", "x5038", 
    "x5118", "x4631", "x6126", "x2051", "x0346", "x2056", "x4949", 
    "x5784", "x7357", "x1509", "x9449", "x0167", "x9521", "x1494", 
    "x7623", "x9705", "x4810", "x3549", "x6336", "x9699", "x8727"
    ), treated = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
    TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
    TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE), timing = c("post", 
    "pre", "post", "post", "post", "pre", "pre", "post", "pre", 
    "post", "pre", "post", "post", "post", "pre", "post", "pre", 
    "post", "post", "post", "pre", "post", "post", "pre"), response = c("nonresp", 
    "nonresp", "nonresp", "nonresp", "nonresp", "resp", "nonresp", 
    "nonresp", "nonresp", "nonresp", "nonresp", "nonresp", "resp", 
    "nonresp", "nonresp", "nonresp", "nonresp", "resp", "nonresp", 
    "nonresp", "nonresp", "nonresp", "resp", "nonresp"), dataset = c("sny", 
    "sny", "sny", "sny", "sny", "sny", "sny", "sny", "sny", "sny", 
    "sny", "sny", "sny", "sny", "sny", "sny", "sny", "sny", "sny", 
    "sny", "sny", "sny", "sny", "sny"), OS_status = c(1, 1, 1, 
    1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 
    0, 1), OS_time = c(36.5, 78.2142857142857, 62.5714285714286, 
    140.785714285714, 26.0714285714286, 99.0714285714286, 41.7142857142857, 
    151.214285714286, 203.357142857143, 36.5, 26.0714285714286, 
    104.285714285714, 234.642857142857, 31.2857142857143, 140.785714285714, 
    140.785714285714, 104.285714285714, 208.571428571429, 62.5714285714286, 
    20.8571428571429, 26.0714285714286, 359.785714285714, 229.428571428571, 
    20.8571428571429), filtercol = structure(c(2L, 3L, 2L, 2L, 
    2L, 3L, 3L, 2L, 3L, 2L, 3L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 2L, 
    2L, 3L, 2L, 2L, 3L), .Label = c("on", "post", "pre"), class = "factor")), class = "data.frame", row.names = c(NA, 
-24L), .Names = c("A", "B", "C", "D", "E", "F", "G", "H", "I", 
"P-value", "Correlation", "RMSE", "sample_id", "patient_id", 
"treated", "timing", "response", "dataset", "OS_status", "OS_time", 
"filtercol"))

Solution

  • Aside from objections on the grounds that storing code as strings is bad, the idea behind preferring aes_ to aes_string is explained in their examples in the documentation:

    # You can't easily mimic these calls with aes_string
    aes(`$100`, colour = "smooth")
    aes_(~ `$100`, colour = "smooth")
    # Ok, you can, but it requires a _lot_ of quotes
    aes_string("`$100`", colour = '"smooth"')
    

    It also lays out how to get your example to work:

    # Three ways of generating the same aesthetics
    aes(mpg, wt, col = cyl)
    aes_(quote(mpg), quote(wt), col = quote(cyl))
    aes_(~mpg, ~wt, col = ~cyl)
    aes_string("mpg", "wt", col = "cyl")
    ...
    # Convert strings to names with as.name
    var <- "cyl"
    ...
    aes_(col = as.name(var))
    

    In context, then,

    library(ggplot2)

    # One scatter plot per cell-type column: `response` on x and the column
    # named by `cell` on y. as.name() converts the string into a symbol, and
    # the one-sided formulas quote `response`, which is how aes_() expects
    # variable mappings to be supplied.
    plots <- lapply(LETTERS[1:9], function(cell) {
        cell_mapping <- aes_(x = ~response, y = as.name(cell), colour = ~response)
        ggplot(btest, mapping = cell_mapping) + geom_point()
    })

    # Display the third plot in the list.
    plots[[3]]
    

    However, at some point in the future, ggplot2 NSE will change, likely to use rlang so it's more uniform with the rest of the tidyverse, so don't put too much effort into figuring out its subtleties now; if you can get aes_string to do what you want, go for it. For now.

    Instead, I might suggest a different approach entirely: converting your nine plots into one faceted one. You'll need to reshape the data to long form, but at that point it's not much more complicated:

    library(tidyr)

    # Reshape the nine cell-type columns (A through I) into long form: the
    # column name goes into `cell` and its value into `value`. pivot_longer()
    # is the current replacement for the superseded gather().
    # Then draw a single plot with one panel per cell type.
    btest %>%
        pivot_longer(A:I, names_to = "cell", values_to = "value") %>%
        ggplot(aes(response, value, color = response)) +
        geom_point() +
        facet_wrap(~cell)