Search code examples
Tags: r, function, dplyr, tidyeval, quosure

Tidy evaluation syntax in R user-defined function


I want to define a generic function

# Draw a boxplot of `yvar` against `xvar`, grouped by `groupvar`, with whiskers
# meant to reach min/max. Groups whose IQR is 0 are supposed to receive a tiny
# random offset first so that the box does not collapse.
#   tmp                  : data frame holding the columns (optionally `yield`)
#   xvar, yvar, groupvar : bare (unquoted) column names
# Returns the ggplot object.
func_boxplot2 <- function(tmp, xvar, yvar, groupvar)
{
  # Capture the column arguments as quosures so they can be unquoted with !!
  xvar <- enquo(xvar)
  yvar <- enquo(yvar)
  groupvar <- enquo(groupvar)

  # If variable yield exists, set yvar to NA wherever the rounded yield is
  # below the threshold (hard-coded to 85 here)
  if( "yield" %in% names(tmp) )
  {
    tmp <- tmp %>%
      mutate_at(vars(!!yvar), ~ifelse(round(yield, 0) < 85, NA, .))
  }
  
  # Compute the IQR of yvar for each level of groupvar (one row per group)
  tmp_iqr <- tmp %>%
    group_by(!!groupvar) %>%
    summarise(iqr=IQR(!!yvar, na.rm = TRUE))
  
  # NOTE(review): `tmp_iqr[[!!groupvar]]` is evaluated inside the mutate_at()
  # lambda, so the unquoted column becomes a full-length vector used as a `[[`
  # subscript. This appears to be the source of the reported error
  # "Subscript `date` has size 60 but must be size 1".
  # NOTE(review): runif(1, ...) draws a single offset, not one per row.
  p <- ggplot(data = tmp %>% mutate_at(vars(!!yvar), ~ifelse(tmp_iqr[which(tmp_iqr[[!!groupvar]] %in% (!!xvar)),]$iqr == 0, . + runif(1, -0.01, 0.01), . )), aes(x = !!xvar, y = !!yvar))
  p <- p + stat_boxplot(aes(group = !!groupvar), na.rm = TRUE, coef = 10000)   # Trick (large unrealistic coef value) so whiskers end at min(y) & max(y)
  # Second boxplot layer drawn on top, with outlier points hidden
  p <- p + geom_boxplot(na.rm = TRUE, outlier.shape = NA)

  return(p)
}

which is able to plot boxplot whiskers extending to min/max even when the IQR is 0. I am trying to achieve this by adding tiny random numbers (below the significance level) to the offending data to avoid IQR = 0.

However, I must have missed something in the syntax about quosure, because running this function

# Arguments map to tmp = data, xvar = date, yvar = days, groupvar = date
func_boxplot2(data, date, days, date)

with the dataset

structure(list(date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 
4L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 
7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 
10L, 10L, 10L, 10L, 10L), .Label = c("2010", "2011", "2012", 
"2013", "2014", "2015", "2016", "2017", "2018", "2019"), class = c("ordered", 
"factor")), station = c("41B011", "41MEU1", "41N043", "41R001", 
"41R012", "41WOL1", "41B011", "41MEU1", "41N043", "41R001", "41R012", 
"41WOL1", "41B011", "41MEU1", "41N043", "41R001", "41R012", "41WOL1", 
"41B011", "41MEU1", "41N043", "41R001", "41R012", "41WOL1", "41B011", 
"41MEU1", "41N043", "41R001", "41R012", "41WOL1", "41B011", "41MEU1", 
"41N043", "41R001", "41R012", "41WOL1", "41B011", "41MEU1", "41N043", 
"41R001", "41R012", "41WOL1", "41B011", "41MEU1", "41N043", "41R001", 
"41R012", "41WOL1", "41B011", "41MEU1", "41N043", "41R001", "41R012", 
"41WOL1", "41B011", "41MEU1", "41N043", "41R001", "41R012", "41WOL1"
), days = c(16, 15, 45, 26, 14, 14, 32, 7, 87, 42, 24, 23, 25, 
25, 55, 29, 29, 16, 11, 14, 58, 21, 19, 10, 10, 14, 33, 18, 10, 
7, 9, 10, 19, 7, 8, 7, 1, 5, 15, 8, 1, 4, 5, 6, 14, 6, 5, 5, 
3, 5, 19, 8, 4, 5, 3, 4, 16, 3, 1, 3), yield = c(98.4817351598173, 
49.4520547945205, 95.8561643835616, 97.6712328767123, 98.2648401826484, 
95.1598173515982, 97.8767123287671, 27.9109589041096, 98.310502283105, 
98.972602739726, 97.203196347032, 96.2100456621005, 98.7818761384335, 
96.7554644808743, 97.4954462659381, 98.8046448087432, 98.747723132969, 
98.3037340619308, 99.0525114155251, 96.1986301369863, 97.1004566210046, 
96.4954337899543, 96.3698630136986, 98.2077625570776, 96.62100456621, 
98.3675799086758, 95.6963470319635, 96.8835616438356, 93.5844748858447, 
87.8196347031963, 91.2328767123288, 92.5570776255708, 81.5182648401827, 
82.7739726027397, 90.1826484018265, 87.1461187214612, 87.2153916211293, 
92.9986338797814, 94.6948998178506, 85.5760473588342, 92.3611111111111, 
96.2204007285975, 86.3698630136986, 86.4269406392694, 87.796803652968, 
93.2762557077626, 96.6438356164384, 95.6164383561644, 71.3812785388128, 
93.7442922374429, 96.3698630136986, 97.2602739726027, 95.7876712328767, 
94.7146118721461, 87.6141552511416, 43.0821917808219, 88.6872146118722, 
92.6826484018265, 90.365296803653, 86.541095890411), environ = structure(c(5L, 
4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 
3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 
3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L, 5L, 
4L, 6L, 3L, 5L, 3L, 5L, 4L, 6L, 3L, 5L, 3L), .Label = c("Urbain avec très forte influence du trafic", 
"Urbain avec forte influence du trafic", "Urbain avec influence modérée du trafic", 
"Urbain avec faible influence du trafic", "Urbain avec très faible influence du trafic", 
"Industriel avec influence modérée du trafic"), class = "factor")), row.names = c(NA, 
-60L), class = c("tbl_df", "tbl", "data.frame"))

gives me the following errors

 Error: Problem with `mutate()` input `days`.
x Must extract column with a single valid subscript.
x Subscript `date` has size 60 but must be size 1.
ℹ Input `days` is `(structure(function (..., .x = ..1, .y = ..2, . = ..1) ...`.

What is wrong with my syntax, please?

Many thanks,

A.

======== UPDATE ==========

Using the suggested updated function

# Variant of func_boxplot2 using the {{ }} (embrace) operator instead of
# enquo() / !!. Takes a data frame `tmp` and three bare column names, and
# returns the ggplot object built by the final expression.
func_boxplot2 <- function(tmp, xvar, yvar, groupvar)
{
  # If variable yield exists, set yvar to NA wherever the rounded yield is
  # below the threshold (hard-coded to 85 here)
  if("yield" %in% names(tmp)) {
    tmp <-
      tmp %>%
      mutate(across({{yvar}}, ~ifelse(round(yield, 0) < 85, NA, .)))
  }
  
  # Per group: try to add a small random offset to yvar when the group's IQR is 0
  tmp <-
    tmp %>%
    group_by({{groupvar}}) %>%
    mutate(
      across({{yvar}}, function (x) {
        # NOTE(review): the test below is a single value per group, and
        # ifelse() returns a result shaped like its test. The lambda therefore
        # yields one value (built from x[1]) that mutate() recycles over the
        # group -- consistent with every row showing the group's first value.
        # runif(1, ...) likewise draws only one offset.
        ifelse(
          IQR({{yvar}}, na.rm = TRUE) == 0,
          x + runif(1, -0.01,0.01),
          x
        )
      })
    )

  # coef = 10000 is the large-coefficient trick so whiskers end at min/max;
  # the second boxplot layer is drawn with outlier points hidden.
  ggplot(tmp, aes(x = {{xvar}}, y = {{yvar}})) +
    stat_boxplot(aes(group = {{groupvar}}), na.rm = TRUE, coef = 10000) +
    geom_boxplot(na.rm = TRUE, outlier.shape = NA)
}

results in the following plot

enter image description here

As stated in my comment, it seems that the processing of tmp outputs, for all rows of the same year, the first value of yvar for that year, which explains the plot. Indeed, commenting out this block gives the following figure

enter image description here


Solution

  • The condition in ifelse wasn't the correct length. You can change the data argument of ggplot() to this:

    data =
      tmp %>%
        group_by(!!groupvar) %>%
        mutate_at(
            vars(!!yvar),
            # The leading `~` turns this into a lambda that mutate_at() applies
            # to the column within each group. Without it the `if` is evaluated
            # once, eagerly, with `.` bound to the piped data frame.
            # isTRUE() keeps a group with no non-missing values (IQR is NA)
            # from raising "missing value where TRUE/FALSE needed".
            ~ if (isTRUE(IQR(., na.rm = TRUE) == 0)) {
              # One draw per row: a single shared offset would shift the whole
              # group by a constant and leave its IQR at 0.
              . + runif(length(.), -0.01, 0.01)
            } else {
              .
            }
        )
    

    Your usage of quosures and !! is correct; however, you should use the more recent {{ }} (embrace) operator instead.

    This is the updated function

    #' Boxplot whose whiskers reach min/max even when a group's IQR is 0
    #'
    #' Groups of `yvar` with an IQR of exactly 0 receive a tiny per-row random
    #' offset (uniform in [-0.01, 0.01]) so that the box does not collapse.
    #'
    #' @param tmp A data frame. If it has a `yield` column, `yvar` is set to
    #'   `NA` wherever the rounded yield is below 85.
    #' @param xvar,yvar,groupvar Bare (unquoted) column names of `tmp` used
    #'   for the x axis, the y axis and the grouping respectively.
    #' @return A ggplot object.
    func_boxplot2 <- function(tmp, xvar, yvar, groupvar) {
      # If variable yield exists, set yvar to NA wherever the rounded yield
      # is below the threshold (hard-coded to 85)
      if ("yield" %in% names(tmp)) {
        tmp <-
          tmp %>%
          mutate(across({{ yvar }}, ~ ifelse(round(yield, 0) < 85, NA, .)))
      }

      tmp <-
        tmp %>%
        group_by({{ groupvar }}) %>%
        mutate(
          across({{ yvar }}, function(x) {
            # isTRUE(): IQR() is NA for a group with no non-missing values
            # (possible after the yield filter above), and `if (NA)` errors.
            if (isTRUE(IQR(x, na.rm = TRUE) == 0)) {
              # One draw per row; a single shared offset would keep IQR at 0.
              x + runif(length(x), -0.01, 0.01)
            } else {
              x
            }
          })
        ) %>%
        # Drop the grouping so it does not leak into later steps
        ungroup()

      # coef = 10000 is a deliberately huge value so whiskers end at
      # min(y) and max(y); the second layer redraws the boxes with outlier
      # points hidden.
      ggplot(tmp, aes(x = {{ xvar }}, y = {{ yvar }})) +
        stat_boxplot(aes(group = {{ groupvar }}), na.rm = TRUE, coef = 10000) +
        geom_boxplot(na.rm = TRUE, outlier.shape = NA)
    }
    
    # Example: `days` on the y axis, with `date` as both x axis and grouping
    func_boxplot2(data, date, days, date)
    

    plot