Search code examples
rfunctiontidyversestability

Custom column names as arguments in the functions of stability R package


I developed the stability R package which can be installed from CRAN.

install.packages("stability")

However, I have difficulty in making it to take custom column names as function arguments. Here is an example of add_anova function

library(stability)
data(ge_data)

YieldANOVA <-
  add_anova(
      .data = ge_data
    , .y    = Yield
    , .rep  = Rep
    , .gen  = Gen
    , .env  = Env
  )
YieldANOVA

The above code works fine. However, when I change the column names of the data.frame, it doesn't work as below:

df1 <- ge_data
names(df1) <- c("G", "Institute", "R", "Block", "E", "Y")

fm1 <-
  add_anova(
      .data = df1
    , .y    = Y
    , .rep  = Rep
    , .gen  = G
    , .env  = E
  )

Error in model.frame.default(formula = terms(.data$Y ~ .data$E + .data$Rep:.data$E +  : 
  invalid type (NULL) for variable '.data$Rep'

Similarly another function stab_reg

fm1Reg <-
  stab_reg(
      .data = df1
    , .y    = Y
    , .gen  = G
    , .env  = E
  )

Error in eval(predvars, data, env) : object 'Gen' not found

The codes of these functions can be accessed by

getAnywhere(add_anova.default)

function (.data, .y, .rep, .gen, .env) 
{
    Y <- enquo(.y)
    Rep <- enquo(.rep)
    G <- enquo(.gen)
    E <- enquo(.env)
    fm1 <- lm(formula = terms(.data$Y ~ .data$E + .data$Rep:.data$E + 
        .data$G + .data$G:.data$E, keep.order = TRUE), data = .data)
    fm1ANOVA <- anova(fm1)
    rownames(fm1ANOVA) <- c("Env", "Rep(Env)", "Gen", "Gen:Env", 
        "Residuals")
    fm1ANOVA[1, 4] <- fm1ANOVA[1, 3]/fm1ANOVA[2, 3]
    fm1ANOVA[2, 4] <- NA
    fm1ANOVA[1, 5] <- 1 - pf(as.numeric(fm1ANOVA[1, 4]), fm1ANOVA[1, 
        1], fm1ANOVA[2, 1])
    fm1ANOVA[2, 5] <- 1 - pf(as.numeric(fm1ANOVA[2, 4]), fm1ANOVA[2, 
        1], fm1ANOVA[5, 1])
    class(fm1ANOVA) <- c("anova", "data.frame")
    return(list(anova = fm1ANOVA))
}
<bytecode: 0xc327c28>
<environment: namespace:stability>

and

   getAnywhere(stab_reg.default)

function (.data, .y, .rep, .gen, .env) 
{
    Y <- enquo(.y)
    Rep <- enquo(.rep)
    G <- enquo(.gen)
    E <- enquo(.env)
    g <- length(levels(.data$G))
    e <- length(levels(.data$E))
    r <- length(levels(.data$Rep))
    g_means <- .data %>% dplyr::group_by(!!G) %>% dplyr::summarize(Mean = mean(!!Y))
    names(g_means) <- c("G", "Mean")
    DataNew <- .data %>% dplyr::group_by(!!G, !!E) %>% dplyr::summarize(GEMean = mean(!!Y)) %>% 
        dplyr::group_by(!!E) %>% dplyr::mutate(EnvMean = mean(GEMean))
    IndvReg <- lme4::lmList(GEMean ~ EnvMean | Gen, data = DataNew)
    IndvRegFit <- summary(IndvReg)
    StabIndvReg <- tibble::as_tibble(data.frame(g_means, Slope = coef(IndvRegFit)[, 
        , 2][, 1], LCI = confint(IndvReg)[, , 2][, 1], UCI = confint(IndvReg)[, 
        , 2][, 2], R.Sqr = IndvRegFit$r.squared, RMSE = IndvRegFit$sigma, 
        SSE = IndvRegFit$sigma^2 * IndvRegFit$df[, 2], Delta = IndvRegFit$sigma^2 * 
            IndvRegFit$df[, 2]/r))
    MeanSlopePlot <- ggplot(data = StabIndvReg, mapping = aes(x = Slope, 
        y = Mean)) + geom_point() + geom_text(aes(label = G), 
        size = 2.5, vjust = 1.25, colour = "black") + geom_vline(xintercept = 1, 
        linetype = "dotdash") + geom_hline(yintercept = mean(StabIndvReg$Mean), 
        linetype = "dotdash") + labs(x = "Slope", y = "Mean") + 
        scale_x_continuous(sec.axis = dup_axis(), labels = scales::comma) + 
        scale_y_continuous(sec.axis = dup_axis(), labels = scales::comma) + 
        theme_bw()
    return(list(StabIndvReg = StabIndvReg, MeanSlopePlot = MeanSlopePlot))
}
<bytecode: 0xe431010>
<environment: namespace:stability>

Solution

  • One of the problems in the data 'df1' is the column name is 'R' instead of "Rep" which was passed into the function. Second, the terms passed into the formula are quosures. we could change it to string with quo_names and then construct formula with paste

    add_anova1 <- function (.data, .y, .rep, .gen, .env) {
        y1 <- quo_name(enquo(.y))
        r1 <- quo_name(enquo(.rep))
        g1 <- quo_name(enquo(.gen))
        e1 <- quo_name(enquo(.env))
    
        fm <- formula(paste0(y1, "~", paste(e1, paste(r1, e1, sep=":"), 
                      g1, paste(g1, e1, sep=":"), sep="+")))
    
        fm1 <- lm(terms(fm, keep.order = TRUE), data = .data)
        fm1ANOVA <- anova(fm1)
        rownames(fm1ANOVA) <- c("Env", "Rep(Env)", "Gen", "Gen:Env", 
            "Residuals")
        fm1ANOVA[1, 4] <- fm1ANOVA[1, 3]/fm1ANOVA[2, 3]
        fm1ANOVA[2, 4] <- NA
        fm1ANOVA[1, 5] <- 1 - pf(as.numeric(fm1ANOVA[1, 4]), fm1ANOVA[1, 
            1], fm1ANOVA[2, 1])
        fm1ANOVA[2, 5] <- 1 - pf(as.numeric(fm1ANOVA[2, 4]), fm1ANOVA[2, 
            1], fm1ANOVA[5, 1])
        class(fm1ANOVA) <- c("anova", "data.frame")
        return(list(anova = fm1ANOVA))
    
     }
    
    YieldANOVA2 <- add_anova1(
          .data = df1
        , .y    = Y
        , .rep  = R
        , .gen  = G
        , .env  = E
      )
    

    -checking with the output generated using 'ge_data' without changing the column names

    all.equal(YieldANOVA, YieldANOVA2, check.attributes = FALSE)
    #[1] TRUE
    

    Similarly stab_reg could be changed