Search code examples
rclassificationdo.call

R. do.call function returns to much


Having the following

> library(gbm)
> mdl<-gbm::gbm(data=iris,formula=Species ~ .,distribution="gaussian")

Then:

> mdl
gbm::gbm(formula = Species ~ ., distribution = "gaussian", data = iris)
A gradient boosted model with gaussian loss function.
100 iterations were performed.
There were 4 predictors of which 4 had non-zero influence.

What is what I want.

However, If I use do.call:

> mdl<-do.call(gbm::gbm,list(data=iris,formula=Species ~ .,distribution="gaussian"))

Then

> mdl
(function (formula = formula(data), distribution = "bernoulli", 
    data = list(), weights, var.monotone = NULL, n.trees = 100, 
    interaction.depth = 1, n.minobsinnode = 10, shrinkage = 0.1, 
    bag.fraction = 0.5, train.fraction = 1, cv.folds = 0, keep.data = TRUE,
...
...
...
A gradient boosted model with gaussian loss function.
100 iterations were performed.
There were 4 predictors of which 4 had non-zero influence.

It prints the definition of gbm function at the beginning, then the gbm call (including the whole dataset iris) and finally the text that I am looking for.

I need to use do.call because I want to parametrize all the arguments for any classification algorithm and then put the output inside a verbatimTextOutput in shiny.

Is there any way to prevent do.call to return the definition of gbm as well as the entire dataset?. Or maybe other way to execute gbm passing arguments inside a list?

Thanks.


Solution

  • We have to do two things to prevent this behavior:

    First, we can use substitute() on iris. This will prevent the call from listing all columns of the iris data set.

    Second, we should avoid using gbm::gbm in do.call and rather load the library and use the string "gbm" instead to call the function:

    library(gbm)
    
    mdl <- do.call("gbm",
                   list(data = substitute(iris),
                        formula= Species ~ .,
                        distribution = "gaussian")
                   )
    mdl
    
    #> gbm(formula = Species ~ ., distribution = "gaussian", data = iris)
    #> A gradient boosted model with gaussian loss function.
    #> 100 iterations were performed.
    #> There were 4 predictors of which 4 had non-zero influence.
    

    If we use gbm::gbm inside do call it will include the whole function definition in the captured call:

    mdl <- do.call(gbm::gbm,
                   list(data = substitute(iris),
                        formula= Species ~ .,
                        distribution = "gaussian")
                   )
    mdl
    
    #> (function (formula = formula(data), distribution = "bernoulli", 
    #>     data = list(), weights, var.monotone = NULL, n.trees = 100, 
    #>     interaction.depth = 1, n.minobsinnode = 10, shrinkage = 0.1, 
    #>     bag.fraction = 0.5, train.fraction = 1, cv.folds = 0, keep.data = TRUE, 
    #>     verbose = FALSE, class.stratify.cv = NULL, n.cores = NULL) 
    #> {
    #>     mcall <- match.call()
    #>     lVerbose <- if (!is.logical(verbose)) {
    #>         FALSE
    #>     }
    #>     else {
    #>         verbose
    #>     }
    #>     mf <- match.call(expand.dots = FALSE)
    #>     m <- match(c("formula", "data", "weights", "offset"), names(mf), 
    #>         0)
    #>     mf <- mf[c(1, m)]
    #>     mf$drop.unused.levels <- TRUE
    #>     mf$na.action <- na.pass
    #>     mf[[1]] <- as.name("model.frame")
    #>     m <- mf
    #>     mf <- eval(mf, parent.frame())
    #>     Terms <- attr(mf, "terms")
    #>     w <- model.weights(mf)
    #>     offset <- model.offset(mf)
    #>     y <- model.response(mf)
    #>     if (missing(distribution)) {
    #>         distribution <- guessDist(y)
    #>     }
    #>     if (is.character(distribution)) {
    #>         distribution <- list(name = distribution)
    #>     }
    #>     if (!is.element(distribution$name, getAvailableDistributions())) {
    #>         stop("Distribution ", distribution$name, " is not supported.")
    #>     }
    #>     if (distribution$name == "multinomial") {
    #>         warning("Setting `distribution = \"multinomial\"` is ill-advised as it is ", 
    #>             "currently broken. It exists only for backwards compatibility. ", 
    #>             "Use at your own risk.", call. = FALSE)
    #>     }
    #>     var.names <- attributes(Terms)$term.labels
    #>     x <- model.frame(terms(reformulate(var.names)), data = data, 
    #>         na.action = na.pass)
    #>     response.name <- as.character(formula[[2L]])
    #>     class.stratify.cv <- getStratify(class.stratify.cv, d = distribution)
    #>     group <- NULL
    #>     num.groups <- 0
    #>     if (distribution$name != "pairwise") {
    #>         nTrain <- floor(train.fraction * nrow(x))
    #>     }
    #>     else {
    #>         distribution.group <- distribution[["group"]]
    #>         if (is.null(distribution.group)) {
    #>             stop(paste("For pairwise regression, `distribution` must be a list of", 
    #>                 "the form `list(name = \"pairwise\", group = c(\"date\",", 
    #>                 "\"session\", \"category\", \"keywords\"))`."))
    #>         }
    #>         i <- match(distribution.group, colnames(data))
    #>         if (any(is.na(i))) {
    #>             stop("Group column does not occur in data: ", distribution.group[is.na(i)], 
    #>                 ".")
    #>         }
    #>         group <- factor(do.call(paste, c(data[, distribution.group, 
    #>             drop = FALSE], sep = ":")))
    #>         if ((!missing(weights)) && (!is.null(weights))) {
    #>             w.min <- tapply(w, INDEX = group, FUN = min)
    #>             w.max <- tapply(w, INDEX = group, FUN = max)
    #>             if (any(w.min != w.max)) {
    #>                 stop("For `distribution = \"pairwise\"`, all instances for the same ", 
    #>                   "group must have the same weight.")
    #>             }
    #>             w <- w * length(w.min)/sum(w.min)
    #>         }
    #>         perm.levels <- levels(group)[sample(1:nlevels(group))]
    #>         group <- factor(group, levels = perm.levels)
    #>         ord.group <- order(group, -y)
    #>         group <- group[ord.group]
    #>         y <- y[ord.group]
    #>         x <- x[ord.group, , drop = FALSE]
    #>         w <- w[ord.group]
    #>         num.groups.train <- max(1, round(train.fraction * nlevels(group)))
    #>         nTrain <- max(which(group == levels(group)[num.groups.train]))
    #>         Misc <- group
    #>     }
    #>     cv.error <- NULL
    #>     if (cv.folds == 1) {
    #>         cv.folds <- 0
    #>     }
    #>     if (cv.folds > 1) {
    #>         cv.results <- gbmCrossVal(cv.folds = cv.folds, nTrain = nTrain, 
    #>             n.cores = n.cores, class.stratify.cv = class.stratify.cv, 
    #>             data = data, x = x, y = y, offset = offset, distribution = distribution, 
    #>             w = w, var.monotone = var.monotone, n.trees = n.trees, 
    #>             interaction.depth = interaction.depth, n.minobsinnode = n.minobsinnode, 
    #>             shrinkage = shrinkage, bag.fraction = bag.fraction, 
    #>             var.names = var.names, response.name = response.name, 
    #>             group = group)
    #>         cv.error <- cv.results$error
    #>         p <- cv.results$predictions
    #>     }
    #>     gbm.obj <- gbm.fit(x = x, y = y, offset = offset, distribution = distribution, 
    #>         w = w, var.monotone = var.monotone, n.trees = n.trees, 
    #>         interaction.depth = interaction.depth, n.minobsinnode = n.minobsinnode, 
    #>         shrinkage = shrinkage, bag.fraction = bag.fraction, nTrain = nTrain, 
    #>         keep.data = keep.data, verbose = lVerbose, var.names = var.names, 
    #>         response.name = response.name, group = group)
    #>     gbm.obj$train.fraction <- train.fraction
    #>     gbm.obj$Terms <- Terms
    #>     gbm.obj$cv.error <- cv.error
    #>     gbm.obj$cv.folds <- cv.folds
    #>     gbm.obj$call <- mcall
    #>     gbm.obj$m <- m
    #>     if (cv.folds > 1) {
    #>         gbm.obj$cv.fitted <- p
    #>     }
    #>     if (distribution$name == "pairwise") {
    #>         gbm.obj$ord.group <- ord.group
    #>         gbm.obj$fit <- gbm.obj$fit[order(ord.group)]
    #>     }
    #>     gbm.obj
    #> })(formula = Species ~ ., distribution = "gaussian", data = iris)
    #> A gradient boosted model with gaussian loss function.
    #> 100 iterations were performed.
    #> There were 4 predictors of which 4 had non-zero influence.
    

    Created on 2023-02-20 by the reprex package (v2.0.1)