Search code examples
revaluationscoping

R: Scoping rules are getting me into murky waters


Any help with this problem I'm having would be greatly appreciated. I am a moderately advanced R programmer, but so far all my solutions have failed me. I start with the logic behind what I am trying to do, followed by my attempt, followed by the test cases. I tried to be as explicit as possible.

I should probably mention that I sort of know what the problem is, but I don't know what the solution is.

# sqldf has some limitations:

cpaste <- function(x) paste(x, collapse = ", ")

dd <- data.frame(a = 1:10)
b <- 5:8

# this is what I want to get
sqldf("select * from dd where a in (5, 6, 7, 8)")

# but I want to get it by typing this
sqldf(sprintf("select * from %s where a in (%s)", dd, b)) # error

# and it doesn't work, because this is what sprintf expects:
sqldf(sprintf("select * from %s where a in (%s)", "dd", paste(b, collapse = ", ")))

# in other words, 
# (1) the name of data frame, not the data frame itself, and
# (2) the vector must be turned into a single string with comma separated values

# I wrote a wrapper function for sqldf
# it uses sprintf to create the sql string that I need to feed to sqldf
# but before doing that it does (1) and (2) as mentioned above
# so I can do this and it would work:
run_sql("select * from %s where a in (%s)", dd, b)

# it works until I try running it inside another funciton
# where I start running into some problems

# here's the function, followed by test cases

run_sql <- function(zcode = NULL, ..., display = TRUE, eval = TRUE) {

  require(sqldf)

  ellipsis <- as.list(match.call(expand.dots = TRUE))
  ellipsis[1] <- NULL
  ellipsis$inline <- NULL
  ellipsis$display <- NULL
  ellipsis$eval <- NULL
  # print(ellipsis)
  # print(lapply(ellipsis, class))

  ffn <- function(x) {
    if (is.name(x)) { # the argument passed is itself a variable
      if (is.data.frame(eval(x))) {
        as.character(x) # returns just the name of the data frame
      } else if (is.atomic(eval(x))) {
        cpaste(eval(x)) # return the atomic vector as comma-sep string
      } else "_____FAIL1_____"
    } else if (is.call(x)) { # the argument passed is a function call, eg 2:4
      if (is.atomic(eval(x))) cpaste(eval(x)) else "_____FAIL2_____"
    } else {
      if (is.atomic(x)) cpaste(x) else "_____FAIL3_____"
    }
  }
  ellipsis <- lapply(ellipsis, ffn)

  zcode <- do.call(sprintf, unname(ellipsis))
  if (display == TRUE) cat(paste0(zcode, "\n\n"))

  if (eval == TRUE) {
    sqldf(zcode)
  } else {
    zcode
  }

}

dd <- data.frame(a = 1:10)
b <- 5:8
run_sql("select * from %s where a > %s", dd, 5)
run_sql("select * from %s where a in (%s)", dd, b)

# it works when the function uses variables in .GlobalEnv
# but this is not the preferred way:
foo <- function() {
  run_sql("select * from %s where a in (%s)", dd, b)
}
foo()

# here's the preferred way
# but things stop working:
foo <- function(x, y) {
  run_sql("select * from %s where a in (%s)", x, y)
}
foo(dd, b) 

# here's one solution to the above, but I am hoping there's a better way
foo <- function(x, y) {
  do.call(run_sql, list("select * from %s where a in (%s)", 
    substitute(x), 
    substitute(y)))
}
foo(dd, b) 

# also, the above solution does not work with local variables
foo <- function() {
  bb <- dd
  do.call(run_sql, list("select * from %s where a in (%s)", 
    bb, 
    substitute(y)))
}
foo()

Solution

  • @G. Grothendieck's solution will probably be easier for many readers of this post. That said, I think you can fix your function by identifying the parent environment of the call to run_sql, then using envir= to specify that environment anytime you call a function that depends on environments -- specifically, eval() and sqldf(). Like so:

    cpaste <- function(x) paste(x, collapse = ", ")
    run_sql <- function(zcode = NULL, ..., display = TRUE, eval = TRUE, envir=parent.frame()) {
    
      require(sqldf)
    
      ellipsis <- as.list(match.call(expand.dots = TRUE))
      ellipsis[1] <- NULL
      ellipsis$inline <- NULL
      ellipsis$display <- NULL
      ellipsis$eval <- NULL
      # print(ellipsis)
      # print(lapply(ellipsis, class))
    
      ffn <- function(x) {
        if (is.name(x)) { # the argument passed is itself a variable
          if (is.data.frame(eval(x, envir=envir))) {
            as.character(x) # returns just the name of the data frame
          } else if (is.atomic(eval(x, envir=envir))) {
            cpaste(eval(x, envir=envir)) # return the atomic vector as comma-sep string
          } else "_____FAIL1_____"
        } else if (is.call(x)) { # the argument passed is a function call, eg 2:4
          if (is.atomic(eval(x, envir=envir))) cpaste(eval(x, envir=envir)) else "_____FAIL2_____"
        } else {
          if (is.atomic(x)) cpaste(x) else "_____FAIL3_____"
        }
      }
      ellipsis <- lapply(ellipsis, ffn)
    
      zcode <- do.call(sprintf, unname(ellipsis))
      if (display == TRUE) cat(paste0(zcode, "\n\n"))
    
      if (eval == TRUE) {
        sqldf(zcode, envir=envir)
      } else {
        zcode
      }
    
    }
    

    This works in your test case using x and y:

    foo <- function(x, y) {
      run_sql("select * from %s where a in (%s)", x, y)
    }
    foo(dd, b) 
    

    And, with some tweaking, in a test case using do.call and local variables:

    foo <- function(y) {
      bb <- dd
      do.call(run_sql, list("select * from %s where a in (%s)", 
                            as.name("bb"), 
                            substitute(y),
                            envir=environment()))
    }
    foo(b)
    

    To understand the problem with your original function, and to understand what environment was visible to eval() each time it was called, I wrote a function called enveval to wrap several sys.xxx functions and the eval() call. Then, back in the run_sql function, I replaced all calls to eval with calls to enveval.

    # enveval: Replace an eval() call with enveval() to see a description of the stack of environments experienced by eval()
    enveval <- function(x, envir=parent.frame()) {
      cat(paste0("EVALUATING ",as.character(x),":\n"))
      stack <- data.frame(frame_num=1:sys.nframe(), call=strtrim(as.character(sys.calls()),15), 
                          is_eval_envir=NA, vars_in_frame=NA, x_exists=NA, eval_x=NA)
      for(i in 1:nrow(stack)) {
        f <- which(i==stack$frame_num)
        stack[f,"is_eval_envir"] <- identical(envir,sys.frame(f))
        stack[f,"vars_in_frame"] <- paste(ls(envir=sys.frame(f)),collapse=",")
        stack[f,"x_exists"] <- exists(as.character(x), where=sys.frame(f))
        if(stack[f,"is_eval_envir"] & stack[f,"x_exists"]) {
          # if all the variables to evaluate are single-element atomic, you can also run the following line:
          if(is.atomic(eval(x, envir=sys.frame(f)))) {
            stack[f,"eval_x"] <- cpaste(eval(x, envir=sys.frame(f)))
          } else {
            stack[f,"eval_x"] <- "[non-atomic]"
          }
        }
      }
      print(stack)
      eval(x, envir=envir)
    }
    
    # The new run_sql where eval is replaced with enveval:
    run_sql <- function(zcode = NULL, ..., display = TRUE, eval = TRUE, envir=parent.frame()) {
    
      require(sqldf)
    
      ellipsis <- as.list(match.call(expand.dots = TRUE))
      ellipsis[1] <- NULL
      ellipsis$inline <- NULL
      ellipsis$display <- NULL
      ellipsis$eval <- NULL
      # print(ellipsis)
      # print(lapply(ellipsis, class))
    
      ffn <- function(x) {
        if (is.name(x)) { # the argument passed is itself a variable
          if (is.data.frame(enveval(x, envir=envir))) {
            as.character(x) # returns just the name of the data frame
          } else if (is.atomic(enveval(x, envir=envir))) {
            cpaste(enveval(x, envir=envir)) # return the atomic vector as comma-sep string
          } else "_____FAIL1_____"
        } else if (is.call(x)) { # the argument passed is a function call, eg 2:4
          if (is.atomic(enveval(x, envir=envir))) cpaste(enveval(x, envir=envir)) else "_____FAIL2_____"
        } else {
          if (is.atomic(x)) cpaste(x) else "_____FAIL3_____"
        }
      }
      ellipsis <- lapply(ellipsis, ffn)
    
      zcode <- do.call(sprintf, unname(ellipsis))
      if (display == TRUE) cat(paste0(zcode, "\n\n"))
    
      if (eval == TRUE) {
        sqldf(zcode, envir=envir)
      } else {
        zcode
      }
    
    }
    

    Playing around with the test cases shows you what enveval sees (and what eval would have seen) each time it's called. For example, running the first test function:

    foo <- function(x, y) {
      run_sql("select * from %s where a in (%s)", x, y)
    }
    foo(dd, b) 
    

    gave the following print-out showing that the frame for the foo(dd, b) call was the useful environment for every call to eval():

    EVALUATING x:
      frame_num            call is_eval_envir                         vars_in_frame x_exists       eval_x
    1         1      foo(dd, b)          TRUE                                   x,y     TRUE [non-atomic]
    2         2 run_sql("select         FALSE display,ellipsis,envir,eval,ffn,zcode    FALSE         <NA>
    3         3 lapply(ellipsis         FALSE                                 FUN,X    FALSE         <NA>
    4         4 FUN(X[[2]], ...         FALSE                                     x     TRUE         <NA>
    5         5 is.data.frame(e         FALSE                                     x     TRUE         <NA>
    6         6 enveval(x, envi         FALSE                     envir,f,i,stack,x     TRUE         <NA>
    EVALUATING y:
      frame_num            call is_eval_envir                         vars_in_frame x_exists     eval_x
    1         1      foo(dd, b)          TRUE                                   x,y     TRUE 5, 6, 7, 8
    2         2 run_sql("select         FALSE display,ellipsis,envir,eval,ffn,zcode    FALSE       <NA>
    3         3 lapply(ellipsis         FALSE                                 FUN,X    FALSE       <NA>
    4         4 FUN(X[[3]], ...         FALSE                                     x    FALSE       <NA>
    5         5 is.data.frame(e         FALSE                                     x    FALSE       <NA>
    6         6 enveval(x, envi         FALSE                     envir,f,i,stack,x    FALSE       <NA>
    EVALUATING y:
      frame_num            call is_eval_envir                         vars_in_frame x_exists     eval_x
    1         1      foo(dd, b)          TRUE                                   x,y     TRUE 5, 6, 7, 8
    2         2 run_sql("select         FALSE display,ellipsis,envir,eval,ffn,zcode    FALSE       <NA>
    3         3 lapply(ellipsis         FALSE                                 FUN,X    FALSE       <NA>
    4         4 FUN(X[[3]], ...         FALSE                                     x    FALSE       <NA>
    5         5 enveval(x, envi         FALSE                     envir,f,i,stack,x    FALSE       <NA>
    EVALUATING y:
      frame_num            call is_eval_envir                         vars_in_frame x_exists     eval_x
    1         1      foo(dd, b)          TRUE                                   x,y     TRUE 5, 6, 7, 8
    2         2 run_sql("select         FALSE display,ellipsis,envir,eval,ffn,zcode    FALSE       <NA>
    3         3 lapply(ellipsis         FALSE                                 FUN,X    FALSE       <NA>
    4         4 FUN(X[[3]], ...         FALSE                                     x    FALSE       <NA>
    5         5 cpaste(enveval(         FALSE                                     x    FALSE       <NA>
    6         6 paste(x, collap         FALSE                          collapse,sep    FALSE       <NA>
    7         7 enveval(x, envi         FALSE                     envir,f,i,stack,x    FALSE       <NA>
    
    select * from x where a in (5, 6, 7, 8)
    
    a
    1 5
    2 6
    3 7
    4 8