Search code examples
regex, r, function-calls

R: Identifying vector & dataframe names from function calls


Say I have a function my.function taking x, a numerical vector, as its only argument. I need to decompose or parse the string from match.call()$x in such a way that I can identify:

  • the vector's name and label if any
  • the structure it's in, if any (dataframe, list, and so on.)
  • the structure this latest structure is in... and so on.

In other words, I need to deduce the hierarchy of the data from the function call. For example, say the function call is

> my.function(iris$Species)

strsplit or regular expressions will tell us that Species is an atomic vector, contained in a dataframe called iris. (is.vector, is.data.frame and others could be used for validating this). Things get more complicated, though, as structures become part of larger structures, and as the number of syntactically diverse ways to extract data from structures grows.

To illustrate, imagine that instead of iris$Species, the user uses one of these (after putting iris in a list, for whichever reason):

> my.function(my.list["iris"][,5])
> my.function(my.list[[2]]$iris[,"Species"])

In order to achieve what I want, I would need to come up with a certain number of regular expressions. Now my question is: before working out those regexps, am I overlooking some existing function or an alternate way to deduce the hierarchy from the function call?


Solution

  • After experimenting for a bit with MrFlick's solution, I found I got better results using regex. It doesn't account for all situations, but it gives me more than enough flexibility. Thought I'd share it here; it might be useful to others. No guarantees, of course.

    # This function takes a string referring to existing data and parses it
    # to get information on the data structure.
    #
    # Argument:
    #   arg.str  a single character string holding an R expression that refers
    #            to an existing atomic vector/factor or data frame, e.g.
    #            "iris$Species" or "my.list$iris[,5]". The expression is
    #            evaluated in the environment .parse.arg() is called from.
    #            NOTE: the string is passed to eval(parse()), so it must never
    #            come from an untrusted source.
    #
    # Value:
    #   A list containing whichever of the following elements could be found:
    #     arg.str      the string as it was supplied
    #     rows.subset  text of the row (or element) index in the last brackets
    #     col.index    text of the column index in the last brackets
    #     data.struct  names of the enclosing structures (outermost first),
    #                  without the variable and data frame names
    #     df.name      name of the data frame
    #     col.names    column names of the data (data frame input only)
    #     var.name     name of the vector (atomic input only)
    #   NULL is returned (after a message) when the string cannot be evaluated
    #   or does not designate an atomic structure or a data frame.
    #
    # Example:
    #
    # > .parse.arg("iris[1:200,c(1,4)]")
    # $arg.str
    # [1] "iris[1:200,c(1,4)]"
    #
    # $rows.subset
    # [1] "1:200"
    #
    # $col.index
    # [1] "c(1,4)"
    #
    # $df.name
    # [1] "iris"
    #
    # $col.names
    # [1] "Sepal.Length" "Petal.Width"

    .parse.arg <- function(arg.str) {

      # Check that arg.str is a single string
      if(!is.character(arg.str) || length(arg.str) != 1)
        stop("arg.str must be a string")

      # All evaluations are done in the caller's environment. Evaluating in this
      # function's own frame would let local variables (x, output, indexes, ...)
      # shadow user objects bearing the same name.
      caller.env <- parent.frame()

      # Recuperate the object designated by arg.str; this is to allow further work
      x <- try(eval(parse(text=arg.str), envir = caller.env))
      if(inherits(x, "try-error")) {
        message("arg.str must match an existing object")
        return()
      }

      if(!is.data.frame(x) && !is.atomic(x)) {
        message("arg.str must match an atomic structure (vector/factor) or a dataframe")
        return()
      }

      # Initialise output list
      output <- list()

      # Store a copy of the arg.str in output object
      output$arg.str <- arg.str

      # Trim the string removing leading/trailing blanks
      arg.str <- gsub("^\\s+|\\s+$", "", arg.str)

      # Get rid of spaces next to brackets and next to comma in indexing brackets.
      # Note: that way assures us to not remove any spaces in quoted structures
      # such as ['var name']
      arg.str <- gsub("\\s*\\[\\s*","[", arg.str, perl=TRUE) # spaces near [
      arg.str <- gsub("\\s*\\]\\s*","]", arg.str, perl=TRUE) # spaces near ]
      arg.str <- gsub("^(.*)(\\[\\d+:\\d+)?\\s?,\\s?(.+)$", "\\1\\2,\\3", arg.str, perl=TRUE)

      # Change [[]] to [] for the last pair of brackets; this simplifies the work
      arg.str <- sub("\\[{2}(.*)\\]{2}$", "[\\1]", arg.str, perl=TRUE)

      # Change references to data with ['name'] or [['name']] into $name, also to simplify matters
      re.brack <- '\\[{1,2}[\'\"]'
      if(grepl(re.brack, arg.str)) {
        arg.str <- gsub(re.brack, "$", arg.str, perl=TRUE)
        arg.str <- gsub('[\'\"]\\]{1,2}', "", arg.str, perl=TRUE)
      }

      # Next we'll isolate indexing in the last brackets
      re.index <- "(.*?)\\[(.*?)\\]$"

      if(grepl(re.index, arg.str)) {
        indexes <- sub(re.index, "\\2", arg.str, perl=TRUE)

        # Further decompose the indexes
        # indexing having 2 elements (rows, columns), will be identified by this regex
        # [1:10,] or [,"Species"] will also match
        re.split.index <- "^(.+)?,+(c\\(.*\\)|\\d+|\\d+:\\d+|'.*'|\".+\")$"
        if(grepl(re.split.index, indexes, perl = TRUE)) {
          output$rows.subset <- sub(re.split.index, "\\1", indexes, perl=TRUE)
          output$col.index <- sub(re.split.index, "\\2", indexes, perl=TRUE)

          # Remove any empty string
          if(nchar(output$rows.subset) == 0)
            output$rows.subset <- NULL
          if(nchar(output$col.index) == 0)
            output$col.index <- NULL
        }

        # When previous regex does not match but a comma is present, only one of
        # the two positions (rows / columns) is filled in.
        else if(substring(indexes,1,1) == ",") {
          # Leading comma: column index only. It must be stored under the same
          # name ("col.index") as in the other branches, since the column-name
          # lookup further down tests for that exact name.
          output$col.index <- sub("^,", "", indexes, perl = TRUE)
        }

        else if(substring(indexes,nchar(indexes),nchar(indexes)) == ",") {
          # Trailing comma: row index only
          output$rows.subset <- sub(",$", "", indexes, perl = TRUE)
        }

        # When there is no comma, we'll check whether the indexed expression
        # yields a dataframe or not. If it does, the index refers to columns
        # (df[i] keeps the dataframe class), and otherwise, to rows/elements
        else {
          # first we need to reevaluate the (normalised) arg.str
          x.tmp <- eval(parse(text = arg.str), envir = caller.env)
          if(is.data.frame(x.tmp))
            output$col.index <- indexes
          else
            output$rows.subset <- indexes
        }

        # Update the string to remove what's already accounted for
        arg.str <- sub(re.index, "\\1", arg.str, perl=TRUE)
      }

      # Split arg.str by "$" to identify structures
      output$data.struct <- strsplit(arg.str, "$", fixed = TRUE)[[1]]

      # If type of x is dataframe, normally the last element in the data structures
      # should be the df name
      if(is.data.frame(x)) {
        output$df.name <- tail(output$data.struct,1)
        output$col.names <- colnames(x)
      }

      # Otherwise, depending on the situation, we'll try to get at the df name and its colnames()
      else {

        # If vector is referred to via column indexing, recup the column's name
        # by an evaluation of the form colnames(df[col.index])
        if("col.index" %in% names(output)) {
          col.expr <- paste("colnames(",arg.str,"[",output[["col.index"]],"])")
          output$var.name <- eval(parse(text=col.expr), envir = caller.env)
          output$df.name <- tail(output$data.struct,1)
        }

        # If there is no column indexing, it means the vector's name is in the
        # data.struct list, along with the df name one level higher, unless the vector
        # was "standalone"
        else {
          output$var.name <- tail(output$data.struct,1)
          if(length(output$data.struct)>1)
            output$df.name <- output$data.struct[length(output$data.struct)-1]
        }
      }

      # remove last item from data.struct when it's the same as var.name to avoid redundancy
      output$data.struct <- setdiff(output$data.struct, output$var.name)

      # same with df.name and data.struct
      output$data.struct <- setdiff(output$data.struct, output$df.name)

      # cleanup
      if(length(output$data.struct)==0)
        output$data.struct <- NULL

      # Further validate the items to return; a name still containing "(" or "["
      # is an unparsed expression rather than a real object name
      if(isTRUE(grepl('[\\(\\[]', output$df.name)))
        output$df.name <- NULL

      if(isTRUE(grepl('[\\(\\[]', output$var.name)))
        output$var.name <- NULL

      return(output)
    }