Say I have a function my.function
taking x
, a numerical vector, as its only argument. I need to decompose or parse the string from match.call()$x
in such a way that I can identify:
In other words, I need to deduct the hierarchy of the data from the function call. For example, say the function call is
> my.function(iris$Species)
strsplit
or regular expressions will tell us that Species is an atomic vector, contained in a dataframe called iris. (is.vector
, is.data.frame
and others could be used for validating this). The thing gets more complicated though as structures are part of larger structures, and as the syntaxically diverse ways to extract data from structures grow.
To illustrate, imagine that instead of iris$Species, the user uses this (after putting iris
in a list, for whichever reason:
> my.function(my.list["iris"][,5])
> my.function(my.list[[2]]$iris[,"Species"]
In order to achieve what I want, I would need to come up with a certain number of regular expressions. Now my question is: before working those regexps, am I overlooking some existing function or an alternate way to deduct hierarchy from the function call?
After experimenting for a bit with MrFlick's solution, I found I got better results using regex. It doesn't account for all situations, but it gives me more than enough flexibility. Thought I'd share it here, might be useful to others. No garanties of course.
# This function takes a string referring to existing data and parses it
# to get information on the data structure.
#
# Example:
#
# > .parse.arg("iris[1:200,c(1,4)]")
# $arg.str
# [1] "iris[1:200,c(1,4)]"
#
# $row.index
# [1] "1:200"
#
# $col.index
# [1] "c(1,4)"
#
# $df.name
# [1] "iris"
#
# $col.names
# [1] "Sepal.Length" "Petal.Width"
.parse.arg <- function(arg.str) {
# Check if arg.str is a string
if(!is.character(arg.str))
stop("arg.str must be a string")
# Recuperate the object designated by arg.str; this is to allow further work
x <- try(eval(parse(text=arg.str)))
if(inherits(x, "try-error")) {
message("arg.str must match an existing object")
return()
}
if(!is.data.frame(x) && !is.atomic(x)) {
message("arg.str must match an atomic structure (vector/factor) or a dataframe")
return()
}
# Initialise output list
output <- list()
# Store a copy of the arg.str in output object
output$arg.str <- arg.str
# Trim the string removing leading/trailing blanks
arg.str <- gsub("^\\s+|\\s+$", "", arg.str)
# Get rid of spaces next to brackets and next to comma in indexing brackets.
# Note: that way assures us to not remove any spaces in quoted structures
# such as ['var name']
arg.str <- gsub("\\s*\\[\\s*","[", arg.str, perl=TRUE) # spaces near [
arg.str <- gsub("\\s*\\]\\s*","]", arg.str, perl=TRUE) # spaces near ]
arg.str <- gsub("^(.*)(\\[\\d+:\\d+)?\\s?,\\s?(.+)$", "\\1\\2,\\3", arg.str, perl=TRUE)
# Change [[]] to [] for the last pair of brackets; this simplifies the work
arg.str <- sub("\\[{2}(.*)\\]{2}$", "[\\1]", arg.str, perl=TRUE)
# Change references to data with ['name'] or [['name']] into $name, also to simplify matters
re.brack <- '\\[{1,2}[\'\"]'
if(grepl(re.brack, arg.str)) {
arg.str <- gsub('\\[{1,2}[\'\"]', "$", arg.str, perl=TRUE)
arg.str <- gsub('[\'\"]\\]{1,2}', "", arg.str, perl=TRUE)
}
# Next we'll isolate indexing in the last brackets
re.index <- "(.*?)\\[(.*?)\\]$"
if(grepl(re.index, arg.str)) {
indexes <- sub(re.index, "\\2", arg.str, perl=TRUE)
# Further decompose the indexes
# indexing having 2 elements (rows, columns), will be identified by this regex
# [1:10,] or [,"Species] will also match
re.split.index <- "^(.+)?,+(c\\(.*\\)|\\d+|\\d+:\\d+|'.*'|\".+\")$"
if(grepl(re.split.index, indexes, perl = TRUE)) {
output$rows.subset <- sub(re.split.index, "\\1", indexes, perl=TRUE)
output$col.index <- sub(re.split.index, "\\2", indexes, perl=TRUE)
# Remove any empty string
if(nchar(output$rows.subset) == 0)
output$rows.subset <- NULL
if(nchar(output$col.index) == 0)
output$col.index <- NULL
}
# When previous regex does not match, it means the index has only 1 element,
# either row or column. When a comma is present:
else if(substring(indexes,1,1) == ",")
output$col.indexes <- sub("^,", "", indexes, perl = TRUE)
else if(substring(indexes,nchar(indexes),nchar(indexes)) == ",")
output$rows.subset <- sub(",$", "", indexes, perl = TRUE)
# When there is no comma, we'll check if x is a dataframe or not.
# If it is, the index refers to columns, and otherwise, to rows
else {
# first we need to reevaluate the arg.str
x.tmp <- eval(parse(text = arg.str))
if(is.data.frame(x.tmp))
output$col.index <- indexes
else
output$rows.subset <- indexes
}
# Update the string to remove what's already accounted for
arg.str <- sub(re.index, "\\1", arg.str, perl=TRUE)
}
# Split arg.str by "$" to identify structures
output$data.struct <- strsplit(arg.str, "$", fixed = TRUE)[[1]]
# If type of x is dataframe, normally the last element in the data structures
# should be the df name
if(is.data.frame(x)) {
output$df.name <- tail(output$data.struct,1)
output$col.names <- colnames(x)
}
# Otherwise, depending on the situation, we'll try to get at the df name and its colnames()
else {
# If vector is referred to via column indexing, recup the column's name
# by an evaluation of the form df[col.index]
if("col.index" %in% names(output)) {
output$var.name <- eval(parse(text=paste("colnames(",arg.str,"[",output$col.index,"])")))
#output$col.names <- eval(parse(text=paste("colnames(",arg.str,"[",output$col.index,"])")))
output$df.name <- tail(output$data.struct,1)
}
# If there is no column indexing, it means the vector's name is in the
# data.struc list, along with the df name one level higher, unless the vector
# was "standalone"
else {
output$var.name <- tail(output$data.struct,1)
if(length(output$data.struct)>1)
output$df.name <- output$data.struct[length(output$data.struct)-1]
}
}
# remove last item from data.struct when it's the same as var.name to avoid redundancy
output$data.struct <- setdiff(output$data.struct, output$var.name)
# same with df.name and data.struct
output$data.struct <- setdiff(output$data.struct, output$df.name)
# cleanup
if(length(output$data.struct)==0)
output$data.struct <- NULL
# Further validate the items to return;
if(isTRUE(grepl('[\\(\\[]', output$df.name)))
output$df.name <- NULL
if(isTRUE(grepl('[\\(\\[]', output$var.name)))
output$var.name <- NULL
return(output)
}