Search code examples
rdataframeapply

Quickly binning values within R


I have a massive data frame in R that looks something like this:

Dataframe 1

ID start stop
+ 3 5
- 9 13

I also have another dataframe that looks something like this:

Dataframe 2

ID name position
+ hello 4
+ there 7
- my 11
+ name 12

What I want to do is for every row in dataframe 1, I want to determine it's "distance" within a given range or print NA if it doesn't fall into a range or the "ID" (+/-) doesn't match. So the output would be something like:

ID name position relative_position
- hello 4 2
+ there 7 NA
- my 12 3
+ name 12 NA

I currently have a relatively long loop that uses apply() and a few layers of if functions to run through this process. The problem I'm having is that I need to do this for a multiple REALLY big dataframes (gigabytes in size), so efficiency is key. What's the most efficient way of going through this process?

I've also played around with the cut() function but I can't figure out how to print the results I want or how to make intervals discontinuous. Even if it works I don't know if it would be more efficient.


Solution

  • Update

    I have now the full code. It works even if there is overlap of intervals. In this case, the algorithm finds the closest interval by default. If the option closestOnly = FALSE, then it looks farther away if id does not match.

    library(data.table)
    library(microbenchmark)
    
    findDistance <- function(intervals, values, closestOnly = TRUE, verbose = FALSE)
    {
        ## The trick is to sort by position everything,
        ## `type` is 0 if it is an interval start position,
        ##           1 = interval end position, and
        ##           2 = is a position in `values`
        ## `id` is the index (row number) either in `intervals` or `values`
        n.interval <- nrow(intervals)
        dt <- rbind(data.table(id = 1:n.interval, pos = intervals$start, type = 0),
                    data.table(id = 1:nrow(values), pos = values$pos, type = 2),
                    data.table(id = 1:n.interval, pos = intervals$end, type = 1)
                    )
        ## The sorting algorithm needs to be stable (original order is preserved for ties)
        setorder(dt, pos)
        len <- nrow(dt)
    
        values[, relative_position := as.integer(NA)]
        ## A stack (behaving like a set on pop(end)) that will contain the interval row number
        depth <- 0
        stack <- c()
    
         ## loop over the rows of `dt`
        for (i in 1:len)
        {
            
            if ( 0 == dt$type[i] ) ## start
            {
                depth <- depth + 1
                ## add on stack the row numbre of the interval
                stack[depth] <- dt$id[i]
            }
            else if ( 1 == dt$type[i] && 0 < depth ) ## end
            {
                ## pop from the stack
                depth <- depth - 1
                ## Remove the interval from the stack
                ## setdiff preserve element position in the vector
                ## exactly what we want for a "stack" behaving like a set on removal
                stack <- setdiff(stack, dt$id[i])
            }
            else if ( 2 == dt$type[i]) ## Value
            {
                if (verbose) cat("=====\nThe value:\n")
                value.id <- dt$id[i]
                if (verbose) print(values[value.id,])
                if ( 0 < depth )
                {
                    
                    ## the intervals on the `stack` all contain the current value
                    n.depth <- if (closestOnly) depth else 1
                    for (i.depth in depth:n.depth)
                    {
                        index <- stack[i.depth]
                        if (values[value.id,id] != intervals[index,id])
                        {
                            if (verbose) cat(paste("\nstack depth", depth - i.depth, "is not matching\n"))
                        } else
                        {
                            distance <- values[value.id, pos] - intervals[index, start] + 1
                            if (verbose) cat("\nis inside this interval (most imbricated):\n")
                            if (verbose) print(intervals[index,])
                            if (verbose) print(paste("Distance =", distance))
                            values[value.id, relative_position := distance]
                            break
                        }
                    }
                } else {
                    if (verbose) cat("\nis outside any interval\n")
                }
            } else {
                warning("should not be here")
                break
            }
            ## print(stack)
        }
        
        return(values)
    }
    
    

    Wimpel is really fast but it does not necessary find the closest distance. This is illustrated by this example:

    findDistanceWimpel <- function (intervals, values)
    {
        values[intervals, relative_position := x.pos - i.start + 1, on = .(id, pos >= start, pos <= end)]
    }
    
    ## DF1
    intervals <- data.table(id = c('-', '-'),
                            start = c(2, 1),
                            end = c(5, 16))
    ## DF2
    values <- data.table(id = c('-'),
                         name = c("test"),
                         pos = c(4))
    
    
    res <- copy(findDistance(intervals, values, closestOnly = FALSE))
    resW <- findDistanceWimpel(intervals, values)[]
    merge(res, resW, by= c("id", "name", "pos"))[]
    #>   id name pos relative_position.x relative_position.y
    #>1:  - test   4                   3                   4
    

    If overlapping intervals is fine as for Wimpel method, go for it as performance is a lot better. But if you need better interval overlapping solution, this algorithm is for you.

    For speed comparison,

    ## DF1
    n.intervals <- 1000
    n.range <- n.intervals %/% 4
    start <- sample.int(n.range, n.intervals, replace = TRUE)
    end <- start + sample.int(10, n.intervals, replace = TRUE) - 1
    intervals <- data.table(id = c('+', '-')[sample.int(2, n.intervals, replace = TRUE)],
                            start = start,
                            end = end)
    
    ## DF2
    n.values <- 5000
    values <- data.table(id =  c('+', '-')[sample.int(2, n.values, replace = TRUE)],
                         name = paste0("name", 1:n.values),
                         pos = sample.int(n.range, n.values, replace = TRUE))
    
    
    microbenchmark({ findDistance(intervals, values, closestOnly = FALSE) },
                   { findDistanceWimpel(intervals, values) },
                   times = 3)
    #> Unit: milliseconds
    #>                                                         expr         min           lq        mean      median          uq         max neval cld
    #> {     findDistance(intervals, values, closestOnly = FALSE) }  8844.04990 98920.261426 8960.048019 8996.472944 9018.047074 9039.621204     3  a
    #>                {     findDistanceWimpel(intervals, values) }    5.341248     5.413585    6.388424    5.485922    6.912013    8.338103     3   b
    
    

    A reimplementation of my algorithm in C (what data.table does under the hood), should reach similar performance (or better).

    Old post

    It is mainly a problem of finding if many positions are in intervals. This is an algorithmic issue.

    Here is an efficient algorithm to find for all position the containing intervals. I leave in exercise the computation of distance and the proper formatting of the result ;-) (it should be easy from this canvas).

    library(data.table)
    
    ## DF1
    intervals <- data.table(id = c('+', '-'),
                            start = c(3, 9),
                            end = c(5, 13))
    
    ## DF2
    values <- data.table(id = c('-', '+', '-', '+'),
                         name = c("hello", "there", "my", "name"),
                         pos = c(4, 7, 12, 12))
    
    ## The trick is to sort by position everything,
    ## `type` is 0 if it is an interval start position,
    ##           1 = interval end position, and
    ##           2 = is a position in `values`
    ## `id` is the index (row number) either in `intervals` or `values`
    n.interval <- nrow(intervals)
    dt <- rbind(data.table(id = 1:n.interval, pos = intervals$start, type = 0),
                data.table(id = 1:n.interval, pos = intervals$end, type = 1),
                data.table(id = 1:nrow(values), pos = values$pos, type = 2))
    setorder(dt, pos)
    len <- nrow(dt)
    
    ## A stack that will contain the interval row number
    depth <- 0
    stack <- numeric(16384)
    
    ## loop over the rows of `dt`                     
    for (i in 1:len)
    {
        if ( 0 == dt$type[i] ) ## start
        {
            depth <- depth + 1
            ## add on stack the row numbre of the interval
            stack[depth] <- dt$id[i]
        }
        else if ( 1 == dt$type[i] && 0 < depth ) ## end
        {
            ## pop from the stack
            depth <- depth - 1
        }
        else if ( 2 == dt$type[i]) ## Value
        {
            cat("=====\nThe value:\n")
            value.id <- dt$id[i]
            print(values[value.id,])
            if ( 0 < depth )
            {
                ## the intervals on the `stack` all contain the current value
                index <- stack[depth]
                cat("\nis inside this interval (most imbricated):\n")
                print(intervals[index,])
            } else {
                cat("\nis outside any interval\n\n")
            }
        } else {
            warning("should not be here")
            break
        }
    }
    
    

    EDIT: Careful, the algorithm only works properly if start and end are well imbricated. (1, 5) and (3, 7) are not properly imbricated (3 is in (1, 5) and 7 outside). The pop on 5 from the stack will remove the interval (3, 7). If the interval are not properly imbricated, the stack should become a set and the pop on an interval end should remove the matching start.