Search code examples
rlapplysapply

Vectorised solution for iterating through a list of indices and updating a corresponding matrix


I have a list of lists of the following form

xx = list("a_1" = list("A", "C"), 
          "a_2" = list("B", "C"), 
          "a_3" = list("B", "B"), 
          "a_4" = list("C", "B"), 
          "a_5" = list("B", "A"),
          "a_6" = list("B", "A"))

Note that this list contains sublists that are duplicates, like "a_5" and "a_6" in the above example. Using this list of lists, I want to update a matrix of the form

m = matrix(data = 0, nrow = 3, ncol = 3)         # initialise matrix of zeros
rownames(m) = c("A", "B", "C")                   # name rows
colnames(m) = c("A", "B", "C")                   # name columns

such that we add one to the index of the matrix corresponding to a given pair. This is easy to do using a for loop

for (item in xx) {
  
  # add one to the matrix index if item in xx
  m[item[[1]], item[[2]]] = m[item[[1]], item[[2]]] + 1
  
}

which yields the expected output

  A B C
A 0 0 1
B 2 1 1
C 0 1 0

though this can be slightly cumbersome for large lists. I was hoping to do this in a vectorised way using the apply() methods but I wasn't able to get anything working using a nested combination of lapply() to iterate through the list and sapply() to update the values to the matrix.

So I was wondering how to do this using variant(s) of apply()?


Solution

  • Fully vectorized:

    x <- matrix(unlist(xx), length(xx), 2, 1)
    m + tabulate(
      match(x[,1], row.names(m)) + nrow(m)*(match(x[,2], colnames(m)) - 1L),
      length(m)
    )
    #>   A B C
    #> A 0 0 1
    #> B 2 1 1
    #> C 0 1 0
    

    Benchmarking

    Define the various approaches as functions.

    floop <- function(m, xx) {
      for (item in xx) m[item[[1]], item[[2]]] = m[item[[1]], item[[2]]] + 1L
      m
    }
    
    fsapply <- function(mm, xx) {
      sapply(xx, \(x) mm[x[[1]], x[[2]]] <<- mm[x[[1]], x[[2]]] + 1L)
      mm
    }
    
    faggregate <- function(m, xx) {
      sel <- data.frame(t(sapply(xx, unlist)))
      sel <- aggregate(cbind(sel[0], value=1), sel, FUN=sum)
      m[as.matrix(sel[1:2])] <- m[as.matrix(sel[1:2])] + sel$value
      m
    }
    
    fvectorized <- function(m, xx) {
      x <- matrix(unlist(xx), length(xx), 2, 1)
      m + tabulate(
        match(x[,1], row.names(m)) + nrow(m)*(match(x[,2], colnames(m)) - 1L),
        length(m)
      )
    }
    
    ftable <- function(m, xx) {
      m + c(table(as.data.frame(matrix(unlist(xx), length(xx), 2, 1))))
    }
    

    Create a larger test example:

    xx <- lapply(1:1e4, \(i) as.list(sample(LETTERS, 2, 1)))
    m <- matrix(0L, 26, 26, 0, rep(list(LETTERS), 2))
    

    Benchmark:

    microbenchmark::microbenchmark(
      floop = floop(m, xx),
      fsapply = fsapply(m, xx),
      faggregate = faggregate(m, xx),
      fvectorized = fvectorized(m, xx),
      ftable = ftable(m, xx),
      check = "equal"
    )
    #> Unit: microseconds
    #>         expr     min       lq      mean   median       uq     max neval
    #>        floop  8611.9  9051.65 11519.122  9877.15 12723.70 54497.1   100
    #>      fsapply 15160.5 15813.15 17868.492 17069.90 19383.50 28591.2   100
    #>   faggregate 14888.5 15540.90 16733.166 16100.20 17810.55 20828.0   100
    #>  fvectorized   910.6  1019.10  1201.803  1078.90  1226.75  5886.9   100
    #>       ftable  1297.0  1573.40  1765.658  1691.40  1823.95  4364.7   100