Search code examples
rsparse-matrixterm-document-matrix

Sparse Matrix as a result of crossprod of sparse matrices


I have been working around this problem for a while without finding a satisfactory solution.

I have data in a binary sparse matrix (TermDocumentMatrix) with dim ([1] 340436 763717). I here use an extract as proof of concept:

m = structure(list(i = c(1L, 2L, 5L, 2L, 4L, 3L, 5L, 4L), j = c(1L, 1L, 1L, 2L, 2L, 
    3L, 3L, 3L), v = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), nrow = 5L, ncol = 3L, 
    dimnames = list(Terms = c("action", "activities", "advisory", "alike", "almanac"),
                    Docs = c("1000008721", "1000010083","1000013295"))), 
    class = c("TermDocumentMatrix", "simple_triplet_matrix"), weighting = c("binary", "bin"))

inspect(m)
<<TermDocumentMatrix (terms: 5, documents: 3)>>
Non-/sparse entries: 8/7
Sparsity           : 47%
Maximal term length: 10
Weighting          : binary (bin)
Sample             :
            Docs
Terms        1000008721 1000010083 1000013295
  action              1          0          0
  activities          1          1          0
  advisory            0          0          1
  alike               0          1          1
  almanac             1          0          1

I want to normalize to unit length every vectorized document, and then obtain a (sparse) matrix with the Docs on rows and Docs on cols and the dot product of the corresponding normalized vectors as values.

Expected output:

Sparse Matrix:
            Docs
Docs         1000008721 1000010083 1000013295 ... N
  1000008721  1.0000000  0.4082483  0.3333333     .
  1000010083  0.4082483  1.0000000  0.4082483     .  
  1000013295  0.3333333  0.4082483  1.0000000     .
    ...
   N               .          .          .

or also: data.table
 ID1              ID2          v
1000008721     1000008721      1
1000010083     1000008721     0.4082483
1000013295     1000008721     0.3333333
   ...             ...         ...

This would be easy to achieve with crossprod_simple_triplet_matrix(m) after applying the normalization with a function that divides every value for the norm. The euclidean norm in the with a binary vector reduces to sqrt(col_sums(m)).

Since I cannot by as.matrix() transformation ("Error: cannot allocate vector of size 968.6 Gb"), and I couldn't find any other way, I used data.table that may circumvent the need to apply a function over a sparse matrix:

# exploit the triples and manipulate through data.table
dt = as.data.table(list(i=m$i,j=m$j,v=m$v))

# obtain euclidean norm for every column 
dt[,e.norm:=list(as.numeric(sqrt(sum(v)))),by=j]

# divide the v for the corresponding group, subset and replace
dt = dt[,v.norm:=v/e.norm][,.(i,j,v.norm)][,v:=v.norm][,.(i,j,v)]

m$v <- dt$v
inspect(m)
            Docs
Terms        1000008721 1000010083 1000013295
  action      0.5773503  0.0000000  0.0000000
  activities  0.5773503  0.7071068  0.0000000
  advisory    0.0000000  0.0000000  0.5773503
  alike       0.0000000  0.7071068  0.5773503
  almanac     0.5773503  0.0000000  0.5773503

(What would the equivalent of this (maybe with slam) be?)

QUESTION: Given that crossprod_simple_triplet_matrix(tdm) still returns a dense matrix (hence memory error) can you think about a similar data.table solution to return a sparse matrix with the cross product of two sparse matrices, or any alternative way?


Solution

  • A 340436 x 763717 sparse matrix with 35879680 non-zero elements will result in a very large object (~30 GB). My machine isn't able to hold that object in memory with 16GB RAM. However, the cross product is straightforward to do piecemeal. The function bigcrossprod below performs the cross product in piecemeal, converts the results to data.table objects, and then rbinds the objects. The crossprod operation is broken into nseg separate operations.

    library(data.table)
    library(Matrix)
    
    bigcrossprod <- function(m, nseg) {
      jmax <- ncol(m)
      # get the indices to split the columns of m into chunks that will have
      # approximately the same expense for crossprod
      sumj <- cumsum(as.numeric(jmax:1))
      dtj <- data.table(
        j = 1:jmax,
        int = sumj %/% (sumj[jmax]/nseg + 1)
      )[
        , .(idx1 = min(j), idx2 = max(j)), int
      ][, idx1:idx2]
      rbindlist(
        lapply(
          1:nseg,
          function(seg) {
            cat("\r", seg) # user feedback
            j1 <- dtj$idx1[seg]
            m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg],drop = FALSE],
                                    m[, j1:jmax])), "dgTMatrix")
            data.table(
              i = attr(m2, "i") + j1,
              j = attr(m2, "j") + j1,
              v = attr(m2, "x")
            )
          }
        )
      )
    }
    

    Demonstrating with a somewhat smaller sparse matrix:

    idx <- sample(34043*76371, 358796)
    dt <- data.table(i = as.integer(((idx - 1) %% 34043L) + 1),
                     j = as.integer(((idx - 1) %/% 34043L) + 1),
                     key = "j")[, v := 1/sqrt(.N), j]
    m <- sparseMatrix(dt$i, dt$j, x = dt$v)
    # calculate crossprod on the full matrix and convert the result to triplet
    # form in order to represent as a data.table
    m2 <- as(crossprod(m), "dgTMatrix")
    dt2 <- data.table(i = attr(m2, "i") + 1L,
                      j = attr(m2, "j") + 1L,
                      v = attr(m2, "x"))
    # calculate crossprod in 10 chunks
    dt3 <- bigcrossprod(m, 10)
    #>  1 2 3 4 5 6 7 8 9 10
    # convert the result into a symmetric sparse matrix in triplet form (the same as m2)
    m3 <- as(forceSymmetric(sparseMatrix(dt3$i, dt3$j, x = dt3$v)), "dgTMatrix")
    # remove elements from the data.table objects that are redundant due to symmetry
    dt2 <- unique(dt2[i > j, `:=`(i = j, j = i)])
    dt3 <- setorder(dt3[i > j, `:=`(i = j, j = i)], i, j)
    # check that the dgTMatrix and data.table objects are identical
    identical(m2, m3)
    #> [1] TRUE
    identical(dt2, dt3)
    #> [1] TRUE
    

    In order to calculate the cross product of the 340436 x 763717 sparse matrix with 35879680 elements, instead of storing the list of data.table objects in a list to pass to rbindlist, save the individual data.table objects for later processing using the fst package. Instead of returning a single data.table, the following version of bigcrossprod returns a character vector of length nseg containing .fst file paths. Again, demonstrating with the smaller matrix:

    library(data.table)
    library(Matrix)
    library(fst)
    
    bigcrossprod <- function(m, nseg, path) {
      jmax <- ncol(m)
      # get the indices to split the columns of m into chunks that will have
      # approximately the same expense for crossprod
      sumj <- cumsum(as.numeric(jmax:1))
      dtj <- data.table(
        j = 1:jmax,
        int = sumj %/% (sumj[jmax]/nseg + 1)
      )[
        , .(idx1 = min(j), idx2 = max(j)), int
      ][, idx1:idx2]
      vapply(
        1:nseg,
        function(seg) {
          cat("\r", seg) # user feedback
          j1 <- dtj$idx1[seg]
          m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg], drop = FALSE],
                                  m[, j1:jmax])), "dgTMatrix")
          filepath <- file.path(path, paste0("dt", seg, ".fst"))
          write.fst(
            data.table(
              i = attr(m2, "i") + j1,
              j = attr(m2, "j") + j1,
              v = attr(m2, "x")),
            filepath
          )
          filepath
        },
        character(1)
      )
    }
    
    idx <- sample(34043*76371, 358796)
    dt <- data.table(i = as.integer(((idx - 1) %% 34043L) + 1),
                     j = as.integer(((idx - 1) %/% 34043L) + 1),
                     key = "j")[, v := 1/sqrt(.N), j]
    m <- sparseMatrix(dt$i, dt$j, x = dt$v)
    # calculate crossprod on the full matrix and convert the result to triplet
    # form in order to represent as a data.table
    m2 <- as(crossprod(m), "dgTMatrix")
    dt2 <- data.table(i = attr(m2, "i") + 1L,
                      j = attr(m2, "j") + 1L,
                      v = attr(m2, "x"))
    # calculate crossprod in 10 chunks, read the resulting .fst files,
    # and rbindlist into a single data.table
    dt3 <- rbindlist(lapply(bigcrossprod(m, 10, "C:/temp"),
                            function(x) read.fst(x, as.data.table = TRUE)))
    #> 1 2 3 4 5 6 7 8 9 10
    # convert the result into a symmetric sparse matrix in triplet form (the same as m2)
    m3 <- as(forceSymmetric(sparseMatrix(dt3$i, dt3$j, x = dt3$v)), "dgTMatrix")
    # remove elements from the data.table objects that are redundant due to symmetry
    dt2 <- unique(dt2[i > j, `:=`(i = j, j = i)])
    dt3 <- setorder(dt3[i > j, `:=`(i = j, j = i)], i, j)
    # check that the dgTMatrix and data.table objects are identical
    identical(m2, m3)
    #> [1] TRUE
    identical(dt2, dt3)
    #> [1] TRUE
    

    I was able process a 340436 x 763717 sparse matrix with 35879680 non-zero elements in about 15 minutes with 16GB of RAM.

    Explanation:

    A walkthrough of the logic in bigcrossprod using the OP's 5 x 3 example matrix:

    idx <- c(1,2,5,7,9,13:15)
    dt <- data.table(i = as.integer(((idx - 1) %% 5) + 1),
                     j = as.integer(((idx - 1) %/% 5) + 1),
                     key = "j")[, v := 1/sqrt(.N), j]
    (m <- sparseMatrix(dt$i, dt$j, x = dt$v))
    #> 5 x 3 sparse Matrix of class "dgCMatrix"
    #>                                   
    #> [1,] 0.5773503 .         .        
    #> [2,] 0.5773503 0.7071068 .        
    #> [3,] .         .         0.5773503
    #> [4,] .         0.7071068 0.5773503
    #> [5,] 0.5773503 .         0.5773503
    nseg <- 2 # process the cross product of m in two chunks
    jmax <- ncol(m)
    sumj <- cumsum(as.numeric(jmax:1))
    # In order to balance the processing between chunks, process column 1 first,
    # then columns 2:3 next. Each crossprod call will result in 3 elements.
    (dtj <- data.table(j = 1:jmax, int = sumj %/% (sumj[jmax]/nseg + 1))[, .(idx1 = min(j), idx2 = max(j)), int][, idx1:idx2])
    #>    idx1 idx2
    #> 1:    1    1
    #> 2:    2    3
    # process the first chunk
    seg <- 1L
    j1 <- dtj$idx1[seg]
    # cross product of column 1 and the full matrix
    (m2 <- as(crossprod(m[, j1:dtj$idx2[seg], drop = FALSE], m[, j1:jmax]), "dgTMatrix"))
    #> 1 x 3 sparse Matrix of class "dgTMatrix"
    #>                           
    #> [1,] 1 0.4082483 0.3333333
    # The indices of m2 are 0-based. Add j1 to the i,j indices of m2 when converting
    # to a data.table.
    (dt1 <- data.table(i = attr(m2, "i") + j1, j = attr(m2, "j") + j1, v = attr(m2, "x")))
    #>    i j         v
    #> 1: 1 1 1.0000000
    #> 2: 1 2 0.4082483
    #> 3: 1 3 0.3333333
    # process the second chunk
    seg <- 2L
    j1 <- dtj$idx1[seg] # starts at column 2
    # Cross product of columns 2:3 and columns 2:jmax (2:3). The end result
    # will be a symmetric matrix, so we need only the upper triangular matrix.
    (m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg], drop = FALSE], m[, j1:jmax])), "dgTMatrix"))
    #> 2 x 2 sparse Matrix of class "dgTMatrix"
    #>                 
    #> [1,] 1 0.4082483
    #> [2,] . 1.0000000
    (dt2 <- data.table(i = attr(m2, "i") + j1, j = attr(m2, "j") + j1, v = attr(m2, "x")))
    #>    i j         v
    #> 1: 2 2 1.0000000
    #> 2: 2 3 0.4082483
    #> 3: 3 3 1.0000000
    # the final matrix (in data.table form) is the two data.tables row-bound
    # together (in bigcrossprod, the lapply returns a list of data.table objects for
    # rbindlist)
    (dt3 <- rbindlist(list(dt1, dt2)))
    #>    i j         v
    #> 1: 1 1 1.0000000
    #> 2: 1 2 0.4082483
    #> 3: 1 3 0.3333333
    #> 4: 2 2 1.0000000
    #> 5: 2 3 0.4082483
    #> 6: 3 3 1.0000000
    # dt3 can be converted to a symmetric sparse matrix
    (m3 <- forceSymmetric(sparseMatrix(dt3$i, dt3$j, x = dt3$v)))
    #> 3 x 3 sparse Matrix of class "dsCMatrix"
    #>                                   
    #> [1,] 1.0000000 0.4082483 0.3333333
    #> [2,] 0.4082483 1.0000000 0.4082483
    #> [3,] 0.3333333 0.4082483 1.0000000
    

    And, finally, a parallel version of bigcrossprod (for Linux):

    library(data.table)
    library(Matrix)
    library(parallel)
    library(fst)
    
    bigcrossprod <- function(m, nseg, path, nthreads = nseg) {
      jmax <- ncol(m)
      # get the indices to split the columns of m into chunks that will have
      # approximately the same expense for crossprod
      sumj <- cumsum(as.numeric(jmax:1))
      dtj <- data.table(
        j = 1:jmax,
        int = sumj %/% (sumj[jmax]/nseg + 1)
      )[
        , .(idx1 = min(j), idx2 = max(j)), int
      ][, idx1:idx2]
      cl <- makeCluster(nthreads, type = "FORK", outfile = "")
      on.exit(stopCluster(cl))
      unlist(
        parLapply(
          cl,
          1:nseg,
          function(seg) {
            j1 <- dtj$idx1[seg]
            m2 <- as(triu(crossprod(m[, j1:dtj$idx2[seg],drop = FALSE],
                                    m[, j1:jmax])), "dgTMatrix")
            filepath <- file.path(path, paste0("dt", seg, ".fst"))
            write.fst(
              data.table(
                i = attr(m2, "i") + j1,
                j = attr(m2, "j") + j1,
                v = attr(m2, "x")),
              filepath
            )
            filepath
          }
        )
      )
    }