Search code examples
rtext-miningtmslam

Efficient jaccard similarity DocumentTermMatrix


I want a way to efficiently calculate Jaccard similarity between documents of a tm::DocumentTermMatrix. I can do something similar for cosine similarity via the slam package as shown in this answer. I came across another question and response on CrossValidated that was R specific but about matrix algebra not necessarily the most efficient route. I tried to implement that solution with more efficient slam functions but do not get the same solution as when I use a less efficient approach of coercing the DTM to a matrix and using proxy::dist.

How can I efficiently calculate Jaccard similarity between documents of a large DocumentTermMatrix in R?

#Data & Pacages

library(Matrix);library(proxy);library(tm);library(slam);library(Matrix)

mat <- structure(list(i = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 3L, 1L, 
    2L, 3L, 3L, 3L, 4L, 4L, 4L, 4L), j = c(1L, 1L, 2L, 2L, 3L, 3L, 
    4L, 4L, 4L, 5L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), v = c(1, 
    1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1), nrow = 4L, 
        ncol = 12L, dimnames = structure(list(Docs = c("1", "2", 
        "3", "4"), Terms = c("computer", "is", "fun", "not", "too", 
        "no", "it's", "dumb", "what", "should", "we", "do")), .Names = c("Docs", 
        "Terms"))), .Names = c("i", "j", "v", "nrow", "ncol", "dimnames"
    ), class = c("DocumentTermMatrix", "simple_triplet_matrix"), weighting = c("term frequency", 
    "tf"))

#Inefficient Calculation (expected output)

proxy::dist(as.matrix(mat), method = 'jaccard')

##       1     2     3
## 2 0.000            
## 3 0.875 0.875      
## 4 1.000 1.000 1.000

#My Attempt

A <- slam::tcrossprod_simple_triplet_matrix(mat)
im <- which(A > 0, arr.ind=TRUE)
b <- slam::row_sums(mat)
Aim <- A[im]

stats::as.dist(Matrix::sparseMatrix(
      i = im[,1],
      j = im[,2],
      x = Aim / (b[im[,1]] + b[im[,2]] - Aim),
      dims = dim(A)
))

##     1   2   3
## 2 2.0        
## 3 0.1 0.1    
## 4 0.0 0.0 0.0

Outputs do not match.

FYI Here is the original text:

c("Computer is fun. Not too fun.", "Computer is fun. Not too fun.", 
    "No it's not, it's dumb.", "What should we do?")

I'd expect elements 1 & 2 to be 0 distance and element 3 to be closer to element 1 than element 1 and 4 (I'd expect furthest distance as no words are shared) as seen in the proxy::dist solution.

EDIT

Note that even on a medium sized DTM the matrix becomes huge. Here's an example with the vegan package. Note 4 minutes to solve where as the cosine similarity is ~5 seconds.

library(qdap); library(quanteda);library(vegan);library(slam)
x <- quanteda::convert(quanteda::dfm(rep(pres_debates2012$dialogue), stem = FALSE, 
        verbose = FALSE, removeNumbers = FALSE), to = 'tm')


## <<DocumentTermMatrix (documents: 2912, terms: 3368)>>
## Non-/sparse entries: 37836/9769780
## Sparsity           : 100%
## Maximal term length: 16
## Weighting          : term frequency (tf)

tic <- Sys.time()
jaccard_dist_mat <- vegan::vegdist(as.matrix(x), method = 'jaccard')
Sys.time() - tic #Time difference of 4.01837 mins

tic <- Sys.time()
tdm <- t(x)
cosine_dist_mat <- 1 - crossprod_simple_triplet_matrix(tdm)/(sqrt(col_sums(tdm^2) %*% t(col_sums(tdm^2))))
Sys.time() - tic #Time difference of 5.024992 secs

Solution

  • Jaccard measure is a measure between SETS and input matrix should be binary. The very first line says:

    ## common values:
    A = tcrossprod(m)
    

    In case of bag-of-words DTM this is not the number of common values!

    library(text2vec)
    library(magrittr)
    library(Matrix)
    
    jaccard_similarity <- function(m) {
      A <- tcrossprod(m)
      im <- which(A > 0, arr.ind=TRUE, useNames = F)
      b <- rowSums(m)
      Aim <- A[im]
      sparseMatrix(
        i = im[,1],
        j = im[,2],
        x = Aim / (b[im[,1]] + b[im[,2]] - Aim),
        dims = dim(A)
      )
    }
    
    jaccard_distance <- function(m) {
      1 - jaccard_similarity(m)
    }
    
    cosine <- function(m) {
      m_normalized <- m / sqrt(rowSums(m ^ 2))
      tcrossprod(m_normalized)
    }
    

    Benchmarks:

    data("movie_review")
    tokens <- movie_review$review %>% tolower %>% word_tokenizer
    
    dtm <- create_dtm(itoken(tokens), hash_vectorizer(hash_size = 2**16))
    dim(dtm)
    # 5000 65536
    
    system.time(dmt_cos <- cosine(dtm))
    # user  system elapsed 
    #  2.524   0.169   2.693 
    
    system.time( {
      dtm_binary <- transform_binary(dtm)
      # or simply
      # dtm_binary <- sign(dtm)
      dtm_jac <- jaccard_similarity(dtm_binary)  
    })
    #   user  system elapsed 
    # 11.398   1.599  12.996
    max(dtm_jac)
    # 1
    dim(dtm_jac)
    # 5000 5000
    

    EDIT 2016-07-01:

    See even faster version from text2vec 0.4 (~2.85x when not need to convert from dgCMatrix to dgTMatrix and ~1.75x when need column major dgCMatrix)

    jaccard_dist_text2vec_04 <- function(x, y = NULL, format = 'dgCMatrix') {
      if (!inherits(x, 'sparseMatrix'))
        stop("at the moment jaccard distance defined only for sparse matrices")
      # union x
      rs_x = rowSums(x)
      if (is.null(y)) {
        # intersect x
        RESULT = tcrossprod(x)
        rs_y = rs_x
      } else {
        if (!inherits(y, 'sparseMatrix'))
          stop("at the moment jaccard distance defined only for sparse matrices")
        # intersect x y
        RESULT = tcrossprod(x, y)
        # union y
        rs_y = rowSums(y)
      }
      RESULT = as(RESULT, 'dgTMatrix')
      # add 1 to indices because of zero-based indices in sparse matrices
      # 1 - (...) because we calculate distance, not similarity
      RESULT@x <- 1 - RESULT@x / (rs_x[RESULT@i + 1L] + rs_y[RESULT@j + 1L] - RESULT@x)
      if (!inherits(RESULT, format))
        RESULT = as(RESULT, format)
      RESULT
    }
    system.time( {
       dtm_binary <- transform_binary(dtm)
       dtm_jac <-jaccard_dist(dtm_binary, format = 'dgTMatrix')
     })
     #  user  system elapsed 
     # 4.075   0.517   4.593  
    system.time( {
       dtm_binary <- transform_binary(dtm)
       dtm_jac <-jaccard_dist(dtm_binary, format = 'dgCMatrix')
     })
     #  user  system elapsed 
     # 6.571   0.939   7.516