R tm package: select a huge number of words to keep in a text corpus


I have around 70,000 frequent_words which I want to keep in a text corpus, in the same order they appeared (order matters). I obtained them like this:

dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf)))
frequent_words <- findFreqTerms(dtm, lowfreq=50)

Just doing:

dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf)))
dtm <- removeSparseTerms(dtm, 0.8)

This would not work, since I need the same filtered txt_corpus twice:

dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf)))
BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
bidtm <- DocumentTermMatrix(txt_corpus, control = list(tokenize = BigramTokenizer))

I tried the code below:

# keepWords: replace every stretch of text that does NOT match one of
# `words` with a space (regmatches<- with invert = TRUE)
keepWords <- content_transformer(function(x, words) {
  regmatches(x,
             gregexpr(paste0("(\\b",  paste(words, collapse = "\\b|\\b"), "\\b)"), x, perl = T, ignore.case=T, useBytes = T)
             , invert = T) <- " "
  return(x)
})
txt_corpus <- tm_map(txt_corpus, keepWords, frequent_words)

When I run it I get the error:

Error in gregexpr(paste0("(\\b", paste(words, collapse = "\\b|\\b"), "\\b)"),  : 
  assertion 'tree->num_tags == num_tags' failed in executing regexp: file 'tre-compile.c', line 634
Calls: preprocess ... tm_parLapply -> lapply -> FUN -> FUN -> regmatches<- -> gregexpr
Execution halted

This is caused by the overly long regular expression. Removing the non-frequent words instead is out of the question, since length(less_frequent_words) > 1,000,000 and it takes too long with:

# split less_frequent_words into chunks of 500 and remove them chunk by chunk
chunk <- 500
n <- length(less_frequent_words)
r <- rep(1:ceiling(n/chunk),each=chunk)[1:n]
d <- split(less_frequent_words, r)

for (i in 1:length(d)) {
  txt_corpus <- tm_map(txt_corpus, removeWords, c(paste(d[[i]])))
}

I also tried something with joining, but it gives me a separate text corpus in each iteration, so the documents end up concatenated instead of merged:

# keep each chunk of frequent_words in a copy of the corpus,
# then concatenate the partial corpora
chunk <- 500
n <- length(frequent_words)
r <- rep(1:ceiling(n/chunk),each=chunk)[1:n]
d <- split(frequent_words, r)

joined_txt_corpus <- VCorpus(VectorSource(list()))
for (i in 1:length(d)) {
  new_corpus <- tm_map(txt_corpus, keepWords, c(paste(d[[i]])))
  joined_txt_corpus <- c(joined_txt_corpus, new_corpus)
  txt_corpus <- tm_map(txt_corpus, removeWords, c(paste(d[[i]])))
}
txt_corpus <- joined_txt_corpus

Is there an efficient way to do the same selection as txt_corpus <- tm_map(txt_corpus, keepWords, frequent_words), but with this many words? Any help and hints are appreciated! Thanks!
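
One untested idea: skip the regex entirely and filter each document with a set lookup, since %in% hashes the lookup table and keeps the word order. A minimal sketch (keepWordsFast is just an illustrative name, not benchmarked on the full corpus):

# Untested sketch: split each document on whitespace and keep only the
# tokens found in `words`; no regular expression is compiled at all
keepWordsFast <- content_transformer(function(x, words) {
  toks <- unlist(strsplit(x, "\\s+"))
  paste(toks[toks %in% words], collapse = " ")
})
txt_corpus <- tm_map(txt_corpus, keepWordsFast, frequent_words)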

Reproducible example:

library(tm)
data(crude)

txt_corpus <- crude

txt_corpus <- tm_map(txt_corpus, content_transformer(tolower))
txt_corpus <- tm_map(txt_corpus, removePunctuation)
txt_corpus <- tm_map(txt_corpus, stripWhitespace)

article_words <- c("a", "an", "the")
txt_corpus <- tm_map(txt_corpus, removeWords, article_words)
txt_corpus <- tm_map(txt_corpus, removeNumbers)

dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf)))
frequent_words <- findFreqTerms(dtm, lowfreq=80)
dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf), dictionary=frequent_words))

# To simulate many words, repeat frequent_words several times;
# with frequent_words used just once, the code below works
# frequent_words <- c(frequent_words, frequent_words, frequent_words, frequent_words)

# keepWords function
keepWords <- content_transformer(function(x, words) {
  regmatches(x,
             gregexpr(paste0("(\\b",  paste(words, collapse = "\\b|\\b"), "\\b)"), x, perl = T, ignore.case=T)
             , invert = T) <- " "
  return(x)
})

txt_corpus <- tm_map(txt_corpus, keepWords, frequent_words)

# Get bigrams from txt_corpus
BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
bidtm <- DocumentTermMatrix(txt_corpus, control = list(tokenize = BigramTokenizer))

library(slam)  # col_sums() is from slam, which tm uses but does not attach
bidtmm <- col_sums(bidtm)
bidtmm <- as.matrix(bidtmm)
print(bidtmm)

Output:

        [,1]
in in     14
in of     21
in oil    19
in to     28
of in     21
of of     20
of oil    20
of to     29
oil in    18
oil of    18
oil oil   13
oil to    33
to in     32
to of     35
to oil    21
to to     41

Solution

  • I looked at your requirements; maybe a combination of tm and quanteda can help. See below.

    Once you have a list of frequent words you can use quanteda in parallel to get the bigrams.

    library(quanteda)
    
    # set number of threads 
    quanteda_options(threads = 4) 
    
    my_corp <- corpus(crude) # corpus from tm can be used here (txt_corpus)
    my_toks <- tokens(my_corp, remove_punct = TRUE) # add extra removal if needed
    
    # Use list of frequent words from tm. 
    # speed gain should occur here
    my_toks <- tokens_keep(my_toks, frequent_words)
    
    # ngrams; n = 2 (bigrams) and the "_" concatenator are the defaults
    bitoks <- tokens_ngrams(my_toks)
    
    textstat_frequency(dfm(bitoks)) # ordered from high to low
    
       feature frequency rank docfreq group
    1    to_to        41    1      12   all
    2    to_of        35    2      15   all
    3   oil_to        33    3      17   all
    4    to_in        32    4      12   all
    5    of_to        29    5      14   all
    6    in_to        28    6      11   all
    7    in_of        21    7       8   all
    8   to_oil        21    7      13   all
    9    of_in        21    7      10   all
    10  of_oil        20   10      14   all
    11   of_of        20   10       8   all
    12  in_oil        19   12      10   all
    13  oil_in        18   13      11   all
    14  oil_of        18   13      11   all
    15   in_in        14   15       9   all
    16 oil_oil        13   16      10   all
    

    quanteda does have a topfeatures() function, but it doesn't work like findFreqTerms(). Otherwise you could do it completely in quanteda.
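
    If all you need is the frequency cut-off, a rough stand-in for findFreqTerms() is dfm_trim(). A minimal sketch, assuming a recent quanteda version (older releases call the min_termfreq argument min_count):

    # keep every feature occurring at least 50 times, roughly equivalent
    # to findFreqTerms(dtm, lowfreq = 50) in tm
    my_dfm <- dfm(tokens(corpus(crude), remove_punct = TRUE))
    frequent_words <- featnames(dfm_trim(my_dfm, min_termfreq = 50))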

    If the dfm generation is taking too much memory, you can use as.character() to transform the tokens object and count the bigrams in either dplyr or data.table. See the code below.

    library(dplyr)
    # count each bigram with dplyr
    out_dp <- tibble(features = as.character(bitoks)) %>% 
      group_by(features) %>% 
      tally()
    
    
    library(data.table)
    # same counts with data.table
    out_dt <- data.table(features = as.character(bitoks))
    out_dt <- out_dt[, .N, by = features]
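
    If you want the counts ordered from high to low like textstat_frequency(), sort afterwards:

    out_dt[order(-N)]          # data.table, descending by count
    arrange(out_dp, desc(n))   # dplyr equivalent (tally() names its count column n)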