I have around 70,000 frequent_words
which I want to keep in a text corpus, in the same order they appeared (order matters). I got them like this:
dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf)))
frequent_words <- findFreqTerms(dtm, lowfreq=50)
Just doing:
dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf)))
dtm <- removeSparseTerms(dtm, 0.8)
would not work, since I need that same filtered txt_corpus twice, for both the unigram and the bigram document-term matrix:
dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf)))
BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
bidtm <- DocumentTermMatrix(txt_corpus, control = list(tokenize = BigramTokenizer))
I tried the code below:
keepWords <- content_transformer(function(x, words) {
  # replace everything that is not one of the kept words with a space
  regmatches(x,
             gregexpr(paste0("(\\b", paste(words, collapse = "\\b|\\b"), "\\b)"),
                      x, perl = T, ignore.case = T, useBytes = T),
             invert = T) <- " "
  return(x)
})
txt_corpus <- tm_map(txt_corpus, keepWords, frequent_words)
When I run it I get the error:
Error in gregexpr(paste0("(\\b", paste(words, collapse = "\\b|\\b"), "\\b)"), :
assertion 'tree->num_tags == num_tags' failed in executing regexp: file 'tre-compile.c', line 634
Calls: preprocess ... tm_parLapply -> lapply -> FUN -> FUN -> regmatches<- -> gregexpr
Execution halted
This is caused by the overly long regular expression. Removing the non-frequent words instead is out of the question, since length(less_frequent_words) > 1,000,000 and it takes too long with:
chunk <- 500
n <- length(less_frequent_words)
r <- rep(1:ceiling(n/chunk),each=chunk)[1:n]
d <- split(less_frequent_words, r)
for (i in 1:length(d)) {
  txt_corpus <- tm_map(txt_corpus, removeWords, c(paste(d[[i]])))
}
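One regex-free way around the TRE limit is to split each document into its tokens and keep only those contained in frequent_words via %in% (which uses a hash table internally, so 70,000 words are not a problem). A minimal sketch, assuming words are separated by single spaces after stripWhitespace; keepWordsFast is just an illustrative name, not part of the original code:
keepWordsFast <- content_transformer(function(x, words) {
  # split each line into tokens and keep only the frequent ones, in order
  vapply(strsplit(x, " ", fixed = TRUE),
         function(tok) paste(tok[tok %in% words], collapse = " "),
         character(1))
})
# txt_corpus <- tm_map(txt_corpus, keepWordsFast, frequent_words)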
I also tried something with joining, but each iteration just gives me another corpus for that chunk of words, and concatenating them does not merge the kept words back into the original documents:
chunk <- 500
n <- length(frequent_words)
r <- rep(1:ceiling(n/chunk),each=chunk)[1:n]
d <- split(frequent_words, r)
joined_txt_corpus <- VCorpus(VectorSource(list()))
for (i in 1:length(d)) {
  new_corpus <- tm_map(txt_corpus, keepWords, c(paste(d[[i]])))
  joined_txt_corpus <- c(joined_txt_corpus, new_corpus)
  txt_corpus <- tm_map(txt_corpus, removeWords, c(paste(d[[i]])))
}
txt_corpus <- joined_txt_corpus
Is there an efficient way to do the same selection as txt_corpus <- tm_map(txt_corpus, keepWords, frequent_words), but with this many words? Any help and hints are appreciated! Thanks!
Reproducible example:
library(tm)
data(crude)
txt_corpus <- crude
txt_corpus <- tm_map(txt_corpus, content_transformer(tolower))
txt_corpus <- tm_map(txt_corpus, removePunctuation)
txt_corpus <- tm_map(txt_corpus, stripWhitespace)
article_words <- c("a", "an", "the")
txt_corpus <- tm_map(txt_corpus, removeWords, article_words)
txt_corpus <- tm_map(txt_corpus, removeNumbers)
dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf)))
frequent_words <- findFreqTerms(dtm, lowfreq=80)
dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf), dictionary=frequent_words))
# With frequent_words used once this works; uncomment the next line to use many words and reproduce the error
# frequent_words <- c(frequent_words, frequent_words, frequent_words, frequent_words)
# keepWords function
keepWords <- content_transformer(function(x, words) {
  regmatches(x,
             gregexpr(paste0("(\\b", paste(words, collapse = "\\b|\\b"), "\\b)"),
                      x, perl = T, ignore.case = T),
             invert = T) <- " "
  return(x)
})
txt_corpus <- tm_map(txt_corpus, keepWords, frequent_words)
# Get bigram from text_corpus
BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
bidtm <- DocumentTermMatrix(txt_corpus, control = list(tokenize = BigramTokenizer))
bidtmm <- slam::col_sums(bidtm)  # col_sums() comes from the slam package
bidtmm <- as.matrix(bidtmm)
print(bidtmm)
Output:
[,1]
in in 14
in of 21
in oil 19
in to 28
of in 21
of of 20
of oil 20
of to 29
oil in 18
oil of 18
oil oil 13
oil to 33
to in 32
to of 35
to oil 21
to to 41
I looked at your requirements and maybe a combination of tm and quanteda can help. See below.
Once you have a list of frequent words, you can use quanteda in parallel to get the bigrams.
library(quanteda)
# set number of threads
quanteda_options(threads = 4)
my_corp <- corpus(crude) # corpus from tm can be used here (txt_corpus)
my_toks <- tokens(my_corp, remove_punct = TRUE) # add extra removal if needed
# Use list of frequent words from tm.
# speed gain should occur here
my_toks <- tokens_keep(my_toks, frequent_words)
# ngrams, concatenator is _ by default
bitoks <- tokens_ngrams(my_toks)
textstat_frequency(dfm(bitoks)) # ordered from high to low
feature frequency rank docfreq group
1 to_to 41 1 12 all
2 to_of 35 2 15 all
3 oil_to 33 3 17 all
4 to_in 32 4 12 all
5 of_to 29 5 14 all
6 in_to 28 6 11 all
7 in_of 21 7 8 all
8 to_oil 21 7 13 all
9 of_in 21 7 10 all
10 of_oil 20 10 14 all
11 of_of 20 10 8 all
12 in_oil 19 12 10 all
13 oil_in 18 13 11 all
14 oil_of 18 13 11 all
15 in_in 14 15 9 all
16 oil_oil 13 16 10 all
quanteda does have a topfeatures function, but it doesn't work like findFreqTerms; otherwise you could do it completely in quanteda.
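That said, a rough equivalent of the frequency threshold is possible entirely in quanteda by trimming a unigram dfm and taking its feature names. A sketch (uni_dfm and frequent_words_q are illustrative names; it assumes a quanteda version with dfm_trim(min_termfreq = ...), and since the preprocessing differs from the tm pipeline the resulting word list may differ):
uni_dfm <- dfm(tokens(my_corp, remove_punct = TRUE))
# keep features with a total count of at least 50, similar to findFreqTerms(dtm, lowfreq = 50)
frequent_words_q <- featnames(dfm_trim(uni_dfm, min_termfreq = 50))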
If the dfm generation is taking too much memory, you can use as.character to transform the tokens object and use that with either dplyr or data.table. See code below.
library(dplyr)
out_dp <- tibble(features = as.character(bitoks)) %>%
  group_by(features) %>%
  tally()
library(data.table)
out_dt <- data.table(features = as.character(bitoks))
out_dt <- out_dt[, .N, by = features]
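If you want the same high-to-low ordering as textstat_frequency, the counts can be sorted afterwards, e.g. (small add-on sketch):
out_dp <- arrange(out_dp, desc(n))  # dplyr: tally() names its count column n
setorder(out_dt, -N)                # data.table: sorts out_dt by N in place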