Search code examples
r, word-count, stop-words

R Word cloud - Cannot remove English stopwords


I tried to remove the English stopwords from the text before building a word cloud, but it didn't work. I read several posts and tried what was suggested, without any luck. Any help will be appreciated.

library(tm)
library(wordcloud)
library(RColorBrewer)
library(SnowballC)

# Sample text to analyse. It has to be a quoted string literal: without the
# quotes R tries to parse the words as code and stops with a syntax error.
# NOTE(review): the words look pre-stemmed ("secur", "expos") - confirm source.
textdata <- c("A secur breach expos privat inform of student loan borrow from Aug. 20-22 dure a comput softwar upgrade. User of the DOE Direct Loan Web site were abl to view inform other than their own if they use certain option when access the program web pages. SSNs were among the data element expos online.  Softwar compani Affiliat Comput Servic (ACS) creat the technolog for the Direct Loan Servic featur on the DoE site. ")


# Create the corpus and attempt to clean the text.
# NOTE(review): most of the steps below take the original `txt` as input
# instead of the running `txtCorpus`, so each assignment overwrites the
# previous result. Only the last step (stemDocument applied to `txt`) is
# reflected in the corpus that reaches TermDocumentMatrix().
txt <- Corpus(VectorSource(textdata))
txtCorpus <- tm_map(txt, removePunctuation)
# Starts again from `txt`: the punctuation removal above is lost.
txtCorpus <- tm_map(txt, removeNumbers)
# Starts again from `txt`: the number removal above is lost.
txtCorpus <- tm_map(txt, content_transformer(tolower))
# The only step chained on `txtCorpus`: stop words are removed from the
# lower-cased text ...
txtCorpus <- tm_map(txtCorpus, removeWords, stopwords("english"))
# ... but the next two lines read from `txt` again and discard that result.
txtCorpus <- tm_map(txt, stripWhitespace); #inspect(docs[1])
txtCorpus <- tm_map(txt, stemDocument)

# Create the term-document matrix and tabulate term frequencies
tdm <- TermDocumentMatrix(txtCorpus)
m <- as.matrix(tdm)
# Sum each row of the matrix and order the totals from largest to smallest.
v <- sort(rowSums(m),decreasing=TRUE)
# One row per name in `v`, with its total in `freq`.
d <- data.frame(word = names(v),freq=v, stringsAsFactors = FALSE)
# Show the first ten rows, i.e. the highest totals.
head(d, 10)

Output

        word    freq

the     the     8469        
and     and     5790        
inform  inform  2629        
was     was     2487        
secur   secur   2249        
were    were    1901        
social  social  1890    

Solution

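  • Fix your corpus cleansing: every `tm_map()` call must be applied to the result of the previous step. In your code most steps start again from the original `txt`, so the stop-word removal is overwritten and only the final `stemDocument` step takes effect.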

    library(tm)
    library(wordcloud)
    library(RColorBrewer)
    library(SnowballC)
    # Sample text, held as a single-element character vector.
    textdata <- c("A secur breach expos privat inform of student loan borrow from Aug. 20-22 dure a comput softwar upgrade. User of the DOE Direct Loan Web site were abl to view inform other than their own if they use certain option when access the program web pages. SSNs were among the data element expos online.  Softwar compani Affiliat Comput Servic (ACS) creat the technolog for the Direct Loan Servic featur on the DoE site. ")

    # Build the corpus, then clean it one transformation at a time. Every
    # tm_map() call receives the corpus returned by the previous call, so
    # the effects of all steps accumulate in `corp`.
    corp <- Corpus(VectorSource(textdata))
    corp <- tm_map(corp, removePunctuation)
    corp <- tm_map(corp, removeNumbers)
    corp <- tm_map(corp, content_transformer(tolower))
    corp <- tm_map(corp, removeWords, stopwords("english"))
    corp <- tm_map(corp, stripWhitespace)
    corp <- tm_map(corp, stemDocument)

    # Term-document matrix -> plain matrix -> per-term totals, largest first.
    tdm <- TermDocumentMatrix(corp)
    m <- as.matrix(tdm)
    v <- sort(rowSums(m), decreasing = TRUE)

    # Frequency table with one row per term; print the ten highest totals.
    d <- data.frame(word = names(v), freq = v, stringsAsFactors = FALSE)
    head(d, n = 10)
    #            word freq
    # loan       loan    3
    # comput   comput    2
    # direct   direct    2
    # doe         doe    2
    # expo       expo    2
    # inform   inform    2
    # servic   servic    2
    # site       site    2
    # softwar softwar    2
    # web         web    2