Search code examples
r term-document-matrix

Keep whole text phrases intact when computing frequencies


I have a data frame with only one column, "text":

"text"
"User Interfaces"
"Twitter"
"Text Normalization"
"Term weighting"
"Teenagers"
"Team member replacement"

I would like to get a data frame with the frequency of every phrase, like this:

 "User Interfaces",1
 "Twitter",1
 "Text Normalization",1
 "Term weighting",1
 "Teenagers",1
 "Team member replacement",1

To do this, I use the following code:

library(tm) 
# Read the phrases from a CSV file; the first row is treated as the header.
df <- read.csv("C:/Users/acel/Desktop/myphr.csv", header=TRUE, sep=",")
# Build a corpus from the entries of the "text" column.
corpusD <- Corpus(VectorSource(df$text))
# Clean-up steps, applied to the corpus one after another:
# lower-case the text,
corpusD <- tm_map(corpusD, tolower)
# remove English stop words,
corpusD <- tm_map(corpusD, removeWords, stopwords('english'))
# remove numbers,
corpusD <- tm_map(corpusD, removeNumbers)
# collapse extra whitespace,
corpusD <- tm_map(corpusD, stripWhitespace)
# convert each entry to a PlainTextDocument,
corpusD <- tm_map(corpusD, PlainTextDocument)
# and stem the words using the English stemmer.
corpusD <- tm_map(corpusD, stemDocument, language = "english")
# Wrap the cleaned corpus in a fresh corpus before building the matrix.
corpusC <- Corpus(VectorSource(corpusD))
# Build the term-document matrix.
# NOTE(review): the reported result contains single words as terms
# (e.g. user, interface), so phrases appear to be split at this step --
# presumably by the default tokenizer; confirm in the tm documentation.
matrixD <- TermDocumentMatrix(corpusC)
# Drop sparse terms; 0.75 is the sparsity threshold passed to removeSparseTerms.
matrixD <- removeSparseTerms(matrixD, 0.75)
# Sum each row of the dense matrix
# (presumably one row per term, giving a total count per term -- confirm).
MatrixDfreq <- rowSums(as.matrix(matrixD))
# Order the counts from highest to lowest.
MatrixDfreq<-sort(MatrixDfreq, decreasing = TRUE)
# Keep the first 30 entries of the sorted vector.
# NOTE(review): with fewer than 30 entries, indexing past the end yields NA.
MatrixDtop30<- MatrixDfreq [1:30]

But when I check the result in MatrixDtop30, I see the individual words counted separately, e.g. user,1 and interface,1, instead of the whole phrase "user interface",1.

Any idea why this is happening?


Solution

  • I think this would be a lot easier using data.table operations.

    library(data.table)
    # Example input: a single "text" column with repeated values.
    df <- data.frame(
      text = c("test", "test", "test", "test2", "test3", "test2")
    )
    
    > df
       text
    1  test
    2  test
    3  test
    4 test2
    5 test3
    6 test2
    
    # Turn df into a data.table, then count the rows (.N) for each
    # distinct value of the "text" column.
    setDT(df)
    df <- df[, list(Number = .N), by = "text"]
    
    > df
        text Number
    1:  test      3
    2: test2      2
    3: test3      1
    

    Edit

    We can also include stemming with this approach:

    library(data.table)
    library(SnowballC)
    # Example input: includes "testing", which shares a stem with "test".
    df <- data.frame(
      text = c("test", "testing", "test", "test2", "test3", "test2")
    )
    
    > df
         text
    1    test
    2 testing
    3    test
    4   test2
    5   test3
    6   test2
    
    # Replace every entry of the "text" column with its Porter stem.
    df[["text"]] <- wordStem(df[["text"]], language = "porter")
    
    > df
       text
    1  test
    2  test
    3  test
    4 test2
    5 test3
    6 test2
    
    # Turn df into a data.table, then count the rows (.N) for each
    # distinct value of the stemmed "text" column.
    setDT(df)
    df <- df[, list(Number = .N), by = "text"]
    
    > df
        text Number
    1:  test      3
    2: test2      2
    3: test3      1