Search code examples
rtm

R: remove specific words from a text, like "the" and "this"


# Attach the text-mining package before using any of its functions.
library(tm)

# Read the input file; readLines() yields one string per line.
txt <- readLines("this.txt")

# Build a corpus from the lines and strip punctuation from every document.
corpus <- tm_map(Corpus(VectorSource(txt)), removePunctuation)

# Term-document matrix, converted to a plain matrix for row sums.
tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)

# Total count of each term across all documents, most frequent first.
d <- data.frame(
  freq = sort(rowSums(m), decreasing = TRUE)
)

Solution

  • I think you're asking how to remove words like 'the' and 'this' using the tm library? If so, try this:

    # tm_map() transforms a corpus, so pass `corpus` rather than the raw
    # character vector `txt` (which has no tm_map() method).
    corpus <- tm_map(corpus, removeWords, stopwords("english"))
    

    To remove specific words:

    # removeWords() joins these words into a regular expression, so list plain
    # words only: a trailing "?" would be read as a regex quantifier.
    # NOTE: if punctuation was already stripped from the corpus, "you're"
    # appears there as "youre" and will not match.
    corpus <- tm_map(
      corpus, removeWords,
      c("hello", "is", "it", "me", "you're", "looking", "for")
    )
    

    Edit: I created an example using War and Peace, which works. Try converting your terms to lower case before creating a document-term matrix. Like so:

    library(tm)

    # load
    txt <- readLines("this.txt")
    corpus <- Corpus(VectorSource(txt))

    # clean
    corpus <- tm_map(corpus, removePunctuation)
    corpus <- tm_map(corpus, removeNumbers)
    # Wrap base functions such as tolower() in content_transformer() so each
    # document keeps its tm document class; a bare tolower() turns documents
    # into plain character vectors and breaks DocumentTermMatrix() in
    # tm >= 0.6. No PlainTextDocument() repair step is needed afterwards.
    corpus <- tm_map(corpus, content_transformer(tolower))
    # Lower-case first so capitalised stop words ("The", "This") also match.
    corpus <- tm_map(corpus, removeWords, stopwords("english"))

    # create dtm and get terms
    dtm <- DocumentTermMatrix(corpus)
    dtm$dimnames$Terms
    

    Change the code to fit your text file and the output should be similar to this:

    dtm$dimnames$Terms
     [1] "almost"          "anonymous"       "anyone"          "anywhere"        "author"          "away"           
     [7] "aylmer"          "book"            "chapter"         "contents"        "copy"            "cost"           
    [13] "date"            "david"           "ebook"           "english"         "give"            "gutenberg"      
    [19] "iii"             "included"        "january"         "language"        "last"            "leo"            
    [25] "license"         "louise"          "march"           "maude"           "may"             "one"            
    [31] "online"          "peace"           "posting"         "project"         "restrictions"    "reuse"          
    [37] "start"           "terms"           "title"           "tolstoy"         "tolstoytolstoi"  "translators"    
    [43] "updated"         "use"             "vii"             "volunteer"       "war"             "whatsoever"     
    [49] "widger"          "wwwgutenbergorg"