r tm

R: remove specific words from a text, such as "the" and "this"

library(tm)

# Read the input file and build a corpus from its lines.
txt <- readLines("this.txt")
corpus <- Corpus(VectorSource(txt))

# Strip punctuation before the terms are counted.
corpus <- tm_map(corpus, removePunctuation)

# Build the term-document matrix, then total each term's count across all
# documents and list the terms from most to least frequent.
tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)
d <- data.frame(
  freq = sort(rowSums(m), decreasing = TRUE)
)

Solution

I think you're asking how to remove words like 'the' and 'this' using the tm library? If so, try this:

# tm_map() transforms a corpus, so pass `corpus` here rather than the raw
# character vector `txt`.
corpus <- tm_map(corpus, removeWords, stopwords("english"))

To remove specific words:

# removeWords() combines the words into a single regular expression, so list
# plain words only: a trailing "?" would act as a regex quantifier rather than
# a literal question mark. "youre" covers text whose punctuation has already
# been stripped with removePunctuation().
corpus <- tm_map(
  corpus,
  removeWords,
  c("hello", "is", "it", "me", "you're", "youre", "looking", "for")
)

Edit: I created an example using War and Peace, which works. Try converting your terms to lower case before creating a document-term matrix. Like so:

library(tm)

# load: each line of the file becomes one document in the corpus
txt <- readLines("this.txt")
corpus <- Corpus(VectorSource(txt))

# clean
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
# tolower() is a base function, not a tm transformation; wrapping it in
# content_transformer() lets tm_map() apply it to the document content while
# keeping the corpus structure intact, so no PlainTextDocument conversion is
# needed afterwards.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("english"))

# create dtm and get terms
dtm <- DocumentTermMatrix(corpus)
dtm$dimnames$Terms

Change the code to fit your text file, and the output should be similar to this:

dtm$dimnames$Terms
 [1] "almost"          "anonymous"       "anyone"          "anywhere"        "author"          "away"           
 [7] "aylmer"          "book"            "chapter"         "contents"        "copy"            "cost"           
[13] "date"            "david"           "ebook"           "english"         "give"            "gutenberg"      
[19] "iii"             "included"        "january"         "language"        "last"            "leo"            
[25] "license"         "louise"          "march"           "maude"           "may"             "one"            
[31] "online"          "peace"           "posting"         "project"         "restrictions"    "reuse"          
[37] "start"           "terms"           "title"           "tolstoy"         "tolstoytolstoi"  "translators"    
[43] "updated"         "use"             "vii"             "volunteer"       "war"             "whatsoever"     
[49] "widger"          "wwwgutenbergorg"