library(janeaustenr)
library(tidytext)
library(tidyverse)
library(tm)
library(corpus)
# Load the novel line by line, strip digits, and wrap it in a one-column
# data frame whose column is named `text`.
text <- data.frame(text = removeNumbers(sensesensibility))

# Tokenise into bigrams: one row per pair of adjacent words.
tidy_text <- unnest_tokens(text, bigram, text, token = "ngrams", n = 2)

# Print the most frequent raw bigrams (stop words still included).
count(tidy_text, bigram, sort = TRUE)

# Split each bigram into its two words, then drop every pair in which
# either word is a stop word.
tidy_text <- separate(tidy_text, bigram, c("word1", "word2"), sep = " ")
tidy_text_filtered <- filter(
  tidy_text,
  !word1 %in% stop_words$word,
  !word2 %in% stop_words$word
)

# Count the remaining bigrams (the object is named `trigram_count`, but it
# holds bigram counts), glue the two words back together and keep only
# bigrams seen more than once.
trigram_count <- count(tidy_text_filtered, word1, word2, sort = TRUE)
united <- filter(
  unite(trigram_count, bigram, word1, word2, sep = " "),
  n > 1
)

# NOTE: this call is the one that fails. Only two columns are passed, so
# `n` is matched to the `document` argument and the count argument is
# left unspecified.
united <- bind_tf_idf(united, bigram, n)
However, I am getting this error: "Error in tapply(n, documents, sum) : arguments must have same length"
What could be wrong in my usage of bind_tf_idf?
In addition to the data, bind_tf_idf
takes three arguments: 'term', 'document' and 'n'. The call in the question omits 'document'. We can create the 'document' column:
# Treat the whole novel as a single document by adding a constant `book`
# column, then pass all three columns to bind_tf_idf() by name.
# With only one document every idf is log(1 / 1) = 0, so tf_idf is 0 for
# all rows; see the per-chapter variant for a more informative split.
out <- bind_tf_idf(
  mutate(united, book = "sensesensibility"),
  term = bigram,
  document = book,
  n = n
)
If we instead want each chapter to be a document, create a 'chapter' column by detecting the word 'chapter' in the original 'text' data.frame:
library(stringr)
# Tag every line with a running chapter number: the counter increases by
# one on each line that contains "chapter" (case-insensitive).
# NOTE(review): the pattern matches "chapter" anywhere in a line, not only
# in headings; anchor it (e.g. "^chapter") if body text can contain it.
text <- text %>%
  mutate(
    chapter = cumsum(str_detect(text, regex("chapter", ignore_case = TRUE)))
  )

# Re-tokenise into bigrams; the `chapter` column is carried along.
tidy_text <- text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

# Repeat the split and stop-word filtering so that `tidy_text_filtered`
# is rebuilt from the chapter-tagged data and contains `chapter`.
tidy_text <- tidy_text %>%
  separate(bigram, c("word1", "word2"), sep = " ")
tidy_text_filtered <- tidy_text %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  )

# Count bigrams per chapter, re-join the two words and keep only bigrams
# that occur more than once within a chapter.
trigram_count <- tidy_text_filtered %>%
  count(chapter, word1, word2, sort = TRUE)
united <- trigram_count %>%
  unite(bigram, word1, word2, sep = " ") %>%
  filter(n > 1)

# Each chapter is one document for the tf-idf computation.
out <- united %>%
  bind_tf_idf(term = bigram, document = chapter, n = n)
head(out)
# chapter bigram n tf idf tf_idf
#1 21 sir john 12 0.2068966 0.9162907 0.1895774
#2 21 miss steeles 11 0.1896552 2.1202635 0.4021189
#3 9 sir john 9 0.6000000 0.9162907 0.5497744
#4 13 sir john 9 0.3750000 0.9162907 0.3436090
#5 23 lady middleton 9 0.4090909 1.0788097 0.4413312
#6 40 colonel brandon 9 0.4736842 0.6931472 0.3283329