Search code examples
rtidytext

Usage of bind_tf_idf in R


    # Packages: the Austen corpus, tidy text mining, tidyverse verbs, and tm
    # (for removeNumbers()).
    library(janeaustenr)
    library(tidytext)
    library(tidyverse)
    library(tm)
    library(corpus)

    # Strip digits from every line of "Sense and Sensibility" and wrap the
    # result in a one-column data frame whose column is named `text`.
    text <- data.frame(text = removeNumbers(sensesensibility))

    # Tokenise the text into bigrams.
    tidy_text <- unnest_tokens(text, bigram, text, token = "ngrams", n = 2)

    # Show the most frequent raw bigrams.
    count(tidy_text, bigram, sort = TRUE)

    # Split each bigram into its two component words.
    tidy_text <- separate(tidy_text, bigram, c("word1", "word2"), sep = " ")

    # Keep only the bigrams in which neither word is a stop word.
    tidy_text_filtered <- tidy_text %>%
      filter(
        !word1 %in% stop_words$word,
        !word2 %in% stop_words$word
      )

    # Count the remaining pairs (despite the name, these are bigram counts).
    trigram_count <- count(tidy_text_filtered, word1, word2, sort = TRUE)

    # Glue the two words back together and drop bigrams that occur only once.
    united <- trigram_count %>%
      unite(bigram, word1, word2, sep = " ") %>%
      filter(n > 1)

    # Reproduced exactly as asked: only two columns are supplied here, and this
    # is the call that raises the tapply() error reported below.
    united <- bind_tf_idf(united, bigram, n)

However, I am getting this error: "Error in tapply(n, documents, sum) : arguments must have same length"

What could be wrong in my usage of bind_tf_idf?


Solution

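  • bind_tf_idf takes three column arguments: 'term', 'document' and 'n'. In the call above only two are supplied, so 'n' is matched to 'document' and the count argument is left missing, which causes the error. We can create the missing 'document' column: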

    # bind_tf_idf() needs a document column, so tag every row with the book
    # name and pass it explicitly. Note: with only one document, idf is
    # ln(1 / 1) = 0 for every term, so tf_idf will be 0 as well.
    out <- bind_tf_idf(
      mutate(united, book = "sensesensibility"),
      term = bigram,
      document = book,
      n = n
    )
    

    If we need to treat each chapter as a separate document, create a 'chapter' column by checking for the word 'chapter' in the original 'text' data.frame:

    library(stringr)

    # Number the chapters: every line containing "chapter" (any case) starts a
    # new chapter, and cumsum() carries that number over the lines that follow.
    # NOTE(review): the pattern is unanchored, so a prose line that merely
    # mentions "chapter" would also bump the counter -- confirm against the text.
    text <- text %>%
      mutate(chapter = cumsum(str_detect(
        text,
        regex("chapter", ignore_case = TRUE)
      )))

    # Re-tokenise so that every bigram row carries its chapter number.
    tidy_text <- text %>%
      unnest_tokens(bigram, text, token = "ngrams", n = 2)

    # Repeat the split and stop-word filter from the question on the new
    # tidy_text; otherwise tidy_text_filtered is the old object without the
    # chapter column and the count() below cannot find it.
    tidy_text <- tidy_text %>%
      separate(bigram, c("word1", "word2"), sep = " ")

    tidy_text_filtered <- tidy_text %>%
      filter(!word1 %in% stop_words$word) %>%
      filter(!word2 %in% stop_words$word)

    # Count each word pair within its chapter.
    trigram_count <- tidy_text_filtered %>%
      count(chapter, word1, word2, sort = TRUE)

    # Rebuild the bigram string and drop pairs seen only once in a chapter.
    united <- trigram_count %>%
      unite(bigram, word1, word2, sep = " ") %>%
      filter(n > 1)

    # Each chapter is now a document, so all three columns can be supplied.
    out <- united %>%
      bind_tf_idf(bigram, document = chapter, n)
    head(out)
    #  chapter          bigram  n        tf       idf    tf_idf
    #1      21        sir john 12 0.2068966 0.9162907 0.1895774
    #2      21    miss steeles 11 0.1896552 2.1202635 0.4021189
    #3       9        sir john  9 0.6000000 0.9162907 0.5497744
    #4      13        sir john  9 0.3750000 0.9162907 0.3436090
    #5      23  lady middleton  9 0.4090909 1.0788097 0.4413312
    #6      40 colonel brandon  9 0.4736842 0.6931472 0.3283329