Search code examples
r machine-learning dplyr nlp tidytext

Modifying the sentiment of certain words in tidytext get_sentiments()


I am trying to modify the sentiment of a few specific words in my df to make them more suitable for my context, where they were used with a negative connotation but have been classified as having a positive sentiment. The words are "talent" and "prefer".

Here is my code:

# Load packages
# library() stops with an error when a package is missing, whereas require()
# only warns and returns FALSE, deferring the failure to the first call that
# needs the package.
library(dplyr)
library(ggplot2)
library(readxl)
library(tidytext)
library(writexl)

data example:

dput(sentiment_words[1:20,c(7,8,9)])

data output:

structure(list(word = c("talent", "prefer", "lies", "hard", "worsen", 
"addicts", "obnoxious", "unbearable", "sickening", "irritating", 
"weird", "inconsiderate", "weird", "overwhelming", "issue", "complaints", 
"confined", "love", "confined", "idiots"), sentiment = c("positive", 
"positive", "negative", "negative", "negative", "negative", "negative", 
"negative", "negative", "negative", "negative", "negative", "negative", 
"negative", "negative", "negative", "negative", "positive", "negative", 
"negative"), count = c(79L, 3L, 53L, 316L, 2L, 2L, 3L, 2L, 2L, 
7L, 24L, 2L, 24L, 2L, 198L, 21L, 4L, 52L, 4L, 19L)), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -20L), groups = structure(list(
    word = c("addicts", "complaints", "confined", "ftw", "hard", 
    "idiots", "inconsiderate", "irritating", "issue", "lies", 
    "lost", "love", "obnoxious", "overwhelming", "sickening", 
    "unbearable", "weird", "worsen"), .rows = structure(list(
        6L, 16L, c(17L, 19L), 2L, 4L, 20L, 12L, 10L, 15L, 3L, 
        1L, 18L, 7L, 14L, 9L, 8L, c(11L, 13L), 5L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -18L), .drop = TRUE))
 ###### Sentiment Analysis by Word ######
## Using "TIDYTEXT" sentiment dictionary
# Tokenise each post into one word per row, drop stop words, and keep only the
# words found in the Bing lexicon (the inner join adds the `sentiment` column).
# `by = "word"` is spelled out so the joins can never silently match on some
# other shared column and do not print "Joining with ..." messages.
sentiment_words <- df |>
  tidytext::unnest_tokens(output = "word", input = "post") |>
  dplyr::anti_join(tidytext::stop_words, by = "word") |>
  dplyr::inner_join(tidytext::get_sentiments("bing"), by = "word")

# Most frequent sentiment-bearing words (printed for inspection only)
sentiment_words |>
  dplyr::count(word, sort = TRUE)

# Check the most common positive and negative words
# Attach each word's total frequency as `count`; the result stays grouped by
# `word`.
sentiment_words <- sentiment_words |>
  dplyr::group_by(word) |>
  dplyr::mutate(count = dplyr::n())

# `sentiment_words` already carries `sentiment` from the join above, so the
# tally is taken directly from it. Previously a misplaced parenthesis piped the
# lexicon itself into count() inside inner_join(), so the corpus words were
# never tallied. Ungroup first so the result is a plain, ungrouped tibble.
bing_word_counts <- sentiment_words |>
  dplyr::ungroup() |>
  dplyr::count(word, sentiment, sort = TRUE)

Solution

  • get_sentiments("bing") returns a regular tibble with 2 string columns that you can filter and wrangle as you see fit:

    library(tidytext)
    library(dplyr)
    library(stringr)
    
    get_sentiments("bing")
    #> # A tibble: 6,786 × 2
    #>    word        sentiment
    #>    <chr>       <chr>    
    #>  1 2-faces     negative 
    #>  2 abnormal    negative 
    #>  3 abolish     negative 
    #>  4 abominable  negative 
    #>  5 abominably  negative 
    #>  6 abominate   negative 
    #>  7 abomination negative 
    #>  8 abort       negative 
    #>  9 aborted     negative 
    #> 10 aborts      negative 
    #> # ℹ 6,776 more rows
    
    # Bing lexicon with "talent" and "prefer" overridden to negative;
    # every other word keeps its original label
    sentiments_mod <- mutate(
      get_sentiments("bing"),
      sentiment = if_else(
        word %in% c("talent", "prefer"),
        "negative",
        sentiment
      )
    )
    
    

    There is no magic involved, though: only exact matches are changed, so "prefers" and "talents" are still classified as positive, which may or may not be what you are after:

    # List every lexicon entry that begins with "talent" or "prefer"
    sentiments_mod |>
      filter(str_starts(word, "talent|prefer"))
    
    #> # A tibble: 10 × 2
    #>    word       sentiment
    #>    <chr>      <chr>    
    #>  1 prefer     negative 
    #>  2 preferable positive 
    #>  3 preferably positive 
    #>  4 prefered   positive 
    #>  5 preferes   positive 
    #>  6 preferring positive 
    #>  7 prefers    positive 
    #>  8 talent     negative 
    #>  9 talented   positive 
    #> 10 talents    positive
    

    Once you have applied all the required modifications to your sentiment table, use it (sentiments_mod) in place of get_sentiments("bing") in your workflow:

    # Rebuild the question's 20 example words as a single-row tibble with one
    # `post` column
    df <- tibble(post = "talent prefer lies hard worsen addicts obnoxious 
                 unbearable sickening irritating weird inconsiderate weird 
                 overwhelming issue complaints confined love confined idiots")
    # Same pipeline as in the question, with the modified lexicon
    # (sentiments_mod) taking the place of get_sentiments("bing")
    df |>
      unnest_tokens(output="word", input="post") |>
      anti_join(stop_words)|>
      inner_join(sentiments_mod)
    #> Joining with `by = join_by(word)`
    #> Joining with `by = join_by(word)`
    #> # A tibble: 20 × 2
    #>    word          sentiment
    #>    <chr>         <chr>    
    #>  1 talent        negative 
    #>  2 prefer        negative 
    #>  3 lies          negative 
    #>  4 hard          negative 
    #>  5 worsen        negative 
    #>  6 addicts       negative 
    #>  7 obnoxious     negative 
    #>  8 unbearable    negative 
    #>  9 sickening     negative 
    #> 10 irritating    negative 
    #> 11 weird         negative 
    #> 12 inconsiderate negative 
    #> 13 weird         negative 
    #> 14 overwhelming  negative 
    #> 15 issue         negative 
    #> 16 complaints    negative 
    #> 17 confined      negative 
    #> 18 love          positive 
    #> 19 confined      negative 
    #> 20 idiots        negative
    

    Created on 2023-11-28 with reprex v2.0.2