Search code examples
rshinyrtweetwordcloud2

Chinese symbols in shiny wordcloud


I'm currently creating a shiny app that load recent Dutch tweets concerning the corona virus, and on another tab I want to display a wordcloud with the most frequently used words.

The table works fine, but the wordcloud shows mainly chinese signs. I was thinking that it may be smileys used in the tweets, but that doesn't seem to be the case.

The code that i've written:

library(tidyverse)
library(shiny)
library(rtweet)
library(dplyr)
library(glue)
library(reactable)
library(purrr)
library(wordcloud2)
library(tidytext)
library(tm)

make_url_html <- function(url) {
  if(length(url) < 2) {
    if(!is.na(url)) {
      as.character(glue("<a title = {url} target = '_new' href = '{url}'>{url}</a>") )
    } else {
      ""
    }
  } else {
    paste0(purrr::map_chr(url, ~ paste0("<a title = '", .x, "' target = '_new' href = '", .x, "'>", .x, "</a>", collapse = ", ")), collapse = ", ")
  }
}


# UI page instellen
ui <- fluidPage(
titlePanel("Corona op twitter"),
h4("Meest gebruikte woorden omtrent populaire COVID-19 hashtags op de Nederlandse twitter"),
tabsetPanel(
  #Eerste tab bevat de twitter tabel
  tabPanel(
    title = "Zoek tweets",
    sidebarLayout(
      sidebarPanel(
        # Radiobuttons voor de hastags
        radioButtons(
          inputId = "hashtag_to_search",
          label = "Kies hashtag",
          choices = c("#coronavirus" = "#coronavirus", "#coronahulp" = "#coronahulp")
        ),
        #Slider voor het aantal tweets
        sliderInput("num_tweets_to_download",
                    "Aantal tweets:",
                    min = 1,
                    max = 100,
                    value = 50)
      ),
      mainPanel(
        reactableOutput("tweet_table")
      )
    )
  ),
  tabPanel(
    # Tweede tab bevat de wordcloud
    title = "Wordcloud",
    sidebarLayout(
      sidebarPanel(
        radioButtons(
          inputId = "hashtag",
          label = "Choose hashtag",
          choices = c("#coronavirus" = "virus", "#coronahulp" = "hulp")
        ),
        sliderInput("num",
                    "Number of words:",
                    min = 1,
                    max = 100,
                    value = 50)
      ),

      # Show a plot of the generated distribution
      mainPanel(
        wordcloud2Output("cloud", width = "100%", height = "800px"),
        reactableOutput("table")
      )
    )
  )
)   
)


# Server met tabel en wordcloud
server <- function(input, output) {

  # Data inladen
  tweet_df <- reactive({
    search_tweets(paste("lang:nl", input$hashtag_to_search), n = input$num_tweets_to_download, include_rts = FALSE)
  })

  # data schoonmaken
  word <- c("we", "coronavirus", "nl", "nederland", "https",  stopwords("nl"))
  new_stopwords_df <- data.frame(word)

  tweet_clean <- reactive({
    req(tweet_df())
    tweet_df() %>%
      mutate(text = lapply(text, tolower),
             text = str_replace_all(text, "https://t.co/[a-z,A-Z,0-9]*", ""),
             text = str_replace(text,"RT @[a-z,A-Z,0-9,_]*: ",""),
             text = str_replace_all(text,"#[a-z,A-Z]*",""),
             text = str_replace_all(text,"@[a-z,A-Z]*",""),
             text = str_replace_all(text,"\\b[a-zA-Z]{1}\\b",""),
             text = str_replace_all(text,"[:digit:]",""),
             text = str_replace_all(text,"[^[:alnum:] ]",""),
             text = str_replace_all(text," "," ")) %>%
      select(status_id, text) %>% unnest_tokens(word,text) %>%
      anti_join(new_stopwords_df, by = "word") %>% drop_na(word)
  })

  tweet_clean_freq <- reactive({
    req(tweet_clean())
    tweet_clean() %>%
      group_by(word) %>%
      summarise(freq =n()) %>%
      arrange(desc(freq)) %>%
      head(data, n = 50)
  })

   output$table <- renderReactable({reactable(tweet_clean())})
   output$cloud <- renderWordcloud2({
     wordcloud2(data = tweet_clean_freq()
     )
   })

   # Tabel
   tweet_table_data <- reactive({
     req(tweet_df())
     tweet_df() %>%
       select(user_id, status_id, created_at, screen_name, text, favorite_count, retweet_count, urls_expanded_url) %>%
       mutate(
         Tweet = glue::glue("{text} <a href='https://twitter.com/{screen_name}/status/{status_id}'>>> </a>"),
         URLs = purrr::map_chr(urls_expanded_url, make_url_html)
       )%>%
       select(DateTime = created_at, User = screen_name, Tweet, Likes = favorite_count, RTs = retweet_count, URLs)
   })

   output$tweet_table <- renderReactable({
     reactable::reactable(tweet_table_data(), 
                          filterable = TRUE, searchable = TRUE, bordered = TRUE, striped = TRUE, highlight = TRUE,
                          showSortable = TRUE, defaultSortOrder = "desc", defaultPageSize = 25, showPageSizeOptions = TRUE, pageSizeOptions = c(25, 50, 75, 100, 200), 
                          columns = list(
                            DateTime = colDef(defaultSortOrder = "asc"),
                            User = colDef(defaultSortOrder = "asc"),
                            Tweet = colDef(html = TRUE, minWidth = 190, resizable = TRUE),
                            Likes = colDef(filterable = FALSE, format = colFormat(separators = TRUE)),
                            RTs = colDef(filterable =  FALSE, format = colFormat(separators = TRUE)),
                            URLs = colDef(html = TRUE)
                          )
     )
   })

}


# Applicatie
shinyApp(ui = ui, server = server)

I've tried to check what the problem is by adding a table under the wordcloud, but there it also shows chinese symbols. When I try my code outside of shiny context (and without reactive aspects), it seems to work fine.

Btw: I know I've not connected the radiobuttons yet, I want to get the wordcloud working first.

Thanks!


Solution

  • Found the problem, I didn't remove emoticons from the text.

    I added this line of code;

    text = sapply(text,function(row) iconv(row, "latin1", "ASCII", sub="")))
    

    to the mutate function and that solved the issue.