Search code examples
rggplot2

Plotting stm model expected topic probability using ggplot2


I'm studying a online review dataset and using stm package from R for analyzing the data.

I've read a post which illustrates how to use stm package for studying text-based data and found it quite useful. Especially, I think the graphic below taken from the post might be useful for my study.

enter image description here

The post only reveals part of the code for this graphic.

topic_labels <- tribble(
  ~topic, ~category, ~color, 
  list(4, 8, 24, 50), "Anthropology/History", "#d4ae0b", 
  list(44, 48), "Journalism/Satire", "#3E7D49", 
  list(9, 17, 27, 30, 32), "Philosophy", "#c73200", 
  list(2, 3, 5, 11, 13, 15, 18, 20, 26, 28, 33, 34, 35, 39, 43), "Politics", "#de860b", 
  list(14, 23, 29, 31, 36, 37, 38, 42, 45, 47), "Political Economy", "#6F8FCF",
  list(21, 25, 41), "Military", "#b7a4d6", 
  list(1, 6, 7, 10, 12, 16, 19, 22, 40), "Sociology", "#8f1f3f", 
  list(46, 49), "Science/Math", "#767676") %>% 
  unnest(topic) %>% 
  unnest(topic) %>% 
  mutate(topic = factor(topic))

I've tried generating the code using my own dataset but failed. Here's my code:

library(stm)
library(tidyverse)
library(ggplot2)

# Step 1: Create a mapping of topics to categories
topic_categories <- c(
  "Price" = "11,21,25,7",
  "Services" = "1,9,5",
  "Environment" = "18,13,24,3",
  "Hygiene" = "26,19,4,23",
  "Personnel" = "20,6,15",
  "Values" = "14,17,22",
  "Perception" = "2,16,8,12",
  "Others" = "10"
)

# Step 2: Create a data frame with topic probabilities
topic_probabilities <- colMeans(stm26$theta)
topic_data <- data.frame(
  topic = 1:length(topic_probabilities),
  probability = topic_probabilities
)

# Step 3: Assign categories to topics
topic_data$category <- NA
for (cat in names(topic_categories)) {
  topics <- as.numeric(strsplit(topic_categories[cat], ",")[[1]])
  topic_data$category[topic_data$topic %in% topics] <- cat
}

# Step 4: Get top words for each topic
top_words <- labelTopics(stm26, n = 2)
topic_data$top_words <- apply(top_words$prob, 1, function(x) paste(x, collapse = ", "))

# Step 5: Assign colors to categories
category_colors <- c(
  "Price" = "#8f1f3f",
  "Services" = "#d4ae0b",
  "Environment" = "#de860b",
  "Hygiene" = "#6F8FCF",
  "Personnel" = "#c73200",
  "Values" = "#b7a4d6",
  "Perception" = "#3E7D49",
  "Others" = "#767676"
)

# Step 6: Create the plot
ggplot(topic_data, aes(y = reorder(topic, probability), x = probability, color = category)) +
  geom_segment(aes(x = 0, xend = probability, yend = reorder(topic, probability)), size = 0.5) +
  geom_point(size = 1) +
  geom_text(aes(label = top_words), hjust = 0, nudge_x = 0.002, size = 3) +
  scale_color_manual(values = category_colors) +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1), 
                     limits = c(0, 0.18)) +
  geom_vline(xintercept = seq(0.05, 0.15, by = 0.05), color = "lightgrey") +  
  facet_grid(category ~ ., scales = "free_y", space = "free_y", switch = "y") +
  theme_minimal() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_text(face = "bold", size = 8, color = "black", margin = margin(r = -25)), 
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank(),  
    panel.grid.minor.x = element_blank(),  
    legend.position = "none",
    strip.placement = "outside",
    strip.text.y.left = element_text(angle = 0, hjust = 1, face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, face = "italic"),
    plot.margin = margin(5.5, 40, 5.5, 5.5)  # Adjusted right margin to prevent text cutoff
  ) +
  labs(
    title = "Title",
    subtitle = "Subtitle",
    x = "Expected topic probability"
  )

The result is a little bit different from the graphic of the post. enter image description here

Firstly, I can't move the categories name to the top of topic line. Secondly, the grid line (in grey color) is intermittent not continuous.

Anyone has ideas how to modify the code to generate graphic which resembles to the one showed on the post?


Solution

  • Here in an option to achieve your desired result by switching to ggforce::facet_col to move the strip text to the top, uses ggtext to color the strip text and finally uses annotation_custom and clip="off" to get continuous grid lines without breaks.

    Using some fake random example data:

    library(tidyverse)
    library(ggtext)
    library(ggforce)
    
    set.seed(123)
    
    topic_data <- data.frame(
      topic = sample(LETTERS, 26),
      probability = runif(26, 0, .18),
      category = sample(names(category_colors), 26, replace = TRUE),
      top_words = sample(letters, 26)
    ) |>
      left_join(
        enframe(category_colors, name = "category", value = "color")
      ) |>
      mutate(
        category = factor(category, names(category_colors)),
        category_facet = glue::glue(
          "<span style='color: {color}'>{category}</span>"
        ),
        category_facet = reorder(category_facet, as.numeric(category))
      )
    
    ggplot(topic_data, aes(
      y = reorder(topic, probability),
      x = probability, color = category
    )) +
      lapply(
        seq(0.05, 0.15, 0.05),
        \(x) {
          annotation_custom(
            grid::segmentsGrob(
              y0 = unit(0, "npc"), y1 = unit(1, "npc") + unit(16, "pt"),
              gp = grid::gpar(
                col = "lightgrey"
              )
            ),
            xmin = x,
            xmax = x
          )
        }
      ) +
      geom_segment(aes(
        x = 0, xend = probability,
        yend = reorder(topic, probability)
      ), size = 0.5) +
      geom_point(size = 1) +
      geom_text(aes(label = top_words), hjust = 0, nudge_x = 0.002, size = 3) +
      scale_color_manual(values = category_colors) +
      scale_x_continuous(
        labels = scales::percent_format(accuracy = 1),
        limits = c(0, 0.18),
        expand = c(0, 0, .05, 0)
      ) +
      ggforce::facet_col(~category_facet,
        scales = "free_y",
        space = "free"
      ) +
      theme_minimal() +
      theme(
        axis.text.y = element_text(
          face = "bold", size = 8
        ),
        axis.ticks.y = element_blank(),
        panel.grid = element_blank(),
        legend.position = "none",
        strip.text.x = ggtext::element_markdown(
          hjust = 0, face = "bold", 
          size = 12,
          margin = margin(l = -13)
        ),
        strip.background = element_blank(),
        strip.clip = "off",
        plot.title = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5, face = "italic"),
        plot.margin = margin(5.5, 40, 5.5, 5.5)
      ) +
      labs(
        title = "Title",
        subtitle = "Subtitle",
        x = "Expected topic probability",
        y = NULL
      ) +
      coord_cartesian(clip = "off")
    

    enter image description here