Search code examples
rggplot2geom-bargeom-textgeom-col

Can I add grouping line labels above my ggplot bar/column chart?


I'm interested in adding grouping labels above my ggplot bar charts. This feature exists for data visualizations such as phylogenetic trees (in ggtree), but I haven't found a way to do it in ggplot.

I've tried toying around with geom_text, and geom_label, but I haven't had success yet. Perhaps there's another package that enables this functionality? I've attached some example code that should be fully reproducible. I'd like the rating variable to go over the bars of the continents listed (spanning multiple continents).

Any help is greatly appreciated! Thank you!

P.S. pardon all the comments - I was writing a teaching tutorial.

#load necessary packages
library(tidyverse)
library(stringr)
library(hrbrthemes)
library(scales)

#load data
covid<- read_csv("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv", na = ".")  

#this makes a new dataframe (total_cases) that only has the latest COVID cases count and location data
total_cases <- covid %>% filter(date == "2021-05-23") %>% 
  group_by(location, total_cases) %>% 
  summarize()

#get number for world total cases. 
world <- total_cases %>%
  filter(location == "World") %>%
  select(total_cases)

#make new column that has the proportion of total world cases (number was total on that day)
total_cases$prop_total <- total_cases$total_cases/world$total_cases

#this specifies what the continents are so we can filter them out with dplyr
continents <- c("North America", "South America", "Antarctica", "Asia", "Europe", "Africa", "Australia")

#Using dyplr, we're choosing total_cases pnly for the continents
contin_cases <- total_cases %>%
  filter(location %in% continents)

#Loading a colorblind accessible palette
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

#Add a column that rates proportion of cases categorically.   
contin_cases <- contin_cases %>% 
  mutate(rating = case_when(prop_total <= 0.1 ~ 'low',
                            prop_total <= 0.2 ~ 'medium',
                            prop_total <= 1 ~ 'high'))

#Ploting it on a bar chart. 
plot1 <- ggplot(contin_cases, 
           aes(x = reorder(location, prop_total),
               y = prop_total,
               fill = location)) +
  geom_bar(stat="identity", color="white") +
  ylim(0, 1) +
  geom_text(aes(y = prop_total,
                label = round(prop_total, 4)),
            vjust = -1.5) +
  scale_fill_manual(name = "Continent", 
                    values = cbbPalette) +
  labs(title = "Proportion of total COVID-19 Cases Per Continent", 
       caption ="Figure 1. Asia leads total COVID case count as of May 23rd, 2021. No data exists in this dataset for Antarctica.") +
  ylab("Proportion of total cases") +
  xlab("") + #this makes x-axis blank
  theme_classic()+
    theme(
    plot.caption = element_text(hjust = 0, face = "italic"))

plot1

Here's something similar to what I'm trying to achieve:

bar chart showing total covid cases by continent as of May 2021


Solution

  • One approach to achieve your desired result would be via geom_segment. To this end I first prepare a dataset containing the start and end positions of the segments to be put on top of the bars by rating group. Basically this involves converting the discrete locations to numerics.

    Afterwards it's pretty straightforward to add the segments and the labels.

    library(tidyverse)
    library(hrbrthemes)
    library(scales)
    
    # Loading a colorblind accessible palette
    cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
    
    width <- .45 # Half of default width of bars
    df_segment <- contin_cases %>% 
      ungroup() %>% 
      # Convert location to numerics
      mutate(loc_num = as.numeric(fct_reorder(location, prop_total))) %>%
      group_by(rating) %>% 
      summarise(x = min(loc_num) - width, xend = max(loc_num) + width,
                y = max(prop_total) * 1.5, yend = max(prop_total) * 1.5)
    
    ggplot(
      contin_cases,
      aes(
        x = reorder(location, prop_total),
        y = prop_total,
        fill = location
      )
    ) +
      geom_bar(stat = "identity", color = "white") +
      ylim(0, 1) +
      geom_segment(data = df_segment, aes(x = x, xend = xend, y = max(y), yend = max(yend), 
                                          color = rating, group = rating), 
                   inherit.aes = FALSE, show.legend = FALSE) +
      geom_text(data = df_segment, aes(x = .5 * (x + xend), y = max(y), label = str_to_title(rating), color = rating), 
                vjust = -.5, inherit.aes = FALSE, show.legend = FALSE) +
      geom_text(aes(
        y = prop_total,
        label = round(prop_total, 4)
      ),
      vjust = -1.5
      ) +
      scale_fill_manual(
        name = "Continent",
        values = cbbPalette
      ) +
      labs(
        title = "Proportion of total COVID-19 Cases Per Continent",
        caption = "Figure 1. Asia leads total COVID case count as of May 23rd, 2021. No data exists in this dataset for Antarctica."
      ) +
      ylab("Proportion of total cases") +
      xlab("") + # this makes x-axis blank
      theme_classic() +
      theme(
        plot.caption = element_text(hjust = 0, face = "italic")
      )
    

    DATA

    contin_cases <- structure(list(location = c(
      "Africa", "Asia", "Australia", "Europe",
      "North America", "South America"
    ), total_cases = c(
      4756650, 49204489,
      30019, 46811325, 38790782, 27740153
    ), prop_total = c(
      0.0284197291646085,
      0.293983843894959, 0.000179355607369132, 0.2796853202015, 0.231764691226676,
      0.165740097599109
    ), rating = c(
      "low", "high", "low", "high",
      "high", "medium"
    )), class = c(
      "grouped_df", "tbl_df", "tbl",
      "data.frame"
    ), row.names = c(NA, -6L), groups = structure(list(
      location = c(
        "Africa", "Asia", "Australia", "Europe", "North America",
        "South America"
      ), .rows = structure(list(
        1L, 2L, 3L, 4L,
        5L, 6L
      ), ptype = integer(0), class = c(
        "vctrs_list_of",
        "vctrs_vctr", "list"
      ))
    ), row.names = c(NA, -6L), class = c(
      "tbl_df",
      "tbl", "data.frame"
    ), .drop = TRUE))