Search code examples
r forcats

Why is fct_lump from forcats not working on my data?


I've been off R for a few months, so that might have had some consequences.

I found this dataset on the internet. I have cleaned it up a little, so I'll just dput() it here, but it originally came from https://ourworldindata.org/terrorism.

> dput(ter)
structure(list(region = c("Afghanistan", "Albania", "Algeria", 
"Angola", "Argentina", "Australasia & Oceania", "Australia", 
"Austria", "Azerbaijan", "Bahrain", "Bangladesh", "Belgium", 
"Brazil", "Burkina Faso", "Burundi", "Cameroon", "Canada", "Central African Republic", 
"Central America & Caribbean", "Central Asia", "Chad", "Chile", 
"China", "Colombia", "Cote d'Ivoire", "Czech Republic", "Democratic Republic of the Congo", 
"Djibouti", "Dominican Republic", "East Asia", "Eastern Europe", 
"Ecuador", "Egypt", "Ethiopia", "Finland", "France", "Gabon", 
"Georgia", "Germany", "Greece", "Honduras", "India", "Indonesia", 
"Iran", "Iraq", "Ireland", "Israel", "Italy", "Jamaica", "Jordan", 
"Kenya", "Kosovo", "Kyrgyzstan", "Laos", "Latvia", "Lebanon", 
"Liberia", "Libya", "Malawi", "Malaysia", "Maldives", "Mali", 
"Malta", "Mexico", "Middle East & North Africa", "Mozambique", 
"Myanmar", "Nepal", "Netherlands", "Niger", "Nigeria", "North America", 
"Macedonia", "Norway", "Pakistan", "Palestine", "Papua New Guinea", 
"Paraguay", "Peru", "Philippines", "Poland", "Russia", "Rwanda", 
"Saudi Arabia", "Serbia", "Sierra Leone", "Somalia", "South Africa", 
"South America", "South Asia", "South Sudan", "Southeast Asia", 
"Spain", "Sri Lanka", "Sub-Saharan Africa", "Sudan", "Sweden", 
"Syria", "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Tunisia", 
"Turkey", "Uganda", "Ukraine", "UK", "USA", "Venezuela", "Vietnam", 
"Western Europe", "World", "Yemen", "Zambia", "Zimbabwe"), Code = c("AFG", 
"ALB", "DZA", "AGO", "ARG", NA, "AUS", "AUT", "AZE", "BHR", "BGD", 
"BEL", "BRA", "BFA", "BDI", "CMR", "CAN", "CAF", NA, NA, "TCD", 
"CHL", "CHN", "COL", "CIV", "CZE", "COD", "DJI", "DOM", NA, NA, 
"ECU", "EGY", "ETH", "FIN", "FRA", "GAB", "GEO", "DEU", "GRC", 
"HND", "IND", "IDN", "IRN", "IRQ", "IRL", "ISR", "ITA", "JAM", 
"JOR", "KEN", "OWID_KOS", "KGZ", "LAO", "LVA", "LBN", "LBR", 
"LBY", "MWI", "MYS", "MDV", "MLI", "MLT", "MEX", NA, "MOZ", "MMR", 
"NPL", "NLD", "NER", "NGA", NA, "MKD", "NOR", "PAK", "PSE", "PNG", 
"PRY", "PER", "PHL", "POL", "RUS", "RWA", "SAU", "SRB", "SLE", 
"SOM", "ZAF", NA, NA, "SSD", NA, "ESP", "LKA", NA, "SDN", "SWE", 
"SYR", "TWN", "TJK", "TZA", "THA", "TUN", "TUR", "UGA", "UKR", 
"GBR", "USA", "VEN", "VNM", NA, "OWID_WRL", "YEM", "ZMB", "ZWE"
), Year = c(2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017), `Terrorism fatalities (GTD, 2018)` = c(6092, 
0, 12, 7, 0, 4, 4, 2, 5, 6, 25, 2, 0, 53, 20, 228, 6, 601, 4, 
6, 62, 0, 16, 84, 3, 0, 596, 0, 2, 16, 101, 0, 877, 67, 2, 7, 
0, 0, 1, 0, 2, 465, 20, 39, 6476, 0, 3, 0, 0, 4, 126, 0, 0, 1, 
0, 17, 0, 289, 0, 4, 1, 361, 1, 23, 10819, 22, 218, 4, 0, 148, 
1805, 124, 0, 0, 1076, 50, 0, 4, 8, 496, 0, 61, 2, 31, 0, 0, 
1912, 21, 101, 7664, 581, 811, 21, 1, 6712, 82, 5, 2026, 0, 1, 
8, 72, 5, 222, 7, 40, 42, 95, 5, 0, 83, 26445, 762, 0, 0)), class = c("spec_tbl_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -115L), spec = structure(list(
    cols = list(Entity = structure(list(), class = c("collector_character", 
    "collector")), Code = structure(list(), class = c("collector_character", 
    "collector")), Year = structure(list(), class = c("collector_double", 
    "collector")), `Terrorism fatalities (GTD, 2018)` = structure(list(), class = c("collector_double", 
    "collector"))), default = structure(list(), class = c("collector_guess", 
    "collector")), skip = 1), class = "col_spec"))

I tried many things, each time reducing the code to the essentials. In the end, I used the example code from vignette("forcats") (I have tidyverse loaded), and tried it with different datasets:

# Vignette example on starwars; reported to work as expected.
starwars %>%
  mutate(skin_color = fct_lump(skin_color, n = 5)) %>%
  count(skin_color, sort = TRUE)
  
# Same call on the asker's data, applied to the `region` column. `hair` is
# only the name given to the new column. This is the call that does not lump:
# the result still has one row per region, each with n = 1.
ter %>%
  mutate(hair = fct_lump(region, n = 5)) %>%
  count(hair, sort = TRUE)

# Same call on gss_cat; reported to work as expected.
gss_cat %>%
  mutate(relig = fct_lump(relig, n = 5)) %>%
  count(relig, sort = TRUE)

It works as expected with both starwars and gss_cat, but not with ter (my data):

> ter %>%
+   mutate(hair = fct_lump(region, n = 5)) %>%
+   count(hair, sort = TRUE)
# A tibble: 115 x 2
   hair                      n
   <fct>                 <int>
 1 Afghanistan               1
 2 Albania                   1
 3 Algeria                   1
 4 Angola                    1
 5 Argentina                 1
 6 Australasia & Oceania     1
 7 Australia                 1
 8 Austria                   1
 9 Azerbaijan                1
10 Bahrain                   1
# … with 105 more rows

Why is this happening? Why isn't fct_lump() working here?


Solution

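  • fct_lump() lumps levels by how often they occur in the factor, not by the values of another column. In ter every region appears exactly once, so all levels are tied and nothing gets lumped (fct_lump() does accept a w argument to weight levels by another column). It looks like you wish to lump together regions with fewer than 5 fatalities into an "Other" category. This is straightforward in base R: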

    # Relabel every region with fewer than 5 fatalities as "Other", then store
    # the labels back as a factor. which() keeps only the rows where the
    # comparison is TRUE, so rows with a missing count keep their own label.
    ter$region <- factor(replace(
      as.character(ter$region),
      which(ter$`Terrorism fatalities (GTD, 2018)` < 5),
      "Other"
    ))
    

    If you wanted, you could then use forcats to reorder the factor levels according to the number of fatalities:

    # Reorder the region levels by fatality count; the level order controls
    # the order of the bars along the x axis.
    ter$region <- fct_reorder(
      .f = ter$region,
      .x = ter$`Terrorism fatalities (GTD, 2018)`
    )
    
    # Bar chart of fatalities per region, with the x-axis labels rotated by
    # 90 degrees.
    ggplot(
      data = ter,
      mapping = aes(x = region, y = `Terrorism fatalities (GTD, 2018)`)
    ) +
      geom_col() +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))
    

    enter image description here

    Or if you run the above code but lump together all regions with fewer than 500 fatalities, you get:

    enter image description here