I've been off R for a few months, so that might have had some consequences.
I found this dataset on the internet. I preprocessed it a bit, so I'll just dput()
it here, but it originally came from https://ourworldindata.org/terrorism.
> dput(ter)
structure(list(region = c("Afghanistan", "Albania", "Algeria",
"Angola", "Argentina", "Australasia & Oceania", "Australia",
"Austria", "Azerbaijan", "Bahrain", "Bangladesh", "Belgium",
"Brazil", "Burkina Faso", "Burundi", "Cameroon", "Canada", "Central African Republic",
"Central America & Caribbean", "Central Asia", "Chad", "Chile",
"China", "Colombia", "Cote d'Ivoire", "Czech Republic", "Democratic Republic of the Congo",
"Djibouti", "Dominican Republic", "East Asia", "Eastern Europe",
"Ecuador", "Egypt", "Ethiopia", "Finland", "France", "Gabon",
"Georgia", "Germany", "Greece", "Honduras", "India", "Indonesia",
"Iran", "Iraq", "Ireland", "Israel", "Italy", "Jamaica", "Jordan",
"Kenya", "Kosovo", "Kyrgyzstan", "Laos", "Latvia", "Lebanon",
"Liberia", "Libya", "Malawi", "Malaysia", "Maldives", "Mali",
"Malta", "Mexico", "Middle East & North Africa", "Mozambique",
"Myanmar", "Nepal", "Netherlands", "Niger", "Nigeria", "North America",
"Macedonia", "Norway", "Pakistan", "Palestine", "Papua New Guinea",
"Paraguay", "Peru", "Philippines", "Poland", "Russia", "Rwanda",
"Saudi Arabia", "Serbia", "Sierra Leone", "Somalia", "South Africa",
"South America", "South Asia", "South Sudan", "Southeast Asia",
"Spain", "Sri Lanka", "Sub-Saharan Africa", "Sudan", "Sweden",
"Syria", "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Tunisia",
"Turkey", "Uganda", "Ukraine", "UK", "USA", "Venezuela", "Vietnam",
"Western Europe", "World", "Yemen", "Zambia", "Zimbabwe"), Code = c("AFG",
"ALB", "DZA", "AGO", "ARG", NA, "AUS", "AUT", "AZE", "BHR", "BGD",
"BEL", "BRA", "BFA", "BDI", "CMR", "CAN", "CAF", NA, NA, "TCD",
"CHL", "CHN", "COL", "CIV", "CZE", "COD", "DJI", "DOM", NA, NA,
"ECU", "EGY", "ETH", "FIN", "FRA", "GAB", "GEO", "DEU", "GRC",
"HND", "IND", "IDN", "IRN", "IRQ", "IRL", "ISR", "ITA", "JAM",
"JOR", "KEN", "OWID_KOS", "KGZ", "LAO", "LVA", "LBN", "LBR",
"LBY", "MWI", "MYS", "MDV", "MLI", "MLT", "MEX", NA, "MOZ", "MMR",
"NPL", "NLD", "NER", "NGA", NA, "MKD", "NOR", "PAK", "PSE", "PNG",
"PRY", "PER", "PHL", "POL", "RUS", "RWA", "SAU", "SRB", "SLE",
"SOM", "ZAF", NA, NA, "SSD", NA, "ESP", "LKA", NA, "SDN", "SWE",
"SYR", "TWN", "TJK", "TZA", "THA", "TUN", "TUR", "UGA", "UKR",
"GBR", "USA", "VEN", "VNM", NA, "OWID_WRL", "YEM", "ZMB", "ZWE"
), Year = c(2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017), `Terrorism fatalities (GTD, 2018)` = c(6092,
0, 12, 7, 0, 4, 4, 2, 5, 6, 25, 2, 0, 53, 20, 228, 6, 601, 4,
6, 62, 0, 16, 84, 3, 0, 596, 0, 2, 16, 101, 0, 877, 67, 2, 7,
0, 0, 1, 0, 2, 465, 20, 39, 6476, 0, 3, 0, 0, 4, 126, 0, 0, 1,
0, 17, 0, 289, 0, 4, 1, 361, 1, 23, 10819, 22, 218, 4, 0, 148,
1805, 124, 0, 0, 1076, 50, 0, 4, 8, 496, 0, 61, 2, 31, 0, 0,
1912, 21, 101, 7664, 581, 811, 21, 1, 6712, 82, 5, 2026, 0, 1,
8, 72, 5, 222, 7, 40, 42, 95, 5, 0, 83, 26445, 762, 0, 0)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -115L), spec = structure(list(
cols = list(Entity = structure(list(), class = c("collector_character",
"collector")), Code = structure(list(), class = c("collector_character",
"collector")), Year = structure(list(), class = c("collector_double",
"collector")), `Terrorism fatalities (GTD, 2018)` = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
I tried many things, each time reducing the code to a smaller example. In the end, I used the example code from vignette("forcats") (I have tidyverse loaded), and tried it with different datasets:
# The same pipeline applied to three datasets: lump a column with
# fct_lump(n = 5), then count the rows per resulting level, sorted by count.
# 1) starwars, lumping skin_color
starwars %>%
mutate(skin_color = fct_lump(skin_color, n = 5)) %>%
count(skin_color, sort = TRUE)
# 2) ter (the terrorism data), lumping region; the result is stored in a
#    column named `hair`
ter %>%
mutate(hair = fct_lump(region, n = 5)) %>%
count(hair, sort = TRUE)
# 3) gss_cat, lumping relig
gss_cat %>%
mutate(relig = fct_lump(relig, n = 5)) %>%
count(relig, sort = TRUE)
It works as expected with both starwars and gss_cat, but not with ter (my data):
> ter %>%
+ mutate(hair = fct_lump(region, n = 5)) %>%
+ count(hair, sort = TRUE)
# A tibble: 115 x 2
hair n
<fct> <int>
1 Afghanistan 1
2 Albania 1
3 Algeria 1
4 Angola 1
5 Argentina 1
6 Australasia & Oceania 1
7 Australia 1
8 Austria 1
9 Azerbaijan 1
10 Bahrain 1
# … with 105 more rows
Why is this happening? Why isn't fct_lump() working here?
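fct_lump() lumps levels by how often they occur in the vector, and every region occurs exactly once in ter, so there is nothing for it to lump (its w argument can supply weights, such as the fatality counts, if you want to stay in forcats). It looks like you wish to lump together regions with fewer than 5 fatalities into an "Other" category. This is straightforward in base R: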
# Replace the region name with "Other" wherever the fatality count is below 5,
# then store the result as a factor. which() drops NA comparisons, so a row
# with a missing count keeps its original region name.
ter$region <- factor(replace(
  as.character(ter$region),
  which(ter$`Terrorism fatalities (GTD, 2018)` < 5),
  "Other"
))
If you want, you can then use forcats to reorder the levels by fatality count:
# Order the region levels by their total fatalities. geom_col() stacks rows
# that share an x value, so the "Other" bar shows the sum over the lumped
# regions; fct_reorder()'s default summary (median) would position that bar
# by its median rather than by its height.
ter$region <- fct_reorder(
  ter$region,
  ter$`Terrorism fatalities (GTD, 2018)`,
  .fun = sum
)
# Bar chart of fatalities per region, with the x-axis labels rotated to
# vertical so the region names do not overlap.
ggplot(ter, aes(region, `Terrorism fatalities (GTD, 2018)`)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Or, if you run the above code but lump together all regions with fewer than 500 fatalities, you get: