Search code examples
rdplyrtidyversedata-cleaning

dplyr filter out groups in which the max value (per group) is below the top-3 max-values (per group)


So I have this dataframe:

structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 
4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 
6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 
8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9), year = c("2017", "2018", 
"2019", "2020", "2021", "2022", "2023", "2024", "2025", "2026", 
"2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024", 
"2025", "2026", "2017", "2018", "2019", "2020", "2021", "2022", 
"2023", "2024", "2025", "2026", "2017", "2018", "2019", "2020", 
"2021", "2022", "2023", "2024", "2025", "2026", "2017", "2018", 
"2019", "2020", "2021", "2022", "2023", "2024", "2025", "2026", 
"2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024", 
"2025", "2026", "2017", "2018", "2019", "2020", "2021", "2022", 
"2023", "2024", "2025", "2026", "2017", "2018", "2019", "2020", 
"2021", "2022", "2023", "2024", "2025", "2026", "2017", "2018", 
"2019", "2020", "2021", "2022", "2023", "2024", "2025", "2026"
), volume = c(0.0013, 0.0013, 0.0012579, 0.0011895, 0.0011421, 
0.0010842, 0.0010211, 0.0010158, 0.00099474, 0.00092632, 0.07878, 
0.078791, 0.077295, 0.076638, 0.075538, 0.074468, 0.074776, 0.074051, 
0.071706, 0.068056, 0.023269, 0.023011, 0.022374, 0.021962, 0.021408, 
0.020949, 0.020811, 0.020354, 0.019309, 0.018042, 0.0004, 0.0004, 
0.00038421, 0.00035263, 0.00033158, 0.00032105, 0.00026842, 0.00028421, 
0.00026842, 0.00024211, 0.0002, 0.0001, 0.00011579, 0, 0, 0, 
0, 0, 0, 0, 0.028422, 0.028361, 0.027768, 0.027501, 0.027029, 
0.02651, 0.026588, 0.026209, 0.025094, 0.023391, 0.0001, 0.0001, 
0, 0, 0, 0, 0, 0, 0, 0, 0.0047, 0.0047158, 0.0048368, 0.0048316, 
0.0049263, 0.0049737, 0.0049947, 0.0051684, 0.0052526, 0.0051842, 
0.0106, 0.010389, 0.010279, 0.010005, 0.0098421, 0.0096368, 0.0094053, 
0.0093368, 0.0092526, 0.0089316)), class = c("tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -90L))

Which looks like this:

# A tibble: 6 × 3
     id year   volume
  <dbl> <chr>   <dbl>
1     1 2017  0.0013 
2     1 2018  0.0013 
3     1 2019  0.00126
4     1 2020  0.00119
5     1 2021  0.00114
6     1 2022  0.00108

Id has 9 distinct IDs, each with 10 rows. Now I would like to find the maximum value for the column volume and then filter out the groups (or just make an extra column like inTop3 ) that highlights those IDs which are in the top-3 highest volume values.

This could mean that the largest 3 values are within the group with ID = 2. But I really only want to compare the maximum value of each group with the maximum value of each other group.

Getting the maximum value per group is trivial:

df %>% 
  group_by(id) %>% 
  mutate(
    m = max(volume)
  ) 

But then I am a little lost how to go on. Especially I wonder how I could create a boolean column that indicates wheter a group is in the top-3 or not.


Solution

  • Another possible solution:

    library(dplyr)
    
    df %>% 
      group_by(id) %>% 
      summarise(m = max(volume)) %>% 
      slice_max(m, n = 3)
    
    #> # A tibble: 3 × 2
    #>      id      m
    #>   <dbl>  <dbl>
    #> 1     2 0.0788
    #> 2     6 0.0284
    #> 3     3 0.0233
    

    To get the entire group for each of the 3 max-values:

    library(tidyverse)
    
    df %>% 
      group_by(id) %>% 
      summarise(m = max(volume)) %>% 
      slice_max(m, n = 3) %>% 
      group_split(id) %>% 
      map(~ inner_join(df, .x, by = "id"))
    
    #> [[1]]
    #> # A tibble: 10 × 4
    #>       id year  volume      m
    #>    <dbl> <chr>  <dbl>  <dbl>
    #>  1     2 2017  0.0788 0.0788
    #>  2     2 2018  0.0788 0.0788
    #>  3     2 2019  0.0773 0.0788
    #>  4     2 2020  0.0766 0.0788
    #>  5     2 2021  0.0755 0.0788
    #>  6     2 2022  0.0745 0.0788
    #>  7     2 2023  0.0748 0.0788
    #>  8     2 2024  0.0741 0.0788
    #>  9     2 2025  0.0717 0.0788
    #> 10     2 2026  0.0681 0.0788
    #> 
    #> [[2]]
    #> # A tibble: 10 × 4
    #>       id year  volume      m
    #>    <dbl> <chr>  <dbl>  <dbl>
    #>  1     3 2017  0.0233 0.0233
    #>  2     3 2018  0.0230 0.0233
    #>  3     3 2019  0.0224 0.0233
    #>  4     3 2020  0.0220 0.0233
    #>  5     3 2021  0.0214 0.0233
    #>  6     3 2022  0.0209 0.0233
    #>  7     3 2023  0.0208 0.0233
    #>  8     3 2024  0.0204 0.0233
    #>  9     3 2025  0.0193 0.0233
    #> 10     3 2026  0.0180 0.0233
    #> 
    #> [[3]]
    #> # A tibble: 10 × 4
    #>       id year  volume      m
    #>    <dbl> <chr>  <dbl>  <dbl>
    #>  1     6 2017  0.0284 0.0284
    #>  2     6 2018  0.0284 0.0284
    #>  3     6 2019  0.0278 0.0284
    #>  4     6 2020  0.0275 0.0284
    #>  5     6 2021  0.0270 0.0284
    #>  6     6 2022  0.0265 0.0284
    #>  7     6 2023  0.0266 0.0284
    #>  8     6 2024  0.0262 0.0284
    #>  9     6 2025  0.0251 0.0284
    #> 10     6 2026  0.0234 0.0284