Search code examples
rdataframeranking

How compute ranks/percentiles for many columns in dataframe, while filtering each column for criteria


We have to compute percentiles for 100 columns in an data frame. In the example below, the column names that need percentiles are pctile_columns. The criteria for receiving percentiles is (1) the column is not NA, and (2) the min_pg column is >= 12. We are struggling to obtain the correct set of percentiles:

Data + Attempt

temp_df = structure(list(group_var = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 
                         min_pg = c(11, 15, 19, 7, 5, 34, 32, 27, 24, 18, 13, 10), 
                         stat1 = c(0.35, 0.32, 0.27, NA, NA, 0.42, 0.45, 0.47, 0.33, NA, 0.24, 0.39)), 
                    row.names = c(NA, -12L), class = "data.frame")

library(dplyr)
pctile_columns <- c('stat1') 
temp_output <- temp_df %>%
  group_by(group_var) %>%
  mutate(across(.cols = all_of(pctile_columns),
                .fns = ~ if_else(is.na(.) | min_pg < 12, as.numeric(NA),
                                 rank(., ties.method = "max")), 
                .names = "{.col}__rank")) %>%
  mutate(across(.cols = all_of(pctile_columns),
                .fns = ~ if_else(is.na(.) | min_pg < 12, as.numeric(NA),
                                 round((rank(., ties.method = "max") - 1) / (n() - 1) * 100, 0)),
                .names = "{.col}__pctile")) 

Output

# Groups:   group_var [1]
   group_var min_pg stat1 stat1__rank stat1__pctile
       <dbl>  <dbl> <dbl>       <dbl>         <dbl>
 1         1     11  0.35          NA            NA
 2         1     15  0.32           3            18
 3         1     19  0.27           2             9
 4         1      7 NA             NA            NA
 5         1      5 NA             NA            NA
 6         1     34  0.42           7            55
 7         1     32  0.45           8            64
 8         1     27  0.47           9            73
 9         1     24  0.33           4            27
10         1     18 NA             NA            NA
11         1     13  0.24           1             0
12         1     10  0.39          NA            NA

The problem with this output is that the ranks go from 1-9, whereas they should go from 1-7. Even though the stat1 values with min_pg < 12 are correctly being assigned an NA value, these stat1 values are still being factored into the rank equation when computing the ranks for all of the other rows. The correct set of ranks should be 1-7 in this instance, as there are 7 metrics that meet the criteria for stat1 to receive a rank/percentile.

How can we revise our code to compute ranks/percentiles properly per our criteria?


Solution

  • You could write a statfun and use it in by.

    > statfun <- \(x, stat) {
    +   rk <- \(x, m, z=12) rank(replace(x, m < z, NA), 'keep', 'max')  ## rank fun
    +   pctl <- \(x) round((x - 1L)/length(na.omit(x) - 1)*100L)  ## perc fun
    +   o <- lapply(stat, \(s) {
    +     r <- with(x, rk(get(s), x$min_pg))
    +     p <- pctl(r)
    +     data.frame(r, p) |> setNames(paste(s, c('rank', 'percentile'), sep='_'))
    +   })
    +   cbind(x, o)
    + }
    > by(temp_df, ~group_var, statfun, stat=c('stat1', 'stat2')) |> do.call(what='rbind')
         group_var min_pg stat1 stat2 stat1_rank stat1_percentile stat2_rank stat2_percentile
    1.1          1     11  0.35    NA         NA               NA         NA               NA
    1.2          1     15  0.32  0.45          3               29          3               29
    1.3          1     19  0.27  0.89          2               14          5               14
    1.4          1      7    NA    NA         NA               NA         NA               NA
    1.5          1      5    NA  0.27         NA               NA         NA               NA
    1.6          1     34  0.42  0.63          5               57          4               57
    1.7          1     32  0.45    NA          6               71         NA               71
    1.8          1     27  0.47  0.24          7               86          1               86
    1.9          1     24  0.33    NA          4               43         NA               43
    1.10         1     18    NA  0.27         NA               NA          2               NA
    1.11         1     13  0.24    NA          1                0         NA                0
    1.12         1     10  0.39  0.43         NA               NA         NA               NA
    2.13         2     11  0.35  0.42         NA               NA         NA               NA
    2.14         2     12  0.31    NA          2               29         NA               29
    2.15         2     13  0.27  0.47          1               14          5               14
    2.16         2      6    NA  0.45         NA               NA         NA               NA
    2.17         2      5    NA  0.39         NA               NA         NA               NA
    2.18         2     31  0.43  0.45          3               57          4               57
    2.19         2     22  0.45  0.35          5               71          3               71
    2.20         2     29  0.45  0.27          5               86          1               86
    2.21         2     24  0.63  0.31          6               43          2               43
    2.22         2     11    NA  0.35         NA               NA         NA               NA
    2.23         2     11  0.27  0.32         NA                0         NA                0
    2.24         2      9  0.89  0.33         NA               NA         NA               NA
    

    Data:

    > dput(temp_df)
    structure(list(group_var = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
    ), min_pg = c(11L, 15L, 19L, 7L, 5L, 34L, 32L, 27L, 24L, 18L, 
    13L, 10L, 11L, 12L, 13L, 6L, 5L, 31L, 22L, 29L, 24L, 11L, 11L, 
    9L), stat1 = c(0.35, 0.32, 0.27, NA, NA, 0.42, 0.45, 0.47, 0.33, 
    NA, 0.24, 0.39, 0.35, 0.31, 0.27, NA, NA, 0.43, 0.45, 0.45, 0.63, 
    NA, 0.27, 0.89), stat2 = c(NA, 0.45, 0.89, NA, 0.27, 0.63, NA, 
    0.24, NA, 0.27, NA, 0.43, 0.42, NA, 0.47, 0.45, 0.39, 0.45, 0.35, 
    0.27, 0.31, 0.35, 0.32, 0.33)), class = "data.frame", row.names = c(NA, 
    -24L))