Search code examples
rgreatest-n-per-grouptidyr

In R, looking for better way to get the max value out of subgroups (groups within groups)


For a given DF containing groups X, Y, Z, I'd like to keep the rows containing the maximum value within each of a number of columns (A_Column, C_Column, N_Column) if there is a corresponding PI value at that row. For example, for group X, maximum values should be kept for PI values C and A from the respective columns. Below is my attempt - is there a shorter / more elegant way to achieve the same result?

My starting DF:

> DF
  Group           H PI PC A_Column C_Column N_Column
1     X       AA001  C  C        2    -0.10       22
2     X         A16  A  C        3    -0.12       13
3     X    A2015_01  C  C        5    -0.80       51
4     X         AA1  C  A        2    -0.30       32
5     Y   AAA-16-04  A  A        5    -0.20       15
6     Y    A01_2009  O  A        8    -0.40       28
7     Z        AA02  A  A       17    -0.30       12
8     Z AAD003-2014  A  N        3    -0.13       43
9     Z         AD4  N  N        5    -0.60       45

The output I end up with:

> DF_max
  Group         H PI PC A_Column C_Column N_Column
1     X     AA001  C  C        2    -0.10       22
2     X       A16  A  C        3    -0.12       13
5     Y AAA-16-04  A  A        5    -0.20       15
7     Z      AA02  A  A       17    -0.30       12
9     Z       AD4  N  N        5    -0.60       45

My code:

library(dplyr)
library(tidyverse)

# toy example to get the maximum values out of every group

Group <- c("X","X","X","X","Y","Y","Z","Z","Z")
H <-c("AA001","A16","A2015_01","AA1","AAA-16-04","A01_2009","AA02","AAD003-2014","AD4")
PI <- c("C","A","C","C","A","O","A","A","N")
PC <- c("C","C","C","A","A","A","A","N","N")
A_Column <- c(2,3,5,2,5,8,17,3,5)
C_Column <- -c(.1,.12,.8,.3,.2,.4,.3,.13,.6)
N_Column <- c(22,13,51,32,15,28,12,43,45)

DF <- data.frame(Group, H, PI, PC, A_Column, C_Column, N_Column)
DF

# tidy data column Values-Labels before using dplyr
gather_DF <- gather(DF, key = Col_labels, value = Obs, -Group, -H, -PI, -PC)
gather_DF

# look for value label matches within each group
gather_DF$Col_labels_match <- gather_DF$Col_labels 
map = setNames(c("A", "C", "N"), c("A_Column", "C_Column", "N_Column"))
gather_DF$Col_labels_match <- map[unlist(gather_DF$Col_labels_match)]

# get max values per group, where PI equals Col_labels
max_DF <- gather_DF %>% filter(Col_labels_match==PI) %>% group_by(Group, PI) %>% top_n(1, Obs)
max_ID <- unique(max_DF$H)
DF_max <- DF[which(DF$H %in% max_ID),] # pull max values out of DF to get original formatting
DF_max

UPDATE: The code below uses @Arun's data.table method - however an error comes up if PI is not placed in quotes (see below; I corrected using "PI" instead of PI) and the output does not have the right max for group gY (G=6 is coming up instead of G=17).

df <- structure(list(Group = c("gX", "gX", "gY", "gY", "gY", "gZ", 
                               "gW", "gW", "gV", "gV", "gV", "gT", "gR", "gR", "gR", "gR", "gR", 
                               "gS", "gQ", "gL"), PI = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 1L, 
                                                                   1L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 1L), .Label = c("C", 
                                                                                                                                   "G", "O"), class = "factor"), H = 1:20, PC = structure(c(2L, 
                                                                                                                                                                                            2L, 2L, 2L, 2L, 2L, 1L, 1L, 4L, 4L, 4L, 3L, 4L, 4L, 4L, 4L, 4L, 
                                                                                                                                                                                            2L, 2L, 1L), .Label = c("C", "G", "I", "O"), class = "factor"), 
                     C = c(NA, NA, NA, NA, NA, NA, 3, 1, NA, NA, NA, NA, NA, NA, 
                           NA, NA, NA, NA, NA, 2), I = c(NA, NA, NA, NA, NA, NA, NA, 
                                                         NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1), G = c(16, 
                                                                                                                   10, 6, 17, 12, 14, 13, 11, NA, NA, NA, 9, 5, 2, 15, 3, 1, 
                                                                                                                   7, 8, 4), N = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
                                                                                                                                   NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
                                                                                                                                   NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
                                                                                                                                   NA_real_, NA_real_, NA_real_), O = c(NA, NA, NA, NA, NA, 
                                                                                                                                                                        NA, NA, NA, 3, 2, 1, NA, 8, 4, 5, 7, 6, NA, NA, NA)), .Names = c("Group", 
                                                                                                                                                                                                                                         "PI", "H", "PC", "C", "I", "G", "N", "O"), row.names = c(NA, 
                                                                                                                                                                                                                                                                                                  -20L), class = "data.frame")
df

dt <- data.table(Group=df$Group, PI=df$PI, H=df$H, PC=df$PC, 
                 C = df$C, 
                 I = df$I, 
                 G = df$G, 
                 N = df$N, 
                 O = df$O)


> Maxdt <- dt[PI %in% c("C", "I", "G", "N", "O"),
> +             .SD[which.max(get(PI))],
> +             by=.(Group, PI)]  
Error in get(PI) : invalid first argument

Maxdt <- dt[PI %in% c("C", "I", "G", "N", "O"),
                         .SD[which.max(get("PI"))],
                         by=.(Group, PI)] 



    > Maxdt
    Group PI  H PC  C  I  G  N  O
 1:    gX  G  1  G NA NA 16 NA NA
 2:    gY  G  3  G NA NA  6 NA NA
 3:    gZ  G  6  G NA NA 14 NA NA
 4:    gW  C  7  C  3 NA 13 NA NA
 5:    gV  O  9  O NA NA NA NA  3
 6:    gT  G 12  I NA NA  9 NA NA
 7:    gR  O 13  O NA NA  5 NA  8
 8:    gS  G 18  G NA NA  7 NA NA
 9:    gQ  G 19  G NA NA  8 NA NA
10:    gL  C 20  C  2  1  4 NA NA

Solution

  • Here's another way using data.table:

    dt <- data.table(Group, H, PI, PC, A=A_Column, C=C_Column, N=N_Column)
    dt[PI %in% c("A", "C", "N"),
        .SD[which.max(get(PI))],
        by=.(Group, PI)]        
    
    #    Group PI         H PC  A     C  N
    # 1:     X  C     AA001  C  2 -0.10 22
    # 2:     X  A       A16  C  3 -0.12 13
    # 3:     Y  A AAA-16-04  A  5 -0.20 15
    # 4:     Z  A      AA02  A 17 -0.30 12
    # 5:     Z  N       AD4  N  5 -0.60 45
    

    It should be pretty straightforward to follow what's going on. On those any of rows where PI matches A,C,N, we group by Group, PI.. .SD contains the Subset of Data for each group..

    get(PI) returns the values corresponding to the character value stored in PI and the row corresponding to the max value of that column pointed to by PI is returned for each Group, PI.

    For e.g., for the first combination, Group=X, PI=C. get(PI) == get("C") which returns c(-0.1, -0.8, -0.3) for that group on which which.max returns the index 1.