Search code examples
rdata.tablememory-efficient

R Data.Table Aggregate


library(data.table)
set.seed(01)
DATA = data.table("STUDENT" = c(1:2000),
                  "GROUP" = c(sample(letters[1:3], size=2000, r = T)),
                  "CLASS" = c(sample(1:6, size=2000, r = T)),
                  "SCORE1" = c(sample(50:100,size=2000,r=T)),
                  "SCORE2" = c(sample(40:100, size=2000, r = T)))

Say you have this data and you wish to make SEVERAL extra data.table that are aggregated or collapse...

a) mean SCORE1 and mean SCORE2 by GROUP

b) mean SCORE1 and mean SCORE2 by CLASS

c) mean SCORE1 and mean SCORE2 by GROUP and by CLASS

d) count of GROUP by CLASS

Is there more efficient way to use data.table to create these 4 data.table? Or do you simply recommend making all of them and storing as 4 separate data.tables?


Solution

  • They all have different structures

    (a)

    DATA[, .(mSCORE1 = mean(SCORE1), mSCORE2= mean(SCORE2)), by=.(GROUP)]
    
       GROUP  mSCORE1  mSCORE2
    1:     a 74.59971 69.98240
    2:     c 75.12943 71.30817
    3:     b 74.95964 69.97608
    

    (b and d combined)

    DATA[, .(mSCORE1 = mean(SCORE1), mSCORE2= mean(SCORE2),nGROUPS =uniqueN(GROUP)), by=.(CLASS)]
    
       CLASS  mSCORE1  mSCORE2 nGROUPS
    1:     6 75.01685 70.14607       3
    2:     3 75.76025 70.18927       3
    3:     4 74.25532 70.74164       3
    4:     5 75.77126 69.46921       3
    5:     1 73.28797 71.56646       3
    6:     2 75.17595 70.44282       3
    

    (c)

    DATA[, lapply(.SD, mean, na.rm=T), by=.(GROUP,CLASS),.SDcols = patterns("SC")]
    
        GROUP CLASS   SCORE1   SCORE2
     1:     a     6 75.27027 69.00901
     2:     c     3 78.05660 70.18868
     3:     a     4 75.72727 68.95868
     4:     b     6 74.20455 70.78788
     5:     a     5 75.94915 69.78814
     6:     c     1 73.93043 72.63478
     7:     c     2 75.18812 71.80198
     8:     b     2 76.46667 67.89167
     9:     c     5 75.28814 70.63559
    10:     a     1 72.43519 69.72222
    11:     a     2 73.87500 71.85000
    12:     b     1 73.48387 72.38710
    13:     b     5 76.11429 67.80000
    14:     b     3 75.07477 69.84112
    15:     c     6 75.71681 70.51327
    16:     b     4 74.26786 71.41964
    17:     a     3 74.12500 70.54808
    18:     c     4 72.38542 72.19792
    

    If you want to get all these in one table, of course you would have class-specific summaries and group-specific summaries replicated across rows of group and class. Here is an illustration in a single pipeline

    DATA[, .(mSCORE1_grpclass = mean(SCORE1), mSCORE2_grpclass= mean(SCORE2), NSTUDENTS=.N), by=.(GROUP,CLASS)] %>% 
      .[,`:=`(
          mSCORE1_grp=sum(NSTUDENTS*mSCORE1_grpclass)/sum(NSTUDENTS),
          mSCORE2_grp=sum(NSTUDENTS*mSCORE2_grpclass)/sum(NSTUDENTS)),by=.(GROUP)] %>%
      .[,`:=`(
        mSCORE1_class=sum(NSTUDENTS*mSCORE1_grpclass)/sum(NSTUDENTS),
        mSCORE2_class=sum(NSTUDENTS*mSCORE2_grpclass)/sum(NSTUDENTS),
        nGROUPS_class = uniqueN(GROUP)),by=.(CLASS)] %>% 
      .[]
    

    Output:

        GROUP CLASS mSCORE1_grpclass mSCORE2_grpclass NSTUDENTS mSCORE1_grp mSCORE2_grp mSCORE1_class mSCORE2_class nGROUPS_class
     1:     a     6         75.27027         69.00901       111    74.59971    69.98240      75.01685      70.14607             3
     2:     c     3         78.05660         70.18868       106    75.12943    71.30817      75.76025      70.18927             3
     3:     a     4         75.72727         68.95868       121    74.59971    69.98240      74.25532      70.74164             3
     4:     b     6         74.20455         70.78788       132    74.95964    69.97608      75.01685      70.14607             3
     5:     a     5         75.94915         69.78814       118    74.59971    69.98240      75.77126      69.46921             3
     6:     c     1         73.93043         72.63478       115    75.12943    71.30817      73.28797      71.56646             3
     7:     c     2         75.18812         71.80198       101    75.12943    71.30817      75.17595      70.44282             3
     8:     b     2         76.46667         67.89167       120    74.95964    69.97608      75.17595      70.44282             3
     9:     c     5         75.28814         70.63559       118    75.12943    71.30817      75.77126      69.46921             3
    10:     a     1         72.43519         69.72222       108    74.59971    69.98240      73.28797      71.56646             3
    11:     a     2         73.87500         71.85000       120    74.59971    69.98240      75.17595      70.44282             3
    12:     b     1         73.48387         72.38710        93    74.95964    69.97608      73.28797      71.56646             3
    13:     b     5         76.11429         67.80000       105    74.95964    69.97608      75.77126      69.46921             3
    14:     b     3         75.07477         69.84112       107    74.95964    69.97608      75.76025      70.18927             3
    15:     c     6         75.71681         70.51327       113    75.12943    71.30817      75.01685      70.14607             3
    16:     b     4         74.26786         71.41964       112    74.95964    69.97608      74.25532      70.74164             3
    17:     a     3         74.12500         70.54808       104    74.59971    69.98240      75.76025      70.18927             3
    18:     c     4         72.38542         72.19792        96    75.12943    71.30817      74.25532      70.74164             3