Search code examples
rdata.tableaggregateextract

Pulling collapsed summary report from data.table


Trying to update and summarize a larger data.table; data as follows.

    structure(list(y1_countyname = c("Montgomery County", "Elmore County", 
"Dallas County", "Chilton County", "Jefferson County", "Escambia County", 
"Escambia County", "Harris County"), n2 = c(867L, 835L, 115L, 
169L, 75L, 599L, 144L, 90L), y2_geoid = c("01001", "01001", "01001", 
"01001", "01001", "01003", "01003", "01003"), y1_geoid = c("01101", 
"01051", "01047", "01021", "01073", "12033", "01053", "48201"
), y2_ling_zo = c(9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L), y1_ling_zo = c(9L, 
9L, 9L, 11L, 11L, 7L, 7L, 12L), ling_mig = c(0, 0, 0, 1, 1, 1, 
1, 1), grp_y2_geoid = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), i7 = c(0L, 
0L, 0L, 0L, 0L, 599L, 144L, 0L), i9 = c(867L, 835L, 115L, 0L, 
0L, 0L, 0L, 0L), i11 = c(0L, 0L, 0L, 169L, 75L, 0L, 0L, 0L), 
    i12 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 90L)), row.names = c(NA, 
-8L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x55ab0e577580>, sorted = c("y2_geoid", 
"y1_ling_zo"))

> in_1314_sub
       y1_countyname  n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo ling_mig
1: Montgomery County 867    01001    01101          9          9        0
2:     Elmore County 835    01001    01051          9          9        0
3:     Dallas County 115    01001    01047          9          9        0
4:    Chilton County 169    01001    01021          9         11        1
5:  Jefferson County  75    01001    01073          9         11        1
6:   Escambia County 599    01003    12033          9          7        1
7:   Escambia County 144    01003    01053          9          7        1
8:     Harris County  90    01003    48201          9         12        1
   grp_y2_geoid  i7  i9 i11 i12
1:            1   0 867   0   0
2:            1   0 835   0   0
3:            1   0 115   0   0
4:            1   0   0 169   0
5:            1   0   0  75   0
6:            2 599   0   0   0
7:            2 144   0   0   0
8:            2   0   0   0  90

the report I'm trying to append to:

structure(list(y1_countyname = c("Autauga County Non-migrants", 
"Baldwin County Non-migrants"), n2 = c(41198L, 148883L), y2_geoid = c("01001", 
"01003"), y1_geoid = c("01001", "01003"), y2_ling_zo = c(9L, 
9L), y1_ling_zo = c(9L, 9L), ling_mig = c(0, 0), nm_7 = c(NA_integer_, 
NA_integer_), nm_9 = c(41198L, 148883L), nm_11 = c(NA_integer_, 
NA_integer_), nm_12 = c(NA_integer_, NA_integer_), nm_15 = c(NA_integer_, 
NA_integer_)), row.names = c(NA, -2L), class = c("data.table", 
"data.frame"), .internal.selfref = <pointer: 0x55ab0e577580>)

> in_1314_non_sof[, c(1:12)]
                 y1_countyname     n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo
1: Autauga County Non-migrants  41198    01001    01001          9          9
2: Baldwin County Non-migrants 148883    01003    01003          9          9
   ling_mig nm_7   nm_9 nm_11 nm_12 nm_15
1:        0   NA  41198    NA    NA    NA
2:        0   NA 148883    NA    NA    NA

using this:

in_1314_non_sof[, c('i7', 'i9', 'i11', 'i12'):= 
.(sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 7L]),
 sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 9L]),
 sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 11L]),
 sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 12L])),
 by = .(unique(in_1314_sub$grp_y2_geoid))]

results in:

in_1314_non_sof[, -c(8:12)]
                 y1_countyname     n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo
1: Autauga County Non-migrants  41198    01001    01001          9          9
2: Baldwin County Non-migrants 148883    01003    01003          9          9
   ling_mig  i7   i9 i11 i12
1:        0 743 1817 244  90
2:        0 743 1817 244  90

where hoped for results would be:

  i7   i9    i11  i12
  0   1817   244   0
 743   0      0    90

What am I missing to arrive at hoped for results. I don't think this is a duplicate of conditional_collapse; but, perhaps I've missed a useful by member.


Solution

  • With the updated post, and using a more natural join,

    in_1314_non_sof[, -c(8:12)][in_1314_sub[, lapply(.SD, sum), .SDcols=i7:i12, by=y2_geoid], on=.(y2_geoid)]
    

    Output

                     y1_countyname     n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo ling_mig    i7    i9   i11   i12
                            <char>  <int>   <char>   <char>      <int>      <int>    <num> <int> <int> <int> <int>
    1: Autauga County Non-migrants  41198    01001    01001          9          9        0     0  1817   244     0
    2: Baldwin County Non-migrants 148883    01003    01003          9          9        0   743     0     0    90
    

    If you don't have i7:i12 estimated, and you need to do so from n2, you can

    # get cts by geoid
    cts_by_geoid <- dcast(data = in_1314_sub[, sum(n2), by=.(y2_geoid,g=paste0("i", y1_ling_zo))],
                          formula = y2_geoid~g,
                          value.var="V1")
    
    # merge with non_sof table
    in_1314_non_sof[,-c(8,12)][cts_by_geoid, on=.(y2_geoid)]
    

    Output, as above