Trying to update and summarize a larger data.table; data as follows.
structure(list(y1_countyname = c("Montgomery County", "Elmore County",
"Dallas County", "Chilton County", "Jefferson County", "Escambia County",
"Escambia County", "Harris County"), n2 = c(867L, 835L, 115L,
169L, 75L, 599L, 144L, 90L), y2_geoid = c("01001", "01001", "01001",
"01001", "01001", "01003", "01003", "01003"), y1_geoid = c("01101",
"01051", "01047", "01021", "01073", "12033", "01053", "48201"
), y2_ling_zo = c(9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L), y1_ling_zo = c(9L,
9L, 9L, 11L, 11L, 7L, 7L, 12L), ling_mig = c(0, 0, 0, 1, 1, 1,
1, 1), grp_y2_geoid = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), i7 = c(0L,
0L, 0L, 0L, 0L, 599L, 144L, 0L), i9 = c(867L, 835L, 115L, 0L,
0L, 0L, 0L, 0L), i11 = c(0L, 0L, 0L, 169L, 75L, 0L, 0L, 0L),
i12 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 90L)), row.names = c(NA,
-8L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x55ab0e577580>, sorted = c("y2_geoid",
"y1_ling_zo"))
> in_1314_sub
y1_countyname n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo ling_mig
1: Montgomery County 867 01001 01101 9 9 0
2: Elmore County 835 01001 01051 9 9 0
3: Dallas County 115 01001 01047 9 9 0
4: Chilton County 169 01001 01021 9 11 1
5: Jefferson County 75 01001 01073 9 11 1
6: Escambia County 599 01003 12033 9 7 1
7: Escambia County 144 01003 01053 9 7 1
8: Harris County 90 01003 48201 9 12 1
grp_y2_geoid i7 i9 i11 i12
1: 1 0 867 0 0
2: 1 0 835 0 0
3: 1 0 115 0 0
4: 1 0 0 169 0
5: 1 0 0 75 0
6: 2 599 0 0 0
7: 2 144 0 0 0
8: 2 0 0 0 90
the report I'm trying to append to:
structure(list(y1_countyname = c("Autauga County Non-migrants",
"Baldwin County Non-migrants"), n2 = c(41198L, 148883L), y2_geoid = c("01001",
"01003"), y1_geoid = c("01001", "01003"), y2_ling_zo = c(9L,
9L), y1_ling_zo = c(9L, 9L), ling_mig = c(0, 0), nm_7 = c(NA_integer_,
NA_integer_), nm_9 = c(41198L, 148883L), nm_11 = c(NA_integer_,
NA_integer_), nm_12 = c(NA_integer_, NA_integer_), nm_15 = c(NA_integer_,
NA_integer_)), row.names = c(NA, -2L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x55ab0e577580>)
> in_1314_non_sof[, c(1:12)]
y1_countyname n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo
1: Autauga County Non-migrants 41198 01001 01001 9 9
2: Baldwin County Non-migrants 148883 01003 01003 9 9
ling_mig nm_7 nm_9 nm_11 nm_12 nm_15
1: 0 NA 41198 NA NA NA
2: 0 NA 148883 NA NA NA
using this:
in_1314_non_sof[, c('i7', 'i9', 'i11', 'i12'):=
.(sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 7L]),
sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 9L]),
sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 11L]),
sum(in_1314_sub$n2[in_1314_sub$y1_ling_zo == 12L])),
by = .(unique(in_1314_sub$grp_y2_geoid))]
results in:
in_1314_non_sof[, -c(8:12)]
y1_countyname n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo
1: Autauga County Non-migrants 41198 01001 01001 9 9
2: Baldwin County Non-migrants 148883 01003 01003 9 9
ling_mig i7 i9 i11 i12
1: 0 743 1817 244 90
2: 0 743 1817 244 90
where hoped for results would be:
i7 i9 i11 i12
0 1817 244 0
743 0 0 90
What am I missing to arrive at hoped for results. I don't think this is a duplicate of conditional_collapse; but, perhaps I've missed a useful by
member.
With the updated post, and using a more natural join,
in_1314_non_sof[, -c(8:12)][in_1314_sub[, lapply(.SD, sum), .SDcols=i7:i12, by=y2_geoid], on=.(y2_geoid)]
Output
y1_countyname n2 y2_geoid y1_geoid y2_ling_zo y1_ling_zo ling_mig i7 i9 i11 i12
<char> <int> <char> <char> <int> <int> <num> <int> <int> <int> <int>
1: Autauga County Non-migrants 41198 01001 01001 9 9 0 0 1817 244 0
2: Baldwin County Non-migrants 148883 01003 01003 9 9 0 743 0 0 90
If you don't have i7:i12 estimated, and you need to do so from n2, you can
# get cts by geoid
cts_by_geoid <- dcast(data = in_1314_sub[, sum(n2), by=.(y2_geoid,g=paste0("i", y1_ling_zo))],
formula = y2_geoid~g,
value.var="V1")
# merge with non_sof table
in_1314_non_sof[,-c(8,12)][cts_by_geoid, on=.(y2_geoid)]
Output, as above