I have a database as follows:
DT <- structure(list(Year = c(2005, 2005, 2005, 2005, 2005, 2005, 2005,
2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005,
2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005,
2005, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006,
2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006,
2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2007, 2007,
2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007,
2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007,
2007, 2007, 2007, 2007, 2007, 2007), Type = c(1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
3), Value = c(0.504376244608734, 0.544791523560323, 0.536356351248399,
0.186754489979335, 0.0145059662169885, 0.552467068108315, 0.728991908748136,
0.0782701833265232, 0.0770140143185365, 0.745720346755096, 0.182549844851049,
0.0037854136407528, 0.892426526130476, 0.670307075099745, 0.0787676704471466,
0.243642889274613, 0.61622932816441, 0.773909954748003, 0.0368627127466908,
0.864836276200213, 0.363247130858897, 0.170719500081567, 0.458862115912474,
0.764369844834086, 0.22138732039061, 0.950217140815184, 0.119026355092504,
0.806698643902745, 0.809697143416323, 0.0161168403745759, 0.56149794546334,
0.0663374185634651, 0.851044662622003, 0.144127493261805, 0.646129610173195,
0.180326314861961, 0.346305710081752, 0.689186084156133, 0.0902438913162577,
0.493067567084055, 0.829728867159447, 0.212655417404949, 0.873112880345332,
0.57019799015934, 0.666924788035991, 0.421470848297274, 0.137822577124685,
0.646797965126931, 0.00186628356193685, 0.220630784144145, 0.636097250212043,
0.337161167241577, 0.763014675300797, 0.0290609945874959, 0.179775595422681,
0.926270372245386, 0.14413707866326, 0.308460218540821, 0.505730133160804,
0.92831463570813, 0.2406601397661, 0.469013177711661, 0.0514836845684897,
0.8773477591701, 0.988870207825279, 0.0409427390691713, 0.345261503182235,
0.457678159145652, 0.928521904779235, 0.981654149874765, 0.165376851871405,
0.657749413049735, 0.645610554242246, 0.288901032482677, 0.903464871012278,
0.91288926903878, 0.331819964874993, 0.451775254733976, 0.561567931867726,
0.934770693643712, 0.0515071551015609, 0.0772762108900331, 0.233674539049138,
0.636764452840065, 0.673165028674493, 0.806944576060158, 0.763410488346345,
0.661058275398286, 0.275215831961986, 0.821051953775588), Value2 = c(0.898973133700585,
0.0043728119746469, 0.90370150590114, 0.664255277142381, 0.478255150030532,
0.428181937562552, 0.0547471373342867, 0.382060484866744, 0.467990590870777,
0.44613758335896, 0.767317422802576, 0.378150639908367, 0.490578474103678,
0.677901331005272, 0.287571260541928, 0.201396158908221, 0.504989505596871,
0.854550423135574, 0.545208640791417, 0.951248990134053, 0.958420479001103,
0.916437669811835, 0.299402641214852, 0.966388390213139, 0.511359402704707,
0.0867219533353825, 0.88481040004275, 0.158676351804193, 0.0723357399252373,
0.605048894989562, 0.60104443547608, 0.608164723564692, 0.309073275149768,
0.183031315824665, 0.495737621177827, 0.981936843144856, 0.601436476710344,
0.442362735422709, 0.497899316486054, 0.0545162134700136, 0.572666465987199,
0.0134330483790179, 0.494252845049882, 0.752561338910785, 0.269231150235318,
0.580397043886635, 0.00438648885146109, 0.974859546601355, 0.964309270817873,
0.740961468264743, 0.966289928060099, 0.165450408579171, 0.457088887715921,
0.725271665700556, 0.611801886877621, 0.693114823445831, 0.509441044895801,
0.668642268489104, 0.0769213109282016, 0.0106313240133811, 0.653738670103508,
0.515077318720933, 0.0355798295524966, 0.916849288357794, 0.489540407953311,
0.355080030655249, 0.0584185346727107, 0.117505910926226, 0.840486642923002,
0.0919621689925281, 0.513293731647231, 0.813987689492758, 0.520895630669219,
0.417642884334403, 0.549898208275446, 0.190152036926942, 0.730222922437507,
0.247328458018061, 0.587109508511267, 0.850096530635719, 0.929032051736368,
0.929910983683225, 0.461558252621238, 0.106247873795127, 0.177666580357953,
0.85962988262837, 0.531897323076434, 0.105528819826748, 0.0349104003049517,
0.180758384726269), Value3 = c(0.728747048185938, 0.136214396563203,
0.0552254916905935, 0.888943411458351, 0.593186561829418, 0.142192475897417,
0.397839605231809, 0.128332683559321, 0.818143628566787, 0.675081193031822,
0.267554700398382, 0.289692778583473, 0.395043380675461, 0.582592369450023,
0.999361780203229, 0.421977850130829, 0.723404859329269, 0.333410997686596,
0.545945290276875, 0.510878802866974, 0.746682101648222, 0.625853669469718,
0.0366957172106372, 0.417685335838607, 0.106323486037796, 0.0127310987059773,
0.291264331038641, 0.690392584005106, 0.0367947033685097, 0.287721087095362,
0.389582158765541, 0.179954765659721, 0.688980485242488, 0.492296704771236,
0.177765364735501, 0.311877860895471, 0.402659917512069, 0.579307427105039,
0.588566648357923, 0.741057591300206, 0.111932877257211, 0.515443723005798,
0.679584351614947, 0.0197622696399569, 0.0326379476305644, 0.736148474541639,
0.0115696238487739, 0.0530159587501624, 0.710708890129421, 0.537042840144158,
0.0277825198238522, 0.851349803530179, 0.448963399024373, 0.42841165712813,
0.0615511042450435, 0.210541933956987, 0.983517611560273, 0.533691182135933,
0.61993895519575, 0.136074538018663, 0.716185070081669, 0.67982888131481,
0.186059692566576, 0.0129160598675656, 0.832257317305668, 0.0269936347869698,
0.579065014243438, 0.857987264303428, 0.270050217297758, 0.606374993010002,
0.565105220120649, 0.977264711860796, 0.14241840012272, 0.942496958955904,
0.652070963472916, 0.912867524689929, 0.0249357414986835, 0.87704909395977,
0.72849611059358, 0.525707690655331, 0.290223239565496, 0.992723233891769,
0.178173444691217, 0.0292681960925434, 0.65696953770876, 0.452973377851251,
0.471917712361899, 0.117830393053313, 0.126107861454795, 0.0848074010166607
)), row.names = c(NA, -90L), class = c("tbl_df", "tbl", "data.frame"
))
With the DT
database, I want to create stratified sub samples based on the following data:
Ratios <- structure(list(State = c("A", "A", "A", "A", "A", "A", "A", "A",
"A", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "C", "C",
"C", "C", "C", "C", "C", "C"), Year = c(2005, 2005, 2005, 2006,
2006, 2006, 2007, 2007, 2007, 2005, 2005, 2005, 2006, 2006, 2006,
2007, 2007, 2007, 2005, 2005, 2005, 2006, 2006, 2006, 2007, 2007,
2007), Type = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3,
1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3), ratio = c(0.2, 0.2, 0.6,
0.45, 0.2, 0.35, 0.1, 0.1, 0.8, 0.05, 0.5, 0.45, 0.3, 0.5, 0.2,
0.3, 0.2, 0.5, 0.35, 0.45, 0.2, 0.35, 0.2, 0.45, 0.2, 0.2, 0.6
)), row.names = c(NA, -27L), class = c("tbl_df", "tbl", "data.frame"
))
I am trying to figure out how to approach this issue. In order to achieve the maximum possible sample size, I need to figure out the largest possible sub sample based on the ratios. Fig 1. Ratios
shows the ratios for State A in 2005.
I first have to determine the ratios in DT
:
# This is just to determine the ratio of `Type` in the data `DT`
DT_1 <- DT %>% count(Year, Type) %>% group_by(Year)
DT_2 <- DT %>% count(Year, Type) %>% group_by(Year) %>% mutate(n = n/sum(n))
DT_RAT <- cbind(DT_1 , DT_2)
setDT(DT_RAT)[,Year1:=NULL]
setDT(DT_RAT)[,Type1:=NULL]
rm(DT_1 , DT_2)
Now I should with that information create samples. I have to start with the biggest share (For State A, this is Type==1
.) So for sample DT_A
, half of the sample should be of Type
= 1. From DT_RAT
I know that this amounts to 13 observations of Type 1
(the max amount of type 1 observations). I would like to do something like:
DT_A <- DT[,.SD[sample(.N, min(13,.N))],by = Type==1]
But I am not allowed to do that. Can anyone help me to make this happen?
EDIT:
The desired result for DT_A
(the sub-sample of DT
for State A) is as follows.
Type 1
(see Fig 1)Type 2
Type 3
DT has 13 observation of Type 1
for 2005, so we take all of those. Because these 13 should be half of the sample. The total sample is 26 observations
There are also 13 observation of Type 2
for 2005, but since only 0.45 should consist of Type 2
we take 12. Only 5 percent should be of Type 3
, so that is the 1 observation remaining.
Type 1
(see Fig 1)Type 2
Type 3
I think this pipeline gives you what you need:
library(tidyr)
library(dplyr)
library(purrr)
DT %>%
count(Year, Type) %>%
right_join(Ratios, by = c("Year", "Type")) %>%
group_by(State, Year) %>%
mutate(sample_size = pmax(1, round(min(n / ratio) * ratio))) %>%
ungroup() %>%
select(State, Year, Type, sample_size) %>%
left_join(DT, by = c("Year","Type")) %>%
nest(data = -c(State, Year, Type, sample_size)) %>%
mutate(sample = map2(data, sample_size, ~slice_sample(.x, n = .y))) %>%
select(State, Year, Type, sample) %>%
arrange(State, Year, Type)
Output
# A tibble: 27 x 4
State Year Type sample
<chr> <dbl> <dbl> <list>
1 A 2005 1 <tibble [1 x 3]>
2 A 2005 2 <tibble [1 x 3]>
3 A 2005 3 <tibble [4 x 3]>
4 A 2006 1 <tibble [4 x 3]>
5 A 2006 2 <tibble [2 x 3]>
6 A 2006 3 <tibble [3 x 3]>
7 A 2007 1 <tibble [1 x 3]>
8 A 2007 2 <tibble [1 x 3]>
9 A 2007 3 <tibble [5 x 3]>
10 B 2005 1 <tibble [1 x 3]>
# ... with 17 more rows
You can further append to that pipeline a %>% unnest(sample)
, but I will leave it like this as you never specify your expected output. This format is also easier for me to explain to you the logic behind.
The key is that you need to determine the proper sample size for each combination of State, Year, Type
. The formula I used to determine the sample size is
sample_size = pmax(1, round(min(n / ratio) * ratio))
For example, let n = c(13, 13, 4)
, the same numbers as your example shows. You want to slice a sample based on ratio = c(0.5, 0.45, 0.05)
. Then, the max sample size you can have is min(n / ratio)
, i.e. min(c(26, 28.88889, 80)) = 26
. Any number greater than this is invalid as you don't have enough observations to sample from (for Type 2 and 3 in this case). Then we use the total sample size times the ratio to compute the sample size for each Type. We have to round
the result as only natural numbers are allowed; we also need at least one observation for each type. That's why you see that pmax(1, ...)
.
For the example shown in your post, this formula returns
> pmax(1, round(min(c(13, 13, 4) / c(0.5, 0.45, 0.05)) * c(0.5, 0.45, 0.05)))
[1] 13 12 1
With this setup, we can start the sampling procedure you want. We first
> DT %>%
count(Year, Type)
# A tibble: 9 x 3
Year Type n
<dbl> <dbl> <int>
1 2005 1 13
2 2005 2 13
3 2005 3 4
4 2006 1 16
5 2006 2 11
6 2006 3 3
7 2007 1 11
8 2007 2 14
9 2007 3 5
so that we know how many observations are there for each Year
and Type
. Then,
> DT %>%
count(Year, Type) %>%
right_join(Ratios, by = c("Year", "Type")) %>%
group_by(State, Year) %>%
mutate(sample_size = pmax(1, round(min(n / ratio) * ratio)))
# A tibble: 27 x 6
# Groups: State, Year [9]
Year Type n State ratio sample_size
<dbl> <dbl> <int> <chr> <dbl> <dbl>
1 2005 1 13 A 0.2 1
2 2005 1 13 B 0.05 1
3 2005 1 13 B 0.35 3
4 2005 2 13 A 0.2 1
5 2005 2 13 B 0.5 4
6 2005 2 13 C 0.45 9
7 2005 3 4 A 0.6 4
8 2005 3 4 B 0.45 4
9 2005 3 4 C 0.2 4
10 2006 1 16 A 0.45 4
# ... with 17 more rows
so that we know the proper sample size for each Year
, Type
, and State
. Next,
DT %>%
count(Year, Type) %>%
right_join(Ratios, by = c("Year", "Type")) %>%
group_by(State, Year) %>%
mutate(sample_size = pmax(1, round(min(n / ratio) * ratio))) %>%
ungroup() %>%
select(State, Year, Type, sample_size) %>%
left_join(DT, by = c("Year","Type")) %>%
nest(data = -c(State, Year, Type, sample_size))
# A tibble: 27 x 5
State Year Type sample_size data
<chr> <dbl> <dbl> <dbl> <list>
1 A 2005 1 1 <tibble [13 x 3]>
2 B 2005 1 1 <tibble [13 x 3]>
3 B 2005 1 3 <tibble [13 x 3]>
4 A 2005 2 1 <tibble [13 x 3]>
5 B 2005 2 4 <tibble [13 x 3]>
6 C 2005 2 9 <tibble [13 x 3]>
7 A 2005 3 4 <tibble [4 x 3]>
8 B 2005 3 4 <tibble [4 x 3]>
9 C 2005 3 4 <tibble [4 x 3]>
10 A 2006 1 4 <tibble [16 x 3]>
# ... with 17 more rows
so that the sample size and data are matched for each Year
and Type
. Last, we take a sample of the data for each group using the slice_sample
function, select the columns we need, and then arrange the data for display.
DT %>%
count(Year, Type) %>%
right_join(Ratios, by = c("Year", "Type")) %>%
group_by(State, Year) %>%
mutate(sample_size = pmax(1, round(min(n / ratio) * ratio))) %>%
ungroup() %>%
select(State, Year, Type, sample_size) %>%
left_join(DT, by = c("Year","Type")) %>%
nest(data = -c(State, Year, Type, sample_size)) %>%
mutate(sample = map2(data, sample_size, ~slice_sample(.x, n = .y))) %>%
select(State, Year, Type, sample) %>%
arrange(State, Year, Type)
The output of this pipeline is shown at the beginning of this post. I think you can also achieve the same result in a data.table
way. However, I will leave it like this as this pipeline is better for illustration purpose.