Search code examples
rdata.tablesample

Creating stratified sample based on another dataset


I have a database as follows:

DT <- structure(list(Year = c(2005, 2005, 2005, 2005, 2005, 2005, 2005, 
2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 
2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 
2005, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 
2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 
2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2007, 2007, 
2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 
2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 
2007, 2007, 2007, 2007, 2007, 2007), Type = c(1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 
3), Value = c(0.504376244608734, 0.544791523560323, 0.536356351248399, 
0.186754489979335, 0.0145059662169885, 0.552467068108315, 0.728991908748136, 
0.0782701833265232, 0.0770140143185365, 0.745720346755096, 0.182549844851049, 
0.0037854136407528, 0.892426526130476, 0.670307075099745, 0.0787676704471466, 
0.243642889274613, 0.61622932816441, 0.773909954748003, 0.0368627127466908, 
0.864836276200213, 0.363247130858897, 0.170719500081567, 0.458862115912474, 
0.764369844834086, 0.22138732039061, 0.950217140815184, 0.119026355092504, 
0.806698643902745, 0.809697143416323, 0.0161168403745759, 0.56149794546334, 
0.0663374185634651, 0.851044662622003, 0.144127493261805, 0.646129610173195, 
0.180326314861961, 0.346305710081752, 0.689186084156133, 0.0902438913162577, 
0.493067567084055, 0.829728867159447, 0.212655417404949, 0.873112880345332, 
0.57019799015934, 0.666924788035991, 0.421470848297274, 0.137822577124685, 
0.646797965126931, 0.00186628356193685, 0.220630784144145, 0.636097250212043, 
0.337161167241577, 0.763014675300797, 0.0290609945874959, 0.179775595422681, 
0.926270372245386, 0.14413707866326, 0.308460218540821, 0.505730133160804, 
0.92831463570813, 0.2406601397661, 0.469013177711661, 0.0514836845684897, 
0.8773477591701, 0.988870207825279, 0.0409427390691713, 0.345261503182235, 
0.457678159145652, 0.928521904779235, 0.981654149874765, 0.165376851871405, 
0.657749413049735, 0.645610554242246, 0.288901032482677, 0.903464871012278, 
0.91288926903878, 0.331819964874993, 0.451775254733976, 0.561567931867726, 
0.934770693643712, 0.0515071551015609, 0.0772762108900331, 0.233674539049138, 
0.636764452840065, 0.673165028674493, 0.806944576060158, 0.763410488346345, 
0.661058275398286, 0.275215831961986, 0.821051953775588), Value2 = c(0.898973133700585, 
0.0043728119746469, 0.90370150590114, 0.664255277142381, 0.478255150030532, 
0.428181937562552, 0.0547471373342867, 0.382060484866744, 0.467990590870777, 
0.44613758335896, 0.767317422802576, 0.378150639908367, 0.490578474103678, 
0.677901331005272, 0.287571260541928, 0.201396158908221, 0.504989505596871, 
0.854550423135574, 0.545208640791417, 0.951248990134053, 0.958420479001103, 
0.916437669811835, 0.299402641214852, 0.966388390213139, 0.511359402704707, 
0.0867219533353825, 0.88481040004275, 0.158676351804193, 0.0723357399252373, 
0.605048894989562, 0.60104443547608, 0.608164723564692, 0.309073275149768, 
0.183031315824665, 0.495737621177827, 0.981936843144856, 0.601436476710344, 
0.442362735422709, 0.497899316486054, 0.0545162134700136, 0.572666465987199, 
0.0134330483790179, 0.494252845049882, 0.752561338910785, 0.269231150235318, 
0.580397043886635, 0.00438648885146109, 0.974859546601355, 0.964309270817873, 
0.740961468264743, 0.966289928060099, 0.165450408579171, 0.457088887715921, 
0.725271665700556, 0.611801886877621, 0.693114823445831, 0.509441044895801, 
0.668642268489104, 0.0769213109282016, 0.0106313240133811, 0.653738670103508, 
0.515077318720933, 0.0355798295524966, 0.916849288357794, 0.489540407953311, 
0.355080030655249, 0.0584185346727107, 0.117505910926226, 0.840486642923002, 
0.0919621689925281, 0.513293731647231, 0.813987689492758, 0.520895630669219, 
0.417642884334403, 0.549898208275446, 0.190152036926942, 0.730222922437507, 
0.247328458018061, 0.587109508511267, 0.850096530635719, 0.929032051736368, 
0.929910983683225, 0.461558252621238, 0.106247873795127, 0.177666580357953, 
0.85962988262837, 0.531897323076434, 0.105528819826748, 0.0349104003049517, 
0.180758384726269), Value3 = c(0.728747048185938, 0.136214396563203, 
0.0552254916905935, 0.888943411458351, 0.593186561829418, 0.142192475897417, 
0.397839605231809, 0.128332683559321, 0.818143628566787, 0.675081193031822, 
0.267554700398382, 0.289692778583473, 0.395043380675461, 0.582592369450023, 
0.999361780203229, 0.421977850130829, 0.723404859329269, 0.333410997686596, 
0.545945290276875, 0.510878802866974, 0.746682101648222, 0.625853669469718, 
0.0366957172106372, 0.417685335838607, 0.106323486037796, 0.0127310987059773, 
0.291264331038641, 0.690392584005106, 0.0367947033685097, 0.287721087095362, 
0.389582158765541, 0.179954765659721, 0.688980485242488, 0.492296704771236, 
0.177765364735501, 0.311877860895471, 0.402659917512069, 0.579307427105039, 
0.588566648357923, 0.741057591300206, 0.111932877257211, 0.515443723005798, 
0.679584351614947, 0.0197622696399569, 0.0326379476305644, 0.736148474541639, 
0.0115696238487739, 0.0530159587501624, 0.710708890129421, 0.537042840144158, 
0.0277825198238522, 0.851349803530179, 0.448963399024373, 0.42841165712813, 
0.0615511042450435, 0.210541933956987, 0.983517611560273, 0.533691182135933, 
0.61993895519575, 0.136074538018663, 0.716185070081669, 0.67982888131481, 
0.186059692566576, 0.0129160598675656, 0.832257317305668, 0.0269936347869698, 
0.579065014243438, 0.857987264303428, 0.270050217297758, 0.606374993010002, 
0.565105220120649, 0.977264711860796, 0.14241840012272, 0.942496958955904, 
0.652070963472916, 0.912867524689929, 0.0249357414986835, 0.87704909395977, 
0.72849611059358, 0.525707690655331, 0.290223239565496, 0.992723233891769, 
0.178173444691217, 0.0292681960925434, 0.65696953770876, 0.452973377851251, 
0.471917712361899, 0.117830393053313, 0.126107861454795, 0.0848074010166607
)), row.names = c(NA, -90L), class = c("tbl_df", "tbl", "data.frame"
))

With the DT database, I want to create stratified sub samples based on the following data:

Ratios <- structure(list(State = c("A", "A", "A", "A", "A", "A", "A", "A", 
"A", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "C", "C", 
"C", "C", "C", "C", "C", "C"), Year = c(2005, 2005, 2005, 2006, 
2006, 2006, 2007, 2007, 2007, 2005, 2005, 2005, 2006, 2006, 2006, 
2007, 2007, 2007, 2005, 2005, 2005, 2006, 2006, 2006, 2007, 2007, 
2007), Type = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 
1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3), ratio = c(0.2, 0.2, 0.6, 
0.45, 0.2, 0.35, 0.1, 0.1, 0.8, 0.05, 0.5, 0.45, 0.3, 0.5, 0.2, 
0.3, 0.2, 0.5, 0.35, 0.45, 0.2, 0.35, 0.2, 0.45, 0.2, 0.2, 0.6
)), row.names = c(NA, -27L), class = c("tbl_df", "tbl", "data.frame"
))

Fig 1. Ratios strong text

I am trying to figure out how to approach this issue. In order to achieve the maximum possible sample size, I need to figure out the largest possible sub sample based on the ratios. Fig 1. Ratios shows the ratios for State A in 2005.

I first have to determine the ratios in DT:

# This is just to determine the ratio of `Type` in the data `DT`
DT_1 <- DT %>%  count(Year, Type) %>% group_by(Year)
DT_2 <- DT %>%  count(Year, Type) %>% group_by(Year) %>% mutate(n = n/sum(n))
DT_RAT <- cbind(DT_1 , DT_2)
setDT(DT_RAT)[,Year1:=NULL]
setDT(DT_RAT)[,Type1:=NULL]
rm(DT_1 , DT_2)

Now I should with that information create samples. I have to start with the biggest share (For State A, this is Type==1.) So for sample DT_A, half of the sample should be of Type = 1. From DT_RAT I know that this amounts to 13 observations of Type 1 (the max amount of type 1 observations). I would like to do something like:

DT_A <- DT[,.SD[sample(.N, min(13,.N))],by = Type==1]

But I am not allowed to do that. Can anyone help me to make this happen?

EDIT:

The desired result for DT_A (the sub-sample of DT for State A) is as follows.

  • 0.5 of DT_A in 2005 should consist of Type 1 (see Fig 1)
  • 0.45 of DT_A in 2005 should consist of Type 2
  • 0.05 of DT_A in 2005 should consist of Type 3

DT has 13 observation of Type 1 for 2005, so we take all of those. Because these 13 should be half of the sample. The total sample is 26 observations There are also 13 observation of Type 2 for 2005, but since only 0.45 should consist of Type 2 we take 12. Only 5 percent should be of Type 3, so that is the 1 observation remaining.

  • 13 = 0.5 of DT_A in 2005 should consist of Type 1 (see Fig 1)
  • 12 = 0.45 of DT_A in 2005 should consist of Type 2
  • 1 = 0.05 of DT_A in 2005 should consist of Type 3

Solution

  • I think this pipeline gives you what you need:

    library(tidyr)
    library(dplyr)
    library(purrr)
    
    DT %>% 
      count(Year, Type) %>% 
      right_join(Ratios, by = c("Year", "Type")) %>% 
      group_by(State, Year) %>% 
      mutate(sample_size = pmax(1, round(min(n / ratio) * ratio))) %>% 
      ungroup() %>% 
      select(State, Year, Type, sample_size) %>% 
      left_join(DT, by = c("Year","Type")) %>% 
      nest(data = -c(State, Year, Type, sample_size)) %>% 
      mutate(sample = map2(data, sample_size, ~slice_sample(.x, n = .y))) %>% 
      select(State, Year, Type, sample) %>% 
      arrange(State, Year, Type)
    

    Output

    # A tibble: 27 x 4
       State  Year  Type sample          
       <chr> <dbl> <dbl> <list>          
     1 A      2005     1 <tibble [1 x 3]>
     2 A      2005     2 <tibble [1 x 3]>
     3 A      2005     3 <tibble [4 x 3]>
     4 A      2006     1 <tibble [4 x 3]>
     5 A      2006     2 <tibble [2 x 3]>
     6 A      2006     3 <tibble [3 x 3]>
     7 A      2007     1 <tibble [1 x 3]>
     8 A      2007     2 <tibble [1 x 3]>
     9 A      2007     3 <tibble [5 x 3]>
    10 B      2005     1 <tibble [1 x 3]>
    # ... with 17 more rows
    

    You can further append to that pipeline a %>% unnest(sample), but I will leave it like this as you never specify your expected output. This format is also easier for me to explain to you the logic behind.

    The key is that you need to determine the proper sample size for each combination of State, Year, Type. The formula I used to determine the sample size is

    sample_size = pmax(1, round(min(n / ratio) * ratio))
    

    For example, let n = c(13, 13, 4), the same numbers as your example shows. You want to slice a sample based on ratio = c(0.5, 0.45, 0.05). Then, the max sample size you can have is min(n / ratio), i.e. min(c(26, 28.88889, 80)) = 26. Any number greater than this is invalid as you don't have enough observations to sample from (for Type 2 and 3 in this case). Then we use the total sample size times the ratio to compute the sample size for each Type. We have to round the result as only natural numbers are allowed; we also need at least one observation for each type. That's why you see that pmax(1, ...).

    For the example shown in your post, this formula returns

    > pmax(1, round(min(c(13, 13, 4) / c(0.5, 0.45, 0.05)) * c(0.5, 0.45, 0.05)))
    [1] 13 12  1
    

    With this setup, we can start the sampling procedure you want. We first

    > DT %>% 
          count(Year, Type)
    
    # A tibble: 9 x 3
       Year  Type     n
      <dbl> <dbl> <int>
    1  2005     1    13
    2  2005     2    13
    3  2005     3     4
    4  2006     1    16
    5  2006     2    11
    6  2006     3     3
    7  2007     1    11
    8  2007     2    14
    9  2007     3     5
    

    so that we know how many observations are there for each Year and Type. Then,

    > DT %>% 
        count(Year, Type) %>% 
        right_join(Ratios, by = c("Year", "Type")) %>% 
        group_by(State, Year) %>% 
        mutate(sample_size = pmax(1, round(min(n / ratio) * ratio)))
    
    # A tibble: 27 x 6
    # Groups:   State, Year [9]
        Year  Type     n State ratio sample_size
       <dbl> <dbl> <int> <chr> <dbl>       <dbl>
     1  2005     1    13 A      0.2            1
     2  2005     1    13 B      0.05           1
     3  2005     1    13 B      0.35           3
     4  2005     2    13 A      0.2            1
     5  2005     2    13 B      0.5            4
     6  2005     2    13 C      0.45           9
     7  2005     3     4 A      0.6            4
     8  2005     3     4 B      0.45           4
     9  2005     3     4 C      0.2            4
    10  2006     1    16 A      0.45           4
    # ... with 17 more rows
    

    so that we know the proper sample size for each Year, Type, and State. Next,

    DT %>% 
      count(Year, Type) %>% 
      right_join(Ratios, by = c("Year", "Type")) %>% 
      group_by(State, Year) %>% 
      mutate(sample_size = pmax(1, round(min(n / ratio) * ratio))) %>% 
      ungroup() %>% 
      select(State, Year, Type, sample_size) %>% 
      left_join(DT, by = c("Year","Type")) %>% 
      nest(data = -c(State, Year, Type, sample_size))
    
    # A tibble: 27 x 5
       State  Year  Type sample_size data             
       <chr> <dbl> <dbl>       <dbl> <list>           
     1 A      2005     1           1 <tibble [13 x 3]>
     2 B      2005     1           1 <tibble [13 x 3]>
     3 B      2005     1           3 <tibble [13 x 3]>
     4 A      2005     2           1 <tibble [13 x 3]>
     5 B      2005     2           4 <tibble [13 x 3]>
     6 C      2005     2           9 <tibble [13 x 3]>
     7 A      2005     3           4 <tibble [4 x 3]> 
     8 B      2005     3           4 <tibble [4 x 3]> 
     9 C      2005     3           4 <tibble [4 x 3]> 
    10 A      2006     1           4 <tibble [16 x 3]>
    # ... with 17 more rows
    

    so that the sample size and data are matched for each Year and Type. Last, we take a sample of the data for each group using the slice_sample function, select the columns we need, and then arrange the data for display.

    DT %>% 
      count(Year, Type) %>% 
      right_join(Ratios, by = c("Year", "Type")) %>% 
      group_by(State, Year) %>% 
      mutate(sample_size = pmax(1, round(min(n / ratio) * ratio))) %>% 
      ungroup() %>% 
      select(State, Year, Type, sample_size) %>% 
      left_join(DT, by = c("Year","Type")) %>% 
      nest(data = -c(State, Year, Type, sample_size)) %>% 
      mutate(sample = map2(data, sample_size, ~slice_sample(.x, n = .y))) %>% 
      select(State, Year, Type, sample) %>% 
      arrange(State, Year, Type)
    

    The output of this pipeline is shown at the beginning of this post. I think you can also achieve the same result in a data.table way. However, I will leave it like this as this pipeline is better for illustration purpose.