Search code examples
Tags: r, random, while-loop, tidyverse, matching

Using for loops, while, tidyverse, or packages to create a dataset with matching characteristics from a previous one (sampling)


I'm working with panel data. We assessed children in 2019 and 2020, so I have two datasets (2019 and 2020). I want to create a third dataset by selecting participants from the second dataset (2020) whose characteristics match those of the first dataset (2019). This third dataset will have fewer participants, but they'll share the same characteristics as their "peers" from 2019. So the proportion of boys and girls will be about the same as in 2019, the mother's age will be about the same, etc.

Example: enter image description here

Code:

# 2019 assessment sample: 50 rows, one per record.
# Columns: asqse_quest (constant 24), year_completed_cat (factor, every row
# at level "19"), sex_male (0/1 indicator), momage, momed, income.
# NOTE(review): momed and income look like ordinal category codes -- confirm
# against the study codebook.
df_2019 <- data.frame(
  asqse_quest = rep(24, 50),
  year_completed_cat = factor(
    rep("19", 50),
    levels = c("18", "19", "20", "21", "22", "23", "24")
  ),
  sex_male = c(
    1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
    1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
    1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
    1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
    1, 0, 1, 0, 1, 1, 1, 0, 1, 0
  ),
  momage = c(
    36, 39, 22, 20, 29, 40, 31, 37, 29, 38,
    24, 35, 32, 30, 32, 31, 29, 21, 28, 29,
    40, 21, 38, 29, 28, 33, 25, 25, 30, 29,
    25, 27, 28, 31, 24, 28, 35, 29, 17, 35,
    32, 29, 27, 24, 29, 25, 28, 24, 21, 26
  ),
  momed = c(
    4, 4, 2, 2, 4, 3, 2, 3, 2, 4,
    3, 4, 4, 4, 4, 4, 3, 4, 3, 4,
    4, 2, 2, 4, 4, 4, 4, 4, 4, 4,
    2, 4, 3, 3, 3, 3, 4, 4, 2, 4,
    4, 3, 2, 2, 3, 4, 4, 3, 2, 4
  ),
  income = c(
    4, 4, 2, 3, 4, 1, 2, 5, 4, 4,
    5, 4, 4, 4, 4, 4, 4, 2, 3, 3,
    4, 2, 3, 4, 4, 4, 5, 4, 3, 3,
    4, 4, 3, 4, 1, 4, 2, 4, 3, 4,
    4, 3, 4, 3, 4, 4, 4, 3, 4, 4
  )
)


# 2020 assessment sample: 50 rows, same columns as the 2019 table.
# Columns: asqse_quest (constant 24), year_completed_cat (factor, every row
# at level "20"), sex_male (0/1 indicator), momage, momed, income.
# NOTE(review): momed and income look like ordinal category codes -- confirm
# against the study codebook.
df_2020 <- data.frame(
  asqse_quest = rep(24, 50),
  year_completed_cat = factor(
    rep("20", 50),
    levels = c("18", "19", "20", "21", "22", "23", "24")
  ),
  sex_male = c(
    1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
    0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
    1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
    1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 0, 1, 1, 0, 1, 1, 1
  ),
  momage = c(
    23, 26, 33, 34, 29, 26, 23, 29, 40, 36,
    33, 18, 31, 31, 31, 32, 34, 35, 29, 37,
    19, 30, 33, 25, 32, 35, 37, 27, 23, 29,
    28, 26, 30, 27, 38, 28, 29, 39, 26, 25,
    29, 39, 35, 32, 20, 38, 31, 27, 28, 23
  ),
  momed = c(
    2, 4, 4, 3, 4, 3, 2, 2, 3, 4,
    1, 2, 2, 4, 4, 4, 4, 2, 4, 4,
    2, 4, 4, 4, 2, 4, 4, 2, 4, 2,
    1, 4, 3, 2, 4, 4, 4, 2, 4, 2,
    4, 4, 4, 4, 2, 4, 4, 4, 4, 1
  ),
  income = c(
    2, 4, 4, 4, 4, 5, 3, 2, 2, 4,
    1, 3, 4, 5, 1, 4, 3, 1, 4, 5,
    5, 4, 4, 4, 3, 4, 4, 2, 4, 5,
    1, 4, 4, 1, 4, 4, 4, 4, 3, 4,
    4, 4, 5, 4, 2, 4, 4, 4, 4, 4
  )
)

Created on 2024-07-12 with reprex v2.1.0


Solution

  • You can try the MatchIt package, which has a function that performs propensity score matching.

    We first combine the two datasets with bind_rows, assigning an id to distinguish the two datasets:

    library(dplyr)

    # Stack the two tables; `.id = "year"` tags each row with its source
    # ("1" for df_2019, "2" for df_2020). Then recode that tag to a 0/1
    # indicator: 1 = 2019 (treated), 0 = 2020 (controls).
    data <- mutate(
      bind_rows(df_2019, df_2020, .id = "year"),
      year = as.integer(year == "1")
    )
    

    Rows corresponding to year==1 are your treated (from 2019 data) and year==0 corresponds to your controls (from 2020 data).

    To find controls that match as closely to the treated as possible, we can use the matchit function. There are a number of arguments, and for brevity, we'll just use the defaults.

    library(MatchIt)
    

    We'll first try to match exactly on years completed, sex, and mother's age and see if we have any luck.

    # First attempt: pair each 2019 row with a 2020 row, requiring identical
    # years completed, sex, and mother's age within a pair. Controls are not
    # reused (replace = FALSE).
    match_obj <- matchit(
      year ~ asqse_quest + year_completed_cat + sex_male +
        momage + momed + income,
      data = data,
      exact = ~ year_completed_cat + sex_male + momage,
      replace = FALSE
    )
    

    #Error in `matchit()`:
    #! No matches were found.
    

    No surprise, because the two datasets don't match at all on years completed. Let's make our matching condition less strict:

    # Relaxed attempt: exact matching on sex and mother's age only; years
    # completed stays in the model formula but is no longer required to be
    # identical. Controls are not reused (replace = FALSE).
    match_obj <- matchit(
      year ~ asqse_quest + year_completed_cat + sex_male +
        momage + momed + income,
      data = data,
      exact = ~ sex_male + momage,
      replace = FALSE
    )
    

    There's no error this time, but we get a warning:

    #Warning message:
    #Fewer control units than treated units in some `exact` strata; not all treated units will get a match. 
    

    That's ok. Now summarise the results.

    # Print the matching summary; its "Sample Sizes" table reports how many
    # control and treated units were matched and how many were left unmatched.
    summary(match_obj)
    ...
    Sample Sizes:
              Control Treated
    All            50      50
    Matched        25      25
    Unmatched      25      25
    Discarded       0       0
    

    The output says that we found 25 controls from the original 50. Other useful information is given, but I've omitted it here for brevity. Now use match.data to get the matches, together with the original treated.

    # Extract the matched rows from the matchit object; this includes both the
    # treated (2019) rows and their matched controls (2020).
    matched_data <- match.data(match_obj)
    

    Now we simply filter out the treated and we are left with the matching controls:

    # Drop the treated (2019) rows; what remains are the matched 2020 controls.
    df_2020_new <- matched_data |>
      filter(year == 0)
    head(df_2020_new)
    

       asqse_quest year_completed_cat sex_male momage momed income
    1           24                 20        1     23     2      2
    2           24                 20        1     26     4      4
    3           24                 20        1     33     4      4
    4           24                 20        1     34     3      4
    5           24                 20        0     29     4      4
    6           24                 20        1     26     3      5
    7           24                 20        0     23     2      3
    8           24                 20        1     29     2      2
    9           24                 20        0     40     3      2
    10          24                 20        1     36     4      4
    

    Check the help page for matchit to see how you can modify the methods for matching. There's too much detail to give here, but this is the basic idea.