Search code examples
r · simulation · missing-data · data-manipulation · imputation

Simulate data and randomly add missing values to dataframe


How can I randomly add missing values to some or all columns of a simulated data frame (say, roughly 5% missing at random in each column)? Also, is there a more efficient way of simulating a data frame with both continuous and factor columns?

 # Simulate some data: N rows with continuous measurements and categorical codes.
 # Every column is sized from N, so changing N is the only edit needed to
 # simulate a larger or smaller data set.
 N <- 2000
 data <- data.frame(
   id = seq_len(N),
   # rnorm(N, mean_vector) draws with sd = 1; the mean vector is recycled
   # (or truncated) to length N, so values stay close to the listed ranges.
   age = rnorm(N, 18:90),
   bmi = rnorm(N, 15:40),
   chol = rnorm(N, 50:350),
   insulin = rnorm(N, 2:40),
   sbp = rnorm(N, 50:200),
   dbp = rnorm(N, 30:150),
   # first half coded 1, second half coded 2
   sex = sort(rep_len(c(1, 2), N)),
   # alternating 1, 2, 1, 2, ...
   smoke = rep_len(c(1, 2), N),
   # one independent random draw per row; sample(LETTERS[1:4]) alone returns
   # only four values, which data.frame() would recycle into a repeating
   # pattern (and reject when N is not a multiple of 4)
   educ = sample(LETTERS[1:4], N, replace = TRUE)
 )


 # Manually blank out values that fall outside hand-picked thresholds.
 # mutate() evaluates its arguments in order, so the `educ` rule sees `bmi`
 # after its own rule has run; rows whose bmi is already NA produce an NA
 # condition, which selects nothing for replacement.
 data <- mutate(
   data,
   age = replace(age, age < 19 | age > 88, NA),
   bmi = replace(bmi, bmi > 38 | bmi < 16, NA),
   insulin = replace(insulin, insulin > 38, NA),
   educ = replace(educ, bmi > 35, NA)
 )

Solution

  • Here's a tidyverse approach that will remove roughly 20% of your data for each column you specify:

    library(tidyverse)

    set.seed(1)

    # example data: rnorm() uses the first N entries of each mean vector, sd = 1
    N <- 20
    data <- data.frame(id = seq_len(N),
                       age = rnorm(N, 18:90),
                       bmi = rnorm(N, 15:40),
                       chol = rnorm(N, 50:350))

    # columns that should receive missing values, and the expected share of
    # values to blank out in each of them
    c_names <- c("age", "bmi")
    prc_missing <- 0.20

    # Work column by column instead of reshaping with gather()/spread():
    # stacking every column into one `value` column forces mixed column types
    # (e.g. numeric and character) into a single common type.
    # Each selected column gets one uniform draw per row; a value becomes NA
    # when its draw is <= prc_missing, so roughly (not exactly) that share of
    # the column is lost. Columns not listed in c_names are left untouched.
    data %>%
      mutate(across(all_of(c_names),
                    function(x) replace(x, runif(length(x)) <= prc_missing, NA)))
    
    #    id      age      bmi     chol
    # 1   1 17.37355 15.91898 49.83548
    # 2   2 19.18364 16.78214 50.74664
    # 3   3 19.16437 17.07456 52.69696
    # 4   4       NA 16.01065 53.55666
    # 5   5 22.32951 19.61983 53.31124
    # 6   6 22.17953 19.94387 54.29250
    # 7   7 24.48743       NA 56.36458
    # 8   8 25.73832 20.52925 57.76853
    # 9   9 26.57578       NA 57.88765
    # 10 10 26.69461 24.41794 59.88111
    # 11 11 29.51178 26.35868 60.39811
    # 12 12       NA 25.89721 60.38797
    # 13 13       NA 27.38767 62.34112
    # 14 14 28.78530 27.94619 61.87064
    # 15 15 33.12493 27.62294 65.43302
    # 16 16 32.95507       NA 66.98040
    # 17 17 33.98381 30.60571 65.63278
    # 18 18 35.94384       NA 65.95587
    # 19 19 36.82122 34.10003 68.56972
    # 20 20 37.59390 34.76318 68.86495
    

    And this is an alternative that will remove exactly 20% of data for the columns you specify:

    library(tidyverse)

    set.seed(1)

    # example data: rnorm() uses the first N entries of each mean vector, sd = 1
    N <- 20
    data <- data.frame(id = seq_len(N),
                       age = rnorm(N, 18:90),
                       bmi = rnorm(N, 15:40),
                       chol = rnorm(N, 50:350))

    # columns that should receive missing values and the share to remove
    c_names <- c("age", "bmi")
    prc_missing <- 0.20
    # Number of values to blank out per column. round() guards against
    # fractional or floating-point-imprecise products (0.29 * 100 evaluates to
    # slightly less than 29), which would otherwise remove one value too few.
    n_remove <- round(prc_missing * nrow(data))

    # NOTE(review): gather()/spread() are superseded in tidyr by
    # pivot_longer()/pivot_wider(); switching changes the long-format row order
    # and therefore which cells the shuffle below selects.
    data %>%
      gather(var, value, -id) %>%   # long format: one row per (id, var) cell
      sample_frac(1) %>%            # shuffle all rows
      group_by(var) %>%             # number the rows within each variable
      mutate(value = ifelse(var %in% c_names & row_number() <= n_remove,
                            NA, value)) %>%  # blank the first n_remove shuffled rows of each selected variable
      ungroup() %>%                 # drop grouping before reshaping
      spread(var, value)            # back to one row per id
    
    # # A tibble: 20 x 4
    #      id   age   bmi  chol
    #   <int> <dbl> <dbl> <dbl>
    # 1     1  17.4  15.9  49.8
    # 2     2  19.2  16.8  50.7
    # 3     3  19.2  17.1  52.7
    # 4     4  NA    16.0  53.6
    # 5     5  22.3  NA    53.3
    # 6     6  22.2  19.9  54.3
    # 7     7  24.5  20.8  56.4
    # 8     8  25.7  NA    57.8
    # 9     9  26.6  NA    57.9
    # 10    10  NA    NA    59.9
    # 11    11  NA    26.4  60.4
    # 12    12  NA    25.9  60.4
    # 13    13  29.4  27.4  62.3
    # 14    14  28.8  27.9  61.9
    # 15    15  33.1  27.6  65.4
    # 16    16  33.0  29.6  67.0
    # 17    17  34.0  30.6  65.6
    # 18    18  35.9  31.9  66.0
    # 19    19  36.8  34.1  68.6
    # 20    20  37.6  34.8  68.9