r simulation missing-data data-manipulation imputation

Simulate data and randomly add missing values to dataframe

How can I randomly add missing values to some or all columns (say, roughly 5% missing in each) of a simulated data frame? Also, is there a more efficient way of simulating a data frame with both continuous and factor columns?

 # Simulate some data: N rows holding an id, six continuous measurements and
 # three categorical columns. Every column length is derived from N, so
 # changing N alone resizes the whole data frame.
 N <- 2000
 data <- data.frame(
   id = seq_len(N),
   # rnorm() recycles the vector given as `mean`, so the draws cycle through
   # the means 18, 19, ..., 90 (default sd = 1); the remaining continuous
   # columns work the same way with their own ranges of means.
   age = rnorm(N, 18:90),
   bmi = rnorm(N, 15:40),
   chol = rnorm(N, 50:350),
   insulin = rnorm(N, 2:40),
   sbp = rnorm(N, 50:200),
   dbp = rnorm(N, 30:150),
   # first half of the rows coded 1, second half coded 2
   sex = rep(c(1, 2), each = ceiling(N / 2), length.out = N),
   # alternating 1, 2, 1, 2, ...
   smoke = rep_len(c(1, 2), N),
   # a single random ordering of A-D, repeated down the rows
   educ = rep_len(sample(LETTERS[1:4]), N)
 )


 # Blank out values by rule rather than at random
 data <- mutate(
   data,
   # ages outside the 19-88 range become NA
   age = replace(age, age > 88 | age < 19, NA),
   # BMI values outside the 16-38 range become NA
   bmi = replace(bmi, bmi < 16 | bmi > 38, NA),
   # insulin values above 38 become NA
   insulin = replace(insulin, insulin > 38, NA),
   # education becomes NA where BMI exceeds 35; mutate() evaluates its
   # arguments in order, so `bmi` here is the column as updated above
   educ = replace(educ, bmi > 35, NA)
 )

Solution

Here's a tidyverse approach that sets roughly 20% of the values to NA in each column you specify (set `prc_missing` to 0.05 for the ~5% asked about in the question):

set.seed(1)

# example data
N <- 20
data <- data.frame(
  id = 1:N,
  age = rnorm(N, 18:90),
  bmi = rnorm(N, 15:40),
  chol = rnorm(N, 50:350)
)

library(tidyverse)

# columns that should receive missing values, and the share of values to blank
c_names <- c("age", "bmi")
prc_missing <- 0.20

# long format: one row per (id, variable) pair
long_data <- gather(data, var, value, -id)

# one uniform draw between 0 and 1 for every long-format row
long_data$r <- runif(nrow(long_data))

# a value is blanked when its variable was selected above and its draw falls
# at or below the threshold
to_blank <- long_data$var %in% c_names & long_data$r <= prc_missing
long_data$value <- replace(long_data$value, to_blank, NA)

# drop the helper draws and reshape back to one column per variable
spread(select(long_data, -r), var, value)

#    id      age      bmi     chol
# 1   1 17.37355 15.91898 49.83548
# 2   2 19.18364 16.78214 50.74664
# 3   3 19.16437 17.07456 52.69696
# 4   4       NA 16.01065 53.55666
# 5   5 22.32951 19.61983 53.31124
# 6   6 22.17953 19.94387 54.29250
# 7   7 24.48743       NA 56.36458
# 8   8 25.73832 20.52925 57.76853
# 9   9 26.57578       NA 57.88765
# 10 10 26.69461 24.41794 59.88111
# 11 11 29.51178 26.35868 60.39811
# 12 12       NA 25.89721 60.38797
# 13 13       NA 27.38767 62.34112
# 14 14 28.78530 27.94619 61.87064
# 15 15 33.12493 27.62294 65.43302
# 16 16 32.95507       NA 66.98040
# 17 17 33.98381 30.60571 65.63278
# 18 18 35.94384       NA 65.95587
# 19 19 36.82122 34.10003 68.56972
# 20 20 37.59390 34.76318 68.86495

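And this is an alternative that sets exactly 20% of the values to NA in each column you specify (exactly, provided `prc_missing * nrow(data)` is a whole number; otherwise the count is rounded down):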

set.seed(1)

# example data
N <- 20
data <- data.frame(id = 1:N,
                   age = rnorm(N, 18:90),
                   bmi = rnorm(N, 15:40),
                   chol = rnorm(N, 50:350))

library(tidyverse)

# specify which variables should have missing data and prc of missing data
c_names <- c("age", "bmi")
prc_missing <- 0.20
# number of values to blank per selected column; a fractional result is
# effectively rounded down by the `<=` comparison against row_number() below
n_remove <- prc_missing * nrow(data)

data %>%
  gather(var, value, -id) %>%   # reshape data to one row per (id, variable)
  sample_frac(1) %>%            # shuffle rows
  group_by(var) %>%             # for each variable
  mutate(value = ifelse(var %in% c_names & row_number() <= n_remove, NA, value)) %>%  # set the first n_remove (shuffled) rows of each selected variable to NA
  ungroup() %>%                 # drop the grouping before reshaping
  spread(var, value)            # reshape to original format

# # A tibble: 20 x 4
#      id   age   bmi  chol
#   <int> <dbl> <dbl> <dbl>
# 1     1  17.4  15.9  49.8
# 2     2  19.2  16.8  50.7
# 3     3  19.2  17.1  52.7
# 4     4  NA    16.0  53.6
# 5     5  22.3  NA    53.3
# 6     6  22.2  19.9  54.3
# 7     7  24.5  20.8  56.4
# 8     8  25.7  NA    57.8
# 9     9  26.6  NA    57.9
# 10    10  NA    NA    59.9
# 11    11  NA    26.4  60.4
# 12    12  NA    25.9  60.4
# 13    13  29.4  27.4  62.3
# 14    14  28.8  27.9  61.9
# 15    15  33.1  27.6  65.4
# 16    16  33.0  29.6  67.0
# 17    17  34.0  30.6  65.6
# 18    18  35.9  31.9  66.0
# 19    19  36.8  34.1  68.6
# 20    20  37.6  34.8  68.9