Search code examples
rpurrr

How to save multiple steps in map function


I am trying to create multiple samples, fit a model on each sample, and print the results.

dat <- data.frame(
  x=rnorm(20, mean=5, sd=1),
  y=rnorm(20, mean=6, sd=1),
  weight = rnorm(20, mean=1, sd=0.2)
)

f <- function(data, var.x, var.y, n) {
  # select sample
  data_subset <- data %>% sample_n(n, weight = weight, replace = T) %>% select(all_of(var.x), all_of(var.y)) %>% drop_na()
  # fit model
  model <- lm(unlist(data_subset %>% select(y)) ~ unlist(data_subset %>% select(x))) 
  print(model)
}

f(data=dat, var.x="x", var.y="y", n=20)

Is there a way to save the dataset within a map function and fit the model? Here is an attempt:

    f <- function(data, var.x, var.y, n) {
map(seq_len(20),
      # select sample
      data_subset <- data %>% sample_n(n, weight = weight, replace = T) %>% select(all_of(var.x), all_of(var.y)) %>% drop_na()
      # fit model
      model <- lm(unlist(data_subset %>% select(y)) ~ unlist(data_subset %>% select(x))) 
      print(model)
)
    }

Solution

  • Sure,

    library(dplyr)
    library(purrr) # map
    set.seed(42)
    dat <- data.frame( x=rnorm(20, mean=5, sd=1), y=rnorm(20, mean=6, sd=1), weight = rnorm(20, mean=1, sd=0.2) )
    head(dat,3)
    #          x        y    weight
    # 1 6.370958 5.693361 1.0411997
    # 2 4.435302 4.218692 0.9277885
    # 3 5.363128 5.828083 1.1516326
    out <- tibble(run = 1:20) %>%
      mutate(
        data = map(run, ~ sample_n(dat, size = 5, weight = weight, replace = TRUE)),
        mdl = map(data, ~ lm(y ~ x, data = .x)),
        smry = map(mdl, ~ summary(.x))
      )
    

    At this point, three columns of out are list-columns,

    out
    # # A tibble: 20 × 4
    #      run data         mdl    smry      
    #    <int> <list>       <list> <list>    
    #  1     1 <df [5 × 3]> <lm>   <smmry.lm>
    #  2     2 <df [5 × 3]> <lm>   <smmry.lm>
    #  3     3 <df [5 × 3]> <lm>   <smmry.lm>
    #  4     4 <df [5 × 3]> <lm>   <smmry.lm>
    #  5     5 <df [5 × 3]> <lm>   <smmry.lm>
    #  6     6 <df [5 × 3]> <lm>   <smmry.lm>
    #  7     7 <df [5 × 3]> <lm>   <smmry.lm>
    #  8     8 <df [5 × 3]> <lm>   <smmry.lm>
    #  9     9 <df [5 × 3]> <lm>   <smmry.lm>
    # 10    10 <df [5 × 3]> <lm>   <smmry.lm>
    # 11    11 <df [5 × 3]> <lm>   <smmry.lm>
    # 12    12 <df [5 × 3]> <lm>   <smmry.lm>
    # 13    13 <df [5 × 3]> <lm>   <smmry.lm>
    # 14    14 <df [5 × 3]> <lm>   <smmry.lm>
    # 15    15 <df [5 × 3]> <lm>   <smmry.lm>
    # 16    16 <df [5 × 3]> <lm>   <smmry.lm>
    # 17    17 <df [5 × 3]> <lm>   <smmry.lm>
    # 18    18 <df [5 × 3]> <lm>   <smmry.lm>
    # 19    19 <df [5 × 3]> <lm>   <smmry.lm>
    # 20    20 <df [5 × 3]> <lm>   <smmry.lm>
    

    and we can access any of the individual elements using [[:

    out$data[[1]]
    #           x        y    weight
    # 1  4.721211 5.391074 1.1285799
    # 2  4.893875 5.569531 1.0865636
    # 3  6.370958 5.693361 1.0411997
    # 4  6.370958 5.693361 1.0411997
    # 5  4.866679 6.504955 1.0179521
    # 6  4.893875 5.569531 1.0865636
    # 7  6.511522 5.742731 0.8377214
    # 8  5.404268 7.895193 0.7263438
    # 9  4.715747 5.215541 1.1358578
    # 10 4.866679 6.504955 1.0179521
    # 11 6.511522 5.742731 0.8377214
    # 12 6.370958 5.693361 1.0411997
    # 13 2.343545 5.149092 1.0179666
    # 14 6.511522 5.742731 0.8377214
    # 15 5.632863 7.214675 0.8546590
    # 16 6.370958 5.693361 1.0411997
    # 17 5.632863 7.214675 0.8546590
    # 18 4.905341 4.236837 1.2888203
    # 19 4.435302 4.218692 0.9277885
    # 20 2.343545 5.149092 1.0179666
    
    out$data[1:2]
    # [[1]]
    #          x        y   weight
    # 1 4.721211 5.391074 1.128580
    # 2 4.893875 5.569531 1.086564
    # 3 6.370958 5.693361 1.041200
    # 4 6.370958 5.693361 1.041200
    # 5 4.866679 6.504955 1.017952
    # [[2]]
    #          x        y    weight
    # 1 4.893875 5.569531 1.0865636
    # 2 6.511522 5.742731 0.8377214
    # 3 5.404268 7.895193 0.7263438
    # 4 4.715747 5.215541 1.1358578
    # 5 4.866679 6.504955 1.0179521
    out$mdl[1:2]
    # [[1]]
    # Call:
    # lm(formula = y ~ x, data = .x)
    # Coefficients:
    # (Intercept)            x  
    #     6.11533     -0.06334  
    # [[2]]
    # Call:
    # lm(formula = y ~ x, data = .x)
    # Coefficients:
    # (Intercept)            x  
    #      5.4066       0.1476  
    out$smry[1:2]
    # [[1]]
    # Call:
    # lm(formula = y ~ x, data = .x)
    # Residuals:
    #        1        2        3        4        5 
    # -0.42521 -0.23582 -0.01843 -0.01843  0.69788 
    # Coefficients:
    #             Estimate Std. Error t value Pr(>|t|)  
    # (Intercept)  6.11533    1.59236   3.840   0.0311 *
    # x           -0.06334    0.28966  -0.219   0.8409  
    # ---
    # Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    # Residual standard error: 0.4913 on 3 degrees of freedom
    # Multiple R-squared:  0.01569, Adjusted R-squared:  -0.3124 
    # F-statistic: 0.04782 on 1 and 3 DF,  p-value: 0.8409
    # [[2]]
    # Call:
    # lm(formula = y ~ x, data = .x)
    # Residuals:
    #       1       2       3       4       5 
    # -0.5593 -0.6248  1.6910 -0.8870  0.3801 
    # Coefficients:
    #             Estimate Std. Error t value Pr(>|t|)
    # (Intercept)   5.4066     4.4193   1.223    0.309
    # x             0.1476     0.8308   0.178    0.870
    # Residual standard error: 1.224 on 3 degrees of freedom
    # Multiple R-squared:  0.01041, Adjusted R-squared:  -0.3195 
    # F-statistic: 0.03155 on 1 and 3 DF,  p-value: 0.8703
    

    If you need to get more "complicated" than this (or if you want to anyway ...), look into the broom package and friends.

    Note:

    • I didn't functionize this, but it should be relatively simple to do that, my point here is to demonstrate doing things in steps, where the first such step is to store a list-column of "just data";