Search code examples
rdplyrapplyglm

r nested glm with two subgroups


I am trying to run a simple glm model like this.

         library(dplyr)
         library(purrr)
         library(tidyr)
         library(broom)

         data("mtcars")
         head(mtcars)
         
         mtcars$Name <- row.names(mtcars)
         row.names(mtcars) <- NULL

         glm(mpg ~ wt, data=mtcars)

No issues so far.

Next I am trying to run this model on every subgroup of gear i.e gear=3, gear=4, gear=5 so I am running my glm model within a dlply function like this.


Model1 <- plyr::dlply(mtcars, "gear",
            function(x)
              tryCatch(
                glm(mpg ~ wt,
                    data =x ),
                error = function(e) NA), .drop = TRUE)

SummaryCars <- map2_df(Model1,
                        names(Model1),
                          ~broom::tidy(.x, confint = TRUE)[2,] %>%
                              mutate(gear = .y))

Now I have a third subgroup carb. This variable has 6 levels

table(mtcars$carb)

1  2  3  4  6  8 
7 10  3 10  1  1 

Exclude carb levels 6 and 8. I like to run my model on carb levels 1,2,3,&4. For each level of gear. But I like to exclude one level of carb during each iteration.

Model1 - Carb Level 1,2,3 (Exclude data from carb= 4)

 ```
 Model1 <- plyr::dlply(mtcars, "gear",
        function(x)
          tryCatch(
            glm(mpg ~ wt,
                data =x ),
            error = function(e) NA), .drop = TRUE)

  SummaryCars <- map2_df(Model1,
                    names(Model1),
                      ~broom::tidy(.x, confint = TRUE)[2,] %>%
                          mutate(gear = .y))
   ```

**Model2 ** - Carb Level 1,2,4 (Exclude data from carb= 3)

 ```
 Model1 <- plyr::dlply(mtcars, "gear",
        function(x)
          tryCatch(
            glm(mpg ~ wt,
                data =x ),
            error = function(e) NA), .drop = TRUE)

  SummaryCars <- map2_df(Model1,
                    names(Model1),
                      ~broom::tidy(.x, confint = TRUE)[2,] %>%
                          mutate(gear = .y))
   ```

**Model3 ** - Carb Level 1,3,4 (Exclude data from carb= 2)

 ```
 Model1 <- plyr::dlply(mtcars, "gear",
        function(x)
          tryCatch(
            glm(mpg ~ wt,
                data =x ),
            error = function(e) NA), .drop = TRUE)

  SummaryCars <- map2_df(Model1,
                    names(Model1),
                      ~broom::tidy(.x, confint = TRUE)[2,] %>%
                          mutate(gear = .y))
   ```

**Mode4l ** - Carb Level 2,3,4 (Exclude data from carb= 1)

 ```
 Model1 <- plyr::dlply(mtcars, "gear",
        function(x)
          tryCatch(
            glm(mpg ~ wt,
                data =x ),
            error = function(e) NA), .drop = TRUE)

  SummaryCars <- map2_df(Model1,
                    names(Model1),
                      ~broom::tidy(.x, confint = TRUE)[2,] %>%
                          mutate(gear = .y))
   ```

As you can see I can run the model within each levels of Gear (3,4,5) but I am not sure how to add another loop on top of this where data from one level is exclude and rest are considered.

Expected Final Results

         Model            SubModel     Estimate    Lower(CI)      Upper(CI)    stdError   p
         Exclude carb= 4  Gear = 3     xxx         xxxx           xxxx         xxx        x
         Exclude carb= 4  Gear = 4     xxx         xxxx           xxxx         xxx        x
         Exclude carb= 4  Gear = 5     xxx         xxxx           xxxx         xxx        x

         Exclude carb= 3  Gear = 3     xxx         xxxx           xxxx         xxx        x
         Exclude carb= 3  Gear = 4     xxx         xxxx           xxxx         xxx        x
         Exclude carb= 3  Gear = 5     xxx         xxxx           xxxx         xxx        x

         Exclude carb= 2  Gear = 3     xxx         xxxx           xxxx         xxx        x
         Exclude carb= 2  Gear = 4     xxx         xxxx           xxxx         xxx        x
         Exclude carb= 2  Gear = 5     xxx         xxxx           xxxx         xxx        x

         Exclude carb= 1  Gear = 3     xxx         xxxx           xxxx         xxx        x
         Exclude carb= 1  Gear = 4     xxx         xxxx           xxxx         xxx        x
         Exclude carb= 1  Gear = 5     xxx         xxxx           xxxx         xxx        x

Any help is much appreciated. Thanks in advance.

I have included the code in my question showing what I have tried.


Solution

  • This solution uses nested purrr::map_dfr() calls around broom::tidy(lm()).

    Edit: OP asked for a solution that handles subsets with no valid data. I now do that by wrapping the tidy(lm()) call in purrr::possibly(). The output will include a single row of NA model parameters for invalid models.

    library(dplyr)
    library(purrr)
    library(broom)
    
    # rmv carb == 6 or 8, add test case with all missing data, then split by gear
    mtc_gears <- filter(mtcars, carb <= 4) %>% 
      mutate(wt = if_else(gear == 4 & carb != 4, NA_real_, wt))
    mtc_gears <- split(mtc_gears, mtc_gears$gear)
    
    safe_tidy_lm <- possibly(
      function(formula, data) tidy(lm(formula, data = data), conf.int = TRUE),
      tibble(.rows = 1)
    )
    
    map_dfr(
      1:4, 
      \(carb_drop) map_dfr(
          mtc_gears, 
          \(mtc_gear) safe_tidy_lm(
            mpg ~ wt, 
            data = filter(mtc_gear, carb != carb_drop)
          ), 
          .id = "Gear"
        ), 
      .id = "Exclude Carb"
    )
    
    #> Warning in qt(a, object$df.residual): NaNs produced
    #> Warning in qt(a, object$df.residual): NaNs produced
    #> # A tibble: 23 × 9
    #>    `Exclude Carb` Gear  term     estim…¹ std.e…² stati…³ p.value conf.…⁴ conf.…⁵
    #>    <chr>          <chr> <chr>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
    #>  1 1              3     (Interc…   25.1    3.51     7.14 3.15e-5   17.2   32.9  
    #>  2 1              3     wt         -2.44   0.842   -2.90 1.59e-2   -4.32  -0.563
    #>  3 1              4     (Interc…   30.2    3.61     8.37 1.40e-2   14.7   45.7  
    #>  4 1              4     wt         -3.38   1.16    -2.92 1.00e-1   -8.37   1.61 
    #>  5 1              5     (Interc…   44.4    1.82    24.3  2.62e-2   21.2   67.5  
    #>  6 1              5     wt         -8.92   0.769  -11.6  5.47e-2  -18.7    0.846
    #>  7 2              3     (Interc…   28.9    3.01     9.58 5.11e-6   22.1   35.7  
    #>  8 2              3     wt         -3.28   0.733   -4.47 1.55e-3   -4.93  -1.62 
    #>  9 2              4     (Interc…   30.2    3.61     8.37 1.40e-2   14.7   45.7  
    #> 10 2              4     wt         -3.38   1.16    -2.92 1.00e-1   -8.37   1.61 
    #> # … with 13 more rows, and abbreviated variable names ¹​estimate, ²​std.error,
    #> #   ³​statistic, ⁴​conf.low, ⁵​conf.high
    

    Created on 2022-11-01 with reprex v2.0.2