Search code examples
rdplyrnon-standard-evaluationr-marginaleffects

Obsolete Data Mask to late to resolve


Hi all I had a question about non-standard evaluation. I fit a few models with different outcome variables and compute the marginal effects like this.

library(palmerpenguins)
library(marginaleffects)
library(sandwich)
library(tidyr)
library(dplyr)


long_pengs = penguins |>
  pivot_longer(cols = c(body_mass_g, flipper_length_mm),
               names_to = 'outcome',
              values_to = 'vals') |>
  drop_na(sex) |>
  summarise(mods = list(lm(vals ~ sex * bill_length_mm, data = pick(everything()))), .by = outcome)


comps = long_pengs |>
  rowwise(outcome) |>
  reframe(avg_comparisons(mods,
                          variables = 'sex',
                          subset(sex == 'female')))

However when I try to cluster bootstrap the standard errors I run into these error message and am wondering how to resolve this issue. I am not wedded to doing this with non-standard evaluation.


# works 
long_pengs |> 
  rowwise(outcome) |>
  reframe(avg_comparisons(mods,
    variables = 'sex',
    subset(sex == 'female')) |>
      inferences(method = 'rsample')) 

long_pengs |>
rowwise(outcome) |>
reframe(avg_comparisons(mods,
                        variables = 'sex',
                        subset(sex == 'female'),
                        vcov = vcovBS(mods, cluster = ~species)))
#> Error in `reframe()`:
#> ℹ In argument: `avg_comparisons(...)`.
#> ℹ In row 1.
#> Caused by error:
#> ! Obsolete data mask.
#> ✖ Too late to resolve `species` after the end of `dplyr::summarise()`.
#> ℹ Did you save an object that uses `species` lazily in a column in the
#>   `dplyr::summarise()` expression ?

long_pengs |> 
  rowwise(outcome) |>
  reframe(avg_comparisons(mods,
    variables = 'sex',
    subset(sex == 'female')) |>
      inferences(method = 'rsample', strata = species))
#> Error in `reframe()`:
#> ℹ In argument: `inferences(...)`.
#> ℹ In row 1.
#> Caused by error:
#> ! object 'species' not found

The desired output would look something like this

## desired output 

m1 = lm(body_mass_g ~ sex * bill_length_mm, data = penguins)

c1 = avg_comparisons(m1, variables = 'sex',
                     subset(sex == 'female'),
                    vcov = vcovBS(m1, cluster = ~species))

m2 = lm(flipper_length_mm ~ sex * bill_length_mm, data = penguins)

c2 = avg_comparisons(m2, variables = 'sex',
                     subset(sex == 'female'),
                     vcov = vcovBS(m2, cluster = ~species))

rbind(c1, c2)
#> 
#>  Estimate Std. Error     z Pr(>|z|)   S   2.5 % 97.5 %
#>   420.487     293.10 1.435    0.151 2.7 -153.97  994.9
#>     0.392       5.16 0.076    0.939 0.1   -9.73   10.5
#> 
#> Term: sex
#> Type:  response 
#> Comparison: mean(male) - mean(female)
#> Columns: term, contrast, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high, predicted_lo, predicted_hi, predicted

Created on 2024-11-13 with reprex v2.1.1


Solution

  • You could do it by baking the data into the model call.

    library(palmerpenguins)
    library(marginaleffects)
    library(sandwich)
    library(tidyr)
    library(dplyr)
    
    long_pengs = penguins |>
      pivot_longer(cols = c(body_mass_g, flipper_length_mm),
                   names_to = 'outcome',
                   values_to = 'vals') |>
      drop_na(sex) |>
      summarise(mods = list(do.call(lm, list(formula=vals ~ sex * bill_length_mm, data = pick(everything())))), .by = outcome)
    
    long_pengs |>
      rowwise(outcome) |>
      reframe(avg_comparisons(mods,
                              variables = 'sex',
                              subset(sex == 'female'),
                              vcov = vcovBS(mods, cluster = ~species)))
    #> # A tibble: 2 × 13
    #>   outcome   term  contrast estimate std.error statistic p.value s.value conf.low
    #>   <chr>     <chr> <chr>       <dbl>     <dbl>     <dbl>   <dbl>   <dbl>    <dbl>
    #> 1 body_mas… sex   mean(ma…  420.       288.      1.46     0.144  2.79    -144.  
    #> 2 flipper_… sex   mean(ma…    0.392      4.72    0.0831   0.934  0.0989    -8.86
    #> # ℹ 4 more variables: conf.high <dbl>, predicted_lo <dbl>, predicted_hi <dbl>,
    #> #   predicted <dbl>
    

    Created on 2024-11-13 with reprex v2.1.0