Hi, I have a data frame with a weight column like the example:
df <- tibble::tribble(
~id, ~edu, ~q_d1, ~q_d2_1, ~weight,
1L, 1L, 1L, 0L, 1740,
2L, 1L, 1L, 0L, 1428,
3L, 2L, 1L, 2L, 496,
4L, 2L, 1L, 2L, 550,
5L, 3L, 1L, 1L, 1762,
6L, 4L, 1L, 0L, 1004,
7L, 5L, 1L, 0L, 522,
8L, 3L, 2L, 0L, 1099,
9L, 4L, 2L, 2L, 1295
)
I use srvyr package to calculate summary statistics with group. My script:
sv_design_test <- df %>%
srvyr::as_survey_design(weights = weight)
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone" ,
q_d2_1 == 0 ~ "No smartphone" ,
TRUE ~ NA_character_)) %>%
group_by(smartphone) %>%
summarize(proportion = srvyr::survey_mean(),
total = srvyr::survey_total(),
total_unweighted = srvyr::unweighted(n())) %>%
select(-proportion_se, -total_se )
Output:
# A tibble: 3 x 4
smartphone proportion total total_unweighted
<chr> <dbl> <dbl> <int>
1 No Internet 0.242 2394 2
2 No smartphone 0.474 4694 4
3 smartphone 0.284 2808 3
but when I add education (edu) to group_by I got an error:
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone" ,
q_d2_1 == 0 ~ "No smartphone" ,
TRUE ~ NA_character_)) %>%
group_by(edu, smartphone) %>%
summarize(proportion = srvyr::survey_mean(),
total = srvyr::survey_total(),
total_unweighted = srvyr::unweighted(n())) %>%
select(-proportion_se, -total_se )
The error message is:
Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
contrasts can be applied only to factors with 2 or more levels
Your error message (the one about contrasts) says that you need to use factors as your grouping variables. In your original data frame, edu
is numeric, so you can convert it to a factor before you create the survey design.
library(tidyverse)
library(srvyr)
# ...
sv_design_test <- df %>%
mutate(edu = as.factor(edu)) %>%
srvyr::as_survey_design(weights = weight)
Then after you create smartphone
, convert that to a factor as well:
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone" ,
q_d2_1 == 0 ~ "No smartphone" ,
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone))
In your second error message (the one about lengths), it's because you have functions in your summarise
that return different numbers of rows. You can verify that by calling those functions separately (the error message says it's argument 3, meaning n = unweighted(n())
, where the issue is).
This returns 15 rows:
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone",
q_d2_1 == 0 ~ "No smartphone",
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone)) %>%
group_by(edu, smartphone) %>%
summarise(prop = survey_mean(),
total = survey_total())
#> # A tibble: 15 x 6
#> edu smartphone prop prop_se total total_se
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 1 No Internet 0 0 0 0
#> 2 1 No smartphone 1 0 3168 2108.
#> 3 1 smartphone 0 0 0 0
#> 4 2 No Internet 0 0 0 0
#> 5 2 No smartphone 0 0 0 0
#> 6 2 smartphone 1 0 1046 693.
#> 7 3 No Internet 0.384 0.355 1099 1099.
#> 8 3 No smartphone 0 0 0 0
#> 9 3 smartphone 0.616 0.355 1762 1762.
#> 10 4 No Internet 0.563 0.369 1295 1295.
#> 11 4 No smartphone 0.437 0.369 1004 1004
#> 12 4 smartphone 0 0 0 0
#> 13 5 No Internet 0 0 0 0
#> 14 5 No smartphone 1 0 522 522
#> 15 5 smartphone 0 0 0 0
While this returns only 7, because there are only 7 combinations of edu
and smartphone
that appear, and therefore only 7 that get counted.
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone",
q_d2_1 == 0 ~ "No smartphone",
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone)) %>%
group_by(edu, smartphone) %>%
summarise(n = unweighted(n()))
#> # A tibble: 7 x 3
#> edu smartphone n
#> <fct> <fct> <int>
#> 1 1 No smartphone 2
#> 2 2 smartphone 2
#> 3 3 No Internet 1
#> 4 3 smartphone 1
#> 5 4 No Internet 1
#> 6 4 No smartphone 1
#> 7 5 No smartphone 1
.drop = FALSE
within group_by()
You can force summarize()
to produce results even for combinations of factor levels that don't appear in the data by using the .drop
argument of the group_by()
function.
sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone",
q_d2_1 == 0 ~ "No smartphone",
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone)) %>%
group_by(edu, smartphone,
.drop = FALSE) %>%
summarize(prop= srvyr::survey_mean(),
total = srvyr::survey_total(),
total_unweighted = srvyr::unweighted(n()))
#> # A tibble: 15 x 7
#> edu smartphone prop prop_se total total_se total_unweighted
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 No Internet 0 0 0 0 0
#> 2 1 No smartphone 1 0 3168 2108. 2
#> 3 1 smartphone 0 0 0 0 0
#> 4 2 No Internet 0 0 0 0 0
#> 5 2 No smartphone 0 0 0 0 0
#> 6 2 smartphone 1 0 1046 693. 2
#> 7 3 No Internet 0.384 0.355 1099 1099. 1
#> 8 3 No smartphone 0 0 0 0 0
#> 9 3 smartphone 0.616 0.355 1762 1762. 1
#> 10 4 No Internet 0.563 0.369 1295 1295. 1
#> 11 4 No smartphone 0.437 0.369 1004 1004 1
#> 12 4 smartphone 0 0 0 0 0
#> 13 5 No Internet 0 0 0 0 0
#> 14 5 No smartphone 1 0 522 522 1
#> 15 5 smartphone 0 0 0 0 0
You could make 2 different summarized data frames and then join them.
I'm adding a call to complete
after n()
to fill in missing levels. Making two data frames and joining them gets the following:
props <- sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone",
q_d2_1 == 0 ~ "No smartphone",
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone)) %>%
group_by(edu, smartphone) %>%
summarise(prop = survey_mean(),
total = survey_total())
counts <- sv_design_test %>%
dplyr::mutate(smartphone = case_when(
q_d1 == 2 ~ "No Internet",
q_d2_1 > 0 ~ "smartphone",
q_d2_1 == 0 ~ "No smartphone",
TRUE ~ NA_character_)) %>%
mutate(smartphone = as.factor(smartphone)) %>%
group_by(edu, smartphone) %>%
summarise(n = unweighted(n())) %>%
complete(edu, smartphone, fill = list(n = 0))
left_join(props, counts, by = c("edu", "smartphone"))
#> # A tibble: 15 x 7
#> edu smartphone prop prop_se total total_se n
#> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 No Internet 0 0 0 0 0
#> 2 1 No smartphone 1 0 3168 2108. 2
#> 3 1 smartphone 0 0 0 0 0
#> 4 2 No Internet 0 0 0 0 0
#> 5 2 No smartphone 0 0 0 0 0
#> 6 2 smartphone 1 0 1046 693. 2
#> 7 3 No Internet 0.384 0.355 1099 1099. 1
#> 8 3 No smartphone 0 0 0 0 0
#> 9 3 smartphone 0.616 0.355 1762 1762. 1
#> 10 4 No Internet 0.563 0.369 1295 1295. 1
#> 11 4 No smartphone 0.437 0.369 1004 1004 1
#> 12 4 smartphone 0 0 0 0 0
#> 13 5 No Internet 0 0 0 0 0
#> 14 5 No smartphone 1 0 522 522 1
#> 15 5 smartphone 0 0 0 0 0