I'm trying to understand what is happening to my code. Suppose I have data from this Google Forms and I want to calculate the percentages of a question with multiple options (namely, STRATEGY). There are no Nas in STRATEGY, but when I use select(select(ID, SCHOOL, STRATEGY)
it gives me a bunch of Nas. Why is that happening?
Suppose this is a survey on 5 different types of schools (SCHOOL) and we want to know how each type of school deals with learning strategies (STRATEGY).
My code:
Part 1: extract values separed by commas:
### convert Q3:
colnames(df) <- c('ID', 'ORIGIN', 'SCHOOL', 'STRATEGY', 'PREFERENCE', 'STRENGH', 'WEAKNESS')
### try to slipt:
df1 <- df %>%
pivot_longer(c(STRATEGY, STRENGH, WEAKNESS)) %>%
separate_rows(value, sep = ",") %>%
group_by(ID) %>%
mutate(row = row_number()) %>%
pivot_wider() %>%
select(-row) %>%
mutate_at(vars(-ID), na_if,"") ### change them all into NA mutate_all(na_if,"") %>%
## SELECT:
test2 <- df1 %>% select(ID, SCHOOL, STRATEGY) %>%
# filter(!is.na(SCHOOL)) %>%
group_by(SCHOOL)
## see
test2[10:15,]
# A tibble: 6 x 3
# Groups: SCHOOL [2]
ID SCHOOL STRATEGY
<chr> <chr> <chr>
1 PERSON_2 School type C " watching Netflix"
2 PERSON_2 School type C NA
3 PERSON_2 School type C NA
4 PERSON_2 School type C NA
5 PERSON_3 School type D "social media (Instagram"
6 PERSON_3 School type D " Twitter"
> sum(is.na(df$STRATEGY))
[1] 0
>
> sum(is.na(df1$STRATEGY))
[1] 81
I've seen similar questions (ex1 ex2), they recommend that we filter NAs from the group_by
column, but I don't have any NAs in SCHOOL
.
Question: I have NAs on the STRENGH
and WEAKNESS
columns, but not in STRATEGY
. Why are they showing up, then?
Notes:
STRENGH
and WEAKNESS
because I may check them afterwards (in fact, my real dataset has much more questions than these, so I wouldn't like to have to subset every single multiple-option column before analysing)STRATEGY
in relation to School Type A
, School Type B
, etc ?## I don't think the code is right for #2 :
df1 %>%
count(STRATEGY, SCHOOL) %>%
group_by(SCHOOL) %>%
mutate(prop = round((prop.table(n) * 100), digits = 2),
sd = round(sd(prop.table(n)), digits = 2))
## I guess it (ALMOST) works if I convert STRATEGY separatedly:
# sep:
df1 <- df %>%
pivot_longer(c(STRATEGY)) %>%
separate_rows(value, sep = ",") %>%
# group_by(ID) %>%
mutate(row = row_number()) %>%
pivot_wider() %>%
select(-row) %>%
mutate_at(vars(-ID), na_if,"") %>%
select(ID, SCHOOL, STRATEGY)
# count
## prob: It feels like it's grouping by STRATEGY, not by SCHOOL
df1 %>%
count(STRATEGY, SCHOOL) %>%
group_by(SCHOOL) %>%
mutate(prop = round((prop.table(n) * 100), digits = 2),
sd = round(sd(prop.table(n)), digits = 2))
# output:
> head(out)
# A tibble: 6 x 5
# Groups: SCHOOL [4]
STRATEGY SCHOOL n prop sd
<chr> <chr> <int> <dbl> <dbl>
1 " Facebook)" School type A 3 12 0.04
2 " Facebook)" School type B 2 9.09 0.03
3 " Facebook)" School type C 2 9.09 0.03
4 " Facebook)" School type D 3 16.7 0.04
5 " listening to music" School type A 1 4 0.04
6 " listening to music" School type B 1 4.55 0.03
dput(df)
structure(list(ID = c("PERSON_1", "PERSON_2", "PERSON_3", "PERSON_4",
"PERSON_5", "PERSON_6", "PERSON_7", "PERSON_8", "PERSON_9", "PERSON_10",
"PERSON_10", "PERSON_12", "PERSON_13", "PERSON_14", "PERSON_15",
"PERSON_16", "PERSON_17", "PERSON_18", "PERSON_19", "PERSON_20",
"PERSON_21", "PERSON_22", "PERSON_23", "PERSON_24", "PERSON_25"
), ORIGIN = c("RJ", "SP", "AM", "BA", "GO", "MT", "RR", "RO",
"MS", "SC", "CE", "ES", "TO", "MG", "PR", "AC", "PA", "MA", "PI",
"DF", "SE", "PB", "RN", "RS", "AP"), SCHOOL = c("School type C",
"School type C", "School type D", "School type A", "School type B",
"School type A", "School type A", "School type A", "School type C",
"School type C", "School type B", "School type C", "School type D",
"School type B", "School type A", "School type A", "School type B",
"School type D", "School type D", "School type C", "School type A",
"School type C", "School type A", "School type B", "School type B"
), STRATEGY = c("listening to podcasts, social media (Instagram, Twitter, Facebook), Watching youtube videos",
"reading books, listening to music, watching Netflix", "social media (Instagram, Twitter, Facebook), Watching youtube videos",
"reading books, social media (Instagram, Twitter, Facebook), Watching youtube videos, watching Netflix",
"reading books, listening to music, reading blogs", "listening to music, reading blogs",
"reading books, listening to music, listening to podcasts", "listening to podcasts, social media (Instagram, Twitter, Facebook)",
"listening to podcasts", "listening to music, Watching youtube videos",
"listening to music, listening to podcasts, social media (Instagram, Twitter, Facebook), Watching youtube videos, reading blogs, watching Netflix",
"reading books, listening to music, listening to podcasts, reading blogs",
"listening to podcasts, social media (Instagram, Twitter, Facebook)",
"reading blogs, watching Netflix", "social media (Instagram, Twitter, Facebook)",
"reading books, Watching youtube videos, reading blogs", "Watching youtube videos, reading blogs, watching Netflix",
"reading books, listening to music, social media (Instagram, Twitter, Facebook), Watching youtube videos, watching Netflix",
"reading books, listening to podcasts, reading blogs", "listening to music, listening to podcasts",
"reading books, Watching youtube videos", "listening to podcasts, social media (Instagram, Twitter, Facebook), Watching youtube videos",
"listening to music, listening to podcasts", "reading books, social media (Instagram, Twitter, Facebook)",
"listening to music, listening to podcasts"), PREFERENCE = c("",
"", "twitter", "", "youtube", "", "", "", "podcasts", "", "",
"", "Podcasts", "Netflix", "TiK Tok", "blogs", "Netflix", "YOUTUBE",
"BOOKS", "MUSIC", "books", "INSTA", "PODCASTS", "Facebook", "MUSIC"
), STRENGH = c("", "reading books in English, watching tv shows in English",
"", "", "reading books in English, watching tv shows in English",
"listening to music in English, reading books in English, listening to podcasts in English, watching movies in English, watching tv shows in English",
"", "", "", "", "", "", "watching tv shows in English, watching plays in English",
"", "", "listening to music in English, reading books in English, listening to podcasts in English",
"reading books in English, watching tv shows in English", "",
"reading books in English, watching tv shows in English, watching plays in English",
"watching plays in English", "reading books in English, listening to podcasts in English, watching tv shows in English",
"", "listening to podcasts in English, watching tv shows in English",
"listening to music in English", "listening to podcasts in English, watching movies in English, watching tv shows in English, watching plays in English"
), WEAKNESS = c("", "", "listening to podcasts in English, watching tv shows in English",
"", "", "", "listening to podcasts in English, watching tv shows in English",
"", "reading books in English, listening to podcasts in English, watching movies in English, watching tv shows in English, watching plays in English",
"", "", "", "", "", "", "", "", "", "", "", "listening to podcasts in English, watching plays in English",
"reading books in English, listening to podcasts in English, watching movies in English, watching tv shows in English, watching plays in English",
"listening to music in English, reading books in English", "",
"listening to podcasts in English, watching tv shows in English"
)), class = "data.frame", row.names = c(NA, -25L))
Based on the description, we may need to split to longer format with separate_longer_delim
and then do the count
and proportion
after grouping by 'SCHOOL'
library(dplyr)
library(tidyr)
df %>%
select(SCHOOL, STRATEGY) %>%
separate_longer_delim(STRATEGY, delim = regex(",\\s*")) %>%
count(SCHOOL, STRATEGY) %>%
group_by(SCHOOL) %>%
mutate(Prop = proportions(n)) %>%
ungroup
-output
# A tibble: 36 × 4
SCHOOL STRATEGY n Prop
<chr> <chr> <int> <dbl>
1 School type A Facebook) 3 0.12
2 School type A Twitter 3 0.12
3 School type A Watching youtube videos 3 0.12
4 School type A listening to music 3 0.12
5 School type A listening to podcasts 3 0.12
6 School type A reading blogs 2 0.08
7 School type A reading books 4 0.16
8 School type A social media (Instagram 3 0.12
9 School type A watching Netflix 1 0.04
10 School type B Facebook) 2 0.0909
# … with 26 more rows