I have a data set with Likert scale answers. I want to create a new data frame that will calculate the percentages of each category of each level. Doing so i have made the following code:
likert_levels <- c(
"Very Dissatisfied",
"Dissatisfied",
"Average",
"Satisfied",
"Very Satisfied"
)
df <-
tibble(
"q1" = sample(likert_levels, 10, replace = TRUE),
"q2" = sample(likert_levels, 10, replace = TRUE, prob = 5:1),
"q3" = sample(likert_levels, 10, replace = TRUE, prob = 1:5),
"q4" = sample(likert_levels, 10, replace = TRUE, prob = 1:5),
"q5" = sample(c(likert_levels, NA), 10, replace = TRUE)
) %>%
mutate(across(everything(), ~ factor(.x, levels = likert_levels)))%>%
mutate(Country = c("USA","BRAZIL","BRAZIL","BRAZIL","USA","GERMANY","ITALY","GERMANY","BRAZIL","USA"))%>%
relocate(Country,.before=q1)
df$O_1 <- apply(df, 1, function(x) sum(x=="Very Dissatisfied", na.rm=TRUE)) #How many ONEstars in
df$O_2 <- apply(df, 1, function(x) sum(x=="Dissatisfied", na.rm=TRUE)) #each row
df$O_3 <- apply(df, 1, function(x) sum(x=="Average", na.rm=TRUE))
df$O_4 <- apply(df, 1, function(x) sum(x=="Satisfied", na.rm=TRUE))
df$O_5 <- apply(df, 1, function(x) sum(x=="Very Satisfied", na.rm=TRUE))
df$O_sum <- df$O_1 + df$O_2 + df$O_3 + df$O_4 + df$O_5
df <- df[,-c(2: (ncol(df)-6))]
Likert_df = as.data.frame (df %>% group_by(Country,O_sum) %>% summarise(
OO_1 = sum(O_1) / (n() * (O_sum[1])) * 100,
OO_2 = sum(O_2) / (n() * (O_sum[1])) * 100,
OO_3 = sum(O_3) / (n() * (O_sum[1])) * 100,
OO_4 = sum(O_4) / (n() * (O_sum[1])) * 100,
OO_5 = sum(O_5) / (n() * (O_sum[1])) * 100 ) )
Likert_df$ O_sum <- NULL
Likert_df <- as.data.frame(Likert_df %>% group_by(Country) %>% summarise(
OO_1 = mean(OO_1),
OO_2 = mean(OO_2),
OO_3 = mean(OO_3),
OO_4 = mean(OO_4),
OO_5 = mean(OO_5) ))
colnames(Likert_df) <- c("Item", "Strongly Disagree", "Disagree", "So So", "Agree", "Strongly Agree")
DF = Likert_df
resulting to :
DF
Item Strongly Disagree Disagree So So Agree Strongly Agree
1 BRAZIL 10 25 25.0 30.0 10
2 GERMANY 30 20 20.0 20.0 10
3 ITALY 20 40 20.0 0.0 20
4 USA 5 10 42.5 17.5 25
My question is: Is there another way more simpler and faster using dplyr functions with pipes to create the same result ?
You could apply `==`()
on the Likert levels on every slice you get using by()
and proportions
from that. No problems with NA's.
> by(df[-1], df$Country,\(x)
+ proportions(colSums(sapply(likert_levels, \(z) x == z), na.rm=TRUE))) |>
+ do.call(what='rbind')
Very Dissatisfied Dissatisfied Average Satisfied Very Satisfied
BRAZIL 0.2105263 0.2105263 0.00000000 0.2105263 0.3684211
GERMANY 0.1000000 0.4000000 0.10000000 0.4000000 0.0000000
ITALY 0.0000000 0.2000000 0.20000000 0.4000000 0.2000000
USA 0.1333333 0.3333333 0.06666667 0.2000000 0.2666667
If you rely on a Country column rather than row names, you could do
> unique(df[1]) |> cbind(
+ by(df, ~Country,\(x)
+ proportions(colSums(sapply(likert_levels, \(z) x[-1] == z), na.rm=TRUE))) |>
+ do.call(what='rbind')
+ )
Country Very Dissatisfied Dissatisfied Average Satisfied Very Satisfied
1 USA 0.2105263 0.2105263 0.00000000 0.2105263 0.3684211
2 BRAZIL 0.1000000 0.4000000 0.10000000 0.4000000 0.0000000
6 GERMANY 0.0000000 0.2000000 0.20000000 0.4000000 0.2000000
7 ITALY 0.1333333 0.3333333 0.06666667 0.2000000 0.2666667
Should be fast.
Data:
> dput(df)
structure(list(Country = c("USA", "BRAZIL", "BRAZIL", "BRAZIL",
"USA", "GERMANY", "ITALY", "GERMANY", "BRAZIL", "USA"), q1 = structure(c(1L,
5L, 1L, 1L, 2L, 4L, 2L, 2L, 1L, 4L), levels = c("Very Dissatisfied",
"Dissatisfied", "Average", "Satisfied", "Very Satisfied"), class = "factor"),
q2 = structure(c(2L, 5L, 5L, 1L, 2L, 2L, 4L, 1L, 5L, 5L), levels = c("Very Dissatisfied",
"Dissatisfied", "Average", "Satisfied", "Very Satisfied"), class = "factor"),
q3 = structure(c(5L, 4L, 4L, 2L, 4L, 2L, 3L, 2L, 4L, 3L), levels = c("Very Dissatisfied",
"Dissatisfied", "Average", "Satisfied", "Very Satisfied"), class = "factor"),
q4 = structure(c(5L, 2L, 5L, 5L, 2L, 3L, 4L, 4L, 5L, 1L), levels = c("Very Dissatisfied",
"Dissatisfied", "Average", "Satisfied", "Very Satisfied"), class = "factor"),
q5 = structure(c(4L, 4L, NA, 2L, 5L, 4L, 5L, 4L, 2L, 2L), levels = c("Very Dissatisfied",
"Dissatisfied", "Average", "Satisfied", "Very Satisfied"), class = "factor")), class = "data.frame", row.names = c(NA,
-10L))
> dput(likert_levels)
c("Very Dissatisfied", "Dissatisfied", "Average", "Satisfied",
"Very Satisfied")