Search code examples
rdataframedplyr

Prop table in R with grouping factor and likert scale answers columns


I have a data set with Likert scale answers. I want to create a new data frame that will calculate the percentages of each category of each level. Doing so i have made the following code:


likert_levels <- c(
  "Very Dissatisfied",
  "Dissatisfied",
  "Average",
  "Satisfied",
  "Very Satisfied"
)

df <-
  tibble(
    "q1" = sample(likert_levels, 10, replace = TRUE),
    "q2" = sample(likert_levels, 10, replace = TRUE, prob = 5:1),
    "q3" = sample(likert_levels, 10, replace = TRUE, prob = 1:5),
    "q4" = sample(likert_levels, 10, replace = TRUE, prob = 1:5),
    "q5" = sample(c(likert_levels, NA), 10, replace = TRUE)
  ) %>%
  mutate(across(everything(), ~ factor(.x, levels = likert_levels)))%>%
  mutate(Country = c("USA","BRAZIL","BRAZIL","BRAZIL","USA","GERMANY","ITALY","GERMANY","BRAZIL","USA"))%>%
  relocate(Country,.before=q1)


df$O_1 <- apply(df, 1, function(x) sum(x=="Very Dissatisfied", na.rm=TRUE)) #How many ONEstars in                                    
df$O_2 <- apply(df, 1, function(x) sum(x=="Dissatisfied", na.rm=TRUE)) #each row
df$O_3 <- apply(df, 1, function(x) sum(x=="Average", na.rm=TRUE))
df$O_4 <- apply(df, 1, function(x) sum(x=="Satisfied", na.rm=TRUE))
df$O_5 <- apply(df, 1, function(x) sum(x=="Very Satisfied", na.rm=TRUE))

df$O_sum <-  df$O_1 + df$O_2 + df$O_3 + df$O_4 + df$O_5
df <- df[,-c(2: (ncol(df)-6))]



Likert_df =  as.data.frame (df %>% group_by(Country,O_sum) %>% summarise( 
  OO_1 = sum(O_1) / (n() * (O_sum[1])) * 100,
  OO_2 = sum(O_2) / (n() * (O_sum[1])) * 100,
  OO_3 = sum(O_3) / (n() * (O_sum[1])) * 100,
  OO_4 = sum(O_4) / (n() * (O_sum[1])) * 100,
  OO_5 = sum(O_5) / (n() * (O_sum[1])) * 100 ) ) 

Likert_df$ O_sum <- NULL

Likert_df <- as.data.frame(Likert_df %>% group_by(Country) %>% summarise(
  
  OO_1 = mean(OO_1),
  OO_2 = mean(OO_2),
  OO_3 = mean(OO_3),
  OO_4 = mean(OO_4),
  OO_5 = mean(OO_5) ))


colnames(Likert_df) <- c("Item",  "Strongly Disagree",  "Disagree",  "So So",  "Agree",  "Strongly Agree")
DF = Likert_df

resulting to :

DF
     Item Strongly Disagree Disagree So So Agree Strongly Agree
1  BRAZIL                10       25  25.0  30.0             10
2 GERMANY                30       20  20.0  20.0             10
3   ITALY                20       40  20.0   0.0             20
4     USA                 5       10  42.5  17.5             25

My question is: Is there another way more simpler and faster using dplyr functions with pipes to create the same result ?


Solution

  • You could apply `==`() on the Likert levels on every slice you get using by() and proportions from that. No problems with NA's.

    > by(df[-1], df$Country,\(x) 
    +    proportions(colSums(sapply(likert_levels, \(z) x == z), na.rm=TRUE))) |> 
    +   do.call(what='rbind')
            Very Dissatisfied Dissatisfied    Average Satisfied Very Satisfied
    BRAZIL          0.2105263    0.2105263 0.00000000 0.2105263      0.3684211
    GERMANY         0.1000000    0.4000000 0.10000000 0.4000000      0.0000000
    ITALY           0.0000000    0.2000000 0.20000000 0.4000000      0.2000000
    USA             0.1333333    0.3333333 0.06666667 0.2000000      0.2666667
    

    If you rely on a Country column rather than row names, you could do

    > unique(df[1]) |> cbind(
    +   by(df, ~Country,\(x) 
    +      proportions(colSums(sapply(likert_levels, \(z) x[-1] == z), na.rm=TRUE))) |> 
    +     do.call(what='rbind')
    + )
      Country Very Dissatisfied Dissatisfied    Average Satisfied Very Satisfied
    1     USA         0.2105263    0.2105263 0.00000000 0.2105263      0.3684211
    2  BRAZIL         0.1000000    0.4000000 0.10000000 0.4000000      0.0000000
    6 GERMANY         0.0000000    0.2000000 0.20000000 0.4000000      0.2000000
    7   ITALY         0.1333333    0.3333333 0.06666667 0.2000000      0.2666667
    

    Should be fast.


    Data:

    > dput(df)
    structure(list(Country = c("USA", "BRAZIL", "BRAZIL", "BRAZIL", 
    "USA", "GERMANY", "ITALY", "GERMANY", "BRAZIL", "USA"), q1 = structure(c(1L, 
    5L, 1L, 1L, 2L, 4L, 2L, 2L, 1L, 4L), levels = c("Very Dissatisfied", 
    "Dissatisfied", "Average", "Satisfied", "Very Satisfied"), class = "factor"), 
        q2 = structure(c(2L, 5L, 5L, 1L, 2L, 2L, 4L, 1L, 5L, 5L), levels = c("Very Dissatisfied", 
        "Dissatisfied", "Average", "Satisfied", "Very Satisfied"), class = "factor"), 
        q3 = structure(c(5L, 4L, 4L, 2L, 4L, 2L, 3L, 2L, 4L, 3L), levels = c("Very Dissatisfied", 
        "Dissatisfied", "Average", "Satisfied", "Very Satisfied"), class = "factor"), 
        q4 = structure(c(5L, 2L, 5L, 5L, 2L, 3L, 4L, 4L, 5L, 1L), levels = c("Very Dissatisfied", 
        "Dissatisfied", "Average", "Satisfied", "Very Satisfied"), class = "factor"), 
        q5 = structure(c(4L, 4L, NA, 2L, 5L, 4L, 5L, 4L, 2L, 2L), levels = c("Very Dissatisfied", 
        "Dissatisfied", "Average", "Satisfied", "Very Satisfied"), class = "factor")), class = "data.frame", row.names = c(NA, 
    -10L))
    
    > dput(likert_levels)
    c("Very Dissatisfied", "Dissatisfied", "Average", "Satisfied", 
    "Very Satisfied")