Search code examples
Tags: r, dataframe, datatable, statistics, calculated-columns

Create table with wilcox.test p values on multiple subgroup columns in R


I have the data frame df1 below. I would like to systematically compute wilcox.test p values to test whether variable is significantly higher/lower for one color than the other, separately for the color pairs defined in the color1 and color2 columns.

I would like to run this test once across all samples, and once for each level of the group column.

I am hoping to create a new data.frame with the results, including the number of samples per group (n). The anticipated result is shown in df2 below. Note, though, that the p values in df2 are made up as examples only.

 # Example input: 30 samples, each carrying a group label ("a", "b" or "c"),
 # an integer measurement (`variable`) and two two-level colour labels
 # (`color1`: black/white, `color2`: red/blue). Values are listed ten per row.
 df1 <- data.frame(
   sample   = 1:30,
   group    = rep(c("a", "b", "c"), times = c(16L, 6L, 8L)),
   variable = c(
     5L, 2L, 4L, 4L, 1L, 3L, 3L, 5L, 1L, 7L,
     13L, 9L, 4L, 4L, 3L, 12L, 0L, 11L, 1L, 3L,
     0L, 4L, 5L, 2L, 6L, 4L, 6L, 7L, 5L, 3L
   ),
   color1   = c(
     "black", "white", "white", "black", "black",
     "white", "white", "black", "black", "black",
     "black", "black", "white", "white", "black",
     "white", "black", "white", "black", "white",
     "black", "white", "white", "white", "black",
     "white", "black", "black", "white", "black"
   ),
   color2   = c(
     "red", "blue", "blue", "blue", "red",
     "blue", "blue", "red", "blue", "red",
     "red", "blue", "red", "red", "red",
     "blue", "blue", "red", "blue", "red",
     "red", "blue", "red", "red", "red",
     "blue", "red", "blue", "blue", "blue"
   ),
   stringsAsFactors = FALSE
 )

 # Target layout of the result: one row for all samples pooled ("all") followed
 # by one row per group, with the sample count and one p-value per colour
 # column. The p-values below are illustrative placeholders, not computed.
 df2 <- data.frame(
   group       = c("all", "a", "b", "c"),
   n           = c(30L, 16L, 6L, 8L),
   color1_pval = c(0.0485, 0.9641, 0.0832, 0.3882),
   color2_pval = c(0.6727, 0.4121, 0.1282, 0.4344),
   stringsAsFactors = FALSE
 )

Solution

  You can try:

    library(dplyr)

    # Summarise one (optionally grouped) data frame: the number of samples and
    # the two-sided Wilcoxon rank-sum p-value of `variable` between the two
    # levels of `color1` (white vs black) and of `color2` (blue vs red).
    wilcox_summary <- function(data) {
      summarise(
        data,
        n = n(),
        color1_pval = wilcox.test(variable[color1 == "white"],
                                  variable[color1 == "black"])$p.value,
        color2_pval = wilcox.test(variable[color2 == "blue"],
                                  variable[color2 == "red"])$p.value
      )
    }

    # The question asks for a row covering all samples as well as one row per
    # group. The pooled row is built separately (by relabelling every sample as
    # "all") and bound first, so it is not sorted in among the real group
    # labels.
    bind_rows(
      df1 %>% mutate(group = "all") %>% group_by(group) %>% wilcox_summary(),
      df1 %>% group_by(group) %>% wilcox_summary()
    )

    # `variable` contains repeated values in every subset, so wilcox.test()
    # warns that it cannot compute an exact p-value with ties and reports the
    # normal-approximation p-value instead.

    #  group     n color1_pval color2_pval
    #  <chr> <int>       <dbl>       <dbl>
    #1 all      30      0.967        0.358
    #2 a        16      0.556        0.457
    #3 b         6      0.0765       0.825
    #4 c         8      0.189        1