Search code examples
rstatisticspairwise.wilcox.test

How to run paired wilcoxon test for multiple variables


I'm a beginner on R Studio and writing my thesis, I would like to run a wilcoxon test for 5 variables dispatched in 2 groups, a full sample and a control sample, and to avoid creating 5 different dataframes I want to run it in one command.

The dataframe is a follow:

structure(list(Sample = c("Full", "Full", "Full", "Full", "Full", 
"Full", "Full", "Full", "Full", "Full", "Full", "Full", "Full", 
"Full", "Full", "Full", "Full", "Full", "Full", "Full", "Control", 
"Control", "Control", "Control", "Control", "Control", "Control", 
"Control", "Control", "Control", "Control", "Control", "Control", 
"Control", "Control", "Control", "Control", "Control", "Control", 
"Control"), EBITDA = c(7027, 9521, 926, 7085, 739, 3055, 1084, 
7051, 90.2, 1863, 3567, 1514, 15212, 921.9, 15222, 336, 128.52, 
21.93, 118.09, 1019.4, 4732.8, 17776, 940.45, 5265, 659.3, 4590.1, 
1014.9, 3198, 72.51, 2083, 4277, 433.29, 9653, 628, 7706, 108, 
118.07, 98.19, 52.39, 1138), SALES = c(30638, 25636, 6100, 25392, 
10321, 24114, 10573, 44152, 700.68, 11588, 40682, 39434, 71203, 
1233.9, 45503, 3553, 1198.01, 120.1, 349.62, 4460.4, 22532, 42384, 
5295.86, 43800, 10704.4, 30427.8, 9051.2, 35555, 857.08, 14125, 
60613, 3302.25, 66639, 1583, 28878, 3967, 1216.73, 819.28, 327.41, 
5065), Net.Income = c(5648, 2363, 29, 2603, 9, 748, 580, 2983, 
26.84, 324, 2669, 99, 2269, 214.1, 3810, -675, 78.17, 11.56, 
30.25, 530.5, 2767.5, 5768, 401.55, 1378, 143, 1461, 265.2, 1311, 
18.46, 856, 885, 281.22, -561, 313, 2198, -446, 55.6, 34.5, 29.15, 
593), Total.Assets = c(53362, 1181372, 8000, 56666, 16175, 30690, 
11459, 75357, 1071.75, 18649, 68198, 72137, 281640, 7279, 94276, 
6482, 776.36, 121.9, 1198.02, 6776.6, 32063, 1965283, 9194.81, 
43395, 7871.5, 49544.5, 9038.8, 37694, 643.75, 18135, 68975, 
8427.42, 158499, 10323, 58993, 3539, 1031.06, 717.73, 568.18, 
7577), ROA = c(0.105843109, 0.002000217, 0.003625, 0.045935835, 
0.000556414, 0.02437276, 0.050615237, 0.039584909, 0.025043107, 
0.017373586, 0.039136045, 0.001372389, 0.008056384, 0.029413381, 
0.040413255, -0.104134526, 0.10068486, 0.094845124, 0.025249098, 
0.078284095, 0.086314443, 0.002934946, 0.043671054, 0.03175481, 
0.018166804, 0.029488642, 0.029340178, 0.034780071, 0.028681942, 
0.047201544, 0.012830736, 0.03337013, -0.003539455, 0.030320643, 
0.037258658, -0.126024301, 0.053925296, 0.048068282, 0.051307594, 
0.078263165)), class = "data.frame", row.names = c(NA, -40L))

I tried the solution proposed here with the wilcox_test function by rstatix but it only gives one p-value as the output and I would like the test score and the p-values for each of the variables.

Any help will be highly appreciated,

Thanks in advance !


Solution

  • Here is an example from my similar question (Apply statistical test to many variables: improve speed) tweaked to suit your use-case:

    df <- structure(list(Sample = c("ISO 14001", "ISO 14001", "ISO 14001", 
                                    "ISO 14001", "ISO 14001", "ISO 14001", "ISO 14001", "ISO 14001", 
                                    "ISO 14001", "ISO 14001", "ISO 14001", "ISO 14001", "ISO 14001", 
                                    "ISO 14001", "ISO 14001", "ISO 14001", "ISO 14001", "ISO 14001", 
                                    "ISO 14001", "ISO 14001", "Control Sample", "Control Sample", 
                                    "Control Sample", "Control Sample", "Control Sample", "Control Sample", 
                                    "Control Sample", "Control Sample", "Control Sample", "Control Sample", 
                                    "Control Sample", "Control Sample", "Control Sample", "Control Sample", 
                                    "Control Sample", "Control Sample", "Control Sample", "Control Sample", 
                                    "Control Sample", "Control Sample"), 
                         EBITDA = c(7027, 9521, 926, 
                                    7085, 739, 3055, 1084, 7051, 90.2, 1863, 3567, 1514, 15212, 921.9, 
                                    15222, 336, 128.52, 21.93, 118.09, 1019.4, 4732.8, 17776, 940.45, 
                                    5265, 659.3, 4590.1, 1014.9, 3198, 72.51, 2083, 4277, 433.29, 
                                    9653, 628, 7706, 108, 118.07, 98.19, 52.39, 1138), 
                         SALES = c(30638, 
                                   25636, 6100, 25392, 10321, 24114, 10573, 44152, 700.68, 11588, 
                                   40682, 39434, 71203, 1233.9, 45503, 3553, 1198.01, 120.1, 349.62, 
                                   4460.4, 22532, 42384, 5295.86, 43800, 10704.4, 30427.8, 9051.2, 
                                   35555, 857.08, 14125, 60613, 3302.25, 66639, 1583, 28878, 3967, 
                                   1216.73, 819.28, 327.41, 5065), 
                         Net.Income = c(5648, 2363, 29, 
                                        2603, 9, 748, 580, 2983, 26.84, 324, 2669, 99, 2269, 214.1, 3810, 
                                        -675, 78.17, 11.56, 30.25, 530.5, 2767.5, 5768, 401.55, 1378, 
                                        143, 1461, 265.2, 1311, 18.46, 856, 885, 281.22, -561, 313, 2198, 
                                        -446, 55.6, 34.5, 29.15, 593), 
                         Total.Assets = c(53362, 1181372, 
                                          8000, 56666, 16175, 30690, 11459, 75357, 1071.75, 18649, 68198, 
                                          72137, 281640, 7279, 94276, 6482, 776.36, 121.9, 1198.02, 6776.6, 
                                          32063, 1965283, 9194.81, 43395, 7871.5, 49544.5, 9038.8, 37694, 
                                          643.75, 18135, 68975, 8427.42, 158499, 10323, 58993, 3539, 1031.06, 
                                          717.73, 568.18, 7577), 
                         ROA = c(0.105843109, 0.002000217, 0.003625, 
                                 0.045935835, 0.000556414, 0.02437276, 0.050615237, 0.039584909, 
                                 0.025043107, 0.017373586, 0.039136045, 0.001372389, 0.008056384, 
                                 0.029413381, 0.040413255, -0.104134526, 0.10068486, 0.094845124, 
                                 0.025249098, 0.078284095, 0.086314443, 0.002934946, 0.043671054, 
                                 0.03175481, 0.018166804, 0.029488642, 0.029340178, 0.034780071, 
                                 0.028681942, 0.047201544, 0.012830736, 0.03337013, -0.003539455, 
                                 0.030320643, 0.037258658, -0.126024301, 0.053925296, 0.048068282, 
                                 0.051307594, 0.078263165)), 
                    class = "data.frame", row.names = c(NA,  -40L)
    )
    
    Map_func_pvalues <- function(dataset) {
      tmp <- split(dataset, dataset$Sample)
      stack(Map(function(x, y) wilcox.test(x, y, exact = FALSE)$p.value, tmp[[1]][-1], tmp[[2]][-1]))
    }
    p_values <- Map_func_pvalues(df)
    
    Map_func_statistic <- function(dataset) {
      tmp <- split(dataset, dataset$Sample)
      stack(Map(function(x, y) wilcox.test(x, y, exact = FALSE)$statistic, tmp[[1]][-1], tmp[[2]][-1]))
    }
    test_scores <- Map_func_statistic(df)
    
    merge(p_values, test_scores, by = "ind", suffix = c("_pvalue", "_wilcox_statistic"))
    #>            ind values_pvalue values_wilcox_statistic
    #> 1       EBITDA     0.6948910                     185
    #> 2   Net.Income     0.8817308                     194
    #> 3          ROA     0.7352682                     213
    #> 4        SALES     0.9245727                     196
    #> 5 Total.Assets     0.5978626                     180
    

    Created on 2022-11-15 by the reprex package (v2.0.1)

    Does that solve your problem?