Search code examples
rdplyrtibblerstatix

Statistical test on multiple variables on multiple lists (tibble)


To simplify my data analysis I need to process different statistical tests (in that example a shapiro test) on different different variables and on different groups of data. The purpose is to not write 150 times the same code. To do that I created, from my dataset, a tibble where each list correspond to a group of data.

The dataset:

dput(head(PdataLaKo,25)) 
structure(list(ctd_file = c("2018-08-02T162038 SBE0251090", "2018-08-02T162038 SBE0251090", 
"2018-08-02T162038 SBE0251090", "2018-08-02T162038 SBE0251090", 
"2018-08-02T162038 SBE0251090", "2018-08-02T162038 SBE0251090", 
"2018-08-02T162038 SBE0251090", "2018-08-02T162038 SBE0251090", 
"2018-08-02T162038 SBE0251090", "2018-08-02T162038 SBE0251090", 
"2018-08-02T175518 SBE0251090", "2018-08-02T175518 SBE0251090", 
"2018-08-02T175518 SBE0251090", "2018-08-02T175518 SBE0251090", 
"2018-08-02T175518 SBE0251090", "2018-08-02T175518 SBE0251090", 
"2018-08-02T175518 SBE0251090", "2018-08-02T175518 SBE0251090", 
"2018-08-02T175518 SBE0251090", "2018-08-02T175518 SBE0251090", 
"2018-08-03T090018 SBE0251090", "2018-08-03T090018 SBE0251090", 
"2018-08-03T090018 SBE0251090", "2018-08-03T090018 SBE0251090", 
"2018-08-03T090018 SBE0251090"), station = c("1", "1", "1", "1", 
"1", "1", "1", "1", "1", "1", "2", "2", "2", "2", "2", "2", "2", 
"2", "2", "2", "3", "3", "3", "3", "3"), month = c(8L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L, 8L), day = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 
3L, 3L), year = c(2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L
), time = c(16.38, 16.38, 16.38, 16.38, 16.38, 16.38, 16.38, 
16.38, 16.38, 16.38, 17.57, 17.57, 17.57, 17.57, 17.57, 17.57, 
17.57, 17.57, 17.57, 17.57, 9.05, 9.05, 9.05, 9.05, 9.05), LAT = c("69.27166667", 
"69.27166667", "69.27166667", "69.27166667", "69.27166667", "69.27166667", 
"69.27166667", "69.27166667", "69.27166667", "69.27166667", "69.25386667", 
"69.25386667", "69.25386667", "69.25386667", "69.25386667", "69.25386667", 
"69.25386667", "69.25386667", "69.25386667", "69.25386667", "69.23883333", 
"69.23883333", "69.23883333", "69.23883333", "69.23883333"), 
    LONG = c("-25.15166667", "-25.15166667", "-25.15166667", 
    "-25.15166667", "-25.15166667", "-25.15166667", "-25.15166667", 
    "-25.15166667", "-25.15166667", "-25.15166667", "-25.0637", 
    "-25.0637", "-25.0637", "-25.0637", "-25.0637", "-25.0637", 
    "-25.0637", "-25.0637", "-25.0637", "-25.0637", "-24.9915", 
    "-24.9915", "-24.9915", "-24.9915", "-24.9915"), bottom_depth = c("226", 
    "226", "226", "226", "226", "226", "226", "226", "226", "226", 
    "229", "229", "229", "229", "229", "229", "229", "229", "229", 
    "229", "255", "255", "255", "255", "255"), timeS = c(223.5, 
    227.236, 228.558, 228.916, 229.336, 229.749, 230.119, 230.49, 
    228.871, 231.587, 242.497, 241.334, 241.591, 241.869, 242.205, 
    242.601, 249.68, 249.758, 250.013, 250.3, 266.6, 267.148, 
    267.57, 267.899, 268.245), pressure = c(0.607, 0.707, 0.808, 
    0.909, 1.01, 1.112, 1.212, 1.313, 1.516, 1.616, 0.503, 0.101, 
    0.202, 0.303, 0.404, 0.505, 0.606, 0.707, 0.808, 0.909, 0.303, 
    0.404, 0.505, 0.606, 0.707), depth = c(0.6, 0.7, 0.8, 0.9, 
    1, 1.1, 1.2, 1.3, 1.5, 1.6, 0.5, 0.1, 0.2, 0.3, 0.4, 0.5, 
    0.6, 0.7, 0.8, 0.9, 0.3, 0.4, 0.5, 0.6, 0.7), salinity = c(30.195, 
    30.1989, 30.199, 30.1737, 30.1894, 30.1824, 30.1546, 30.2275, 
    29.9942, 30.2885, 28.2285, 28.1332, 28.2182, 28.3279, 28.3815, 
    28.4242, 29.0786, 29.1525, 29.2647, 29.3672, 32.484, 32.3028, 
    31.885, 31.3775, 30.978), temperature = c(2.6459, 2.6477, 
    2.6263, 2.5913, 2.6188, 2.608, 2.6106, 2.6746, 2.6973, 2.6837, 
    1.9945, 2.0299, 1.9315, 1.7799, 1.734, 1.7082, 1.9263, 1.928, 
    1.9184, 1.8562, 1.1441, 1.073, 1.052, 1.0518, 1.0603), oxygen = c(352.891, 
    352.339, 352.12, 352.03, 351.884, 351.702, 351.505, 351.296, 
    347.288, 351.001, 345.827, 345.879, 345.866, 345.841, 345.782, 
    345.65, 346.849, 346.78, 346.718, 346.609, 340.888, 340.026, 
    339.348, 338.819, 338.245), oxygen2 = c(102.182, 102.228, 
    102.257, 102.281, 102.313, 102.347, 102.383, 102.427, 101.491, 
    102.644, 97.258, 97.311, 97.301, 97.29, 97.275, 97.253, 98.285, 
    98.293, 98.325, 98.357, 97.857, 97.839, 97.829, 97.812, 97.788
    ), fluorescence = c(0.17892, 0.17232, 0.17139, 0.17278, 0.17479, 
    0.17701, 0.1792, 0.18158, 0.18525, 0.18909, 0.10588, 0.10647, 
    0.10697, 0.10818, 0.10978, 0.11157, 0.1479, 0.14969, 0.15256, 
    0.15607, 0.30833, 0.31088, 0.31321, 0.31532, 0.31758), turbidity = c(0.164, 
    0.164, 0.164, 0.164, 0.163, 0.163, 0.163, 0.163, 0.161, 0.163, 
    0.165, 0.165, 0.165, 0.165, 0.165, 0.164, 0.159, 0.16, 0.16, 
    0.16, 4.742, 4.747, 4.751, 4.753, 4.755), conductivity = c(2.742475, 
    2.743057, 2.74087, 2.73591, 2.740138, 2.738197, 2.736985, 
    2.747572, 2.72913, 2.752912, 2.524555, 2.525942, 2.524426, 
    2.523692, 2.525012, 2.526692, 2.596153, 2.602249, 2.609861, 
    2.613963, 2.806078, 2.786396, 2.752134, 2.712379, 2.682113
    ), pH = c(8.094, 8.098, 8.099, 8.099, 8.099, 8.099, 8.099, 
    8.099, 8.014, 8.099, 8.16, 8.16, 8.16, 8.16, 8.159, 8.159, 
    8.149, 8.148, 8.147, 8.146, 8.121, 8.12, 8.12, 8.119, 8.118
    ), par = c(2627.7, 2627.7, 2627.7, 2627.7, 2627.7, 2627.7, 
    2627.7, 2627.7, 2600, 2627.7, 2627.8, 2627.8, 2627.8, 2627.8, 
    2627.8, 2627.8, 2627.8, 2627.8, 2627.8, 2627.8, 2627.6, NA, 
    2627.6, 2627.6, 2627.6), soundSpeed = c(1454.65, 1454.65, 
    1454.57, 1454.38, 1454.53, 1454.47, 1454.45, 1454.82, 1439.8, 
    1454.96, 1449.08, 1449.22, 1448.88, 1448.36, 1448.23, 1448.18, 
    1449.98, 1450.1, 1450.19, 1450.08, 1450.99, 1450.48, 1449.83, 
    1449.19, 1448.69), svDM = c(1454.55, 1454.55, 1454.48, 1454.29, 
    1454.43, 1454.37, 1454.35, 1454.72, 1439.71, 1454.86, 1448.98, 
    1449.12, 1448.78, 1448.26, 1448.14, 1448.08, 1449.89, 1450.01, 
    1450.1, 1449.98, 1450.91, 1450.4, 1449.76, 1449.12, 1448.61
    ), svWM = c(1454.57, 1454.57, 1454.49, 1454.3, 1454.45, 1454.39, 
    1454.37, 1454.74, 1439.73, 1454.89, 1448.89, 1449.03, 1448.69, 
    1448.18, 1448.05, 1448, 1449.84, 1449.96, 1450.05, 1449.94, 
      1451.01, 1450.49, 1449.83, 1449.16, 1448.62)), row.names = c(NA, 
    25L), class = "data.frame")

#to correspond to the number of observations, the rows are duplicated

    PdataLaKo <- as.data.frame(lapply(PdataLaKo, rep, 250))

To do that I used, the function group_split from the package dplyr, which created a tibble of different lists corresponding to each group (ctd_file). after I tried to apply shapiro.test from the package rstatix to each selected variable of a group with sapply and to each groups with lapply.

PdataLaKo <- PdataLaKo %>%
  group_split(ctd_file)
lnorm <- PdataLaKo[, c(11:24)] %>% 
  map(~ lapply(., sapply(., shapiro.test)))

If I extract one group with filter and do not create a tibble, the sapply function applied to each variable works. Consequently, the problem come from tibble and not the application to selected column.

#non-grouped data
PdataLaKot <- PdataLaKo %>% filter(ctd_file == "2018-08-05T092503 SBE0251090")    
lnorm <- PdataLaKot[, c(11:24)] %>% sapply(shapiro.test)

Solution

  • library(tidyverse)
    
    shapiro_test_safely <- possibly(.f = shapiro.test, otherwise = NA)
    
    a <- df %>%
      group_nest(ctd_file) %>% 
      mutate(
        data = map(data, ~select(.x, where(is.numeric))[, -c(1:4)]),
        res = map(data, ~map(.x, shapiro_test_safely)),
        res = purrr::set_names(res, nm = ctd_file))
    
    fltr <- map(a$res, ~!is.na(.x))
    
    b <- map2(.x = a$res, .y = fltr,  ~.x[.y])
    
    map_df(b, ~map_df(.x, broom::glance)) %>% 
      mutate(nm = map(b, names) %>% unlist()) %>% 
      mutate(SET = rep(names(b), sapply(b, length))) 
    
    #> # A tibble: 43 x 5
    #>    statistic p.value method                      nm           SET               
    #>        <dbl>   <dbl> <chr>                       <chr>        <chr>             
    #>  1     0.862 0.0807  Shapiro-Wilk normality test timeS        2018-08-02T162038~
    #>  2     0.969 0.879   Shapiro-Wilk normality test pressure     2018-08-02T162038~
    #>  3     0.969 0.882   Shapiro-Wilk normality test depth        2018-08-02T162038~
    #>  4     0.800 0.0144  Shapiro-Wilk normality test salinity     2018-08-02T162038~
    #>  5     0.946 0.625   Shapiro-Wilk normality test temperature  2018-08-02T162038~
    #>  6     0.711 0.00119 Shapiro-Wilk normality test oxygen       2018-08-02T162038~
    #>  7     0.776 0.00738 Shapiro-Wilk normality test oxygen2      2018-08-02T162038~
    #>  8     0.939 0.543   Shapiro-Wilk normality test fluorescence 2018-08-02T162038~
    #>  9     0.750 0.00356 Shapiro-Wilk normality test turbidity    2018-08-02T162038~
    #> 10     0.978 0.954   Shapiro-Wilk normality test conductivity 2018-08-02T162038~
    #> # ... with 33 more rows
    

    Created on 2021-09-16 by the reprex package (v2.0.1)

    data

    df <-
      structure(
        list(
          ctd_file = c(
            "2018-08-02T162038 SBE0251090",
            "2018-08-02T162038 SBE0251090",
            "2018-08-02T162038 SBE0251090",
            "2018-08-02T162038 SBE0251090",
            "2018-08-02T162038 SBE0251090",
            "2018-08-02T162038 SBE0251090",
            "2018-08-02T162038 SBE0251090",
            "2018-08-02T162038 SBE0251090",
            "2018-08-02T162038 SBE0251090",
            "2018-08-02T162038 SBE0251090",
            "2018-08-02T175518 SBE0251090",
            "2018-08-02T175518 SBE0251090",
            "2018-08-02T175518 SBE0251090",
            "2018-08-02T175518 SBE0251090",
            "2018-08-02T175518 SBE0251090",
            "2018-08-02T175518 SBE0251090",
            "2018-08-02T175518 SBE0251090",
            "2018-08-02T175518 SBE0251090",
            "2018-08-02T175518 SBE0251090",
            "2018-08-02T175518 SBE0251090",
            "2018-08-03T090018 SBE0251090",
            "2018-08-03T090018 SBE0251090",
            "2018-08-03T090018 SBE0251090",
            "2018-08-03T090018 SBE0251090",
            "2018-08-03T090018 SBE0251090"
          ),
          station = c(
            "1",
            "1",
            "1",
            "1",
            "1",
            "1",
            "1",
            "1",
            "1",
            "1",
            "2",
            "2",
            "2",
            "2",
            "2",
            "2",
            "2",
            "2",
            "2",
            "2",
            "3",
            "3",
            "3",
            "3",
            "3"
          ),
          month = c(
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L,
            8L
          ),
          day = c(
            2L,
            2L,
            2L,
            2L,
            2L,
            2L,
            2L,
            2L,
            2L,
            2L,
            2L,
            2L,
            2L,
            2L,
            2L,
            2L,
            2L,
            2L,
            2L,
            2L,
            3L,
            3L,
            3L,
            3L,
            3L
          ),
          year = c(
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L,
            2018L
          ),
          time = c(
            16.38,
            16.38,
            16.38,
            16.38,
            16.38,
            16.38,
            16.38,
            16.38,
            16.38,
            16.38,
            17.57,
            17.57,
            17.57,
            17.57,
            17.57,
            17.57,
            17.57,
            17.57,
            17.57,
            17.57,
            9.05,
            9.05,
            9.05,
            9.05,
            9.05
          ),
          LAT = c(
            "69.27166667",
            "69.27166667",
            "69.27166667",
            "69.27166667",
            "69.27166667",
            "69.27166667",
            "69.27166667",
            "69.27166667",
            "69.27166667",
            "69.27166667",
            "69.25386667",
            "69.25386667",
            "69.25386667",
            "69.25386667",
            "69.25386667",
            "69.25386667",
            "69.25386667",
            "69.25386667",
            "69.25386667",
            "69.25386667",
            "69.23883333",
            "69.23883333",
            "69.23883333",
            "69.23883333",
            "69.23883333"
          ),
          LONG = c(
            "-25.15166667",
            "-25.15166667",
            "-25.15166667",
            "-25.15166667",
            "-25.15166667",
            "-25.15166667",
            "-25.15166667",
            "-25.15166667",
            "-25.15166667",
            "-25.15166667",
            "-25.0637",
            "-25.0637",
            "-25.0637",
            "-25.0637",
            "-25.0637",
            "-25.0637",
            "-25.0637",
            "-25.0637",
            "-25.0637",
            "-25.0637",
            "-24.9915",
            "-24.9915",
            "-24.9915",
            "-24.9915",
            "-24.9915"
          ),
          bottom_depth = c(
            "226",
            "226",
            "226",
            "226",
            "226",
            "226",
            "226",
            "226",
            "226",
            "226",
            "229",
            "229",
            "229",
            "229",
            "229",
            "229",
            "229",
            "229",
            "229",
            "229",
            "255",
            "255",
            "255",
            "255",
            "255"
          ),
          timeS = c(
            223.5,
            227.236,
            228.558,
            228.916,
            229.336,
            229.749,
            230.119,
            230.49,
            228.871,
            231.587,
            242.497,
            241.334,
            241.591,
            241.869,
            242.205,
            242.601,
            249.68,
            249.758,
            250.013,
            250.3,
            266.6,
            267.148,
            267.57,
            267.899,
            268.245
          ),
          pressure = c(
            0.607,
            0.707,
            0.808,
            0.909,
            1.01,
            1.112,
            1.212,
            1.313,
            1.516,
            1.616,
            0.503,
            0.101,
            0.202,
            0.303,
            0.404,
            0.505,
            0.606,
            0.707,
            0.808,
            0.909,
            0.303,
            0.404,
            0.505,
            0.606,
            0.707
          ),
          depth = c(
            0.6,
            0.7,
            0.8,
            0.9,
            1,
            1.1,
            1.2,
            1.3,
            1.5,
            1.6,
            0.5,
            0.1,
            0.2,
            0.3,
            0.4,
            0.5,
            0.6,
            0.7,
            0.8,
            0.9,
            0.3,
            0.4,
            0.5,
            0.6,
            0.7
          ),
          salinity = c(
            30.195,
            30.1989,
            30.199,
            30.1737,
            30.1894,
            30.1824,
            30.1546,
            30.2275,
            29.9942,
            30.2885,
            28.2285,
            28.1332,
            28.2182,
            28.3279,
            28.3815,
            28.4242,
            29.0786,
            29.1525,
            29.2647,
            29.3672,
            32.484,
            32.3028,
            31.885,
            31.3775,
            30.978
          ),
          temperature = c(
            2.6459,
            2.6477,
            2.6263,
            2.5913,
            2.6188,
            2.608,
            2.6106,
            2.6746,
            2.6973,
            2.6837,
            1.9945,
            2.0299,
            1.9315,
            1.7799,
            1.734,
            1.7082,
            1.9263,
            1.928,
            1.9184,
            1.8562,
            1.1441,
            1.073,
            1.052,
            1.0518,
            1.0603
          ),
          oxygen = c(
            352.891,
            352.339,
            352.12,
            352.03,
            351.884,
            351.702,
            351.505,
            351.296,
            347.288,
            351.001,
            345.827,
            345.879,
            345.866,
            345.841,
            345.782,
            345.65,
            346.849,
            346.78,
            346.718,
            346.609,
            340.888,
            340.026,
            339.348,
            338.819,
            338.245
          ),
          oxygen2 = c(
            102.182,
            102.228,
            102.257,
            102.281,
            102.313,
            102.347,
            102.383,
            102.427,
            101.491,
            102.644,
            97.258,
            97.311,
            97.301,
            97.29,
            97.275,
            97.253,
            98.285,
            98.293,
            98.325,
            98.357,
            97.857,
            97.839,
            97.829,
            97.812,
            97.788
          ),
          fluorescence = c(
            0.17892,
            0.17232,
            0.17139,
            0.17278,
            0.17479,
            0.17701,
            0.1792,
            0.18158,
            0.18525,
            0.18909,
            0.10588,
            0.10647,
            0.10697,
            0.10818,
            0.10978,
            0.11157,
            0.1479,
            0.14969,
            0.15256,
            0.15607,
            0.30833,
            0.31088,
            0.31321,
            0.31532,
            0.31758
          ),
          turbidity = c(
            0.164,
            0.164,
            0.164,
            0.164,
            0.163,
            0.163,
            0.163,
            0.163,
            0.161,
            0.163,
            0.165,
            0.165,
            0.165,
            0.165,
            0.165,
            0.164,
            0.159,
            0.16,
            0.16,
            0.16,
            4.742,
            4.747,
            4.751,
            4.753,
            4.755
          ),
          conductivity = c(
            2.742475,
            2.743057,
            2.74087,
            2.73591,
            2.740138,
            2.738197,
            2.736985,
            2.747572,
            2.72913,
            2.752912,
            2.524555,
            2.525942,
            2.524426,
            2.523692,
            2.525012,
            2.526692,
            2.596153,
            2.602249,
            2.609861,
            2.613963,
            2.806078,
            2.786396,
            2.752134,
            2.712379,
            2.682113
          ),
          pH = c(
            8.094,
            8.098,
            8.099,
            8.099,
            8.099,
            8.099,
            8.099,
            8.099,
            8.014,
            8.099,
            8.16,
            8.16,
            8.16,
            8.16,
            8.159,
            8.159,
            8.149,
            8.148,
            8.147,
            8.146,
            8.121,
            8.12,
            8.12,
            8.119,
            8.118
          ),
          par = c(
            2627.7,
            2627.7,
            2627.7,
            2627.7,
            2627.7,
            2627.7,
            2627.7,
            2627.7,
            2600,
            2627.7,
            2627.8,
            2627.8,
            2627.8,
            2627.8,
            2627.8,
            2627.8,
            2627.8,
            2627.8,
            2627.8,
            2627.8,
            2627.6,
            NA,
            2627.6,
            2627.6,
            2627.6
          ),
          soundSpeed = c(
            1454.65,
            1454.65,
            1454.57,
            1454.38,
            1454.53,
            1454.47,
            1454.45,
            1454.82,
            1439.8,
            1454.96,
            1449.08,
            1449.22,
            1448.88,
            1448.36,
            1448.23,
            1448.18,
            1449.98,
            1450.1,
            1450.19,
            1450.08,
            1450.99,
            1450.48,
            1449.83,
            1449.19,
            1448.69
          ),
          svDM = c(
            1454.55,
            1454.55,
            1454.48,
            1454.29,
            1454.43,
            1454.37,
            1454.35,
            1454.72,
            1439.71,
            1454.86,
            1448.98,
            1449.12,
            1448.78,
            1448.26,
            1448.14,
            1448.08,
            1449.89,
            1450.01,
            1450.1,
            1449.98,
            1450.91,
            1450.4,
            1449.76,
            1449.12,
            1448.61
          ),
          svWM = c(
            1454.57,
            1454.57,
            1454.49,
            1454.3,
            1454.45,
            1454.39,
            1454.37,
            1454.74,
            1439.73,
            1454.89,
            1448.89,
            1449.03,
            1448.69,
            1448.18,
            1448.05,
            1448,
            1449.84,
            1449.96,
            1450.05,
            1449.94,
            1451.01,
            1450.49,
            1449.83,
            1449.16,
            1448.62
          )
        ),
        row.names = c(NA,
                      25L),
        class = "data.frame"
      )