Search code examples
r function apply

R: how to use apply in dataset


I have a dataset with the susceptibilities of different bacteria to various drugs. I would like to get the susceptibility frequencies by organism for each drug. Is there a way to streamline this, instead of copy/pasting the same code for each drug? I'm thinking of using apply or maybe writing a function, but I'm not sure where to start.

pacman::p_load(tidyverse,
               janitor)

# Demo data: one row per isolate. `organism_name` identifies the organism and
# each remaining column holds that isolate's result code ("S", "I" or "R")
# for one drug.
demo_dat <- data.frame(
  organism_name = c(
    "Klebsiella pneumonia", "Klebsiella pneumonia", "Escherichia coli",
    "Klebsiella pneumonia", "Enterobacter cloacae", "Escherichia coli",
    "Klebsiella pneumonia", "Escherichia coli", "Escherichia coli",
    "Escherichia coli", "Klebsiella pneumonia", "Klebsiella pneumonia",
    "Escherichia coli", "Klebsiella pneumonia", "Escherichia coli",
    "Serratia marcenscens", "Klebsiella oxytoca", "Escherichia coli",
    "Proteus mirabilis", "Escherichia coli"
  ),
  amox_clav_po = c(
    "S", "S", "S", "I", "R", "I", "S", "I", "R", "I",
    "S", "S", "S", "S", "I", "R", "S", "S", "S", "R"
  ),
  amp_sul_iv = c(
    "S", "I", "S", "S", "R", "R", "S", "S", "R", "I",
    "S", "I", "S", "I", "R", "R", "S", "S", "S", "R"
  ),
  cefaclor_po = c(
    "S", "S", "S", "S", "R", "S", "S", "S", "S", "S",
    "S", "S", "S", "S", "R", "R", "S", "S", "S", "S"
  ),
  ceftriaxone_iv = c(
    "S", "S", "S", "S", "S", "S", "S", "S", "S", "S",
    "S", "S", "S", "S", "R", "S", "S", "S", "S", "S"
  ),
  # Keep every column as character rather than factor.
  stringsAsFactors = FALSE
)

# Proportion of rows per organism whose `amox_clav_po` result is "S".
# `amox_clav_po == "S"` is a logical vector, so its mean within each group is
# the share of TRUE values; this replaces dividing every element by n() and
# then summing.
demo_dat |>
  group_by(organism_name) |>
  summarise(susceptibility = mean(amox_clav_po == "S"))
#> # A tibble: 6 × 2
#>   organism_name        susceptibility
#>   <chr>                         <dbl>
#> 1 Enterobacter cloacae          0    
#> 2 Escherichia coli              0.333
#> 3 Klebsiella oxytoca            1    
#> 4 Klebsiella pneumonia          0.857
#> 5 Proteus mirabilis             1    
#> 6 Serratia marcenscens          0

# Proportion of rows per organism whose `amp_sul_iv` result is "S".
# `amp_sul_iv == "S"` is a logical vector, so its mean within each group is
# the share of TRUE values; this replaces dividing every element by n() and
# then summing.
demo_dat |>
  group_by(organism_name) |>
  summarise(susceptibility = mean(amp_sul_iv == "S"))
#> # A tibble: 6 × 2
#>   organism_name        susceptibility
#>   <chr>                         <dbl>
#> 1 Enterobacter cloacae          0    
#> 2 Escherichia coli              0.444
#> 3 Klebsiella oxytoca            1    
#> 4 Klebsiella pneumonia          0.571
#> 5 Proteus mirabilis             1    
#> 6 Serratia marcenscens          0

Created on 2024-01-29 with reprex v2.0.2

Session info
# Print the R session details (platform settings and package versions) that
# were in effect when this example was run.
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.2.2 (2022-10-31)
#>  os       macOS Big Sur ... 10.16
#>  system   x86_64, darwin17.0
#>  ui       X11
#>  language (EN)
#>  collate  en_US.UTF-8
#>  ctype    en_US.UTF-8
#>  tz       America/Phoenix
#>  date     2024-01-29
#>  pandoc   3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package       * version date (UTC) lib source
#>  assertthat      0.2.1   2019-03-21 [1] CRAN (R 4.2.0)
#>  backports       1.4.1   2021-12-13 [1] CRAN (R 4.2.0)
#>  broom           1.0.1   2022-08-29 [1] CRAN (R 4.2.0)
#>  cellranger      1.1.0   2016-07-27 [1] CRAN (R 4.2.0)
#>  cli             3.6.1   2023-03-23 [1] CRAN (R 4.2.0)
#>  colorspace      2.0-3   2022-02-21 [1] CRAN (R 4.2.0)
#>  crayon          1.5.2   2022-09-29 [1] CRAN (R 4.2.0)
#>  DBI             1.1.3   2022-06-18 [1] CRAN (R 4.2.0)
#>  dbplyr          2.2.1   2022-06-27 [1] CRAN (R 4.2.0)
#>  digest          0.6.30  2022-10-18 [1] CRAN (R 4.2.0)
#>  dplyr         * 1.1.2   2023-04-20 [1] CRAN (R 4.2.0)
#>  ellipsis        0.3.2   2021-04-29 [1] CRAN (R 4.2.0)
#>  evaluate        0.17    2022-10-07 [1] CRAN (R 4.2.0)
#>  fansi           1.0.3   2022-03-24 [1] CRAN (R 4.2.0)
#>  fastmap         1.1.0   2021-01-25 [1] CRAN (R 4.2.0)
#>  forcats       * 0.5.2   2022-08-19 [1] CRAN (R 4.2.0)
#>  fs              1.5.2   2021-12-08 [1] CRAN (R 4.2.0)
#>  gargle          1.2.1   2022-09-08 [1] CRAN (R 4.2.0)
#>  generics        0.1.3   2022-07-05 [1] CRAN (R 4.2.0)
#>  ggplot2       * 3.3.6   2022-05-03 [1] CRAN (R 4.2.0)
#>  glue            1.6.2   2022-02-24 [1] CRAN (R 4.2.0)
#>  googledrive     2.0.0   2021-07-08 [1] CRAN (R 4.2.0)
#>  googlesheets4   1.0.1   2022-08-13 [1] CRAN (R 4.2.0)
#>  gtable          0.3.1   2022-09-01 [1] CRAN (R 4.2.0)
#>  haven           2.5.1   2022-08-22 [1] CRAN (R 4.2.0)
#>  highr           0.9     2021-04-16 [1] CRAN (R 4.2.0)
#>  hms             1.1.2   2022-08-19 [1] CRAN (R 4.2.0)
#>  htmltools       0.5.3   2022-07-18 [1] CRAN (R 4.2.0)
#>  httr            1.4.4   2022-08-17 [1] CRAN (R 4.2.0)
#>  janitor       * 2.1.0   2021-01-05 [1] CRAN (R 4.2.0)
#>  jsonlite        1.8.3   2022-10-21 [1] CRAN (R 4.2.0)
#>  knitr           1.40    2022-08-24 [1] CRAN (R 4.2.0)
#>  lifecycle       1.0.3   2022-10-07 [1] CRAN (R 4.2.0)
#>  lubridate       1.9.0   2022-11-06 [1] CRAN (R 4.2.0)
#>  magrittr        2.0.3   2022-03-30 [1] CRAN (R 4.2.0)
#>  modelr          0.1.9   2022-08-19 [1] CRAN (R 4.2.0)
#>  munsell         0.5.0   2018-06-12 [1] CRAN (R 4.2.0)
#>  pacman          0.5.1   2019-03-11 [1] CRAN (R 4.2.0)
#>  pillar          1.9.0   2023-03-22 [1] CRAN (R 4.2.0)
#>  pkgconfig       2.0.3   2019-09-22 [1] CRAN (R 4.2.0)
#>  purrr         * 1.0.1   2023-01-10 [1] CRAN (R 4.2.0)
#>  R6              2.5.1   2021-08-19 [1] CRAN (R 4.2.0)
#>  readr         * 2.1.3   2022-10-01 [1] CRAN (R 4.2.0)
#>  readxl          1.4.1   2022-08-17 [1] CRAN (R 4.2.0)
#>  reprex          2.0.2   2022-08-17 [1] CRAN (R 4.2.0)
#>  rlang           1.1.1   2023-04-28 [1] CRAN (R 4.2.0)
#>  rmarkdown       2.17    2022-10-07 [1] CRAN (R 4.2.0)
#>  rstudioapi      0.14    2022-08-22 [1] CRAN (R 4.2.0)
#>  rvest           1.0.3   2022-08-19 [1] CRAN (R 4.2.0)
#>  scales          1.2.1   2022-08-20 [1] CRAN (R 4.2.0)
#>  sessioninfo     1.2.2   2021-12-06 [1] CRAN (R 4.2.0)
#>  snakecase       0.11.0  2019-05-25 [1] CRAN (R 4.2.0)
#>  stringi         1.7.8   2022-07-11 [1] CRAN (R 4.2.0)
#>  stringr       * 1.4.1   2022-08-20 [1] CRAN (R 4.2.0)
#>  tibble        * 3.2.1   2023-03-20 [1] CRAN (R 4.2.0)
#>  tidyr         * 1.2.1   2022-09-08 [1] CRAN (R 4.2.0)
#>  tidyselect      1.2.0   2022-10-10 [1] CRAN (R 4.2.0)
#>  tidyverse     * 1.3.2   2022-07-18 [1] CRAN (R 4.2.0)
#>  timechange      0.1.1   2022-11-04 [1] CRAN (R 4.2.0)
#>  tzdb            0.4.0   2023-05-12 [1] CRAN (R 4.2.0)
#>  utf8            1.2.2   2021-07-24 [1] CRAN (R 4.2.0)
#>  vctrs           0.6.2   2023-04-19 [1] CRAN (R 4.2.0)
#>  withr           2.5.0   2022-03-03 [1] CRAN (R 4.2.0)
#>  xfun            0.34    2022-10-18 [1] CRAN (R 4.2.0)
#>  xml2            1.3.3   2021-11-30 [1] CRAN (R 4.2.0)
#>  yaml            2.3.6   2022-10-18 [1] CRAN (R 4.2.0)
#> 
#>  [1] /Library/Frameworks/R.framework/Versions/4.2/Resources/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────

Solution

  • You can pivot and summarize:

    library(dplyr)
    library(tidyr)
    # Stack every drug column into name/value pairs (one row per isolate and
    # drug), then compute the fraction of "S" results for each
    # organism/drug combination.
    out <- demo_dat |>
      pivot_longer(
        cols = !organism_name,
        names_to = "name",
        values_to = "value"
      ) |>
      summarize(
        .by = c(organism_name, name),
        susceptibility = sum(value == "S") / n()
      )

    out
    # # A tibble: 24 × 3
    #    organism_name        name           susceptibility
    #    <chr>                <chr>                   <dbl>
    #  1 Klebsiella pneumonia amox_clav_po            0.857
    #  2 Klebsiella pneumonia amp_sul_iv              0.571
    #  3 Klebsiella pneumonia cefaclor_po             1    
    #  4 Klebsiella pneumonia ceftriaxone_iv          1    
    #  5 Escherichia coli     amox_clav_po            0.333
    #  6 Escherichia coli     amp_sul_iv              0.444
    #  7 Escherichia coli     cefaclor_po             0.889
    #  8 Escherichia coli     ceftriaxone_iv          0.889
    #  9 Enterobacter cloacae amox_clav_po            0    
    # 10 Enterobacter cloacae amp_sul_iv              0    
    # # ℹ 14 more rows
    # # ℹ Use `print(n = ...)` to see more rows
    

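    Just to make sure we're seeing the same thing as your output (the values match; only the row order differs, because `.by` keeps groups in order of first appearance instead of sorting them):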

    # Keep only the rows for the amox_clav_po drug column.
    out |>
      filter(name == "amox_clav_po")
    # # A tibble: 6 × 3
    #   organism_name        name         susceptibility
    #   <chr>                <chr>                 <dbl>
    # 1 Klebsiella pneumonia amox_clav_po          0.857
    # 2 Escherichia coli     amox_clav_po          0.333
    # 3 Enterobacter cloacae amox_clav_po          0    
    # 4 Serratia marcenscens amox_clav_po          0    
    # 5 Klebsiella oxytoca   amox_clav_po          1    
    # 6 Proteus mirabilis    amox_clav_po          1