Search code examples
rdatabasebioinformatics

FInding multiple time reportings across columns


I am trying to assess a dataset of injuries. The data comes from 4 sources (hospital, gp, self report, death) each with a time to the injury given in years (continuous variable). A single person may have an injury reported in one source, or multiple sources. I wish to know if the hospital injuries are reported elsewhere in other sources (any time to injury within 0.25 is considered the same injury).

I therefore want to create a column saying Hospital_elsewhere_1 where if there is a time in the Hospital_1 column, the column Hospital_elsewhere_1 will say "Hospital" and if there is a time within 0.25 in any of the other columns (not including Hospital) it will also include a text, seperated by a | of the source.

FOr instance if there is a injury at 65.44 years in Hospital_1 and a injury at 65.42 in GP_1 and 65.43 in self_report it would say "Hospital|GP|self_report".

I would want to do this for each of the hospital columns so there would be a Hospital_elsewhere_(i)

A example dataset is below

library(tibble)
set.seed(123)

example_data <- tibble(
  id = 1:30,
  Hospital_1 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  Hospital_2 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  Hospital_3 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  Hospital_4 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_1 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_2 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_3 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_4 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_5 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_6 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_7 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_8 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_9 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_10 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  self_report_1 = sample(c(NA, round(runif(15, 1, 80), 2)), 30, replace = TRUE),
  self_report_2 = sample(c(NA, round(runif(15, 1, 80), 2)), 30, replace = TRUE),
  self_report_3 = sample(c(NA, round(runif(15, 1, 80), 2)), 30, replace = TRUE),
  self_report_4 = sample(c(NA, round(runif(15, 1, 80), 2)), 30, replace = TRUE),
  death_1 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE)
)

for (i in 1:10) {
  index <- sample(1:30, 1)  
  gp_value <- round(runif(1, 1, 80), 2)  
  example_data[index, paste0("GP_", 1:4)] <- gp_value 
  example_data[index, paste0("Hospital_", 1:4)] <- gp_value + runif(1, -0.25, 0.25)
}


Solution

    1. Transform your data with one (list) column per location:

      library(dplyr)
      library(purrr)
      library(tidyr)
      (list_df <- example_data %>%
        pivot_longer(names_to = c("Type", "nr"), 
                     names_pattern = "(.*)_(\\d*)", 
                     cols = -id,
                     names_transform = list(nr = as.integer)) %>%
        summarize(vals = list(value), .by = c(id, Type)) %>%
        pivot_wider(names_from = Type, values_from = vals) %>%
        relocate(GP, .after = Hospital))
      # # A tibble: 30 × 5
      #       id Hospital  GP         self_report death    
      #    <int> <list>    <list>     <list>      <list>   
      #  1     1 <dbl [4]> <dbl [10]> <dbl [4]>   <dbl [1]>
      #  2     2 <dbl [4]> <dbl [10]> <dbl [4]>   <dbl [1]>
      #  3     3 <dbl [4]> <dbl [10]> <dbl [4]>   <dbl [1]>
      # [...]
      
    2. For each Hospital check whether there is any location within the time range:

      detect_reportings <- function(other, Hospital) {
        imap_chr(set_names(Hospital, paste0("Hospital_", seq_along(Hospital))),
                 ~ if_else(any(abs(other - .x) <= .25), cur_column(), NA_character_)) %>%
        list()
      }
      

      This function will eventually return a character vector of length 4 (== number of hospitals) for each other location which is NA if there is noevent within the time range and the name of the location otherwise:

      (dist_check <- list_df %>%
        group_by(id) %>%
        mutate(across(GP:death, ~ detect_reportings(.x[[1]], Hospital[[1]])),
               Hospital = set_names(if_else(is.na(Hospital[[1]]), NA_character_, "Hospital"),
                                    paste0("Hospital_", seq_along(Hospital[[1]]))) %>%
                           list()))
      # # A tibble: 30 × 5
      # # Groups:   id [30]
      #       id Hospital  GP        self_report death    
      #    <int> <list>    <list>    <list>      <list>   
      #  1     1 <chr [4]> <chr [4]> <chr [4]>   <chr [4]>
      #  2     2 <chr [4]> <chr [4]> <chr [4]>   <chr [4]>
      #  3     3 <chr [4]> <chr [4]> <chr [4]>   <chr [4]>
      # [...]
      
      dist_check$GP[[16]]
      ## Hospital_3 and Hospital_4 were within the .25 range
      # Hospital_1 Hospital_2 Hospital_3 Hospital_4 
      #         NA         NA       "GP"       "GP" 
      example_data[16, c(4:5, 8, 14)]
      # # A tibble: 1 × 4
      #   Hospital_3 Hospital_4  GP_3  GP_9
      #        <dbl>      <dbl> <dbl> <dbl>
      # 1       25.6       19.9  25.4  19.7
      
    3. The final step is to merge these checks:

      merge_reportings <- function(...) {
        cbind(...) %>%
          as_tibble() %>%
          unite("result", sep = "|", na.rm = TRUE) %>%
          t() %>%
          c() %>%
          set_names(paste0("Hospital_elsewhere_", seq_along(..1))) %>%
          as.list() %>%
          as_tibble()
      }
      res <- dist_check %>%
        reframe(merge_reportings(Hospital[[1]], GP[[1]], self_report[[1]], death[[1]]))
      print(res, n = 10L)
      
      # # A tibble: 30 × 5
      #       id Hospital_elsewhere_1 Hospital_elsewhere_2 Hospital_elsewhere_3 Hospital_elsewhere_4
      #    <int> <chr>                <chr>                <chr>                <chr>               
      #  1     1 Hospital             Hospital             Hospital             "Hospital"          
      #  2     2 Hospital             Hospital             Hospital             "Hospital"          
      #  3     3 Hospital             Hospital             Hospital             "Hospital"          
      #  4     4 Hospital             Hospital             Hospital             "Hospital"          
      #  5     5 Hospital             Hospital             Hospital             "Hospital"          
      #  6     6 Hospital             Hospital             Hospital|GP          "Hospital|GP"       
      #  7     7 Hospital|GP          Hospital|GP          Hospital|GP          "Hospital|GP"       
      #  8     8 Hospital|GP          Hospital|GP          Hospital|GP          "Hospital|GP"       
      #  9     9 Hospital             Hospital             Hospital             "Hospital"          
      # 10    10 Hospital             Hospital             Hospital             "Hospital"          
      # # ℹ 20 more rows
      # # ℹ Use `print(n = ...)` to see more rows