Search code examples
rloopsfor-loopvectorization

How to vectorize nested loops in R?


I want to vectorize this operation:

for (row1 in 1:nrow(full_df)) {
  for (row2 in 1:nrow(icd10_codes)){
    if (any(full_df[row1, "coding_19"] %in% icd10_codes[row2, "present_icd10"])){
      full_df[row1, "code_count"] <- full_df[row1, "code_count"]+1
    }
  }
}

full_df is this:

full_df <- data.frame(
           coding_19=I(list(c("H353"), c("B20","B21", "B22", "B23","B24","Z21", "F024","O987"), c("G30","F00"), c("E780")))
)
full_df$code_count <- 0

icd10_codes looks like this. eid is the id of the person, and present_icd10 are the codes associated to that person.

If you want you can just copy and paste:

data.frame(
    eid=c(1,2,3,4,5,6),
    present_icd10=I(list(c("G30", "F00"), c("B20"), c("E780"), c("H401", "H409"), c("H353"), c("E780")))
    )

notice that present_icd10 and coding_19 are vectors of n dimensions.

I want to count the times where at least one element in full_df$coding_19 (rowise) is present in each person (present_icd10)

The expected output should be:

coding_19                                              code_count
"H353"                                                     1
"B20", "B21", "B22", "B23", "B24", "Z21", "F024", "O987"   1
"G30", "F00"                                               1
"E780"                                                     2

Meaning that there is 1 person with code H353, 1 person with code B20, 1 person with either G30 or F00 (in this case both), and 2 persons with code E780.

I tried using this:

full_df <- full_df %>%
  rowwise() %>%
  mutate(code_count = code_count + as.integer(any(coding_19 %in% icd10_codes$present_icd10)))

But I think this would only work if I had a single loop, not nested loops.


Solution

  • It's hackish, but it gets to your desired output:

    library(tidyr)
    library(dplyr)
    
    full_df <- full_df |> 
      as_tibble() |> 
      mutate(id_row = row_number()) |> 
      unnest(coding_19)
    icd10_codes <- icd10_codes |> 
      as_tibble() |> 
      unnest(present_icd10)
    

    For the above, the use of unnest() separates each of the codes into separate lines, and we'll rely on the id variables (which I created for full_df) to keep track as to which codes belong with which when wrapping up.

    full_df |> 
      left_join(icd10_codes, by = join_by(coding_19 == present_icd10)) |> # Merge the two data frames together, using the codes as the key
      group_by(id_row) |> # For each id_row...
      fill(eid) |> # ... replace NAs with the eid
      group_by(id_row, eid) |> # And for each combination of id_row and eid
      summarize(coding_19 = paste(coding_19, collapse = ", ")) |> # ... collapse the codes back.
      ungroup() |> # And remove the grouping
      count(coding_19, name = "code_count") # And count the number of occurrences by `coding_19`.
    
    # A tibble: 4 × 2
      coding_19                           code_count
      <chr>                                    <int>
    1 B20, B21, B22, B23, B24, Z21, F024, O987     1
    2 E780                                         2
    3 G30, F00                                     1
    4 H353                                         1
    

    This would need to be tested to a larger dataset. There may be a lapse in logic in there.


    EDIT:

    With this data:

    ex_icd10_codes <- data.frame(eid=1:7, 
                                 present_icd10=I(list(c("G30", "F00"), "B20", "E780", c("H401", "H409"), "H353", "E780", "B20"))) 
    
    full_df <- data.frame(
      coding_19=I(list("H353", c("B20","B21", "B22", "B23","B24","Z21", "F024","O987"), c("G30","F00"), "E780"))
    ) 
    

    We could do:

    library(tidyverse)
    library(fuzzyjoin)
    
    ex_icd10_codes <- ex_icd10_codes |> 
      as_tibble() |> 
      unnest(present_icd10)
    
    full_df |> 
      as_tibble() |> 
      mutate(id_row = row_number(),
             coding_19 = map_chr(coding_19, ~ paste(.x, collapse = ", "))) |> # Collapse the codes into one string per row
      fuzzy_left_join(ex_icd10_codes, by = c("coding_19" = "present_icd10"), match_fun = str_detect) |> # Merge the two data frames together, using the codes as the key, if the code in ex_icd10_codes is in full_df
      distinct(eid, .keep_all = TRUE) |> # Remove duplicates
      count(coding_19, name = "code_count") # And count the number of occurrences by `coding_19`.
    
    # A tibble: 4 × 2
      coding_19                                code_count
      <chr>                                         <int>
    1 B20, B21, B22, B23, B24, Z21, F024, O987          2
    2 E780                                              2
    3 G30, F00                                          1
    4 H353                                              1