I want to vectorize this operation:
for (row1 in 1:nrow(full_df)) {
for (row2 in 1:nrow(icd10_codes)){
if (any(full_df[row1, "coding_19"] %in% icd10_codes[row2, "present_icd10"])){
full_df[row1, "code_count"] <- full_df[row1, "code_count"]+1
}
}
}
full_df is this:
full_df <- data.frame(
coding_19=I(list(c("H353"), c("B20","B21", "B22", "B23","B24","Z21", "F024","O987"), c("G30","F00"), c("E780")))
)
full_df$code_count <- 0
icd10_codes
looks like this. eid is the id of the person, and present_icd10
are the codes associated to that person.
If you want you can just copy and paste:
data.frame(
eid=c(1,2,3,4,5,6),
present_icd10=I(list(c("G30", "F00"), c("B20"), c("E780"), c("H401", "H409"), c("H353"), c("E780")))
)
notice that present_icd10
and coding_19
are vectors of n dimensions.
I want to count the times where at least one element in full_df$coding_19
(rowise) is present in each person (present_icd10
)
The expected output should be:
coding_19 code_count
"H353" 1
"B20", "B21", "B22", "B23", "B24", "Z21", "F024", "O987" 1
"G30", "F00" 1
"E780" 2
Meaning that there is 1 person with code H353, 1 person with code B20, 1 person with either G30 or F00 (in this case both), and 2 persons with code E780.
I tried using this:
full_df <- full_df %>%
rowwise() %>%
mutate(code_count = code_count + as.integer(any(coding_19 %in% icd10_codes$present_icd10)))
But I think this would only work if I had a single loop, not nested loops.
It's hackish, but it gets to your desired output:
library(tidyr)
library(dplyr)
full_df <- full_df |>
as_tibble() |>
mutate(id_row = row_number()) |>
unnest(coding_19)
icd10_codes <- icd10_codes |>
as_tibble() |>
unnest(present_icd10)
For the above, the use of unnest()
separates each of the codes into separate lines, and we'll rely on the id variables (which I created for full_df
) to keep track as to which codes belong with which when wrapping up.
full_df |>
left_join(icd10_codes, by = join_by(coding_19 == present_icd10)) |> # Merge the two data frames together, using the codes as the key
group_by(id_row) |> # For each id_row...
fill(eid) |> # ... replace NAs with the eid
group_by(id_row, eid) |> # And for each combination of id_row and eid
summarize(coding_19 = paste(coding_19, collapse = ", ")) |> # ... collapse the codes back.
ungroup() |> # And remove the grouping
count(coding_19, name = "code_count") # And count the number of occurrences by `coding_19`.
# A tibble: 4 × 2
coding_19 code_count
<chr> <int>
1 B20, B21, B22, B23, B24, Z21, F024, O987 1
2 E780 2
3 G30, F00 1
4 H353 1
This would need to be tested to a larger dataset. There may be a lapse in logic in there.
EDIT:
With this data:
ex_icd10_codes <- data.frame(eid=1:7,
present_icd10=I(list(c("G30", "F00"), "B20", "E780", c("H401", "H409"), "H353", "E780", "B20")))
full_df <- data.frame(
coding_19=I(list("H353", c("B20","B21", "B22", "B23","B24","Z21", "F024","O987"), c("G30","F00"), "E780"))
)
We could do:
library(tidyverse)
library(fuzzyjoin)
ex_icd10_codes <- ex_icd10_codes |>
as_tibble() |>
unnest(present_icd10)
full_df |>
as_tibble() |>
mutate(id_row = row_number(),
coding_19 = map_chr(coding_19, ~ paste(.x, collapse = ", "))) |> # Collapse the codes into one string per row
fuzzy_left_join(ex_icd10_codes, by = c("coding_19" = "present_icd10"), match_fun = str_detect) |> # Merge the two data frames together, using the codes as the key, if the code in ex_icd10_codes is in full_df
distinct(eid, .keep_all = TRUE) |> # Remove duplicates
count(coding_19, name = "code_count") # And count the number of occurrences by `coding_19`.
# A tibble: 4 × 2
coding_19 code_count
<chr> <int>
1 B20, B21, B22, B23, B24, Z21, F024, O987 2
2 E780 2
3 G30, F00 1
4 H353 1