Search code examples
rjointidyversematchleft-join

How to join tables on prefix equality?


I have a table with prefixes (here in csv format):

PREFIX,LABEL
A,Infectious diseases
B,Infectious diseases
C,Tumor
D1,Tumor
D2,Tumor
D31,Tumor
D32,Tumor
D33,Blood disorder
D4,Blood disorder
D5,Blood disorder

And I want to join it with this one:

AGE,DEATH_CODE
67,A02
85,D318
75,C007+X
62,D338

To get obviously:

AGE,LABEL
67,Infectious diseases
85,Tumor
75,Tumor
62,Blood disorder

I know how to do that with SQL and LIKE but not with tidyverse left_join or base R.


Dput of data

Table 1: CIM_CODES

structure(list(PREFIX = c("A", "B", "C", "D1", "D2", "D31", "D32", 
"D33", "D4", "D5"), LABEL = c("Infectious diseases", "Infectious diseases", 
"Tumor", "Tumor", "Tumor", "Tumor", "Tumor", "Blood disorder", 
"Blood disorder", "Blood disorder")), row.names = c(NA, -10L), spec = structure(list(
    cols = list(PREFIX = structure(list(), class = c("collector_character", 
    "collector")), LABEL = structure(list(), class = c("collector_character", 
    "collector"))), default = structure(list(), class = c("collector_guess", 
    "collector")), delim = ","), class = "col_spec"), problems = <pointer: 0x000002527d306190>, class = c("spec_tbl_df", 
"tbl_df", "tbl", "data.frame"))

Table 2: DEATH_CAUSES

structure(list(AGE = c(67, 85, 75, 62), DEATH_CODE = c("A02", 
"D318", "C007+X", "D338")), row.names = c(NA, -4L), spec = structure(list(
    cols = list(AGE = structure(list(), class = c("collector_double", 
    "collector")), DEATH_CODE = structure(list(), class = c("collector_character", 
    "collector"))), default = structure(list(), class = c("collector_guess", 
    "collector")), delim = ","), class = "col_spec"), problems = <pointer: 0x0000025273898c60>, class = c("spec_tbl_df", 
"tbl_df", "tbl", "data.frame"))

Solution

  • You could do a partial string match that has the lowest difference between the two columns:

    library(tidyverse)
    
    DEATH_CAUSES |>
      mutate(LABEL = map_chr(DEATH_CODE, 
                             ~CIM_CODES$LABEL[
                               which.min(stringdist::stringdist(.x, CIM_CODES$PREFIX))
                               ]))
    #> # A tibble: 4 x 3
    #>     AGE DEATH_CODE LABEL              
    #>   <dbl> <chr>      <chr>              
    #> 1    67 A02        Infectious diseases
    #> 2    85 D318       Tumor              
    #> 3    75 C007+X     Tumor              
    #> 4    62 D338       Blood disorder
    

    UPDATE not using the stringdist package as requested.

    library(tidyverse)
    
    
    get_match <- function(code, prefix, target){
      map(code, \(x){
        map(prefix, \(y){
          grepl(paste0("^", y), x)
          }) 
        }) |>
        map_chr(\(z) target[unlist(z) |> which()] )
    }
    
    DEATH_CAUSES |>
      mutate(LABEL = get_match(DEATH_CAUSES$DEATH_CODE, 
                               CIM_CODES$PREFIX, 
                               CIM_CODES$LABEL))
    #> # A tibble: 4 x 3
    #>     AGE DEATH_CODE LABEL              
    #>   <dbl> <chr>      <chr>              
    #> 1    67 A02        Infectious diseases
    #> 2    85 D318       Tumor              
    #> 3    75 C007+X     Tumor              
    #> 4    62 D338       Blood disorder
    

    EDIT how to do this with a join:

    library(tidyverse)
    library(fuzzyjoin)
    
    fuzzy_left_join(DEATH_CAUSES, 
                    CIM_CODES, 
                    by = c("DEATH_CODE" = "PREFIX"), 
                    str_detect)
    #> # A tibble: 4 x 4
    #>     AGE DEATH_CODE PREFIX LABEL              
    #>   <dbl> <chr>      <chr>  <chr>              
    #> 1    67 A02        A      Infectious diseases
    #> 2    85 D318       D31    Tumor              
    #> 3    75 C007+X     C      Tumor              
    #> 4    62 D338       D33    Blood disorder