Search code examples
rmatching

Matching a string in a column in one dataframe with EITHER of two columns in another dataframe in R


I have two dataframes df1 and df2. df1 includes a column df1$product with the name of a product. The second one includes three columns of interest here: df2$product_code, df2$product_name, and df2$product_aka_name. I need to add a column df1$product_code in df1 based on matching on however the product is called (given that it can be known with two different names as listed in df2).

So far I have tried using match() after collating df2$product_name and df2$product_aka_name (which failed). I have also tried using grep() but had to stop after a few hours without any results. Finally, I looked into fuzzyjoin, but it seemed not appropriate to my problem. The real datasets have respectively 700,000 rows and 20 columns for df1, and 40,000 rows and 5 columns for df2. So I would need something fast and efficient…

An MRE could be as follows:

df1 <- data.frame(product = c("Abcd", "Efgh", "Ijkl", "Mnop", "Qrst", "Uvwx"), col2 = c("DLK", "CBN", "ABC", "ZHU", "HSC", "LJK"), col3 = c("qdsf88", "sdf63", "dd995", "dgsg1", "xxx587", "dfr55"))
df1
          product      col2            col3
1         Abcd         DLK             qdsf88
2         Efgh         CBN             sdf63
3         Ijkl         ABC             dd995
4         Mnop         ZHU             dgsg1
5         Qrst         HSC             xxx587
6         Uvwx         LJK             dfr55

df2 <- data.frame(product_code = c("1001", "1002", "1003", "1004", "1005", "1006", "1007", "1008"), product_name = c("Fcde", "Abcd", "Efgh", "Mlfr", "Mnop", "Plor", "Kdlr", "Vfsd"), product_aka_name = c(NA, NA, NA, "Qrst", NA, "Uvwx", "Azer", "Qwer"))
df2
 product_code product_name product_aka_name
1         1001         Fcde             <NA>
2         1002         Abcd             <NA>
3         1003         Efgh             <NA>
4         1004         Mlfr             Qrst
5         1005         Mnop             <NA>
6         1006         Plor             Uvwx
7         1007         Kdlr             Azer
8         1008         Vfsd             Qwer

The result I need is as follows:

df1 <- data.frame(product = c("Abcd", "Efgh", "Ijkl", "Mnop", "Qrst", "Uvwx"), col2 = c("DLK", "CBN", "ABC", "ZHU", "HSC", "LJK"), col3 = c("qdsf88", "sdf63", "dd995", "dgsg1", "xxx587", "dfr55"), product_code = c("1002", "1003", NA, "1005", "1004", "1006"))
df1
 product col2   col3 product_code
1    Abcd  DLK qdsf88         1002
2    Efgh  CBN  sdf63         1003
3    Ijkl  ABC  dd995         <NA>
4    Mnop  ZHU  dgsg1         1005
5    Qrst  HSC xxx587         1004
6    Uvwx  LJK  dfr55         1006

Thanks a lot for any help.


Solution

  • Make it in two steps and update those with NA.

    i <- match(df1$product, df2$product_name)
    j <- is.na(i)
    i[j] <- match(df1$product[j], df2$product_aka_name)
    df2$product_code[i]
    #[1] "1002" "1003" NA     "1005" "1004" "1006"
    

    In case this is to slow use fastmatch::fmatch instead of match and maybe use which(is.na(i)) instead of is.na(i).

    Or create one vector to match and use modulo %%.

    i <- match(df1$product, c(df2$product_name, df2$product_aka_name))
    i <- (i - 1) %% nrow(df2) + 1
    df2$product_code[i]
    

    Or create a vector of the indexes, set the names and use this to subset.

    df2$product_code[setNames(rep(seq_len(nrow(df2)), 2),
            c(df2$product_name, df2$product_aka_name))[df1$product]]
    #[1] "1002" "1003" NA     "1005" "1004" "1006"
    

    Benchmark

    library(data.table)
    dt0 <- copy(df1)
    dt2 <- copy(df2)
    setDT(dt0); setDT(dt2)
    library(dplyr)
    library(tidyr)
    
    microbenchmark::microbenchmark(setup = dt1 <- copy(dt0),
    "data.table" = {
    dt1[dt2,
        on = c("product" = "product_name"),
        code1 := i.product_code
    ][dt2,
        on = c("product" = "product_aka_name"),
        code2 := i.product_code
    ][, 
        product_code := fcoalesce(
            code1, code2
        )
    ][, `:=`(
        code1 = NULL,
        code2 = NULL
     )]
    dt1$product_code},
    base = {i <- match(df1$product, df2$product_name)
      j <- is.na(i)
      i[j] <- match(df1$product[j], df2$product_aka_name)
      df2$product_code[i] },
    "dplyr-TarJae" = {df1 %>% 
      left_join(df2, by = c("product" = "product_name")) %>% 
      left_join(df2, by = c("product" = "product_aka_name")) %>% 
      mutate(product_code = coalesce(product_code.x, product_code.y), .keep = "unused") %>% 
        select(-c(product_aka_name, product_name)) },
    "dplyr-Ben Norris" = {df1 %>%
      left_join( # keeps all rows in df1 even if returns NA
        df2 %>%
          pivot_longer(cols = ends_with("name"), # pivots the columns with product names
                       values_to = "product", # names the new column to match df1
                       names_to = NULL) # discards old column names
      )}
    )
    

    Result

    Unit: microseconds
                 expr       min        lq        mean     median        uq       max neval
           data.table  1885.500  2050.626  2238.04686  2223.5400  2281.879  5869.183   100
                 base    12.285    14.954    19.24151    19.7455    21.667    38.147   100
         dplyr-TarJae  5724.538  6041.483  6515.73361  6367.4275  6552.855 14358.221   100
     dplyr-Ben Norris 16517.120 17091.760 18016.42934 17427.3015 18800.145 22272.853   100