I have two dataframes df1 and df2. df1 includes a column df1$product with the name of a product. The second one includes three columns of interest here: df2$product_code, df2$product_name, and df2$product_aka_name. I need to add a column df1$product_code in df1 based on matching on however the product is called (given that it can be known with two different names as listed in df2).
So far I have tried using match() after collating df2$product_name and df2$product_aka_name (which failed). I have also tried using grep() but had to stop after a few hours without any results. Finally, I looked into fuzzyjoin, but it seemed not appropriate to my problem. The real datasets have respectively 700,000 rows and 20 columns for df1, and 40,000 rows and 5 columns for df2. So I would need something fast and efficient…
An MRE could be as follows:
df1 <- data.frame(product = c("Abcd", "Efgh", "Ijkl", "Mnop", "Qrst", "Uvwx"), col2 = c("DLK", "CBN", "ABC", "ZHU", "HSC", "LJK"), col3 = c("qdsf88", "sdf63", "dd995", "dgsg1", "xxx587", "dfr55"))
df1
product col2 col3
1 Abcd DLK qdsf88
2 Efgh CBN sdf63
3 Ijkl ABC dd995
4 Mnop ZHU dgsg1
5 Qrst HSC xxx587
6 Uvwx LJK dfr55
df2 <- data.frame(product_code = c("1001", "1002", "1003", "1004", "1005", "1006", "1007", "1008"), product_name = c("Fcde", "Abcd", "Efgh", "Mlfr", "Mnop", "Plor", "Kdlr", "Vfsd"), product_aka_name = c(NA, NA, NA, "Qrst", NA, "Uvwx", "Azer", "Qwer"))
df2
product_code product_name product_aka_name
1 1001 Fcde <NA>
2 1002 Abcd <NA>
3 1003 Efgh <NA>
4 1004 Mlfr Qrst
5 1005 Mnop <NA>
6 1006 Plor Uvwx
7 1007 Kdlr Azer
8 1008 Vfsd Qwer
The result I need is as follows:
df1 <- data.frame(product = c("Abcd", "Efgh", "Ijkl", "Mnop", "Qrst", "Uvwx"), col2 = c("DLK", "CBN", "ABC", "ZHU", "HSC", "LJK"), col3 = c("qdsf88", "sdf63", "dd995", "dgsg1", "xxx587", "dfr55"), product_code = c("1002", "1003", NA, "1005", "1004", "1006"))
df1
product col2 col3 product_code
1 Abcd DLK qdsf88 1002
2 Efgh CBN sdf63 1003
3 Ijkl ABC dd995 <NA>
4 Mnop ZHU dgsg1 1005
5 Qrst HSC xxx587 1004
6 Uvwx LJK dfr55 1006
Thanks a lot for any help.
Make it in two steps and update those with NA
.
i <- match(df1$product, df2$product_name)
j <- is.na(i)
i[j] <- match(df1$product[j], df2$product_aka_name)
df2$product_code[i]
#[1] "1002" "1003" NA "1005" "1004" "1006"
In case this is to slow use fastmatch::fmatch
instead of match
and maybe use which(is.na(i))
instead of is.na(i)
.
Or create one vector to match and use modulo %%
.
i <- match(df1$product, c(df2$product_name, df2$product_aka_name))
i <- (i - 1) %% nrow(df2) + 1
df2$product_code[i]
Or create a vector of the indexes, set the names and use this to subset.
df2$product_code[setNames(rep(seq_len(nrow(df2)), 2),
c(df2$product_name, df2$product_aka_name))[df1$product]]
#[1] "1002" "1003" NA "1005" "1004" "1006"
Benchmark
library(data.table)
dt0 <- copy(df1)
dt2 <- copy(df2)
setDT(dt0); setDT(dt2)
library(dplyr)
library(tidyr)
microbenchmark::microbenchmark(setup = dt1 <- copy(dt0),
"data.table" = {
dt1[dt2,
on = c("product" = "product_name"),
code1 := i.product_code
][dt2,
on = c("product" = "product_aka_name"),
code2 := i.product_code
][,
product_code := fcoalesce(
code1, code2
)
][, `:=`(
code1 = NULL,
code2 = NULL
)]
dt1$product_code},
base = {i <- match(df1$product, df2$product_name)
j <- is.na(i)
i[j] <- match(df1$product[j], df2$product_aka_name)
df2$product_code[i] },
"dplyr-TarJae" = {df1 %>%
left_join(df2, by = c("product" = "product_name")) %>%
left_join(df2, by = c("product" = "product_aka_name")) %>%
mutate(product_code = coalesce(product_code.x, product_code.y), .keep = "unused") %>%
select(-c(product_aka_name, product_name)) },
"dplyr-Ben Norris" = {df1 %>%
left_join( # keeps all rows in df1 even if returns NA
df2 %>%
pivot_longer(cols = ends_with("name"), # pivots the columns with product names
values_to = "product", # names the new column to match df1
names_to = NULL) # discards old column names
)}
)
Result
Unit: microseconds
expr min lq mean median uq max neval
data.table 1885.500 2050.626 2238.04686 2223.5400 2281.879 5869.183 100
base 12.285 14.954 19.24151 19.7455 21.667 38.147 100
dplyr-TarJae 5724.538 6041.483 6515.73361 6367.4275 6552.855 14358.221 100
dplyr-Ben Norris 16517.120 17091.760 18016.42934 17427.3015 18800.145 22272.853 100