Search code examples
rdataframejoindplyr

Join two tables if given variable value is a part of another variable


There are base_table and value_table, how to join them according whether category in mix_category ? Thanks!

library(tidyverse)
    
base_table <- data.frame(mix_category = c('ABC','WE','CF','Z236'))
#   mix_category
# 1          ABC
# 2           WE
# 3           CF
# 4         Z236

value_table <- data.frame(category = c('A','F','W','E','C','2'),
                          value = c('1','2','3','6','0','-1'))
#   category value
# 1        A     1
# 2        F     2
# 3        W     3
# 4        E     6
# 5        C     0
# 6        2    -1
  • Wished result 1 : if find category in mix_category in the first time, then return the first category and value variable which in value_table
#   mix_category category value
# 1          ABC        A     1
# 2           WE        W     3
# 3           CF        C     0
# 4         Z236        2    -1
  • Wished result 2 : if find category in mix_category , then return the category and value variable which in value_table (if there are more than one returns, combine then into a string with '_')
#   mix_category category value
# 1          ABC      A_C   1_0
# 2           WE      W_E   3_6
# 3           CF      C_F   0_2
# 4         Z236        2    -1

Solution

  • Solution 1


    You can split mix_category column into rows using tidyr::separate_longer_position(), and then match it to value_table.

    match_table <- base_table %>%
      mutate(category = mix_category) %>%
      tidyr::separate_longer_position(category, width = 1) %>%
      inner_join(value_table, by = "category")
    
    ## Request 1
    match_table %>% 
      slice_head(n = 1, by = mix_category)
    
    # or
    match_table %>% 
      left_join(base_table, ., by = "mix_category", multiple = "first")
    
    #   mix_category category value
    # 1          ABC        A     1
    # 2           WE        W     3
    # 3           CF        C     0
    # 4         Z236        2    -1
    
    ## Request 2
    match_table %>%
      summarise(across(everything(), ~ paste(.x, collapse = '_')),
                .by = mix_category)
    
    #   mix_category category value
    # 1          ABC      A_C   1_0
    # 2           WE      W_E   3_6
    # 3           CF      C_F   0_2
    # 4         Z236        2    -1
    

    Solution 2


    You can also use fuzzy joins supported by {powerjoin}.

    library(powerjoin)
    
    match_table <- power_left_join(base_table, value_table,
        by = c(~ stringr::str_detect(.x$mix_category, .y$category)))
    
    ## Request 1
    match_table %>%
      slice_head(n = 1, by = mix_category)
    
    #   mix_category category value
    # 1          ABC        A     1
    # 2           WE        W     3
    # 3           CF        F     2
    # 4         Z236        2    -1
    
    ## Request 2
    match_table %>%
      summarise(across(everything(), ~ paste(.x, collapse = '_')),
                .by = mix_category)
    
    #   mix_category category value
    # 1          ABC      A_C   1_0
    # 2           WE      W_E   3_6
    # 3           CF      F_C   2_0
    # 4         Z236        2    -1