Search code examples
rdplyrmemory-efficientfuzzy-comparisonfuzzyjoin

Limiting the amount of fuzzy string comparisons by comparing by subgroup


I have two datasets as follows:

DT1 <- structure(list(Province = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3), Year = c(2000, 
2000, 2000, 2001, 2001, 2001, 2002, 2002, 2002, 2000, 2000, 2000, 
2001, 2001, 2001, 2002, 2002, 2002, 2000, 2000, 2000, 2001, 2001, 
2001, 2002, 2002, 2002), Municipality = c("Something", "Anything", 
"Nothing", "Something", "Anything", "Nothing", "Something", "Anything", 
"Nothing", "Something", "Anything", "Nothing", "Something", "Anything", 
"Nothing", "Something", "Anything", "Nothing", "Something", "Anything", 
"Nothing", "Something", "Anything", "Nothing", "Something", "Anything", 
"Nothing"), Values = c(0.59, 0.58, 0.66, 0.53, 0.94, 0.2, 0.86, 
0.85, 0.99, 0.59, 0.58, 0.66, 0.53, 0.94, 0.2, 0.86, 0.85, 0.99, 
0.59, 0.58, 0.66, 0.53, 0.94, 0.2, 0.86, 0.85, 0.99)), row.names = c(NA, 
-27L), class = c("tbl_df", "tbl", "data.frame"))

DT2 <- structure(list(Province = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3), Year = c(2000, 
2000, 2000, 2001, 2001, 2001, 2002, 2002, 2002, 2000, 2000, 2000, 
2001, 2001, 2001, 2002, 2002, 2002, 2000, 2000, 2000, 2001, 2001, 
2001, 2002, 2002, 2002), Municipality = c("Some", "Anything", 
"Nothing", "Someth.", "Anything", "Not", "Something", "Anything", 
"None", "Some", "Anything", "Nothing", "Someth.", "Anything", 
"Not", "Something", "Anything", "None", "Some", "Anything", "Nothing", 
"Someth.", "Anything", "Not", "Something", "Anything", "None"
), `Other Values` = c(0.41, 0.42, 0.34, 0.47, 0.0600000000000001, 
0.8, 0.14, 0.15, 0.01, 0.41, 0.42, 0.34, 0.47, 0.0600000000000001, 
0.8, 0.14, 0.15, 0.01, 0.41, 0.42, 0.34, 0.47, 0.0600000000000001, 
0.8, 0.14, 0.15, 0.01)), row.names = c(NA, -27L), class = c("tbl_df", 
"tbl", "data.frame"))

I am trying to match them as follows, suggested in this link, by Arthur Yip.

library(fuzzyjoin); library(dplyr);
stringdist_join(DT1, DT2, 
                by = "Municipality",
                mode = "left",
                ignore_case = TRUE, 
                method = "jw", 
                max_dist = 10, 
                distance_col = "dist") %>%
  group_by(Municipality.x) %>%
  top_n(1, -dist)

The issue is that the code completely fries my computer, so I would like to split up the code into groups to limit the amount of string comparisons. I tried:

library(fuzzyjoin); library(dplyr);
stringdist_join(DT1, DT2, 
                by = c("Municipality","Year", "State"),
                mode = "left",
                ignore_case = TRUE, 
                method = "jw", 
                max_dist = 10, 
                distance_col = "dist") %>%
  group_by(Municipality.x) %>%
  top_n(1, -dist)

stringdist_join(DT1, DT2, 
                by = "Municipality",
                mode = "left",
                ignore_case = TRUE, 
                method = "jw", 
                max_dist = 10, 
                distance_col = "dist") %>%
  group_by(Municipality, Year, Province) %>%
  top_n(1, -dist)

But both give me the following respective errors:

Error: All columns in a tibble must be vectors.
x Column `col` is NULL.
Run `rlang::last_error()` to see where the error occurred.

And:

Error: Must group by variables found in `.data`.
* Column `Municipality` is not found.
* Column `Year` is not found.
* Column `Province` is not found.
Run `rlang::last_error()` to see where the error occurred.

What would be the proper way to do this?


Solution

  • You were on the right track - just a few typos/bugs and you need to finish changing/replacing the column names.

    Also, in your first one, you will need to figure out how you want to pick the "best match" based on Municipality.dist, Province.dist, and Year.dist.

    Maybe the second one works better if you get the years and provinces sorted out first.

    
    DT1 <- structure(list(Province = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3), Year = c(2000, 2000, 2000, 2001, 2001, 2001, 2002, 2002, 2002, 2000, 2000, 2000, 2001, 2001, 2001, 2002, 2002, 2002, 2000, 2000, 2000, 2001, 2001, 2001, 2002, 2002, 2002), Municipality = c("Something", "Anything", "Nothing", "Something", "Anything", "Nothing", "Something", "Anything", "Nothing", "Something", "Anything", "Nothing", "Something", "Anything", "Nothing", "Something", "Anything", "Nothing", "Something", "Anything", "Nothing", "Something", "Anything", "Nothing", "Something", "Anything", "Nothing"), Values = c(0.59, 0.58, 0.66, 0.53, 0.94, 0.2, 0.86, 0.85, 0.99, 0.59, 0.58, 0.66, 0.53, 0.94, 0.2, 0.86, 0.85, 0.99, 0.59, 0.58, 0.66, 0.53, 0.94, 0.2, 0.86, 0.85, 0.99)), row.names = c(NA, -27L), class = c("tbl_df", "tbl", "data.frame"))
    
    DT2 <- structure(list(Province = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3), Year = c(2000, 2000, 2000, 2001, 2001, 2001, 2002, 2002, 2002, 2000, 2000, 2000, 2001, 2001, 2001, 2002, 2002, 2002, 2000, 2000, 2000, 2001, 2001, 2001, 2002, 2002, 2002), Municipality = c("Some", "Anything", "Nothing", "Someth.", "Anything", "Not", "Something", "Anything", "None", "Some", "Anything", "Nothing", "Someth.", "Anything", "Not", "Something", "Anything", "None", "Some", "Anything", "Nothing", "Someth.", "Anything", "Not", "Something", "Anything", "None"), `Other Values` = c(0.41, 0.42, 0.34, 0.47, 0.0600000000000001, 0.8, 0.14, 0.15, 0.01, 0.41, 0.42, 0.34, 0.47, 0.0600000000000001, 0.8, 0.14, 0.15, 0.01, 0.41, 0.42, 0.34, 0.47, 0.0600000000000001, 0.8, 0.14, 0.15, 0.01)), row.names = c(NA, -27L), class = c("tbl_df", "tbl", "data.frame"))
    
    library(fuzzyjoin); library(dplyr);
    
    stringdist_join(DT1, DT2, 
                    by = c("Municipality", "Year", "Province"),
                    mode = "left",
                    ignore_case = TRUE, 
                    method = "jw", 
                    max_dist = 10, 
                    distance_col = "dist") %>%
        group_by(Municipality.x) %>%
        slice_min(Municipality.dist)
    #> # A tibble: 135 x 12
    #> # Groups:   Municipality.x [3]
    #>    Province.x Year.x Municipality.x Values Province.y Year.y Municipality.y
    #>         <dbl>  <dbl> <chr>           <dbl>      <dbl>  <dbl> <chr>         
    #>  1          1   2000 Anything        0.580          1   2000 Anything      
    #>  2          1   2000 Anything        0.580          1   2001 Anything      
    #>  3          1   2000 Anything        0.580          1   2002 Anything      
    #>  4          1   2000 Anything        0.580          2   2000 Anything      
    #>  5          1   2000 Anything        0.580          2   2001 Anything      
    #>  6          1   2000 Anything        0.580          2   2002 Anything      
    #>  7          1   2000 Anything        0.580          3   2000 Anything      
    #>  8          1   2000 Anything        0.580          3   2001 Anything      
    #>  9          1   2000 Anything        0.580          3   2002 Anything      
    #> 10          1   2001 Anything        0.94           1   2000 Anything      
    #> # ... with 125 more rows, and 5 more variables: `Other Values` <dbl>,
    #> #   Municipality.dist <dbl>, Province.dist <dbl>, Year.dist <dbl>, dist <lgl>
    
    stringdist_join(DT1, DT2, 
                    by = "Municipality",
                    mode = "left",
                    ignore_case = TRUE, 
                    method = "jw", 
                    max_dist = 10, 
                    distance_col = "dist") %>%
        group_by(Municipality.x, Year.x, Province.x) %>%
        slice_min(dist)
    #> # A tibble: 135 x 9
    #> # Groups:   Municipality.x, Year.x, Province.x [27]
    #>    Province.x Year.x Municipality.x Values Province.y Year.y Municipality.y
    #>         <dbl>  <dbl> <chr>           <dbl>      <dbl>  <dbl> <chr>         
    #>  1          1   2000 Anything        0.580          1   2000 Anything      
    #>  2          1   2000 Anything        0.580          1   2001 Anything      
    #>  3          1   2000 Anything        0.580          1   2002 Anything      
    #>  4          1   2000 Anything        0.580          2   2000 Anything      
    #>  5          1   2000 Anything        0.580          2   2001 Anything      
    #>  6          1   2000 Anything        0.580          2   2002 Anything      
    #>  7          1   2000 Anything        0.580          3   2000 Anything      
    #>  8          1   2000 Anything        0.580          3   2001 Anything      
    #>  9          1   2000 Anything        0.580          3   2002 Anything      
    #> 10          2   2000 Anything        0.580          1   2000 Anything      
    #> # ... with 125 more rows, and 2 more variables: `Other Values` <dbl>,
    #> #   dist <dbl>
    

    Created on 2020-12-07 by the reprex package (v0.3.0)