Search code examples
rlistdataframegeocodingsplit-apply-combine

R: split-apply-combine for geographic distance


I have downloaded a list of all the towns and cities etc in the US from the census bureau. Here is a random sample:

dput(somewhere)
structure(list(state = structure(c(30L, 31L, 5L, 31L, 24L, 36L, 
13L, 21L, 6L, 10L, 31L, 28L, 10L, 5L, 5L, 8L, 23L, 11L, 34L, 
19L, 29L, 4L, 24L, 13L, 21L, 31L, 2L, 3L, 29L, 24L, 1L, 13L, 
15L, 10L, 11L, 33L, 35L, 8L, 11L, 12L, 36L, 28L, 9L, 31L, 8L, 
14L, 11L, 12L, 36L, 13L, 8L, 5L, 29L, 8L, 7L, 23L, 25L, 39L, 
16L, 28L, 10L, 29L, 26L, 8L, 32L, 40L, 28L, 23L, 37L, 31L, 18L, 
5L, 1L, 31L, 18L, 13L, 11L, 10L, 25L, 18L, 21L, 18L, 11L, 35L, 
31L, 36L, 20L, 19L, 38L, 2L, 40L, 13L, 36L, 11L, 29L, 27L, 22L, 
17L, 12L, 20L), .Label = c("ak", "al", "ar", "az", "ca", "co", 
"fl", "ga", "hi", "ia", "il", "in", "ks", "ky", "la", "md", "mi", 
"mo", "ms", "mt", "nc", "nd", "ne", "nj", "nm", "nv", "ny", "oh", 
"ok", "or", "pa", "pr", "ri", "sc", "sd", "tx", "ut", "va", "wa", 
"wi"), class = "factor"), geoid = c(4120100L, 4280962L, 668028L, 
4243944L, 3460600L, 4871948L, 2046000L, 3747695L, 839965L, 1909910L, 
4244824L, 3902204L, 1963750L, 622360L, 669088L, 1382972L, 3125230L, 
1722736L, 4539265L, 2804705L, 4039625L, 451465L, 3467020L, 2077150L, 
3765260L, 4221792L, 133904L, 566320L, 4033150L, 3463180L, 223460L, 
2013675L, 2232405L, 1951600L, 1752142L, 4445010L, 4655684L, 1336416L, 
1729080L, 1840842L, 4804672L, 3932207L, 1523675L, 4260384L, 1321912L, 
2159232L, 1735307L, 1867176L, 4839304L, 2057350L, 1309656L, 655380L, 
4082250L, 1350680L, 1275475L, 3147745L, 3505010L, 5352285L, 2483337L, 
3909834L, 1912945L, 4068200L, 3227900L, 1366304L, 7286831L, 5505350L, 
3982390L, 3149915L, 4974480L, 4249440L, 2943346L, 677430L, 280770L, 
4247872L, 2902242L, 2039075L, 1735281L, 1932565L, 3580120L, 2973852L, 
3722620L, 2943238L, 1755938L, 4643100L, 4251904L, 4830920L, 3056575L, 
2801940L, 5156048L, 137000L, 5508925L, 2057300L, 4861172L, 1736477L, 
4021200L, 3677783L, 3832060L, 2614900L, 1820332L, 3043750L), 
    ansicode = c(2410344L, 2390453L, 2411793L, 1214759L, 885360L, 
    2412035L, 485621L, 2403359L, 2412812L, 2583481L, 2390095L, 
    2397971L, 2396237L, 2585422L, 2411819L, 2405746L, 2398338L, 
    2394628L, 2812929L, 2586582L, 2408478L, 2582836L, 885393L, 
    2397270L, 2402898L, 2584453L, 2405811L, 2405518L, 2412737L, 
    2389752L, 2418574L, 2393549L, 2402559L, 2629970L, 2399453L, 
    2378109L, 2812999L, 2402563L, 2398956L, 2396699L, 2409759L, 
    2393028L, 2414061L, 2805542L, 2404192L, 2404475L, 2398514L, 
    2629884L, 2408486L, 2396265L, 2405306L, 2411363L, 2413515L, 
    2405064L, 2402989L, 2583899L, 2629103L, 2585016L, 2390487L, 
    2397481L, 2393811L, 2413298L, 2583927L, 2812702L, 2415078L, 
    1582764L, 2400116L, 2400036L, 2412013L, 2633665L, 2787908L, 
    2583158L, 2418866L, 1214943L, 2393998L, 485611L, 2398513L, 
    2394969L, 2806756L, 2397053L, 2406485L, 2395719L, 2399572L, 
    1267480L, 2389516L, 2410660L, 2409026L, 2806379L, 2584894L, 
    2404746L, 2586459L, 2396263L, 2411528L, 2398556L, 2412443L, 
    2584298L, 1036064L, 2806333L, 2396920L, 2804282L), city = c("donald", 
    "warminster heights", "san juan capistrano", "littlestown", 
    "port republic", "taylor", "merriam", "northlakes", "julesburg", 
    "california junction", "lower allen", "antwerp", "pleasantville", 
    "el rancho", "santa clarita", "willacoochee", "kennard", 
    "effingham", "la france", "beechwood", "keys", "orange grove mobile manor", 
    "shiloh", "west mineral", "stony point", "east salem", "heath", 
    "stamps", "haworth", "rio grande", "ester", "clayton", "hackberry", 
    "middle amana", "new baden", "melville", "rolland colony", 
    "hannahs mill", "germantown hills", "la fontaine", "aurora", 
    "green meadows", "kaiminani", "pinecroft", "dawson", "park city", 
    "hinsdale", "st. meinrad", "kingsland", "powhattan", "bowersville", 
    "palos verdes estates", "wyandotte", "meigs", "waverly", 
    "sunol", "arroyo hondo", "outlook", "west pocomoke", "buchtel", 
    "chatsworth", "smith village", "glenbrook", "rock spring", 
    "villalba", "bayfield", "waynesfield", "utica", "sunset", 
    "milford square", "lithium", "swall meadows", "unalaska", 
    "martinsburg", "ashland", "leawood", "hindsboro", "gray", 
    "turley", "trimble", "falcon", "linn", "olympia fields", 
    "mitchell", "mount pleasant mills", "greenville", "park city", 
    "arkabutla", "new river", "huntsville", "boulder junction", 
    "potwin", "red lick", "huey", "dougherty", "wadsworth", "grand forks", 
    "chassell", "edgewood", "lindsay"), lsad = c("25", "57", 
    "25", "21", "25", "25", "25", "57", "43", "57", "57", "47", 
    "25", "57", "25", "25", "47", "25", "57", "57", "57", "57", 
    "21", "25", "57", "57", "43", "25", "43", "57", "57", "25", 
    "57", "57", "47", "57", "57", "57", "47", "43", "25", "57", 
    "57", "57", "25", "25", "47", "57", "57", "25", "43", "25", 
    "43", "25", "57", "57", "57", "57", "57", "47", "25", "43", 
    "57", "57", "62", "25", "47", "47", "25", "57", "57", "57", 
    "25", "21", "25", "25", "47", "25", "57", "25", "43", "25", 
    "47", "25", "57", "25", "57", "57", "57", "25", "57", "25", 
    "25", "47", "43", "57", "25", "57", "43", "57"), funcstat = c("a", 
    "s", "a", "a", "a", "a", "a", "s", "a", "s", "s", "a", "a", 
    "s", "a", "a", "a", "a", "s", "s", "s", "s", "a", "a", "s", 
    "s", "a", "a", "a", "s", "s", "a", "s", "s", "a", "s", "s", 
    "s", "a", "a", "a", "s", "s", "s", "a", "a", "a", "s", "s", 
    "a", "a", "a", "a", "a", "s", "s", "s", "s", "s", "a", "a", 
    "a", "s", "s", "s", "a", "a", "a", "a", "s", "s", "s", "a", 
    "a", "a", "a", "a", "a", "s", "a", "a", "a", "a", "a", "s", 
    "a", "s", "s", "s", "a", "s", "a", "a", "a", "a", "s", "a", 
    "s", "a", "s"), latitude = c(45.221487, 40.18837, 33.500889, 
    39.74517, 39.534798, 30.573263, 39.017607, 35.780523, 40.984864, 
    41.56017, 40.226748, 41.180176, 41.387011, 36.220684, 34.414083, 
    31.335094, 41.474697, 39.120662, 34.616281, 32.336723, 35.802786, 
    32.598451, 39.462418, 37.283906, 35.867809, 40.608713, 31.344839, 
    33.354959, 33.840898, 39.019051, 64.879056, 39.736866, 29.964958, 
    41.794765, 38.536765, 41.559549, 44.3437, 32.937302, 40.768954, 
    40.673893, 33.055942, 39.867193, 19.757709, 40.564189, 31.771864, 
    37.093499, 41.800683, 38.168142, 30.666141, 39.761734, 34.373065, 
    33.774271, 36.807143, 31.071788, 27.985282, 41.154105, 36.534599, 
    46.331153, 38.096527, 39.463511, 42.916301, 35.45079, 39.100123, 
    34.81467, 18.127809, 46.81399, 40.602442, 40.895279, 41.13806, 
    40.433182, 37.831844, 37.50606, 53.910255, 40.310917, 38.812464, 
    38.907263, 39.684775, 41.841711, 36.736661, 39.476152, 35.194804, 
    38.478798, 41.521996, 43.730057, 40.724697, 33.111939, 45.630946, 
    34.700227, 37.142945, 34.782275, 46.1148, 37.938624, 33.485081, 
    38.605285, 34.399808, 42.821447, 47.921291, 47.036116, 40.103208, 
    47.224885), longitude = c(-122.837813, -75.084089, -117.654388, 
    -77.089213, -74.476099, -97.427116, -94.693955, -81.367835, 
    -102.262708, -95.994752, -76.902769, -84.736099, -93.272787, 
    -119.068357, -118.494729, -83.044003, -96.203696, -88.550859, 
    -82.770697, -90.808692, -94.941358, -114.660588, -75.29244, 
    -94.926801, -81.044121, -77.23694, -86.46905, -93.497879, 
    -94.657035, -74.87787, -148.041153, -100.176484, -93.410178, 
    -91.901539, -89.707193, -71.301933, -96.59226, -84.340945, 
    -89.462982, -85.722023, -97.509615, -83.945334, -156.001765, 
    -78.353464, -84.443499, -86.048077, -87.928172, -86.832128, 
    -98.454026, -95.634011, -83.084305, -118.425754, -94.729305, 
    -84.092683, -81.625304, -102.762746, -105.666602, -120.092812, 
    -75.579197, -82.180426, -96.514499, -97.457006, -119.927289, 
    -85.238869, -66.481897, -90.822546, -83.973881, -97.345349, 
    -112.028388, -75.405024, -89.88325, -118.642656, -166.529029, 
    -78.324286, -92.239531, -94.62524, -88.134729, -94.985863, 
    -107.792147, -94.561898, -78.65389, -91.844989, -87.691648, 
    -98.029974, -77.026451, -96.110256, -108.925311, -90.121565, 
    -80.595817, -86.532599, -89.654438, -97.01835, -94.161474, 
    -89.289973, -97.05148, -77.893875, -97.08933, -88.530745, 
    -85.737461, -105.152791), designation = c("city", "cdp", 
    "city", "borough", "city", "city", "city", "cdp", "town", 
    "cdp", "cdp", "village", "city", "cdp", "city", "city", "village", 
    "city", "cdp", "cdp", "cdp", "cdp", "borough", "city", "cdp", 
    "cdp", "town", "city", "town", "cdp", "cdp", "city", "cdp", 
    "cdp", "village", "cdp", "cdp", "cdp", "village", "town", 
    "city", "cdp", "cdp", "cdp", "city", "city", "village", "cdp", 
    "cdp", "city", "town", "city", "town", "city", "cdp", "cdp", 
    "cdp", "cdp", "cdp", "village", "city", "town", "cdp", "cdp", 
    "urbana", "city", "village", "village", "city", "cdp", "cdp", 
    "cdp", "city", "borough", "city", "city", "village", "city", 
    "cdp", "city", "town", "city", "village", "city", "cdp", 
    "city", "cdp", "cdp", "cdp", "city", "cdp", "city", "city", 
    "village", "town", "cdp", "city", "cdp", "town", "cdp")), row.names = c(22769L, 
24845L, 3314L, 24015L, 17360L, 28139L, 10085L, 19881L, 3886L, 
8750L, 24027L, 20585L, 9362L, 2499L, 3333L, 6041L, 16321L, 6847L, 
25249L, 14051L, 22233L, 1210L, 17425L, 10353L, 20053L, 23545L, 
253L, 1951L, 22166L, 17386L, 685L, 9771L, 11134L, 9225L, 7386L, 
25001L, 25862L, 5663L, 6950L, 8239L, 26555L, 20991L, 6108L, 24388L, 
5551L, 10772L, 7056L, 8470L, 27292L, 10202L, 5451L, 3116L, 22660L, 
5776L, 5317L, 16546L, 17582L, 29958L, 12103L, 20709L, 8779L, 
22515L, 16665L, 5902L, 31901L, 30658L, 21745L, 16574L, 28632L, 
24127L, 15046L, 3455L, 930L, 24087L, 14494L, 10016L, 7055L, 8993L, 
18048L, 15434L, 19615L, 15043L, 7454L, 25775L, 24194L, 27115L, 
15857L, 14038L, 29305L, 276L, 30693L, 10201L, 27863L, 7075L, 
22046L, 19267L, 20311L, 12502L, 8093L, 15798L), class = "data.frame")

I would like to calculate the distance between each entry in the city column using the longitude and latitude columns and the gdist function. I know that the following for loop works and is easy to read:

dist_list <- list()
for (i in 1:nrow(somewhere)) {
  
  dist_list[[i]] <- gdist(lon.1 = somewhere$longitude[i], 
                          lat.1 = somewhere$latitude[i], 
                          lon.2 = somewhere$longitude, 
                          lat.2 = somewhere$latitude,
                          units="miles")
  
}

However: IT TAKES FOREVER to run on the full dataset (31K+ rows)--as in hours. I'm looking for something that will speed up this calculation. I figured something in the split-apply-combine approach would work well, since I would like to eventually involve a pair of grouping variables, geo_block and ansi_block but honestly anything would be better than what I have.

I have tried the following:

somewhere$geo_block <- substr(somewhere$geoid, 1, 1)
somewhere$ansi_block <- substr(somewhere$ansicode, 1, 1)

somewhere <- somewhere %>%
  split(.$geo_block, .$ansi_block) %>%
  mutate(dist = gdist(longlat = somewhere[, c("longitude", "latitude")]))

But am unsure of how to specify the second set of long-lat inputs outside of the standard for loop.

My question:

  1. How do I use the split-apply-combine approach to solve this problem with geo_block and ansi_block as a grouping variable as above? I would like to return the shortest distance and the name of the city and the value of geo_block corresponding to this distance.

All suggestions are welcome. Ideally the desired result would be fairly quick because the actual dataset I'm working with is quite large. Since I'm a bit in the woods here, I've added a bounty to the question to generate a little more interest and hopefully a wide set of potential answers that I can learn from. Thanks so much!


Solution

  • I'll show both the tidyverse and base R approaches to split-apply-combine. My understanding is that, for each city in each group of cities (whatever your grouping variables may be), you want to report the nearest within-group city and the distance to that nearest city.

    First, a couple of remarks about Imap::gdist:

    • It does not have a lonlat argument. You must use the arguments (lon|lat).(1|2) to pass coordinates.
    • It may not be vectorized properly. Its body contains a loop while (abs(lamda - lamda.old) > 1e-11) {...}, where the value of lamda is (lon.1 - lon.2) * pi / 180. For the loop condition to have length 1 (it should), the arguments lon.1 and lon.2 must have length 1.

    So, at least for now, I'll base my answer on the more canonical geosphere::distm. It stores the entire n-by-n symmetric distance matrix in memory, so you may hit a memory allocation limit, but that is much less likely to happen within a split-apply-combine, where n is the within-group (rather than total) number of cities.

    I'll be working with this part of your data frame:

    library("dplyr")
    
    dd <- somewhere %>%
      as_tibble() %>%
      mutate(geo_block = as.factor(as.integer(substr(geoid, 1L, 1L))),
             ansi_block = as.factor(as.integer(substr(ansicode, 1L, 1L)))) %>%
      select(geo_block, ansi_block, city, longitude, latitude)
    dd
    # # A tibble: 100 × 5
    #    geo_block ansi_block city                longitude latitude
    #    <fct>     <fct>      <chr>                   <dbl>    <dbl>
    #  1 4         2          donald                 -123.      45.2
    #  2 4         2          warminster heights      -75.1     40.2
    #  3 6         2          san juan capistrano    -118.      33.5
    #  4 4         1          littlestown             -77.1     39.7
    #  5 3         8          port republic           -74.5     39.5
    #  6 4         2          taylor                  -97.4     30.6
    #  7 2         4          merriam                 -94.7     39.0
    #  8 3         2          northlakes              -81.4     35.8
    #  9 8         2          julesburg              -102.      41.0
    # 10 1         2          california junction     -96.0     41.6
    # # … with 90 more rows
    

    The following function find_nearest performs the basic task of identifying nearest neighbours given a set of n points in a d-dimensional metric space. Its arguments are as follows:

    1. data: A data frame with n rows (one per point), d variables specifying point coordinates, and 1 or more ID variables to be associated with each point.
    2. dist: A function taking an n-by-d matrix of coordinates as an argument and returning a corresponding n-by-n symmetric distance matrix.
    3. coordvar: A character vector listing coordinate variable names.
    4. idvar: A character vector listing ID variable names.

    In our data frame, the coordinate variables are longitude and latitude and there is one ID variable city. For simplicity, I have defined the default values of coordvar and idvar accordingly.

    find_nearest <- function(data, dist, 
                             coordvar = c("longitude", "latitude"), 
                             idvar = "city") {
      m <- match(coordvar, names(data), 0L)
      n <- nrow(data)
      if (n < 2L) {
        argmin <- NA_integer_[n]
        distance <- NA_real_[n]
      } else {
        ## Compute distance matrix
        D <- dist(as.matrix(data[m]))
        ## Extract minimum distances
        diag(D) <- Inf # want off-diagonal distances
        argmin <- apply(D, 2L, which.min)
        distance <- D[cbind(argmin, seq_len(n))]
      }
      ## Return focal point data, nearest neighbour ID, distance
      r1 <- data[-m]
      r2 <- data[argmin, idvar, drop = FALSE]
      names(r2) <- paste0(idvar, "_nearest")
      data.frame(r1, r2, distance, row.names = NULL, stringsAsFactors = FALSE)
    }
    

    Here is the output of find_nearest applied to our data frame, without grouping, with distances computed by the Vincenty ellipsoid method.

    dist_vel <- function(x) {
      geosphere::distm(x, fun = geosphere::distVincentyEllipsoid)
    }
    
    res <- find_nearest(dd, dist = dist_vel)
    head(res, 10L)
    #    geo_block ansi_block                city         city_nearest  distance
    # 1          4          2              donald              outlook 246536.39
    # 2          4          2  warminster heights       milford square  38512.79
    # 3          6          2 san juan capistrano palos verdes estates  77722.35
    # 4          4          1         littlestown          lower allen  55792.53
    # 5          3          8       port republic           rio grande  66935.70
    # 6          4          2              taylor            kingsland  98997.90
    # 7          2          4             merriam              leawood  13620.87
    # 8          3          2          northlakes          stony point  30813.46
    # 9          8          2           julesburg                sunol  46037.81
    # 10         1          2 california junction              kennard  19857.41
    

    Here is the tidyverse split-apply-combine, grouping on geo_block and ansi_block:

    dd %>%
      group_by(geo_block, ansi_block) %>%
      group_modify(~find_nearest(., dist = dist_vel))
    # # A tibble: 100 × 5
    # # Groups:   geo_block, ansi_block [13]
    #    geo_block ansi_block city                city_nearest  distance
    #    <fct>     <fct>      <chr>               <chr>            <dbl>
    #  1 1         2          california junction gray            89610.
    #  2 1         2          pleasantville       middle amana   122974.
    #  3 1         2          willacoochee        meigs          104116.
    #  4 1         2          effingham           hindsboro       72160.
    #  5 1         2          heath               dawson         198052.
    #  6 1         2          middle amana        pleasantville  122974.
    #  7 1         2          new baden           huey            37147.
    #  8 1         2          hannahs mill        dawson         129599.
    #  9 1         2          germantown hills    hindsboro      165140.
    # 10 1         2          la fontaine         edgewood        63384.
    # # … with 90 more rows
    

    Note, for example, how the city considered nearest to California Junction changes from Kennard to Gray, which is farther away, under this grouping.

    Here is the base R split-apply-combine, built on base function by. It is equivalent except for the ordering of groups in the result:

    find_nearest_by <- function(data, by, ...) {
      do.call(rbind, base::by(data, by, find_nearest, ...))
    }
    
    res <- find_nearest_by(dd, by = dd[c("geo_block", "ansi_block")], dist = dist_vel)
    head(res, 10L)
    #    geo_block ansi_block                city city_nearest   distance
    # 1          3          1         grand forks         <NA>         NA
    # 2          4          1         littlestown  martinsburg  122718.95
    # 3          4          1         martinsburg  littlestown  122718.95
    # 4          4          1            mitchell  martinsburg 1671365.58
    # 5          5          1            bayfield         <NA>         NA
    # 6          1          2 california junction         gray   89609.71
    # 7          1          2       pleasantville middle amana  122974.32
    # 8          1          2        willacoochee        meigs  104116.21
    # 9          1          2           effingham    hindsboro   72160.43
    # 10         1          2               heath       dawson  198051.76
    

    This ordering reveals that find_nearest returns NA for groups containing only one city. We would have seen these NA in the tidyverse result had we printed the entire tibble.

    FWIW, here is a benchmark of the various functions implemented in geosphere for computing geodesic distances, saying nothing about accuracy, though you can speculate from the results that the Vincenty ellipsoid distance cuts the fewest corners (haha)...

    fn <- function(dist) find_nearest(dd, dist = dist)
    
    library("geosphere")
    dist_geo <- function(x) distm(x, fun = distGeo)
    dist_cos <- function(x) distm(x, fun = distCosine)
    dist_hav <- function(x) distm(x, fun = distHaversine)
    dist_vsp <- function(x) distm(x, fun = distVincentySphere)
    dist_vel <- function(x) distm(x, fun = distVincentyEllipsoid)
    dist_mee <- function(x) distm(x, fun = distMeeus)
    
    microbenchmark::microbenchmark(
      fn(dist_geo),
      fn(dist_cos),
      fn(dist_hav),
      fn(dist_vsp),
      fn(dist_vel),
      fn(dist_mee),
      times = 1000L
    )
    # Unit: milliseconds
    #          expr        min         lq       mean     median         uq       max neval
    #  fn(dist_geo)   6.143276   6.291737   6.718329   6.362257   6.459345  45.91131  1000
    #  fn(dist_cos)   4.239236   4.399977   4.918079   4.461804   4.572033  45.70233  1000
    #  fn(dist_hav)   4.005331   4.156067   4.641016   4.210721   4.307542  41.91619  1000
    #  fn(dist_vsp)   3.827227   3.979829   4.446428   4.033621   4.123924  44.29160  1000
    #  fn(dist_vel) 129.712069 132.549638 135.006170 133.935479 135.248135 174.88874  1000
    #  fn(dist_mee)   3.716814   3.830999   4.234231   3.883582   3.962712  42.12947  1000
    

    Imap::gdist seems to be a pure R implementation of Vincenty ellipsoid distance. That may account for its slowness...

    Some final remarks:

    • The dist argument of find_nearest need not be built on one of the geosphere distances. You can pass any function you want satisfying the constraints that I state above.
    • find_nearest could be adapted to accept functions dist returning objects of class "dist". These objects store just the n*(n-1)/2 lower triangular elements of the n-by-n symmetric distance matrix (see ?dist). This would improve support for large n and make find_nearest compatible with this memory-efficient implementation of the Haversine distance (suggested by @dww in the comments). It would just need to be worked out how to translate an operation like apply(D, 2L, which.min). [Update: I have implemented this change to find_nearest in a separate answer, where I provide new benchmarks.]