Finding Out 5 Closest Points To Each Point

Suppose I have the following two data frames:

set.seed(123)

df_1 <- data.frame(
  name_1 = c("john", "david", "alex", "kevin", "trevor", "xavier", "tom", "michael", "troy", "kelly", "chris", "henry", "taylor", "ryan", "peter"),
  lon = rnorm(15, mean = -74.0060, sd = 0.01),
  lat = rnorm(15, mean = 40.7128, sd = 0.01)
)

df_2 <- data.frame(
  name_2 = c("matthew", "tyler", "sebastian", "julie", "anna", "tim", "david", "nigel", "sarah", "steph", "sylvia", "boris", "theo", "malcolm"),
  lon = rnorm(14, mean = -74.0060, sd = 0.01),
  lat = rnorm(14, mean = 40.7128, sd = 0.01)
)

My Problem: For each person in df_1, I am trying to find out the 5 closest people (haversine distance) to this person in df_2, and record various distance statistics (e.g. mean, median, max, min standard deviation).

Attempt

First, I defined the distance function:

library(geosphere)
haversine_distance <- function(lon1, lat1, lon2, lat2) {
  distHaversine(c(lon1, lat1), c(lon2, lat2))
}

Then, I calculated the distance between each person in df_1 and all people in df_2:

# Create a matrix to store results
distances <- matrix(nrow = nrow(df_1), ncol = nrow(df_2))

# calculate the distances
for (i in 1:nrow(df_1)) {
    for (j in 1:nrow(df_2)) {
        distances[i, j] <- haversine_distance(df_1$lon[i], df_1$lat[i], df_2$lon[j], df_2$lat[j])
    }
}

# Create final
final <- data.frame(
    name_1 = rep(df_1$name_1, each = nrow(df_2)),
    lon_1 = rep(df_1$lon, each = nrow(df_2)),
    lat_1 = rep(df_1$lat, each = nrow(df_2)),
    name_2 = rep(df_2$name_2, nrow(df_1)),
    lon_2 = rep(df_2$lon, nrow(df_1)),
    lat_2 = rep(df_2$lat, nrow(df_1)),
    distance = c(distances)
)

Finally, for each person in df_1, I kept the 5 minimum distances and recorded the distance statistics:

# Keep only first 5 rows for each unique value of final$name_1
final <- final[order(final$name_1, final$distance), ]
final <- final[ave(final$distance, final$name_1, FUN = seq_along) <= 5, ]


# Calculate summary statistics for each unique person in final$name_1
final_summary <- aggregate(distance ~ name_1,
                           data = final,
                           FUN = function(x) c(min = min(x),
                                               max = max(x),
                                               mean = mean(x),
                                               median = median(x),
                                               sd = sd(x)))
final_summary <- do.call(data.frame, final_summary)
names(final_summary)[-(1)] <- c("min_distance", "max_distance", "mean_distance", "median_distance", "sd_distance")


final_summary$closest_people <- tapply(final$name_2,
                                       final$name_1,
                                       FUN = function(x) paste(sort(x), collapse = ", "))


# break closest_people column into multiple columns
n <- 5
closest_people_split <- strsplit(final_summary$closest_people, ", ")
final_summary[paste0("closest_", seq_len(n))] <- do.call(rbind, closest_people_split)

The final result look like this:

  name_1 min_distance max_distance mean_distance median_distance sd_distance                          closest_people closest_1 closest_2 closest_3 closest_4 closest_5
1   alex     342.8375    1158.1408      717.0810        650.9167    358.7439     boris, david, matthew, nigel, sarah     boris     david   matthew     nigel     sarah
2  chris     195.4891    1504.8199      934.6618        895.8301    489.5175     boris, david, malcolm, nigel, steph     boris     david   malcolm     nigel     steph
3  david     549.4500     830.2758      716.3839        807.6626    143.9571      matthew, sarah, steph, sylvia, tim   matthew     sarah     steph    sylvia       tim
4  henry     423.1875     975.1733      639.5657        560.1101    223.2389    anna, boris, matthew, sebastian, tim      anna     boris   matthew sebastian       tim
5   john     415.8956    1174.1631      849.4313        965.2928    313.2616      boris, julie, matthew, theo, tyler     boris     julie   matthew      theo     tyler
6  kelly     489.7949     828.5550      657.5908        658.7015    120.6485 david, julie, matthew, sebastian, steph     david     julie   matthew sebastian     steph

My Question: Although this code seems to run without errors, I have the feeling that this code will start to take a long time to run when the sizes of df_1 and df_2 start to grow. Hence, I am looking for ways to improve the efficiency of this code. Can someone please suggest routines for large data frames?

Solution

A data.table approach to this problem might be as follows:

funcs <- function(d,n) {
  c(setNames(lapply(c(min,max,mean,median,sd), \(f) f(d)), c("min", "max", "mean", "median", "sd")),
    list("names" = paste0(n, collapse=", "))
  )
}

library(data.table)

setDT(cross_join(df_1, df_2))[
  ,dist:=distHaversine(c(lon.x, lat.x), c(lon.y, lat.y)), .(name_1, name_2)
][order(dist), .SD[1:5, funcs(dist, name_2)], name_1]

Output:

     name_1       min       max      mean    median        sd                                  names
 1:  taylor  170.5171  746.6206  470.0857  439.8022 227.39141    david, tim, nigel, sarah, sebastian
 2:   peter  195.4891 1455.0204  834.2543  830.2758 539.69009     steph, boris, matthew, anna, david
 3:     tom  243.6729  530.4778  426.2490  447.8639 110.26649    tim, sebastian, julie, nigel, david
 4:    ryan  342.8375 1243.7473  970.0721 1052.6759 367.08513 tyler, julie, sebastian, sylvia, nigel
 5:   henry  394.8684  894.5358  647.1996  670.9220 236.69562     anna, matthew, david, steph, boris
 6:    john  423.1875 1948.9521 1106.4374 1052.8789 674.69139     boris, steph, matthew, anna, david
 7:   kelly  491.6430 1130.9239  717.7716  658.7015 248.96974     sylvia, tyler, sarah, nigel, julie
 8:  trevor  520.1834  650.9167  609.4363  631.9494  52.96026    nigel, sarah, julie, tim, sebastian
 9:    troy  549.4500 1035.0599  782.8799  828.5550 220.72034      tyler, sylvia, sarah, nigel, theo
10: michael  581.9209 1504.5642 1057.1773 1012.5247 378.81712      theo, tyler, sylvia, sarah, nigel
11:   david  602.9369  941.3102  752.1558  715.3872 159.37550      nigel, sarah, david, sylvia, anna
12:   kevin  638.9259  834.5504  715.5252  644.2898 102.23793     matthew, anna, david, nigel, steph
13:  xavier  972.9730 1767.1953 1369.5604 1396.8569 371.03190    julie, sebastian, tim, tyler, david
14:   chris 1389.1659 2106.7084 1644.0448 1455.8430 316.31565     julie, tyler, sebastian, tim, theo
15:    alex 1765.7750 2428.5429 2013.7843 1828.6055 294.37805     julie, tyler, sebastian, tim, theo

Another approach using dplyr is to use cross_join with rowwise() to get the distances, followed by slice_head(n=5, by=name_1) to get the five minimum distance by name_1, and then reframe or summarize the usual way:

cross_join(df_1, df_2) %>%
  rowwise() %>% 
  mutate(dist=distHaversine(c(lon.x, lat.x), c(lon.y, lat.y))) %>% 
  ungroup() %>% 
  arrange(dist) %>%
  slice_head(n = 5, by=name_1) %>% 
  reframe(
    min_distance = min(dist),
    max_distance = max(dist),
    mean_distance=mean(dist),
    median_distance=median(dist),
    sd_distance = sd(dist),
    names = paste0(name_2, collapse=","),
    .by=name_1
  )

Output:

# A tibble: 15 × 7
   name_1  min_distance max_distance mean_distance median_distance sd_distance names                             
   <chr>          <dbl>        <dbl>         <dbl>           <dbl>       <dbl> <chr>                             
 1 taylor          171.         747.          470.            440.       227.  david,tim,nigel,sarah,sebastian   
 2 peter           195.        1455.          834.            830.       540.  steph,boris,matthew,anna,david    
 3 tom             244.         530.          426.            448.       110.  tim,sebastian,julie,nigel,david   
 4 ryan            343.        1244.          970.           1053.       367.  tyler,julie,sebastian,sylvia,nigel
 5 henry           395.         895.          647.            671.       237.  anna,matthew,david,steph,boris    
 6 john            423.        1949.         1106.           1053.       675.  boris,steph,matthew,anna,david    
 7 kelly           492.        1131.          718.            659.       249.  sylvia,tyler,sarah,nigel,julie    
 8 trevor          520.         651.          609.            632.        53.0 nigel,sarah,julie,tim,sebastian   
 9 troy            549.        1035.          783.            829.       221.  tyler,sylvia,sarah,nigel,theo     
10 michael         582.        1505.         1057.           1013.       379.  theo,tyler,sylvia,sarah,nigel     
11 david           603.         941.          752.            715.       159.  nigel,sarah,david,sylvia,anna     
12 kevin           639.         835.          716.            644.       102.  matthew,anna,david,nigel,steph    
13 xavier          973.        1767.         1370.           1397.       371.  julie,sebastian,tim,tyler,david   
14 chris          1389.        2107.         1644.           1456.       316.  julie,tyler,sebastian,tim,theo    
15 alex           1766.        2429.         2014.           1829.       294.  julie,tyler,sebastian,tim,theo