Suppose I have the following two data frames:
set.seed(123)
df_1 <- data.frame(
name_1 = c("john", "david", "alex", "kevin", "trevor", "xavier", "tom", "michael", "troy", "kelly", "chris", "henry", "taylor", "ryan", "peter"),
lon = rnorm(15, mean = -74.0060, sd = 0.01),
lat = rnorm(15, mean = 40.7128, sd = 0.01)
)
df_2 <- data.frame(
name_2 = c("matthew", "tyler", "sebastian", "julie", "anna", "tim", "david", "nigel", "sarah", "steph", "sylvia", "boris", "theo", "malcolm"),
lon = rnorm(14, mean = -74.0060, sd = 0.01),
lat = rnorm(14, mean = 40.7128, sd = 0.01)
)
My Problem: For each person in df_1
, I am trying to find out the 5 closest people (haversine distance) to this person in df_2
, and record various distance statistics (e.g. mean, median, max, min standard deviation).
Attempt
First, I defined the distance function:
library(geosphere)
haversine_distance <- function(lon1, lat1, lon2, lat2) {
distHaversine(c(lon1, lat1), c(lon2, lat2))
}
Then, I calculated the distance between each person in df_1
and all people in df_2
:
# Create a matrix to store results
distances <- matrix(nrow = nrow(df_1), ncol = nrow(df_2))
# calculate the distances
for (i in 1:nrow(df_1)) {
for (j in 1:nrow(df_2)) {
distances[i, j] <- haversine_distance(df_1$lon[i], df_1$lat[i], df_2$lon[j], df_2$lat[j])
}
}
# Create final
final <- data.frame(
name_1 = rep(df_1$name_1, each = nrow(df_2)),
lon_1 = rep(df_1$lon, each = nrow(df_2)),
lat_1 = rep(df_1$lat, each = nrow(df_2)),
name_2 = rep(df_2$name_2, nrow(df_1)),
lon_2 = rep(df_2$lon, nrow(df_1)),
lat_2 = rep(df_2$lat, nrow(df_1)),
distance = c(distances)
)
Finally, for each person in df_1, I kept the 5 minimum distances and recorded the distance statistics:
# Keep only first 5 rows for each unique value of final$name_1
final <- final[order(final$name_1, final$distance), ]
final <- final[ave(final$distance, final$name_1, FUN = seq_along) <= 5, ]
# Calculate summary statistics for each unique person in final$name_1
final_summary <- aggregate(distance ~ name_1,
data = final,
FUN = function(x) c(min = min(x),
max = max(x),
mean = mean(x),
median = median(x),
sd = sd(x)))
final_summary <- do.call(data.frame, final_summary)
names(final_summary)[-(1)] <- c("min_distance", "max_distance", "mean_distance", "median_distance", "sd_distance")
final_summary$closest_people <- tapply(final$name_2,
final$name_1,
FUN = function(x) paste(sort(x), collapse = ", "))
# break closest_people column into multiple columns
n <- 5
closest_people_split <- strsplit(final_summary$closest_people, ", ")
final_summary[paste0("closest_", seq_len(n))] <- do.call(rbind, closest_people_split)
The final result look like this:
name_1 min_distance max_distance mean_distance median_distance sd_distance closest_people closest_1 closest_2 closest_3 closest_4 closest_5
1 alex 342.8375 1158.1408 717.0810 650.9167 358.7439 boris, david, matthew, nigel, sarah boris david matthew nigel sarah
2 chris 195.4891 1504.8199 934.6618 895.8301 489.5175 boris, david, malcolm, nigel, steph boris david malcolm nigel steph
3 david 549.4500 830.2758 716.3839 807.6626 143.9571 matthew, sarah, steph, sylvia, tim matthew sarah steph sylvia tim
4 henry 423.1875 975.1733 639.5657 560.1101 223.2389 anna, boris, matthew, sebastian, tim anna boris matthew sebastian tim
5 john 415.8956 1174.1631 849.4313 965.2928 313.2616 boris, julie, matthew, theo, tyler boris julie matthew theo tyler
6 kelly 489.7949 828.5550 657.5908 658.7015 120.6485 david, julie, matthew, sebastian, steph david julie matthew sebastian steph
My Question: Although this code seems to run without errors, I have the feeling that this code will start to take a long time to run when the sizes of df_1
and df_2
start to grow. Hence, I am looking for ways to improve the efficiency of this code. Can someone please suggest routines for large data frames?
A data.table approach to this problem might be as follows:
funcs <- function(d,n) {
c(setNames(lapply(c(min,max,mean,median,sd), \(f) f(d)), c("min", "max", "mean", "median", "sd")),
list("names" = paste0(n, collapse=", "))
)
}
library(data.table)
setDT(cross_join(df_1, df_2))[
,dist:=distHaversine(c(lon.x, lat.x), c(lon.y, lat.y)), .(name_1, name_2)
][order(dist), .SD[1:5, funcs(dist, name_2)], name_1]
Output:
name_1 min max mean median sd names
1: taylor 170.5171 746.6206 470.0857 439.8022 227.39141 david, tim, nigel, sarah, sebastian
2: peter 195.4891 1455.0204 834.2543 830.2758 539.69009 steph, boris, matthew, anna, david
3: tom 243.6729 530.4778 426.2490 447.8639 110.26649 tim, sebastian, julie, nigel, david
4: ryan 342.8375 1243.7473 970.0721 1052.6759 367.08513 tyler, julie, sebastian, sylvia, nigel
5: henry 394.8684 894.5358 647.1996 670.9220 236.69562 anna, matthew, david, steph, boris
6: john 423.1875 1948.9521 1106.4374 1052.8789 674.69139 boris, steph, matthew, anna, david
7: kelly 491.6430 1130.9239 717.7716 658.7015 248.96974 sylvia, tyler, sarah, nigel, julie
8: trevor 520.1834 650.9167 609.4363 631.9494 52.96026 nigel, sarah, julie, tim, sebastian
9: troy 549.4500 1035.0599 782.8799 828.5550 220.72034 tyler, sylvia, sarah, nigel, theo
10: michael 581.9209 1504.5642 1057.1773 1012.5247 378.81712 theo, tyler, sylvia, sarah, nigel
11: david 602.9369 941.3102 752.1558 715.3872 159.37550 nigel, sarah, david, sylvia, anna
12: kevin 638.9259 834.5504 715.5252 644.2898 102.23793 matthew, anna, david, nigel, steph
13: xavier 972.9730 1767.1953 1369.5604 1396.8569 371.03190 julie, sebastian, tim, tyler, david
14: chris 1389.1659 2106.7084 1644.0448 1455.8430 316.31565 julie, tyler, sebastian, tim, theo
15: alex 1765.7750 2428.5429 2013.7843 1828.6055 294.37805 julie, tyler, sebastian, tim, theo
Another approach using dplyr is to use cross_join
with rowwise()
to get the distances, followed by slice_head(n=5, by=name_1)
to get the five minimum distance by name_1
, and then reframe or summarize the usual way:
cross_join(df_1, df_2) %>%
rowwise() %>%
mutate(dist=distHaversine(c(lon.x, lat.x), c(lon.y, lat.y))) %>%
ungroup() %>%
arrange(dist) %>%
slice_head(n = 5, by=name_1) %>%
reframe(
min_distance = min(dist),
max_distance = max(dist),
mean_distance=mean(dist),
median_distance=median(dist),
sd_distance = sd(dist),
names = paste0(name_2, collapse=","),
.by=name_1
)
Output:
# A tibble: 15 × 7
name_1 min_distance max_distance mean_distance median_distance sd_distance names
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 taylor 171. 747. 470. 440. 227. david,tim,nigel,sarah,sebastian
2 peter 195. 1455. 834. 830. 540. steph,boris,matthew,anna,david
3 tom 244. 530. 426. 448. 110. tim,sebastian,julie,nigel,david
4 ryan 343. 1244. 970. 1053. 367. tyler,julie,sebastian,sylvia,nigel
5 henry 395. 895. 647. 671. 237. anna,matthew,david,steph,boris
6 john 423. 1949. 1106. 1053. 675. boris,steph,matthew,anna,david
7 kelly 492. 1131. 718. 659. 249. sylvia,tyler,sarah,nigel,julie
8 trevor 520. 651. 609. 632. 53.0 nigel,sarah,julie,tim,sebastian
9 troy 549. 1035. 783. 829. 221. tyler,sylvia,sarah,nigel,theo
10 michael 582. 1505. 1057. 1013. 379. theo,tyler,sylvia,sarah,nigel
11 david 603. 941. 752. 715. 159. nigel,sarah,david,sylvia,anna
12 kevin 639. 835. 716. 644. 102. matthew,anna,david,nigel,steph
13 xavier 973. 1767. 1370. 1397. 371. julie,sebastian,tim,tyler,david
14 chris 1389. 2107. 1644. 1456. 316. julie,tyler,sebastian,tim,theo
15 alex 1766. 2429. 2014. 1829. 294. julie,tyler,sebastian,tim,theo