I have a list of cities and related information that I've placed in a dataframe, like so:
library(plyr)
library(dplyr)
library(ggmap)
library(Imap)
# Example data: ten cities, each with an id, a year-month stamp (stored as a
# YYYYMM number) and a contact category.
cities <- c(
  "washington, dc",
  "wilmington, de",
  "amarillo, tx",
  "denver, co",
  "needham, ma",
  "philadelphia, pa",
  "doylestown, pa",
  "galveston, tx",
  "tuscaloosa, al",
  "hollywood, fl"
)

id <- c(
  156952, 154222, 785695, 154423, 971453,
  149888, 1356987, 178946, 169944, 136421
)

month <- c(
  201811, 201811,
  201912, 201912,
  202005, 202005, 202005,
  202106, 202106, 202106
)

category <- c(
  "home", "work", "home", "home", "home",
  "work", "cell", "home", "work", "cell"
)

# One row per city; column order is cities, id, category, month.
places <- data.frame(cities, id, category, month)
Using the `Imap` and `ggmap` packages, I can retrieve the longitude and latitude for each city:
# Geocode every city exactly once and reuse the result for both coordinates.
# Calling geocode() separately for lat and lon would send every request to the
# geocoding service twice.
# as.character() guards against `cities` having been stored as a factor.
coords <- geocode(location = as.character(places$cities), source = "google")
lat <- coords$lat
lon <- coords$lon
places <- cbind(places, lat, lon)
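What I would like to do is the following: for each city, find the nearest other city and store that minimum distance, together with the closest city's name and id, as new columns in `places`.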
I wrote a `for` loop to calculate the distances:
# Build dist_list: element i holds the distances (in miles) from city i to
# every city in `places`, in row order, so element i has nrow(places) entries.
# lapply() creates the list itself (the bare loop failed because `dist_list`
# did not exist yet), and seq_len() yields an empty list for a zero-row
# data frame instead of iterating over c(1, 0).
dist_list <- lapply(seq_len(nrow(places)), function(i) {
  gdist(
    lon.1 = places$lon[i],
    lat.1 = places$lat[i],
    lon.2 = places$lon,
    lat.2 = places$lat,
    units = "miles"
  )
})
Which produces the following data:
dput(dist_list)
list(c(0, 98.3464717885451, 1386.25425677199, 1489.87718040776,
383.083760289456, 123.232894969413, 140.284537078237, 1209.23510542932,
706.670452283757, 906.79542720295), c(98.4762434610638, 0, 1472.06660056474,
1560.93398322985, 285.23618862797, 24.9195071209828, 44.8853561530985,
1308.60741637919, 805.755084908157, 983.102810248198), c(1389.07354011351,
1472.06660056474, 0, 356.573530670257, 1712.29111612461, 1493.39302974566,
1497.2125164277, 579.329313217289, 827.577713357261, 1434.82691622332
), c(1492.80130415651, 1560.93398322985, 356.573530670257, 0,
1761.3773163288, 1578.71125031146, 1576.80713231756, 923.725006795209,
1067.04809350934, 1717.32991551111), c(383.551997010915, 285.23618862797,
1712.29111612461, 1761.3773163288, 0, 260.382178510916, 243.947043197789,
1588.85470703957, 1088.38640303169, 1230.47219244291), c(123.395655314093,
24.9195071209827, 1493.39302974566, 1578.71125031146, 260.382178510916,
0, 24.7382114555287, 1333.29925285915, 830.581742827321, 1002.94777739349
), c(140.431447025301, 44.8853561530986, 1497.2125164277, 1576.80713231756,
243.947043197789, 24.7382114555285, 0, 1346.44527983873, 844.827513981938,
1026.98263808807), c(1211.16392416136, 1308.60741637919, 579.329313217289,
923.725006795209, 1588.85470703957, 1333.29925285915, 1346.44527983873,
0, 505.292529136012, 925.512554201542), c(707.73957320737, 805.755084908157,
827.577713357261, 1067.04809350934, 1088.38640303169, 830.581742827321,
844.827513981938, 505.292529136012, 0, 666.837848781548), c(906.880841903584,
983.102810248198, 1434.82691622332, 1717.32991551111, 1230.47219244291,
1002.94777739349, 1026.98263808807, 925.512554201542, 666.837848781548,
0))
The desired result would look like this (first row):
cities id category month lat lon min.dist closest city closest city id
washington, dc 156952 home 201811 38.90719 -77.03687 98.34647 wilmington, de 154222
And via the `nth` function in `Rfast`, I can get the second smallest distance:
# Second smallest distance from the first city (the smallest is the 0 distance
# to itself). The call is namespace-qualified because dplyr is attached above
# and exports its own nth(), which returns the element at a *position* rather
# than the k-th smallest value.
Rfast::nth(dist_list[[1]], 2)
The problem I have is that I don't know how to connect the info from the list to the data frame `places`. Any help or suggestions would be greatly appreciated.
# For each city, locate its nearest *other* city.
# order(d)[2] is the position of the second smallest distance; the smallest is
# expected to be the 0 distance from the city to itself. Unlike
# which(sort(d)[2] == d), this always yields exactly one index per city, even
# when two cities are equally far away, and avoids comparing doubles with ==.
nearest_idx <- vapply(dist_list, function(d) order(d)[2], integer(1))

# get min distance:
min_d <- vapply(
  seq_along(dist_list),
  function(k) dist_list[[k]][nearest_idx[k]],
  numeric(1)
)
places$min_dist <- min_d

# add name:
places$min_name <- places$cities[nearest_idx]
# Nearest city *within the same group* (here: same month).

# Build the n x n distance matrix once: row k is dist_list[[k]], i.e. the
# distances from city k to every city.
m <- unname(do.call(rbind, dist_list))
diag(m) <- NA # a city must not be matched with itself

# create grouping variable:
gv <- as.integer(factor(places$month)) # or:
# gv <- as.integer(factor(paste(places$month, places$category)))

# set distance to NA if not in relevant group:
# same_group[r, c] is TRUE when cities r and c share a group.
same_group <- outer(gv, gv, "==")
m[!same_group] <- NA

# For every city (one column of t(m)) return list(min distance, position).
# which.min() returns a single index even when several candidates tie, so
# every city contributes exactly one value to each row of `l`.
# Cities with no other member in their group get NA for both entries.
l <- sapply(as.data.frame(t(m)), function(d) {
  if (all(is.na(d))) return(list(NA_real_, NA_integer_))
  nearest <- which.min(d)
  list(d[nearest], nearest)
})
l

# Row 1 of `l` holds the distances, row 2 the row numbers of the nearest city.
places <- cbind(places, min_dist = unlist(l[1, ]), min_nr = unlist(l[2, ]))
places$min_name <- places$cities[places$min_nr] # add name
places$min_id <- places$id[places$min_nr] # add id
places
result:
cities id category month min_dist min_nr min_name min_id
V1 washington, dc 156952 home 201811 98.34647 2 wilmington, de 154222
V2 wilmington, de 154222 work 201811 98.47624 1 washington, dc 156952
V3 amarillo, tx 785695 home 201912 356.57353 4 denver, co 154423
V4 denver, co 154423 home 201912 356.57353 3 amarillo, tx 785695
V5 needham, ma 971453 home 202005 243.94704 7 doylestown, pa 1356987
V6 philadelphia, pa 149888 work 202005 24.73821 7 doylestown, pa 1356987
V7 doylestown, pa 1356987 cell 202005 24.73821 6 philadelphia, pa 149888
V8 galveston, tx 178946 home 202106 505.29253 9 tuscaloosa, al 169944
V9 tuscaloosa, al 169944 work 202106 505.29253 8 galveston, tx 178946
V10 hollywood, fl 136421 cell 202106 666.83785 9 tuscaloosa, al 169944