I have to data frames of points (lat+lon pairs). For each point in the first data frame, I would like to find the closest point in the second data frame. This question and answer gets me almost there (Finding closest coordinates between two large data sets). The only difference, as suggested in a comment, is that I would like to use geosphere::distGeo or raster::pointDistance instead of what is in the answer. I have tried doing this (see code below), but get an error. How can I fix it? And what units is the Distance
variable in? I would like it to be in miles.
Code - copied directly from aforementioned question, except (incorrectly) modified to try to get what I want.
df1 <- structure(list(id = c(1L, 2L, 4L, 5L,
6L, 7L, 8L, 9, 10L, 3L),
HIGH_PRCN_LAT = c(52.881442267773, 57.8094538200198, 34.0233529,
63.8087900198, 53.6888144440184, 63.4462810678651, 21.6075544376207,
78.324442654172, 66.85532539759495, 51.623544596), HIGH_PRCN_LON = c(-2.87377812157822,
-2.23454414781635, -3.0984448341, -2.439163178635, -7.396111601421454,
-5.162345043546359, -8.63311254098095, 3.813289888829932,
-3.994325961186105, -8.9065532453272409), SRC_ID = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), distance = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, 10L), class = "data.frame")
df2 <- structure(list(SRC_ID = c(55L, 54L, 23L, 11L, 44L, 21L, 76L,
5688L, 440L, 61114L), HIGH_PRCN_LAT = c(68.46506, 50.34127, 61.16432,
42.57807, 52.29879, 68.52132, 87.83912, 55.67825, 29.74444, 34.33228
), HIGH_PRCN_LON = c(-5.0584, -5.95506, -5.75546, -5.47801, -3.42062,
-6.99441, -2.63457, -2.63057, -7.52216, -1.65532)), row.names = c(NA,
10L), class = "data.frame")
library(data.table)
library(raster)
library(geosphere)
#Euclidean distance
mydist <- function(a, b, df1, x, y){
#dt <- data.table(sqrt((df1[[x]]-a)^2 + (df1[[y]]-b)^2))
#I couldn't figure out how to modify this correcly:
dt <- data.table(pointDistance(c(a,b), c(df1[[x]],df1[[y]]), lonlat=TRUE))
#dt <- data.table(distGeo(c(a,b), c(df1[[x]],df1[[y]]), lonlat=TRUE))
return(data.table(Closest.V1 = which.min(dt$V1),
Distance = dt[which.min(dt$V1)]))
}
setDT(df1)[, j = mydist(HIGH_PRCN_LAT, HIGH_PRCN_LON, setDT(df2),
"HIGH_PRCN_LAT", "HIGH_PRCN_LON"),
by = list(id, HIGH_PRCN_LAT, HIGH_PRCN_LON)]
This is the error I get:
Error in .pointsToMatrix(p2) : Wrong length for a vector, should be 2
c(df1[[x]],df1[[y]])
should be changed to df1[, c(y,x), with=FALSE]
because geosphere::distGeo
/ raster::pointDistance
expects a 2-column matrix of points. Also, it expects the points to be supplied as (longitude, latitude) - hence the (x,y) should be flipped to (y,x).
Using raster::pointDistance
:
# Using raster::pointDistance
library(data.table)
library(raster)
library(geosphere)
setDT(df1)
setDT(df2)
mydist <- function(a, b, df1, x, y) {
dt <- data.table(pointDistance(c(b,a), df1[, c(y,x), with=FALSE], lonlat=TRUE))
return (
data.table(Closest.V1 = which.min(dt$V1),
Distance = dt[which.min(dt$V1)])
)
}
df1[, j = mydist(HIGH_PRCN_LAT, HIGH_PRCN_LON,
df2, "HIGH_PRCN_LAT", "HIGH_PRCN_LON"),
by = list(id, HIGH_PRCN_LAT, HIGH_PRCN_LON)]
Using geosphere::distGeo
(same result as above):
# Using geosphere::distGeo
mydist <- function(a, b, df1, x, y) {
dt <- data.table(distGeo(c(b,a), df1[, c(y,x), with=FALSE]))
return (
data.table(Closest.V1 = which.min(dt$V1),
Distance = dt[which.min(dt$V1)])
)
}
df1[, j = mydist(HIGH_PRCN_LAT, HIGH_PRCN_LON,
df2, "HIGH_PRCN_LAT", "HIGH_PRCN_LON"),
by = list(id, HIGH_PRCN_LAT, HIGH_PRCN_LON)]