Search code examples
rdata.tablegeolocationgeospatialnearest-neighbor

Find nearest point in second datset for every point in first dataset


I have to data frames of points (lat+lon pairs). For each point in the first data frame, I would like to find the closest point in the second data frame. This question and answer gets me almost there (Finding closest coordinates between two large data sets). The only difference, as suggested in a comment, is that I would like to use geosphere::distGeo or raster::pointDistance instead of what is in the answer. I have tried doing this (see code below), but get an error. How can I fix it? And what units is the Distance variable in? I would like it to be in miles.

Code - copied directly from aforementioned question, except (incorrectly) modified to try to get what I want.

df1 <- structure(list(id = c(1L, 2L, 4L, 5L, 
                      6L, 7L, 8L, 9, 10L, 3L), 
               HIGH_PRCN_LAT = c(52.881442267773, 57.8094538200198, 34.0233529, 
                              63.8087900198, 53.6888144440184, 63.4462810678651, 21.6075544376207, 
                              78.324442654172, 66.85532539759495, 51.623544596), HIGH_PRCN_LON = c(-2.87377812157822, 
                                                                                                 -2.23454414781635, -3.0984448341, -2.439163178635, -7.396111601421454, 
                                                                                                 -5.162345043546359, -8.63311254098095, 3.813289888829932, 
                                                                                                 -3.994325961186105, -8.9065532453272409), SRC_ID = c(NA, NA, 
                                                                                                                                                     NA, NA, NA, NA, NA, NA, NA, NA), distance = c(NA, NA, 
                                                                                                                                                                                                  NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, 10L), class = "data.frame")


df2 <- structure(list(SRC_ID = c(55L, 54L, 23L, 11L, 44L, 21L, 76L, 
                          5688L, 440L, 61114L), HIGH_PRCN_LAT = c(68.46506, 50.34127, 61.16432, 
                                                                42.57807, 52.29879, 68.52132, 87.83912, 55.67825, 29.74444, 34.33228
                          ), HIGH_PRCN_LON = c(-5.0584, -5.95506, -5.75546, -5.47801, -3.42062, 
                                             -6.99441, -2.63457, -2.63057, -7.52216, -1.65532)), row.names = c(NA, 
                                                                                                              10L), class = "data.frame")

library(data.table)
library(raster)
library(geosphere)
#Euclidean distance 
mydist <- function(a, b, df1, x, y){
    
    #dt <- data.table(sqrt((df1[[x]]-a)^2 + (df1[[y]]-b)^2))
    #I couldn't figure out how to modify this correcly:
    dt <- data.table(pointDistance(c(a,b), c(df1[[x]],df1[[y]]), lonlat=TRUE))
    #dt <- data.table(distGeo(c(a,b), c(df1[[x]],df1[[y]]), lonlat=TRUE))
    
    return(data.table(Closest.V1  = which.min(dt$V1),
                      Distance    = dt[which.min(dt$V1)]))
}

setDT(df1)[, j = mydist(HIGH_PRCN_LAT, HIGH_PRCN_LON, setDT(df2), 
                        "HIGH_PRCN_LAT", "HIGH_PRCN_LON"), 
           by = list(id, HIGH_PRCN_LAT, HIGH_PRCN_LON)]

This is the error I get:

Error in .pointsToMatrix(p2) : Wrong length for a vector, should be 2

Solution

  • c(df1[[x]],df1[[y]]) should be changed to df1[, c(y,x), with=FALSE] because geosphere::distGeo / raster::pointDistance expects a 2-column matrix of points. Also, it expects the points to be supplied as (longitude, latitude) - hence the (x,y) should be flipped to (y,x).

    Using raster::pointDistance:

    # Using raster::pointDistance
    library(data.table)
    library(raster)
    library(geosphere)
    
    setDT(df1)
    setDT(df2)
    
    mydist <- function(a, b, df1, x, y) {
        dt <- data.table(pointDistance(c(b,a), df1[, c(y,x), with=FALSE], lonlat=TRUE))
        
        return (
            data.table(Closest.V1 = which.min(dt$V1),
            Distance = dt[which.min(dt$V1)])
        )
    }
    
    df1[, j = mydist(HIGH_PRCN_LAT, HIGH_PRCN_LON,
        df2, "HIGH_PRCN_LAT", "HIGH_PRCN_LON"),
        by = list(id, HIGH_PRCN_LAT, HIGH_PRCN_LON)]
    

    enter image description here

    Using geosphere::distGeo (same result as above):

    # Using geosphere::distGeo
    mydist <- function(a, b, df1, x, y) {
        dt <- data.table(distGeo(c(b,a), df1[, c(y,x), with=FALSE]))
        
        return (
            data.table(Closest.V1 = which.min(dt$V1),
            Distance = dt[which.min(dt$V1)])
        )
    }
    
    df1[, j = mydist(HIGH_PRCN_LAT, HIGH_PRCN_LON,
        df2, "HIGH_PRCN_LAT", "HIGH_PRCN_LON"),
        by = list(id, HIGH_PRCN_LAT, HIGH_PRCN_LON)]