Search code examples
rgreat-circler-bigmemorygeosphere

R: Distm for big data? Calculating minimum distances between two matrices


I have two matrices, one is 200K rows long, the other is 20K. For each row (which is a point) in the first matrix, I am trying to find which row (also a point) in the second matrix is closest to the point in the first matrix. This is the first method that I tried on a sample dataset:

#Test dataset
pixels.latlon=cbind(runif(200000,min=-180, max=-120), runif(200000, min=50, max=85))
grwl.latlon=cbind(runif(20000,min=-180, max=-120), runif(20000, min=50, max=85))
#calculate the distance matrix
library(geosphere)
dist.matrix=distm(pixels.latlon, grwl.latlon, fun=distHaversine)
#Pick out the indices of the minimum distance
rnum=apply(dist.matrix, 1, which.min)

However, I get a Error: cannot allocate vector of size 30.1 Gb error when I use the distm function.

There have been several posts on this topic:

This one uses bigmemory to calculate distances between points in the SAME dataframe, but I'm not sure how to adapt it to calculate distances between points in two different matrices...https://stevemosher.wordpress.com/2012/04/12/nick-stokes-distance-code-now-with-big-memory/

This one also works for calculating a distance matrix between points in the SAME matrix...Efficient (memory-wise) function for repeated distance matrix calculations AND chunking of extra large distance matrices

And this one is pretty much identical to what I want to do, but they didn't actually come up with a solution that worked for large data: R: distm with Big Memory I tried this method, which uses bigmemory, but get a Error in CreateFileBackedBigMatrix(as.character(backingfile), as.character(backingpath), : Problem creating filebacked matrix. error, I think because the dataframe is too large.

Has anyone come up with a good solution to this problem? I am open to other package ideas!

Updated code which fixed the issue

pixels.latlon=cbind(runif(200000,min=-180, max=-120), runif(200000, min=50, max=85))
grwl.tibble = tibble(long=runif(20000,min=-180, max=-120), lat=runif(20000, min=50, max=85), id=runif(20000, min=0, max=20000))
rnum <- apply(pixels.latlon, 1, function(x) {
    xlon=x[1]
    xlat=x[2]
    grwl.filt = grwl.tibble %>% 
      filter(long < (xlon+0.3) & long >(xlon-0.3) & lat < (xlat+0.3)&lat >(xlat-.3))
    grwl.latlon.filt = cbind(grwl.filt$long, grwl.filt$lat)
    dm <- distm(x, grwl.latlon.filt, fun=distHaversine)
    rnum=apply(dm, 1, which.min)
    id = grwl.filt$id[rnum]
    return(id)
                     })

Solution

  • You can use this R(cpp) function:

    #include <Rcpp.h>
    using namespace Rcpp;
    
    double compute_a(double lat1, double long1, double lat2, double long2) {
    
      double sin_dLat = ::sin((lat2 - lat1) / 2);
      double sin_dLon = ::sin((long2 - long1) / 2);
    
      return sin_dLat * sin_dLat + ::cos(lat1) * ::cos(lat2) * sin_dLon * sin_dLon;
    }
    
    int find_min(double lat1, double long1,
                 const NumericVector& lat2,
                 const NumericVector& long2,
                 int current0) {
    
      int m = lat2.size();
      double lat_k, lat_min, lat_max, a, a0;
      int k, current = current0;
    
      a0 = compute_a(lat1, long1, lat2[current], long2[current]);
      // Search before current0
      lat_min = lat1 - 2 * ::asin(::sqrt(a0));
      for (k = current0 - 1; k >= 0; k--) {
        lat_k = lat2[k];
        if (lat_k > lat_min) {
          a = compute_a(lat1, long1, lat_k, long2[k]);
          if (a < a0) {
            a0 = a;
            current = k;
            lat_min = lat1 - 2 * ::asin(::sqrt(a0));
          }
        } else {
          // No need to search further
          break;
        }
      }
      // Search after current0
      lat_max = lat1 + 2 * ::asin(::sqrt(a0));
      for (k = current0 + 1; k < m; k++) {
        lat_k = lat2[k];
        if (lat_k < lat_max) {
          a = compute_a(lat1, long1, lat_k, long2[k]);
          if (a < a0) {
            a0 = a;
            current = k;
            lat_max = lat1 + 2 * ::asin(::sqrt(a0));
          }
        } else {
          // No need to search further
          break;
        }
      }
    
      return current;
    } 
    
    // [[Rcpp::export]]
    IntegerVector find_closest_point(const NumericVector& lat1,
                                     const NumericVector& long1,
                                     const NumericVector& lat2,
                                     const NumericVector& long2) {
    
      int n = lat1.size();
      IntegerVector res(n);
    
      int current = 0;
      for (int i = 0; i < n; i++) {
        res[i] = current = find_min(lat1[i], long1[i], lat2, long2, current);
      }
    
      return res; // need +1
    }
    
    
    /*** R
    N <- 2000  # 2e6
    M <- 500   # 2e4
    
    pixels.latlon=cbind(runif(N,min=-180, max=-120), runif(N, min=50, max=85))
    grwl.latlon=cbind(runif(M,min=-180, max=-120), runif(M, min=50, max=85))
    # grwl.latlon <- grwl.latlon[order(grwl.latlon[, 2]), ]
    
    library(geosphere)
    system.time({
      #calculate the distance matrix
      dist.matrix = distm(pixels.latlon, grwl.latlon, fun=distHaversine)
      #Pick out the indices of the minimum distance
      rnum=apply(dist.matrix, 1, which.min)
    })
    
    
    find_closest <- function(lat1, long1, lat2, long2) {
    
      toRad <- pi / 180
      lat1  <- lat1  * toRad
      long1 <- long1 * toRad
      lat2  <- lat2  * toRad
      long2 <- long2 * toRad
    
      ord1  <- order(lat1)
      rank1 <- match(seq_along(lat1), ord1)
      ord2  <- order(lat2)
    
      ind <- find_closest_point(lat1[ord1], long1[ord1], lat2[ord2], long2[ord2])
    
      ord2[ind + 1][rank1]
    }
    
    system.time(
      test <- find_closest(pixels.latlon[, 2], pixels.latlon[, 1], 
                           grwl.latlon[, 2], grwl.latlon[, 1])
    )
    all.equal(test, rnum)
    
    N <- 2e4
    M <- 2e4
    pixels.latlon=cbind(runif(N,min=-180, max=-120), runif(N, min=50, max=85))
    grwl.latlon=cbind(long = runif(M,min=-180, max=-120), lat = runif(M, min=50, max=85))
    system.time(
      test <- find_closest(pixels.latlon[, 2], pixels.latlon[, 1], 
                           grwl.latlon[, 2], grwl.latlon[, 1])
    )
    */
    

    It takes 0.5 sec for N = 2e4 and 4.2 sec for N = 2e5. I can't make your code work to compare.