For reproducibility, let us simplify my problem as follows: I have 100 spatial polygons, each representing the convex hull of N random samples drawn from a population (the sampling is repeated 100 times to assess the sensitivity of a model to individual values). How do I calculate the percentage overlap of these polygons? The ideal solution should be fast and introduce as little approximation as possible.
I have no particular reason to use the GIS capabilities of R, other than that I thought this would be the easiest way to solve the problem.
library(sp)
library(raster)
library(sf)
#> Linking to GEOS 3.8.1, GDAL 3.1.1, PROJ 6.3.1
set.seed(11)
dt <- data.frame(x = rnorm(1e3, 10, 3) + sample(-5:5, 1e3, replace = TRUE))
dt$y <- (rnorm(1e3, 3, 4) + sample(-10:10, 1e3, replace = TRUE)) + dt$x
dt <- rbind(dt, data.frame(x = -dt$x, y = dt$y))
plot(dt, asp = 1)
dt.chull <- dt[chull(dt),]
dt.chull <- rbind(dt.chull, dt.chull[1,])
lines(dt.chull, col = "green")
uncert.polys <- lapply(1:100, function(i) {
  tmp <- dt[sample(rownames(dt), 1e2), ]
  # points(tmp, col = "red")
  tmp <- tmp[chull(tmp), ]
  tmp <- rbind(tmp, tmp[1, ])
  tmp <- sp::SpatialPolygons(list(sp::Polygons(list(sp::Polygon(tmp)), ID = i)))
  sp::SpatialPolygonsDataFrame(tmp, data = data.frame(id = i, row.names = i))
  # lines(tmp, col = "red")
})
polys <- do.call(rbind, uncert.polys)
plot(polys, add = TRUE, border = "red")
My initial attempt was to use the sf::st_intersection function:
sf.polys <- sf::st_make_valid(sf::st_as_sf(polys))
all(sf::st_is_valid(sf.polys))
#> [1] TRUE
sf::st_intersection(sf.polys)
#> Error in CPL_nary_intersection(x): Evaluation error: TopologyException: found non-noded intersection between LINESTRING (-9.80706 -0.619557, -7.66331 -3.55177) and LINESTRING (-9.80706 -0.619557, -9.80706 -0.619557) at -9.8070645468969637 -0.61955676978603658.
The error is likely related to polygon lines "that are almost coincident but not identical". Multiple solutions (1, 2) have been suggested for this GEOS-related problem, but I have not managed to make any of them work with my data:
sf.polys <- sf::st_set_precision(sf.polys, 1e6)
sf.polys <- sf::st_snap(sf.polys, sf.polys, tolerance = 1e-4)
sf::st_intersection(sf.polys)
#> Error in CPL_nary_intersection(x): Evaluation error: TopologyException: found non-noded intersection between LINESTRING (-13.7114 32.7341, 3.29417 30.3736) and LINESTRING (3.29417 30.3736, 3.29417 30.3736) at 3.2941702528617176 30.373627946201278.
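Another workaround that is often suggested for this kind of topology error is a zero-width buffer. I show it here only as a sketch (the variable name is mine, and it may well trip over the same near-coincident vertices):
sf.polys.buf <- sf::st_buffer(sf.polys, dist = 0) # zero-width buffer sometimes cleans up degenerate edges
sf::st_intersection(sf.polys.buf)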
So, I have to approximate the polygon overlap using rasterization:
GT <- sp::GridTopology(
  cellcentre.offset = c(round(min(dt$x), 1), round(min(dt$y), 1)),
  cellsize = c(diff(round(range(dt$x), 1))/100, diff(round(range(dt$y), 1))/100),
  cells.dim = c(100, 100)
)
SG <- sp::SpatialGrid(GT)
# For each polygon, flag the grid cells it covers with 1 and all other cells with 0
tmp <- lapply(seq_along(uncert.polys), function(i) {
  out <- sp::over(SG, uncert.polys[[i]])
  out[!is.na(out)] <- 1
  out[is.na(out)] <- 0
  out
})
# Count how many of the 100 polygons cover each cell and express it as a percentage
tmp <- data.frame(overlapping.n = Reduce("+", lapply(tmp, "[[", 1)))
tmp$overlapping.pr <- 100 * tmp$overlapping.n / 100
uncert.data <- SpatialGridDataFrame(SG, tmp)
## Plot
plot(x = range(dt$x),
     y = range(dt$y),
     type = "n"
)
plot(raster::raster(uncert.data), col = colorRampPalette(c("white", "red", "blue", "white"))(100), add = TRUE)
plot(polys, add = TRUE, border = adjustcolor("black", alpha.f = 0.2), cex = 0.5)
points(dt, pch = ".", col = "black", cex = 3)
lines(dt.chull, col = "green")
The approach gives results, but the output is only an approximation and takes a long time to compute. There has to be a better way of doing this.
For performance comparison purposes, here is my current solution:
gridOverlap <- function(dt, uncert.polys) {
  GT <- sp::GridTopology(
    cellcentre.offset = c(round(min(dt$x), 1), round(min(dt$y), 1)),
    cellsize = c(diff(round(range(dt$x), 1))/100, diff(round(range(dt$y), 1))/100),
    cells.dim = c(100, 100)
  )
  SG <- sp::SpatialGrid(GT)
  tmp <- lapply(seq_along(uncert.polys), function(i) {
    out <- sp::over(SG, uncert.polys[[i]])
    out[!is.na(out)] <- 1
    out[is.na(out)] <- 0
    out
  })
  tmp <- data.frame(overlapping.n = Reduce("+", lapply(tmp, "[[", 1)))
  tmp$overlapping.pr <- 100 * tmp$overlapping.n / 100
  SpatialGridDataFrame(SG, tmp)
}
system.time(gridOverlap(dt = dt, uncert.polys = uncert.polys))
# user system elapsed
# 3.011 0.083 3.105
The performance matters for larger datasets (this solution takes several minutes in the real application).
Created on 2020-09-01 by the reprex package (v0.3.0)
Here is a solution to finding the interior without any errors using spatstat and the underlying polyclip package.
library(spatstat)
# Data from OP
set.seed(11)
dt <- data.frame(x = rnorm(1e3, 10, 3) + sample(-5:5, 1e3, replace = TRUE))
dt$y <- (rnorm(1e3, 3, 4) + sample(-10:10, 1e3, replace = TRUE)) + dt$x
dt <- rbind(dt, data.frame(x = -dt$x, y = dt$y))
# Converted to spatstat classes (`ppp` not strictly necessary -- just a habit)
X <- as.ppp(dt, W = owin(c(-25,25),c(-15,40)))
p1 <- owin(poly = dt[rev(chull(dt)),])
# Plot of data and convex hull
plot(X, main = "")
plot(p1, add = TRUE, border = "green")
# Convex hulls of sampled points in spatstat format
polys <- lapply(1:100, function(i) {
  tmp <- dt[sample(rownames(dt), 1e2), ]
  owin(poly = tmp[rev(chull(tmp)), ])
})
# Plot of convex hulls
for(i in seq_along(polys)) {
  plot(polys[[i]], add = TRUE, border = "red")
}
# Intersection of all convex hulls plotted in transparent blue
interior <- do.call(intersect.owin, polys)
plot(interior, add = TRUE, col = rgb(0,0,1,0.1))
It is not clear to me what you want to do from here, but at least this approach avoids the errors of polygon clipping.
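If, for example, the target quantity is the area of this common interior expressed as a percentage of the area of the full convex hull (my guess at what "percentage overlap" means, not something stated explicitly), a minimal sketch would be:
# Share of the full convex hull that is covered by every sampled hull, in percent
100 * area.owin(interior) / area.owin(p1)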
To do the grid-based solution in spatstat I would convert the windows to binary image masks and then work from there:
Wmask <- as.im(Window(X), dimyx = c(200, 200))
masks <- lapply(polys, as.im.owin, xy = Wmask, na.replace = 0)
maskmean <- Reduce("+", masks)/100
plot(maskmean)
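If per-pixel overlap percentages are wanted, matching the sp/raster output in the question, the mean mask can simply be scaled, and pixel values can be read off at arbitrary locations by indexing the im object with a point pattern. A small sketch (variable names are mine):
overlap.pct <- maskmean * 100 # arithmetic on im objects is applied pixel-wise
plot(overlap.pct)
head(overlap.pct[X]) # overlap percentage at the original data points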
The speed depends on the resolution you choose, but I would guess it is much faster than the current suggestion using sp/raster (which could probably be improved a lot by applying the same mask-and-sum logic as here, so sticking with raster would be another option).
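To illustrate that last point, here is a rough, untimed sketch of the same mask-and-sum logic done directly with raster, applied to the uncert.polys list of SpatialPolygonsDataFrame objects from the question (the 200 x 200 resolution and the variable names are arbitrary choices of mine):
# Template raster covering the extent of the data
r <- raster::raster(raster::extent(c(range(dt$x), range(dt$y))), ncols = 200, nrows = 200)
# One 0/1 layer per polygon: cells covered by the polygon get 1, all others 0
layers <- lapply(uncert.polys, function(p) raster::rasterize(p, r, field = 1, background = 0))
# Sum the binary layers and express the count as a percentage of the 100 polygons
overlap.pct <- 100 * Reduce("+", layers) / length(layers)
plot(overlap.pct)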