Intersecting shapefiles in R

I have these 2 shapefiles from the internet:

library(sf)
library(dplyr)
library(fs)

url_1 <- "https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lfsa000b21a_e.zip"

temp_dir <- tempdir()

zip_file <- file.path(temp_dir, "shapefile.zip")

download.file(url_1, destfile = zip_file)

exdir_ <- tempfile(pattern = "shp_1")

unzip(zip_file, exdir = exdir_, junkpaths = TRUE)

fs::dir_tree(exdir_)

file_1 <- st_read(exdir_)

fsa <- st_transform(file_1, "+proj=longlat +datum=WGS84")
unlink(temp_dir, recursive = TRUE)

url_2 <- "https://www12.statcan.gc.ca/census-recensement/alternative_alternatif.cfm?l=eng&dispext=zip&teng=lada000b21a_e.zip&k=%20%20%20151162&loc=//www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lada000b21a_e.zip"


download.file(url_2, destfile = "lada000b21a_e.zip")

unzip("lada000b21a_e.zip")

ada <- st_read("lada000b21a_e.shp")

For the same PRUID, I want to make a matrix which contains what % each polygon from shapefile 1 intersects with each polygon from shapefile 2.

library(sf)
library(dplyr)


calculate_intersection_percentages <- function(ada, fsa, pruid) {
  ada_filtered <- ada %>% filter(PRUID == pruid)
  fsa_filtered <- fsa %>% filter(PRUID == pruid)
  
  ada_filtered <- st_transform(ada_filtered, st_crs(fsa_filtered))
  
  results <- list()
  
  total_fsas <- nrow(fsa_filtered)
  completed_fsas <- 0
  
  for (ada_id in ada_filtered$ADAUID) {
    ada_single <- ada_filtered %>% filter(ADAUID == ada_id)
    ada_area <- st_area(ada_single)
    
    for (fsa_id in fsa_filtered$CFSAUID) {
      fsa_single <- fsa_filtered %>% filter(CFSAUID == fsa_id)
      
      intersection <- st_intersection(ada_single, fsa_single)
      
      if (length(intersection) > 0) {
        intersection_area <- st_area(intersection)
        percentage <- as.numeric(intersection_area / ada_area * 100)
        
        print(sprintf("ADA: %s, FSA: %s, Intersection = %.2f%%", ada_id, fsa_id, percentage))
        
        results[[ada_id]][[fsa_id]] <- percentage
      }
      
      completed_fsas <- completed_fsas + 1
      progress_percentage <- (completed_fsas / (total_fsas * nrow(ada_filtered))) * 100
      print(sprintf("Progress: %.2f%% of FSA calculations completed", progress_percentage))
    }
  }
  
  result_matrix <- do.call(rbind, lapply(results, function(x) {
    unlist(x, use.names = TRUE)
  }))
  
  result_matrix[is.na(result_matrix)] <- 0
  
  result_matrix <- result_matrix / rowSums(result_matrix) * 100
  
  return(result_matrix)
}

I launch the function like this:

result <- calculate_intersection_percentages(ada, fsa, pruid = "10")

The code runs, but I am getting some very exact numbers which makes me think the intersection is not happening correctly:

20.00000000 20.00000000 20.00000000 20.00000000 20.00000000

Also, there are 35 FSA's in Newfoundland (https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_A#:~:text=and%20some%20libraries.-,Newfoundland%20and%20Labrador,35%20FSAs%20in%20this%20list.), but I am only seeing 5:

> head(result)
              A0A      A1B      A1C      A1E      A1S
10010001 15.50990 26.73515 15.50990 26.73515 15.50990
10010002 23.85278 14.22083 23.85278 14.22083 23.85278
10010003 13.88354 29.17468 13.88354 29.17468 13.88354
10010004 20.84652 18.73022 20.84652 18.73022 20.84652
10010005 26.08074 10.87889 26.08074 10.87889 26.08074
10010006 20.00000 20.00000 20.00000 20.00000 20.00000

How can I correctly intersect these two shapefiles?

Solution

Just few interesting issues to start with, there might be more.

performance-wise you could do much better without those nested loops, growing that result list shouldn't actually matter too much, but to my understanding this pair-wise operation effectively disables all the gain you could get from spatial indexing. Though (for me) it's not so straightforward to properly test & measure, don't think we can simply disable indexing.
length(intersection) > 0 is probably not indicating the state you had in mind; when using st_intersection() with your sf objects, it returns another sf object, so length() is never 0, not even for non-intersecting geometries as it's not the number of intersections but number of columns; interestingly this actually pushes you to the right direction as your results list gets NULL values for 0-area intersections, something you definitely would want when preparing inputs for rbind; when there's no output from print(), it's not because your if-clause but because percentage is NULL. When testing with a smaller pre-filtered subsets (ada[1:5,], fsa[1:5,], both with PRUID == "10" ), final results looks like this:

lobstr::tree(results)
<list>
├─10010001: <list>
│ ├─A0A: 36.6844068668399
│ ├─A0B: 63.3155931332246
│ ├─A0C<dbl [0]>: 
│ ├─A0E<dbl [0]>: 
│ └─A0G<dbl [0]>: 
├─10010002: <list>
│ ├─A0A: 62.6409032955523
│ ├─A0B<dbl [0]>: 
│ ├─A0C<dbl [0]>: 
│ ├─A0E<dbl [0]>: 
│ └─A0G<dbl [0]>: 
├─10010003: <list>
│ ├─A0A: 32.2477921284301
│ ├─A0B: 0
│ ├─A0C<dbl [0]>: 
│ ├─A0E<dbl [0]>: 
│ └─A0G<dbl [0]>: 
├─10010004: <list>
│ ├─A0A<dbl [0]>: 
│ ├─A0B<dbl [0]>: 
│ ├─A0C<dbl [0]>: 
│ ├─A0E<dbl [0]>: 
│ └─A0G<dbl [0]>: 
└─10010005: <list>
  ├─A0A: 0
  ├─A0B<dbl [0]>: 
  ├─A0C<dbl [0]>: 
  ├─A0E<dbl [0]>: 
  └─A0G<dbl [0]>:

with lapply(results, function(x) {unlist(x, use.names = TRUE)}) all NULL values are dropped, resulting with a ragged list where item alignment for rbind() is broken; as a result, the number of matrix columns is defined by the number of non-NULL areas from the first outer-loop iteration, also rows with only NULL values are dropped; while our small 5-row sf objects should have resulted with 5x5 matrix, we end up with a 4x2:

lapply(results, unlist, use.names = TRUE) |> lobstr::tree(show_attributes = TRUE)
<list>
├─10010001<dbl [2]>: 36.6844068668399, 63.3155931332246
│ └┄attr(,"names")<chr [2]>: "A0A", "A0B"
├─10010002: 62.6409032955523
│ └┄attr(,"names"): "A0A"
├─10010003<dbl [2]>: 32.2477921284301, 0
│ └┄attr(,"names")<chr [2]>: "A0A", "A0B"
├─10010004<dbl [0]>: 
├─10010005: 0
┊ └┄attr(,"names"): "A0A"
└┄attr(,"names")<chr [5]>: "10010001", "10010002", "10010003", "10010004", "10010005"

lapply(results, unlist, use.names = TRUE) |> do.call(what = rbind)
              A0A      A0B
10010001 36.68441 63.31559
10010002 62.64090 62.64090
10010003 32.24779  0.00000
10010005  0.00000  0.00000

To continue with a possible solution, it seems that a frame from st_intersection() pivoted wider should be quite close to what you are after.

Fetch data and load just a small subsets for testing:

library(sf)
#> Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(dplyr)
library(tidyr)
library(ggplot2)

# curl::curl_download(
#   "https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lfsa000b21a_e.zip", 
#   "lfsa000b21a_e.zip",
#   quiet = FALSE)
# 
# curl::curl_download(
#   "https://www12.statcan.gc.ca/census-recensement/2021/geo/sip-pis/boundary-limites/files-fichiers/lada000b21a_e.zip", 
#   "lada000b21a_e.zip",
#   quiet = FALSE)

# check layers
rbind(
  st_layers("/vsizip/lfsa000b21a_e.zip/lfsa000b21a_e/"),
  st_layers("/vsizip/lada000b21a_e.zip")
)
#> Driver: ESRI Shapefile 
#> Driver: ESRI Shapefile 
#> Available layers:
#>      layer_name geometry_type features fields                          crs_name
#> 1 lfsa000b21a_e       Polygon     1643      5 NAD83 / Statistics Canada Lambert
#> 2 lada000b21a_e       Polygon     5433      4 NAD83 / Statistics Canada Lambert

# read relevant subsets for testing
fsa <- read_sf(
  "/vsizip/lfsa000b21a_e.zip/lfsa000b21a_e/", 
  query = "SELECT CFSAUID, PRUID FROM lfsa000b21a_e WHERE PRUID = '10'",
  quiet = TRUE)
nrow(fsa)
#> [1] 35

ada <- read_sf(
  "/vsizip/lada000b21a_e.zip", 
  query = "SELECT ADAUID, PRUID FROM lada000b21a_e WHERE PRUID = '10'",
  quiet = TRUE)
nrow(ada)
#> [1] 86

Area ratios of intersections, feel free to adjust scaling:

isect_area_ratio <- function(ada, fsa, pruid) {
  fsa_filtered <- 
    fsa |> 
    filter(PRUID == pruid) |> 
    select(CFSAUID)

  # add `area_ada` so it would be included in st_intersection() result
  ada_filtered <- 
    ada |> 
    filter(PRUID == pruid) |> 
    select(ADAUID) |>   
    st_transform(st_crs(fsa_filtered)) |> 
    mutate(area_ada = st_area(geometry)) 

  # intersection also includes geometries without area (points, linestrings, ..), 
  # leave those out;
  # once we have ratios, we can drop geometries, pivot from long to wide and
  # convert to matrix
  area_ratio_m <- 
    st_intersection(ada_filtered, fsa_filtered) |> 
    filter(st_geometry_type(geometry) %in% c("POLYGON", "MULTIPOLYGON", "GEOMETRYCOLLECTION")) |> 
    mutate(area_ratio = (st_area(geometry) / area_ada) |> as.numeric()) |> 
    st_drop_geometry() |> 
    select(ADAUID, CFSAUID, area_ratio) |> 
    pivot_wider(names_from = CFSAUID, values_from = area_ratio, values_fill = 0) |> 
    tibble::column_to_rownames("ADAUID") |> 
    as.matrix()
  
  # make sure matrix row and column order aligns with row order of input dataframes
  area_ratio_m[ada_filtered$ADAUID, fsa_filtered$CFSAUID ]
}

Test:

tictoc::tic("isect_area_ratio")
m <- isect_area_ratio(ada, fsa, pruid = "10")
tictoc::toc()
#> isect_area_ratio: 9.14 sec elapsed

# expect 86x35 matrix
str(m)
#>  num [1:86, 1:35] 0.367 0.626 0.322 0 0 ...
#>  - attr(*, "dimnames")=List of 2
#>   ..$ : chr [1:86] "10010001" "10010002" "10010003" "10010004" ...
#>   ..$ : chr [1:35] "A0A" "A0B" "A0C" "A0E" ...

# upper left corner
m[1:10, 1:10]
#>                A0A       A0B A0C A0E A0G A0H A0J A0K A0L A0M
#> 10010001 0.3668441 0.6331559   0   0   0   0   0   0   0   0
#> 10010002 0.6264090 0.0000000   0   0   0   0   0   0   0   0
#> 10010003 0.3224779 0.0000000   0   0   0   0   0   0   0   0
#> 10010004 0.0000000 0.0000000   0   0   0   0   0   0   0   0
#> 10010005 0.0000000 0.0000000   0   0   0   0   0   0   0   0
#> 10010006 0.0000000 0.0000000   0   0   0   0   0   0   0   0
#> 10010007 0.0000000 0.0000000   0   0   0   0   0   0   0   0
#> 10010008 0.0000000 0.0000000   0   0   0   0   0   0   0   0
#> 10010009 0.0000000 0.0000000   0   0   0   0   0   0   0   0
#> 10010010 0.0000000 0.0000000   0   0   0   0   0   0   0   0

# check if rows add up to ~1
rowSums(m) |> summary() 
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>       1       1       1       1       1       1

A bit closer look at a small subset of intersections, note how we end up with geometry collections and multipolygons, one for every intersecting geometry pair. Number of individual intersection polygons can be much higher.

ada |> 
  filter(ADAUID == "10010032") |> 
  mutate(area_ada = st_area(geometry)) |> 
  st_intersection(fsa) |> 
  filter(st_geometry_type(geometry) %in% c("POLYGON", "MULTIPOLYGON", "GEOMETRYCOLLECTION")) |> 
  mutate(area_ratio = (st_area(geometry) / area_ada) |> as.numeric()) |> 
  print() |> 
  ggplot() +
  geom_sf(data = ada[ada$ADAUID == "10010032","geometry"]) +
  geom_sf(aes(fill = CFSAUID), legend = FALSE) +
  geom_sf_label(aes(label = scales::label_percent()(area_ratio))) +
  facet_grid(rows = vars(ADAUID), cols = vars(CFSAUID)) +
  guides(fill = "none")
#> Warning: attribute variables are assumed to be spatially constant throughout
#> all geometries
#> Simple feature collection with 2 features and 6 fields
#> Geometry type: GEOMETRY
#> Dimension:     XY
#> Bounding box:  xmin: 8952734 ymin: 2027053 xmax: 9015751 ymax: 2119991
#> Projected CRS: NAD83 / Statistics Canada Lambert
#> # A tibble: 2 × 7
#>   ADAUID   PRUID   area_ada CFSAUID PRUID.1                  geometry area_ratio
#> * <chr>    <chr>      [m^2] <chr>   <chr>              <GEOMETRY [m]>      <dbl>
#> 1 10010032 10        3.49e9 A0A     10      MULTIPOLYGON (((8991045 …      0.890
#> 2 10010032 10        3.49e9 A0B     10      GEOMETRYCOLLECTION (POLY…      0.110

^{Created on 2024-09-23 with reprex v2.1.1}