I have two different GeoDataFrames: one contains polygon squares in a large grid; the other contains larger, and fewer, polygons. I wish to calculate the area of overlap between each grid square and the larger polygons.
To do so, I wrote a simple loop:
for _, patch in tqdm(layer.iterrows(), total=layer.shape[0], desc=name):
    # Boolean mask of grid squares that intersect this patch
    idx = joined.intersects(patch.geometry)
    # Clip the intersecting squares to the patch and accumulate the overlap area
    intersection_polygon = joined[idx].intersection(patch.geometry)
    area_of_intersection = intersection_polygon.area
    joined.loc[idx, "value"] += area_of_intersection
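For reference, here is a minimal setup the loop above assumes; the frames below are illustrative stand-ins for my real data (joined holds the grid squares plus a running "value" column, layer the larger patches):

import geopandas as gpd
from shapely.geometry import box
from tqdm import tqdm

name = "example"
# A 10 x 10 grid of unit squares with a running overlap total per square
joined = gpd.GeoDataFrame(
    geometry=[box(x, y, x + 1, y + 1) for x in range(10) for y in range(10)],
    crs="EPSG:3857",
)
joined["value"] = 0.0
# Two larger patches overlapping the grid
layer = gpd.GeoDataFrame(
    geometry=[box(1.5, 1.5, 4.5, 4.5), box(5.2, 0.8, 8.0, 3.3)],
    crs="EPSG:3857",
)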
In an attempt to speed up this method, I converted the layer GeoDataFrame, which contains the larger patches, to a Dask GeoDataFrame. I implemented it the following way:
def multi_area(patch, joined=None):
    # Boolean mask of intersecting squares
    idx = joined.intersects(patch.geometry)
    intersection_polygon = joined[idx].intersection(patch.geometry)
    area_of_intersection = intersection_polygon.area
    joined.loc[idx, "value"] += area_of_intersection
    return joined["value"]
layer_dask = dask_geopandas.from_geopandas(layer, npartitions=8)
with ProgressBar():
    joined["value"] = layer_dask.apply(
        multi_area, meta=joined, joined=joined, axis=1
    ).compute(scheduler="multiprocessing")
This, however, returns the error AttributeError: 'GeoDataFrame' object has no attribute 'name', and at this point I am unsure whether this is the optimal approach at all, or what I am doing wrong.
The full job will cover 400 million grid squares, so I am planning on batching this calculation out over smaller areas later, as I can't come up with a smarter way of doing it...
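For context, the kind of batching I have in mind is tile-based: select the rows falling in one bounding box at a time and run the calculation per tile. A rough sketch (iter_tiles and the 10 km tile size are illustrative, not part of my real code):

import numpy as np

def iter_tiles(bounds, tile_size):
    # Yield square bounding boxes covering (minx, miny, maxx, maxy)
    minx, miny, maxx, maxy = bounds
    for x in np.arange(minx, maxx, tile_size):
        for y in np.arange(miny, maxy, tile_size):
            yield (x, y, x + tile_size, y + tile_size)

for tile in iter_tiles(joined.total_bounds, tile_size=10_000):
    # .cx selects rows whose geometry intersects the bounding box
    grid_tile = joined.cx[tile[0]:tile[2], tile[1]:tile[3]]
    layer_tile = layer.cx[tile[0]:tile[2], tile[1]:tile[3]]
    # ...run the overlap calculation on the much smaller tile frames...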
I managed to speed up the process quite a bit using spatial joins and overlay, as suggested by Michael in the comments. In addition, I implemented Dask DataFrames, so the final code becomes:
import dask_geopandas as dg
import geopandas as gpd
import numpy as np
import rasterio
import shapely.geometry
import xarray as xr
from geocube.api.core import make_geocube
from rasterio import features
from shapely.geometry import Polygon
def dissolve_shuffle(ddf, by=None, **kwargs):
    """Shuffle the partitions by the dissolve key, then dissolve each partition.

    Shuffling first guarantees that all rows sharing a key land in the same
    partition, so a plain per-partition dissolve yields the correct result.
    """
    meta = ddf._meta.dissolve(by=by, as_index=False, **kwargs)
    shuffled = ddf.shuffle(
        by, npartitions=ddf.npartitions, shuffle="tasks", ignore_index=True
    )
    return shuffled.map_partitions(
        gpd.GeoDataFrame.dissolve, by=by, as_index=False, meta=meta, **kwargs
    )
def calculate_area_overlap_dask(
    df_grid,
    layer,
    nthreads=8,
) -> gpd.GeoDataFrame:
    """Calculate the area of overlap in each grid cell for a given map layer."""
    layer = layer[["geometry"]]
    df_grid = df_grid[["geometry"]]

    # Split the layer polygons along the grid cell boundaries
    _overlay = gpd.overlay(layer, df_grid, how="intersection")

    # Compute the area of each split polygon in parallel
    _overlay = dg.from_geopandas(_overlay, npartitions=nthreads)
    _overlay["area"] = _overlay.area
    _overlay = _overlay.compute()

    # Spatially join the split polygons back onto their grid cells
    df_grid = dg.from_geopandas(df_grid, npartitions=nthreads)
    joined = dg.sjoin(df_grid, _overlay, how="inner").reset_index()

    # Faster dissolve back to one row per grid cell, summing the piece areas
    # (the default aggfunc "first" would keep only one piece per cell when
    # several layer polygons hit the same cell)
    scored_grid = dissolve_shuffle(
        joined,
        "index",
        aggfunc="sum",
    )
    scored_grid = scored_grid.compute()
    return scored_grid
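# Note: everything above assumes df_grid and layer share a projected CRS,
# so that .area is in the CRS unit squared (e.g. square metres). A quick check:
#   assert df_grid.crs == layer.crs and df_grid.crs.is_projected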
def polygon_to_grid(name: str, gdf) -> gpd.GeoDataFrame:
    """Convert a geodataframe of polygons to a grid of polygons."""
    gdf["value"] = range(len(gdf.index))

    # Rasterize the polygons onto a 100 m grid
    out_grid: xr.Dataset = make_geocube(
        vector_data=gdf,
        measurements=["value"],
        resolution=(-100, 100),
        fill=np.nan,
    )

    # Give every valid cell a unique id and mark nodata cells
    vals: np.ndarray = out_grid.value.values
    vals[~np.isnan(vals)] = np.arange(len(vals[~np.isnan(vals)]), dtype=np.int32)
    vals[np.isnan(vals)] = -9999
    out_grid.value.values = vals
    out_grid.rio.to_raster(f"{name}_raster.tif")

    # Read the saved raster back and polygonize each cell
    with rasterio.open(f"{name}_raster.tif") as src:
        r = src.read(1).astype(np.int32)
        shapes = features.shapes(r, mask=r != -9999, transform=src.transform)
        geom: list[Polygon] = [shapely.geometry.shape(s) for s, _ in shapes]

    # Convert to a geodataframe, keeping the source CRS so later overlays line up
    grid = gpd.GeoDataFrame(geometry=gpd.GeoSeries(geom), crs=gdf.crs)
    return grid
if __name__ == "__main__":
    area = gpd.read_file("some_area.shp")
    layer = gpd.read_file("some_map_layer.shp")

    area_grid = polygon_to_grid("area", area)
    grid_evaluated = calculate_area_overlap_dask(area_grid, layer)
This mess ended up working, but it was very prone to memory issues with large datasets, so I opted for a solution that was less precise but much faster.
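To illustrate the kind of precision-for-speed trade-off I mean (an illustrative sketch, not the exact solution I used): rasterize the layer at a sub-cell resolution and count covered sub-pixels per grid cell, which avoids polygon overlays entirely. The function name and the cell/sub sizes below are assumptions:

import geopandas as gpd
import numpy as np
from rasterio import features, transform

def approx_overlap_area(layer: gpd.GeoDataFrame, bounds, cell=100, sub=10):
    minx, miny, maxx, maxy = bounds
    width = int((maxx - minx) / sub)
    height = int((maxy - miny) / sub)
    tfm = transform.from_origin(minx, maxy, sub, sub)
    # 1 where a sub-pixel centre falls inside a layer polygon, else 0
    burned = features.rasterize(
        ((geom, 1) for geom in layer.geometry),
        out_shape=(height, width),
        transform=tfm,
        fill=0,
        dtype="uint8",
    )
    # Sum k x k blocks of sub-pixels -> covered sub-pixels per grid cell
    k = cell // sub
    blocks = burned[: height // k * k, : width // k * k]
    blocks = blocks.reshape(height // k, k, width // k, k).sum(axis=(1, 3))
    # Approximate overlap area per cell, in CRS units squared
    return blocks * (sub * sub)

With sub=10 the per-cell overlap is quantized in steps of sub² = 100 m², which is the sort of precision loss traded away for speed.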