Tags: python, pandas, dask, geopandas, dask-dataframe

Checking area of overlap between two dataframes with parallel dask GeoPandas


I have two GeoDataFrames: one contains square polygons forming a large grid, while the other contains larger, and fewer, polygons. I wish to calculate the area of overlap between each grid square and the larger polygons.

To do so, I wrote a simple loop:

for _, patch in tqdm(layer.iterrows(), total=layer.shape[0], desc=name):
    # Index of intersecting squares
    idx = joined.intersects(patch.geometry)
    intersection_polygon = joined[idx].intersection(patch.geometry)
    area_of_intersection = intersection_polygon.area
    joined.loc[idx, "value"] += area_of_intersection

In an attempt to speed this up, I converted the layer GeoDataFrame, which contains the larger patches, to a Dask-GeoPandas DataFrame.

I implemented it as follows:

def multi_area(patch, joined=None):
    # Index of intersecting squares
    idx = joined.intersects(patch.geometry)
    intersection_polygon = joined[idx].intersection(patch.geometry)
    area_of_intersection = intersection_polygon.area
    joined.loc[idx, "value"] += area_of_intersection
    return joined["value"]

layer_dask = dask_geopandas.from_geopandas(layer, npartitions=8)

with ProgressBar():
    joined["value"] = layer_dask.apply(multi_area, meta=joined, joined=joined, axis=1).compute(scheduler='multiprocessing')

This, however, raises `AttributeError: 'GeoDataFrame' object has no attribute 'name'`, and at this point I am unsure whether this is the optimal approach and what I am doing wrong.
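
From what I understand, `meta` should describe the structure of the function's per-row output, not the full `joined` frame, so perhaps something like this would be closer (sketch only, assuming I only need each patch's total overlap area as a float):

def overlap_area(patch, joined=None):
    # Total intersection area between this patch and all grid squares
    idx = joined.intersects(patch.geometry)
    return joined[idx].intersection(patch.geometry).area.sum()

# meta is a (name, dtype) tuple describing the per-row output
areas = layer_dask.apply(overlap_area, joined=joined, axis=1, meta=("area", "f8"))

In-place updates to `joined` inside the function presumably would not propagate back with the multiprocessing scheduler anyway, since each worker operates on its own copy.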

The full job will involve 400 million grid squares, so I am planning to batch the calculation over smaller areas later, as I can't come up with a smarter way of doing it...
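
Something like this is what I have in mind for the batching (sketch only; the tile count is arbitrary), using the `.cx` bounding-box indexer to slice the grid:

import numpy as np

def iter_tiles(grid, n=10):
    # Split the grid's total bounds into an n x n set of bounding-box tiles
    xmin, ymin, xmax, ymax = grid.total_bounds
    xs = np.linspace(xmin, xmax, n + 1)
    ys = np.linspace(ymin, ymax, n + 1)
    for x0, x1 in zip(xs[:-1], xs[1:]):
        for y0, y1 in zip(ys[:-1], ys[1:]):
            tile = grid.cx[x0:x1, y0:y1]  # slice grid squares by bounding box
            if not tile.empty:
                yield tile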


Solution

  • I managed to speed up the process considerably using spatial joins and overlay, as suggested by Michael in the comments. In addition, I used Dask DataFrames, so the final code becomes:

    import dask_geopandas as dg
    import geopandas as gpd
    import numpy as np
    import rasterio
    import shapely
    import xarray as xr
    from geocube.api.core import make_geocube
    from rasterio import features
    from shapely.geometry import Polygon
    
    def dissolve_shuffle(ddf, by=None, **kwargs):
        """Dissolve by shuffling each group into a single partition, then dissolving the partitions independently."""
        meta = ddf._meta.dissolve(by=by, as_index=False, **kwargs)
    
        shuffled = ddf.shuffle(
            by, npartitions=ddf.npartitions, shuffle="tasks", ignore_index=True
        )
    
        return shuffled.map_partitions(
            gpd.GeoDataFrame.dissolve, by=by, as_index=False, meta=meta, **kwargs
        )
    
    
    def calculate_area_overlap_dask(
        df_grid,
        layer,
        nthreads=8,
    ) -> gpd.GeoDataFrame:
        """This function calculates the area of overlap in each grid cell for a given map-layer
        """
    
        layer = layer[["geometry"]]
        df_grid = df_grid[["geometry"]]
    
        # Split up the layer using the grid
        _overlay = gpd.overlay(layer, df_grid, how="intersection")
        
        # Convert the overlay to a dask geopandas dataframe and calculate the area of each new polygon
        _overlay = dg.from_geopandas(_overlay, npartitions=nthreads)
        _overlay["area"] = _overlay.area
        _overlay = _overlay.compute()
        
        # Convert the grid to a dask geopandas dataframe and spatial join all split layer polygons to corresponding grid cells
        df_grid = dg.from_geopandas(df_grid, npartitions=nthreads)
        joined = dg.sjoin(df_grid, _overlay, how="inner").reset_index()
    
        # Faster dissolve of area within each grid cell
        scored_grid = dissolve_shuffle(
            joined,
            "index",
        )
        scored_grid = scored_grid.compute()
        return scored_grid
    
    def polygon_to_grid(name: str, gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """Convert a geodataframe to a grid of polygon cells by rasterizing it and vectorizing the raster back into polygons."""
      
        gdf["value"] = range(len(gdf.index))
    
        # Rasterize the polygons
        out_grid: xr.Dataset = make_geocube(
            vector_data=gdf,
            measurements=["value"],
            resolution=(-100, 100),
            fill=np.nan,
        )
    
        # Re-number valid cells consecutively and mark nodata cells with -9999
        vals: np.ndarray = out_grid.value.values
        vals[~np.isnan(vals)] = np.arange(len(vals[~np.isnan(vals)]), dtype=np.int32)
        vals[np.isnan(vals)] = -9999
        out_grid.value.values = vals
        out_grid.rio.to_raster(f"{name}_raster.tif")
       
        # Read the saved raster back
        src = rasterio.open(f"{name}_raster.tif")
        r = src.read(1).astype(np.int32)
    
        # Vectorize raster cells back to polygons; features.shapes yields
        # (geometry mapping, value) pairs
        shapes = features.shapes(r, mask=r != -9999, transform=src.transform)
        geom: list[Polygon] = [shapely.geometry.shape(s) for s, _ in shapes]
    
        # Convert to a geodataframe, keeping the raster's CRS
        grid = gpd.GeoDataFrame(
            geometry=gpd.GeoSeries(geom, crs=src.crs),
        )
        return grid
    
    if __name__ == "__main__":
        area = gpd.read_file("some_area.shp")
        layer = gpd.read_file("some_map_layer.shp")
        area_grid = polygon_to_grid("area", area)
        grid_evaluated = calculate_area_overlap_dask(area_grid, layer)
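
    One caveat: `GeoDataFrame.dissolve` aggregates data columns with `aggfunc="first"` by default, so the dissolved grid keeps only the first `area` value per cell. If the total overlap area per cell is wanted instead, `dissolve_shuffle` already forwards keyword arguments to `dissolve`, so passing `aggfunc="sum"` should do it (untested sketch):

        scored_grid = dissolve_shuffle(joined, "index", aggfunc="sum")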
    
    

    This mess ended up working, but it was very prone to memory issues with large datasets, so I opted for a solution that was less precise but much faster.