I have a table with polygons and district names, and data on purchases with exact longitude and latitude. I wrote a function that, for every coordinate pair, checks which polygon contains it and then assigns that district's name to the purchase. The problem is that it is very slow due to the nested for-loops and lack of vectorization (thanks, pandas). How can I optimize it so it can digest 10+ million rows in reasonable time?
import pandas as pd
import shapely.geometry
from tqdm import tqdm

def get_district_name(geo_df: pd.DataFrame, ship_df: pd.DataFrame, col_name: str, frac: float = 0.65) -> pd.DataFrame:
    sample_ship = ship_df.sample(frac=frac, replace=False, random_state=42).reset_index(drop=True)
    sample_ship['municipal_district_name'] = ''
    for i in tqdm(range(len(sample_ship))):
        # one Point object per row
        point = shapely.geometry.Point(sample_ship['address_longitude'][i], sample_ship['address_latitude'][i])
        for j in range(len(geo_df)):  # linear scan over every polygon
            if point.within(geo_df.geometry[j]):
                sample_ship.loc[i, 'municipal_district_name'] = geo_df[col_name][j]
                break  # first matching polygon wins
    return sample_ship
You could use geopandas.sjoin with predicate="within", but note that a spatial join on 10+ million rows still isn't going to be fast.
import geopandas as gpd
import pandas as pd

def get_district_name(
    geo_df: gpd.GeoDataFrame,
    ship_df: pd.DataFrame,
    col_name: str,
    frac: float = 0.65,
) -> pd.DataFrame:
    ship_sample_df = ship_df.sample(
        frac=frac, replace=False, random_state=42,
    )
    # Build all point geometries at once instead of one per loop iteration
    sample_ship_gdf = gpd.GeoDataFrame(
        ship_sample_df,
        geometry=gpd.points_from_xy(
            ship_sample_df["address_longitude"],
            ship_sample_df["address_latitude"],
        ),
        crs=geo_df.crs,  # assuming the coordinates use the same CRS as the polygons
    )
    # Join each point to the polygon containing it; unmatched points
    # are kept (how="left") and get NaN district names
    return gpd.sjoin(
        sample_ship_gdf,
        geo_df,
        predicate="within",
        how="left",
    )[list(ship_df) + [col_name]]
Output:
>>> get_district_name(gdf_districts, df_purchases, "municipal_district_name", 1)
address_longitude address_latitude municipal_district_name
1 1.00 2.00 district_2
4 1.50 1.60 district_2
2 0.70 3.00 NaN
0 3.00 2.50 NaN
3 0.20 0.30 district_1
[5 rows x 3 columns]
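If the join itself becomes the bottleneck at 10+ million rows, you can also drop down to shapely 2.x's STRtree and skip the GeoDataFrame machinery. A minimal sketch, assuming the same column names as above and that points and polygons share one CRS (assign_districts is a hypothetical helper, not part of the original code):

import numpy as np
import shapely

def assign_districts(df, geo_df, col_name):
    # Vectorized point construction: one call for all rows
    points = shapely.points(
        df["address_longitude"].to_numpy(),
        df["address_latitude"].to_numpy(),
    )
    # Spatial index over the polygons; query all points at once
    tree = shapely.STRtree(geo_df.geometry.to_numpy())
    pt_idx, poly_idx = tree.query(points, predicate="within")
    # Points outside every polygon stay NaN, mirroring the left join above
    names = np.full(len(points), np.nan, dtype=object)
    names[pt_idx] = geo_df[col_name].to_numpy()[poly_idx]
    df = df.copy()
    df["municipal_district_name"] = names
    return df

This is roughly what sjoin does under the hood, so the speedup over the geopandas version is usually modest; at this scale the bigger wins tend to come from processing the points in chunks to keep memory bounded.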