I have a table with polygons and district names, and data on purchases with exact longitude and latitude. I wrote a function that, for every coordinate pair, checks which polygon contains it and then assigns that district's name to the purchase. The problem is that it is very slow due to the nested for-loops and lack of vectorization (thanks, pandas). How can I optimize it so it can digest 10+ million rows in reasonable time?
import pandas as pd
import shapely.geometry
from tqdm import tqdm

def get_district_name(geo_df: pd.DataFrame, ship_df: pd.DataFrame, col_name: str, frac: float = 0.65) -> pd.DataFrame:
    sample_ship = ship_df.sample(frac=frac, replace=False, random_state=42).reset_index(drop=True)
    sample_ship['municipal_district_name'] = ''
    for i in tqdm(range(len(sample_ship))):
        # one Point object per row
        point = shapely.geometry.Point(sample_ship['address_longitude'][i], sample_ship['address_latitude'][i])
        for j in range(len(geo_df)):  # linear scan over every polygon
            if point.within(geo_df.geometry[j]):
                sample_ship.loc[i, 'municipal_district_name'] = geo_df[col_name][j]
                break  # first matching polygon wins
    return sample_ship
You could use geopandas.sjoin with predicate="within", but note that a spatial join on 10+ million rows still isn't going to be fast.
import geopandas as gpd
import pandas as pd

def get_district_name(
    geo_df: gpd.GeoDataFrame,
    ship_df: pd.DataFrame,
    col_name: str,
    frac: float = 0.65,
) -> pd.DataFrame:
    ship_sample_df = ship_df.sample(
        frac=frac, replace=False, random_state=42,
    )
    # Build all point geometries at once instead of one per loop iteration
    sample_ship_gdf = gpd.GeoDataFrame(
        ship_sample_df,
        geometry=gpd.points_from_xy(
            ship_sample_df["address_longitude"],
            ship_sample_df["address_latitude"],
        ),
        crs=geo_df.crs,  # assuming the coordinates use the same CRS as the polygons
    )
    # Join each point to the polygon containing it; unmatched points
    # are kept (how="left") and get NaN district names
    return gpd.sjoin(
        sample_ship_gdf,
        geo_df,
        predicate="within",
        how="left",
    )[list(ship_df) + [col_name]]
Output:
>>> get_district_name(gdf_districts, df_purchases, "municipal_district_name", 1)
address_longitude address_latitude municipal_district_name
1 1.00 2.00 district_2
4 1.50 1.60 district_2
2 0.70 3.00 NaN
0 3.00 2.50 NaN
3 0.20 0.30 district_1
[5 rows x 3 columns]
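If the join itself becomes the bottleneck at 10+ million rows, you can also drop down to shapely 2.x's STRtree and skip the GeoDataFrame machinery. A minimal sketch, assuming the same column names as above and that points and polygons share one CRS (assign_districts is a hypothetical helper, not part of the original code):

import numpy as np
import shapely

def assign_districts(df, geo_df, col_name):
    # Vectorized point construction: one call for all rows
    points = shapely.points(
        df["address_longitude"].to_numpy(),
        df["address_latitude"].to_numpy(),
    )
    # Spatial index over the polygons; query all points at once
    tree = shapely.STRtree(geo_df.geometry.to_numpy())
    pt_idx, poly_idx = tree.query(points, predicate="within")
    # Points outside every polygon stay NaN, mirroring the left join above
    names = np.full(len(points), np.nan, dtype=object)
    names[pt_idx] = geo_df[col_name].to_numpy()[poly_idx]
    df = df.copy()
    df["municipal_district_name"] = names
    return df

This is roughly what sjoin does under the hood, so the speedup over the geopandas version is usually modest; at this scale the bigger wins tend to come from processing the points in chunks to keep memory bounded.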