python python-3.x pandas dataframe geopy

How do I find the closest locations for each location in a list from a DataFrame of unique locations?

What I have is a list of city + state locations like this:

import pandas as pd

# Reference cities: one [name, latitude, longitude] record per entry
city_state = [
    ['Austin, Texas', 30.264265060424805, -97.74750518798828],
    ['San Marcos, Texas', 29.882080, -97.939987],
    ['Denver, Colorado', 39.7392364, -104.984862],
]
df1 = pd.DataFrame(
    data=city_state,
    columns=['Location', 'Latitude', 'Longitude'],
)

I also have a CSV file that I read in as a DataFrame of locations that looks like this:

# Candidate locations: one [name, latitude, longitude] record per entry
data = [
    ['San Antonio, Texas', 29.425171, -98.494614],
    ['Oklahoma City, Oklahoma', 35.468491, -97.521263],
    ['Fort Collins, Colorado', 40.588970, -105.082458],
    ['Salt Lake City, Utah', 40.758480, -111.888138],
    ['Springfield, Massachusetts', 42.102051, -72.585762],
    ['Hartford, Connecticut', 41.764582, -72.6908547],
]
df2 = pd.DataFrame(
    data=data,
    columns=['Location', 'Latitude', 'Longitude'],
)

What I want to do is divide all these locations among the entries of the city_state list so that there are no duplicates and each "city_state" element has a list of the "locations" that are closest to it. I would also like to set a maximum radius for how far a "location" can be from a "city_state" element. For example, if a "location" row is more than 100 miles away from every "city_state" element, it will be excluded.

So in short I want the result to be something like this:

{
    'Austin, Texas': [...], 
    'San Marcos, Texas': ['San Antonio, Texas', ...], 
    'Denver, Colorado': ['Fort Collins, Colorado', ...]
}

What I have found so far is how to match only the single closest location to each "city_state" element, but I want to divide the whole list of locations among the "city_state" elements, subject to a maximum radius limit.

If my explanations were unclear or something is missing please let me know.

**Edit:** I did find a solution that avoids assigning the same df2 location to more than one df1 location, but I don't think it's very clean and it could probably be improved.

What I have:

def calculate_distance(lat1, lon1, lat2, lon2, radius=3958.8):
    """Return the great-circle distance between two points (haversine formula).

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.
        radius: Radius of the sphere. The default, 3958.8, is the Earth's
            radius in miles, so the result is in miles; pass a radius in
            another unit to get the distance in that unit.

    Returns:
        The distance along the sphere's surface, in the unit of ``radius``.
    """
    # Imported locally so the function does not depend on module-level
    # imports being present.
    from math import atan2, cos, radians, sin, sqrt

    # Convert coordinates to radians
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return radius * c

# Maximum distance (in miles) a df2 location may be from a df1 city and still
# be assigned to it.
max_radius = 100

# Every df1 city gets a key, even when no location ends up assigned to it.
result = {city_name: [] for city_name in df1['Location']}
# Distance from each assigned df2 location to the df1 city it was assigned to.
closest_locations_x = {}

# Convert the df1 coordinates once instead of once per df2 row.
cities = [
    (city_name, float(city_lat), float(city_lon))
    for city_name, city_lat, city_lon in zip(
        df1['Location'], df1['Latitude'], df1['Longitude']
    )
]

# Assumes location names in df2 are unique; a repeated name would be listed
# once per row.
for location_name, raw_lat, raw_lon in zip(
    df2['Location'], df2['Latitude'], df2['Longitude']
):
    # Skip rows where EITHER coordinate is missing; converting the
    # 'Not Found' placeholder with float() would raise ValueError.
    if raw_lat == 'Not Found' or raw_lon == 'Not Found':
        continue
    location_lat = float(raw_lat)
    location_lon = float(raw_lon)

    # Find the nearest df1 city within max_radius. The strict `<` keeps the
    # first city (in df1 order) when two cities are equally close.
    best_city = None
    best_distance = None
    for city_name, city_lat, city_lon in cities:
        distance = calculate_distance(city_lat, city_lon, location_lat, location_lon)
        if distance <= max_radius and (best_distance is None or distance < best_distance):
            best_city = city_name
            best_distance = distance

    # Locations farther than max_radius from every city are left out.
    if best_city is not None:
        result[best_city].append(location_name)
        closest_locations_x[location_name] = best_distance

result:

{'Austin, Texas': [], 'San Marcos, Texas': ['San Antonio, Texas'], 'Denver, Colorado': ['Fort Collins, Colorado']}

Solution

Considering the dataframes you provided, here is one way to do it with GeoPy distance function, Python built-in zip function and defaultdict class from the standard library's collections module:

from geopy import distance
from collections import defaultdict

# For every df2 location, record the distance in miles to each df1 city
# that lies within max_radius of it
distances_between_cities = {}
reference_cities = list(zip(df1["Location"], df1["Latitude"], df1["Longitude"]))
for location, location_lat, location_lon in zip(
    df2["Location"], df2["Latitude"], df2["Longitude"]
):
    in_range = {}
    for reference_city, lat, lon in reference_cities:
        miles = distance.distance((lat, lon), (location_lat, location_lon)).miles
        if miles <= max_radius:
            in_range[reference_city] = miles
    distances_between_cities[location] = in_range

# Remove duplicates: keep only the nearest df1 city for each location and
# drop locations that have no df1 city within range
nearest_city = {}
for location, candidates in distances_between_cities.items():
    if candidates:
        nearest_city[location] = min(candidates, key=candidates.get)
distances_between_cities = nearest_city

# Invert the mapping: df1 cities become the keys and the values are the
# lists of locations assigned to them
closest_locations = defaultdict(list)
for location, reference_city in distances_between_cities.items():
    closest_locations[reference_city].append(location)
closest_locations = dict(closest_locations)

Then:

# Show which locations were grouped under each df1 city
print(closest_locations)
# Output

{'Denver, Colorado': ['Fort Collins, Colorado'],
 'San Marcos, Texas': ['San Antonio, Texas']}