What I have is a list of city + state locations like this:
import pandas as pd
# Reference cities, one [name, latitude, longitude] record per entry.
city_state = [
    ["Austin, Texas", 30.264265060424805, -97.74750518798828],
    ["San Marcos, Texas", 29.882080, -97.939987],
    ["Denver, Colorado", 39.7392364, -104.984862],
]
# Tabular view of the reference cities with named columns.
df1 = pd.DataFrame(
    data=city_state,
    columns=["Location", "Latitude", "Longitude"],
)
I also have a CSV file of locations that I read in as a DataFrame; it looks like this:
# Candidate locations to distribute, one [name, latitude, longitude] record each.
data = [
    ["San Antonio, Texas", 29.425171, -98.494614],
    ["Oklahoma City, Oklahoma", 35.468491, -97.521263],
    ["Fort Collins, Colorado", 40.588970, -105.082458],
    ["Salt Lake City, Utah", 40.758480, -111.888138],
    ["Springfield, Massachusetts", 42.102051, -72.585762],
    ["Hartford, Connecticut", 41.764582, -72.6908547],
]
# Tabular view of the candidate locations with named columns.
df2 = pd.DataFrame(
    data=data,
    columns=["Location", "Latitude", "Longitude"],
)
What I want to do is divide all of these locations among the entries of the "city_state" list, so that there are no duplicates and each "city_state" element gets a list of the "locations" that are closest to it. I would also like to set a maximum radius for how far a "location" can be from a "city_state" element. For example, if a "location" row is more than 100 miles away from every "city_state" element, it should be excluded.
So in short I want the result to be something like this:
{
'Austin, Texas': [...],
'San Marcos, Texas': ['San Antonio, Texas', ...],
'Denver, Colorado': ['Fort Collins, Colorado', ...]
}
What I have found so far is how to match only the single closest location to each "city_state" element, but I want to divide the whole list of locations among the "city_state" elements, subject to a maximum radius limit.
If my explanations were unclear or something is missing please let me know.
**Edit:** I did find a solution that avoids assigning the same df2 location to more than one df1 location, but I don't think it's very clean, and it could probably be improved.
What I have:
def calculate_distance(lat1, lon1, lat2, lon2, radius=3958.8):
    """Return the great-circle distance between two points (haversine formula).

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.
        radius: Radius of the sphere; the result is expressed in the same
            unit. Defaults to 3958.8, the radius of the Earth in miles.

    Returns:
        The distance along the surface of the sphere, as a float.
    """
    # Imported locally so the function works even when the surrounding file
    # has not imported the math helpers it needs.
    import math

    # Convert coordinates to radians
    lat1, lon1, lat2, lon2 = map(math.radians, (lat1, lon1, lat2, lon2))

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    )
    # Rounding can push `a` a hair above 1 for (near-)antipodal points, which
    # would make sqrt(1 - a) raise ValueError; clamp it to the valid range.
    a = min(1.0, a)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * c
# Maximum distance (in the unit returned by calculate_distance) at which a
# df2 location may still be assigned to a df1 city.
max_radius = 100

# Parse the df1 coordinates once instead of once per df2 row.
city_coords = [
    (city_row['Location'], float(city_row['Latitude']), float(city_row['Longitude']))
    for _, city_row in df1.iterrows()
]

# location name -> smallest in-range distance found so far
closest_locations_x = {}
# location name -> the df1 city that smallest distance belongs to
nearest_city = {}

for _, location_row in df2.iterrows():
    # A row is unusable when *either* coordinate is missing; testing with
    # `or` on the "!=" checks let half-missing rows through to float().
    if location_row['Latitude'] == 'Not Found' or location_row['Longitude'] == 'Not Found':
        continue

    location_name = location_row['Location']
    location_lat = float(location_row['Latitude'])
    location_lon = float(location_row['Longitude'])

    for city_name, city_lat, city_lon in city_coords:
        dist = calculate_distance(city_lat, city_lon, location_lat, location_lon)
        if dist > max_radius:
            continue
        # Strict "<" keeps the first df1 city on ties, so the assignment
        # does not depend on anything but the df1 row order.
        if location_name not in closest_locations_x or dist < closest_locations_x[location_name]:
            closest_locations_x[location_name] = dist
            nearest_city[location_name] = city_name

# Every df1 city gets a key, even when no location was assigned to it.
result = {city_name: [] for city_name, _, _ in city_coords}
for location_name, city_name in nearest_city.items():
    result[city_name].append(location_name)
result:
{'Austin, Texas': [], 'San Marcos, Texas': ['San Antonio, Texas'], 'Denver, Colorado': ['Fort Collins, Colorado']}
Considering the dataframes you provided, here is one way to do it with GeoPy's `distance` function, Python's built-in `zip` function, and the `defaultdict` class from the standard library's `collections` module:
from geopy import distance
from collections import defaultdict
# For each df2 location, record the distance in miles to every df1 city that
# lies within max_radius. Locations with no city in range keep an empty dict.
distances_between_cities = {}
for location, loc_lat, loc_lon in zip(
    df2["Location"], df2["Latitude"], df2["Longitude"]
):
    in_range = {}
    distances_between_cities[location] = in_range
    # The loop variable is deliberately not called `city_state`, so it does
    # not overwrite a module-level list of that name.
    for hub, hub_lat, hub_lon in zip(
        df1["Location"], df1["Latitude"], df1["Longitude"]
    ):
        d = distance.distance((hub_lat, hub_lon), (loc_lat, loc_lon)).miles
        if d <= max_radius:
            in_range[hub] = d

# Remove duplicates: keep only the nearest df1 city for each location and
# drop locations that have no df1 city within max_radius.
distances_between_cities = {
    location: min(in_range, key=in_range.get)
    for location, in_range in distances_between_cities.items()
    if in_range
}

# Invert the mapping: df1 city -> list of the df2 locations assigned to it.
# Cities that received no location do not appear in the result.
closest_locations = defaultdict(list)
for location, hub in distances_between_cities.items():
    closest_locations[hub].append(location)
closest_locations = dict(closest_locations)
Then:
print(closest_locations)
# Output
{'Denver, Colorado': ['Fort Collins, Colorado'],
'San Marcos, Texas': ['San Antonio, Texas']}