Search code examples
pythonpandasdataframegeolocationgeopandas

Get the nearest distance with two geodataframe in pandas


Here is my first GeoDataFrame:

!pip install geopandas
import pandas as pd
import geopandas

# Sample input: three cities for the first frame, two for the second.
# NOTE(review): the City1 table printed after this snippet lists Brasilia at
# longitude -47.91, while the literal here is -70.66 -- confirm which one is
# intended.
city1 = [
    {'City': "Buenos Aires", "Country": "Argentina", "Latitude": -34.58, "Longitude": -58.66},
    {'City': "Brasilia", "Country": "Brazil", "Latitude": -15.78, "Longitude": -70.66},
    {'City': "Santiago", "Country": "Chile ", "Latitude": -33.45, "Longitude": -70.66},
]
city2 = [
    {'City': "Bogota", "Country": "Colombia ", "Latitude": 4.60, "Longitude": -74.08},
    {'City': "Caracas", "Country": "Venezuela", "Latitude": 10.48, "Longitude": -66.86},
]
city1df = pd.DataFrame(city1)
city2df = pd.DataFrame(city2)

# points_from_xy receives the Longitude column as x and the Latitude column
# as y, so every geometry is POINT(longitude latitude).
gcity1df = geopandas.GeoDataFrame(
    city1df,
    geometry=geopandas.points_from_xy(city1df["Longitude"], city1df["Latitude"]),
)
gcity2df = geopandas.GeoDataFrame(
    city2df,
    geometry=geopandas.points_from_xy(city2df["Longitude"], city2df["Latitude"]),
)

City1

           City    Country  Latitude  Longitude                     geometry
0  Buenos Aires  Argentina    -34.58     -58.66  POINT (-58.66000 -34.58000)
1      Brasilia     Brazil    -15.78     -47.91  POINT (-47.91000 -15.78000)
2      Santiago      Chile    -33.45     -70.66  POINT (-70.66000 -33.45000)

And my second GeoDataFrame, City2:

         City    Country  Latitude  Longitude                     geometry
1        Bogota   Colombia      4.60     -74.08    POINT (-74.08000 4.60000)
2       Caracas  Venezuela     10.48     -66.86   POINT (-66.86000 10.48000)

I would like a third dataframe that gives, for each city in city1, the nearest city in city2 together with the distance, like this:

           City    Country  Latitude  Longitude                     geometry    Nearest    Distance
0  Buenos Aires  Argentina    -34.58     -58.66  POINT (-58.66000 -34.58000)    Bogota    111 Km

Here is my current solution using GeoDjango and a dict (but it is far too long):

from django.contrib.gis.geos import GEOSGeometry
# For every city in city1, find the closest city in city2.
#   result      -- list of the smallest distance found for each city1 entry
#   dict_result -- maps each city1 name to the name of its nearest city2 entry
result = []
dict_result = {}
for city01 in city1:
    # WKT points are written "POINT(x y)", i.e. longitude first, then latitude.
    pnt = GEOSGeometry(
        f"SRID=4326;POINT({city01['Longitude']} {city01['Latitude']})"
    )
    dist = float("inf")
    nearest_city = None
    for city02 in city2:
        pnt2 = GEOSGeometry(
            f"SRID=4326;POINT({city02['Longitude']} {city02['Latitude']})"
        )
        # GEOS distance() is a planar calculation in the units of the
        # coordinates (degrees here). The * 100 factor is kept from the
        # original code. NOTE(review): this is a scaled degree distance,
        # not true kilometres.
        distance_test = pnt.distance(pnt2) * 100
        if distance_test < dist:
            dist = distance_test
            # Remember WHICH city produced the minimum; the original stored
            # whatever city02 the inner loop happened to end on.
            nearest_city = city02['City']
    result.append(dist)
    dict_result[city01['City']] = nearest_city

Here is what I have tried:

from shapely.ops import nearest_points
# Unary union of the gcity2df geometries: all of its points combined into a
# single geometry, used as the default search target of near().
pts3 = gcity2df.geometry.unary_union
def Euclidean_Dist(df1, df2, cols=None):
    """Return the row-wise Euclidean distance between two frames.

    Args:
        df1, df2: frames with the same number of rows, both containing the
            columns named in ``cols``.
        cols: list of coordinate column names; defaults to
            ``['x_coord', 'y_coord']``.

    Returns:
        A 1-D numpy array with one distance per row.
    """
    # Local import: no numpy import is visible in this snippet, so the
    # original raised NameError on ``np``.
    import numpy as np

    # Build the default per call instead of sharing one mutable list object
    # across all calls.
    if cols is None:
        cols = ['x_coord', 'y_coord']
    return np.linalg.norm(df1[cols].values - df2[cols].values, axis=1)
def near(point, pts=pts3):
    """Return the City value of the gcity2df row closest to ``point``.

    Args:
        point: the geometry to search from.
        pts: geometry to search in; defaults to the union of all gcity2df
            geometries (bound once, when the function is defined).

    Returns:
        The City value of the first gcity2df row whose geometry equals the
        nearest point, as a scalar.
    """
    # nearest_points returns one point per input geometry, in input order;
    # element [1] is therefore the nearest point that lies on ``pts``.
    closest = nearest_points(point, pts)[1]
    is_closest = gcity2df.geometry == closest
    # Take the scalar, not the one-element Series: returning a Series from a
    # row-wise apply produces one column per distinct match index, which
    # cannot be stored in a single 'Nearest' column.
    return gcity2df.loc[is_closest, 'City'].iloc[0]
# Call near() on the geometry of every row and store the result in a new
# 'Nearest' column. The bare name on the last line is presumably there to
# display the frame in a notebook.
gcity1df['Nearest'] = gcity1df.apply(lambda row: near(row.geometry), axis=1)
gcity1df

Here is the result:

    City    Country     Latitude    Longitude   geometry    Nearest
0   Buenos Aires    Argentina   -34.58  -58.66  POINT (-58.66000 -34.58000)     Bogota
1   Brasilia    Brazil  -15.78  -70.66  POINT (-70.66000 -15.78000)     Bogota
2   Santiago    Chile   -33.45  -70.66  POINT (-70.66000 -33.45000)     Bogota

Regards


Solution

  • Firstly, I merge the two data frames with a cross join. Then I compute the distance between each pair of points using Python's map. I use map because most of the time it is much faster than apply, itertuples, iterrows, etc. (Reference: https://stackoverflow.com/a/52674448/8205554)

    Lastly, I group the merged data frame by city and country and keep the row with the minimum distance in each group.

    Here are the libraries,

    import pandas as pd
    import geopandas
    import geopy.distance
    from math import radians, cos, sin, asin, sqrt
    

    Here are the functions used,

    def dist1(p1, p2, *, radius_km=6373):
        """Return the haversine (great-circle) distance between two points.

        Args:
            p1, p2: objects exposing ``x`` (longitude) and ``y`` (latitude),
                both in decimal degrees.
            radius_km: radius of the sphere. The default of 6373 is the value
                the original code hard-coded, so existing results do not
                change.

        Returns:
            The distance in the unit of ``radius_km`` (kilometres by default).
        """
        lon1, lat1, lon2, lat2 = map(radians, [p1.x, p1.y, p2.x, p2.y])

        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        # Floating-point rounding can push sqrt(a) a hair above 1 for
        # (near-)antipodal points, which would make asin() raise ValueError;
        # clamp it to the valid domain.
        c = 2 * asin(min(1.0, sqrt(a)))

        return c * radius_km
    
    def dist2(p1, p2, *, radius_km=6373):
        """Return the haversine (great-circle) distance between two points.

        Args:
            p1, p2: indexable pairs ``(longitude, latitude)`` in decimal
                degrees (index 0 is read as longitude, index 1 as latitude).
            radius_km: radius of the sphere. The default of 6373 is the value
                the original code hard-coded, so existing results do not
                change.

        Returns:
            The distance in the unit of ``radius_km`` (kilometres by default).
        """
        lon1, lat1, lon2, lat2 = map(radians, [p1[0], p1[1], p2[0], p2[1]])

        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
        # Floating-point rounding can push sqrt(a) a hair above 1 for
        # (near-)antipodal points, which would make asin() raise ValueError;
        # clamp it to the valid domain.
        c = 2 * asin(min(1.0, sqrt(a)))

        return c * radius_km
    
    def dist3(p1, p2):
        """Return the geopy geodesic distance between two points, in km.

        ``p1`` and ``p2`` expose ``x`` and ``y`` attributes; each is handed to
        geopy as a ``(y, x)`` pair.
        """
        start = (p1.y, p1.x)
        end = (p2.y, p2.x)
        return geopy.distance.geodesic(start, end).km
    
    def dist4(p1, p2):
        """Return the geopy geodesic distance between two points, in km.

        ``p1`` and ``p2`` are indexable pairs; each is handed to geopy with
        its two elements swapped, i.e. as ``(p[1], p[0])``.
        """
        start = (p1[1], p1[0])
        end = (p2[1], p2[0])
        return geopy.distance.geodesic(start, end).km
    

    And data,

    # Sample records: one dict per city, holding its name, country and
    # coordinates in decimal degrees.
    city1 = [
        {'City': 'Buenos Aires', 'Country': 'Argentina', 'Latitude': -34.58, 'Longitude': -58.66},
        {'City': 'Brasilia', 'Country': 'Brazil', 'Latitude': -15.78, 'Longitude': -70.66},
        {'City': 'Santiago', 'Country': 'Chile ', 'Latitude': -33.45, 'Longitude': -70.66},
    ]

    city2 = [
        {'City': 'Bogota', 'Country': 'Colombia ', 'Latitude': 4.6, 'Longitude': -74.08},
        {'City': 'Caracas', 'Country': 'Venezuela', 'Latitude': 10.48, 'Longitude': -66.86},
    ]

    # One plain DataFrame per list; the dict keys become the columns.
    city1df = pd.DataFrame(data=city1)
    city2df = pd.DataFrame(data=city2)
    

    Cross join with geopandas data frames,

    # Wrap each frame in a GeoDataFrame whose geometry is built from the
    # Longitude (x) and Latitude (y) columns.
    city1_points = geopandas.points_from_xy(city1df.Longitude, city1df.Latitude)
    gcity1df = geopandas.GeoDataFrame(city1df, geometry=city1_points)
    city2_points = geopandas.points_from_xy(city2df.Longitude, city2df.Latitude)
    gcity2df = geopandas.GeoDataFrame(city2df, geometry=city2_points)

    # cross join geopandas: a constant 'key' column on both sides makes the
    # merge pair every row of gcity1df with every row of gcity2df.
    gcity1df['key'] = gcity2df['key'] = 1
    merged = gcity1df.merge(gcity2df, on='key')
    

    math functions and geopandas,

    # 6.64 ms ± 588 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    %%timeit
    
    # find distance: call dist1 on every (geometry_x, geometry_y) pair of the
    # cross-joined frame and store the results in a new 'dist' column
    merged['dist'] = list(map(dist1, merged['geometry_x'], merged['geometry_y']))

    # Output column names: the '_x' columns are renamed back to their plain
    # names, City_y becomes 'Nearest' and 'dist' becomes 'Distance'.
    mapping = {
        'City_x': 'City',
        'Country_x': 'Country',
        'Latitude_x': 'Latitude',
        'Longitude_x': 'Longitude',
        'geometry_x': 'geometry',
        'City_y': 'Nearest',
        'dist': 'Distance'
    }

    # For each (City_x, Country_x) group, idxmin yields the row label of the
    # smallest 'dist'; .loc then selects exactly those rows.
    nearest = merged.loc[merged.groupby(['City_x', 'Country_x'])['dist'].idxmin()]
    # Rename and keep only the mapped columns; the result is not assigned.
    nearest.rename(columns=mapping)[list(mapping.values())]
    
               City    Country  Latitude  Longitude                     geometry  \
    2      Brasilia     Brazil    -15.78     -70.66  POINT (-70.66000 -15.78000)   
    0  Buenos Aires  Argentina    -34.58     -58.66  POINT (-58.66000 -34.58000)   
    4      Santiago     Chile     -33.45     -70.66  POINT (-70.66000 -33.45000)   
    
      Nearest     Distance  
    2  Bogota  2297.922808  
    0  Bogota  4648.004515  
    4  Bogota  4247.586882 
    

    geopy and geopandas,

    # 9.99 ms ± 764 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    %%timeit
    
    # find distance: call dist3 on every (geometry_x, geometry_y) pair of the
    # cross-joined frame and store the results in a new 'dist' column
    merged['dist'] = list(map(dist3, merged['geometry_x'], merged['geometry_y']))

    # Output column names: the '_x' columns are renamed back to their plain
    # names, City_y becomes 'Nearest' and 'dist' becomes 'Distance'.
    mapping = {
        'City_x': 'City',
        'Country_x': 'Country',
        'Latitude_x': 'Latitude',
        'Longitude_x': 'Longitude',
        'geometry_x': 'geometry',
        'City_y': 'Nearest',
        'dist': 'Distance'
    }

    # For each (City_x, Country_x) group, idxmin yields the row label of the
    # smallest 'dist'; .loc then selects exactly those rows.
    nearest = merged.loc[merged.groupby(['City_x', 'Country_x'])['dist'].idxmin()]
    # Rename and keep only the mapped columns; the result is not assigned.
    nearest.rename(columns=mapping)[list(mapping.values())]
    
               City    Country  Latitude  Longitude                     geometry  \
    2      Brasilia     Brazil    -15.78     -70.66  POINT (-70.66000 -15.78000)   
    0  Buenos Aires  Argentina    -34.58     -58.66  POINT (-58.66000 -34.58000)   
    4      Santiago     Chile     -33.45     -70.66  POINT (-70.66000 -33.45000)   
    
      Nearest     Distance  
    2  Bogota  2285.239605  
    0  Bogota  4628.641817  
    4  Bogota  4226.710978 
    

    If you want to use pandas instead of geopandas,

    # cross join pandas: a constant 'key' column on both frames makes the
    # merge pair every row of city1df with every row of city2df.
    city1df['key'] = city2df['key'] = 1
    merged = pd.merge(city1df, city2df, on='key')
    

    With math functions,

    # 8.65 ms ± 2.21 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
    %%timeit
    
    # find distance: call dist2 on every pair of [Longitude, Latitude] rows
    # (the '_x' side and the '_y' side of the cross join) and store the
    # results in a new 'dist' column
    merged['dist'] = list(
        map(
            dist2, 
            merged[['Longitude_x', 'Latitude_x']].values, 
            merged[['Longitude_y', 'Latitude_y']].values
        )
    )

    # Output column names: the '_x' columns are renamed back to their plain
    # names, City_y becomes 'Nearest' and 'dist' becomes 'Distance'.
    mapping = {
        'City_x': 'City',
        'Country_x': 'Country',
        'Latitude_x': 'Latitude',
        'Longitude_x': 'Longitude',
        'City_y': 'Nearest',
        'dist': 'Distance'
    }

    # For each (City_x, Country_x) group, idxmin yields the row label of the
    # smallest 'dist'; .loc then selects exactly those rows.
    nearest = merged.loc[merged.groupby(['City_x', 'Country_x'])['dist'].idxmin()]
    # Rename and keep only the mapped columns; the result is not assigned.
    nearest.rename(columns=mapping)[list(mapping.values())]
    
               City    Country  Latitude  Longitude Nearest     Distance
    2      Brasilia     Brazil    -15.78     -70.66  Bogota  2297.922808
    0  Buenos Aires  Argentina    -34.58     -58.66  Bogota  4648.004515
    4      Santiago     Chile     -33.45     -70.66  Bogota  4247.586882
    

    With geopy,

    # 9.8 ms ± 807 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    %%timeit
    
    # find distance: call dist4 on every pair of [Longitude, Latitude] rows
    # (the '_x' side and the '_y' side of the cross join) and store the
    # results in a new 'dist' column
    merged['dist'] = list(
        map(
            dist4, 
            merged[['Longitude_x', 'Latitude_x']].values, 
            merged[['Longitude_y', 'Latitude_y']].values
        )
    )

    # Output column names: the '_x' columns are renamed back to their plain
    # names, City_y becomes 'Nearest' and 'dist' becomes 'Distance'.
    mapping = {
        'City_x': 'City',
        'Country_x': 'Country',
        'Latitude_x': 'Latitude',
        'Longitude_x': 'Longitude',
        'City_y': 'Nearest',
        'dist': 'Distance'
    }

    # For each (City_x, Country_x) group, idxmin yields the row label of the
    # smallest 'dist'; .loc then selects exactly those rows.
    nearest = merged.loc[merged.groupby(['City_x', 'Country_x'])['dist'].idxmin()]
    # Rename and keep only the mapped columns; the result is not assigned.
    nearest.rename(columns=mapping)[list(mapping.values())]
    
               City    Country  Latitude  Longitude Nearest     Distance
    2      Brasilia     Brazil    -15.78     -70.66  Bogota  2285.239605
    0  Buenos Aires  Argentina    -34.58     -58.66  Bogota  4628.641817
    4      Santiago     Chile     -33.45     -70.66  Bogota  4226.710978