Search code examples
Tags: python, pandas, numpy, dataframe, h3

pandas sum of value for nearest matching neighbors in area


I have two data frames.

One is structured like:

id,value

the second one:

id, neighbor_1, neighbor_2, neighbor_3, neighbor_4, neighbor_5, ...

Now, for each id in the neighborhood data frame, I want to look up the value of every one of its neighbors, sum those values, and so find the id whose neighborhood has the largest aggregated value.

import pandas as pd
from h3 import h3

# k-ring radius: include every hexagon within grid distance k of a cell.
k = 1

# Sample data: x/y coordinates, a value to aggregate, and an H3 hexagon id.
df = pd.DataFrame({'x': {0: 16,
  1: 17,
  2: 18,
  3: 19,
  4: 20},
 'y': {0: 48,
  1: 49,
  2: 50,
  3: 51,
  4: 52},
 'value': {0: 2.0, 1: 4.0, 2: 100.0, 3: 40.0, 4: 500.0},
 'id': {0: '891e15b706bffff',
  1: '891e15b738fffff',
  2: '891e15b714fffff',
  3: '891e15b44c3ffff',
  4: '891e15b448bffff'}})

display(df)

# One row per hexagon (index = its id), one column per member of its k-ring.
# Building the Series directly avoids assigning an index onto the df[['id']]
# slice, which can raise a SettingWithCopyWarning.
df_neighbors = pd.Series(df['id'].values, index=df['id']).apply(
    lambda h: pd.Series(list(h3.k_ring(h, k))))
display(df_neighbors)

What is an efficient way to solve such a problem (an iterative join & aggregation) in pandas?

A naive solution:

import pandas as pd
from h3 import h3
import numpy as np

# k-ring radius: include every hexagon within grid distance k of a cell.
k = 2

# Sample data: x/y coordinates, a value to aggregate, and an H3 hexagon id.
df = pd.DataFrame({'x': {0: 16,
  1: 17,
  2: 18,
  3: 19,
  4: 20},
 'y': {0: 48,
  1: 49,
  2: 50,
  3: 51,
  4: 52},
 'value': {0: 2.0, 1: 4.0, 2: 100.0, 3: 40.0, 4: 500.0},
 'id': {0: '891e15b706bffff',
  1: '891e15b738fffff',
  2: '891e15b714fffff',
  3: '891e15b44c3ffff',
  4: '891e15b448bffff'}})

display(df)

# One row per hexagon (index = its id), one column per member of its k-ring.
# Building the Series directly avoids assigning an index onto the df[['id']]
# slice, which can raise a SettingWithCopyWarning.
df_neighbors = pd.Series(df['id'].values, index=df['id']).apply(
    lambda h: pd.Series(list(h3.k_ring(h, k))))
display(df_neighbors)

# Attach the neighbor columns to the original frame (one join, wide format).
joined = df.merge(df_neighbors.reset_index(), on='id', how='left')

# For every neighbor column: total `value` of all rows whose k-ring contains
# that hexagon. The string 'sum' is the idiomatic spelling of the aggregation.
neighbor_cols = list(df_neighbors.columns)
for col in neighbor_cols:
    joined[f'sum_of_{col}'] = joined.groupby(col).value.transform('sum')

# Sum across all per-neighbor sums -> aggregated value of each id's neighborhood.
sum_cols = [f'sum_of_{col}' for col in neighbor_cols]
joined['total_value_sum'] = joined[sum_cols].sum(axis=1)
display(joined)

# Positional argmax keeps .iloc correct even if the index is not a RangeIndex.
best_row = joined['total_value_sum'].to_numpy().argmax()
maximal_neighborhood = joined[neighbor_cols].iloc[best_row]
display(maximal_neighborhood)

# Original rows that fall inside the winning neighborhood.
max_neighborhood_raw_elements = df[df['id'].isin(maximal_neighborhood)]
display(max_neighborhood_raw_elements)

# Value-weighted centroid of the winning neighborhood.
avg_y_lat = np.average(max_neighborhood_raw_elements.y, weights=max_neighborhood_raw_elements.value)
avg_x_long = np.average(max_neighborhood_raw_elements.x, weights=max_neighborhood_raw_elements.value)

print(f'(x,y): ({avg_x_long},{avg_y_lat})')

Solution

  • Using stack I can operate without the iteration, only with two groupby:

    # Long format via stack(): one (id, hexk) row per neighbor, so both
    # aggregations are plain groupby calls — no loop over neighbor columns.
    # Building the Series directly avoids assigning an index onto the
    # df[['id']] slice (SettingWithCopyWarning).
    df_neighbors = (
        pd.Series(df['id'].values, index=df['id'])
        .apply(lambda h: pd.Series(list(h3.k_ring(h, k))))
        .stack()
        .to_frame('hexk')
        .reset_index()
    )

    joined = df.merge(df_neighbors, on='id', how='left')

    # 1) total value attracted by each hexagon, 2) total over each id's
    # whole neighborhood. Plain string column names — no placeholders, so
    # f-strings would be pointless here; 'sum' is the idiomatic aggregation.
    joined['sum_of_hexk'] = joined.groupby('hexk').value.transform('sum')
    joined['total_value_sum'] = joined.groupby('id').sum_of_hexk.transform('sum')

    display(joined.total_value_sum.unique())

    # id whose neighborhood aggregates the most value, and its member hexes.
    # Positional argmax keeps .iloc correct for any index.
    best_id = joined.iloc[joined['total_value_sum'].to_numpy().argmax()].id
    maximal_neighborhood = joined.loc[joined.id == best_id, 'hexk']
    max_neighborhood_raw_elements = df[df['id'].isin(maximal_neighborhood)]
    display(max_neighborhood_raw_elements)

    # Value-weighted centroid of that neighborhood.
    avg_y_lat = np.average(max_neighborhood_raw_elements.y, weights=max_neighborhood_raw_elements.value)
    avg_x_long = np.average(max_neighborhood_raw_elements.x, weights=max_neighborhood_raw_elements.value)

    print(f'(x,y): ({avg_x_long},{avg_y_lat})')