I have two data frames.
One is structured like:
id,value
the second one:
id, neighbor_1, neighbor_2, neighbor_3, neighbor_4, neighbor_5, ...
Now I want to lookup for each id
in the neighborhood
data frame for each neighbor
the sum
of the corresponding value
column and calculate the total sum
over all the neighbors for an id to find the id where the aggregated sum overall values in the neighborhood are the largest.
import pandas as pd
from h3 import h3
k=1
df = pd.DataFrame({'x': {0: 16,
1: 17,
2: 18,
3: 19,
4: 20},
'y': {0: 48,
1: 49,
2: 50,
3: 51,
4: 52},
'value': {0: 2.0, 1: 4.0, 2: 100.0, 3: 40.0, 4: 500.0},
'id': {0: '891e15b706bffff',
1: '891e15b738fffff',
2: '891e15b714fffff',
3: '891e15b44c3ffff',
4: '891e15b448bffff'}})
display(df)
df_neighbors = df[['id']]
df_neighbors.index = df_neighbors['id']
df_neighbors = df_neighbors['id'].apply(lambda x: pd.Series(list(h3.k_ring(x,k))))
display(df_neighbors)
What is an effective way to calculate such a problem (iterative join & aggregation) in pandas?
import pandas as pd
from h3 import h3
import numpy as np
k=2
df = pd.DataFrame({'x': {0: 16,
1: 17,
2: 18,
3: 19,
4: 20},
'y': {0: 48,
1: 49,
2: 50,
3: 51,
4: 52},
'value': {0: 2.0, 1: 4.0, 2: 100.0, 3: 40.0, 4: 500.0},
'id': {0: '891e15b706bffff',
1: '891e15b738fffff',
2: '891e15b714fffff',
3: '891e15b44c3ffff',
4: '891e15b448bffff'}})
display(df)
df_neighbors = df[['id']]
df_neighbors.index = df_neighbors['id']
df_neighbors = df_neighbors['id'].apply(lambda x: pd.Series(list(h3.k_ring(x,k))))
display(df_neighbors)
joined = df.merge(df_neighbors.reset_index(), left_on='id', right_on='id', how='left')#.drop(['id_neighbors'], axis=1)
# display(joined)
for c in joined[df_neighbors.columns].columns:
joined[f'sum_of_{c}'] = joined.groupby([c]).value.transform(pd.Series.sum)
xx = [f'sum_of_{c}' for c in joined[df_neighbors.columns].columns]
joined['total_value_sum'] = joined[xx].sum(axis=1)
display(joined)
maximal_neighborhood = joined[df_neighbors.columns].iloc[joined.total_value_sum.argmax()]
display(maximal_neighborhood)
max_neighborhood_raw_elements = df[df['id'].isin(maximal_neighborhood)]
display(max_neighborhood_raw_elements)
avg_y_lat = np.average(max_neighborhood_raw_elements.y, weights=max_neighborhood_raw_elements.value)
avg_x_long = np.average(max_neighborhood_raw_elements.x, weights=max_neighborhood_raw_elements.value)
print(f'(x,y): ({avg_x_long},{avg_y_lat})')
Using stack
I can operate without the iteration, only with two groupby
:
df_neighbors = df[['id']]
df_neighbors.index = df_neighbors['id']
df_neighbors = df_neighbors['id'].apply(lambda x: pd.Series(list(h3.k_ring(x,k)))).stack().to_frame('hexk').reset_index()#.reset_index(1, drop=True).reset_index()
#display(df_neighbors)
# df_neighbors.level_1.value_counts()
joined = df.merge(df_neighbors, left_on='id', right_on='id', how='left')#.drop(['id_neighbors'], axis=1)
#display(joined.head())
joined[f'sum_of_hexk'] = joined.groupby(['hexk']).value.transform(pd.Series.sum)
joined[f'total_value_sum'] = joined.groupby(['id']).sum_of_hexk.transform(pd.Series.sum)
#display(joined)
display(joined.total_value_sum.unique())
maximal_neighborhood = joined[joined.id == joined.iloc[joined.total_value_sum.argmax()].id].hexk
#display(maximal_neighborhood)
max_neighborhood_raw_elements = df[df['id'].isin(maximal_neighborhood)]
display(max_neighborhood_raw_elements)
avg_y_lat = np.average(max_neighborhood_raw_elements.y, weights=max_neighborhood_raw_elements.value)
avg_x_long = np.average(max_neighborhood_raw_elements.x, weights=max_neighborhood_raw_elements.value)
print(f'(x,y): ({avg_x_long},{avg_y_lat})')