I have a dataframe need to perform certain operation on it but not able to exact final output as needed.
data = {
"Category": ["900IG", "900IG", "900IG", "900IG", "900IG", "900IG", "900IG", "900IG", "900IG", "900IG", "165LK", "165LK", "165LK", "165LK", "165LK", "165LK", "165LK", "165LK", "165LK", "165LK"],
"Size": [1041.4, 1020, 711.2, 660.4, 648.6, 584.2, 562.8, 508, 495.3, 488.95, 381, 431.8, 482.6, 520.7, 533.4, 558.8, 584.2, 609.6, 660.4, 750.062],
"Count": [6, 1, 10, 10, 10, 23, 22, 1, 6, 3, 7, 2, 5, 8, 9, 22, 23, 10, 10, 6]
}
# Create DataFrame
df = pd.DataFrame(data)
Raw data:
Here for tranform dataframe by adding new column as Total Count where we need to group in every category with the nearest Size value in range of 25 below is the example for it.
Final output needed where we need to extract the rows from every group if group has multiple row then need to extract only row which has highest value from the size column
import pandas as pd
import numpy as np
data = {
"Category": ["900IG", "900IG", "900IG", "900IG", "900IG", "900IG", "900IG", "900IG", "900IG", "900IG", "165LK",
"165LK", "165LK", "165LK", "165LK", "165LK", "165LK", "165LK", "165LK", "165LK"],
"Size": [1041.4, 1020, 711.2, 660.4, 648.6, 584.2, 562.8, 508, 495.3, 488.95, 381, 431.8, 482.6, 520.7, 533.4,
558.8, 584.2, 609.6, 660.4, 750.062],
"Count": [6, 1, 10, 10, 10, 23, 22, 1, 6, 3, 7, 2, 5, 8, 9, 22, 23, 10, 10, 6]
}
# Create DataFrame
df = pd.DataFrame(data)
def get_closest_size_count(x):
# get df for category wihtout current row
_df = df[
(df.index != x.name) &
(df.Category == x.Category)
]
# calculate all absolute differences
abs_diffs = abs(_df.Size - x.Size)
# filter to max diff of 25
abs_diffs = abs_diffs[abs_diffs <= 25]
if abs_diffs.empty:
return 0
# sum of nearest counts
# _df.Count[abs_diffs.index].sum()
# return count where difference is the smallest
return _df.Count[abs_diffs.idxmin()]
df['TotalCount'] = df.apply(
lambda x: x.Count + get_closest_size_count(x),
axis=1
)