Search code examples
pythonpandassortingcategorieshierarchy

How to sort dataframe in pandas by value in hierarchical category structure


I have a data frame in pandas.

pd.DataFrame({
    "category": ["Transport", "Transport : Car", "Transport : Train", "Household", "Household : Utilities", "Household : Utilities : Water", "Household : Utilities : Electric", "Household : Cleaning", "Household : Cleaning : Bathroom", "Household : Cleaning : Kitchen", "Household : Rent", "Living", "Living : Other", "Living : Food", "Living : Something", "Living : Anitsomething"],
    "amount": [5000, 4900, 100, 1100, 600, 400, 200, 100, 75, 25, 400, 250, 150, 100, 1000, -1000]
})

Categories and subcategories are split by a colon.

I am trying to sort this data frame in descending amount (absolute value) order. Whilst respecting the hierarchical grouping. I.e. the sorted result should look like

Transport                           5000
Transport : Car                     4900
Transport : Train                   100
Household                           1600
Household : Utilities               600
Household : Utilities : Water       400
Household : Utilities : Electric    200
Household : Rent                    400
Living                              250
Living : Something                  1000
Living : Antisomething              -1000
Living : Other                      150
Living : Food                       100

I can do this recursively in an incredibly inefficient manner. Super slow but it works.

def sort_hierachical(self, full_df, name_column, sort_column, parent="", level=0):
    result_df = pd.DataFrame(columns=full_df.columns)
    part_df = full_df.loc[(full_df[name_column].str.count(':') == level) & (full_df[name_column].str.startswith(parent)), :]
    part_df['abs'] = part_df[sort_column].abs()
    part_df = part_df.sort_values('abs', ascending=False)
    for _, row in part_df.iterrows():
        category = row[name_column]
        row_df = pd.DataFrame(columns = full_df.columns).append(row)
        child_rows = self.sort_hierachical(full_df, name_column, sort_column, category, level+1)
        if not child_rows.empty:
            result_df = pd.concat([result_df, row_df], sort=False)
            result_df = pd.concat([result_df, child_rows], sort=False)
        else:
            result_df = pd.concat([result_df, row_df], sort=False)
    return result_df

df = self.sort_hierachical(df, "category", "amount")

My question: Is there a nice performant way to do such a thing in pandas. Some sort of group by sort or multi index trick??

Good karma will come to the ones who can solve this challenging problem :)

Edit:

This almost works... But the -1000, 1000 messes up the sort order.

def _sort_tree_df(self, df, tree_column, sort_column):
    sort_key = sort_column + '_abs'
    df[sort_key] = df[sort_column].abs()
    df.index = pd.MultiIndex.from_frame(df[tree_column].str.split(":").apply(lambda x: [y.strip() for y in x]).apply(pd.Series))
    sort_columns = [df[tree_column].values]
    sort_columns.append(df[sort_key].values)
    for x in range(df.index.nlevels, 0, -1):
        group_lvl = list(range(0, x))
        sort_columns.append(df.groupby(level=group_lvl)[sort_key].transform('max').values)
    sort_indexes = np.lexsort(sort_columns)
    df_sorted = df.iloc[sort_indexes[::-1]]
    df_sorted.reset_index(drop=True, inplace=True)
    df_sorted = df_sorted.drop(sort_key, axis=1)
    return df_sorted

Edit2:

Ok I think I've managed to make it work. I'm still very confused how lexsort works. I made this work through educated trial and error. If you understand it please feel free to explain it. Also feel free to post a better method.

def _sort_tree_df(self, df, tree_column, sort_column, delimeter=':'):
    df.index = pd.MultiIndex.from_frame(df[tree_column].str.split(delimeter).apply(lambda x: [y.strip() for y in x]).apply(pd.Series))
    sort_columns = [df[tree_column].values]
    sort_columns.append(df[sort_column].abs().values)
    for x in range(df.index.nlevels, 0, -1):
        group_lvl = list(range(0, x))
        sort_columns.append(df.groupby(level=group_lvl)[sort_column].transform('sum').abs().values)
    sort_indexes = np.lexsort(sort_columns)
    df_sorted = df.iloc[sort_indexes[::-1]]
    df_sorted.reset_index(drop=True, inplace=True)
    return df_sorted

Edit3 : Actually this doesn't always sort correctly :(

Edit4 The problem is I need a way to make th transform('sum') only apply to items where level = x-1

ie something like:

df['level'] = df[tree_column].str.count(':')

sorting_by = df.groupby(level=group_lvl)[sort_column].transform('sum' if 'level' = x-1).abs().values

or

sorting_by = df.groupby(level=group_lvl).loc['level' = x-1: sort_column].transform('sum').abs().values

both of which are not valid

Anyone know how to do a conditional transform like this on a multi index df?


Solution

  • OK, took a while to nut out but now I'm pretty sure this works. Much faster than recursive method too.

    def _sort_tree_df(self, df, tree_column, sort_column, delimeter=':'):
        df=df.copy()
        parts = df[tree_column].str.split(delimeter).apply(lambda x: [y.strip() for y in x]).apply(pd.Series)
        for i, column in enumerate(parts.columns):
            df[column] = parts[column]
        sort_columns = [df[tree_column].values]
        sort_columns.append(df[sort_column].abs().values)
        df['level'] = df[tree_column].str.count(':')
        for x in range(len(parts.columns), 0, -1):
            group_columns = list(range(0, x))
            sorting_by = df.copy()
            sorting_by.loc[sorting_by['level'] != x-1, sort_column] = np.nan
            sorting_by = sorting_by.groupby(group_columns)[sort_column].transform('sum').abs().values
            sort_columns.append(sorting_by)
        sort_indexes = np.lexsort(sort_columns)
        df_sorted = df.iloc[sort_indexes[::-1]]
        df_sorted.reset_index(drop=True, inplace=True)
        df.drop([column for column in parts.columns], inplace=True, axis=1)
        df.drop('level', inplace=True, axis=1)
        return df_sorted