Search code examples
pythonpandasdataframevectorizationmulti-index

How do I create a dataframe from another dataframe with only the last non negative values for each value column?


I have a multi-indexed dataframe like so:

           year  value   value2 value3   some_other_column_i_dont_care_about
  one two             
  a   t     2000     0     1     7        aaa
      w     2001     3    -1     4        bbb
      t     2002    -2     1     -3       ccc
  b   t     2000     4     3     6        ddd
      w     2001     7     5     -1       eee    
      t     2002    -8    -3     3        fff
  c   t     2000    11    10     3        ggg
      w     2001   -12    -9     -1       hhh
      t     2002   -15    -6     -5       iii 

How do I create a new, single level df, that just has the latest (in terms of years) non-negative values, like so:

            value  value2 value3
 one    
 a            3      1     4
 b            7      5     3
 c           11     10     3  

Solution

  • One option is to melt, use query to keep values >=0, then use pivot_table with aggfunc='last' to get to wide format again:

    new_df = (
        df.reset_index('one')
            .melt(id_vars='one',
                  value_vars=['value', 'value2', 'value3'],
                  value_name='values')
            .query('values >= 0')
            .pivot_table(index='one', columns='variable', aggfunc='last')
            .droplevel(0, axis=1)
            .rename_axis(index=None, columns=None)
    )
    

    Alternatively use groupby last to keep the last value from each group, then unstack after melt:

    new_df = (
        df.reset_index('one')
            .melt(id_vars='one',
                  value_vars=['value', 'value2', 'value3'],
                  value_name='values')
            .query('values >= 0')
            .groupby(['one', 'variable'])
            .last()
            .unstack()
            .droplevel(0, axis=1)
            .rename_axis(index=None, columns=None)
    )
    

    new_df:

       value  value2  value3
    a      3       1       4
    b      7       5       3
    c     11      10       3
    

    Assuming years are not guaranteed to be in ascending order chain sort_values before melt:

    new_df = (
        df.reset_index('one')
            .sort_values('year')  # Sort By Year
            .melt(id_vars='one',
                  value_vars=['value', 'value2', 'value3'],
                  value_name='values')
            .query('values >= 0')
            .pivot_table(index='one', columns='variable', aggfunc='last')
            .droplevel(0, axis=1)
            .rename_axis(index=None, columns=None)
    )
    

    Complete Working Example:

    import pandas as pd
    
    df = pd.DataFrame({
        'one': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'],
        'two': ['t', 'w', 't', 't', 'w', 't', 't', 'w', 't'],
        'year': [2000, 2001, 2002, 2000, 2001, 2002, 2000, 2001, 2002],
        'value': [0, 3, -2, 4, 7, -8, 11, -12, -15],
        'value2': [1, -1, 1, 3, 5, -3, 10, -9, -6],
        'value3': [3, 4, -3, -1, 3, -2, 7, -9, 3],
        'some_other_column_i_dont_care_about': ['aaa', 'bbb', 'ccc', 'ddd', 'eee',
                                                'fff', 'ggg', 'hhh', 'iii']
    }).set_index(['one', 'two'])
    
    new_df = (
        df.reset_index('one')
            .melt(id_vars='one',
                  value_vars=['value', 'value2', 'value3'],
                  value_name='values')
            .query('values >= 0')
            .pivot_table(index='one', columns='variable', aggfunc='last')
            .droplevel(0, axis=1)
            .rename_axis(index=None, columns=None)
    )
    
    print(new_df)