Search code examples
pythonpandasmultipleselection

Add Pandas Column with Row Lookup List


I want to add a column to a dataframe based on row contents. (The array is the result of .melt.) The selection column variables are in a list.

How can I form the selection programatically using the list?

# cols = df.columns.tolist()[:3]
cols = ['height', 'weight', 'volume']

np.select([data.variable == 'height', data.variable == 'weight', data.variable == 'volume'], ['tall', '', 'bulky'], default='')

A string is easily formed but it is not the correct parameter type.

np_cols = ['data.variable == \'' + c + '\'' for c in cols]
np_cols
# ["data.variable == 'height'", "data.variable == 'weight'", "data.variable == 'volume'"]

Full code shows extra column based on row selection:

import numpy as np
import pandas as pd

df = pd.DataFrame(
    [
        ['note', '', '', 'tall','','bulky'],
        ['TAE', 'B', 2020, 127, 338, 168],
        ['NQZ', 'A', 2020, 578, 85, 310],
        ['NQZ', 'Q', 2020, 196, 17, 10],
    ],
    columns=['airport', 'ver', 'year', 'height', 'weight', 'volume'],
)

index_cols = ['airport', 'ver', 'year']

original_index = pd.MultiIndex.from_frame(df[index_cols])
data = (
    df
    .melt(id_vars=original_index.names, var_name='variable', value_name='value')
    .set_index(index_cols)
    .loc[original_index]
    .reset_index(drop=False)
)

data = data[(data.airport != 'note')]
data['note'] = np.select([data.variable == 'height', data.variable == 'weight', data.variable == 'volume'], ['tall', '', 'bulky'], default='')

print(data)

Output:

   airport ver  year variable value   note
3      TAE   B  2020   height   127   tall
4      TAE   B  2020   weight   338
5      TAE   B  2020   volume   168  bulky
6      NQZ   A  2020   height   578   tall
7      NQZ   A  2020   weight    85
8      NQZ   A  2020   volume   310  bulky
9      NQZ   Q  2020   height   196   tall
10     NQZ   Q  2020   weight    17
11     NQZ   Q  2020   volume    10  bulky

Solution

  • Thank you to the good people at https://www.pythondiscord.com/

    Form a dictionary:

    from_ = df.columns.tolist()[3:]
    to_   = df.loc[df['airport'] == 'note'].values.flatten().tolist()[3:]
    # from_ = ['height', 'weight', 'volume']
    # to_   = ['tall', '', 'bulky']
    data["note"] = data["variable"].map(dict(zip(from_, to_))).fillna("")
    

    Shown in full code:

    import pandas as pd
    
    df = pd.DataFrame(
        [
            ['note', '', '', 'tall','','bulky'],
            ['TAE', 'B', 2020, 127, 338, 168],
            ['NQZ', 'A', 2020, 578, 85, 310],
            ['NQZ', 'Q', 2020, 196, 17, 10],
        ],
        columns=['airport', 'ver', 'year', 'height', 'weight', 'volume'],
    )
    
    index_cols = ['airport', 'ver', 'year']
    
    original_index = pd.MultiIndex.from_frame(df[index_cols])
    data = (
        df
        .melt(id_vars=original_index.names, var_name='variable', value_name='value')
        .set_index(index_cols)
        .loc[original_index]
        .reset_index(drop=False)
    )
    
    data = data[(data.airport != 'note')]
    # data['note'] = np.select([data.variable == 'height', data.variable == 'weight', data.variable == 'volume'], ['tall', '', 'bulky'], default='')
    
    from_ = df.columns.tolist()[3:]
    to_   = df.loc[df['airport'] == 'note'].values.flatten().tolist()[3:]
    data["note"] = data["variable"].map(dict(zip(from_, to_))).fillna("")
    
    print(data)
    
    

    Output is identical:

       airport ver  year variable value   note
    3      TAE   B  2020   height   127   tall
    4      TAE   B  2020   weight   338
    5      TAE   B  2020   volume   168  bulky
    6      NQZ   A  2020   height   578   tall
    7      NQZ   A  2020   weight    85
    8      NQZ   A  2020   volume   310  bulky
    9      NQZ   Q  2020   height   196   tall
    10     NQZ   Q  2020   weight    17
    11     NQZ   Q  2020   volume    10  bulky