Search code examples
pythonpandasstringlistcontains

Show which strings contained in a column


How could I create the column called "reason", which shows which strings matched?

match_d = {"col_a":["green", "purple"], "col_b":["weak", "stro", "strong"],...}
df
    fruit   col_a     col_b
0   apple   yellow     NaN
1   pear    blue       NaN
2   banana  green      strong
3   cherry  green      heavy
4   grapes  brown      light
...

Expected Output

    fruit   col_a     col_b      reason
0   apple   yellow     NaN        NaN
1   pear    blue       NaN        NaN
2   banana  green      strong     col_a:["green"], col_b:["stro", "strong"]
3   cherry  green      heavy      col_a:["green"] 
4   grapes  brown      light       

Solution

  • Use nested list comprehension for join matched values by match_d and then join values with columns names if not empty strings:

    match_d = {"col_a":["green", "purple"], "col_b":["weak", "stro", "strong"]}
    
    cols = list(match_d.keys())
    L = [[','.join(z for z in match_d[x] if pd.notna(y) and z in y) 
          for y in df[x]]  for x in df[cols]]
    
    df['reason'] = [np.nan if ''.join(x) == '' else ';'.join(f'{a}:[{b}]' 
                    for a, b in zip(cols, x) if b != '') 
                    for x in zip(*L)]
    print (df)
        fruit   col_a   col_b                             reason
    0   apple  yellow     NaN                                NaN
    1    pear    blue     NaN                                NaN
    2  banana   green  strong  col_a:[green];col_b:[stro,strong]
    3  cherry   green   heavy                      col_a:[green]
    4  grapes   brown   light                                NaN
    

    Alternative solution with .apply:

    match_d = {"col_a":["green", "purple"], "col_b":["weak", "stro", "strong"]}
    
    cols = list(match_d.keys())
    df1 = df[cols].apply(lambda x: [','.join(z for z in match_d[x.name] 
                                             if pd.notna(y) and z in y) for y in x])
    
    df['reason'] = [np.nan if ''.join(x) == '' else ';'.join(f'{a}:[{b}]' 
                    for a, b in zip(cols, x) if b != '') 
                    for x in df1.to_numpy()]
    

    EDIT: For add substring for not missing values in column reason use:

    m = df['reason'].notna()
    df.loc[m, 'reason'] = 'fruit[not_empty];' + df.loc[m, 'reason']
    print (df)
        fruit   col_a   col_b                                             reason
    0   apple  yellow     NaN                                                NaN
    1    pear    blue     NaN                                                NaN
    2  banana   green  strong  fruit[not_empty];col_a:[green];col_b:[stro,str...
    3  cherry   green   heavy                     fruit[not_empty];col_a:[green]
    4  grapes   brown   light                                                NaN 
    

    EDIT1: You ca wrap solution to function:

    def func(df, match_d):
    
        cols = list(match_d.keys())
        df1 = df[cols].apply(lambda x: [','.join(z for z in match_d[x.name] 
                                                 if pd.notna(y) and z in y) for y in x])
        
        s = [np.nan if ''.join(x) == '' else ';'.join(f'{a}:[{b}]' 
                        for a, b in zip(cols, x) if b != '') 
                        for x in df1.to_numpy()]
        return s
    
    d = {"col_a":["green", "purple"], "col_b":["weak", "stro", "strong"]}
    d1 = {"col_a":["blue", "yeallow"], "col_b":["light"]}
    
    df['reason'] = df.pipe(func, d)
    df['reason1'] = df.pipe(func, d1)
    
    print (df)
    
        fruit   col_a   col_b                             reason        reason1
    0   apple  yellow     NaN                                NaN            NaN
    1    pear    blue     NaN                                NaN   col_a:[blue]
    2  banana   green  strong  col_a:[green];col_b:[stro,strong]            NaN
    3  cherry   green   heavy                      col_a:[green]            NaN
    4  grapes   brown   light                                NaN  col_b:[light]