Search code examples
pythonpandasdataframeif-statementmultiple-columns

pandas apply function to multiple columns with condition and create new columns


I have a df with multiple columns like this (there are many more cols & rows):

df = pd.DataFrame([
                   {'ID': 1,'date': '2022-01-01', 'fruit_code':'[100,99,300]', 'vegetable_code':'[1000,2000,3000]','supermarket':'xy',}, 
                   {'ID': 2,'date': '2022-01-01', 'fruit_code':'[67,200,87]', 'vegetable_code':'[5000]','supermarket':'z, m'}, 
                   {'ID': 3,'date': '2021-01-01', 'fruit_code':'[100,5,300,78]', 'vegetable_code':'[7000,2000,3000]','supermarket':'wf, z'},
                   {'ID': 4,'date': '2020-01-01', 'fruit_code':'[77]', 'vegetable_code':'[1000]','supermarkt':'wf'},
                   {'ID': 5,'date': '2022-15-01', 'fruit_code':'[100,200,546,33]', 'vegetable_code':'[4000,2000,3000]','supermarket':'t, wf'},
                   {'ID': 6,'date': '2002-12-01', 'fruit_code':'[64,2]', 'vegetable_code':'[6000,8000,1000]','supermarket':'k' },
                   {'ID': 7,'date': '2018-12-01', 'fruit_code':'[5]', 'vegetable_code':'[6000,8000,1000]','supermarket':'p' }

                  ])

my expected df should look like this in the end:

df = pd.DataFrame([
                   {'ID': 1,'date': '2022-01-01', 'fruit_code':'[100,99,300]', 'vegetable_code':'[1000,2000,3000]','supermarket':'xy','new_col_1':'all'}, 
                   {'ID': 2,'date': '2022-01-01', 'fruit_code':'[67,200,87]', 'vegetable_code':'[5000]','supermarket':'z, m','new_col_1':'[5000]'}, 
                   {'ID': 3,'date': '2021-01-01', 'fruit_code':'[100,5,300,78]', 'vegetable_code':'[7000,2000,3000]','supermarket':'wf, z','new_col_1':'all'},
                   {'ID': 4,'date': '2020-01-01', 'fruit_code':'[77]', 'vegetable_code':'[1000]','supermarket':'wf','new_col_1':'[77]'},
                   {'ID': 5,'date': '2022-15-01', 'fruit_code':'[100,200,546,33]', 'vegetable_code':'[4000,2000,3000]','supermarket':'t, wf','new_col_1':'all'},
                   {'ID': 6,'date': '2002-12-01', 'fruit_code':'[64,2]', 'vegetable_code':'[6000,8000,1000]','supermarket':'k', 'new_col_1':'[64]', 'new_col_2':'[2]'},
                   {'ID': 7,'date': '2018-12-01', 'fruit_code':'[5]', 'vegetable_code':'[6000,8000,1000]','supermarket':'p' ,'new_col_1':'all'}

                  ])

and here are multiple conditions I want to apply on cols fruit_code & vegetable_code to get two new columns:

UPDATE

def fruits_vegetable(row):
    if len(str(row['fruit_code'])) == 1:                                                        # fruit_code in new_col_1
        row['new_col_1'] = row['fruit_code']
    elif len(str(row['fruit_code'])) == 1 and len(str(row['vegetable_code'])) > 1:              # write "all" in new_col_1 
        row['new_col_1'] = 'all'
    elif len(str(row['fruit_code'])) > 2 and len(str(row['vegetable_code'])) == 1:              # vegetable_code in new_col_1
        row['new_col_1'] = row['vegetable_code']
    elif len(str(row['fruit_code'])) > 3 and len(str(row['vegetable_code'])) > 1:               # write "all" in new_col_1
        row['new_col_1'] = 'all'
    elif len(str(row['fruit_code'])) == 2 and len(str(row['vegetable_code'])) >= 0:             # fruit 1 new_col_1 & fruit 2 new_col_2
        row['new_col_1'] = row['fruit_code'][0]
        row['new_col_2'] = row['fruit_code'][1]

    return row

df = df.apply(fruits_vegetable, axis=1)

I'm still stuck, now I get "all" in some of the rows for the first column, but the second does not change.

If someone has some insights, that would be great.

Thanks, much appreciated


Solution

  • First is necessary convert strings repr of lists by ast.literal_eval to lists, then for chceck length remove casting to strings. If need one element lists instead scalars use [] in fruit[0] and fruit[1] and last change order of condition for len(fruit) == 1, also change len(fruit) > 3 to len(fruit) > 2 for match first row:

    def fruits_vegetable(row):
        
        fruit = ast.literal_eval(row['fruit_code'])
        vege = ast.literal_eval(row['vegetable_code'])
        
        if len(fruit) == 1 and len(vege) > 1:   # write "all" in new_col_1 
            row['new_col_1'] = 'all'
        elif len(fruit) > 2 and len(vege) == 1: # vegetable_code in new_col_1
            row['new_col_1'] = vege
        elif len(fruit) > 2 and len(vege) > 1:  # write "all" in new_col_1
            row['new_col_1'] = 'all'
        elif len(fruit) == 2 and len(vege) >= 0:# fruit 1 new_col_1 & fruit 2 new_col_2
            row['new_col_1'] = [fruit[0]]
            row['new_col_2'] = [fruit[1]]
        elif len(fruit) == 1:                   # fruit_code in new_col_1
            row['new_col_1'] = fruit
        return row
    
    
    df = df.apply(fruits_vegetable, axis=1)
    

    print (df)
       ID        date        fruit_code new_col_1 new_col_2 supermarket  \
    0   1  2022-01-01      [100,99,300]       all       NaN          xy   
    1   2  2022-01-01       [67,200,87]    [5000]       NaN        z, m   
    2   3  2021-01-01    [100,5,300,78]       all       NaN       wf, z   
    3   4  2020-01-01              [77]      [77]       NaN         NaN   
    4   5  2022-15-01  [100,200,546,33]       all       NaN       t, wf   
    5   6  2002-12-01            [64,2]      [64]       [2]           k   
    6   7  2018-12-01               [5]       all       NaN           p   
    
      supermarkt    vegetable_code  
    0        NaN  [1000,2000,3000]  
    1        NaN            [5000]  
    2        NaN  [7000,2000,3000]  
    3         wf            [1000]  
    4        NaN  [4000,2000,3000]  
    5        NaN  [6000,8000,1000]  
    6        NaN  [6000,8000,1000]