Search code examples
pythonpandasdataframepandas-groupbynested-lists

Error in creating dynamic columns from existing column having nested list of lists


I want to create two column from an existing column which contains nested list of list as values.

Rows of record consisting of 3 companies participant and their role:

**row 1** [{'roles': [{'type': 'director'}, {'type': 'founder'}, {'type': 'owner'}, {'type': 'real_owner'}], 'life': {'name': 'Lichun Du'}}]

**row 2** [{'roles': [{'type': 'board'}], 'life': {'name': 'Erik Mølgaard'}}, {'roles': [{'type': 'director'}, {'type': 'board'}, {'type': 'real_owner'}], 'life': {'name': 'Mikael Bodholdt Linde'}}, {'roles': [{'type': 'board'}, {'type': 'real_owner'}], 'life': {'name': 'Dorte Bøcker Linde'}}]

**row 3** [{'roles': [{'type': 'director'}, {'type': 'real_owner'}], 'life': {'name': 'Kristian Løth Hougaard'}}, {'roles': [{'type': 'owner'}], 'life': {'name': 'WORLD JET HOLDING ApS'}}]

So far I have tried:

    responses['Role of Participant(s)'] = [element[0]['roles'] for element in responses['participants']]
    responses['Role of Participant(s)'] = responses['Role of Participant(s)'].apply(lambda x: ', '.join(t['type'] for t in x))
    responses['Name of Participant(s)'] = [element[0]['life']['name'] for element in responses['participants']]

Which gives me the following output:

enter image description here

It just returns me only the first participants's role and name .

However, I need all the participant and their respective roles of each row/records like the following:

enter image description here

So how can I achieve this using "***" as separator for each rows value like the above screenshot?

Update

Here is the csv version of the dataframe:

participants
"[{'roles': [{'type': 'founder'}], 'life': {'name': 'Poul Erik Andersen'}}, {'roles': [{'type': 'director'}, {'type': 'board'}], 'life': {'name': 'Martin Ravn-Nielsen'}}, {'roles': [{'type': 'board'}], 'life': {'name': 'Søren Haugaard'}}, {'roles': [{'type': 'board'}], 'life': {'name': 'Mads Dehlsen Winther'}}, {'roles': [{'type': 'founder'}], 'life': {'name': 'M+ Ejendomme A/S'}}, {'roles': [{'type': 'founder'}], 'life': {'name': 'MILTON HOLDING HORSENS A/S'}}, {'roles': [{'type': 'accountant'}], 'life': {'name': 'EY Godkendt Revisionspartnerselskab'}}, {'roles': [{'type': 'owner'}], 'life': {'name': 'HUSCOMPAGNIET HOLDING A/S'}}]"
"[{'roles': [{'type': 'founder'}, {'type': 'director'}, {'type': 'board'}, {'type': 'real_owner'}], 'life': {'name': 'Rasmus Gert Hansen'}}, {'roles': [{'type': 'board'}], 'life': {'name': 'John Nyrup Larsen'}}, {'roles': [{'type': 'board'}], 'life': {'name': 'Ole Nidolf Larsen'}}, {'roles': [{'type': 'owner'}], 'life': {'name': 'RASMUS HANSEN HOLDING ApS'}}, {'roles': [{'type': 'accountant'}], 'life': {'name': 'DANSK REVISION SLAGELSE GODKENDT REVISIONSAKTIESELSKAB'}}]"
"[{'roles': [{'type': 'board'}], 'life': {'name': 'Berit Pedersen'}}, {'roles': [{'type': 'board'}], 'life': {'name': 'Sanne Kristine Späth'}}, {'roles': [{'type': 'real_owner'}], 'life': {'name': 'Kjeld Kirk Kristiansen'}}, {'roles': [{'type': 'director'}], 'life': {'name': 'Jesper Andersen'}}, {'roles': [{'type': 'board'}], 'life': {'name': 'Poul Hartvig Nielsen'}}, {'roles': [{'type': 'board'}], 'life': {'name': 'Nanna Birgitta Gudum'}}, {'roles': [{'type': 'board'}], 'life': {'name': 'Henrik Baagøe Fredeløkke'}}, {'roles': [{'type': 'board'}], 'life': {'name': 'Carsten Rasmussen'}}, {'roles': [{'type': 'board'}], 'life': {'name': 'Jesper Laursen'}}, {'roles': [{'type': 'board'}], 'life': {'name': 'John Hansen'}}, {'roles': [{'type': 'owner'}], 'life': {'name': 'LEGO A/S'}}, {'roles': [{'type': 'accountant'}], 'life': {'name': 'PRICEWATERHOUSECOOPERS STATSAUTORISERET REVISIONSPARTNERSELSKAB'}}]"

Solution

  • You need second for-loop instead of [0]

    I use normal function instead of lambda to make it more readable.

    First for role:

    import pandas as pd
    
    data = {'participants': 
    [
        [{'roles': [{'type': 'director'}, {'type': 'founder'}, {'type': 'owner'}, {'type': 'real_owner'}], 'life': {'name': 'Lichun Du'}}],
        [{'roles': [{'type': 'board'}], 'life': {'name': 'Erik Mølgaard'}}, {'roles': [{'type': 'director'}, {'type': 'board'}, {'type': 'real_owner'}], 'life': {'name': 'Mikael Bodholdt Linde'}}, {'roles': [{'type': 'board'}, {'type': 'real_owner'}], 'life': {'name': 'Dorte Bøcker Linde'}}],
        [{'roles': [{'type': 'director'}, {'type': 'real_owner'}], 'life': {'name': 'Kristian Løth Hougaard'}}, {'roles': [{'type': 'owner'}], 'life': {'name': 'WORLD JET HOLDING ApS'}}],
    ]
    }
    
    df = pd.DataFrame(data)
    
    def get_roles(cell):
        
        results = []
        
        for item in cell:
            roles = []
            for role in item['roles']:
                roles.append(role['type'])
            results.append(",".join(roles))
        
        results = "***".join(results)
    
        return results
    
    df['Role of Participant(s)'] = df['participants'].apply(get_roles)
    
    print(df[['Role of Participant(s)']].to_string())
    

    Result:

                                     Role of Participant(s)
    0                     director,founder,owner,real_owner
    1  board***director,board,real_owner***board,real_owner
    2                           director,real_owner***owner
    

    And now you can try to write as lambda

    df['Role of Participant(s)'] = df['participants'].apply(lambda cell:"***".join([",".join(role['type'] for role in item['roles']) for item in cell]))
    

    Similar for name:

    def get_names(cell):
        
        results = []
        
        for item in cell:
            results.append(item['life']['name'])
        
        results = "***".join(results)
    
        return results
    
    df['Name of Participant(s)'] = df['participants'].apply(get_names)
    

    And now with lambda

    df['Name of Participant(s)'] = df['participants'].apply(lambda cell:"***".join(item['life']['name'] for item in cell))
    

    EDIT:

    version which create both column in one apply and skip participants which have role director

    import pandas as pd
    
    data = {'participants': 
    [
        [{'roles': [{'type': 'director'}, {'type': 'founder'}, {'type': 'owner'}, {'type': 'real_owner'}], 'life': {'name': 'Lichun Du'}}],
        [{'roles': [{'type': 'board'}], 'life': {'name': 'Erik Mølgaard'}}, {'roles': [{'type': 'director'}, {'type': 'board'}, {'type': 'real_owner'}], 'life': {'name': 'Mikael Bodholdt Linde'}}, {'roles': [{'type': 'board'}, {'type': 'real_owner'}], 'life': {'name': 'Dorte Bøcker Linde'}}],
        [{'roles': [{'type': 'director'}, {'type': 'real_owner'}], 'life': {'name': 'Kristian Løth Hougaard'}}, {'roles': [{'type': 'owner'}], 'life': {'name': 'WORLD JET HOLDING ApS'}}],
    ]
    }
    
    df = pd.DataFrame(data)
    
    def get_names_and_roles(cell):
        
        all_names = []
        all_roles = []
        
        for item in cell:
            name = item['life']['name']
            roles = [role['type'] for role in item['roles']]
    
            if 'director' not in roles:
                all_names.append(name)
                all_roles.append(",".join(roles))
        
        all_names = "***".join(all_names)
        all_roles = "***".join(all_roles)
    
        return pd.Series([all_names, all_roles])
    
    
    df[ ['Name of Participant(s)', 'Role of Participant(s)'] ] = df['participants'].apply(get_names_and_roles)
    
    print(df[ ['Name of Participant(s)', 'Role of Participant(s)'] ].to_string())
    

    Result:

                   Name of Participant(s)    Role of Participant(s)
    0                                                              
    1  Erik Mølgaard***Dorte Bøcker Linde  board***board,real_owner
    2               WORLD JET HOLDING ApS                     owner