Search code examples
pythonjsonpandasnormalize

Extract elements from uneven pandas dict-like series


Given the below sample data (10 records):

test_df = pd.DataFrame({"PN_id": ["745d626b", "745d626b", "fce503fb", "df3d727e", "df3d727e", "56c00531", "72ebb2b3", "5d1bc5d3", "72ebb2b3", "5c32fc8a", "5c32fc8a"], 
                        "PN_raw": ['{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"90_P******_BR","group":"******_CRM"}]}}', 
                                   '{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"90_P******_BR","group":"******_CRM"}]}}', 
                                   '{"audience":{"and":[{"and":[{"segment":"850c8d94-1236-45a1-93fc-08b0337b4059"}]},{"and":[{"tag":"All_S****_ES","group":"******_CRM"}]}]}}', 
                                   '{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"All_S*****_BR","group":"******_CRM"}]}}', 
                                   '{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"All_S*****_BR","group":"******_CRM"}]}}', 
                                   '{"audience":{"and":[{"and":[{"segment":"850c8d94-1236-45a1-93fc-08b0337b4059"}]},{"and":[{"tag":"All_S****_ES","group":"******_CRM"}]}]}}', 
                                   '{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"P_90_or_S_90_BR","group":"******_CRM"}]}}', 
                                   '{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"P_90_or_S_90_ESLA","group":"******_CRM"}]}}', 
                                   '{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"P_90_or_S_90_BR","group":"******_CRM"}]}}', 
                                   '{"audience":{"and":[{"and":[{"segment":"850c8d94-1236-45a1-93fc-08b0337b4059"}]},{"and":[{"tag":"P_90_or_S_90_ES","group":"******_CRM"}]}]}}', 
                                   '{"audience":{"and":[{"and":[{"segment":"850c8d94-1236-45a1-93fc-08b0337b4059"}]},{"and":[{"tag":"P_90_or_S_90_ES","group":"******_CRM"}]}]}}']})

How could I achieve the below desired output? (either within the same DF or on a separate one, which I can see as a likely possibility):

test_df_desired = pd.DataFrame({"PN_id":["745d626b", "745d626b", "fce503fb", "df3d727e", "df3d727e", "56c00531", "72ebb2b3", "5d1bc5d3", "72ebb2b3", "5c32fc8a", "5c32fc8a"], 
                                "segment":["67537044-27db-4a0b-b5b7-362c9c5b2ba7", "67537044-27db-4a0b-b5b7-362c9c5b2ba7", "850c8d94-1236-45a1-93fc-08b0337b4059", "67537044-27db-4a0b-b5b7-362c9c5b2ba7", "67537044-27db-4a0b-b5b7-362c9c5b2ba7", "850c8d94-1236-45a1-93fc-08b0337b4059", "67537044-27db-4a0b-b5b7-362c9c5b2ba7", "67537044-27db-4a0b-b5b7-362c9c5b2ba7", "67537044-27db-4a0b-b5b7-362c9c5b2ba7", "850c8d94-1236-45a1-93fc-08b0337b4059", "850c8d94-1236-45a1-93fc-08b0337b4059"], 
                                "tag_1":["BR", "BR", "All_S****_ES", "BR", "BR", "All_S****_ES", "BR", "P_90_or_S_90_ESLA", "BR", "P_90_or_S_90_ES", "P_90_or_S_90_ES"], 
                                "group_1":["ua_locale_country", "ua_locale_country", "******_CRM", "ua_locale_country", "ua_locale_country", "******_CRM", "ua_locale_country", "******_CRM", "ua_locale_country", "******_CRM", "******_CRM"], 
                                "tag_2":["90_P******_BR", "90_P******_BR", np.nan, "All_S*****_BR", "All_S*****_BR", np.nan, "P_90_or_S_90_BR", np.nan, "P_90_or_S_90_BR", np.nan, np.nan], 
                                "group_2":["******_CRM", "******_CRM", np.nan, "******_CRM", "******_CRM", np.nan, "******_CRM", np.nan, "******_CRM", np.nan, np.nan]})

So far, using pd.json_normalize(test_df["PN_raw"].apply(ast.literal_eval), record_path = ["audience", "and"]), I've managed to kinda unnest the records where the dict's path structure is audience -> and but for records where path is audience -> and -> and this doesn't work, nor can I hack my way around it adding record_path = ["audience", "and", "and"] which I thought could've worked. I reckon this would need to be solved looping through the series and applying a different function based on wether if it contains one or two+ "and"s

Current output not only "fails" on the mentioned above, but then there's the issue of "transposing" the data to its correct row (you'll see what I mean if you run that line above).


Solution

  • import json
    
    
    def promote(d):
        if list(d.keys()) == ['and']:
            for i in d['and']:
                yield from promote(i)
        else:
            yield d
    
    parsed = []
    data = {"PN_id": ["745d626b", "745d626b", "fce503fb", "df3d727e", "df3d727e", "56c00531", "72ebb2b3", "5d1bc5d3", "72ebb2b3", "5c32fc8a", "5c32fc8a"],
     "PN_raw": ['{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"90_P******_BR","group":"******_CRM"}]}}',
                '{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"90_P******_BR","group":"******_CRM"}]}}',
                '{"audience":{"and":[{"and":[{"segment":"850c8d94-1236-45a1-93fc-08b0337b4059"}]},{"and":[{"tag":"All_S****_ES","group":"******_CRM"}]}]}}',
                '{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"All_S*****_BR","group":"******_CRM"}]}}',
                '{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"All_S*****_BR","group":"******_CRM"}]}}',
                '{"audience":{"and":[{"and":[{"segment":"850c8d94-1236-45a1-93fc-08b0337b4059"}]},{"and":[{"tag":"All_S****_ES","group":"******_CRM"}]}]}}',
                '{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"P_90_or_S_90_BR","group":"******_CRM"}]}}',
                '{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"P_90_or_S_90_ESLA","group":"******_CRM"}]}}',
                '{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"P_90_or_S_90_BR","group":"******_CRM"}]}}',
                '{"audience":{"and":[{"and":[{"segment":"850c8d94-1236-45a1-93fc-08b0337b4059"}]},{"and":[{"tag":"P_90_or_S_90_ES","group":"******_CRM"}]}]}}',
                '{"audience":{"and":[{"and":[{"segment":"850c8d94-1236-45a1-93fc-08b0337b4059"}]},{"and":[{"tag":"P_90_or_S_90_ES","group":"******_CRM"}]}]}}']}
    
    data['PN'] = list(map(json.loads, data['PN_raw']))
    for ind, pn_id in enumerate(data['PN_id']):
        parsed_sub = {'PN_id': pn_id}
        count = 1
        for chunk in promote(data['PN'][ind]['audience']):
            if 'segment' in chunk:
                parsed_sub.update(chunk)
            else:
                parsed_sub.update({f'{k}{count}': v for k, v in chunk.items()})
                count +=1
        parsed.append(parsed_sub)
    
    df = pd.DataFrame(parsed)
    

    I find pandas obfuscates/mangles JSON, and I prefer to handle it in basic python. I'd say that JSON can have such different shapes, that there's no great way to make a universal "make_the_json_flat()" function, though I'd love to learn about it if there is such a thing.