Given the below sample data (11 records):
# Sample input: each PN_raw value is a JSON string whose "audience" -> "and"
# list may either hold leaf dicts directly, or wrap them in further nested
# {"and": [...]} lists (see the fce503fb / 56c00531 / 5c32fc8a rows).
test_df = pd.DataFrame({"PN_id": ["745d626b", "745d626b", "fce503fb", "df3d727e", "df3d727e", "56c00531", "72ebb2b3", "5d1bc5d3", "72ebb2b3", "5c32fc8a", "5c32fc8a"],
"PN_raw": ['{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"90_P******_BR","group":"******_CRM"}]}}',
'{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"90_P******_BR","group":"******_CRM"}]}}',
'{"audience":{"and":[{"and":[{"segment":"850c8d94-1236-45a1-93fc-08b0337b4059"}]},{"and":[{"tag":"All_S****_ES","group":"******_CRM"}]}]}}',
'{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"All_S*****_BR","group":"******_CRM"}]}}',
'{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"All_S*****_BR","group":"******_CRM"}]}}',
'{"audience":{"and":[{"and":[{"segment":"850c8d94-1236-45a1-93fc-08b0337b4059"}]},{"and":[{"tag":"All_S****_ES","group":"******_CRM"}]}]}}',
'{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"P_90_or_S_90_BR","group":"******_CRM"}]}}',
'{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"P_90_or_S_90_ESLA","group":"******_CRM"}]}}',
'{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"P_90_or_S_90_BR","group":"******_CRM"}]}}',
'{"audience":{"and":[{"and":[{"segment":"850c8d94-1236-45a1-93fc-08b0337b4059"}]},{"and":[{"tag":"P_90_or_S_90_ES","group":"******_CRM"}]}]}}',
'{"audience":{"and":[{"and":[{"segment":"850c8d94-1236-45a1-93fc-08b0337b4059"}]},{"and":[{"tag":"P_90_or_S_90_ES","group":"******_CRM"}]}]}}']})
How could I achieve the below desired output? (either within the same DF or on a separate one, which I can see as a likely possibility):
# Desired output: one row per input record, with the single segment promoted to
# its own column and each tag/group pair spread into suffixed columns
# (tag_1/group_1, tag_2/group_2).  np.nan fills rows that have only one pair.
test_df_desired = pd.DataFrame({"PN_id":["745d626b", "745d626b", "fce503fb", "df3d727e", "df3d727e", "56c00531", "72ebb2b3", "5d1bc5d3", "72ebb2b3", "5c32fc8a", "5c32fc8a"],
"segment":["67537044-27db-4a0b-b5b7-362c9c5b2ba7", "67537044-27db-4a0b-b5b7-362c9c5b2ba7", "850c8d94-1236-45a1-93fc-08b0337b4059", "67537044-27db-4a0b-b5b7-362c9c5b2ba7", "67537044-27db-4a0b-b5b7-362c9c5b2ba7", "850c8d94-1236-45a1-93fc-08b0337b4059", "67537044-27db-4a0b-b5b7-362c9c5b2ba7", "67537044-27db-4a0b-b5b7-362c9c5b2ba7", "67537044-27db-4a0b-b5b7-362c9c5b2ba7", "850c8d94-1236-45a1-93fc-08b0337b4059", "850c8d94-1236-45a1-93fc-08b0337b4059"],
"tag_1":["BR", "BR", "All_S****_ES", "BR", "BR", "All_S****_ES", "BR", "P_90_or_S_90_ESLA", "BR", "P_90_or_S_90_ES", "P_90_or_S_90_ES"],
"group_1":["ua_locale_country", "ua_locale_country", "******_CRM", "ua_locale_country", "ua_locale_country", "******_CRM", "ua_locale_country", "******_CRM", "ua_locale_country", "******_CRM", "******_CRM"],
"tag_2":["90_P******_BR", "90_P******_BR", np.nan, "All_S*****_BR", "All_S*****_BR", np.nan, "P_90_or_S_90_BR", np.nan, "P_90_or_S_90_BR", np.nan, np.nan],
"group_2":["******_CRM", "******_CRM", np.nan, "******_CRM", "******_CRM", np.nan, "******_CRM", np.nan, "******_CRM", np.nan, np.nan]})
So far, using pd.json_normalize(test_df["PN_raw"].apply(ast.literal_eval), record_path = ["audience", "and"])
, I've managed to kinda unnest the records where the dict's path structure is audience -> and
but for records where path is audience -> and -> and
this doesn't work, nor can I hack my way around it adding record_path = ["audience", "and", "and"]
which I thought could've worked. I reckon this would need to be solved by looping through the series and applying a different function based on whether it contains one or two+ "and"s,
Current output not only "fails" on the mentioned above, but then there's the issue of "transposing" the data to its correct row (you'll see what I mean if you run that line above).
import json

import pandas as pd


def promote(d):
    """Recursively flatten nested ``{"and": [...]}`` wrapper nodes.

    Yields every leaf dict (one carrying ``"segment"`` or ``"tag"``/``"group"``
    keys), no matter how many ``"and"`` lists it is wrapped in.  A dict whose
    only key is ``'and'`` is treated as a pure wrapper and descended into;
    anything else is yielded as-is.
    """
    if list(d.keys()) == ['and']:
        for child in d['and']:
            yield from promote(child)
    else:
        yield d


parsed = []
data = {"PN_id": ["745d626b", "745d626b", "fce503fb", "df3d727e", "df3d727e", "56c00531", "72ebb2b3", "5d1bc5d3", "72ebb2b3", "5c32fc8a", "5c32fc8a"],
"PN_raw": ['{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"90_P******_BR","group":"******_CRM"}]}}',
'{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"90_P******_BR","group":"******_CRM"}]}}',
'{"audience":{"and":[{"and":[{"segment":"850c8d94-1236-45a1-93fc-08b0337b4059"}]},{"and":[{"tag":"All_S****_ES","group":"******_CRM"}]}]}}',
'{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"All_S*****_BR","group":"******_CRM"}]}}',
'{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"All_S*****_BR","group":"******_CRM"}]}}',
'{"audience":{"and":[{"and":[{"segment":"850c8d94-1236-45a1-93fc-08b0337b4059"}]},{"and":[{"tag":"All_S****_ES","group":"******_CRM"}]}]}}',
'{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"P_90_or_S_90_BR","group":"******_CRM"}]}}',
'{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"P_90_or_S_90_ESLA","group":"******_CRM"}]}}',
'{"audience":{"and":[{"segment":"67537044-27db-4a0b-b5b7-362c9c5b2ba7"},{"tag":"BR","group":"ua_locale_country"},{"tag":"P_90_or_S_90_BR","group":"******_CRM"}]}}',
'{"audience":{"and":[{"and":[{"segment":"850c8d94-1236-45a1-93fc-08b0337b4059"}]},{"and":[{"tag":"P_90_or_S_90_ES","group":"******_CRM"}]}]}}',
'{"audience":{"and":[{"and":[{"segment":"850c8d94-1236-45a1-93fc-08b0337b4059"}]},{"and":[{"tag":"P_90_or_S_90_ES","group":"******_CRM"}]}]}}']}
# Parse every raw JSON payload once, up front.
data['PN'] = list(map(json.loads, data['PN_raw']))
for ind, pn_id in enumerate(data['PN_id']):
    parsed_sub = {'PN_id': pn_id}
    count = 1
    for chunk in promote(data['PN'][ind]['audience']):
        if 'segment' in chunk:
            # Each record carries a single segment, so no suffix is needed.
            parsed_sub.update(chunk)
        else:
            # Suffix tag/group pairs as tag_1/group_1, tag_2/group_2, ...
            # (underscore included so the columns match the desired output,
            # which uses "tag_1", not "tag1").
            parsed_sub.update({f'{k}_{count}': v for k, v in chunk.items()})
            count += 1
    parsed.append(parsed_sub)
# Missing tag_2/group_2 entries become NaN automatically.
df = pd.DataFrame(parsed)
I find pandas obfuscates/mangles JSON, and I prefer to handle it in basic python. I'd say that JSON can have such different shapes, that there's no great way to make a universal "make_the_json_flat()" function, though I'd love to learn about it if there is such a thing.