pandas dataframe multi-index flatten nested-json

flatten complicated nested Json file with a lot of list of dictionary with same key ( "key", "value")

I have a complicated json file that take me a lot of time to make it in Excel format, I do my best but i think I miss something so can someone help please,

the json file is below and my script did half of work :

json file : {"logFormatVersion": "log_security_v3", "data": [ { "logAlertUid": "a2fee3b7e2824c", "request": { "body": "", "cookies": [ { "key": "info_1", "value": "info_2" }, { "key": "info_3", "value": "info_4" }, { "key": "info_5", "value": "info_6" } ], "headers": [ { "key": "Host", "value": "ip_address" }, { "key": "Accept-Charset", "value": "iso-8859-1,utf-8;q=0.9,*;q=0.1" }, { "key": "Accept-Language", "value": "info_7" }, { "key": "Connection", "value": "Keep-Alive" }, { "key": "Referer", "value": "info_8" } ], "hostname": "FQDN", "ipDst": "Y.Y.Y.Y", "ipSrc": "X.X.X.X", "method": "GET", "path": "/xampp/cgi.cgi", "portDst": 443, "protocol": "HTTP/1.1", "query": "", "requestUid": "info_9" }, "websocket": [], "context": { "tags": "", "geoipCode": "", "geoipName": "", "applianceName": "name_device", "applianceUid": "18539", "backendHost": "ip_address", "backendPort": 80, "reverseProxyName": "FQDN", "reverseProxyUid": "info_10", "tunnelName": "info_11", "tunnelUid": "6d531c", "workflowName": "name-workflow", "workflowUid": "77802" }, "events": [ { "eventUid": "e62d8b", "tokens": { "date": "time", "matchingParts": [ { "part": "info_17", "partKey": "info_18", "partKeyOperator": "info_19", "partKeyPattern": "info_20", "partKeyMatch": "info_21", "partValue": "info_21", "partValueOperator": "info_22", "partValuePatternUid": "info_23", "partValuePatternName": "info_24", "partValuePatternVersion": "00614", "partValueMatch": "info_25", "attackFamily": "info_26", "riskLevel": 80, "riskLevelOWASP": 8, "cwe": "CWE-name" } ], "reason": "info_27", "securityExceptionConfigurationUids": [ "info_28" ], "securityExceptionMatchedRuleUids": [ "info_24" ] } }, { "eventUid": "e62d8b", "tokens": { "date": "time", "matchingParts": [ { "part": "info_17", "partKey": "info_18", "partKeyOperator": "info_19", "partKeyPattern": "info_20", "partKeyMatch": "info_21", "partValue": "info_21", "partValueOperator": "info_22", "partValuePatternUid": "info_23", "partValuePatternName": "info_24", "partValuePatternVersion": "00614", "partValueMatch": "info_25", "attackFamily": "info_26", "riskLevel": 80, "riskLevelOWASP": 8, "cwe": "CWE-name" } ], "reason": "info_27", "securityExceptionConfigurationUids": [ "info_28" ], "securityExceptionMatchedRuleUids": [ "info_24" ] } }

 ],
"timestampImport": null,
"timestamp": "info30",
"uid": "AYYm"

} ]}

cordially

my script is like that :

import json
import pandas as pd 

with open("eventlogs.json", "r") as f:
    objectfile = json.load(f)
    data = objectfile["data"]

df =pd.DataFrame(data)
df_request = pd.json_normalize(df["request"])
df_headers = pd.DataFrame(df_request["headers"])
df_headers = df_headers.explode("headers")
df_headers[["header-key", "header-value"]] = df_headers["headers"].apply(pd.Series)
df_headers.drop("headers", axis=1, inplace=True)
df = df.drop("request", axis=1)
new_df = pd.concat([df, df_request, df_headers], axis=1)

new_df.to_excel("all-data.xlsx", index=False)

if you execute the script it give a result like this : result without flatten "events" columns and "cookies" with a lot of line.

like I said the result is not really nice specially when I want to have result like this : Dream - File

I will be very thankful if you can help with idea to arrive to flatten this json to excel with multiindexed columns .

Solution

You could use further json_normalize() calls on the event and cookies columns to "flatten" them.

As an alternative, you could try something like this which attempts to flatten everything "automatically":

import json
import pandas as pd
from pathlib import Path


def _json_flatten(data):
    rows = []
    parents = set()
    for parent, children in data:
        if isinstance(children, dict):
            row = {}
            for name, child in children.items():
                key = (parent,) + (name if isinstance(name, tuple) else (name,))
                row.setdefault(key, []).append(child)
            rows.append(row)
            parents.add(parent)
        if isinstance(children, list) and len(children) > 0:
            if all(isinstance(child, dict) for child in children):
                if set(children[0]) == set().union(*children):
                    row = {}
                    for name in children[0]:
                        key = (parent,) + (name if isinstance(name, tuple) else (name,))
                        for child in children:
                            if len(children) == 1:
                                row[key] = child[name]
                            else:
                                row.setdefault(key, []).append(child[name])
                    rows.append(row)
                    parents.add(parent)
    data = dict(data)
    for parent in parents:
        del data[parent]
    for row in rows:
        data.update(row)

    return {
        (key,) if not isinstance(key, tuple) else key: value
        for key, value in data.items()
    }


rows = json.loads(
    json.dumps(json.loads(Path("json/eventlog.json").read_bytes())["data"]),
    object_pairs_hook=_json_flatten,
)

if isinstance(rows, dict):
    rows = [rows]

rows = [
    {
        key: value[0] if (isinstance(value, list) and len(value) == 1) else value
        for key, value in row.items()
    }
    for row in rows
]

df = pd.DataFrame.from_records(rows)

print(df)

Exploding the request headers:

columns = [ ("request", "headers", name) for name in ("key", "value") ]
df.explode(columns)[columns]

  (request, headers, key)       (request, headers, value)
0                    Host                      ip_address
0          Accept-Charset  iso-8859-1,utf-8;q=0.9,*;q=0.1
0         Accept-Language                          info_7
0              Connection                      Keep-Alive
0                 Referer                          info_8