Search code examples
pandasdataframemulti-indexflattennested-json

flatten complicated nested Json file with a lot of list of dictionary with same key ( "key", "value")


I have a complicated json file that take me a lot of time to make it in Excel format, I do my best but i think I miss something so can someone help please,

the json file is below and my script did half of work :

json file : {"logFormatVersion": "log_security_v3", "data": [ { "logAlertUid": "a2fee3b7e2824c", "request": { "body": "", "cookies": [ { "key": "info_1", "value": "info_2" }, { "key": "info_3", "value": "info_4" }, { "key": "info_5", "value": "info_6" } ], "headers": [ { "key": "Host", "value": "ip_address" }, { "key": "Accept-Charset", "value": "iso-8859-1,utf-8;q=0.9,*;q=0.1" }, { "key": "Accept-Language", "value": "info_7" }, { "key": "Connection", "value": "Keep-Alive" }, { "key": "Referer", "value": "info_8" } ], "hostname": "FQDN", "ipDst": "Y.Y.Y.Y", "ipSrc": "X.X.X.X", "method": "GET", "path": "/xampp/cgi.cgi", "portDst": 443, "protocol": "HTTP/1.1", "query": "", "requestUid": "info_9" }, "websocket": [], "context": { "tags": "", "geoipCode": "", "geoipName": "", "applianceName": "name_device", "applianceUid": "18539", "backendHost": "ip_address", "backendPort": 80, "reverseProxyName": "FQDN", "reverseProxyUid": "info_10", "tunnelName": "info_11", "tunnelUid": "6d531c", "workflowName": "name-workflow", "workflowUid": "77802" }, "events": [ { "eventUid": "e62d8b", "tokens": { "date": "time", "matchingParts": [ { "part": "info_17", "partKey": "info_18", "partKeyOperator": "info_19", "partKeyPattern": "info_20", "partKeyMatch": "info_21", "partValue": "info_21", "partValueOperator": "info_22", "partValuePatternUid": "info_23", "partValuePatternName": "info_24", "partValuePatternVersion": "00614", "partValueMatch": "info_25", "attackFamily": "info_26", "riskLevel": 80, "riskLevelOWASP": 8, "cwe": "CWE-name" } ], "reason": "info_27", "securityExceptionConfigurationUids": [ "info_28" ], "securityExceptionMatchedRuleUids": [ "info_24" ] } }, { "eventUid": "e62d8b", "tokens": { "date": "time", "matchingParts": [ { "part": "info_17", "partKey": "info_18", "partKeyOperator": "info_19", "partKeyPattern": "info_20", "partKeyMatch": "info_21", "partValue": "info_21", "partValueOperator": "info_22", "partValuePatternUid": "info_23", "partValuePatternName": "info_24", "partValuePatternVersion": "00614", "partValueMatch": "info_25", "attackFamily": "info_26", "riskLevel": 80, "riskLevelOWASP": 8, "cwe": "CWE-name" } ], "reason": "info_27", "securityExceptionConfigurationUids": [ "info_28" ], "securityExceptionMatchedRuleUids": [ "info_24" ] } }

 ],
"timestampImport": null,
"timestamp": "info30",
"uid": "AYYm"

} ]}

cordially

my script is like that :

import json
import pandas as pd 

with open("eventlogs.json", "r") as f:
    objectfile = json.load(f)
    data = objectfile["data"]

df =pd.DataFrame(data)
df_request = pd.json_normalize(df["request"])
df_headers = pd.DataFrame(df_request["headers"])
df_headers = df_headers.explode("headers")
df_headers[["header-key", "header-value"]] = df_headers["headers"].apply(pd.Series)
df_headers.drop("headers", axis=1, inplace=True)
df = df.drop("request", axis=1)
new_df = pd.concat([df, df_request, df_headers], axis=1)

new_df.to_excel("all-data.xlsx", index=False)

if you execute the script it give a result like this : result without flatten "events" columns and "cookies" with a lot of line.

like I said the result is not really nice specially when I want to have result like this : Dream - File

I will be very thankful if you can help with idea to arrive to flatten this json to excel with multiindexed columns .


Solution

  • You could use further json_normalize() calls on the event and cookies columns to "flatten" them.

    As an alternative, you could try something like this which attempts to flatten everything "automatically":

    import json
    import pandas as pd
    from pathlib import Path
    
    
    def _json_flatten(data):
        rows = []
        parents = set()
        for parent, children in data:
            if isinstance(children, dict):
                row = {}
                for name, child in children.items():
                    key = (parent,) + (name if isinstance(name, tuple) else (name,))
                    row.setdefault(key, []).append(child)
                rows.append(row)
                parents.add(parent)
            if isinstance(children, list) and len(children) > 0:
                if all(isinstance(child, dict) for child in children):
                    if set(children[0]) == set().union(*children):
                        row = {}
                        for name in children[0]:
                            key = (parent,) + (name if isinstance(name, tuple) else (name,))
                            for child in children:
                                if len(children) == 1:
                                    row[key] = child[name]
                                else:
                                    row.setdefault(key, []).append(child[name])
                        rows.append(row)
                        parents.add(parent)
        data = dict(data)
        for parent in parents:
            del data[parent]
        for row in rows:
            data.update(row)
    
        return {
            (key,) if not isinstance(key, tuple) else key: value
            for key, value in data.items()
        }
    
    
    rows = json.loads(
        json.dumps(json.loads(Path("json/eventlog.json").read_bytes())["data"]),
        object_pairs_hook=_json_flatten,
    )
    
    if isinstance(rows, dict):
        rows = [rows]
    
    rows = [
        {
            key: value[0] if (isinstance(value, list) and len(value) == 1) else value
            for key, value in row.items()
        }
        for row in rows
    ]
    
    df = pd.DataFrame.from_records(rows)
    
    print(df)
    

    Exploding the request headers:

    columns = [ ("request", "headers", name) for name in ("key", "value") ]
    df.explode(columns)[columns]
    
      (request, headers, key)       (request, headers, value)
    0                    Host                      ip_address
    0          Accept-Charset  iso-8859-1,utf-8;q=0.9,*;q=0.1
    0         Accept-Language                          info_7
    0              Connection                      Keep-Alive
    0                 Referer                          info_8