Search code examples
jsonpython-3.xpandasjson-normalize

Extract data from Json into Pandas (Python)


I'm trying to extract data in a dataframe. My attempts with pd.json_normalize did not work... I must be doing something wrong.

Exemple :

{
    "data": [
        {
            "date": {
                "01_07_2020": [
                    {
                        "customerId": "977869f4e181e656d",
                        "data": [
                            {
                                "_id": "5e1c75498de14f0bb5d",
                                "sensorType": "FLAT",
                                "external": 0.0,
                                "stats": {
                                    "min": 19.5,
                                    "max": 20.75,
                                    "avg": 20.0714285714,
                                    "diff": -7.9478021978,
                                    "last": 19.75
                                }
                            },
...
                                }
                            }
                        ]
                    },
                    {
                        "customerId": "5efaf52b0b26e2ae31816",
                        "data": [
                            {
                                "_id": "5efb44604bd91a7cde4c",
                                "sensorType": "FLAT",
                                "external": 0.0,
                                "stats": {
                                    "min": 23.0,
                                    "max": 23.0,
                                    "avg": 23.0,
                                    "diff": null,
                                    "last": 23.0
                                }
                            },
                            {
                                "_id": "5efb44604bd9126e2de4d",
                                "sensorType": "FLAT",
                                "external": 0.0,
                                "stats": {
                                    "min": 17.75,
                                    "max": 19.75,
                                    "avg": 18.5833333333,
                                    "diff": null,
                                    "last": 17.75
                                }
                            }
                        ]
                    }
                ]
            },
            "year": 2020
        },
        {
            "date": {
                "01_07_2021": 
etc...

Expected result :

_id sensorType extarnal min max avg diff last
5e1c75498de14f0bb5d FLAT 0.0 17.75 19.5 20.75 20.0714285714 -7.9478021978

I don't show my results, I am very far from getting what I want.


Solution

  • You can try:

    import json
    import pandas as pd
    
    
    json_data = r"""{
        "data": [
            {
                "date": {
                    "01_07_2020": [
                        {
                            "customerId": "977869f4e181e656d",
                            "data": [
                                {
                                    "_id": "5e1c75498de14f0bb5d",
                                    "sensorType": "FLAT",
                                    "external": 0.0,
                                    "stats": {
                                        "min": 19.5,
                                        "max": 20.75,
                                        "avg": 20.0714285714,
                                        "diff": -7.9478021978,
                                        "last": 19.75
                                    }
                                }
                            ]
                        },
                        {
                            "customerId": "5efaf52b0b26e2ae31816",
                            "data": [
                                {
                                    "_id": "5efb44604bd91a7cde4c",
                                    "sensorType": "FLAT",
                                    "external": 0.0,
                                    "stats": {
                                        "min": 23.0,
                                        "max": 23.0,
                                        "avg": 23.0,
                                        "diff": null,
                                        "last": 23.0
                                    }
                                },
                                {
                                    "_id": "5efb44604bd9126e2de4d",
                                    "sensorType": "FLAT",
                                    "external": 0.0,
                                    "stats": {
                                        "min": 17.75,
                                        "max": 19.75,
                                        "avg": 18.5833333333,
                                        "diff": null,
                                        "last": 17.75
                                    }
                                }
                            ]
                        }
                    ]
                }
            }
        ]
    }"""
    
    
    def get_data(o):
        if isinstance(o, dict):
            if "_id" in o and "stats" in o:
                yield o
            else:
                for v in o.values():
                    yield from get_data(v)
        elif isinstance(o, list):
            for v in o:
                yield from get_data(v)
    
    
    data = json.loads(json_data)
    
    all_data = []
    for d in get_data(data):
        all_data.append(
            {"_id": d["_id"], "sensorType": d["sensorType"], **d["stats"]}
        )
    
    df = pd.DataFrame(all_data)
    print(df)
    

    Prints:

                         _id sensorType    min    max        avg      diff   last
    0    5e1c75498de14f0bb5d       FLAT  19.50  20.75  20.071429 -7.947802  19.75
    1   5efb44604bd91a7cde4c       FLAT  23.00  23.00  23.000000       NaN  23.00
    2  5efb44604bd9126e2de4d       FLAT  17.75  19.75  18.583333       NaN  17.75
    

    EDIT: Different method to create the dataframe (with customerId and date):

    data = json.loads(json_data)
    
    all_data = []
    for d in data["data"]:
        for dt, dd in d["date"].items():
            for ddd in dd:
                customer_id = ddd["customerId"]
                for dddd in ddd["data"]:
                    all_data.append(
                        {
                            "date": dt,
                            "customerId": customer_id,
                            "_id": dddd["_id"],
                            "sensorType": dddd["sensorType"],
                            **dddd["stats"],
                        }
                    )
    
    df = pd.DataFrame(all_data)
    print(df)
    

    Prints:

             date             customerId                    _id sensorType    min    max        avg      diff   last
    0  01_07_2020      977869f4e181e656d    5e1c75498de14f0bb5d       FLAT  19.50  20.75  20.071429 -7.947802  19.75
    1  01_07_2020  5efaf52b0b26e2ae31816   5efb44604bd91a7cde4c       FLAT  23.00  23.00  23.000000       NaN  23.00
    2  01_07_2020  5efaf52b0b26e2ae31816  5efb44604bd9126e2de4d       FLAT  17.75  19.75  18.583333       NaN  17.75