How to convert a nested JSON to CSV

I want to convert nested json into csv format including sub rows for grouped list/dict.

Here my json

data =\
{
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description"
    },
    "packages": [{
            "id": "1",
            "label": "Package 1",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }
            ]
        }, {
            "id": "2",
            "label": "Package 3",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }
            ]
        }
    ]
}

import pandas as pd

df = pd.json_normalize(data)

# display(df)
  description id       name                                                                                                                                                                                                packages item.description item.id
0         HLD  1  HIGHLEVEL  [{'id': '1', 'label': 'Package 1', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}, {'id': '2', 'label': 'Package 3', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}]      description      11

Output of JSON to CSV Converter

"id","name","description","item__id","item__description","packages__id","packages__label","packages__products__id","packages__products__price"
"1","HIGHLEVEL","HLD","11","description","1","Package 1","1","5"
"","","","","","","","2","3"
"","","","","","2","Package 3","1","5"
"","","","","","","","2","3"

I tried pandas normalization but the results are not the same as wanted. JSON Array are not converted into sub rows in csv. I want to keep empty string in the csv.

I want to do the same but with a Python Script.

Solution

This should work for you:

from copy import deepcopy
import pandas


def cross_join(left, right):
    new_rows = [] if right else left
    for left_row in left:
        for right_row in right:
            temp_row = deepcopy(left_row)
            for key, value in right_row.items():
                temp_row[key] = value
            new_rows.append(deepcopy(temp_row))
    return new_rows


def flatten_list(data):
    for elem in data:
        if isinstance(elem, list):
            yield from flatten_list(elem)
        else:
            yield elem


def json_to_dataframe(data_in):
    def flatten_json(data, prev_heading=''):
        if isinstance(data, dict):
            rows = [{}]
            for key, value in data.items():
                rows = cross_join(rows, flatten_json(value, prev_heading + '_' + key))
        elif isinstance(data, list):
            rows = []
            if(len(data) != 0):
                for i in range(len(data)):
                    [rows.append(elem) for elem in flatten_list(flatten_json(data[i], prev_heading))]
            else:
                data.append("")
                [rows.append(elem) for elem in flatten_list(flatten_json(data[0], prev_heading))]
        else:
            rows = [{prev_heading[1:]: data}]
        return rows

    return pandas.DataFrame(flatten_json(data_in))

def remove_duplicates(df):
    columns = list(df)[:7]
    for c in columns:
        df[c] = df[c].mask(df[c].duplicated(), "")

    return df


if __name__ == '__main__':
    df = json_to_dataframe(data)
    df = remove_duplicates(df)

    print(df)
    df.to_csv('data.csv', index=False)

Input 01:

data = {
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description"
    },
    "packages": [{
            "id": "1",
            "label": "Package 1",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }, {
                    "id": "3",
                    "price": 9
                }
            ]
        }, {
            "id": "2",
            "label": "Package 3",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }, {
                    "id": "3",
                    "price": 9
                }
            ]
        }
    ]
}

Output 01:

Input 02:

data = {
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description"
    },
    "packages": [{
            "id": "1",
            "label": "Package 1",
            "products": []
        }, {
            "id": "2",
            "label": "Package 3",
            "products": []
        }
    ]
}

Output 02:

Hope it will resolve your issue. If you need any explanation then please let me know.

Thanks