Search code examples
pythonjsonpandascsvjson-normalize

How to convert a nested JSON to CSV


I want to convert nested json into csv format including sub rows for grouped list/dict.

Here my json

data =\
{
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description"
    },
    "packages": [{
            "id": "1",
            "label": "Package 1",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }
            ]
        }, {
            "id": "2",
            "label": "Package 3",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }
            ]
        }
    ]
}
import pandas as pd

df = pd.json_normalize(data)

# display(df)
  description id       name                                                                                                                                                                                                packages item.description item.id
0         HLD  1  HIGHLEVEL  [{'id': '1', 'label': 'Package 1', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}, {'id': '2', 'label': 'Package 3', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}]      description      11

Output of JSON to CSV Converter

"id","name","description","item__id","item__description","packages__id","packages__label","packages__products__id","packages__products__price"
"1","HIGHLEVEL","HLD","11","description","1","Package 1","1","5"
"","","","","","","","2","3"
"","","","","","2","Package 3","1","5"
"","","","","","","","2","3"

I tried pandas normalization but the results are not the same as wanted. JSON Array are not converted into sub rows in csv. I want to keep empty string in the csv.

I want to do the same but with a Python Script.


Solution

  • This should work for you:

    from copy import deepcopy
    import pandas
    
    
    def cross_join(left, right):
        new_rows = [] if right else left
        for left_row in left:
            for right_row in right:
                temp_row = deepcopy(left_row)
                for key, value in right_row.items():
                    temp_row[key] = value
                new_rows.append(deepcopy(temp_row))
        return new_rows
    
    
    def flatten_list(data):
        for elem in data:
            if isinstance(elem, list):
                yield from flatten_list(elem)
            else:
                yield elem
    
    
    def json_to_dataframe(data_in):
        def flatten_json(data, prev_heading=''):
            if isinstance(data, dict):
                rows = [{}]
                for key, value in data.items():
                    rows = cross_join(rows, flatten_json(value, prev_heading + '_' + key))
            elif isinstance(data, list):
                rows = []
                if(len(data) != 0):
                    for i in range(len(data)):
                        [rows.append(elem) for elem in flatten_list(flatten_json(data[i], prev_heading))]
                else:
                    data.append("")
                    [rows.append(elem) for elem in flatten_list(flatten_json(data[0], prev_heading))]
            else:
                rows = [{prev_heading[1:]: data}]
            return rows
    
        return pandas.DataFrame(flatten_json(data_in))
    
    def remove_duplicates(df):
        columns = list(df)[:7]
        for c in columns:
            df[c] = df[c].mask(df[c].duplicated(), "")
    
        return df
    
    
    if __name__ == '__main__':
        df = json_to_dataframe(data)
        df = remove_duplicates(df)
    
        print(df)
        df.to_csv('data.csv', index=False)
    

    Input 01:

    data = {
        "id": "1",
        "name": "HIGHLEVEL",
        "description": "HLD",
        "item": {
            "id": "11",
            "description": "description"
        },
        "packages": [{
                "id": "1",
                "label": "Package 1",
                "products": [{
                        "id": "1",
                        "price": 5
                    }, {
                        "id": "2",
                        "price": 3
                    }, {
                        "id": "3",
                        "price": 9
                    }
                ]
            }, {
                "id": "2",
                "label": "Package 3",
                "products": [{
                        "id": "1",
                        "price": 5
                    }, {
                        "id": "2",
                        "price": 3
                    }, {
                        "id": "3",
                        "price": 9
                    }
                ]
            }
        ]
    }
    

    Output 01:

    enter image description here

    Input 02:

    data = {
        "id": "1",
        "name": "HIGHLEVEL",
        "description": "HLD",
        "item": {
            "id": "11",
            "description": "description"
        },
        "packages": [{
                "id": "1",
                "label": "Package 1",
                "products": []
            }, {
                "id": "2",
                "label": "Package 3",
                "products": []
            }
        ]
    }
    

    Output 02: enter image description here

    Hope it will resolve your issue. If you need any explanation then please let me know.

    Thanks