Search code examples
python json csv nested

How to convert nested json to csv with multiple different names?


I've been trying to convert a nested JSON file to CSV. Here is a small example of the JSON file.

 json_data =   
{"labels":
    {
      "longfilename01:png": {
        "events": {
          "-N8V6uUR__vvB0qv1lPb": {
            "t": "2022-08-02T19:54:23.608Z",
            "user": "bmEhwNCZT9Wiftgvsopb7vBjO9o1"
          }
        },
        "questions": {
          "would-you": {
            "-N8V6uUR__vvB0qv1lPb": {
              "answer": "no",
              "format": 1
            }
          }
        }
      },
      "longfilename02:png": {
        "events": {
          "-N8ILnaH-1ylwp2LGvtP": {
            "t": "2022-07-31T08:24:23.698Z",
            "user": "Qf7C5cXQkXfQanxKPR0rsKW4QzE2"
          }
        },
        "questions": {
          "would-you": {
            "-N8ILnaH-1ylwp2LGvtP": {
              "answer": "yes",
              "format": 1
            }
          }
        }
      }
    }
}

I've tried multiple ways to get this output:

Labels Event User Time Answer
Long filename 01 -N8V6uUR__vvB0qv1lPb bmEhwNCZT9Wiftgvsopb7vBjO9o1 2022-08-02T19:54:23.608Z no
Long filename 02 -N8ILnaH-1ylwp2LGvtP Qf7C5cXQkXfQanxKPR0rsKW4QzE2 2022-07-31T08:24:23.698Z yes

If I normalise with:

# Open the file in a context manager so the handle is closed even if
# parsing fails; JSON text is UTF-8, so decode it explicitly rather than
# relying on the platform default encoding.
with open('after_labels.json', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the nested structure into a table with one column per nested path.
df = pd.json_normalize(data)

Or try to flatten the file with multiple functions such as:

def flatten_json(json):
    """Collapse a nested dict into a single-level dict.

    Each leaf value is stored under the keys on its path joined with
    ``'__'``. List elements contribute their position (as a string) to the
    path. Empty dicts and empty lists produce no entries.

    Args:
        json: Mapping to flatten; its top-level keys start every path.

    Returns:
        A dict mapping ``'__'``-joined paths to leaf values, in traversal
        order.
    """
    def walk(path, node):
        # Yield (joined_path, leaf) pairs depth-first, preserving order.
        if isinstance(node, dict):
            for name, child in node.items():
                yield from walk(path + [name], child)
        elif isinstance(node, list):
            for position, child in enumerate(node):
                yield from walk(path + [str(position)], child)
        else:
            yield '__'.join(path), node

    result = {}
    for top_key in json.keys():
        result.update(walk([top_key], json[top_key]))
    return result

# NOTE: flatten_json returns a plain dict (one entry per leaf value), not a
# DataFrame, despite the variable being named `df`.
df = flatten_json(data)

or

from copy import deepcopy
import pandas


def cross_join(left, right):
    """Return the Cartesian product of two lists of row dicts.

    Every row in ``left`` is combined with every row in ``right``; on a key
    clash the value from the right row wins. Output rows are deep copies,
    so they share no objects with the inputs.

    Args:
        left: Rows to extend.
        right: Rows whose items are merged into each left row.

    Returns:
        A new list of merged rows, or ``left`` itself (not a copy) when
        ``right`` is empty.
    """
    # With nothing to join against, the left rows are handed back as-is.
    joined = left if not right else []
    pairs = ((base, extra) for base in left for extra in right)
    for base, extra in pairs:
        merged = deepcopy(base)
        merged.update(extra)
        # Copy again so values taken from the right row are not shared.
        joined.append(deepcopy(merged))
    return joined


def flatten_list(data):
    """Lazily yield the non-list elements of an arbitrarily nested list.

    Elements are produced in depth-first, left-to-right order. Only ``list``
    instances are descended into; tuples, strings, dicts and other values
    are yielded unchanged.

    Args:
        data: Iterable whose elements may themselves be lists.

    Yields:
        Each element that is not a list.
    """
    # Stack of iterators: the top one is the list currently being walked.
    pending = [iter(data)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend; the outer iterator resumes once this one is done.
                pending.append(iter(item))
                break
            yield item
        else:
            # Current iterator exhausted without descending further.
            pending.pop()


def json_to_dataframe(data_in):
    """Flatten nested JSON-like data into a pandas DataFrame.

    Dict keys are joined with ``'.'`` to form column names. The values of a
    dict are cross-joined into combined rows, while the items of a list are
    stacked as separate rows under the same heading.

    Args:
        data_in: Nested structure of dicts, lists and scalar values.

    Returns:
        A ``pandas.DataFrame`` built from the flattened rows.
    """
    def rows_for(node, heading=''):
        # A list contributes the rows of each item, one after another.
        if isinstance(node, list):
            stacked = []
            for item in node:
                stacked.extend(flatten_list(rows_for(item, heading)))
            return stacked

        # A scalar becomes a single one-column row; the heading always
        # starts with a '.', which is dropped here.
        if not isinstance(node, dict):
            return [{heading[1:]: node}]

        # A dict combines the rows of all its values into wider rows.
        combined = [{}]
        for key, value in node.items():
            combined = cross_join(combined, rows_for(value, heading + '.' + key))
        return combined

    return pandas.DataFrame(rows_for(data_in))

# Flatten the loaded JSON into a DataFrame and display it. Column names are
# the dotted key paths, so every distinct key path yields its own column.
df = json_to_dataframe(data)
print(df)

It gives me 292 columns and I suspect this is because of the long unique filenames.

I can't change the JSON file before processing. Otherwise the simple solution would be to store the name as a value, e.g. "filename": "longfilename01:png", so that every record had the same keys and I wouldn't have this problem.

I would be grateful for any other clever ideas on how to solve this.


Solution

  • Try:

    json_data = {
        "labels": {
            "longfilename01:png": {
                "events": {
                    "-N8V6uUR__vvB0qv1lPb": {
                        "t": "2022-08-02T19:54:23.608Z",
                        "user": "bmEhwNCZT9Wiftgvsopb7vBjO9o1",
                    }
                },
                "questions": {
                    "would-you": {
                        "-N8V6uUR__vvB0qv1lPb": {"answer": "no", "format": 1}
                    }
                },
            },
            "longfilename02:png": {
                "events": {
                    "-N8ILnaH-1ylwp2LGvtP": {
                        "t": "2022-07-31T08:24:23.698Z",
                        "user": "Qf7C5cXQkXfQanxKPR0rsKW4QzE2",
                    }
                },
                "questions": {
                    "would-you": {
                        "-N8ILnaH-1ylwp2LGvtP": {"answer": "yes", "format": 1}
                    }
                },
            },
        }
    }
    
    
    # Build one row per label. Only the first event of each label is used,
    # together with the first answer of its first question.
    records = []
    for label, entry in json_data["labels"].items():
        event_id, event = list(entry["events"].items())[0]
        first_question = list(entry["questions"].values())[0]
        first_reply = list(first_question.values())[0]
        records.append(
            {
                "Labels": label,
                "Event": event_id,
                "User": event["user"],
                "Time": event["t"],
                "Answer": first_reply["answer"],
            }
        )

    df = pd.DataFrame(records)
    print(df)
    

    Prints:

                   Labels                 Event                          User                      Time Answer
    0  longfilename01:png  -N8V6uUR__vvB0qv1lPb  bmEhwNCZT9Wiftgvsopb7vBjO9o1  2022-08-02T19:54:23.608Z     no
    1  longfilename02:png  -N8ILnaH-1ylwp2LGvtP  Qf7C5cXQkXfQanxKPR0rsKW4QzE2  2022-07-31T08:24:23.698Z    yes