python, json, data-cleaning, large-files

Loading and cleaning a very large JSON file


I'm working on an image classification project using the Snapshot Serengeti dataset. The dataset comes with a single very large JSON file (5GB+) that contains four top-level keys. I specifically need the values contained in the "images": [{...}, {...}, ...] array for training. The file is too large for me to open and read directly, or to load into a dictionary.
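
The top level looks roughly like this (besides "images", the other keys include things like "info" and "categories", which I don't need):

{
"info": {...},
"categories": [...],
"images": [{...}, {...}, ...],
...
}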

The image entries in the file are formatted like this:

{
"id": "S1/B04/B04_R1/S1_B04_R1_PICT0003",
"file_name": "S1/B04/B04_R1/S1_B04_R1_PICT0003.JPG",
"frame_num": 1,
"seq_id": "SER_S1#B04#1#3",
"width": 2048,
"height": 1536,
"corrupt": false,
"location": "B04",
"seq_num_frames": 1,
"datetime": "2010-07-20 06:14:06"
},

I've tried to loop over the file in 100MB chunks, but the file also has formatting issues (single quotes, NaN values) that need to be addressed first or errors are thrown. This is the code I tried:

with open(labels_json) as f:
    for chunk in iter(lambda: f.read(100*1024*1024), ""):
        data = json.loads(chunk)

Since the images are organized into 11 seasons, I instead tried to write the data out to 11 separate files that can be loaded individually, using the script below, but the cloud storage gets eaten up before even a single season is stored. I'm new to data storage issues like this, so there could definitely be an issue in my script that's causing the files to be written inefficiently. Any help would be very much appreciated.

import json

labels_json = annotations_directory + "SS_Labels.json"

get_filename = lambda n : f"SS_labels_S{i}.json"

# Define the 11 output files
seasons = {}
started = {}
for i in range(1, 12):
    filename = get_filename(i)
    seasons[i] = open(filename, "w")
    seasons[i].write('[')
    started[i] = False

def seperate_seasons(dir):
    line_num = 0
    decoder = json.JSONDecoder()
    with open(dir, 'r') as labels:
        begin_writing = False
        buffer = []
        id = 1
        for line in labels:
            if not begin_writing: # Begin writing for the line after "images"
                if 'images' in line:
                    begin_writing = True
            else:
                line.replace('NaN', 'null') # clean NaN values
                line.replace("'", '"')      # clean incorrect key values

                buffer.append(line.strip()) # add line to buffer

                getID = lambda l: int(line.split('"')[3].split('/')[0][1])
                if '"id"' in line or "'id'" in line:
                    previous_id = id
                    id = getID(line)        # get id of object

                if line.strip() == '},' or line.strip() == '}': # when the object has finished, write it to the appropriate image folder
                    label = ','.join(buffer)
                    if label[-1] != ',':
                        label += ','

                    if started[id] == False:
                        print(f'Beginning Season {id}')
                        started[id] = True

                        if id != 1:
                            seasons[previous_id].write(']')
                            seasons[previous_id].close()
                            del seasons[previous_id]


                    seasons[id].write(label)                    # add label entry to file

seperate_seasons(labels_json)

# Close all remaining label files
for season in seasons.values():
    season.write(']')
    season.close()

Solution

  • If you don't have the RAM to load the file into memory (and I don't blame you if you don't), you could use a couple of additional libraries to split the data up into more manageable files.

    This uses json-stream for, well, streaming in the JSON, orjson for a faster JSON encoder, and tqdm for a progress bar.
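    (All three are ordinary PyPI packages, so a plain pip install json-stream orjson tqdm should get you set up.)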

    The input is the original single JSON file, and the output folder out/ will end up containing the info and categories data from the JSON, as well as JSONL (aka JSON Lines, aka ND-JSON) files (i.e. JSON objects, one per line) à la

    {"id":"S1/B04/B04_R1/S1_B04_R1_PICT0001","file_name":"S1/B04/B04_R1/S1_B04_R1_PICT0001.JPG","frame_num":1,"seq_id":"SER_S1#B04#1#1","width":2048,"height":1536,"corrupt":false,"location":"B04","seq_num_frames":1,"datetime":"2010-07-18 16:26:14"}
    {"id":"S1/B04/B04_R1/S1_B04_R1_PICT0002","file_name":"S1/B04/B04_R1/S1_B04_R1_PICT0002.JPG","frame_num":1,"seq_id":"SER_S1#B04#1#2","width":2048,"height":1536,"corrupt":false,"location":"B04","seq_num_frames":1,"datetime":"2010-07-18 16:26:30"}
    {"id":"S1/B04/B04_R1/S1_B04_R1_PICT0003","file_name":"S1/B04/B04_R1/S1_B04_R1_PICT0003.JPG","frame_num":1,"seq_id":"SER_S1#B04#1#3","width":2048,"height":1536,"corrupt":false,"location":"B04","seq_num_frames":1,"datetime":"2010-07-20 06:14:06"}
    {"id":"S1/B04/B04_R1/S1_B04_R1_PICT0004","file_name":"S1/B04/B04_R1/S1_B04_R1_PICT0004.JPG","frame_num":1,"seq_id":"SER_S1#B04#1#4","width":2048,"height":1536,"corrupt":false,"location":"B04","seq_num_frames":1,"datetime":"2010-07-22 08:56:06"}
    

    JSONL files are readily handled by many tools, and can be parsed in Python with a simple for loop too. If you like, you could replace open with gzip.open to compress the JSONL files as you go.
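
    Reading one of those files back later is just a loop over its lines. A minimal sketch, assuming out/S1_B04.jsonl is one of the files produced by the script below (swap open for gzip.open(path, "rt") if you compressed them):

    import json

    # Sketch: iterate a generated JSONL file, one JSON object per line
    with open("out/S1_B04.jsonl", "r", encoding="utf-8") as f:
        for line in f:
            image = json.loads(line)
            print(image["file_name"], image["datetime"])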

    The json_stream API is a bit finicky, but here you go – works on my machine (json-stream==2.3.0).

    On my laptop, tqdm reports this processing 29594 images per second.

    import json_stream
    import orjson
    import os
    import tqdm


    def read_images(value):
        # Keep a small pool of open output files, keyed by filename,
        # so we don't run out of file descriptors.
        jsonl_files = {}

        with tqdm.tqdm(value, unit="image") as pbar:
            for image in pbar:
                image = dict(image)
                # Group by the first two path components of the id, e.g. "S1/B04"
                prefix = "/".join(image["id"].split("/")[:2])

                filename = f"out/{prefix.replace('/', '_')}.jsonl"

                if filename not in jsonl_files:
                    jsonl_files[filename] = open(filename, "ab")
                    if len(jsonl_files) > 50:
                        # Close the oldest open file (not the one just opened)
                        # so the number of open handles stays bounded.
                        oldest = next(iter(jsonl_files))
                        jsonl_files.pop(oldest).close()
                    pbar.set_description(f"Writing {filename}")

                jsonl_files[filename].write(orjson.dumps(image))
                jsonl_files[filename].write(b"\n")

        # Close whatever is still open once all images have been written.
        for jsonl_file in jsonl_files.values():
            jsonl_file.close()


    def main():
        os.makedirs("out", exist_ok=True)  # make sure the output folder exists
        with open("/Users/akx/Downloads/SnapshotSerengeti_S1-11_v2.1.json", "rb") as f:
            data = json_stream.load(f)
            for key, value in data.items():
                if key == "info":
                    value = dict(value.persistent().items())
                    with open("out/info.json", "wb") as out_f:
                        out_f.write(orjson.dumps(value))
                elif key == "categories":
                    value = [dict(d) for d in value.persistent()]
                    with open("out/categories.json", "wb") as out_f:
                        out_f.write(orjson.dumps(value))
                elif key == "images":
                    # Stream the huge images array and split it into JSONL files.
                    read_images(value.persistent())


    if __name__ == "__main__":
        main()
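
    And to get back something like the per-season split you were originally after, you can glob the per-site files for a season into one list. A minimal sketch, relying on the out/S1_B04.jsonl-style names the script above writes:

    import glob
    import json

    # Sketch: gather every Season 1 entry from the per-site JSONL files
    season_1 = []
    for path in sorted(glob.glob("out/S1_*.jsonl")):
        with open(path, "r", encoding="utf-8") as f:
            season_1.extend(json.loads(line) for line in f)

    print(len(season_1), "Season 1 images")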