I'm working on an image classification project using the Snapshot Serengeti dataset. The dataset comes with a single very large JSON file (5GB+) that contains four top-level keys. I specifically need the values contained in the "images": [{...}, {...}, ...] array for training. The file is too large for me to open and read directly, or to load into a dictionary.
The image entries in the file are formatted like this:
{
    "id": "S1/B04/B04_R1/S1_B04_R1_PICT0003",
    "file_name": "S1/B04/B04_R1/S1_B04_R1_PICT0003.JPG",
    "frame_num": 1,
    "seq_id": "SER_S1#B04#1#3",
    "width": 2048,
    "height": 1536,
    "corrupt": false,
    "location": "B04",
    "seq_num_frames": 1,
    "datetime": "2010-07-20 06:14:06"
},
I've tried looping over the file in 100MB chunks, but the file also has formatting issues (single quotes, NaN values) that need to be addressed first, or errors are thrown. The code I tried is below:
with open(labels_json) as f:
    for chunk in iter(lambda: f.read(100*1024*1024), ""):
        data = json.loads(chunk)
As the images are organized into 11 seasons, I instead tried to write the data to 11 separate files that can be loaded individually with the script below, but the cloud storage gets eaten up before even a single season is stored. I'm new to data storage issues like this, so there could definitely be an issue in my script that's causing the files to be written inefficiently. Any help would be very much appreciated.
import json

labels_json = annotations_directory + "SS_Labels.json"
get_filename = lambda n: f"SS_labels_S{n}.json"

# Define the 11 output files
seasons = {}
started = {}
for i in range(1, 12):
    filename = get_filename(i)
    seasons[i] = open(filename, "w")
    seasons[i].write('[')
    started[i] = False
def seperate_seasons(dir):
    line_num = 0
    decoder = json.JSONDecoder()
    with open(dir, 'r') as labels:
        begin_writing = False
        buffer = []
        id = 1
        for line in labels:
            if not begin_writing:  # Begin writing for the line after "images"
                if 'images' in line:
                    begin_writing = True
            else:
                line.replace('NaN', 'null')  # clean NaN values
                line.replace("'", '"')  # clean incorrect key values
                buffer.append(line.strip())  # add line to buffer

                getID = lambda l: int(line.split('"')[3].split('/')[0][1])
                if '"id"' in line or "'id'" in line:
                    previous_id = id
                    id = getID(line)  # get id of object

                if line.strip() == '},' or line.strip() == '}':  # when the object has finished, write it to the appropriate image folder
                    label = ','.join(buffer)
                    if label[-1] != ',':
                        label += ','

                    if started[id] == False:
                        print(f'Beginning Season {id}')
                        started[id] = True

                        if id != 1:
                            seasons[previous_id].write(']')
                            seasons[previous_id].close()
                            del seasons[previous_id]

                    seasons[id].write(label)  # add label entry to file

seperate_seasons(labels_json)

# Close all remaining label files
for season in seasons.values():
    season.write(']')
    season.close()
If you don't have the RAM to load the file into memory (and I don't blame you if you don't), you could use a couple of additional libraries to split the data up into more manageable files.
This uses json-stream for, well, streaming in JSON, orjson for a faster JSON encoder, and tqdm for a progress bar.
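All three are on PyPI, so if you don't already have them, something along these lines should get you set up (exact versions are up to you):

pip install json-stream orjson tqdm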
The input is the original single JSON file, and the output folder out/ will end up containing the info and categories data from the JSON, as well as JSONL (aka JSON Lines, aka ND-JSON) files (i.e. JSON objects, one per line) à la
{"id":"S1/B04/B04_R1/S1_B04_R1_PICT0001","file_name":"S1/B04/B04_R1/S1_B04_R1_PICT0001.JPG","frame_num":1,"seq_id":"SER_S1#B04#1#1","width":2048,"height":1536,"corrupt":false,"location":"B04","seq_num_frames":1,"datetime":"2010-07-18 16:26:14"}
{"id":"S1/B04/B04_R1/S1_B04_R1_PICT0002","file_name":"S1/B04/B04_R1/S1_B04_R1_PICT0002.JPG","frame_num":1,"seq_id":"SER_S1#B04#1#2","width":2048,"height":1536,"corrupt":false,"location":"B04","seq_num_frames":1,"datetime":"2010-07-18 16:26:30"}
{"id":"S1/B04/B04_R1/S1_B04_R1_PICT0003","file_name":"S1/B04/B04_R1/S1_B04_R1_PICT0003.JPG","frame_num":1,"seq_id":"SER_S1#B04#1#3","width":2048,"height":1536,"corrupt":false,"location":"B04","seq_num_frames":1,"datetime":"2010-07-20 06:14:06"}
{"id":"S1/B04/B04_R1/S1_B04_R1_PICT0004","file_name":"S1/B04/B04_R1/S1_B04_R1_PICT0004.JPG","frame_num":1,"seq_id":"SER_S1#B04#1#4","width":2048,"height":1536,"corrupt":false,"location":"B04","seq_num_frames":1,"datetime":"2010-07-22 08:56:06"}
JSONL files are readily handled by many tools, and can be parsed in Python with a simple for loop too, as sketched below. If you like, you could replace open with gzip.open to compress the JSONL files as you go.
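For instance, reading one of the per-location files back is just a matter of parsing one JSON object per line; a minimal sketch (the out/S1_B04.jsonl name is just what the script further down produces for the example data):

import json

images = []
with open("out/S1_B04.jsonl") as f:  # or gzip.open("out/S1_B04.jsonl.gz", "rt") if you compressed
    for line in f:
        images.append(json.loads(line))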
The json_stream API is a bit finicky, but here you go – works on my machine (json-stream==2.3.0).
On my laptop, tqdm reports this processing 29594 images per second.
import os

import json_stream
import orjson
import tqdm
def read_images(value):
    jsonl_files = {}
    with tqdm.tqdm(value, unit="image") as pbar:
        for image in pbar:
            image = dict(image)
            # Group by the first two path components of the id, e.g. "S1/B04"
            prefix = "/".join(image["id"].split("/")[:2])
            filename = f"out/{prefix.replace('/', '_')}.jsonl"
            if filename not in jsonl_files:
                jsonl_files[filename] = open(filename, "ab")
                # Cap the number of open file handles; close the oldest one.
                # (Files are opened in append mode, so they can be reopened
                # later without losing data.)
                if len(jsonl_files) > 50:
                    oldest = next(iter(jsonl_files))
                    jsonl_files.pop(oldest).close()
            pbar.set_description(f"Writing {filename}")
            jsonl_files[filename].write(orjson.dumps(image))
            jsonl_files[filename].write(b"\n")
def main():
    os.makedirs("out", exist_ok=True)  # make sure the output folder exists
    with open("/Users/akx/Downloads/SnapshotSerengeti_S1-11_v2.1.json", "rb") as f:
        data = json_stream.load(f)
        for key, value in data.items():
            if key == "info":
                value = dict(value.persistent().items())
                with open("out/info.json", "wb") as out_f:
                    out_f.write(orjson.dumps(value))
            elif key == "categories":
                value = [dict(d) for d in value.persistent()]
                with open("out/categories.json", "wb") as out_f:
                    out_f.write(orjson.dumps(value))
            elif key == "images":
                read_images(value.persistent())

if __name__ == "__main__":
    main()
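Since the season is the first path component of each image id, the per-location filenames produced above all start with the season number, so loading one season's worth of labels for training is then just a matter of reading the matching JSONL files. A minimal sketch, assuming the out/ folder and naming scheme from the script above:

import glob
import json

season_1 = []
for path in sorted(glob.glob("out/S1_*.jsonl")):
    with open(path) as f:
        for line in f:
            season_1.append(json.loads(line))

print(f"Loaded {len(season_1)} season 1 image records")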