Tags: python, keras, h5py, saving-data

How to Prevent Memory Filling Up When Reading in a Large Folder of Images & Converting to H5 in Python


I'm attempting to read in ~1.1M image and float pairs and save them in .h5 file chunks to be read by a Keras generator later on. As you can see below, the code reads in 25,000 lines at a time and saves each chunk into its own .h5 file. For some reason, my RAM fills up at around the 22nd loop.

My call to the garbage collector lowers memory usage a bit after each loop, but I'm apparently not releasing the right objects, so memory usage takes two steps forward and one step back.

Any help would be greatly appreciated!


import gc
from multiprocessing import Pool

import h5py
import numpy as np
from p_tqdm import p_map   #p_map is assumed to come from the p_tqdm package

import process_data_helpers   #project helper module referenced below
#the per-line loaders are assumed to live in the same helper module
from process_data_helpers import get_data_from_line_master, get_data_from_line_master_flipped

def augment_clean_data_master(master_data):
    data_list = master_data
    print("Length of Data List: ", len(data_list))
    num_processors = 8
    print("Doing Flipped Image Data.")
    #note: this Pool is never used or closed; p_map manages its own workers
    p = Pool(processes=num_processors)
    flipped_output = p_map(get_data_from_line_master_flipped, data_list)
    print("Doing OG Image Data.")
    clean_output = p_map(get_data_from_line_master, data_list)
    print("Merging Data.")

    aug = clean_output + flipped_output
    return aug

def augment_data_and_save_as_hdf5():
    #take master data, augment it, save in hdf5 format
    file_name = "./master/master.txt"

    #read in image name list
    with open(file_name) as f:
        data_list = f.readlines()
        #make sublists of [angle, name, speed]
        data_list = [x.split(',') for x in data_list]


    #walk the data in chunks of 25,000 lines and write each chunk to its own .h5 file
    num_chunks = int(len(data_list) / 25000)
    for i in range(num_chunks):
        #each chunk covers 25,000 lines; the last chunk also picks up the remainder
        start_index = i * 25000
        stop_index = (i + 1) * 25000

        print("*" * 30)
        print("Chunk Index: ", i)

        #if we're at the last chunk, run from the start index to the end of the list
        if i == num_chunks - 1:
            chunk = data_list[start_index:]
        else:
            chunk = data_list[start_index:stop_index]

        #read in the lines from the text data & pull the necessary images and angles
        partial_augmented_image_data_list = process_data_helpers.augment_clean_data_master(chunk)

        #make empty lists to store images temporarily
        X = []
        y = []

        #cull the error lines and pull out the images and angles
        for element in partial_augmented_image_data_list:
            try:
                if (element[1] == 'None'):
                    pass
                else:
                    X.append(element[0])
                    y.append(float(element[1]))
            except:
                pass

        X = np.array(X).reshape(-1, 66, 200, 3)
        y = np.array(y).reshape(len(y), 1)

        print("X Shape: ", X.shape)
        print("y Shape: ", y.shape)

        #write this chunk out to its own .h5 file
        #(an earlier approach, kept commented out below, appended everything to one resizable file)
        with h5py.File('./masterArrays/Data_Chunk_' + str(i) + '.h5', 'w') as hf:
            hf.create_dataset("X", data=X, compression="gzip", chunks=True, maxshape=(50000, 66, 200, 3))
            hf.create_dataset("y", data=y, compression="gzip", chunks=True, maxshape=(50000, 1))

        #earlier approach (kept for reference): append every chunk to one resizable Data.h5
        '''
        else:
            with h5py.File('./masterArrays/Data.h5', 'a') as hf:
                hf["X"].resize((hf["X"].shape[0] + X.shape[0]), axis = 0)
                hf["X"][-X.shape[0]:] = X

                hf["y"].resize((hf["y"].shape[0] + y.shape[0]), axis = 0)
                hf["y"][-y.shape[0]:] = y
        '''
        #explicitly drop the big arrays and run the garbage collector before the next loop
        del X
        del y
        gc.collect()

    print("Augmented & Saved Arrays.")

Solution

  • In the end I wrapped the per-chunk processing in its own function and, inside the for loop, spun up a fresh process for each chunk and joined it before starting the next. When a process exits, its memory is returned to the operating system, so RAM no longer creeps up between chunks. Seems to work well.

    import gc

    import h5py
    import numpy as np

    import process_data_helpers   #project helper module used below

    def mrMeeseeks(i, chunk_size, data_list):
        #each chunk covers chunk_size lines; the last chunk also picks up the remainder
        start_index = i * chunk_size
        stop_index = (i + 1) * chunk_size

        print("*" * 30)
        print("Chunk Index: ", i)

        #if we're at the last chunk, run from the start index to the end of the list
        if i == int(len(data_list) / chunk_size) - 1:
            chunk = data_list[start_index:]
        else:
            chunk = data_list[start_index:stop_index]

        #read in the lines from the text data & pull the necessary images and angles
        partial_augmented_image_data_list = process_data_helpers.augment_clean_data_master(chunk)

        #preallocate arrays for the chunk, sized to the actual chunk
        #(which can be larger than chunk_size on the last iteration)
        n = len(partial_augmented_image_data_list)
        X = np.empty([n, 66, 200, 3])
        y = np.empty([n, 1])
    
        #cull the error lines and pull out the images and angles
        kept = 0
        for element in partial_augmented_image_data_list:
            try:
                if element[1] == 'None':
                    print("Bad line found, keep this check")
                else:
                    X[kept] = element[0]
                    y[kept] = float(element[1])
                    kept += 1
            except Exception:
                pass

        #trim off any rows that were skipped so no uninitialised values get saved
        X = X[:kept].reshape(-1, 66, 200, 3)
        y = y[:kept].reshape(-1, 1)
    
    
        #write this chunk out to its own .h5 file
        #(an earlier approach, kept commented out below, appended everything to one resizable file)
        with h5py.File('./masterArrays/Data_Chunk_' + str(i) + '.h5', 'w') as hf:
            hf.create_dataset("X", data=X, compression="gzip", chunks=True, maxshape=(50000, 66, 200, 3))
            hf.create_dataset("y", data=y, compression="gzip", chunks=True, maxshape=(50000, 1))
    
        #earlier approach (kept for reference): append every chunk to one resizable Data.h5
        '''
        else:
            with h5py.File('./masterArrays/Data.h5', 'a') as hf:
                hf["X"].resize((hf["X"].shape[0] + X.shape[0]), axis = 0)
                hf["X"][-X.shape[0]:] = X

                hf["y"].resize((hf["y"].shape[0] + y.shape[0]), axis = 0)
                hf["y"][-y.shape[0]:] = y
        '''
    
        print("X Shape: ", X.shape)
        print("y Shape: ", y.shape)
    
        #quick sanity check on the first two samples
        print(X[0])
        print(y[0])
        print("((((((((((()))))))))))")
        print(X[1])
        print(y[1])
    
        #explicitly drop the big objects before the process exits
        #(the OS reclaims the whole process's memory when it dies anyway)
        del X
        del y
        del partial_augmented_image_data_list
        gc.collect()
    
    import multiprocessing
    
    def augment_data_and_save_as_hdf5():
        #take master data, augment it, save in hdf5 format
        file_name = "./master/master.txt"
    
        #read in image name list
        with open(file_name) as f:
            data_list = f.readlines()
            #make sublists of [angle, name, speed]
            data_list = [x.split(',') for x in data_list]
    
        chunk_size = 25000
    
        #walk the data in chunks of chunk_size lines, one worker process per chunk
        for i in range(int(len(data_list)/chunk_size)):
            #start a worker process for this chunk and wait for it to finish, so its
            #memory is fully released before the next chunk starts
            p = multiprocessing.Process(target=mrMeeseeks, args=(i, chunk_size, data_list,))
            p.start()
            p.join()
            #TODO: start processes in batches and join every 6 iterations so up to 6
            #cores run at once while memory stays bounded (see the sketch below)
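
  • One possible next step, matching the TODO above: start the worker processes in small batches (6 in this sketch, an arbitrary choice) and join the whole batch before launching the next, so several chunks are processed in parallel while memory stays bounded. This is only a sketch built on the mrMeeseeks worker above, not part of the original answer:

    import multiprocessing

    def augment_data_and_save_in_batches(data_list, chunk_size=25000, batch_size=6):
        #same chunking as above, but batch_size worker processes run at a time
        num_chunks = int(len(data_list) / chunk_size)
        for batch_start in range(0, num_chunks, batch_size):
            procs = []
            for i in range(batch_start, min(batch_start + batch_size, num_chunks)):
                p = multiprocessing.Process(target=mrMeeseeks, args=(i, chunk_size, data_list))
                p.start()
                procs.append(p)
            #wait for the whole batch to finish (and release its memory) before the next one
            for p in procs:
                p.join()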
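
  • Since these chunk files are meant "to be read in by a keras generator later on", here is one way that reader could look: a Keras Sequence that serves batches straight from the saved Data_Chunk_*.h5 files. The file pattern and dataset names come from the code above; the ChunkSequence class itself and the tf.keras import path are assumptions, shown only as a sketch:

    import glob

    import h5py
    from tensorflow.keras.utils import Sequence

    class ChunkSequence(Sequence):
        #serves (X, y) batches from the Data_Chunk_*.h5 files written above
        def __init__(self, pattern='./masterArrays/Data_Chunk_*.h5', batch_size=32):
            self.batch_size = batch_size
            #one (file_path, start_row) entry per batch
            self.index = []
            for path in sorted(glob.glob(pattern)):
                with h5py.File(path, 'r') as hf:
                    n = hf["X"].shape[0]
                for start in range(0, n, batch_size):
                    self.index.append((path, start))

        def __len__(self):
            return len(self.index)

        def __getitem__(self, idx):
            path, start = self.index[idx]
            with h5py.File(path, 'r') as hf:
                X = hf["X"][start:start + self.batch_size]
                y = hf["y"][start:start + self.batch_size]
            return X, y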