I'm trying to read in ~1.1M image/float pairs and save them in .h5 file chunks to be read later by a Keras generator. As you can see below, I read 25,000 images at a time and save each batch into its own .h5 file. For some reason my RAM fills up around the 22nd loop iteration.
My manual call to the garbage collector lowers memory usage a bit after each loop, but I'm apparently not freeing the right objects, so memory usage takes two steps forward and one step back.
Any help would be greatly appreciated!
from multiprocessing import Pool
from p_tqdm import p_map  # parallel map with a progress bar (assuming the p_tqdm package)

def augment_clean_data_master(master_data):
    data_list = master_data
    print("Length of Data List: ", len(data_list))
    num_processors = 8
    print("Doing Flipped Image Data.")
    p = Pool(processes=num_processors)
    flipped_output = p_map(get_data_from_line_master_flipped, data_list)
    print("Doing OG Image Data.")
    clean_output = p_map(get_data_from_line_master, data_list)
    print("Merging Data.")
    aug = clean_output + flipped_output
    return aug
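One thing worth flagging in the function above: the Pool bound to p is never closed or joined, and if p_map is the one from the p_tqdm package it manages its own pool internally, so p is never actually used. Below is a minimal sketch (not the original code) of the same mapping step with the pool explicitly shut down, assuming get_data_from_line_master and get_data_from_line_master_flipped are picklable top-level functions; the p_tqdm progress bar is dropped.

from multiprocessing import Pool

def augment_clean_data_master_pooled(master_data, num_processors=8):
    pool = Pool(processes=num_processors)
    try:
        flipped_output = pool.map(get_data_from_line_master_flipped, master_data)
        clean_output = pool.map(get_data_from_line_master, master_data)
    finally:
        # make sure the worker processes are shut down and reaped,
        # so they can't hold on to memory between chunks
        pool.close()
        pool.join()
    return clean_output + flipped_output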
import gc
import h5py
import numpy as np
import process_data_helpers

def augment_data_and_save_as_hdf5():
    # take master data, augment it, save in hdf5 format
    file_name = "./master/master.txt"
    # read in image name list
    with open(file_name) as f:
        data_list = f.readlines()
    # make sublists of [angle, name, speed]
    data_list = [x.split(',') for x in data_list]
    # loop in increments of 25000, build that chunk's data,
    # and save it to its own .h5 file
    for i in range(int(len(data_list) / 25000)):
        # these two indexes are always 25000 apart
        start_index = i * 25000
        stop_index = (i + 1) * 25000
        print("*" * 30)
        print("Start Index: ", start_index / 25000)
        # read in the lines from the text data & pull the needed images and angles
        partial_augmented_image_data_list = process_data_helpers.augment_clean_data_master(data_list[start_index:stop_index])
        # if we're at the last chunk, slice from the starting index to the end
        if i == int(len(data_list) / 25000) - 1:
            partial_augmented_image_data_list = process_data_helpers.augment_clean_data_master(data_list[start_index:])
        # make empty lists to store images temporarily
        X = []
        y = []
        # cull the error lines and pull out the images and angles
        for element in partial_augmented_image_data_list:
            try:
                if element[1] == 'None':
                    pass
                else:
                    X.append(element[0])
                    y.append(float(element[1]))
            except:
                pass
        X = np.array(X).reshape(-1, 66, 200, 3)
        y = np.array(y).reshape(len(y), 1)
        print("X Shape: ", X.shape)
        print("y Shape: ", y.shape)
        # if it's the first iteration of saving the data, the files have to be created
        # if start_index == 0:
        with h5py.File('./masterArrays/Data_Chunk_' + str(i) + '.h5', 'w') as hf:
            hf.create_dataset("X", data=X, compression="gzip", chunks=True, maxshape=(50000, 66, 200, 3))
            hf.create_dataset("y", data=y, compression="gzip", chunks=True, maxshape=(50000, 1))
        # alternative (commented out): append every chunk to one resizable Data.h5
        '''
        else:
            with h5py.File('./masterArrays/Data.h5', 'a') as hf:
                hf["X"].resize((hf["X"].shape[0] + X.shape[0]), axis=0)
                hf["X"][-X.shape[0]:] = X
                hf["y"].resize((hf["y"].shape[0] + y.shape[0]), axis=0)
                hf["y"][-X.shape[0]:] = y
        '''
        # clearing RAM manually here because the GC wasn't keeping up
        del X
        del y
        gc.collect()
    print("Augmented & Saved Arrays.")
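For context, these chunks are meant to be consumed later by a Keras generator. A minimal sketch of such a consumer is below, assuming a tensorflow.keras setup and the Data_Chunk_<i>.h5 layout written above; the class name, file pattern, and batch size are illustrative, not part of the original code.

import glob
import h5py
from tensorflow.keras.utils import Sequence

class ChunkedH5Sequence(Sequence):
    """Serves batches out of the Data_Chunk_<i>.h5 files without loading them all."""

    def __init__(self, pattern='./masterArrays/Data_Chunk_*.h5', batch_size=64):
        self.batch_size = batch_size
        # build an index of (file, offset) pairs up front; no image data is read here
        self.index = []
        for path in sorted(glob.glob(pattern)):
            with h5py.File(path, 'r') as hf:
                n = hf['X'].shape[0]
            self.index.extend((path, start) for start in range(0, n, batch_size))

    def __len__(self):
        return len(self.index)

    def __getitem__(self, i):
        path, start = self.index[i]
        # read only one batch's worth of rows from disk
        with h5py.File(path, 'r') as hf:
            X = hf['X'][start:start + self.batch_size]
            y = hf['y'][start:start + self.batch_size]
        return X, y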
In the end I wrapped the per-chunk processing in its own function, and inside the for loop I spun up a new process for each chunk and joined it before starting the next. That way, when the process exits, the OS reclaims the memory it was using. It seems to work well.
import gc
import h5py
import numpy as np
import process_data_helpers

def mrMeeseeks(i, chunk_size, data_list):
    # these two indexes are always chunk_size apart
    start_index = i * chunk_size
    stop_index = (i + 1) * chunk_size
    print("*" * 30)
    print("Start Index: ", start_index / chunk_size)
    # read in the lines from the text data & pull the needed images and angles
    partial_augmented_image_data_list = process_data_helpers.augment_clean_data_master(data_list[start_index:stop_index])
    # if we're at the last chunk, slice from the starting index to the end
    if i == int(len(data_list) / chunk_size) - 1:
        partial_augmented_image_data_list = process_data_helpers.augment_clean_data_master(data_list[start_index:])
    # preallocate arrays to hold the chunk's images and angles
    X = np.empty([chunk_size, 66, 200, 3])
    y = np.empty([chunk_size, 1])
    # cull the error lines and pull out the images and angles
    for count, element in enumerate(partial_augmented_image_data_list):
        try:
            if element[1] == 'None':
                print("Bad line encountered; keep this check.")
                pass
            else:
                X[count] = element[0]
                y[count] = float(element[1])
        except:
            pass
    X = X.reshape(-1, 66, 200, 3)
    y = y.reshape(len(y), 1)
    # if it's the first iteration of saving the data, the files have to be created
    # if start_index == 0:
    with h5py.File('./masterArrays/Data_Chunk_' + str(i) + '.h5', 'w') as hf:
        hf.create_dataset("X", data=X, compression="gzip", chunks=True, maxshape=(50000, 66, 200, 3))
        hf.create_dataset("y", data=y, compression="gzip", chunks=True, maxshape=(50000, 1))
    # alternative (commented out): append every chunk to one resizable Data.h5
    '''
    else:
        with h5py.File('./masterArrays/Data.h5', 'a') as hf:
            hf["X"].resize((hf["X"].shape[0] + X.shape[0]), axis=0)
            hf["X"][-X.shape[0]:] = X
            hf["y"].resize((hf["y"].shape[0] + y.shape[0]), axis=0)
            hf["y"][-X.shape[0]:] = y
    '''
    print("X Shape: ", X.shape)
    print("y Shape: ", y.shape)
    print(X[0])
    print(y[0])
    print("((((((((((()))))))))))")
    print(X[1])
    print(y[1])
    # clearing RAM manually here because the GC wasn't keeping up
    del X
    del y
    del partial_augmented_image_data_list
    gc.collect()
import multiprocessing
def augment_data_and_save_as_hdf5():
    # take master data, augment it, save in hdf5 format
    file_name = "./master/master.txt"
    # read in image name list
    with open(file_name) as f:
        data_list = f.readlines()
    # make sublists of [angle, name, speed]
    data_list = [x.split(',') for x in data_list]
    chunk_size = 25000
    # loop in increments of chunk_size, build each chunk's data in a short-lived
    # process, and save it to its own .h5 file
    for i in range(int(len(data_list) / chunk_size)):
        # start a fresh process for this chunk and wait for it to finish
        p = multiprocessing.Process(target=mrMeeseeks, args=(i, chunk_size, data_list,))
        p.start()
        p.join()
    # TODO: join every 6 iterations so we only use 6 cores, and each batch of
    # processes finishes before the next one starts
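The TODO about joining every 6 iterations could look roughly like the sketch below: launch the chunk workers in batches of up to 6 processes and join the whole batch before starting the next. The function name and the batch size of 6 are illustrative; it reuses mrMeeseeks from above.

import multiprocessing

def augment_data_and_save_as_hdf5_batched(data_list, chunk_size=25000, max_procs=6):
    num_chunks = int(len(data_list) / chunk_size)
    for batch_start in range(0, num_chunks, max_procs):
        # launch up to max_procs chunk workers at once...
        procs = []
        for i in range(batch_start, min(batch_start + max_procs, num_chunks)):
            p = multiprocessing.Process(target=mrMeeseeks, args=(i, chunk_size, data_list))
            p.start()
            procs.append(p)
        # ...then wait for the whole batch to finish before starting the next
        for p in procs:
            p.join()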