I successfully retrained Mask R-CNN and Faster R-CNN models on my own custom dataset, and I want to run inference on multiple images. I modified the single-image inference function from the demo with the code below and compared the results from the retrained Faster R-CNN ResNet-101, Mask R-CNN ResNet-101, Faster R-CNN Inception-ResNet, and Mask R-CNN Inception-ResNet models. All images have a resolution of 1024x768. Please help me confirm whether this is the expected behavior or not. Thanks.
The following is the function that I modified from the demo:
def run_inference_for_multiple_images(images, graph):
    with graph.as_default():
        with tf.Session() as sess:
            output_dict_array = []
            dict_time = []
            for image in images:
                # Get handles to input and output tensors
                ops = tf.get_default_graph().get_operations()
                all_tensor_names = {output.name for op in ops for output in op.outputs}
                tensor_dict = {}
                for key in ['num_detections', 'detection_boxes', 'detection_scores',
                            'detection_classes', 'detection_masks']:
                    tensor_name = key + ':0'
                    if tensor_name in all_tensor_names:
                        tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(
                            tensor_name)
                if 'detection_masks' in tensor_dict:
                    detection_boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])
                    detection_masks = tf.squeeze(tensor_dict['detection_masks'], [0])
                    # Reframing is required to translate the masks from box coordinates
                    # to image coordinates and fit the image size.
                    real_num_detection = tf.cast(tensor_dict['num_detections'][0], tf.int32)
                    detection_boxes = tf.slice(detection_boxes, [0, 0], [real_num_detection, -1])
                    detection_masks = tf.slice(detection_masks, [0, 0, 0], [real_num_detection, -1, -1])
                    detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
                        detection_masks, detection_boxes, image.shape[0], image.shape[1])
                    detection_masks_reframed = tf.cast(
                        tf.greater(detection_masks_reframed, 0.5), tf.uint8)
                    # Follow the convention by adding back the batch dimension
                    tensor_dict['detection_masks'] = tf.expand_dims(
                        detection_masks_reframed, 0)
                image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')
                # Run inference
                start = time.time()
                output_dict = sess.run(tensor_dict,
                                       feed_dict={image_tensor: np.expand_dims(image, 0)})
                end = time.time()
                print('inference time : {}'.format(end - start))
                # All outputs are float32 numpy arrays, so convert types as appropriate
                output_dict['num_detections'] = int(output_dict['num_detections'][0])
                output_dict['detection_classes'] = output_dict[
                    'detection_classes'][0].astype(np.uint8)
                output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
                output_dict['detection_scores'] = output_dict['detection_scores'][0]
                if 'detection_masks' in output_dict:
                    output_dict['detection_masks'] = output_dict['detection_masks'][0]
                output_dict_array.append(output_dict)
                dict_time.append(end - start)
    return output_dict_array, dict_time
The following is the code that calls the function:
batch_size = 10
chunks = len(diff_files) // batch_size + 1
ave_time = []
for i in range(chunks):
    batch = diff_files[i * batch_size:(i + 1) * batch_size]
    images = []
    files = []
    proc_time = []
    for file in batch:
        image_path = os.path.join(subdir_path, file)
        print('Reading file {}'.format(image_path))
        image = cv2.imread(image_path)
        image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images.append(image_np)
        files.append(file)
    output_dicts, out_time = run_inference_for_multiple_images(images, detection_graph)
    print('length of output_dicts is : {}'.format(len(output_dicts)))
    if len(output_dicts) == 0:
        break
    for idx in range(len(output_dicts)):
        output_dict = output_dicts[idx]
        image_np = images[idx]
        file = files[idx]
        # Visualization of the results of a detection
        start = time.time()
        vis_util.visualize_boxes_and_labels_on_image_array(
            image_np,
            output_dict['detection_boxes'],
            output_dict['detection_classes'],
            output_dict['detection_scores'],
            category_index,
            instance_masks=output_dict.get('detection_masks'),
            use_normalized_coordinates=True, min_score_thresh=.5,
            line_thickness=4, skip_scores=False,
            skip_labels=False,
            skip_boxes=False)
        height, width, chan = image_np.shape
        # Save the processed image
        image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
        cv2.imwrite(os.path.join(result_img_path, file), image_np)
        print('Saving {}, time : {}'.format(file, time.time() - start))
        proc_time.append(time.time() - start + out_time[idx])
    if len(proc_time) != 0:
        mean_batch_time = statistics.mean(proc_time)
        print('mean processing time: {}'.format(mean_batch_time))
        ave_time.append(mean_batch_time)
    proc_time.clear()
    output_dicts.clear()
I have found the issue, and the following function seems to work. The average inference time dropped from around 3-4 seconds to 0.3-0.4 seconds per image (using the ResNet-50 feature extractor). However, be careful when using this function: it assumes all images in a batch have the same size, because the mask-reframing ops are built once from the shape of the first image. An error would therefore be thrown when one of the images in the batch has a different size, though I haven't confirmed this myself.
def run_inference_for_multiple_images(images, graph):
    with graph.as_default():
        with tf.Session() as sess:
            output_dict_array = []
            dict_time = []
            # Get handles to input and output tensors (built once, outside the image loop)
            ops = tf.get_default_graph().get_operations()
            all_tensor_names = {output.name for op in ops for output in op.outputs}
            tensor_dict = {}
            for key in ['num_detections', 'detection_boxes', 'detection_scores',
                        'detection_classes', 'detection_masks']:
                tensor_name = key + ':0'
                if tensor_name in all_tensor_names:
                    tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(tensor_name)
            if 'detection_masks' in tensor_dict:
                detection_boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])
                detection_masks = tf.squeeze(tensor_dict['detection_masks'], [0])
                # Reframing is required to translate the masks from box coordinates
                # to image coordinates and fit the image size.
                real_num_detection = tf.cast(tensor_dict['num_detections'][0], tf.int32)
                detection_boxes = tf.slice(detection_boxes, [0, 0], [real_num_detection, -1])
                detection_masks = tf.slice(detection_masks, [0, 0, 0], [real_num_detection, -1, -1])
                detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
                    detection_masks, detection_boxes, images[0].shape[0], images[0].shape[1])
                detection_masks_reframed = tf.cast(tf.greater(detection_masks_reframed, 0.5), tf.uint8)
                # Follow the convention by adding back the batch dimension
                tensor_dict['detection_masks'] = tf.expand_dims(detection_masks_reframed, 0)
            image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')
            for image in images:
                # Run inference
                start = time.time()
                output_dict = sess.run(tensor_dict, feed_dict={image_tensor: np.expand_dims(image, 0)})
                end = time.time()
                print('inference time : {}'.format(end - start))
                # All outputs are float32 numpy arrays, so convert types as appropriate
                output_dict['num_detections'] = int(output_dict['num_detections'][0])
                output_dict['detection_classes'] = output_dict['detection_classes'][0].astype(np.uint8)
                output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
                output_dict['detection_scores'] = output_dict['detection_scores'][0]
                if 'detection_masks' in output_dict:
                    output_dict['detection_masks'] = output_dict['detection_masks'][0]
                output_dict_array.append(output_dict)
                dict_time.append(end - start)
    return output_dict_array, dict_time
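If you want to guard against the same-size assumption mentioned above, one option is to group the loaded images by their (height, width) before calling the function, so each call only ever receives a uniform batch. The following is just a minimal sketch under that idea; the helper name group_images_by_size is my own and not part of the Object Detection API, and it assumes the images are already NumPy arrays as produced by the cv2.imread/cvtColor loop above.

    from collections import defaultdict

    def group_images_by_size(images, files):
        """Group images (and their filenames) by (height, width) so every batch
        passed to run_inference_for_multiple_images has a uniform size."""
        groups = defaultdict(lambda: ([], []))
        for image, file in zip(images, files):
            key = image.shape[:2]  # (height, width)
            groups[key][0].append(image)
            groups[key][1].append(file)
        return groups

    # Usage sketch: run inference once per size group instead of once per raw batch.
    # for (h, w), (grp_images, grp_files) in group_images_by_size(images, files).items():
    #     output_dicts, out_time = run_inference_for_multiple_images(grp_images, detection_graph)

Grouping (rather than resizing everything to one size) keeps the saved visualizations at the original resolution of each file; resizing beforehand would also avoid the error but changes the images you write out.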