python | tensorflow | object-detection | object-detection-api

Multiple image inference for mask-rcnn runs ~10x slower than faster-rcnn for the same image size


I successfully retrained mask-rcnn and faster-rcnn models on my own custom dataset and want to run inference on multiple images. I modified the single-image inference function from the demo as shown in the code below. With the retrained faster-rcnn resnet101 I got the following result: [inference timing screenshot]. With the retrained mask-rcnn resnet101: [inference timing screenshot]. With faster-rcnn inception-resnet: [inference timing screenshot]. And with mask-rcnn inception-resnet: [inference timing screenshot]. All images have a resolution of 1024x768. Is this the expected behavior, or am I doing something wrong? Thanks.

The following function is the one that I modified from the demo

def run_inference_for_multiple_images(images, graph):
  with graph.as_default():
    with tf.Session() as sess:
        output_dict_array = []
        dict_time = []
        for image in images:
            # Get handles to input and output tensors
            ops = tf.get_default_graph().get_operations()
            all_tensor_names = {output.name for op in ops for output in op.outputs}
            tensor_dict = {}
            for key in ['num_detections', 'detection_boxes', 'detection_scores',
                'detection_classes', 'detection_masks']:
                tensor_name = key + ':0'
                if tensor_name in all_tensor_names:
                    tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(
                        tensor_name)
            if 'detection_masks' in tensor_dict:
                detection_boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])
                detection_masks = tf.squeeze(tensor_dict['detection_masks'], [0])
                # Reframe is required to translate mask from box coordinates to image coordinates and fit the image size.
                real_num_detection = tf.cast(tensor_dict['num_detections'][0], tf.int32)
                detection_boxes = tf.slice(detection_boxes, [0, 0], [real_num_detection, -1])
                detection_masks = tf.slice(detection_masks, [0, 0, 0], [real_num_detection, -1, -1])
                detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
                    detection_masks, detection_boxes, image.shape[0], image.shape[1])
                detection_masks_reframed = tf.cast(
                    tf.greater(detection_masks_reframed, 0.5), tf.uint8)
                # Follow the convention by adding back the batch dimension
                tensor_dict['detection_masks'] = tf.expand_dims(
                    detection_masks_reframed, 0)
            image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')

            # Run inference
            start = time.time()
            output_dict = sess.run(tensor_dict,
                                   feed_dict={image_tensor: np.expand_dims(image, 0)})
            end = time.time()
            print('inference time : {}'.format(end-start))

            # all outputs are float32 numpy arrays, so convert types as appropriate
            output_dict['num_detections'] = int(output_dict['num_detections'][0])
            output_dict['detection_classes'] = output_dict[
                'detection_classes'][0].astype(np.uint8)
            output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
            output_dict['detection_scores'] = output_dict['detection_scores'][0]
            if 'detection_masks' in output_dict:
                output_dict['detection_masks'] = output_dict['detection_masks'][0]

            output_dict_array.append(output_dict)
            dict_time.append(end-start)
  return output_dict_array, dict_time

The following is the code that calls the function on batches of images

batch_size = 10
chunks = len(diff_files) // batch_size + 1
ave_time = []
for i in range(chunks):
    batch = diff_files[i*batch_size:(i+1)*batch_size]
    images = []
    files = []
    proc_time = []
    for file in batch:
        image_path = os.path.join(subdir_path, file)
        print('Reading file {}'.format(image_path))
        image = cv2.imread(image_path)
        image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images.append(image_np)
        files.append(file)

    output_dicts, out_time = run_inference_for_multiple_images(images, detection_graph)
    print('length of output_dicts is : {}'.format(len(output_dicts)))
    if len(output_dicts) == 0:
        break

    for idx in range(len(output_dicts)):
        output_dict = output_dicts[idx]
        image_np = images[idx]
        file = files[idx]
        # Visualization of the results of a detection.
        start = time.time()
        vis_util.visualize_boxes_and_labels_on_image_array(
          image_np,
          output_dict['detection_boxes'],
          output_dict['detection_classes'],
          output_dict['detection_scores'],
          category_index,
          instance_masks=output_dict.get('detection_masks'),
          use_normalized_coordinates=True, min_score_thresh=.5,
          line_thickness=4, skip_scores=False,
          skip_labels=False,
          skip_boxes=False)
        height, width, chan = image_np.shape

        # Saving the processed image
        image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
        cv2.imwrite(os.path.join(result_img_path, file), image_np)
        print('Saving {}, time : {}'.format(file, time.time()-start))
        proc_time.append(time.time()-start + out_time[idx])
        # count += 1    

    if len(proc_time) != 0:
        mean_batch_time = statistics.mean(proc_time)
        print('mean processing time: {}'.format(mean_batch_time))
        ave_time.append(mean_batch_time)
    proc_time.clear()
    output_dicts.clear()

Solution

  • I have found the issue and the following function seems to work. The average inference time dropped from around 3-4 seconds to 0.3-0.4 seconds per image (using a resnet50 feature extractor). However, be careful when using this function: it assumes that all images in the batch have the same size, so an error would likely be thrown if one of the images has a different size (though I haven't confirmed this myself). A simple size-check sketch follows the code below.

    def run_inference_for_multiple_images(images, graph):
        with graph.as_default():
            with tf.Session() as sess:
                output_dict_array = []
                dict_time = []
                # Get handles to input and output tensors
                ops = tf.get_default_graph().get_operations()
                all_tensor_names = {output.name for op in ops for output in op.outputs}
                tensor_dict = {}
                for key in ['num_detections', 'detection_boxes', 'detection_scores',
                    'detection_classes', 'detection_masks']:
                    tensor_name = key + ':0'
                    if tensor_name in all_tensor_names:
                        tensor_dict[key] = tf.get_default_graph().get_tensor_by_name(tensor_name)
                if 'detection_masks' in tensor_dict:
                    detection_boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])
                    detection_masks = tf.squeeze(tensor_dict['detection_masks'], [0])
                    # Reframe is required to translate mask from box coordinates to image coordinates and fit the image size.
                    real_num_detection = tf.cast(tensor_dict['num_detections'][0], tf.int32)
                    detection_boxes = tf.slice(detection_boxes, [0, 0], [real_num_detection, -1])
                    detection_masks = tf.slice(detection_masks, [0, 0, 0], [real_num_detection, -1, -1])
                    detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
                        detection_masks, detection_boxes, images[0].shape[0], images[0].shape[1])
                    detection_masks_reframed = tf.cast(tf.greater(detection_masks_reframed, 0.5), tf.uint8)
                    # Follow the convention by adding back the batch dimension
                    tensor_dict['detection_masks'] = tf.expand_dims(detection_masks_reframed, 0)
                image_tensor = tf.get_default_graph().get_tensor_by_name('image_tensor:0')
                for image in images:
                    # Run inference
                    start = time.time()
                    output_dict = sess.run(tensor_dict, feed_dict={image_tensor: np.expand_dims(image, 0)})
                    end = time.time()
                    print('inference time : {}'.format(end - start))
    
                    # all outputs are float32 numpy arrays, so convert types as appropriate
                    output_dict['num_detections'] = int(output_dict['num_detections'][0])
                    output_dict['detection_classes'] = output_dict['detection_classes'][0].astype(np.uint8)
                    output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
                    output_dict['detection_scores'] = output_dict['detection_scores'][0]
                    if 'detection_masks' in output_dict:
                        output_dict['detection_masks'] = output_dict['detection_masks'][0]
    
                    output_dict_array.append(output_dict)
                    dict_time.append(end - start)
        return output_dict_array, dict_time
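
    Since the mask-reframing ops above are built once from images[0].shape and then reused for every image in the batch, a simple guard before calling the function can catch mixed-size batches early. This is only a minimal sketch under that assumption; the helper name batch_has_uniform_shape is hypothetical and not part of the Object Detection API, and it reuses the images and detection_graph variables from the driver code earlier in the post.

        def batch_has_uniform_shape(images):
            # Collect the distinct (height, width, channels) tuples in the batch;
            # a uniform batch yields at most one shape.
            shapes = {img.shape for img in images}
            return len(shapes) <= 1

        # Usage with the variables from the driver code above:
        if batch_has_uniform_shape(images):
            output_dicts, out_time = run_inference_for_multiple_images(images, detection_graph)
        else:
            raise ValueError('All images in a batch must have the same size, got: {}'.format(
                {img.shape for img in images}))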