python, tensorflow, object-detection, object-detection-api, faster-rcnn

Object Detection on Video has different predictions than Object Detection on images


I wanted to test the model I created. While testing, I noticed that the predictions of the first and second code snippets differ. Both snippets use the same frozen inference graph and run detection on the same frame. How can I change the second code to get the same results as the first?

import cv2
import numpy as np
import tensorflow as tf
from object_detection.utils import visualization_utils as vis_util

# detection_graph and category_index are assumed to be loaded beforehand.
cap = cv2.VideoCapture("InputVideo.mp4")
frame_array = []
with detection_graph.as_default():
  with tf.Session(graph=detection_graph) as sess:
    while cap.isOpened():
      frameId = int(round(cap.get(cv2.CAP_PROP_POS_FRAMES)))
      ret, image_np = cap.read()
      if ret:
          if frameId % 1 == 0:  # process every frame
              # Expand dimensions since the model expects images of shape [1, None, None, 3]
              image_np_expanded = np.expand_dims(image_np, axis=0)
              image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
              boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
              scores = detection_graph.get_tensor_by_name('detection_scores:0')
              classes = detection_graph.get_tensor_by_name('detection_classes:0')
              num_detections = detection_graph.get_tensor_by_name('num_detections:0')
              # Actual detection.
              (boxes, scores, classes, num_detections) = sess.run(
                  [boxes, scores, classes, num_detections],
                  feed_dict={image_tensor: image_np_expanded})
              # Draw boxes and labels on the frame in place.
              vis_util.visualize_boxes_and_labels_on_image_array(
                  image_np,
                  np.squeeze(boxes),
                  np.squeeze(classes).astype(np.int32),
                  np.squeeze(scores),
                  category_index,
                  use_normalized_coordinates=True,
                  line_thickness=8,
                  min_score_thresh=.35)
              frame_array.append(image_np)
      else:
          break
cap.release()

Second Code

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from PIL import Image
from object_detection.utils import visualization_utils as vis_util

def load_image_into_numpy_array(image):
  # Convert a PIL image to a uint8 numpy array of shape (height, width, 3).
  (im_width, im_height) = image.size
  return np.array(image.getdata()).reshape((im_height, im_width, 3)).astype(np.uint8)

%matplotlib inline
with detection_graph.as_default():
  with tf.Session(graph=detection_graph) as sess:
    for image_path in TEST_IMAGE_PATHS:
      image = Image.open(image_path)
      image_np = load_image_into_numpy_array(image)
      # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
      image_np_expanded = np.expand_dims(image_np, axis=0)
      image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
      boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
      scores = detection_graph.get_tensor_by_name('detection_scores:0')
      classes = detection_graph.get_tensor_by_name('detection_classes:0')
      num_detections = detection_graph.get_tensor_by_name('num_detections:0')
      # Actual detection.
      (boxes, scores, classes, num_detections) = sess.run(
          [boxes, scores, classes, num_detections],
          feed_dict={image_tensor: image_np_expanded})
      # Visualization of the results of a detection.
      vis_util.visualize_boxes_and_labels_on_image_array(
          image_np,
          np.squeeze(boxes),
          np.squeeze(classes).astype(np.int32),
          np.squeeze(scores),
          category_index,
          use_normalized_coordinates=True,
          line_thickness=8,
          min_score_thresh=.35)
      plt.figure(figsize=IMAGE_SIZE)
      plt.imshow(image_np)  # display the annotated image
      plt.show()

Solution

  • Did you check the input images? Since the model is the same in both cases, the only possible reason is that the inputs differ. Maybe one side gets a raw image while the other gets a frame decoded from the video (for example, the H.264 codec is lossy, so decoded frames will not match the source pixels exactly). Maybe the channel order differs: cv2.VideoCapture returns frames in BGR order by default, while the original image is probably RGB. You'll probably need to do the following (see the sketch after this answer):

    image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
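
For concreteness, here is a minimal sketch of where that conversion would sit in the video loop from the question, plus a quick check of the compression theory. Only the cv2.cvtColor call comes from the answer itself; the surrounding loop mirrors the question's code, and "frame0.png" is a hypothetical still image exported from the first video frame.

    import cv2
    import numpy as np

    # Sketch: convert each decoded frame from OpenCV's default BGR order to
    # RGB before it is fed to the detection graph, matching the image pipeline.
    cap = cv2.VideoCapture("InputVideo.mp4")
    while cap.isOpened():
        ret, frame_bgr = cap.read()
        if not ret:
            break
        image_np = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        # ... feed np.expand_dims(image_np, axis=0) to sess.run() as in the question ...
    cap.release()

    # Optional check for lossy compression: compare the first decoded frame
    # with the standalone still it came from ("frame0.png" is hypothetical).
    cap = cv2.VideoCapture("InputVideo.mp4")
    ret, frame_bgr = cap.read()
    cap.release()
    if ret:
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        still = cv2.cvtColor(cv2.imread("frame0.png"), cv2.COLOR_BGR2RGB)
        if still.shape == frame_rgb.shape:
            diff = np.abs(frame_rgb.astype(np.int16) - still.astype(np.int16))
            print("mean abs pixel difference:", diff.mean())

Even after the channel order is fixed, a nonzero pixel difference from the codec means the two pipelines may still produce slightly different scores; identical predictions are only guaranteed when the input arrays are identical.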