I am trying to read a video file (using OpenCV), loop over all frames using TensorFlow's object-detection API to run the predictions and draw the bounding boxes, and write the predicted frames (with boxes) to a new video file. I used object_detection_tutorial.ipynb with some modifications to capture the video frames and process them with faster-rcnn-inception-resnet-v2, loaded from a frozen graph (after training).
I am using a Tesla P100 GPU on a cloud machine with Windows 10 and 56 GB of RAM. I am also using tensorflow-gpu.
When I run the code, it takes 0.5 seconds per frame. Is this a normal speed for a Tesla P100, or am I doing something wrong in the code that makes it slower?
This code is just a test, as later I will have to use it in a real-time video prediction task. If 0.5 seconds per frame is the expected speed with the TensorFlow API, I don't think I will be able to use it for my task :(
After running it, I get the following running times (in seconds):
processing frame number 1.0
time to capture video frame 0.0
time to predict 0.49225664138793945
time to generate boxes in a frame 0.14833950996398926
time to write a frame in video file 0.04687023162841797
total time in the loop 0.6874663829803467
As you can see, the parts of the code that run on the CPU (OpenCV) are fast. But the part that uses the GPU takes almost 0.5 seconds for the prediction step alone (the sess.run call).
Any advice? Thank you in advance. My code follows below.
# Imports are kept in their original order; each statement now sits on its own
# line (several were collapsed onto a single line, which is a syntax error).
from distutils.version import StrictVersion
import numpy as np
import os
import six.moves.urllib as urllib
import sys
import tarfile
import tensorflow as tf
import zipfile
import time
from collections import defaultdict
from io import StringIO
#from matplotlib import pyplot as plt
from PIL import Image
import cv2
from imutils import paths
import re

# This is needed since the code is stored in the object_detection folder:
# the parent directory must be on sys.path before the imports below run.
sys.path.append("..")
from object_detection.utils import ops as utils_ops

# Fail early, with a clear message, on TensorFlow versions older than 1.9.0.
if StrictVersion(tf.__version__) < StrictVersion('1.9.0'):
    raise ImportError('Please upgrade your TensorFlow installation to v1.9.* or later!')

from utils import label_map_util
from utils import visualization_utils as vis_util
#Detection using tensorflow inside write_video function
def write_video():
    """Annotate every frame of a video with detections from a frozen graph.

    Reads frames from 'pneu_trim2.mp4', runs each frame through the frozen
    inference graph stored under ``MODEL_NAME``, draws the detected boxes and
    labels onto the frame, and appends the frame to 'output/teste_v2.avi'.
    The output uses the input's resolution and its frame rate rounded to two
    decimals. Timing information for each step is printed for every frame.

    Takes no arguments and returns None; all paths are hard-coded below.
    The capture and writer handles are released even if an error occurs.
    """
    filename = 'output/teste_v2.avi'
    codec = cv2.VideoWriter_fourcc('W', 'M', 'V', '2')
    cap = cv2.VideoCapture('pneu_trim2.mp4')
    # Named constants instead of the raw property ids 5, 3 and 4.
    framerate = round(cap.get(cv2.CAP_PROP_FPS), 2)
    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    resolution = (w, h)
    VideoFileOutput = cv2.VideoWriter(filename, codec, framerate, resolution)

    try:
        ################################
        # # Model preparation
        # ## Variables
        #
        # Any model exported using the `export_inference_graph.py` tool can be
        # loaded here simply by changing `PATH_TO_FROZEN_GRAPH` to point to a
        # new .pb file.
        MODEL_NAME = 'training/pneu_incep_step_24887'
        print("loading model from " + MODEL_NAME)

        # Path to frozen detection graph. This is the actual model that is
        # used for the object detection.
        PATH_TO_FROZEN_GRAPH = MODEL_NAME + '/frozen_inference_graph.pb'

        # List of the strings that is used to add correct label for each box.
        PATH_TO_LABELS = os.path.join('data', 'object-detection.pbtxt')
        NUM_CLASSES = 5

        # ## Load a (frozen) Tensorflow model into memory.
        time_graph = time.time()
        print('loading graphs')
        detection_graph = tf.Graph()
        with detection_graph.as_default():
            od_graph_def = tf.GraphDef()
            with tf.gfile.GFile(PATH_TO_FROZEN_GRAPH, 'rb') as fid:
                serialized_graph = fid.read()
                od_graph_def.ParseFromString(serialized_graph)
                tf.import_graph_def(od_graph_def, name='')
        print("tempo build graph = " + str(time.time() - time_graph))

        # ## Loading label map
        label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
        categories = label_map_util.convert_label_map_to_categories(
            label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
        category_index = label_map_util.create_category_index(categories)
        ################################

        with tf.Session(graph=detection_graph) as sess:
            with detection_graph.as_default():
                # The graph tensors never change between frames, so look them
                # up once instead of on every iteration. They get their own
                # names so the per-frame numpy results do not overwrite them.
                image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
                # Each box represents a part of the image where a particular
                # object was detected.
                boxes_tensor = detection_graph.get_tensor_by_name('detection_boxes:0')
                # Each score represents the level of confidence for each of
                # the objects. The score is shown on the result image,
                # together with the class label.
                scores_tensor = detection_graph.get_tensor_by_name('detection_scores:0')
                classes_tensor = detection_graph.get_tensor_by_name('detection_classes:0')
                num_detections_tensor = detection_graph.get_tensor_by_name('num_detections:0')
                fetches = [boxes_tensor, scores_tensor, classes_tensor,
                           num_detections_tensor]

                while cap.isOpened():
                    time_loop = time.time()
                    print('processing frame number: '
                          + str(cap.get(cv2.CAP_PROP_POS_FRAMES)))

                    time_captureframe = time.time()
                    ret, image_np = cap.read()
                    print("time to capture video frame = "
                          + str(time.time() - time_captureframe))
                    if not ret:
                        # No more frames (or a read error): stop processing.
                        break

                    # NOTE(review): cv2 returns frames in BGR channel order;
                    # if the model was trained on RGB images, confirm whether
                    # a conversion is needed before inference.
                    # Expand dimensions since the model expects images to
                    # have shape: [1, None, None, 3]
                    image_np_expanded = np.expand_dims(image_np, axis=0)

                    # Actual detection.
                    # NOTE(review): the first sess.run call usually includes
                    # one-time GPU/graph initialisation, so the timing of the
                    # first frame may not be representative - confirm by
                    # looking at later frames.
                    time_prediction = time.time()
                    boxes, scores, classes, _ = sess.run(
                        fetches,
                        feed_dict={image_tensor: image_np_expanded})
                    print("time to predict = " + str(time.time() - time_prediction))

                    # Visualization of the results of a detection; the boxes
                    # are drawn directly onto image_np.
                    time_visualizeboxes = time.time()
                    vis_util.visualize_boxes_and_labels_on_image_array(
                        image_np,
                        np.squeeze(boxes),
                        np.squeeze(classes).astype(np.int32),
                        np.squeeze(scores),
                        category_index,
                        use_normalized_coordinates=True,
                        line_thickness=8)
                    print("time to generate boxes in a frame = "
                          + str(time.time() - time_visualizeboxes))

                    time_writeframe = time.time()
                    VideoFileOutput.write(image_np)
                    print("time to write a frame in video file = "
                          + str(time.time() - time_writeframe))
                    print("total time in the loop = " + str(time.time() - time_loop))
    finally:
        # Always release the handles so the output file is finalised even
        # when model loading or inference raises.
        cap.release()
        VideoFileOutput.release()
    print('done')
Actually, the problem is the model you are using: https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md Basically, the Faster-RCNN-Inception-ResNet-v2 model takes more time per image. You can refer to the link to see the speed listed for each model.