Search code examples
tensorflow, gpu, yolo

Low FPS when using deepsort and yolov3 with GPU


I'm working on a vehicle tracking and counting problem using YOLOv3 and Deep SORT. After consulting source code found online and implementing it, I only achieve 2-3 FPS when using the full YOLOv3 weights + cfg. With the YOLOv3-tiny weights + cfg I get about 10 FPS, but then I have a big problem with ID switches. I have CUDA 11.6 and cuDNN installed for GPU usage (I use a GTX 1650 GPU and a 10300H CPU). I don't know whether the problem is that the program is not actually running on the GPU, or whether the YOLOv3 weights + cfg are simply slow.

Here is the code in main.py

import cv2
import numpy as np
import time
from deep_sort_yolov3.deep_sort import preprocessing
from deep_sort_yolov3.deep_sort import nn_matching
from deep_sort_yolov3.deep_sort.detection import Detection
from deep_sort_yolov3.deep_sort.tracker import Tracker
from deep_sort_yolov3.deep_sort.detection import Detection as ddet
from deep_sort_yolov3.tools import generate_detections as gdet
from collections import deque
import tensorflow as tf
# Vehicle tracking and counting: YOLOv3 (OpenCV DNN) produces detections,
# Deep SORT associates them across frames, and the annotated frames are shown
# in a window until the video ends or ESC is pressed.
YOLO_SIZE = 320

net = cv2.dnn.readNet("yolov3_320.weights", "yolov3.cfg")
# Ask OpenCV to run the detector on the NVIDIA GPU. This only takes effect when
# the installed OpenCV was built with CUDA support; otherwise OpenCV prints a
# warning and falls back to the CPU.
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)

# Raw string: the backslashes are path separators, not escape sequences.
model_filename = r"deep_sort_yolov3\model_data\mars-small128.pb"
# Names of the YOLO output layers; this accessor does not depend on whether
# getUnconnectedOutLayers() returns scalars or 1-element arrays.
output_layers = net.getUnconnectedOutLayersNames()
max_cosine_distance = 0.5
nms_max_overlap = 0.3
classes = []
nn_budget = None
# Ids of every confirmed track seen so far (a set, so duplicates cost nothing).
counter = set()
# fps = 0.0
# Recent centre points per track id, created on demand so that any track id
# is valid (a fixed-size list would overflow for large ids).
pts = {}
COLORS = np.random.randint(0, 255, size=(200, 3),
    dtype="uint8")


encoder = gdet.create_box_encoder(model_filename, batch_size = 1)  # encodes the image data of each box
metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)  # cosine distance is used to compare boxes
tracker = Tracker(metric)  # tracks objects based on the resulting distances

# Loading camera
#cap = cv2.VideoCapture(0)
cap = cv2.VideoCapture('traffic.mp4')
font = cv2.FONT_HERSHEY_PLAIN
starting_time = time.time()
frame_id = 0

# Create the display window once instead of on every frame.
cv2.namedWindow("YOLO3_Deep_SORT", 0)

while True:
    ok, frame = cap.read()
    if not ok:
        # End of the video (or a failed read): stop cleanly instead of
        # crashing on `frame.shape` with frame == None.
        break
    frame_id += 1
    height, width = frame.shape[:2]

    # Detecting objects
    blob = cv2.dnn.blobFromImage(frame, 1/255, (YOLO_SIZE, YOLO_SIZE), (0, 0, 0), True, crop=False)
    net.setInput(blob)
    outs = net.forward(output_layers)

    confidences = []
    bboxes = []
    for out in outs:
        for detection in out:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.5:
                # Object detected: the network outputs are scaled by the
                # frame size to obtain pixel coordinates.
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)
                # Rectangle coordinates (top-left corner + size)
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                bboxes.append([x, y, w, h])
                confidences.append(float(confidence))

    # Feature extraction
    features = encoder(frame, bboxes)
    # Pass the real detector confidence so that non-maxima suppression can
    # rank overlapping boxes by score instead of treating them all as 1.0.
    detections = [
        Detection(bbox, confidence, feature)
        for bbox, confidence, feature in zip(bboxes, confidences, features)
    ]

    # Non-maxima suppression
    boxes = np.array([d.tlwh for d in detections])
    scores = np.array([d.confidence for d in detections])
    indices = preprocessing.non_max_suppression(boxes, nms_max_overlap, scores)
    detections = [detections[k] for k in indices]

    # Call the tracker
    tracker.predict()
    tracker.update(detections)

    current_count = 0  # confirmed, recently updated tracks drawn in this frame
    for track in tracker.tracks:
        if not track.is_confirmed() or track.time_since_update > 1:
            continue
        track_id = int(track.track_id)
        counter.add(track_id)
        current_count += 1

        bbox = track.to_tlbr()
        color = [int(c) for c in COLORS[track_id % len(COLORS)]]

        cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), color, 3)
        cv2.putText(frame, str(track.track_id), (int(bbox[0]), int(bbox[1] - 50)), 0, 5e-3 * 150, color, 2)

        # Centre point of the bounding box
        center = (int((bbox[0] + bbox[2]) / 2), int((bbox[1] + bbox[3]) / 2))
        trail = pts.setdefault(track_id, deque(maxlen=30))
        trail.append(center)
        cv2.circle(frame, center, 1, color, 5)

        # Draw the motion path through the stored centre points
        for j in range(1, len(trail)):
            thickness = int(np.sqrt(64 / float(j + 1)) * 2)
            cv2.line(frame, trail[j - 1], trail[j], color, thickness)

    count = len(counter)
    cv2.putText(frame, "Total Object Counter: " + str(count), (int(20), int(120)), 0, 5e-3 * 200, (0, 255, 0), 2)
    cv2.putText(frame, "Current Object Counter: " + str(current_count), (int(20), int(80)), 0, 5e-3 * 200, (0, 255, 0), 2)
    # Average FPS since start-up (includes model warm-up time).
    elapsed_time = time.time() - starting_time
    fps = frame_id / elapsed_time
    cv2.putText(frame, "FPS: %f" % (fps), (int(20), int(40)), 0, 5e-3 * 200, (0, 255, 0), 3)

    cv2.imshow('YOLO3_Deep_SORT', frame)
    key = cv2.waitKey(1)
    if key == 27:  # ESC
        break
cap.release()
cv2.destroyAllWindows()

Here is the terminal window after running

C:\Users\Duong\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\utils\linear_assignment_.py:22: FutureWarning: The linear_assignment_ module is deprecated in 0.21 and will be removed from 0.23. Use scipy.optimize.linear_sum_assignment instead.
  FutureWarning)
2023-03-14 17:37:56.079094: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
2023-03-14 17:37:57.745508: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-03-14 17:37:57.746566: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library nvcuda.dll
2023-03-14 17:37:58.755945: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1650 computeCapability: 7.5
coreClock: 1.515GHz coreCount: 14 deviceMemorySize: 4.00GiB deviceMemoryBandwidth: 178.84GiB/s
2023-03-14 17:37:58.756274: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
2023-03-14 17:37:58.770488: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublas64_11.dll
2023-03-14 17:37:58.770666: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublasLt64_11.dll
2023-03-14 17:37:58.777627: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cufft64_10.dll
2023-03-14 17:37:58.779710: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library curand64_10.dll
2023-03-14 17:37:58.796984: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusolver64_10.dll
2023-03-14 17:37:58.824496: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusparse64_11.dll
2023-03-14 17:37:58.825520: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudnn64_8.dll
2023-03-14 17:37:58.825819: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0
2023-03-14 17:37:58.826281: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-14 17:37:58.827124: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties:
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1650 computeCapability: 7.5
coreClock: 1.515GHz coreCount: 14 deviceMemorySize: 4.00GiB deviceMemoryBandwidth: 178.84GiB/s
2023-03-14 17:37:58.827465: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
2023-03-14 17:37:58.827582: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublas64_11.dll
2023-03-14 17:37:58.827701: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublasLt64_11.dll
2023-03-14 17:37:58.827810: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cufft64_10.dll
2023-03-14 17:37:58.827929: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library curand64_10.dll
2023-03-14 17:37:58.828022: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusolver64_10.dll
2023-03-14 17:37:58.828115: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusparse64_11.dll
2023-03-14 17:37:58.828235: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudnn64_8.dll
2023-03-14 17:37:58.828391: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0
2023-03-14 17:37:59.234835: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1261] Device interconnect StreamExecutor with strength 1 edge matrix:
2023-03-14 17:37:59.235043: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1267]      0 
2023-03-14 17:37:59.235230: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1280] 0:   N
2023-03-14 17:37:59.235544: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1406] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 2891 MB memory) -> physical GPU (device: 0, name: NVIDIA GeForce GTX 1650, pci bus id: 0000:01:00.0, compute capability: 7.5)
2023-03-14 17:37:59.236423: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
[ WARN:[email protected]] global net_impl.cpp:174 cv::dnn::dnn4_v20221220::Net::Impl::setUpNet DNN module was not built with CUDA backend; switching to CPU
2023-03-14 17:37:59.922098: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:196] None of the MLIR optimization passes are enabled (registered 0 passes)
2023-03-14 17:38:00.470162: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublas64_11.dll
2023-03-14 17:38:01.190269: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublasLt64_11.dll
2023-03-14 17:38:01.191604: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudnn64_8.dll
2023-03-14 17:38:01.956778: I tensorflow/core/platform/windows/subprocess.cc:308] SubProcess ended with return code: 0

2023-03-14 17:38:02.009645: I tensorflow/core/platform/windows/subprocess.cc:308] SubProcess ended with return code: 0

C:\Users\Duong\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\utils\linear_assignment_.py:128: FutureWarning: The linear_assignment function is deprecated in 0.21 and will be removed from 0.23. Use scipy.optimize.linear_sum_assignment instead.
  FutureWarning)

I want to improve the tracking accuracy (fewer ID switches) and at the same time increase the FPS. Thanks!


Solution

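  • You should set the network's preferred backend and target to CUDA if you want to run it on an NVIDIA GPU. Take a look at the setPreferableBackend and setPreferableTarget documentation. Note that this only works if your OpenCV build includes the CUDA backend: the warning "DNN module was not built with CUDA backend; switching to CPU" in your log means the installed OpenCV has no CUDA support, so you need an OpenCV build compiled with CUDA and cuDNN enabled.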

    Also take a look at this official Python script example.