Low FPS when using deepsort and yolov3 with GPU

I'm doing a vehicle tracking and counting problem using yolov3 and deepsort. After consulting the source code on the net and implementing, the fps that I achieved was only 2-3 Fps when using yolov3 weight + cfg. I tried yolov3 tiny weight + cfg it gives more fps in about 10 fps but have big problem with ID switch I have Cuda 11.6 and cudnn installed for gpu usage (I use GTX 1650 and cpu: 10300h) I don't know if the problem I'm having is if I'm using the gpu instead of the cpu, or if the yolov3 weight + cfg is slow.

Here is the code in main.py

import cv2
import numpy as np
import time
from deep_sort_yolov3.deep_sort import preprocessing
from deep_sort_yolov3.deep_sort import nn_matching
from deep_sort_yolov3.deep_sort.detection import Detection
from deep_sort_yolov3.deep_sort.tracker import Tracker
from deep_sort_yolov3.deep_sort.detection import Detection as ddet
from deep_sort_yolov3.tools import generate_detections as gdet
from collections import deque
import tensorflow as tf
YOLO_SIZE = 320

net = cv2.dnn.readNet("yolov3_320.weights", "yolov3.cfg")

model_filename = "deep_sort_yolov3\model_data\mars-small128.pb"
layer_names = net.getLayerNames()
output_layers = [layer_names[i- 1] for i in net.getUnconnectedOutLayers()]
max_cosine_distance = 0.5
nms_max_overlap = 0.3
classes = []
nn_budget = None
counter = []
# fps = 0.0
pts = [deque(maxlen=30) for _ in range(9999)]
COLORS = np.random.randint(0, 255, size=(200, 3),
    dtype="uint8")


encoder = gdet.create_box_encoder(model_filename, batch_size = 1) #Dữ liệu được encode của từng boxs
metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget) # Dùng cosine để so sánh khoảng cách của từng boxs
tracker = Tracker(metric) # track đối tượng dược trên k/c thu được

# Loading camera
#cap = cv2.VideoCapture(0)
cap = cv2.VideoCapture('traffic.mp4')
font = cv2.FONT_HERSHEY_PLAIN
starting_time = time.time()
frame_id = 0

while True:
    _, frame = cap.read()
    #cv2.resize(frame,(640,480))
    frame_id += 1
    height, width, channels = frame.shape[:3]
    #print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

    # Detecting objects
    blob = cv2.dnn.blobFromImage(frame, 1/255, (YOLO_SIZE, YOLO_SIZE), (0, 0, 0), True, crop=False)
    net.setInput(blob)
    outs = net.forward(output_layers)
    # Showing informations on the screen
    i = 0
    indexIDs = []
    class_ids = []
    confidences = []
    bboxes = []
    for out in outs:
        for detection in out:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.5:
                # Object detected
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)
                # Rectangle coordinates
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                bboxes.append([x, y, w, h])
                confidences.append(float(confidence))
                class_ids.append(class_id)
    #feature extraction
    features = encoder(frame, bboxes)
    # import pdb; pdb.set_trace()
    detections = [Detection(bbox, 1.0, feature) for bbox, feature in zip(bboxes, features)]

    #non-maxima supression
    boxes = np.array([d.tlwh for d in detections])
    scores = np.array([d.confidence for d in detections])
    indices = preprocessing.non_max_suppression(boxes, nms_max_overlap, scores)
    detections  =[detections[i] for i in indices]

    #Call the tracker
    tracker.predict()
    tracker.update(detections)
    for track in tracker.tracks:
        if not track.is_confirmed() or track.time_since_update > 1:
            continue
        # boxes.append([track[0], track[1], track[2], track[3]])
        indexIDs.append(int(track.track_id))
        counter.append(int(track.track_id))
        bbox = track.to_tlbr()
        color = [int(c) for c in COLORS[indexIDs[i] % len(COLORS)]]
        
        cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (color), 3)
        cv2.putText(frame, str(track.track_id), (int(bbox[0]), int(bbox[1] - 50)), 0, 5e-3 * 150, (color), 2)

        i += 1
        # bbox_center_point(x,y)
        center = (int(((bbox[0]) + (bbox[2])) / 2), int(((bbox[1]) + (bbox[3])) / 2))
        # track_id[center]
        pts[track.track_id].append(center)
        thickness = 5
        # center point
        cv2.circle(frame, (center), 1, color, thickness)

        # draw motion path
        for j in range(1, len(pts[track.track_id])):
            if pts[track.track_id][j - 1] is None or pts[track.track_id][j] is None:
                continue
            thickness = int(np.sqrt(64 / float(j + 1)) * 2)
            cv2.line(frame, (pts[track.track_id][j - 1]), (pts[track.track_id][j]), (color), thickness)

    count = len(set(counter))
    cv2.putText(frame, "Total Object Counter: " + str(count), (int(20), int(120)), 0, 5e-3 * 200, (0, 255, 0), 2)
    cv2.putText(frame, "Current Object Counter: " + str(i), (int(20), int(80)), 0, 5e-3 * 200, (0, 255, 0), 2)
    elapsed_time = time.time() - starting_time
    fps = frame_id / elapsed_time
    cv2.putText(frame, "FPS: %f" % (fps), (int(20), int(40)), 0, 5e-3 * 200, (0, 255, 0), 3)
    cv2.namedWindow("YOLO3_Deep_SORT", 0)
  
    cv2.imshow('YOLO3_Deep_SORT', frame)
    key = cv2.waitKey(1)
    if key == 27:
        break
cap.release()
cv2.destroyAllWindows()

Here is the terminal window after running

C:\Users\Duong\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\utils\linear_assignment_.py:22: FutureWarning: The linear_assignment_ module is deprecated in 0.21 and will be removed from 0.23. Use scipy.optimize.linear_sum_assignment instead.
  FutureWarning)
2023-03-14 17:37:56.079094: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
2023-03-14 17:37:57.745508: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-03-14 17:37:57.746566: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library nvcuda.dll
2023-03-14 17:37:58.755945: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1650 computeCapability: 7.5
coreClock: 1.515GHz coreCount: 14 deviceMemorySize: 4.00GiB deviceMemoryBandwidth: 178.84GiB/s
2023-03-14 17:37:58.756274: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
2023-03-14 17:37:58.770488: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublas64_11.dll
2023-03-14 17:37:58.770666: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublasLt64_11.dll
2023-03-14 17:37:58.777627: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cufft64_10.dll
2023-03-14 17:37:58.779710: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library curand64_10.dll
2023-03-14 17:37:58.796984: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusolver64_10.dll
2023-03-14 17:37:58.824496: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusparse64_11.dll
2023-03-14 17:37:58.825520: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudnn64_8.dll
2023-03-14 17:37:58.825819: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0
2023-03-14 17:37:58.826281: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-14 17:37:58.827124: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties:
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1650 computeCapability: 7.5
coreClock: 1.515GHz coreCount: 14 deviceMemorySize: 4.00GiB deviceMemoryBandwidth: 178.84GiB/s
2023-03-14 17:37:58.827465: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
2023-03-14 17:37:58.827582: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublas64_11.dll
2023-03-14 17:37:58.827701: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublasLt64_11.dll
2023-03-14 17:37:58.827810: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cufft64_10.dll
2023-03-14 17:37:58.827929: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library curand64_10.dll
2023-03-14 17:37:58.828022: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusolver64_10.dll
2023-03-14 17:37:58.828115: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusparse64_11.dll
2023-03-14 17:37:58.828235: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudnn64_8.dll
2023-03-14 17:37:58.828391: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0
2023-03-14 17:37:59.234835: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1261] Device interconnect StreamExecutor with strength 1 edge matrix:
2023-03-14 17:37:59.235043: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1267]      0 
2023-03-14 17:37:59.235230: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1280] 0:   N
2023-03-14 17:37:59.235544: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1406] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 2891 MB memory) -> physical GPU (device: 0, name: NVIDIA GeForce GTX 1650, pci bus id: 0000:01:00.0, compute capability: 7.5)
2023-03-14 17:37:59.236423: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
[ WARN:0@3.712] global net_impl.cpp:174 cv::dnn::dnn4_v20221220::Net::Impl::setUpNet DNN module was not built with CUDA backend; switching to CPU
2023-03-14 17:37:59.922098: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:196] None of the MLIR optimization passes are enabled (registered 0 passes)
2023-03-14 17:38:00.470162: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublas64_11.dll
2023-03-14 17:38:01.190269: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublasLt64_11.dll
2023-03-14 17:38:01.191604: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudnn64_8.dll
2023-03-14 17:38:01.956778: I tensorflow/core/platform/windows/subprocess.cc:308] SubProcess ended with return code: 0

2023-03-14 17:38:02.009645: I tensorflow/core/platform/windows/subprocess.cc:308] SubProcess ended with return code: 0

C:\Users\Duong\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\utils\linear_assignment_.py:128: FutureWarning: The linear_assignment function is deprecated in 0.21 and will be removed from 0.23. Use scipy.optimize.linear_sum_assignment instead.
  FutureWarning)

I want to improve the accuracy of the math and at the same time increase the FPS. Thanks!

Solution

You should set the network preferred backend and target to be CUDA if you want to run it on a NVIDIA GPU. Take a look at the setPreferableBackend and setPreferableTarget documentation.

Also take a look at this official Python script example.