Low FPS when using deepsort and yolov3 with GPU

I'm doing a vehicle tracking and counting problem using yolov3 and deepsort. After consulting the source code on the net and implementing, the fps that I achieved was only 2-3 Fps when using yolov3 weight + cfg. I tried yolov3 tiny weight + cfg it gives more fps in about 10 fps but have big problem with ID switch I have Cuda 11.6 and cudnn installed for gpu usage (I use GTX 1650 and cpu: 10300h) I don't know if the problem I'm having is if I'm using the gpu instead of the cpu, or if the yolov3 weight + cfg is slow.

Here is the code in

import cv2
import numpy as np
import time
from deep_sort_yolov3.deep_sort import preprocessing
from deep_sort_yolov3.deep_sort import nn_matching
from deep_sort_yolov3.deep_sort.detection import Detection
from deep_sort_yolov3.deep_sort.tracker import Tracker
from deep_sort_yolov3.deep_sort.detection import Detection as ddet
from import generate_detections as gdet
from collections import deque
import tensorflow as tf

net = cv2.dnn.readNet("yolov3_320.weights", "yolov3.cfg")

model_filename = "deep_sort_yolov3\model_data\mars-small128.pb"
layer_names = net.getLayerNames()
output_layers = [layer_names[i- 1] for i in net.getUnconnectedOutLayers()]
max_cosine_distance = 0.5
nms_max_overlap = 0.3
classes = []
nn_budget = None
counter = []
# fps = 0.0
pts = [deque(maxlen=30) for _ in range(9999)]
COLORS = np.random.randint(0, 255, size=(200, 3),

encoder = gdet.create_box_encoder(model_filename, batch_size = 1) #Dữ liệu được encode của từng boxs
metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget) # Dùng cosine để so sánh khoảng cách của từng boxs
tracker = Tracker(metric) # track đối tượng dược trên k/c thu được

# Loading camera
#cap = cv2.VideoCapture(0)
cap = cv2.VideoCapture('traffic.mp4')
starting_time = time.time()
frame_id = 0

while True:
    _, frame =
    frame_id += 1
    height, width, channels = frame.shape[:3]
    #print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

    # Detecting objects
    blob = cv2.dnn.blobFromImage(frame, 1/255, (YOLO_SIZE, YOLO_SIZE), (0, 0, 0), True, crop=False)
    outs = net.forward(output_layers)
    # Showing informations on the screen
    i = 0
    indexIDs = []
    class_ids = []
    confidences = []
    bboxes = []
    for out in outs:
        for detection in out:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.5:
                # Object detected
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)
                # Rectangle coordinates
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                bboxes.append([x, y, w, h])
    #feature extraction
    features = encoder(frame, bboxes)
    # import pdb; pdb.set_trace()
    detections = [Detection(bbox, 1.0, feature) for bbox, feature in zip(bboxes, features)]

    #non-maxima supression
    boxes = np.array([d.tlwh for d in detections])
    scores = np.array([d.confidence for d in detections])
    indices = preprocessing.non_max_suppression(boxes, nms_max_overlap, scores)
    detections  =[detections[i] for i in indices]

    #Call the tracker
    for track in tracker.tracks:
        if not track.is_confirmed() or track.time_since_update > 1:
        # boxes.append([track[0], track[1], track[2], track[3]])
        bbox = track.to_tlbr()
        color = [int(c) for c in COLORS[indexIDs[i] % len(COLORS)]]
        cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (color), 3)
        cv2.putText(frame, str(track.track_id), (int(bbox[0]), int(bbox[1] - 50)), 0, 5e-3 * 150, (color), 2)

        i += 1
        # bbox_center_point(x,y)
        center = (int(((bbox[0]) + (bbox[2])) / 2), int(((bbox[1]) + (bbox[3])) / 2))
        # track_id[center]
        thickness = 5
        # center point, (center), 1, color, thickness)

        # draw motion path
        for j in range(1, len(pts[track.track_id])):
            if pts[track.track_id][j - 1] is None or pts[track.track_id][j] is None:
            thickness = int(np.sqrt(64 / float(j + 1)) * 2)
            cv2.line(frame, (pts[track.track_id][j - 1]), (pts[track.track_id][j]), (color), thickness)

    count = len(set(counter))
    cv2.putText(frame, "Total Object Counter: " + str(count), (int(20), int(120)), 0, 5e-3 * 200, (0, 255, 0), 2)
    cv2.putText(frame, "Current Object Counter: " + str(i), (int(20), int(80)), 0, 5e-3 * 200, (0, 255, 0), 2)
    elapsed_time = time.time() - starting_time
    fps = frame_id / elapsed_time
    cv2.putText(frame, "FPS: %f" % (fps), (int(20), int(40)), 0, 5e-3 * 200, (0, 255, 0), 3)
    cv2.namedWindow("YOLO3_Deep_SORT", 0)
    cv2.imshow('YOLO3_Deep_SORT', frame)
    key = cv2.waitKey(1)
    if key == 27:

Here is the terminal window after running

I want to improve the accuracy of the math and at the same time increase the FPS. Thanks!


  • You should set the network preferred backend and target to be CUDA if you want to run it on a NVIDIA GPU. Take a look at the setPreferableBackend and setPreferableTarget documentation.

    Also take a look at this official Python script example.