Could not perform object detection using GPU (darknet)

I just set up my specifications for object detection. This what I have built:

OpenCV from source (OpenCV tutorial)
Build darknet (Build Darknet)

The specifications of my laptop are:

Windows 10 Pro
RTX 3060Ti (CUDA 11.1, CUDNN 8.6)
Python 3.7.9

After compiled, I tested the darknet based on the tutorial, it detects my GPU, the picture is as below: Success, using darknet.

However, due to many arguments needed to perform detection for darknet, I have built another python program that can make me easily to just run the program.

So here's the problem, when I run "from_video.py", the detection performed is in CPU. The files arranged as follow New folder. Lets take one python program, "from_video.py" for testing, "main.py" will load the model, check the GPU. Code for the python files are as follow.

from_video.py

import cv2
from main import *

videoPath = "./data/test_sampah1.mp4"

darknetmain = darknet_main()

darknetmain.setGPU(is_GPU=True)

video = cv2.VideoCapture(videoPath)
width = int(video.get(3))
height = int(video.get(4))
size = (width,height)
# result = cv2.VideoWriter('sampah1_output.mpeg',cv2.VideoWriter_fourcc(*'MPJG'),20,size)

if video.isOpened():
    while(True):
        res, cv_img = video.read()
        if res==False:
            break
        imcaptions, boundingBoxs = darknetmain.performDetect(cv_img)
        if len(imcaptions)>0:
            if len(imcaptions) > 0:
                for i in range(len(imcaptions)):
                    name = imcaptions[i]
                    name = name[:5]
                    print(name + " is found")
                    cv_img = cv2.rectangle(cv_img, boundingBoxs[i][0], boundingBoxs[i][2], (0, 255, 0), 2)
                    cv_img = cv2.putText(cv_img, imcaptions[i], boundingBoxs[i][0], cv2.FONT_HERSHEY_SIMPLEX, 1,
                                         (0, 0, 255))         
            cv2.imshow("result", cv_img)
            #result.write(cv_img)
            cv2.waitKey(1)
        else:
            print("no result")
    #result.release()
    video.release()
else:
    print("Cannot read the video file")

main.py

from ctypes import *
import math
import random
import os
import numpy as np
import cv2
class BOX(Structure):
    _fields_ = [("x", c_float),
                ("y", c_float),
                ("w", c_float),
                ("h", c_float)]

class DETECTION(Structure):
    _fields_ = [("bbox", BOX),
                ("classes", c_int),
                ("prob", POINTER(c_float)),
                ("mask", POINTER(c_float)),
                ("objectness", c_float),
                ("sort_class", c_int),
                ("uc", POINTER(c_float)),
                ("points", c_int),
                ("embeddings", POINTER(c_float)),
                ("embedding_size", c_int),
                ("sim", c_float),
                ("track_id", c_int)]


class IMAGE(Structure):
    _fields_ = [("w", c_int),
                ("h", c_int),
                ("c", c_int),
                ("data", POINTER(c_float))]

class METADATA(Structure):
    _fields_ = [("classes", c_int),
                ("names", POINTER(c_char_p))]

class darknet_main():
    def __init__(self):
        self.netMain = None
        self.metaMain = None
        self.altNames = None

        self.thresh = 0.5

        self.configPath = './model/yolov3_custom_fyp_testing.cfg'
        self.weightPath = './model/yolov3_custom_fyp_last.weights'
        self.metaPath = "./model/obj_fyp.data"
        self.frame = None

    def setGPU(self, is_GPU):
        self.hasGPU = is_GPU
        if self.hasGPU:
            self.lib = CDLL("yolo_cpp_dll.dll", RTLD_GLOBAL)
        else:
            self.lib = CDLL("yolo_cpp_dll_nogpu.dll", RTLD_GLOBAL)

        self.lib.network_width.argtypes = [c_void_p]
        self.lib.network_width.restype = c_int
        self.lib.network_height.argtypes = [c_void_p]
        self.lib.network_height.restype = c_int

        self.predict = self.lib.network_predict
        self.predict.argtypes = [c_void_p, POINTER(c_float)]
        self.predict.restype = POINTER(c_float)

        if self.hasGPU:
            self.set_gpu = self.lib.cuda_set_device
            self.set_gpu.argtypes = [c_int]

        self.make_image = self.lib.make_image
        self.make_image.argtypes = [c_int, c_int, c_int]
        self.make_image.restype = IMAGE

        self.get_network_boxes = self.lib.get_network_boxes
        self.get_network_boxes.argtypes = [c_void_p, c_int, c_int, c_float, c_float, POINTER(c_int), c_int,
                                           POINTER(c_int),
                                           c_int]
        self.get_network_boxes.restype = POINTER(DETECTION)

        self.make_network_boxes = self.lib.make_network_boxes
        self.make_network_boxes.argtypes = [c_void_p]
        self.make_network_boxes.restype = POINTER(DETECTION)

        self.free_detections = self.lib.free_detections
        self.free_detections.argtypes = [POINTER(DETECTION), c_int]

        self.free_ptrs = self.lib.free_ptrs
        self.free_ptrs.argtypes = [POINTER(c_void_p), c_int]

        self.network_predict = self.lib.network_predict
        self.network_predict.argtypes = [c_void_p, POINTER(c_float)]

        self.reset_rnn = self.lib.reset_rnn
        self.reset_rnn.argtypes = [c_void_p]

        self.load_net = self.lib.load_network
        self.load_net.argtypes = [c_char_p, c_char_p, c_int]
        self.load_net.restype = c_void_p

        self.load_net_custom = self.lib.load_network_custom
        self.load_net_custom.argtypes = [c_char_p, c_char_p, c_int, c_int]
        self.load_net_custom.restype = c_void_p

        self.do_nms_obj = self.lib.do_nms_obj
        self.do_nms_obj.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]

        self.do_nms_sort = self.lib.do_nms_sort
        self.do_nms_sort.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]

        self.free_image = self.lib.free_image
        self.free_image.argtypes = [IMAGE]

        self.letterbox_image = self.lib.letterbox_image
        self.letterbox_image.argtypes = [IMAGE, c_int, c_int]
        self.letterbox_image.restype = IMAGE

        self.load_meta = self.lib.get_metadata
        self.lib.get_metadata.argtypes = [c_char_p]
        self.lib.get_metadata.restype = METADATA

        self.load_image = self.lib.load_image_color
        self.load_image.argtypes = [c_char_p, c_int, c_int]
        self.load_image.restype = IMAGE

        self.rgbgr_image = self.lib.rgbgr_image
        self.rgbgr_image.argtypes = [IMAGE]

        self.predict_image = self.lib.network_predict_image
        self.predict_image.argtypes = [c_void_p, IMAGE]
        self.predict_image.restype = POINTER(c_float)


        self.netMain = self.load_net_custom(self.configPath.encode("ascii"), self.weightPath.encode("ascii"), 0,
                                            1)  # batch size = 1
        self.metaMain = self.load_meta(self.metaPath.encode("ascii"))



        try:
            with open(self.metaPath) as metaFH:
                metaContents = metaFH.read()
                import re
                match = re.search("names *= *(.*)$", metaContents, re.IGNORECASE | re.MULTILINE)
                if match:
                    result = match.group(1)
                else:
                    result = None
                try:
                    if os.path.exists(result):
                        with open(result) as namesFH:
                            self.namesList = namesFH.read().strip().split("\n")
                            self.altNames = [x.strip() for x in self.namesList]
                except TypeError:
                    pass
        except Exception:
            pass

    def sample(self, probs):
        s = sum(probs)
        probs = [a/s for a in probs]
        r = random.uniform(0, 1)
        for i in range(len(probs)):
            r = r - probs[i]
            if r <= 0:
                return i
        return len(probs)-1

    def c_array(self, ctype, values):
        arr = (ctype*len(values))()
        arr[:] = values
        return arr

    def array_to_image(self, arr):
        import numpy as np
        # need to return old values to avoid python freeing memory
        arr = arr.transpose(2,0,1)
        c = arr.shape[0]
        h = arr.shape[1]
        w = arr.shape[2]
        arr = np.ascontiguousarray(arr.flat, dtype=np.float32) / 255.0
        data = arr.ctypes.data_as(POINTER(c_float))
        im = IMAGE(w,h,c,data)
        return im, arr

    def classify(self, net, meta, im):
        out = self.predict_image(net, im)
        res = []
        for i in range(meta.classes):
            if self.altNames is None:
                nameTag = meta.names[i]
            else:
                nameTag = self.altNames[i]
            res.append((nameTag, out[i]))
        res = sorted(res, key=lambda x: -x[1])
        return res

    def detect(self, net, meta, cv_im, thresh=.5, hier_thresh=.5, nms=.45, debug= False):
        # im = self.load_image(image, 0, 0)
        # debug=True
        custom_image = cv2.cvtColor(cv_im, cv2.COLOR_BGR2RGB)
        h, w, c_ = custom_image.shape
        custom_image = cv2.resize(custom_image,(self.lib.network_width(net), self.lib.network_height(net)), interpolation = cv2.INTER_LINEAR)
        im, arr = self.array_to_image(custom_image)     # you should comment line below: free_image(im)
        if debug: print("Loaded image")
        num = c_int(0)
        if debug: print("Assigned num")
        pnum = pointer(num)
        if debug: print("Assigned pnum")
        self.predict_image(net, im)
        if debug: print("did prediction")
        dets = self.get_network_boxes(net, w, h, self.thresh, hier_thresh, None, 0, pnum, 0) # OpenCV
        # dets = self.get_network_boxes(net, im.w, im.h, thresh, hier_thresh, None, 0, pnum, 0)
        if debug: print("Got dets")
        num = pnum[0]
        if debug: print("got zeroth index of pnum")
        if nms:
            self.do_nms_sort(dets, num, meta.classes, nms)
        if debug: print("did sort")
        res = []
        if debug: print("about to range")
        for j in range(num):
            if debug: print("Ranging on "+str(j)+" of "+str(num))
            if debug: print("Classes: "+str(meta), meta.classes, meta.names)
            for i in range(meta.classes):
                if debug: print("Class-ranging on "+str(i)+" of "+str(meta.classes)+"= "+str(dets[j].prob[i]))
                if dets[j].prob[i] > 0:
                    b = dets[j].bbox
                    if self.altNames is None:
                        nameTag = self.meta.names[i]
                    else:
                        nameTag = self.altNames[i]
                    if debug:
                        print("Got bbox", b)
                        print(nameTag)
                        print(dets[j].prob[i])
                        print((b.x, b.y, b.w, b.h))
                    res.append((nameTag, dets[j].prob[i], (b.x, b.y, b.w, b.h)))
        if debug: print("did range")
        res = sorted(res, key=lambda x: -x[1])
        if debug: print("did sort")
        self.free_detections(dets, num)
        if debug: print("freed detections")
        return res

    def performDetect(self, cv_img):

        # Do the detection
        detections = self.detect(self.netMain, self.metaMain, cv_img, self.thresh)

        imcaptions = []
        boundingBoxs=[]
        for detection in detections:
            label = detection[0]
            confidence = detection[1]
            pstring = label+": "+str(np.rint(100 * confidence))+"%"
            imcaptions.append(pstring)
            print(pstring)
            bounds = detection[2]

            yExtent = int(bounds[3])
            xEntent = int(bounds[2])
            # Coordinates are around the center
            xCoord = int(bounds[0] - bounds[2]/2)
            yCoord = int(bounds[1] - bounds[3]/2)
            boundingBox = [
                (xCoord, yCoord),
                (xCoord, yCoord + yExtent),
                (xCoord + xEntent, yCoord + yExtent),
                (xCoord + xEntent, yCoord)
            ]
            boundingBoxs.append(boundingBox)
            # cv_img = cv2.rectangle(cv_img, boundingBox[0], boundingBox[2], (0,0, 255), 1)
        #
        # cv2.imshow("image", cv_img)
        # cv2.waitKey(0)
        return imcaptions, boundingBoxs

if __name__ == "__main__":
    darknetmain = darknet_main()
    darknetmain.setGPU(is_GPU=True)
    imagePath = "./data/bottle_8.png"
    cv_img = cv2.imread(imagePath)
    darknetmain.performDetect(cv_img)

Solution

It seems little bit complicated, when you try to Build openCV from source, make sure that your CUDA arch bin supports your version or higher. In my example, 8.0 and 8.6. Then, when you try to compile darknet from source, make sure that your compute_,sm_ for both 8.0 and 8.6