numpy opencv machine-learning computer-vision yolo

YoloV3 Result Giving Zero confidence in every class

I Am Doing Implementation Of Yolo v3 for multi class object detection

yolo is algorithm based region proposal and the region proposal with max confidence treated as prediction of yolo for more you can read it about here

for this particular task i refer to this murtuza tutorial that guided me from scratch

now as the complex network architecture requires hours of training i prefer to use transfer learning as using pretrained network and weighs(parameters) both of this link you can find here
Architecture configuration:cfg
Network Parameters(weights):weights

i used here yolov3 tiny as i required higher frame rates to process a video but afterall it not giving promising result as the tutorial shows i don't where i am lacking but even changing the network cfg and weight file to the orignal yolov3(320)does not giving true result as i am getting all 5 spatial data as coordinate and confidence [cx,cy,h,w,confidence] but all 80 classes probality is still zero vector[0.0,0.0,0.0---0.0] even changing the video source and choosing another video resulting into zero vector which was in tutorial working fine

Implementation Code:

# YOLO Algorithm
# Network Weights and configuration Files 

yolov3_tiny_cfg='/root/Downloads/ML TASK/yolov3-tiny.cfg' # configuration file
yolov3_tiny_weights='/root/Downloads/ML TASK/yolov3-tiny.weights' # weights
coco_names='/root/Downloads/ML TASK/coco.names' # coco classes
# for yolo genral 320 architecture
# put paths to directory
yolov3_cfg='/root/Downloads/ML TASK/yolov3.cfg'
yolov3_weights='/root/Downloads/ML TASK/yolov3.weights'

# Test Videos
Test_video_1='/root/Downloads/ML TASK/mn.mp4'
Test_video_2='/root/Downloads/ML TASK/bg.mp4'


# Dependencies
import cv2
import numpy as np


# Dataset Classes:
# there are around 80 classes in the coco dataset so manually writing them would not be right choice so instead of them we are getting them from a file name coco.names stored in drive
# getting list of classes
classes=[] # empty list intialization
with open(coco_names,'r')as f:
  classes=f.read().splitlines()
# viewing the multiclass list around 80 classes in coco dataset


# Loading the yolov3 using configuration file and weights
network=cv2.dnn.readNetFromDarknet(yolov3_cfg,yolov3_weights)
network.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)# to use opencv CPU as backend
network.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

#NOTE: The network won't feed directly the image we have to First Preprocess it To match the input shape of network also the type i.e. Blob it genrally refers to a mathematical form of binary Images Like Bitmap
Width,Height=320,320 # sqaure image so the network grid should be n*n equal on both dimension
Confidence_Threshould=0.5 # minimum problity for claiming the prediction
NMS_Threshould=0.3


cap=cv2.VideoCapture('game.mp4')
fps = cap.get(cv2.CAP_PROP_FPS)
timestamps = [cap.get(cv2.CAP_PROP_POS_MSEC)]

# function to find objects on captured video stream
def findObjects(outputs,image):
  h,w,c=image.shape
  bound_box=[] # for feeding through function
  classIds=[]
  confidence=[]

  for output in outputs: # getting o/p from 2 layers(v3 tiny) 3 if use yolov3 320
    for detection in output:
      scores=detection[5:]  #slice first five values cause we are gonnause them in bounding
      classId=np.argmax(scores)
      confs=scores[classIds]

      # filtering object putting them as final prediction only when its breaches the minimum threshould of confidence
      if confs > Confidence_Threshould:
        w,h=int(detection[2]*Width),int(detection[3]*Height)              # to convert % into pixel
        x,y=int((detection[0]*Width)-(w/2)),int((detection[1]*Height)-(h/2)) 
        bound_box.append([x,y,w,h])
        classIds.append(classId)
        confidence.append(float(confs))
  print(len(bound_box))

  # to downsample the no. of boxes on frame we use nms boxes it give indices by which spatial info to keep
  indices=cv2.dnn.NMSBoxes(bound_box,confs,Confidence_Threshould,NMS_Threshould)
  for i in indices:
    i=i[0]
    box=bound_box[i]
    x,y,w,h=box[0],box[1],box[2],box[3]
    cv2.rectangle(image,(x,y),(x+w,y+h),(255,0,0),2)
    cv2.puttext(image,f'{classes[classIds[i]]}{int(confidence[i]*100)}%',
    (x,y-10),cv2.FONT_HERSHEY_PLAIN,0.6,(0,255,0),2)
    cv2.puttext(image,f'FPS:{fps}',(0,150),cv2.FONT_HERSHEY_PLAIN,0.6,(0,255,0),2)
    cv2.puttext(image,f'TIMESTAMPS:{timestamps}',(150,0),cv2.FONT_HERSHEY_PLAIN,0.6,(0,255,0),2)





while True:
  success,image=cap.read()
  # coverting image into blob for network i/O processing
  try:
   blob=cv2.dnn.blobFromImage(image,1/255,(Width,Height),[0,0,0],crop=False)
  except:
   continue

  # I/P
  network.setInput(blob) # Setting Input

  # O/P 
  # As Yolo Architecture Produces Three O/p[Genral Architecture] From The Respective Layer And By Summarize The Max Of Confidence to Decide Final Predictions
  # But here only 2 o/p of network as we are using the tiny version for higher frame rates
  # In Order to Get The Outputs We Have To Know the Name Of the Respective Layers #i.e. Not Names Actually But Getting indexes(starting from 1 Not zero) Here By Use Of getUnconnectedOutLayers Function 
  layers_names=network.getLayerNames()
  #print(network.getUnconnectedOutLayers()) #36th and 48th indexes
  #looping over as we are traversing multiple values of OutLayers
  outputNames=[layers_names[i[0]-1]for i in network.getUnconnectedOutLayers()] #-1 cause the index are starting from one not zero 
  #print(outputNames) # for v3 tiny its 16 and 23  are layer name

  # forwading the image to network
  outputs=network.forward(outputNames)
 
  # finding objects
  # print(outputs[0].shape)=>(300,85) 300=>no.of boxes 85=>[cx,cy,height,width,confidence,probablity of 80 classes]
  # using the cx,cy,h,w we are gonna determine the bounding box
  # print(outputs[1].shape)=>(1200,85) 1200 boxes this shape present in m*n format i.e. matrix faishion  where 1200 rows of boxes map with 85 vector details explained aboved
  #print(outputs[0][0])

  findObjects(outputs,image)
  
  cv2.imshow('Window',image)
  
  if cv2.waitKey(15) & 0xFF == ord('q'):
        break
  cap.release()
  cv2.destroyAllWindows()

Solution

You have many problems with your code.

You have to use the h,w that you got from the image and not your default width and height that you use to blob the image for the YoloV3.

change

    w,h=int(detection[2]*Width),int(detection[3]*Height)
    
    x,y=int((detection[0]*Width)-(w/2)),int((detection[1]*Height)-(h/2))

    w,h = int(det[2]*w) , int(det[3]*h)
    x,y = int((det[0]*w)-Width/2) , int((det[1]*h)-Height/2)

You confused a lot between confs and confidence and that's make a mess, you can check with the murtaza tutorial but it will take some time.

There may be a few more small mistakes I missed.

---------------------------------- Final Solution: ----------------------------------

To save your time Here is the correct code style of your project that works.

Note 1: I changed the coco.names labels loading method a little bit, your method didn't work well on my Macbook Pro.

Note 2: In my code you have to change the file paths back to your paths like in your original code.

yolov3_cfg='/root/Downloads/ML TASK/yolov3.cfg'

yolov3_weights='/root/Downloads/ML TASK/yolov3.weights'

import cv2 as cv
import numpy as np

cap = cv.VideoCapture("video.mp4")
whT = 320
confThreshold =0.5
nmsThreshold= 0.2

#### LOAD MODEL
## Coco Names
classesFile = "coco.names"
classNames = open(classesFile).read().strip().split("\n")
print(classNames)
## Model Files
modelConfiguration = "yolov3.cfg"
modelWeights = "yolov3.weights"
net = cv.dnn.readNetFromDarknet(modelConfiguration, modelWeights)
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

def findObjects(outputs,img):
    hT, wT, cT = img.shape
    bbox = []
    classIds = []
    confs = []
    for output in outputs:
        for det in output:
            scores = det[5:]
            classId = np.argmax(scores)
            confidence = scores[classId]
            if confidence > confThreshold:
                w,h = int(det[2]*wT) , int(det[3]*hT)
                x,y = int((det[0]*wT)-w/2) , int((det[1]*hT)-h/2)
                bbox.append([x,y,w,h])
                classIds.append(classId)
                confs.append(float(confidence))

    indices = cv.dnn.NMSBoxes(bbox, confs, confThreshold, nmsThreshold)

    for i in indices:
        i = i[0]
        box = bbox[i]
        x, y, w, h = box[0], box[1], box[2], box[3]
        # print(x,y,w,h)
        cv.rectangle(img, (x, y), (x+w,y+h), (255, 0 , 255), 2)
        cv.putText(img,f'{classNames[classIds[i]].upper()} {int(confs[i]*100)}%',
                 (x, y-10), cv.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 255), 2)

while True:
    success, img = cap.read()
    blob = cv.dnn.blobFromImage(img, 1 / 255, (whT, whT), [0, 0, 0], 1, crop=False)
    net.setInput(blob)
    layersNames = net.getLayerNames()
    outputNames = [(layersNames[i[0] - 1]) for i in net.getUnconnectedOutLayers()]
    outputs = net.forward(outputNames)
    findObjects(outputs,img)

    cv.imshow('Image', img)
    cv.waitKey(1)