Search code examples
Tags: python, computer-vision, face-detection, mediapipe

I am encountering an issue with drawing boxes in Face Detection using MediaPipe


I'm trying to create simple code to detect faces using my webcam with the MediaPipe library, but I'm facing an issue. When I attempt to draw the box on the mp_image, it's not appearing. Can someone please help me?

#------ importamos las librerias -----------#
import cv2
import mediapipe as mp
import numpy as np


#--------- Declare the detector --------#
detector = mp.tasks.vision.FaceDetector
# Drawing utilities module; print_result calls its draw_detection helper.
dibujo = mp.solutions.drawing_utils

#---- Open the video capture ----#
# Device index 0 is passed to VideoCapture.
cap = cv2.VideoCapture(0)

#------ Initialize the configuration parameters -----#
# Short aliases for the MediaPipe Tasks classes used in the rest of the script.
BaseOptions = mp.tasks.BaseOptions
FaceDetectorOptions = mp.tasks.vision.FaceDetectorOptions
FaceDetectorResult = mp.tasks.vision.FaceDetectorResult
VisionRunningMode = mp.tasks.vision.RunningMode



#------- Result callback for the real-time detection ------#
def print_result(result: FaceDetectorResult, output_image, timestamp_ms: int):
    """Print a detection result and attempt to draw every detected face.

    This function is passed as ``result_callback`` when the detector options
    are built in this script.

    Args:
        result: Detection result; its ``detections`` are iterated here.
        output_image: Image the result refers to. It is converted with
            ``np.array`` before being handed to the drawing helper.
        timestamp_ms: Frame timestamp in milliseconds (not used here).

    NOTE(review): the array that gets drawn on is a local variable that is
    neither returned nor stored, and the capture loop displays ``mp_image``,
    so nothing drawn here can reach the window.
    """
    print('face detector result: {}'.format(result))

    # Check whether there are detections to draw
    if result.detections:
        # Draw each detected face
        for detection in result.detections:

            # PROBLEMS WITH THE DATA TYPES
            # NOTE(review): np.array() on an mp.Image presumably does not
            # expose the pixel buffer; mp.Image.numpy_view() looks like the
            # intended accessor - confirm.
            output_image = np.array(output_image)
            print(f"image = {type(output_image)}")
            # NOTE(review): draw_detection may expect a relative bounding box,
            # while the Tasks detection carries pixel coordinates - confirm.
            det_pb2 = detection.to_pb2()
            dibujo.draw_detection(output_image, det_pb2)
            print('detected faces')

#----------- Start the configuration ----------------#
options = FaceDetectorOptions(
    base_options = BaseOptions(model_asset_path='model.tflite'),
    running_mode = VisionRunningMode.LIVE_STREAM,
    min_detection_confidence = 0.5,
    result_callback = print_result
)


# Get the frames-per-second rate (FPS) to compute the timestamp
# NOTE(review): some cameras may report 0 here, which would make the
# division in the loop raise ZeroDivisionError - confirm.
fps = cap.get(cv2.CAP_PROP_FPS)
frame_number = 0

with detector.create_from_options(options) as rostros:


    while True:

        # Read from the video capture
        # NOTE(review): ret is never checked, so a failed read is passed on
        # to cv2.flip as-is.
        ret, frame = cap.read()

        # Remove the movement error (flip the frame)
        frame = cv2.flip(frame,1)

        # Color correction (BGR -> RGB)
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Get frame_timestamp_ms
        frame_timestamp_ms = int((frame_number / fps) * 1000)
        frame_number += 1

        # Convert the frame to MediaPipe's mp.Image type
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb)
        # Send the frame for face detection; results arrive in print_result
        rostros.detect_async(mp_image,frame_timestamp_ms)


        # Show the frames
        # NOTE(review): mp_image is an mp.Image, not the NumPy array that
        # cv2.imshow works with, and it is not the array print_result draws on.
        cv2.imshow("Camara", mp_image)

        # Read the keyboard (27 is the ESC key)
        t = cv2.waitKey(1)
        if t == 27:
            break

cap.release()
cv2.destroyAllWindows()

I've tried a few things, such as changing the image type and the detection type, but nothing seems to work.


Solution

  • Sorry for not responding earlier. I had some health issues and needed to rest.

    Well, the first thing to highlight is that we changed the running_mode option of the face detector from LIVE_STREAM to IMAGE. The practical difference between them is that in LIVE_STREAM mode, detect_async returns immediately and the results are delivered later to a callback function, whereas in IMAGE mode, detect blocks until the frame has been processed and returns the result directly, so we can process each frame inside the while loop.

        # IMAGE mode: no result_callback is configured; the detection result
        # is taken from the return value of detect() instead.
        options = FaceDetectorOptions(
            base_options=BaseOptions(model_asset_path='model.tflite'),
            running_mode=VisionRunningMode.IMAGE,  # IMAGE MODE
            min_detection_confidence=0.5
        )
    

    Another thing we modified was the addition of a function to visualize the results. I copied this function directly from the MediaPipe website. It’s a function created by the same developers, which draws the bounding box on a copy of the image and then marks the keypoints. The keypoints are given in normalized coordinates (values between 0 and 1), so before drawing them they are passed to an internal helper that converts them to pixel coordinates. The helper first checks that both values are valid normalized values; if either one falls outside the 0–1 range, it returns None. Otherwise it multiplies the values by the image width and height and returns the resulting pixel coordinates, which are then used with OpenCV's circle method to draw the keypoints on the image.

    from typing import Tuple, Union
    import math
    import cv2
    import numpy as np
    
    # Layout and styling constants used by visualize() when it labels a
    # detection.
    MARGIN = 10  # pixels; offset of the label from the box origin (x and y)
    ROW_SIZE = 10  # pixels; extra vertical offset added to the label position
    FONT_SIZE = 1  # font scale passed to cv2.putText
    FONT_THICKNESS = 1  # stroke thickness passed to cv2.putText
    # Used for both the bounding box and the label text.
    TEXT_COLOR = (255, 0, 0)  # red (when the image is in RGB channel order)
    
    
    def _normalized_to_pixel_coordinates(
        normalized_x: float, normalized_y: float, image_width: int,
        image_height: int) -> Union[None, Tuple[int, int]]:
        """Map a normalized (x, y) pair onto integer pixel coordinates.

        Args:
            normalized_x: Horizontal position as a fraction of the image width.
            normalized_y: Vertical position as a fraction of the image height.
            image_width: Image width in pixels.
            image_height: Image height in pixels.

        Returns:
            ``(x_px, y_px)`` clamped to the last valid pixel index, or ``None``
            when either input is not a valid normalized value.
        """
        # A coordinate is valid when it lies in [0, 1]; math.isclose admits
        # values that only miss a bound by floating-point error.
        for coordinate in (normalized_x, normalized_y):
            at_least_zero = coordinate > 0 or math.isclose(0, coordinate)
            at_most_one = coordinate < 1 or math.isclose(1, coordinate)
            if not (at_least_zero and at_most_one):
                # TODO: Draw coordinates even if it's outside of the image bounds.
                return None

        column = math.floor(normalized_x * image_width)
        row = math.floor(normalized_y * image_height)
        # A coordinate of exactly 1.0 would land one past the last pixel.
        return min(column, image_width - 1), min(row, image_height - 1)
    
    
    def visualize(
        image,
        detection_result
    ) -> np.ndarray:
        """Draw bounding boxes, keypoints and scores on a copy of the image.

        Args:
            image: Image array whose ``shape`` unpacks as (height, width,
                channels). Colors are given as RGB tuples, so an RGB image is
                expected.
            detection_result: Object exposing a ``detections`` iterable. Each
                detection provides ``bounding_box`` (``origin_x``, ``origin_y``,
                ``width``, ``height``), ``keypoints`` (normalized ``x`` / ``y``)
                and ``categories``.

        Returns:
            The annotated copy; the input image itself is not modified.
        """
        annotated_image = image.copy()
        height, width, _ = image.shape
        keypoint_color, keypoint_radius, keypoint_thickness = (0, 255, 0), 2, 2

        for detection in detection_result.detections:
            # Draw bounding box
            bbox = detection.bounding_box
            start_point = bbox.origin_x, bbox.origin_y
            end_point = bbox.origin_x + bbox.width, bbox.origin_y + bbox.height
            cv2.rectangle(annotated_image, start_point, end_point, TEXT_COLOR, 3)

            # Draw keypoints
            for keypoint in detection.keypoints:
                keypoint_px = _normalized_to_pixel_coordinates(
                    keypoint.x, keypoint.y, width, height)
                if keypoint_px is None:
                    # The keypoint lies outside the image; cv2.circle cannot
                    # take None as a center, so skip it.
                    continue
                # cv2.circle takes (img, center, radius, color, thickness).
                cv2.circle(annotated_image, keypoint_px, keypoint_radius,
                           keypoint_color, keypoint_thickness)

            # Draw label and score (skipped when no category is reported)
            if not detection.categories:
                continue
            category = detection.categories[0]
            category_name = category.category_name
            category_name = '' if category_name is None else category_name
            probability = round(category.score, 2)
            result_text = f"{category_name} ({probability})"
            text_location = (MARGIN + bbox.origin_x,
                             MARGIN + ROW_SIZE + bbox.origin_y)
            cv2.putText(annotated_image, result_text, text_location,
                        cv2.FONT_HERSHEY_PLAIN, FONT_SIZE, TEXT_COLOR, FONT_THICKNESS)

        return annotated_image
    

    Here is the full main code so that it can be verified by everyone:

    import cv2
    import mediapipe as mp
    import numpy as np
    from drawing import visualize
    
    #--------- Declare the detector --------#
    detector = mp.tasks.vision.FaceDetector

    #------ Initialize configuration parameters -----#
    BaseOptions = mp.tasks.BaseOptions
    FaceDetectorOptions = mp.tasks.vision.FaceDetectorOptions
    VisionRunningMode = mp.tasks.vision.RunningMode

    #----------- Start configuration ----------------#
    # IMAGE mode: detect() returns the result for each frame directly, so no
    # result callback is needed.
    options = FaceDetectorOptions(
        base_options=BaseOptions(model_asset_path='model.tflite'),
        running_mode=VisionRunningMode.IMAGE,  # IMAGE MODE
        min_detection_confidence=0.5
    )

    #---- Perform video capture ----#
    cap = cv2.VideoCapture(0)

    # try/finally guarantees the camera and the window are released even if
    # creating the detector or processing a frame raises.
    try:
        with detector.create_from_options(options) as faces:

            while True:

                # Read from the video capture
                ret, frame = cap.read()

                if not ret:
                    print("Error capturing the image.")
                    break

                # Eliminate movement error
                frame = cv2.flip(frame, 1)

                # Color correction
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                # Convert the frame to MediaPipe's mp.Image format
                mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb)

                # Perform face detection in the frame
                result = faces.detect(mp_image)  # Use detect instead of detect_async

                # Draw the face markings; visualize() works on its own copy of
                # the array it receives, so no extra copy is made here.
                annotated_image = visualize(rgb, result)

                # Convert from RGB back to BGR, the order cv2.imshow displays
                bgr_annotated_image = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
                cv2.imshow("image", bgr_annotated_image)

                # Read the keyboard input; keep only the low byte so the
                # comparison is not affected by any higher bits in the value.
                if cv2.waitKey(1) & 0xFF == 27:  # ESC key pressed: exit the loop
                    break
    finally:
        cap.release()
        cv2.destroyAllWindows()
    

    You can read more in the Python guide for the Face Detector on the official MediaPipe site:

    https://ai.google.dev/edge/mediapipe/solutions/vision/face_detector/python?hl=pt-br