Search code examples
pythontensorflowartificial-intelligencetraining-dataface-detection

Trying to extract frames from a video where faces are detected using my trained TensorFlow model but it writes black images. What went wrong?


I'm trying to make a TensorFlow face detection model. One file trains the model and exports the model in a .h5 file shown here.

import tensorflow as tf
import cv2
from tensorflow.keras import layers
import os
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications.resnet50 import preprocess_input

image_size = (224, 224)
batch_size = 32
epochs = 10

def load_widerface_dataset(widerface_dir):
    images_dir = os.path.join(widerface_dir, 'WIDER_train', 'images')
    labels_file = os.path.join(widerface_dir, 'wider_face_split', 'wider_face_train_bbx_gt.txt')

    X = []
    y = []

    with open(labels_file, 'r') as f:
        lines = f.readlines()

    num_images = int(lines[1])
    current_line = 2

    for _ in range(num_images):
        image_path = os.path.join(images_dir, lines[current_line - 2].strip())
        num_faces = int(lines[num_images])

        image = Image.open(image_path)
        image = image.resize((224, 224))
        X.append(np.array(image))

        faces = []
        for i in range(num_faces):
            face_line = lines[current_line + i].strip().split(' ')
            face = [int(coord) for coord in face_line[:4]]
            faces.append(face)

        y.append(np.array(faces))

        current_line += num_faces

        if current_line < len(lines) and not lines[current_line].strip():
            current_line += 1

    X = np.array(X)
    y = np.array(y)

    return X, y

def build_model():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu',
                               input_shape=(image_size[0], image_size[1], 3)),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(4, activation='sigmoid')
    ])

    return model

widerface_dir = 'C:/face_training/WIDER_train'
X_train, y_train = load_widerface_dataset(widerface_dir)
model = build_model()
model.compile(optimizer = 'adam', loss = 'mean_squared_error')
model.fit(X_train, y_train, batch_size = batch_size, epochs = epochs)
model.save('face_detection_model.h5')

Another file is to take a video and analyze the video using my trained model here and extract frames where faces are detected. Here is the code for it.

import cv2
import os
from tensorflow.keras.models import load_model
import sys
import numpy as np

vidPath = "testclip2.mp4"
model_path = 'face_detection_model.h5'
model = load_model(model_path)

test_tensorflow = 'C:/Users/user/Documents/PyCharmProjects/test_tensorflow'
if not os.path.exists(test_tensorflow):
    os.makedirs(test_tensorflow)

cap = cv2.VideoCapture(vidPath)

currentFrame = 0
while (cap.isOpened()):
    ret, frame = cap.read()

    if ret == True:
        frame = cv2.resize(frame, (224, 224))
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = frame / 255.0
        frames = [frame]

        for frame in frames:
            prediction = model.predict(frame.reshape(1, 224, 224, 3))

            if prediction.any() > 0.5:
                cv2.imwrite(os.path.join(test_tensorflow, str(currentFrame) + '.jpg'), frame)
                currentFrame += 1

    else:
        break

cap.release()
sys.exit(0)

And now I have this problem where the program only extracts black images. I'm not sure what went wrong and I've tried reading up a bunch of stuff but nothing seemed to work. Would really appreciate your guidance.


Solution

  • I think the issue is that you are normalising the RGB values, i.e. converting from from a range of 0-255 to a range of 0-1.

    This is common practice for modelling and predicting, but when you try to output the result with OpenCv it will not understand the normalised pixel values.

    You can try a quick fix suggested here: https://stackoverflow.com/a/54165573/334402

    This essentially just restores the original 0-255 range that OpenCV is expecting.