I'm creating an image popularity algorithm that cuts a video% .mp4 into frames. With the help of AI, the program examines which frames probably display the most beautiful images; the result of this is expressed in 'score'.
This works but I encounter a problem. Because certain frames in a video are very similar, I have many frames with (almost) the same score.
In the end result, a list is generated with [score, frame number]. I want, for example, if 3 items in the list are almost identical frame numbers and therefore (almost) identical scores, I only keep the frame number in the list with the highest score in order to remove duplicates.
It has something to do with this line: result.append((predict(pil_image, model), name))
Here is the code:
import os
import torch
import torchvision.models
import torchvision.transforms as transforms
from PIL import Image
import json
import cv2
def prepare_image(image):
if image.mode != 'RGB':
image = image.convert("RGB")
Transform = transforms.Compose([
transforms.Resize([224, 224]),
transforms.ToTensor(),
])
image = Transform(image)
image = image.unsqueeze(0)
return image
def predict(image, model):
image = prepare_image(image)
with torch.no_grad():
preds = model(image)
score = preds.detach().numpy().item()
print("Picture score: " + str(round(score, 2)) + " | frames left: " +str(framesToDo))
return str(round(score, 2))
if __name__ == '__main__':
model = torchvision.models.resnet50()
model.fc = torch.nn.Linear(in_features=2048, out_features=1)
model.load_state_dict(torch.load('model/model-resnet50.pth', map_location=torch.device('cpu')))
model.eval()
result = []
# In de folder videos are videos saved with the name of 1 until 23
for i in range(1, 23):
vidcap = cv2.VideoCapture('./video/' + str(i) + '.mp4')
succes, vidcap_image = vidcap.read()
count = 0
framestep = 500 #for Stackoverflow example
framesToDo = vidcap.get(cv2.CAP_PROP_FRAME_COUNT)
# while succes and count < max_frames
while succes and count < int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT)): #maximum amount of frames in video
name = str(i) + '_' + str(count)
cv2.imwrite("./frames_saved/" + 'vid' + '_' + name + ".jpg", vidcap_image) # save frame as jpg image
count += framestep # 500 frames further
framesToDo = framesToDo - framestep
cv2_image = cv2.cvtColor(vidcap_image, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(cv2_image)
result.append((predict(pil_image, model), name))
succes, vidcap_image = vidcap.read()
result.sort(reverse=False)
print(result)
with open('result.json', 'w') as filehandle:
filehandle.write(json.dumps(result))````
Since there is no reproducible example, you can adapt this to solve your problem, this analyses each frame data and skips unnecessary ones, updates the best values and append new values.
MAX_FRAME_NUMBER_DIFF = 60
MAX_SCORE_DIFF = 0.5
current_frame = count
current_score = predict(pil_image, model)
data = (current_score, current_frame)
if not results:
results.append(data)
else:
last_score, last_frame = results[-1]
is_similar_frame = current_frame - last_frame <= MAX_FRAME_NUMBER_DIFF
is_score_better = current_score > last_score
is_score_way_better = current_score - last_score <= MAX_SCORE_DIFF
if is_similar_frame:
if is_score_better:
if is_score_way_better: # if diff between current score and previous score bigger than MAX_SCORE_DIFF
results.append(data)
else: # current score better than previous but not so better
results[-1] = data # update last value
else: # current score not better than previous
continue # skip this one
else: # if not similar frames
results.append(data)