I'm trying to detect multi-hand gestures using MediaPipe. I want to detect the gestures of both hands independently; both hands can show the same gesture or different gestures. In the given code, the function print_result prints the contents of the result object after inference has been run on the frame, and the max_num_hands parameter has been set to 2 via mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.5):
import cv2
import mediapipe as mp
import time

cap = cv2.VideoCapture(1)

BaseOptions = mp.tasks.BaseOptions
GestureRecognizer = mp.tasks.vision.GestureRecognizer
GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions
GestureRecognizerResult = mp.tasks.vision.GestureRecognizerResult
VisionRunningMode = mp.tasks.vision.RunningMode

# Callback function to print gesture recognition results
def print_result(result: GestureRecognizerResult, output_image: mp.Image, timestamp_ms: int):
    if result.gestures:
        # Get the category name of the recognized gesture
        category_name = result.gestures[0][0].category_name
        # print(category_name)
        print(result)
    else:
        print("No gestures recognized")

# Initialize MediaPipe drawing utils and hands module
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# Configure options for the gesture recognizer
options = GestureRecognizerOptions(
    base_options=BaseOptions(model_asset_path='C:\\Users\\golut\\OneDrive\\Documents\\Projects\\Virtual Mouse\\models\\gesture_recognizer.task'),
    running_mode=VisionRunningMode.LIVE_STREAM,
    result_callback=print_result
)

# Create a gesture recognizer instance
with GestureRecognizer.create_from_options(options) as recognizer:
    print('Gesture recognizer created')
    while True:
        success, img = cap.read()
        if not success:
            print("Ignoring empty camera frame.")
            continue

        # Convert BGR image to RGB for MediaPipe processing
        rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Detect hand landmarks using MediaPipe Hands
        with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.5) as hands:
            results = hands.process(rgb_img)
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    # Draw hand landmarks on the image with specified color and thickness
                    mp_drawing.draw_landmarks(
                        img, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                        mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=2),
                        mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=2, circle_radius=2)
                    )

        # Prepare image for gesture recognition
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_img)
        current_time_ms = int(time.time() * 1000)

        # Perform gesture recognition on the processed image
        detected_gestures = recognizer.recognize_async(mp_image, current_time_ms)

        img = cv2.flip(img, 1)  # Flips the image horizontally
        cv2.imshow("Imshow", img)
        if cv2.waitKey(10) == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()
In the GestureRecognizerResult object there is a list handedness that contains a category_name, which is either 'Left' or 'Right'.
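For example, for the first detected hand those fields can be read like this:

hand = result.handedness[0][0].category_name    # 'Left' or 'Right'
gesture = result.gestures[0][0].category_name   # e.g. 'Open_Palm'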
The problem is that the gesture recognizer only returns output for one hand, either left or right, depending on which hand was detected first; the other hand is ignored. In MediaPipe's own try-it-out example, both hands shown to the camera with different gestures are recognized independently. Link to mediapipe demo
Sample output:
GestureRecognizerResult(gestures=[[Category(index=-1, score=0.7995390892028809,
display_name='', category_name='Open_Palm')]], handedness=[[Category(index=0, score=0.9178019165992737, display_name='Right', category_name='Right')]],
hand_landmarks=[[NormalizedLandmark(x=0.23192565143108368, y=0.8508237600326538, z=3.7175095712882467e-07, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.2964465022087097, y=0.807819128036499, z=-0.02174699306488037, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.3386477530002594, y=0.7381684184074402, z=-0.026875635609030724, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.3652242422103882, y=0.6717657446861267, z=-0.03148443624377251, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.39171433448791504, y=0.627888560295105, z=-0.03597773239016533, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.30005523562431335, y=0.6441321969032288, z=-0.002747688442468643, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.3194928765296936, y=0.5634738802909851, z=-0.015889683738350868, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.3276906907558441, y=0.5102080702781677, z=-0.0299211535602808, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.33434727787971497, y=0.46343517303466797, z=-0.04088740795850754, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.2615800201892853, y=0.6335919499397278, z=-0.002842121757566929, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.26276978850364685, y=0.5426733493804932, z=-0.014345655217766762, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.2621628940105438, y=0.48378312587738037, z=-0.028536789119243622, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.26235222816467285, y=0.43310630321502686, z=-0.03940063342452049, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.22592493891716003, y=0.6417601108551025, z=-0.006861940026283264, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.2230750024318695, y=0.5614591240882874, z=-0.01952073909342289, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.22449643909931183, y=0.5094373822212219, z=-0.029860520735383034, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.229284405708313, y=0.46403464674949646, z=-0.03746004030108452, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.19173786044120789, y=0.663299024105072, z=-0.0136506836861372, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.18222525715827942, y=0.604834794998169, z=-0.025881653651595116, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.18415895104408264, y=0.5673394799232483, z=-0.03144041821360588, visibility=0.0, presence=0.0),
NormalizedLandmark(x=0.19118154048919678, y=0.5324922800064087, z=-0.034897807985544205, visibility=0.0, presence=0.0)]],
hand_world_landmarks=[[Landmark(x=-0.012245522812008858, y=0.09203963726758957, z=-0.0038926522247493267, visibility=0.0, presence=0.0),
Landmark(x=0.021369636058807373, y=0.06962162256240845, z=-0.009559692814946175, visibility=0.0, presence=0.0),
Landmark(x=0.042654991149902344, y=0.04227661341428757, z=-0.012077674269676208, visibility=0.0, presence=0.0),
Landmark(x=0.0617685541510582, y=0.014768477529287338, z=-0.011491118930280209, visibility=0.0, presence=0.0),
Landmark(x=0.07398916780948639, y=-0.012367911636829376, z=-0.0075836945325136185, visibility=0.0, presence=0.0),
Landmark(x=0.025482138618826866, y=-0.0010876771993935108, z=0.006445789244025946, visibility=0.0, presence=0.0),
Landmark(x=0.03543740138411522, y=-0.02912675403058529, z=-0.00173004565294832, visibility=0.0, presence=0.0),
Landmark(x=0.040552493184804916, y=-0.0489623099565506, z=-0.007902431301772594, visibility=0.0, presence=0.0),
Landmark(x=0.04358145594596863, y=-0.06487865746021271, z=-0.0319957509636879, visibility=0.0, presence=0.0),
Landmark(x=0.0016808465588837862, y=-0.004498452879488468, z=0.006683729123324156, visibility=0.0, presence=0.0),
Landmark(x=0.004972374066710472, y=-0.04138147830963135, z=-0.003927251789718866, visibility=0.0, presence=0.0),
Landmark(x=0.00558849610388279, y=-0.06327502429485321, z=-0.020593348890542984, visibility=0.0, presence=0.0),
Landmark(x=0.0066368915140628815, y=-0.08291880786418915, z=-0.039193443953990936, visibility=0.0, presence=0.0),
Landmark(x=-0.018360454589128494, y=-0.0009643810335546732, z=-0.0038148483727127314, visibility=0.0, presence=0.0),
Landmark(x=-0.015782665461301804, y=-0.03162727132439613, z=-0.013909644447267056, visibility=0.0, presence=0.0),
Landmark(x=-0.013191262260079384, y=-0.05145301669836044, z=-0.028273196890950203, visibility=0.0, presence=0.0),
Landmark(x=-0.009723789989948273, y=-0.0685187503695488, z=-0.04024944826960564, visibility=0.0, presence=0.0),
Landmark(x=-0.035820234566926956, y=0.011946788057684898, z=-0.0120608601719141, visibility=0.0, presence=0.0),
Landmark(x=-0.03725161403417587, y=-0.009996423497796059, z=-0.017715157940983772, visibility=0.0, presence=0.0),
Landmark(x=-0.036166295409202576, y=-0.028470497578382492, z=-0.026987750083208084, visibility=0.0, presence=0.0),
Landmark(x=-0.030654065310955048, y=-0.03972318768501282, z=-0.03699912130832672, visibility=0.0, presence=0.0)]])
I want to achieve the same result as the demo: both hands recognized independently at once, each with its own gesture.
The recognizer only reports one hand because your GestureRecognizerOptions never sets num_hands, which defaults to 1; the max_num_hands=2 you pass to mp_hands.Hands only affects the landmark detection and drawing, not the gesture recognizer.
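Applied to the options in your code (model path shortened here), the minimal change looks like this:

options = GestureRecognizerOptions(
    base_options=BaseOptions(model_asset_path='gesture_recognizer.task'),  # shortened path
    running_mode=VisionRunningMode.LIVE_STREAM,
    num_hands=2,  # without this, only one hand's gesture is ever reported
    result_callback=print_result
)

Below is the full code from my other answer that recognizes gestures of both hands (mediapipe==0.10.0):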
import cv2
import mediapipe as mp
from mediapipe.tasks import python
import threading

class GestureRecognizer:
    def main(self):
        num_hands = 2
        model_path = "gesture_recognizer.task"
        GestureRecognizer = mp.tasks.vision.GestureRecognizer
        GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions
        VisionRunningMode = mp.tasks.vision.RunningMode

        self.lock = threading.Lock()
        self.current_gestures = []
        options = GestureRecognizerOptions(
            base_options=python.BaseOptions(model_asset_path=model_path),
            running_mode=VisionRunningMode.LIVE_STREAM,
            num_hands=num_hands,
            result_callback=self.__result_callback)
        recognizer = GestureRecognizer.create_from_options(options)

        timestamp = 0
        mp_drawing = mp.solutions.drawing_utils
        mp_hands = mp.solutions.hands
        hands = mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=num_hands,
            min_detection_confidence=0.65,
            min_tracking_confidence=0.65)

        cap = cv2.VideoCapture(0)
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = hands.process(frame)
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            np_array = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=np_array)
                recognizer.recognize_async(mp_image, timestamp)
                timestamp = timestamp + 1  # should be monotonically increasing, because in LIVE_STREAM mode

            self.put_gestures(frame)
            cv2.imshow('MediaPipe Hands', frame)
            if cv2.waitKey(1) & 0xFF == 27:
                break

        cap.release()

    def put_gestures(self, frame):
        self.lock.acquire()
        gestures = self.current_gestures
        self.lock.release()
        y_pos = 50
        for hand_gesture_name in gestures:
            # show the prediction on the frame
            cv2.putText(frame, hand_gesture_name, (10, y_pos), cv2.FONT_HERSHEY_SIMPLEX,
                        1, (0, 0, 255), 2, cv2.LINE_AA)
            y_pos += 50

    def __result_callback(self, result, output_image, timestamp_ms):
        # print(f'gesture recognition result: {result}')
        self.lock.acquire()  # solves potential concurrency issues
        self.current_gestures = []
        if result is not None and any(result.gestures):
            print("Recognized gestures:")
            for single_hand_gesture_data in result.gestures:
                gesture_name = single_hand_gesture_data[0].category_name
                print(gesture_name)
                self.current_gestures.append(gesture_name)
        self.lock.release()

if __name__ == "__main__":
    rec = GestureRecognizer()
    rec.main()
Demo:
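If you also need to know which hand each gesture came from, the callback can pair result.gestures with result.handedness, which are parallel lists with one entry per detected hand (as in the output shown in the question). A minimal sketch of an alternative __result_callback:

    def __result_callback(self, result, output_image, timestamp_ms):
        with self.lock:  # same concurrency guard as above
            self.current_gestures = []
            if result is not None and any(result.gestures):
                for gesture_list, handedness_list in zip(result.gestures, result.handedness):
                    hand = handedness_list[0].category_name    # 'Left' or 'Right'
                    gesture = gesture_list[0].category_name    # e.g. 'Open_Palm'
                    self.current_gestures.append(f"{hand}: {gesture}")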