Skip to content

Instantly share code, notes, and snippets.

@eldog
Last active April 1, 2024 11:42
Show Gist options
  • Save eldog/9012ce957be26934044131daffc25c73 to your computer and use it in GitHub Desktop.
Save eldog/9012ce957be26934044131daffc25c73 to your computer and use it in GitHub Desktop.
Google Mediapipe Hand Tracking to World Space using OpenCV solvePnP, rendered in OpenGL using PyGame
# MIT License
#
# Copyright (c) 2023 Foxdog Studios
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# Projects hand points into 3D space. See https://github.com/google/mediapipe/issues/2199 for full discussion
#
# Grab the ball between your index finger and thumb.
#
# Requires a webcam.
#
# Requirements as suggested by @Legel:
#
# pip install opencv-python
# pip install pygame
# pip install mediapipe (or, for Macs with Apple Silicon such as the M1, pip install mediapipe-silicon)
# pip install PyOpenGL
#
import cv2
import pygame
import math
import mediapipe as mp
import numpy as np
from PIL import Image
from pygame.locals import *
from OpenGL.GL import *
from OpenGL.GLU import *
from OpenGL.GLUT import *
# Shorthand handles into MediaPipe's drawing utilities and hand-tracking
# solution. The `type:ignore` comments are needed because mediapipe's
# dynamically-populated `solutions` attribute is untyped.
mp_drawing = mp.solutions.drawing_utils # type:ignore
mp_drawing_styles = mp.solutions.drawing_styles #type:ignore
mp_hands = mp.solutions.hands # type:ignore
# Skeleton of the 21 MediaPipe hand landmarks as (start, end) landmark-index
# pairs, matching mp_hands.HAND_CONNECTIONS: the thumb, index, middle, ring
# and pinky chains plus the palm edges.
# FIX: removed the redundant reversed duplicate (1, 0) — the wrist-to-thumb
# segment is already covered by (0, 1), so it was drawn twice per frame.
hand_edges = (
    (0, 1),
    (1, 2),
    (2, 3),
    (3, 4),
    (0, 5),
    (5, 6),
    (6, 7),
    (7, 8),
    (5, 9),
    (9, 10),
    (10, 11),
    (11, 12),
    (9, 13),
    (13, 14),
    (14, 15),
    (15, 16),
    (13, 17),
    (17, 18),
    (18, 19),
    (19, 20),
    (0, 17),
)
def draw_hand(world_points):
    """Render one tracked hand in the current GL context.

    Draws the hand skeleton defined by ``hand_edges`` as thick lines, then a
    small green sphere at each of the 21 landmarks. The x coordinate is
    negated on every point to mirror the (already horizontally flipped)
    camera image.
    """
    glLineWidth(5)
    glLoadIdentity()
    glBegin(GL_LINES)
    for start, end in hand_edges:
        for landmark_index in (start, end):
            point = world_points[landmark_index]
            glVertex3fv((-point[0], point[1], point[2]))
    glEnd()
    for landmark in world_points:
        glPushAttrib(GL_LIGHTING_BIT)
        glMaterialfv(GL_FRONT, GL_DIFFUSE, [0, 1, 0, 0.5])
        glLoadIdentity()
        glTranslatef(-landmark[0], landmark[1], landmark[2])
        glutSolidSphere(0.01 / 2, 16, 16)
        glPopAttrib()
class ImageLoader:
    """Uploads an OpenCV frame into an OpenGL texture and draws it as a
    screen-aligned textured quad anchored at a fixed window position."""

    def __init__(self, x: float, y: float):
        # Top-left corner of the quad, in window coordinates.
        self.x = x
        self.y = y
        self.width = 0
        self.height = 0
        self.img_data = 0
        # Requires a current GL context.
        self.Texture = glGenTextures(1)

    def load(self, image: cv2.Mat):
        """Convert *image* to raw RGBA bytes and upload it to the texture."""
        # NOTE(review): the frame is flipped vertically here AND serialised
        # bottom-up (stride -1 in tobytes below) — presumably the two are
        # meant to cancel under the top-left gluOrtho2D projection; confirm.
        flipped = cv2.flip(image, 0)
        pil_image = Image.fromarray(flipped)
        self.width, self.height = pil_image.size
        # 'BGRX' pads every pixel to four bytes so it can be uploaded as RGBA.
        self.img_data = pil_image.tobytes('raw', 'BGRX', 0, -1)
        glBindTexture(GL_TEXTURE_2D, self.Texture)
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT)
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT)
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR)
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR)
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, self.width, self.height,
                     0, GL_RGBA, GL_UNSIGNED_BYTE, self.img_data)

    def draw(self):
        """Draw the most recently loaded frame as a full-size quad."""
        glMatrixMode(GL_MODELVIEW)
        glLoadIdentity()
        glTranslate(self.x, self.y, 0)
        glEnable(GL_TEXTURE_2D)
        glBegin(GL_QUADS)
        # (texture coord, vertex) pairs for the quad's four corners.
        corners = (
            ((0, 0), (0, 0)),
            ((1, 0), (self.width, 0)),
            ((1, 1), (self.width, self.height)),
            ((0, 1), (0, self.height)),
        )
        for (u, v), (vx, vy) in corners:
            glTexCoord2f(u, v)
            glVertex2f(vx, vy)
        glEnd()
        glDisable(GL_TEXTURE_2D)
# Starting position of the grabbable ball in world space
# (x, y, z; negative z is away from the camera — units match the
# solvePnP-recovered world coordinates, presumably metres; confirm).
ball_pos_start = [0, 0, -0.4]
# Current (mutable) ball position; reset to the start by pressing 'b'.
ball_pos = list(ball_pos_start)
# Whether the ball is currently pinched between thumb and index finger.
ball_grabbed = False
# Main loop: capture webcam frames, run MediaPipe hand tracking, recover the
# hands' world-space pose with solvePnP, and render the camera image, the
# hands and a grabbable red ball with OpenGL via pygame.
with mp_hands.Hands(
        model_complexity=0,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5) as hands:
    cap = cv2.VideoCapture(0)
    width, height = int(cap.get(3)), int(cap.get(4))

    pygame.init()
    display = (width, height)
    pygame.display.set_mode(display, DOUBLEBUF | OPENGL | RESIZABLE)
    glutInit()

    im_loader = ImageLoader(0, 0)
    draw_mediapipe = False

    while True:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                quit()
            if event.type == pygame.KEYDOWN:
                if event.key == pygame.K_m:
                    # 'm' toggles MediaPipe's 2D landmark overlay.
                    draw_mediapipe = not draw_mediapipe
                    print(f'toggling draw media pipe now: {draw_mediapipe}')
                if event.key == pygame.K_b:
                    # 'b' resets the ball to its start position.
                    ball_pos = list(ball_pos_start)

        # Orthographic pass for the camera-image background quad.
        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT)
        glMatrixMode(GL_PROJECTION)
        glLoadIdentity()
        gluOrtho2D(0, width, height, 0)
        glMatrixMode(GL_MODELVIEW)
        glLoadIdentity()
        glEnable(GL_DEPTH_TEST)
        glEnable(GL_TEXTURE_2D)
        glEnable(GL_LIGHTING)
        glLightfv(GL_LIGHT0, GL_DIFFUSE, [1, 1, 1, 1])
        glEnable(GL_LIGHT0)

        success: bool
        image: cv2.Mat
        success, image = cap.read()
        # FIX: check for a failed read *before* touching the frame —
        # cap.read() returns (False, None) on failure, and cv2.flip(None, 1)
        # would raise instead of hitting this `continue`.
        if not success:
            print("Ignoring empty camera frame.")
            # If loading a video, use 'break' instead of 'continue'.
            continue
        image = cv2.flip(image, 1)
        # To improve performance, mark the image as not writeable to
        # pass by reference. (FIX: the flag was previously set twice.)
        image.flags.writeable = False
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        results = hands.process(image)  # type:ignore
        # Draw the hand annotations on the image.
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        frame_height, frame_width, channels = image.shape

        # Approximate camera intrinsics. Fiddle with this factor to get the
        # rendered hands to align with the camera image — or, if you know
        # your camera's actual focal length (in pixels), use that instead.
        focal_length = frame_width * 0.75
        center = (frame_width / 2, frame_height / 2)
        camera_matrix = np.array(
            [[focal_length, 0, center[0]],
             [0, focal_length, center[1]],
             [0, 0, 1]], dtype="double"
        )
        distortion = np.zeros((4, 1))  # assume no lens distortion
        fov_x = np.rad2deg(2 * np.arctan2(focal_length, 2 * focal_length))

        world_points_total = []
        if results.multi_hand_landmarks:
            for i, hand_landmarks in enumerate(results.multi_hand_landmarks):
                world_landmarks = results.multi_hand_world_landmarks[i]
                if draw_mediapipe:
                    mp_drawing.draw_landmarks(
                        image,
                        hand_landmarks,
                        mp_hands.HAND_CONNECTIONS,
                        mp_drawing_styles.get_default_hand_landmarks_style(),
                        mp_drawing_styles.get_default_hand_connections_style())
                model_points = np.float32(
                    [[-l.x, -l.y, -l.z] for l in world_landmarks.landmark])
                image_points = np.float32(
                    [[l.x * frame_width, l.y * frame_height]
                     for l in hand_landmarks.landmark])
                success, rvecs, tvecs = cv2.solvePnP(
                    model_points,
                    image_points,
                    camera_matrix,
                    distortion,
                    flags=cv2.SOLVEPNP_SQPNP
                )
                # The transformation is a 4x4 homogeneous matrix containing
                # only the translation: rotation is already accounted for in
                # the model coordinates. See
                # https://codepen.io/mediapipe/pen/RwGWYJw — the model-space
                # hand rotates but does not translate.
                transformation = np.eye(4)
                transformation[0:3, 3] = tvecs.squeeze()
                # Lift the 21 model points to homogeneous coordinates and
                # apply the inverse transformation to get world points.
                model_points_hom = np.concatenate(
                    (model_points, np.ones((21, 1))), axis=1)
                world_points = model_points_hom.dot(
                    np.linalg.inv(transformation).T)
                world_points_total.append(world_points)

        # Draw the camera frame with depth writes disabled so the 3D
        # geometry is never occluded by the background quad.
        glDepthMask(GL_FALSE)
        im_loader.load(image)
        glColor3f(1, 1, 1)
        im_loader.draw()
        glDepthMask(GL_TRUE)

        # Switch to a perspective projection for the 3D content.
        glMatrixMode(GL_PROJECTION)
        glLoadIdentity()
        gluPerspective(fov_x, (display[0] / display[1]), 0.1, 50.0)
        glMatrixMode(GL_MODELVIEW)
        glLoadIdentity()

        # Draw the grabbable red ball.
        glTranslatef(*ball_pos)
        glPushAttrib(GL_LIGHTING_BIT)
        glMaterialfv(GL_FRONT, GL_DIFFUSE, [1, 0, 0, 0.5])
        glutSolidSphere(0.07 / 2, 16, 16)
        glPopAttrib()

        if len(world_points_total) > 0:
            glLoadIdentity()
            # FIX: renamed the misspelled `grab_distnace*` locals.
            grab_distance_closest = math.inf
            thumb_ball_distance_closest = math.inf
            thumb_closest = None
            for world_points in world_points_total:
                draw_hand(world_points)
                thumb = world_points[4]   # thumb tip landmark
                index = world_points[8]   # index-finger tip landmark
                grab_distance = math.hypot(
                    thumb[0] - index[0],
                    thumb[1] - index[1],
                    thumb[2] - index[2],
                )
                # draw_hand negates x when rendering, so mirror the thumb's
                # x before comparing against the ball position.
                thumb_ball_distance = math.hypot(
                    ball_pos[0] - -thumb[0],
                    ball_pos[1] - thumb[1],
                    ball_pos[2] - thumb[2],
                )
                if thumb_ball_distance < thumb_ball_distance_closest:
                    thumb_closest = thumb
                    thumb_ball_distance_closest = thumb_ball_distance
                    grab_distance_closest = grab_distance
            if thumb_closest is not None:
                # Grab when pinching near the ball; keep following the thumb
                # for as long as the pinch is held.
                if ((thumb_ball_distance_closest < 0.1 or ball_grabbed)
                        and grab_distance_closest < 0.08):
                    ball_grabbed = True
                    ball_pos[0] = -thumb_closest[0]
                    ball_pos[1] = thumb_closest[1]
                    ball_pos[2] = thumb_closest[2]
                else:
                    ball_grabbed = False

        pygame.display.flip()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment