Hand landmarks and gesture recognition in Julia
### A Pluto.jl notebook ###
# v0.17.3

using Markdown
using InteractiveUtils

# This Pluto notebook uses @bind for interactivity. When running this notebook outside of Pluto, the following 'mock version' of @bind gives bound variables a default value (instead of an error).
macro bind(def, element)
    quote
        local iv = try Base.loaded_modules[Base.PkgId(Base.UUID("6e696c72-6542-2067-7265-42206c756150"), "AbstractPlutoDingetjes")].Bonds.initial_value catch; b -> missing; end
        local el = $(esc(element))
        global $(esc(def)) = Core.applicable(Base.get, el) ? Base.get(el) : iv(el)
        el
    end
end

# ╔═╡ 4934e4de-b03d-419f-a076-9a8116f5ddf5
begin
    using Pkg
    Pkg.activate(".")
end

# ╔═╡ 14519106-d4cf-4a77-acca-a22b7c426334
using Cairo, Images, ImageDraw, Luxor, LinearAlgebra, LazySets, StaticArrays

# ╔═╡ 0da24d63-180f-4913-a8d6-3ba54a28ef04
md"""
### Basic Hand Gesture Recognition using convexity defects
We want to detect hand gestures using classical image-processing methods at this point.
###### Steps
- We receive a real-time image from JavaScript
- Define a region of interest (ROI) in the image and extract that part of the image
- Convert the ROI image to HSV space, then threshold it to the skin range to get a binary mask (a minimal sketch of this step is shown in the cell just below this one)
- We use a median filter (`mapwindow`) and dilation on the mask to reduce noise (quite robust, actually)
- Find contours from the mask (Suzuki-Abe algorithm)
- Run `convexhull` on the binary mask to find the convex hull (from ImageMorphology.jl)
- Now that we have the contour points and the convex hull of the mask, we find the convexity defects
###### Convexity defects weren't available in Julia, so I wrote my own
An explanation and some interesting ideas on convexity defects will follow later.
- After we find the number and locations of the convexity defects, we draw the points and num+1 as a number on the image using Luxor.jl
- This happens every 100ms
"""

# ╔═╡ 841cd0d1-c5d4-41fe-949f-b2ddc9144634
md"""
![gesture](https://user-images.githubusercontent.com/54855463/148025938-69f9a58f-8009-4722-a4a7-35e3b31e1b0c.gif)
"""

# ╔═╡ 43f08085-b9b3-4e9b-b2ff-a0907b48a897
begin
    ### Important
    ### contour related utils

    # rotate direction clockwise
    function clockwise(dir)
        return dir % 8 + 1
    end

    # rotate direction counterclockwise
    function counterclockwise(dir)
        return (dir + 6) % 8 + 1
    end

    # move from the current pixel to the next one in the given direction
    function move(pixel, image, dir, dir_delta)
        newp = pixel + dir_delta[dir]
        height, width = size(image)
        if (0 < newp[1] <= height) && (0 < newp[2] <= width)
            if image[newp] != 0
                return newp
            end
        end
        return CartesianIndex(0, 0)
    end

    # finds the direction between two given pixels
    function from_to(from, to, dir_delta)
        delta = to - from
        return findall(x -> x == delta, dir_delta)[1]
    end

    function detect_move(image, p0, p2, nbd, border, done, dir_delta)
        dir = from_to(p0, p2, dir_delta)
        moved = clockwise(dir)
        p1 = CartesianIndex(0, 0)
        while moved != dir ## 3.1
            newp = move(p0, image, moved, dir_delta)
            if newp[1] != 0
                p1 = newp
                break
            end
            moved = clockwise(moved)
        end
        if p1 == CartesianIndex(0, 0)
            return
        end
        p2 = p1 ## 3.2
        p3 = p0 ## 3.2
        done .= false
        while true
            dir = from_to(p3, p2, dir_delta)
            moved = counterclockwise(dir)
            p4 = CartesianIndex(0, 0)
            done .= false
            while true ## 3.3
                p4 = move(p3, image, moved, dir_delta)
                if p4[1] != 0
                    break
                end
                done[moved] = true
                moved = counterclockwise(moved)
            end
            push!(border, p3) ## 3.4
            if p3[2] == size(image, 2) || done[3] # pixel to the east is out of bounds or was examined as 0
                image[p3] = -nbd
            elseif image[p3] == 1
                image[p3] = nbd
            end
            if (p4 == p0 && p3 == p1) ## 3.5
                break
            end
            p2 = p3
            p3 = p4
        end
    end

    function find_contours(image)
        nbd = 1
        lnbd = 1
        image = Float64.(image)
        contour_list = Vector{typeof(CartesianIndex[])}()
        done = [false, false, false, false, false, false, false, false]

        # Clockwise Moore neighborhood.
        dir_delta = [CartesianIndex(-1, 0), CartesianIndex(-1, 1), CartesianIndex(0, 1), CartesianIndex(1, 1), CartesianIndex(1, 0), CartesianIndex(1, -1), CartesianIndex(0, -1), CartesianIndex(-1, -1)]

        height, width = size(image)

        for i = 1:height
            lnbd = 1
            for j = 1:width
                fji = image[i, j]
                is_outer = (image[i, j] == 1 && (j == 1 || image[i, j-1] == 0)) ## 1 (a)
                is_hole = (image[i, j] >= 1 && (j == width || image[i, j+1] == 0))

                if is_outer || is_hole
                    # 2
                    border = CartesianIndex[]
                    from = CartesianIndex(i, j)

                    if is_outer
                        nbd += 1
                        from -= CartesianIndex(0, 1)
                    else
                        nbd += 1
                        if fji > 1
                            lnbd = fji
                        end
                        from += CartesianIndex(0, 1)
                    end

                    p0 = CartesianIndex(i, j)
                    detect_move(image, p0, from, nbd, border, done, dir_delta) ## 3
                    if isempty(border) ##TODO
                        push!(border, p0)
                        image[p0] = -nbd
                    end
                    push!(contour_list, border)
                end
                if fji != 0 && fji != 1
                    lnbd = abs(fji)
                end
            end
        end

        return contour_list
    end

    # a contour is a vector of CartesianIndex positions along a border
    function draw_contour(image, color, contour)
        for ind in contour
            image[ind] = color
        end
    end

    function draw_contours(image, color, contours)
        for cnt in contours
            draw_contour(image, color, cnt)
        end
    end
end
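
# Hedged usage sketch for the tracer above (bw is assumed to be a binary mask):
#     contours = find_contours(bw)
#     canvas = RGB{N0f8}.(ones(size(bw)))
#     draw_contours(canvas, RGB(0, 0, 0), contours)
# Each returned contour is a Vector{CartesianIndex}, ordered along the traced border.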

# ╔═╡ f5642319-05ee-4731-ad26-80bcd4f6aa7b
begin
    ### Important
    ### Webcam related utils
    function camera_input(;max_size=200, default_url="https://i.imgur.com/SUmi94P.png")
        """
        <span class="pl-image waiting-for-permission">
        <style>
            .pl-image.popped-out {
                position: fixed;
                top: 0;
                right: 0;
                z-index: 5;
            }
            .pl-image #video-container {
                width: 250px;
            }
            .pl-image video {
                # border-radius: 1rem 1rem 0 0;
            }
            .pl-image.waiting-for-permission #video-container {
                display: none;
            }
            .pl-image #prompt {
                display: none;
            }
            .pl-image.waiting-for-permission #prompt {
                width: 250px;
                height: 200px;
                display: grid;
                place-items: center;
                font-family: monospace;
                font-weight: bold;
                text-decoration: underline;
                cursor: pointer;
                border: 5px dashed rgba(0,0,0,.5);
            }
            .pl-image video {
                display: block;
            }
            .pl-image .bar {
                width: inherit;
                display: flex;
                z-index: 6;
            }
            .pl-image .bar#top {
                position: absolute;
                flex-direction: column;
            }
            .pl-image .bar#bottom {
                background: black;
                # border-radius: 0 0 1rem 1rem;
            }
            .pl-image .bar button {
                flex: 0 0 auto;
                background: rgba(255,255,255,.8);
                border: none;
                width: 2rem;
                height: 2rem;
                border-radius: 100%;
                cursor: pointer;
                z-index: 7;
            }
            .pl-image .bar button#shutter {
                width: 3rem;
                height: 3rem;
                margin: -1.5rem auto .2rem auto;
            }
            .pl-image video.takepicture {
                animation: pictureflash 0ms linear;
            }
            @keyframes pictureflash {
                0% {
                    filter: grayscale(1.0) contrast(2.0);
                }
                100% {
                    filter: grayscale(0.0) contrast(1.0);
                }
            }
        </style>

        <div id="video-container">
            <div id="top" class="bar">
                <button id="stop" title="Stop video">✖</button>
                <button id="pop-out" title="Pop out/pop in">⏏</button>
            </div>
            <video playsinline autoplay></video>
            <div id="bottom" class="bar">
                <button id="shutter" title="Click to take a picture">📷</button>
            </div>
        </div>

        <div id="prompt">
            <span>
            Enable webcam
            </span>
        </div>

        <script>
            // based on https://github.com/fonsp/printi-static (by the same author)
            const span = currentScript.parentElement
            const video = span.querySelector("video")
            const popout = span.querySelector("button#pop-out")
            const stop = span.querySelector("button#stop")
            const shutter = span.querySelector("button#shutter")
            const prompt = span.querySelector(".pl-image #prompt")

            const maxsize = $(max_size)

            const send_source = (source, src_width, src_height) => {
                const scale = Math.min(1.0, maxsize / src_width, maxsize / src_height)
                const width = Math.floor(src_width * scale)
                const height = Math.floor(src_height * scale)
                const canvas = html`<canvas width=\${width} height=\${height}>`
                const ctx = canvas.getContext("2d")
                ctx.drawImage(source, 0, 0, width, height)
                span.value = {
                    width: width,
                    height: height,
                    data: ctx.getImageData(0, 0, width, height).data,
                }
                span.dispatchEvent(new CustomEvent("input"))
            }

            const clear_camera = () => {
                window.stream.getTracks().forEach(s => s.stop());
                video.srcObject = null;
                span.classList.add("waiting-for-permission");
            }

            prompt.onclick = () => {
                navigator.mediaDevices.getUserMedia({
                    audio: false,
                    video: {
                        facingMode: "environment",
                    },
                }).then(function(stream) {
                    stream.onend = console.log
                    window.stream = stream
                    video.srcObject = stream
                    window.cameraConnected = true
                    video.controls = false
                    video.play()
                    video.controls = false
                    span.classList.remove("waiting-for-permission");
                }).catch(function(error) {
                    console.log(error)
                });
            }

            stop.onclick = () => {
                clear_camera()
            }
            popout.onclick = () => {
                span.classList.toggle("popped-out")
            }

            var intervalId = window.setInterval(function() {
                const cl = video.classList
                cl.remove("takepicture")
                void video.offsetHeight
                cl.add("takepicture")
                video.play()
                video.controls = false
                send_source(video, video.videoWidth, video.videoHeight)
            }, 150);

            shutter.onclick = () => {
                const cl = video.classList
                cl.remove("takepicture")
                void video.offsetHeight
                cl.add("takepicture")
                video.play()
                video.controls = false
                send_source(video, video.videoWidth, video.videoHeight)
            }

            document.addEventListener("visibilitychange", () => {
                if (document.visibilityState != "visible") {
                    clear_camera()
                }
            })

            // Set a default image
            const img = html`<img crossOrigin="anonymous">`
            img.onload = () => {
                console.log("helloo")
                send_source(img, img.width, img.height)
            }
            img.src = "$(default_url)"
            console.log(img)
        </script>
        </span>
        """ |> HTML
    end

    function process_raw_camera_data(raw_camera_data)
        # the raw image data is a long byte array, we need to transform it into something
        # more "Julian" - something with more _structure_.
        # The encoding of the raw byte stream is:
        # every 4 bytes is a single pixel
        # every pixel has 4 values: Red, Green, Blue, Alpha
        # (we ignore alpha for this notebook)
        # So to get the red values for each pixel, we take every 4th value, starting at
        # the 1st:
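        # For example (illustrative layout, not actual data):
        #     data = [R1, G1, B1, A1, R2, G2, B2, A2, ...]
        #     reds:   data[1:4:end]
        #     greens: data[2:4:end]
        #     blues:  data[3:4:end]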
        reds_flat = UInt8.(raw_camera_data["data"][1:4:end])
        greens_flat = UInt8.(raw_camera_data["data"][2:4:end])
        blues_flat = UInt8.(raw_camera_data["data"][3:4:end])

        # but these are still 1-dimensional arrays, nicknamed 'flat' arrays
        # We will 'reshape' this into 2D arrays:
        width = raw_camera_data["width"]
        height = raw_camera_data["height"]

        # transpose to get the image into the right shape
        reds = reshape(reds_flat, (width, height))' / 255.0
        greens = reshape(greens_flat, (width, height))' / 255.0
        blues = reshape(blues_flat, (width, height))' / 255.0

        # we have our 2D array for each color
        # Let's create a single 2D array, where each value contains the R, G and B value of
        # that pixel
        RGB.(reds, greens, blues)
    end
end

# ╔═╡ 1a0324de-ee19-11ea-1d4d-db37f4136ad3
@bind raw_camera_data camera_input(;max_size=100)

# ╔═╡ 6f80e4ff-99bc-4c77-aebe-5e7f21f0d328
begin
    # paint a small 3x3 dot at each of the given locations
    function drawdots!(img, res, color)
        for i in res
            img[i[1]-1:i[1]+1, i[2]-1:i[2]+1] .= color
        end
    end

    # Euclidean distance between two points
    function dist2p(p1, p2)
        sqrt((p1[1]-p2[1])^2 + (p1[2]-p2[2])^2)
    end

    # angle at `center` subtended by a1 and a2, via the law of cosines
    function findmyangle(a1, a2; center)
        acos((dist2p(a1, center)^2 + dist2p(a2, center)^2 - dist2p(a1, a2)^2) / (2 * dist2p(center, a1) * dist2p(center, a2)))
    end
end
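
# ╔═╡ 9b1c5c2e-6a10-4e7a-9c86-d4c02f0f3a11
md"""
A minimal sketch of the thresholding-and-cleanup step described above. `skin_mask` is a hypothetical helper (the notebook itself does this inline in `objecttracker`), and the default ranges mirror `objecttracker`'s defaults but are illustrative rather than tuned:

```julia
using Images, Statistics  # dilate and mapwindow are re-exported by Images

# Threshold an RGB frame to a rough skin mask, then clean it up.
# Hue is in degrees (0..360); the s/v bounds are given on a 0..255 scale.
function skin_mask(img; h = 50, s = 255, v = 255, lh = 0, ls = 20, lv = 70)
    hsv = HSV.(img)
    mask = map(hsv) do px
        (lh <= px.h <= h) && (ls/255 <= px.s <= s/255) && (lv/255 <= px.v <= v/255)
    end
    # median filter plus dilation to suppress speckle noise in the mask
    mapwindow(median, dilate(Float64.(mask)), (3, 3))
end
```
"""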

# ╔═╡ ca92aa75-50c5-4720-a0d5-6993c21ea0b1
"""
    findconvexitydefects(contour, convhull; dist = 1.1, absdiff = 10, mindist = 0, currsize = 50, d1 = 2, d2 = 2, anglemax = π/2)

Return the convexity defects found from the contour points and the convex hull.

###### Arguments
- contour -> contour points traced with the Suzuki-Abe algorithm
- convhull -> convex hull boundary found with ImageMorphology.jl
- dist and absdiff -> help with edge cases when synchronizing contour points with convex hull points
- mindist -> controls the minimum distance of a defect from a convex hull line
- currsize -> skips small regions of contours
- d1 and d2 -> control the minimum distance of a defect from each of the two convex hull points forming the line
- anglemax -> caps the angle subtended at the defect by the endpoints of the convex hull line

Idea | For one region
:-------------------------:|:-------------------------:
![](https://user-images.githubusercontent.com/54855463/148383385-5fdf8425-e4a7-4e8c-bc26-19bb98f9f251.png) | ![](https://user-images.githubusercontent.com/54855463/148340755-5e089460-ea11-40ef-a485-94d65a28b99e.png)
"""
function findconvexitydefects(
    contour,
    convhull;
    dist = 1.1,
    absdiff = 10,
    mindist = 0,
    currsize = 50,
    d1 = 2,
    d2 = 2,
    anglemax = π/2
)
    # first we need to match our contour points with our convexhull regions
    numindices = []
    previous = 0
    for i in convhull
        for (num, j) in enumerate(contour)
            if norm(Tuple(i) .- Tuple(j)) < dist && abs(previous-num) > absdiff # to avoid small and very close regions
                push!(numindices, num)
                previous = num
                break
            end
        end
    end

    # we want the numindices same as our convhull points,
    # to define regions of interest for each convhull line
    defects = Vector{CartesianIndex{2}}([]) # indexes with defects

    # in case numindices < convhull indexes,
    # meaning we don't have regions for all lines
    if size(numindices)[1] < size(convhull)[1]
        throw(error("Raise the range dist, numindices points less than convexhull points, $(size(numindices)[1]) $(size(convhull)[1]) "))
    end

    # iterate over each consecutive pair of convhull points to form a line
    for i in 1:size(convhull)[1]-1
        # to handle the wraparound case where numindices are like 1256, then 1
        if numindices[i] > numindices[i+1]
            curr = vcat(contour[numindices[i]:end], contour[1:numindices[i+1]])
        else
            # general case to define contour points for each convhull region
            curr = contour[numindices[i]:numindices[i+1]]
        end

        # to remove minor regions of contours, we can set currsize
        if size(curr)[1] < currsize
            continue
        end

        # Defining the line
        p1 = Float64.(Tuple(convhull[i]))   # point 1
        p2 = Float64.(Tuple(convhull[i+1])) # point 2
        line = Line(;from=[p1[1], p1[2]], to=[p2[1], p2[2]])

        maxdef = 0                    # max distance from our convhull line
        defloc = CartesianIndex(0, 0) # location of that point

        # check each contour point in a convhull region
        # for its distance to the line, and track the max-distance point
        for j in curr
            p = SA[j[1], j[2]]
            lpdist = LazySets.distance(p, line) # find distance
            # update if we find a new max
            if lpdist > maxdef
                maxdef = lpdist
                defloc = j
            end
        end

        def1 = norm(Tuple(defloc) .- Tuple(p1))
        def2 = norm(Tuple(defloc) .- Tuple(p2))
        angle = findmyangle(p1, p2; center=defloc)

        # keep the point only if it is far enough from the line and from both
        # endpoints, and the subtended angle is sharp enough
        if maxdef > mindist && def1 > d1 && def2 > d2 && angle < anglemax
            push!(defects, defloc)
        end
    end
    defects
end
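
# Hedged usage sketch (argument values illustrative; mirrors the call inside objecttracker below):
#     contours = find_contours(mask)
#     hull = convexhull(mask .> 0.5); push!(hull, hull[1])   # close the hull
#     defects = findconvexitydefects(contours[1], hull; dist = 3, absdiff = 2, currsize = 30, mindist = 6)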

# ╔═╡ dfb7c6be-ee0d-11ea-194e-9758857f7b20
begin
    ### Important
    ### Object tracker is here
    function objecttracker(
        img,
        h = 50,
        s = 255,
        v = 255,
        lh = 0,
        ls = 20,
        lv = 70,
        boundingboxvar = 10
    )
        hsv_img = HSV.(img)
        channels = channelview(float.(hsv_img))
        hue_img = channels[1, :, :]
        val_img = channels[3, :, :]
        satur_img = channels[2, :, :]
        mask = zeros(size(hue_img))
        h1, s1, v1 = lh, ls, lv

        # keep pixels whose hue/saturation/value fall inside the skin range
        for ind in eachindex(hue_img)
            if hue_img[ind] <= h && satur_img[ind] <= s / 255 && val_img[ind] <= v / 255
                if hue_img[ind] >= h1 && satur_img[ind] >= s1 / 255 && val_img[ind] >= v1 / 255
                    mask[ind] = 1
                end
            end
        end

        # denoise the mask, then trace its contours
        img = mapwindow(ImageFiltering.median, dilate(mask), (3, 3))
        contours = find_contours(img)
        try
            convhull = convexhull(img .> 0.5)
            push!(convhull, convhull[1])
            res = findconvexitydefects(contours[1], convhull; dist=3, absdiff = 2, currsize= 30, mindist =6)
            img_convex1 = RGB{N0f8}.(ones(size(img)))
            drawdots!(img_convex1, res, RGB(0, 0, 1))
            draw!(img_convex1, ImageDraw.Path(convhull), RGB(0))
            draw_contours(img_convex1, RGB(0), contours)
            return img_convex1, size(res)[1]
        catch e
            # fall back to contours only; -1 signals that no defects were found
            img_convex1 = RGB{N0f8}.(ones(size(img)))
            draw_contours(img_convex1, RGB(0), contours)
            return img_convex1, -1
            # return Gray.(img), e
            # return Gray.(img) , 0
        end
    end;
end

# ╔═╡ 594acafd-01d4-4eee-b9e6-5b886953b5b1
begin
    image = process_raw_camera_data(raw_camera_data)
    img, num = objecttracker(image[:, 1:70])
    z = convert(Array{RGB24}, img')
    img = CairoImageSurface(z)

    Drawing(img.width, img.height, :png)
    placeimage(img, 0, 0)
    sethue("red")
    fontsize(10)
    # num defects between fingers implies num+1 extended fingers
    if num != -1
        Luxor.text("$(num+1)", Luxor.Point(10, 10), halign=:center)
    end
    image_as_matrix()
end

# ╔═╡ 0814234d-459a-404b-9253-7f7665ea6a38
# begin
#     Pkg.add(PackageSpec(url="https://github.com/Pocket-titan/DarkMode"))
#     import DarkMode
#     DarkMode.enable()
# end

# ╔═╡ f2236406-af64-403d-84bd-e3afe395b791
html"""<style>
main {
    max-width: 900px;
}
</style>
"""

# ╔═╡ Cell order:
# ╟─0da24d63-180f-4913-a8d6-3ba54a28ef04
# ╟─9b1c5c2e-6a10-4e7a-9c86-d4c02f0f3a11
# ╟─ca92aa75-50c5-4720-a0d5-6993c21ea0b1
# ╠═14519106-d4cf-4a77-acca-a22b7c426334
# ╟─dfb7c6be-ee0d-11ea-194e-9758857f7b20
# ╟─1a0324de-ee19-11ea-1d4d-db37f4136ad3
# ╠═594acafd-01d4-4eee-b9e6-5b886953b5b1
# ╟─841cd0d1-c5d4-41fe-949f-b2ddc9144634
# ╟─43f08085-b9b3-4e9b-b2ff-a0907b48a897
# ╟─f5642319-05ee-4731-ad26-80bcd4f6aa7b
# ╟─6f80e4ff-99bc-4c77-aebe-5e7f21f0d328
# ╟─0814234d-459a-404b-9253-7f7665ea6a38
# ╟─f2236406-af64-403d-84bd-e3afe395b791
# ╟─4934e4de-b03d-419f-a076-9a8116f5ddf5 |
# Hand Landmark detection using OpenCV.jl and mediapipe
# add OpenCV.jl and PyCall.jl using the package manager
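# A hedged setup sketch (package and pip names assumed from the imports below):
#     using Pkg; Pkg.add(["OpenCV", "PyCall"])
#     then, in the Python environment PyCall is built against:
#     pip install mediapipe numpy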
using OpenCV # Julia's OpenCV binding
using PyCall # used to call Python from inside Julia

cap = OpenCV.VideoCapture(Int32(0)) # open the webcam capture stream
OpenCV.namedWindow("Hand Landmarks detection") # create a window for the output

# Python code to call mediapipe
py"""
# install mediapipe and numpy in your Python environment
import mediapipe # does the processing of the image for the hand landmark detection
import numpy as np

drawingModule = mediapipe.solutions.drawing_utils # used to draw the hand landmarks
handsModule = mediapipe.solutions.hands # used to detect the hands

def process_image(img):
    # deep copy of the image
    vis = np.array([x for x in img])
    # get the hand keypoints
    with handsModule.Hands() as hands:
        # process the image
        results = hands.process(vis)
        # if results contain landmarks for one or more hands
        if results.multi_hand_landmarks != None:
            # for each hand, draw the landmarks
            for handLandmarks in results.multi_hand_landmarks:
                # draw on the image
                drawingModule.draw_landmarks(vis, handLandmarks, handsModule.HAND_CONNECTIONS)
    # return the annotated image
    return vis
"""

imgfinal = OpenCV.Mat(zeros(UInt8, (3, 640, 480))) # matrix to store the final image
process_image = py"process_image" # bind the Python function to a Julia name

while true
    ret, img = OpenCV.read(cap) # read a frame from the webcam
    # img = reverse(img, dims = 2) # flips the image horizontally, but slows things down

    # handle the case where the webcam stopped
    if ret == false
        print("Webcam stopped")
        break
    end

    # rearrange the image dimensions for Python: (3, x, y) -> (x, y, 3)
    input = cat(img[3, :, :], img[2, :, :], img[1, :, :]; dims = 3)
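    # note: indexing channels 3,2,1 also reverses the channel order, since
    # OpenCV frames are BGR while mediapipe expects RGB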

    # call the Python function
    output = process_image(input)

    # rearrange the output dimensions back (and RGB back to BGR)
    imgfinal[1, :, :] = output[:, :, 3]
    imgfinal[2, :, :] = output[:, :, 2]
    imgfinal[3, :, :] = output[:, :, 1]

    # display the image
    OpenCV.imshow("Hand Landmarks detection", imgfinal)
    if OpenCV.waitKey(Int32(5)) == 27 # exit on Esc
        break
    end
end

# release the webcam stream
OpenCV.release(cap)
# close all the windows
OpenCV.destroyAllWindows()