|
import time |
|
import mss |
|
from trigger_key import ( |
|
PressKey, |
|
ReleaseKey, |
|
VK_UP_ARROW_KEY, |
|
VK_DOWN_ARROW_KEY, |
|
VK_LEFT_ARROW_KEY, |
|
VK_RIGHT_ARROW_KEY, |
|
) |
|
from PIL import Image, ImageStat |
|
import cv2 as cv |
|
import numpy as np |
|
|
|
# Flip to True to get OpenCV preview windows of what the detection sees.
# DEBUG = True
DEBUG = False

# Needle colour to track: True for the Kiryu singing parts (blue needle),
# False for the accompaniment songs (pink needle).
IS_NEEDLE_BLUE = True
# IS_NEEDLE_BLUE = False

# Screen region occupied by the game window. The code below relies on many
# hard-coded pixel values, so match these dimensions as closely as you can
# (adjusting "top" if necessary).
GAME_WINDOW_DIMENSIONS = dict(top=45, left=0, width=1280, height=720)

# How long a tapped key stays down, and the minimum gap between drum presses.
KEYPRESS_DURATION_SEC = 0.02
DRUM_DOWNTIME_SEC = 0.1

# Height in pixels of one screenshot row, derived from the window height.
ROW_HEIGHT = int(GAME_WINDOW_DIMENSIONS["height"] * 15 / 72)

# Minimum template-matching score for a location to count as a note.
TEMPLATE_MATCH_THRESHOLD = 0.80

# Upper and lower bounds (in pixels) of the needle-to-note distance at which
# a note gets played.
MAX_PIXELS_BEFORE_PLAYING_NOTE = 18
MIN_PIXELS_BEFORE_PLAYING_NOTE = 5

# Per-note data: "x" is the column in notes.png where the note's template
# starts, "key" is the virtual key code pressed to play it.
notes_info = {
    direction: {"x": sprite_x, "key": key_code}
    for direction, sprite_x, key_code in (
        ("down", 5, VK_DOWN_ARROW_KEY),
        ("right", 47, VK_RIGHT_ARROW_KEY),
        ("up", 88, VK_UP_ARROW_KEY),
        ("left", 130, VK_LEFT_ARROW_KEY),
    )
}
|
|
|
# Grayscale sprite sheet holding one picture of every note type.
note_templates = cv.imread("notes.png", cv.IMREAD_GRAYSCALE)
if note_templates is None:
    # cv.imread signals failure by returning None instead of raising, which
    # would otherwise surface below as an unhelpful "'NoneType' object is not
    # subscriptable" error.
    raise FileNotFoundError(
        "notes.png could not be read; run this script from the directory "
        "that contains it"
    )

TEMPLATE_WIDTH = 13
TEMPLATE_HEIGHT = 19
RIGHT_TEMPLATE_OFFSET = 12

# Cut two templates out of the sheet for every note: one starting at the
# note's "x" column and one starting RIGHT_TEMPLATE_OFFSET pixels further
# right. Both share the same rows of the sheet.
for note_info in notes_info.values():
    x = note_info["x"]
    y = 9
    note_info["template_left"] = note_templates[
        y : y + TEMPLATE_HEIGHT, x : x + TEMPLATE_WIDTH
    ]
    note_info["template_right"] = note_templates[
        y : y + TEMPLATE_HEIGHT,
        x + RIGHT_TEMPLATE_OFFSET : x + RIGHT_TEMPLATE_OFFSET + TEMPLATE_WIDTH,
    ]

# Shared screen-capture handle used by the screenshot helpers.
sct = mss.mss()
|
|
|
|
|
def get_left_side_image():
    """Capture the leftmost quarter of the game window as an array.

    This is the area searched for the needle once it has moved to a new row.
    """
    region = dict(GAME_WINDOW_DIMENSIONS)
    region["width"] = int(GAME_WINDOW_DIMENSIONS["width"] / 4)
    screenshot = sct.grab(region)
    return np.asarray(screenshot)
|
|
|
|
|
def get_row_image(y_offset):
    """Capture one full-width, ROW_HEIGHT-tall strip of notes and lyrics.

    ``y_offset`` is measured from the top of the game window; negative
    offsets are treated as 0.
    """
    region = dict(GAME_WINDOW_DIMENSIONS)
    region["top"] = int(GAME_WINDOW_DIMENSIONS["top"] + max(0, y_offset))
    region["height"] = ROW_HEIGHT
    screenshot = sct.grab(region)
    return np.asarray(screenshot)
|
|
|
|
|
def extract_blue_needle_region(img_hsv):
    """Return the cv.inRange mask of pixels within the blue needle's HSV range."""
    # Bounds are (hue, saturation, value).
    lower_bound = (92, 118, 71)
    upper_bound = (115, 255, 255)
    return cv.inRange(img_hsv, lower_bound, upper_bound)
|
|
|
|
|
def extract_pink_needle_region(img_hsv):
    """Return the cv.inRange mask of pixels within the pink needle's HSV range."""
    # Bounds are (hue, saturation, value).
    lower_bound = (158, 143, 107)
    upper_bound = (169, 186, 223)
    return cv.inRange(img_hsv, lower_bound, upper_bound)
|
|
|
|
|
def get_needle_location(img_hsv, min_x=None):
    """Find the needle in an HSV image via colour masking and line detection.

    When ``min_x`` is given, only the 120-pixel-wide band starting at that
    column is searched. Returns ``(x, y, found)``; ``x`` and ``y`` are 0 when
    no line was detected.
    """
    # The bottom 1/8 of the image (where the notes sit) is always dropped so
    # that lines belonging to the row below are less likely to be picked up.
    usable_height = int(img_hsv.shape[0] * 7 / 8)
    if min_x is None:
        search_area = img_hsv[:usable_height]
    else:
        search_area = img_hsv[:usable_height, min_x : min_x + 120]

    # Mask by the colour of the needle currently in play, then run Canny edge
    # detection to get outlines for the Hough transform.
    if IS_NEEDLE_BLUE:
        extract_needle_region = extract_blue_needle_region
    else:
        extract_needle_region = extract_pink_needle_region
    img_outlines = cv.Canny(extract_needle_region(search_area), 70, 200, None, 3)

    lines = cv.HoughLinesP(img_outlines, 1, np.pi, 30, None, 30, 10)

    if DEBUG and search_area.shape[0] <= ROW_HEIGHT:
        cv.imshow("outline", img_outlines)
        cv.moveWindow("outline", 10, 720 + ROW_HEIGHT)
        if cv.waitKey(25) & 0xFF == ord("q"):
            cv.destroyAllWindows()

    if lines is None:
        return 0, 0, False

    # Each detected segment is stored as (x1, y1, x2, y2).
    segments = lines[:, 0, :]

    # Translate x coordinates back into the uncropped image.
    if min_x is not None:
        segments[:, 0] += min_x
        segments[:, 2] += min_x

    # Rough centre of the needle: the mean start-x of all segments and the
    # midpoint of their mean start-y and end-y. Stray segments skew this, so
    # it relies on the masking above keeping noise out.
    avg_x = int(np.mean(segments[:, 0]))
    avg_y = int((np.mean(segments[:, 1]) + np.mean(segments[:, 3])) / 2)

    return avg_x, avg_y, True
|
|
|
|
|
def get_template_match(img, template):
    """Return index arrays ``(ys, xs)`` of every position in ``img`` whose
    normalised match score against ``template`` exceeds
    TEMPLATE_MATCH_THRESHOLD.
    """
    scores = cv.matchTemplate(img, template, cv.TM_CCOEFF_NORMED)
    return np.nonzero(scores > TEMPLATE_MATCH_THRESHOLD)
|
|
|
|
|
def find_notes(row_img):
    """Locate the notes in a row image and classify each one.

    Args:
        row_img: screenshot of one row, as returned by ``get_row_image``.

    Returns:
        A list of ``(x, note_name, has_drum, has_hold)`` tuples sorted by
        ``x`` (left to right). ``has_drum`` / ``has_hold`` say whether the
        pixel probed to the right of the note has the colour of a DRUM or a
        HOLD bar. The detected notes are also printed.
    """
    # Only the bottom quarter of the row contains the notes.
    img_bottom = row_img[int(row_img.shape[0] * 3 / 4) :, :]

    # HSV copy, used to inspect the hue just right of each note.
    img_hsv = cv.cvtColor(img_bottom, cv.COLOR_BGR2HSV)

    # Grayscale copy, required for template matching.
    img = cv.cvtColor(img_bottom, cv.COLOR_BGRA2GRAY)
    img_width = img.shape[1]

    notes = []
    for note_name, note_info in notes_info.items():
        # A note can be overlapped on its left or its right side, so both
        # halves are matched independently and the hits are combined.
        left_side_matches = get_template_match(img, note_info["template_left"])
        right_side_matches = get_template_match(img, note_info["template_right"])

        xs = np.sort(
            np.concatenate(
                [
                    # A left-half hit reports the note's left edge; adding
                    # TEMPLATE_WIDTH moves x to roughly the middle of the note.
                    left_side_matches[1] + TEMPLATE_WIDTH,
                    # A right-half hit is first shifted back by the template
                    # offset so it lands on the same reference point.
                    right_side_matches[1] - RIGHT_TEMPLATE_OFFSET + TEMPLATE_WIDTH,
                ]
            )
        )
        if xs.size == 0:
            continue

        # Topmost hit for this note type; used as the y of all its notes.
        y = np.sort(np.concatenate([left_side_matches[0], right_side_matches[0]]))[0]
        probe_row = int(y + TEMPLATE_HEIGHT / 2)

        last_x = -100
        for x in xs:
            if x - last_x < TEMPLATE_WIDTH:
                # Repeated hit of the same note in the same spot; skip it.
                continue
            last_x = x

            # Probe a single pixel slightly to the right of the note and
            # check whether its colour looks like a HOLD or DRUM bar.
            probe_col = x + 35
            if probe_col < img_width:
                hue, saturation, value = img_hsv[probe_row, probe_col]
                is_colored = value > 20 and saturation > 20
                has_drum = is_colored and 147 < hue < 167
                has_hold = is_colored and 82 < hue < 102
            else:
                # The probe would fall past the right edge of the image for
                # a note near the end of the row; indexing there would raise
                # IndexError, so treat the note as a plain tap.
                has_drum = False
                has_hold = False

            notes.append((x, note_name, has_drum, has_hold))

    # Sort the notes from left to right.
    notes.sort(key=lambda n: n[0])

    # Print out the notes detected.
    print(
        "new line",
        ",".join(
            f"{f[1]}{'-drum' if f[2] else ''}{'-hold' if f[3] else ''}" for f in notes
        ),
    )

    return notes
|
|
|
|
|
def karaoke_singer():
    """Run the karaoke bot: track the needle and press keys as it reaches notes.

    Loops until ``q`` is pressed in the debug window (only available when
    DEBUG is True) or the function is interrupted. Whatever the reason for
    leaving the loop, a key still being held for a HOLD note is released so
    it does not stay stuck down.
    """
    last_known_row_y = 0
    notes = None

    fps = 0
    last_time = time.time()
    last_start_x = None
    last_needle_x = None
    frames_since_row_change = 0
    last_drum_time = time.time()
    drum_key = None
    hold_key = None
    missed_frames = 0

    try:
        # The main karaoke-playing loop
        while True:
            frames_since_row_change += 1
            img = get_row_image(last_known_row_y)
            img_hsv = cv.cvtColor(img, cv.COLOR_BGR2HSV)
            needle_x, needle_y, needle_was_found = get_needle_location(
                img_hsv, last_needle_x
            )

            if not needle_was_found:
                # The needle was not found in the row we expected, so try
                # to find it at the beginning of another row.

                # Allow the needle to go missing for one frame before
                # searching other rows.
                if missed_frames < 1:
                    missed_frames += 1
                    continue

                last_needle_x = None
                img = get_left_side_image()
                img_hsv = cv.cvtColor(img, cv.COLOR_BGR2HSV)
                needle_x, needle_y, needle_was_found = get_needle_location(
                    img_hsv, 80
                )

                if not needle_was_found:
                    # Still not found; try again on the next frame.
                    continue

                # Reset the notes we identified on the previous row
                notes = None
                last_start_x = needle_x
                frames_since_row_change = 0

                # Snap the needle y value up to the closest row y
                screen_top_padding = int(ROW_HEIGHT * 90 / 300)
                last_known_row_y = (
                    (needle_y - screen_top_padding) // ROW_HEIGHT
                ) * ROW_HEIGHT + screen_top_padding

                # Prepare the row image for note-finding
                img = get_row_image(last_known_row_y)

            # Reset some state since we found the needle
            missed_frames = 0
            last_needle_x = needle_x

            # When the needle is at the beginning of a row,
            # scope out all the notes coming up
            if notes is None:
                notes = find_notes(img)

            # Very rough approximation of the needle's speed per frame,
            # with a fallback of 10 when it cannot be computed yet.
            needle_speed = (
                (needle_x - last_start_x) / frames_since_row_change
                if frames_since_row_change != 0 and last_start_x is not None
                else 10
            )

            # Maximum distance the needle may be from a note before we
            # attempt to play it, clamped between the MIN and MAX constants.
            strike_zone = min(
                MAX_PIXELS_BEFORE_PLAYING_NOTE,
                max(needle_speed * 3, MIN_PIXELS_BEFORE_PLAYING_NOTE),
            )

            # Rapid-fire key presses while a drummed note is active
            if (
                drum_key is not None
                and last_drum_time < time.time() - DRUM_DOWNTIME_SEC
            ):
                PressKey(drum_key)
                time.sleep(KEYPRESS_DURATION_SEC)
                ReleaseKey(drum_key)
                last_drum_time = time.time()

            for index, note in enumerate(notes):
                x, note_name, has_drum, has_hold = note

                # If we're close enough to a note, play it.
                if abs(x - needle_x) < strike_zone:
                    print("you're welcome", note_name)
                    # Safe despite iterating: every path below leaves the
                    # loop immediately after this removal.
                    notes.pop(index)

                    if drum_key is not None:
                        # This note ends the drum section: just stop drumming
                        drum_key = None
                        break

                    if hold_key is not None:
                        # This note ends the hold: release the held key
                        ReleaseKey(hold_key)
                        hold_key = None
                        break

                    PressKey(notes_info[note_name]["key"])
                    if not has_hold:
                        time.sleep(KEYPRESS_DURATION_SEC)
                        ReleaseKey(notes_info[note_name]["key"])

                    hold_key = notes_info[note_name]["key"] if has_hold else None
                    drum_key = notes_info[note_name]["key"] if has_drum else None
                    break

            # Keep track of FPS for the detection, sometimes handy for debugging
            fps += 1
            if time.time() - last_time >= 1:
                last_time = time.time()
                # print(f"fps: {fps}")
                fps = 0

            # If debugging, show the current row with the strike zone and
            # the perceived needle position drawn in green.
            if DEBUG:
                cv.line(
                    img,
                    (needle_x - int(strike_zone), ROW_HEIGHT - 10),
                    (needle_x + int(strike_zone), ROW_HEIGHT - 10),
                    (0, 255, 0),
                    5,
                )
                cv.line(
                    img,
                    (needle_x, ROW_HEIGHT - 30),
                    (needle_x, ROW_HEIGHT - 10),
                    (0, 255, 0),
                    5,
                )
                window_title = "Test"
                cv.imshow(window_title, img)
                cv.moveWindow(window_title, 10, 720)
                if cv.waitKey(25) & 0xFF == ord("q"):
                    cv.destroyAllWindows()
                    break
    finally:
        # Never leave a HOLD note's key pressed after the bot stops.
        if hold_key is not None:
            ReleaseKey(hold_key)
|
|
|
|
|
# Start the bot only when this file is run as a script, so that importing
# the module does not begin capturing the screen and pressing keys.
if __name__ == "__main__":
    karaoke_singer()
@shyney7 I also had the same experience out of the box when following the written instructions. Here are a few things you can try, based on a quick glance over the code:
Tweak the constants near the top of karaoke_singer.py (GAME_WINDOW_DIMENSIONS in particular is most likely the culprit). I used a third party program called ShareX to count the pixels of my windows and their borders. This got the computer vision to work much better for me.
Flip the DEBUG constant in karaoke_singer.py from False to True. This will open a separate window that gives you a better sense of what the computer vision procedures are actually doing (drawing where it thinks the strike zone and the needle are, etc.).