Skip to content

Instantly share code, notes, and snippets.

@fritz-c
Last active February 14, 2024 17:18
Show Gist options
  • Save fritz-c/f51addad9554485c157c29fc004a8d3b to your computer and use it in GitHub Desktop.
Save fritz-c/f51addad9554485c157c29fc004a8d3b to your computer and use it in GitHub Desktop.
Computer-vision-based Yakuza 0 Karaoke-playing program

Yakuza 0 Karaoke Singer

A computer-vision-based program to automatically play the Yakuza 0 karaoke minigame.

auto_karaoke

(Full video on reddit)

Setup

  1. Download karaoke_singer.py, trigger_key.py, and the following image: notes.png (save it as notes.png), and place all three files in the same directory

  2. Install dependencies:

    python3 -m pip install -U --user Pillow mss opencv-python
  3. Change the value of IS_NEEDLE_BLUE in karaoke_singer.py according to the song you're playing (the accompaniment parts use a pink needle, while Kiryu's is blue)

Usage

  1. Load Yakuza 0 in windowed mode, at 1280x720 resolution, and move the window into the far upper-left corner of the screen.
  2. Run the following from Powershell:
    python3 karaoke_singer.py
  3. Use the keyboard to select and start the karaoke song of your choice (don't use a gamepad, or the note icons will change)
import time
import mss
from trigger_key import (
PressKey,
ReleaseKey,
VK_UP_ARROW_KEY,
VK_DOWN_ARROW_KEY,
VK_LEFT_ARROW_KEY,
VK_RIGHT_ARROW_KEY,
)
from PIL import Image, ImageStat
import cv2 as cv
import numpy as np
# Set DEBUG to True to show OpenCV preview windows of the detected needle
# and outlines while the program runs.
# DEBUG = True
DEBUG = False
# When doing the Kiryu singing parts, this should be True;
# for the accompaniment songs (with the pink needle), set it
# to False.
IS_NEEDLE_BLUE = True
# IS_NEEDLE_BLUE = False
# Note: there are plenty of hard-coded pixel values in this code, so you're
# best off matching these dimensions as closely as you can (maybe adjust the
# top if necessary).
# Screen region passed to mss when grabbing screenshots.
GAME_WINDOW_DIMENSIONS = {"top": 45, "left": 0, "width": 1280, "height": 720}
# How long a tapped key stays down before being released, in seconds.
KEYPRESS_DURATION_SEC = 0.02
# Minimum pause between repeated presses while drumming, in seconds.
DRUM_DOWNTIME_SEC = 0.1
# Height in pixels of one row of notes and lyrics (15/72 of the window height).
ROW_HEIGHT = int(GAME_WINDOW_DIMENSIONS["height"] * 15 / 72)
# Template-match scores above this value count as a detected note.
TEMPLATE_MATCH_THRESHOLD = 0.80
# Upper and lower bounds (in pixels) on how close the needle must be to a
# note before the note is played.
MAX_PIXELS_BEFORE_PLAYING_NOTE = 18
MIN_PIXELS_BEFORE_PLAYING_NOTE = 5
# Per-note settings: "x" is the left edge of the note's icon inside notes.png
# and "key" is the virtual-key code sent to play that note.
notes_info = {
    "down": {"x": 5, "key": VK_DOWN_ARROW_KEY},
    "right": {"x": 47, "key": VK_RIGHT_ARROW_KEY},
    "up": {"x": 88, "key": VK_UP_ARROW_KEY},
    "left": {"x": 130, "key": VK_LEFT_ARROW_KEY},
}
# Load the sheet of note icons as a grayscale image. cv.imread does not raise
# when the file is missing or unreadable -- it returns None -- so fail early
# with a clear message instead of a confusing TypeError when slicing below.
note_templates = cv.imread("notes.png", cv.IMREAD_GRAYSCALE)
if note_templates is None:
    raise FileNotFoundError(
        "Could not read 'notes.png'; save it in the directory this script is run from."
    )
# Size in pixels of each half-note template cut out of notes.png.
TEMPLATE_WIDTH = 13
TEMPLATE_HEIGHT = 19
# Horizontal distance from the start of the left-half template to the start
# of the right-half template of the same note.
RIGHT_TEMPLATE_OFFSET = 12
# Cut a left-side and a right-side template for every note, so a note that is
# partially covered on one side can still be matched by its other side.
for note_info in notes_info.values():
    x = note_info["x"]
    y = 9
    note_info["template_left"] = note_templates[
        y : y + TEMPLATE_HEIGHT, x : x + TEMPLATE_WIDTH
    ]
    note_info["template_right"] = note_templates[
        y : y + TEMPLATE_HEIGHT,
        x + RIGHT_TEMPLATE_OFFSET : x + RIGHT_TEMPLATE_OFFSET + TEMPLATE_WIDTH,
    ]
# Shared mss screenshot object used by all the screen-grabbing helpers below.
sct = mss.mss()
def get_left_side_image():
    """Grab the leftmost quarter of the game window as a numpy array.

    Used to look for the needle again once it has jumped to another row.
    """
    region = dict(GAME_WINDOW_DIMENSIONS)
    region["width"] = int(GAME_WINDOW_DIMENSIONS["width"] / 4)
    screenshot = sct.grab(region)
    return np.asarray(screenshot)
def get_row_image(y_offset):
    """Grab one row of notes and lyrics as a numpy array.

    ``y_offset`` is the row's distance in pixels below the top of the game
    window; negative offsets are treated as 0.
    """
    region = dict(GAME_WINDOW_DIMENSIONS)
    region["top"] = int(GAME_WINDOW_DIMENSIONS["top"] + max(0, y_offset))
    region["height"] = ROW_HEIGHT
    screenshot = sct.grab(region)
    return np.asarray(screenshot)
def extract_blue_needle_region(img_hsv):
    """Return a mask of the pixels in ``img_hsv`` within the blue needle's HSV range."""
    # (hue, saturation, value) bounds, both ends inclusive
    lower_bound = (92, 118, 71)
    upper_bound = (115, 255, 255)
    return cv.inRange(img_hsv, lower_bound, upper_bound)
def extract_pink_needle_region(img_hsv):
    """Return a mask of the pixels in ``img_hsv`` within the pink needle's HSV range."""
    # (hue, saturation, value) bounds, both ends inclusive
    lower_bound = (158, 143, 107)
    upper_bound = (169, 186, 223)
    return cv.inRange(img_hsv, lower_bound, upper_bound)
def get_needle_location(img_hsv, min_x=None):
    """Locate the needle using its color and line detection.

    Args:
        img_hsv: HSV image to search.
        min_x: Optional left edge of the search window. When given, only the
            120 pixel wide strip starting at ``min_x`` is searched.

    Returns:
        A ``(x, y, found)`` tuple. ``x`` and ``y`` are a rough center of the
        detected needle in the coordinates of the image passed in; when no
        line is detected the result is ``(0, 0, False)``.
    """
    # A bit hacky: the bottom 1/8 of the image (containing the notes) is
    # trimmed off to make it less likely to detect lines in the row below.
    usable_height = int(img_hsv.shape[0] * 7 / 8)
    if min_x is None:
        search_area = img_hsv[:usable_height]
    else:
        # Focus on the only strip where we expect the needle to be.
        search_area = img_hsv[:usable_height, min_x : min_x + 120]

    # Isolate the needle by its color, then run Canny edge detection to get
    # clean outlines for the Hough transform to use for line detection.
    if IS_NEEDLE_BLUE:
        needle_mask = extract_blue_needle_region(search_area)
    else:
        needle_mask = extract_pink_needle_region(search_area)
    img_outlines = cv.Canny(needle_mask, 70, 200, None, 3)
    lines = cv.HoughLinesP(img_outlines, 1, np.pi, 30, None, 30, 10)

    if DEBUG and search_area.shape[0] <= ROW_HEIGHT:
        cv.imshow("outline", img_outlines)
        cv.moveWindow("outline", 10, 720 + ROW_HEIGHT)
        if cv.waitKey(25) & 0xFF == ord("q"):
            cv.destroyAllWindows()

    if lines is None:
        return 0, 0, False

    # Undo the x offset introduced by searching a trimmed image.
    if min_x is not None:
        lines[:, :, 0] += min_x
        lines[:, :, 2] += min_x

    # Very rough approximation of the needle's center point: the average of
    # the line start x's, and the average of the start and end y's. It is
    # weak against anything else wrongly detected as a line, so the approach
    # is to remove that noise upstream rather than here.
    start_x_mean = np.mean(lines[:, 0, 0])
    start_y_mean = np.mean(lines[:, 0, 1])
    end_y_mean = np.mean(lines[:, 0, 3])
    avg_x = int(start_x_mean)
    avg_y = int((start_y_mean + end_y_mean) / 2)
    return avg_x, avg_y, True
def get_template_match(img, template):
    """Find where ``template`` matches ``img`` above the score threshold.

    Returns a ``(ys, xs)`` tuple of index arrays for every position whose
    normalized match score exceeds TEMPLATE_MATCH_THRESHOLD.
    """
    scores = cv.matchTemplate(img, template, cv.TM_CCOEFF_NORMED)
    above_threshold = scores > TEMPLATE_MATCH_THRESHOLD
    return np.nonzero(above_threshold)
def find_notes(row_img):
    """Locate the notes in a row image and classify them as drummed or held.

    Args:
        row_img: Screenshot of one row, as returned by ``get_row_image``.

    Returns:
        A list of ``(x, note_name, has_drum, has_hold)`` tuples sorted from
        left to right, where ``x`` is the approximate horizontal position of
        the note within the row image.
    """
    # Only look at the very bottom part of the image, which contains the notes
    img_bottom = row_img[int(row_img.shape[0] * 3 / 4) :, :]

    # HSV copy, so we can check hue values to the right of each note to
    # determine if it should be drummed or held
    img_hsv = cv.cvtColor(img_bottom, cv.COLOR_BGR2HSV)
    # Grayscale copy, necessary for template matching
    img = cv.cvtColor(img_bottom, cv.COLOR_BGRA2GRAY)
    hsv_width = img_hsv.shape[1]

    notes = []
    for note_name, note_info in notes_info.items():
        # To avoid missing notes that are overlapped on either their right
        # or left side, match both sides independently and combine the results
        left_side_matches = get_template_match(img, note_info["template_left"])
        right_loc = get_template_match(img, note_info["template_right"])

        xs = np.sort(
            np.concatenate(
                [
                    # The match position is the left edge of the left-side
                    # template; adding TEMPLATE_WIDTH moves the recorded x
                    # to roughly the middle of the note
                    left_side_matches[1] + TEMPLATE_WIDTH,
                    # For the right side, first remove the template offset so
                    # the x lines up with the left-side matches
                    right_loc[1] - RIGHT_TEMPLATE_OFFSET + TEMPLATE_WIDTH,
                ]
            )
        )
        ys = np.sort(np.concatenate([left_side_matches[0], right_loc[0]]))

        # Nothing matched for this note type
        if len(xs) == 0:
            continue

        y = ys[0]
        probe_y = int(y + TEMPLATE_HEIGHT / 2)
        last_x = -100
        for x in xs:
            if x - last_x < TEMPLATE_WIDTH:
                # Skip repeated hits of the same template in the same space.
                # Pretty hacky, but it works
                continue
            last_x = x

            # Using a single pixel slightly to the right of the note, check
            # whether its color looks like the HOLD or DRUM bar that would
            # follow the note.
            probe_x = x + 35
            if probe_x < hsv_width:
                hue, saturation, value = img_hsv[probe_y, probe_x][:3]
                is_colored = value > 20 and saturation > 20
                has_drum = bool(is_colored and 147 < hue < 167)
                has_hold = bool(is_colored and 82 < hue < 102)
            else:
                # The probe pixel would fall past the right edge of the image
                # (previously an IndexError); with no bar visible, treat the
                # note as a plain tap.
                has_drum = False
                has_hold = False
            notes.append((x, note_name, has_drum, has_hold))

    # Sort the notes from left to right
    notes.sort(key=lambda n: n[0])

    # Print out the notes detected
    print(
        "new line",
        ",".join(
            f"{f[1]}{'-drum' if f[2] else ''}{'-hold' if f[3] else ''}" for f in notes
        ),
    )
    return notes
def karaoke_singer():
    """Run the karaoke-playing loop.

    Repeatedly screenshots the current row, tracks the needle, and sends key
    presses when the needle reaches a detected note. The loop only ends when
    'q' is pressed in the DEBUG window or an exception (for example Ctrl+C)
    propagates; in either case any key still being held is released first.
    """
    last_known_row_y = 0
    notes = None
    fps = 0
    last_time = time.time()
    last_start_x = None
    last_needle_x = None
    frames_since_row_change = 0
    last_drum_time = time.time()
    drum_key = None
    hold_key = None
    missed_frames = 0

    try:
        # The main karaoke-playing loop
        while True:
            frames_since_row_change += 1
            img = get_row_image(last_known_row_y)
            img_hsv = cv.cvtColor(img, cv.COLOR_BGR2HSV)
            needle_x, needle_y, needle_was_found = get_needle_location(
                img_hsv, last_needle_x
            )

            if not needle_was_found:
                # Needle was not found at the row we expected.
                # Allow the needle to go missing just for a bit before
                # searching other rows
                if missed_frames < 1:
                    missed_frames += 1
                    continue

                # Try to find it at the beginning of another row
                last_needle_x = None
                img = get_left_side_image()
                img_hsv = cv.cvtColor(img, cv.COLOR_BGR2HSV)
                needle_x, needle_y, needle_was_found = get_needle_location(img_hsv, 80)
                if not needle_was_found:
                    # If we still couldn't find it, give up for this frame
                    continue

                # Reset the notes we identified on the previous row
                notes = None
                last_start_x = needle_x
                frames_since_row_change = 0

                # Snap the needle y value up to the closest row y
                screen_top_padding = int(ROW_HEIGHT * 90 / 300)
                last_known_row_y = (
                    (needle_y - screen_top_padding) // ROW_HEIGHT
                ) * ROW_HEIGHT + screen_top_padding

                # Prepare the row image for note-finding
                img = get_row_image(last_known_row_y)

            # Reset some stuff since we found the needle
            missed_frames = 0
            last_needle_x = needle_x

            # When the needle is at the beginning of a row,
            # scope out all the notes coming up
            if notes is None:
                notes = find_notes(img)

            # Very rough approximation of the needle's speed per frame.
            # This is super finicky, and it is unclear whether it makes a
            # difference in the long run.
            needle_speed = (
                (needle_x - last_start_x) / frames_since_row_change
                if frames_since_row_change != 0 and last_start_x is not None
                else 10
            )

            # Maximum distance the needle can be from a note before
            # we attempt to play it
            strike_zone = min(
                MAX_PIXELS_BEFORE_PLAYING_NOTE,
                max(needle_speed * 3, MIN_PIXELS_BEFORE_PLAYING_NOTE),
            )

            # Rapid-fire key presses when drummed notes are active
            if (
                drum_key is not None
                and last_drum_time < time.time() - DRUM_DOWNTIME_SEC
            ):
                PressKey(drum_key)
                time.sleep(KEYPRESS_DURATION_SEC)
                ReleaseKey(drum_key)
                last_drum_time = time.time()

            for index, note in enumerate(notes):
                x, note_name, has_drum, has_hold = note
                # Only act on a note the needle is close enough to
                if abs(x - needle_x) >= strike_zone:
                    continue

                print("you're welcome", note_name)
                # Safe to pop while iterating: every path below leaves the loop
                notes.pop(index)

                if drum_key is not None:
                    # This note marks the end of a drum section: just stop drumming
                    drum_key = None
                    break
                if hold_key is not None:
                    # This note marks the end of a hold: release the held key
                    ReleaseKey(hold_key)
                    hold_key = None
                    break

                note_key = notes_info[note_name]["key"]
                PressKey(note_key)
                if not has_hold:
                    time.sleep(KEYPRESS_DURATION_SEC)
                    ReleaseKey(note_key)
                hold_key = note_key if has_hold else None
                drum_key = note_key if has_drum else None
                break

            # Keep track of FPS for the detection, sometimes handy for debugging
            fps += 1
            if time.time() - last_time >= 1:
                last_time = time.time()
                # print(f"fps: {fps}")
                fps = 0

            # If debugging, draw the current row with the perceived location
            # of the needle drawn as a green line
            if DEBUG:
                cv.line(
                    img,
                    (needle_x - int(strike_zone), ROW_HEIGHT - 10),
                    (needle_x + int(strike_zone), ROW_HEIGHT - 10),
                    (0, 255, 0),
                    5,
                )
                cv.line(
                    img,
                    (needle_x, ROW_HEIGHT - 30),
                    (needle_x, ROW_HEIGHT - 10),
                    (0, 255, 0),
                    5,
                )
                window_title = "Test"
                cv.imshow(window_title, img)
                cv.moveWindow(window_title, 10, 720)
                if cv.waitKey(25) & 0xFF == ord("q"):
                    cv.destroyAllWindows()
                    break
    finally:
        # Don't leave a key stuck down if we stop in the middle of a hold note
        if hold_key is not None:
            ReleaseKey(hold_key)
# Start the karaoke-playing loop as soon as this file is run.
karaoke_singer()
# Code from https://stackoverflow.com/a/13615802
import ctypes
from ctypes import wintypes
import time
# Handle to user32.dll; use_last_error=True lets ctypes.get_last_error()
# report the error code of a failed call.
user32 = ctypes.WinDLL("user32", use_last_error=True)
# Values for the "type" field of the INPUT structure.
INPUT_MOUSE = 0
INPUT_KEYBOARD = 1
INPUT_HARDWARE = 2
# Flag bits for the "dwFlags" field of KEYBDINPUT.
KEYEVENTF_EXTENDEDKEY = 0x0001
KEYEVENTF_KEYUP = 0x0002
KEYEVENTF_UNICODE = 0x0004
KEYEVENTF_SCANCODE = 0x0008
# Mapping mode passed to MapVirtualKeyExW: virtual-key code -> scan code.
MAPVK_VK_TO_VSC = 0
# Virtual-key codes, see msdn.microsoft.com/en-us/library/dd375731
VK_TAB = 0x09
VK_MENU = 0x12
VK_DOWN_ARROW_KEY = 0x28
VK_LEFT_ARROW_KEY = 0x25
VK_RIGHT_ARROW_KEY = 0x27
VK_UP_ARROW_KEY = 0x26
VK_A_KEY = 0x41
VK_B_KEY = 0x42
VK_C_KEY = 0x43
VK_D_KEY = 0x44
VK_E_KEY = 0x45
VK_F_KEY = 0x46
VK_G_KEY = 0x47
VK_H_KEY = 0x48
VK_I_KEY = 0x49
VK_J_KEY = 0x4A
VK_K_KEY = 0x4B
VK_L_KEY = 0x4C
VK_M_KEY = 0x4D
VK_N_KEY = 0x4E
VK_O_KEY = 0x4F
VK_P_KEY = 0x50
VK_Q_KEY = 0x51
VK_R_KEY = 0x52
VK_S_KEY = 0x53
VK_T_KEY = 0x54
VK_U_KEY = 0x55
VK_V_KEY = 0x56
VK_W_KEY = 0x57
VK_X_KEY = 0x58
VK_Y_KEY = 0x59
VK_Z_KEY = 0x5A
# C struct definitions
# The structures below refer to wintypes.ULONG_PTR, so it is defined here
# as an alias of WPARAM.
wintypes.ULONG_PTR = wintypes.WPARAM
class MOUSEINPUT(ctypes.Structure):
    """ctypes layout of the mouse member of the INPUT union."""

    _fields_ = (
        ("dx", wintypes.LONG),
        ("dy", wintypes.LONG),
        ("mouseData", wintypes.DWORD),
        ("dwFlags", wintypes.DWORD),
        ("time", wintypes.DWORD),
        ("dwExtraInfo", wintypes.ULONG_PTR),
    )
class KEYBDINPUT(ctypes.Structure):
    """ctypes layout of the keyboard member of the INPUT union.

    Unless KEYEVENTF_UNICODE is set in ``dwFlags``, the constructor fills in
    ``wScan`` with the scan code mapped from ``wVk``.
    """

    _fields_ = (
        ("wVk", wintypes.WORD),
        ("wScan", wintypes.WORD),
        ("dwFlags", wintypes.DWORD),
        ("time", wintypes.DWORD),
        ("dwExtraInfo", wintypes.ULONG_PTR),
    )

    def __init__(self, *args, **kwds):
        super().__init__(*args, **kwds)
        if self.dwFlags & KEYEVENTF_UNICODE:
            return
        # Some programs use the scan code even if KEYEVENTF_SCANCODE
        # isn't set in dwFlags, so attempt to map the correct code.
        self.wScan = user32.MapVirtualKeyExW(self.wVk, MAPVK_VK_TO_VSC, 0)
class HARDWAREINPUT(ctypes.Structure):
    """ctypes layout of the hardware member of the INPUT union."""

    _fields_ = (
        ("uMsg", wintypes.DWORD),
        ("wParamL", wintypes.WORD),
        ("wParamH", wintypes.WORD),
    )
class INPUT(ctypes.Structure):
    """ctypes layout of the structure handed to SendInput.

    ``type`` holds one of the INPUT_* constants; the union members ``ki``,
    ``mi`` and ``hi`` are reachable directly on the instance because the
    union field is anonymous.
    """

    class _INPUT(ctypes.Union):
        _fields_ = (
            ("ki", KEYBDINPUT),
            ("mi", MOUSEINPUT),
            ("hi", HARDWAREINPUT),
        )

    # _anonymous_ has to be assigned before _fields_ for ctypes to honor it
    _anonymous_ = ("_input",)
    _fields_ = (
        ("type", wintypes.DWORD),
        ("_input", _INPUT),
    )


# Pointer-to-INPUT type, used for the SendInput argument declaration
LPINPUT = ctypes.POINTER(INPUT)
def _check_count(result, func, args):
if result == 0:
raise ctypes.WinError(ctypes.get_last_error())
return args
# Route every SendInput result through _check_count, which raises when the
# call returns 0.
user32.SendInput.errcheck = _check_count
# Declare the argument types so ctypes converts the arguments correctly.
user32.SendInput.argtypes = (
    wintypes.UINT,  # nInputs
    LPINPUT,  # pInputs
    ctypes.c_int,  # cbSize
)
# Functions
def PressKey(hexKeyCode):
    """Send a key-down event for the virtual-key code ``hexKeyCode``."""
    key_down = KEYBDINPUT(wVk=hexKeyCode)
    event = INPUT(type=INPUT_KEYBOARD, ki=key_down)
    user32.SendInput(1, ctypes.byref(event), ctypes.sizeof(event))
def ReleaseKey(hexKeyCode):
    """Send a key-up event for the virtual-key code ``hexKeyCode``."""
    key_up = KEYBDINPUT(wVk=hexKeyCode, dwFlags=KEYEVENTF_KEYUP)
    event = INPUT(type=INPUT_KEYBOARD, ki=key_up)
    user32.SendInput(1, ctypes.byref(event), ctypes.sizeof(event))
@fritz-c
Copy link
Author

fritz-c commented May 30, 2020

Images for README:
notes

auto_karaoke

@shyney7
Copy link

shyney7 commented Jul 10, 2022

The only thing that I'm getting is the output "new line" without any keypress....

@david-allan-jones
Copy link

@shyney7 I also had the same experience out of the box following the instructions written. A few things you can try just after glancing over the code:

  1. Tweak the constants near the top of karaoke_singer.py (GAME_WINDOW_DIMENSIONS in particular is most likely the culprit). I used a third party program called ShareX to count the pixels of my windows and their borders. This got the computer vision to work much better for me.

  2. Flip the DEBUG constant in karaoke_singer.py from False to True. This will open a separate window that gives you a better sense of what the computer vision procedures are actually doing (drawing where it thinks the line and needle are, etc)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment