Last active
December 25, 2023 11:58
-
-
Save Enchufa2/9dce124762ba66d303ea490053a4b247 to your computer and use it in GitHub Desktop.
Extract a transcribed score from a video to PDF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/python3 | |
import argparse, textwrap | |
import cv2 as cv | |
import numpy as np | |
from pathlib import Path | |
from fpdf import FPDF | |
from tempfile import NamedTemporaryFile | |
class Selector(object):
    """Base class for interactive region selectors.

    Constructing a selector immediately calls ``_window`` so the user can
    pick a region on ``img``; ``crop`` then applies that selection to any
    frame.  Subclasses must implement ``_window`` and ``_crop``.
    """

    def __init__(self, img, name):
        """Show the selection window titled ``name`` for ``img``."""
        self._window(name, img)

    def _window(self, name, img):
        """Display ``img`` and record the user's selection (subclass hook)."""
        raise NotImplementedError()

    def _crop(self, img):
        """Return the selected region of ``img`` (subclass hook)."""
        raise NotImplementedError()

    def crop(self, img, bw=True, margin=None):
        """Crop ``img`` to the selection, optionally binarizing and trimming.

        Args:
            img: frame to crop; a BGR image when ``bw`` is true.
            bw: when true, convert to grayscale and apply Otsu thresholding
                before cropping.
            margin: ``None`` to return the plain crop.  Otherwise the crop is
                trimmed vertically to the rows containing dark pixels
                (value < 128) and a white band of ``margin`` times that
                height is added above and below.

        Returns:
            The cropped image.  If ``margin`` is given but the crop contains
            no dark pixel, the crop is returned untrimmed.
        """
        if bw:
            img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
            _, img = cv.threshold(img, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)
        img = self._crop(img)
        if margin is None:
            return img

        # Only multi-channel crops need a grayscale conversion to find ink.
        gray = img if img.ndim == 2 else cv.cvtColor(img, cv.COLOR_BGR2GRAY)
        dark_rows = np.flatnonzero((gray < 128).any(axis=1))
        if dark_rows.size == 0:
            # Blank crop: there is no content to fit the margins around.
            return img

        top = int(dark_rows[0])
        height = int(dark_rows[-1]) - top + 1
        pad = int(margin * height)
        # White canvas with room for the content plus both margins.
        padded = np.full((height + 2 * pad,) + img.shape[1:], 255, dtype=img.dtype)
        padded[pad:pad + height] = img[top:top + height]
        return padded
class Rectangle(Selector):
    """Selector that crops frames to a rectangle drawn by the user."""

    def __init__(self, img, name='Rectangle Selector'):
        """Open the ROI selection window for ``img`` under the title ``name``."""
        super().__init__(img, name)

    def _window(self, name, img):
        """Show ``img`` full screen and store the rectangle the user draws."""
        cv.namedWindow(name, cv.WINDOW_NORMAL)
        cv.setWindowProperty(name, cv.WND_PROP_FULLSCREEN, cv.WINDOW_FULLSCREEN)
        self._roi = cv.selectROI(name, img)
        cv.destroyWindow(name)

    def _crop(self, img):
        """Return the part of ``img`` inside the stored rectangle."""
        # The stored ROI is indexed as (x, y, width, height).
        left, top, width, height = self._roi
        return img[top:top + height, left:left + width]
class MagicWand(Selector):
    """Selector that crops to the bounding box of a flood fill.

    The fill is seeded at the pixel the user clicked in the selection window
    and is recomputed for every frame passed to ``crop``.

    Based on https://github.com/alkasm/magicwand
    """

    def __init__(self, img, name='Magic Wand Selector', connectivity=4, tolerance=32):
        """Show the selection window and prepare the flood-fill state.

        Args:
            img: frame shown to the user; also fixes the size of the mask.
            name: window title.
            connectivity: flood-fill connectivity, OR-ed into the flags.
            tolerance: lower/upper difference passed to the flood fill.
        """
        # The base constructor runs the window, so the click state has to
        # exist before it is called.
        self.pos = None
        self._mod = 0
        super().__init__(img, name)
        # The flood-fill mask carries a one-pixel border around the image.
        self._flood_mask = np.zeros((img.shape[0] + 2, img.shape[1] + 2), dtype=np.uint8)
        self._flood_fill_flags = (
            connectivity | cv.FLOODFILL_FIXED_RANGE | cv.FLOODFILL_MASK_ONLY | 255 << 8
        )  # 255 << 8 tells to fill with the value 255
        self.tolerance = (tolerance,) * 3

    def _window(self, name, img):
        """Show ``img`` full screen and wait for a key press."""
        cv.namedWindow(name, cv.WINDOW_NORMAL)
        cv.setWindowProperty(name, cv.WND_PROP_FULLSCREEN, cv.WINDOW_FULLSCREEN)
        cv.setMouseCallback(name, self._mouse_callback)
        cv.imshow(name, img)
        cv.waitKey(0)
        cv.destroyWindow(name)

    def _mouse_callback(self, event, x, y, flags, *userdata):
        """Remember the position and modifier keys of a left click."""
        if event != cv.EVENT_LBUTTONDOWN:
            return
        self.pos = (x, y)
        self._mod = flags & (cv.EVENT_FLAG_ALTKEY + cv.EVENT_FLAG_SHIFTKEY)

    def _crop(self, img):
        """Return ``img`` cropped to the bounding box of the flood fill.

        Raises:
            RuntimeError: if the window was dismissed without a click.
        """
        if self.pos is None:
            raise RuntimeError('no pixel was selected in the magic wand window')
        self._flood_mask[:] = 0
        cv.floodFill(
            img,
            self._flood_mask,
            self.pos,
            0,
            self.tolerance,
            self.tolerance,
            self._flood_fill_flags,
        )
        # Drop the one-pixel border so mask indices line up with the image.
        rows, cols = np.nonzero(self._flood_mask[1:-1, 1:-1])
        # max() is the last filled index, so the slice stop must be max + 1.
        return img[rows.min():rows.max() + 1, cols.min():cols.max() + 1]
def mse(img1, img2):
    """Return the mean squared error between two frames over a sample window.

    Only a sub-window is compared: with ``h, w`` the height and width of
    ``img1``, rows ``h/6`` to ``h/2`` and columns ``w/6`` to ``w/2``.  The
    same index window is taken from ``img2``.  The squared differences are
    summed over all elements and divided by the window's height times width.

    Returns ``float('inf')`` when the two windows cannot be compared, i.e.
    their shapes differ or the window is empty.
    """
    rows = int(img1.shape[0] / 3 / 2)
    cols = int(img1.shape[1] / 3 / 2)
    win1 = img1[rows:3 * rows, cols:3 * cols]
    win2 = img2[rows:3 * rows, cols:3 * cols]
    # Differently sized windows may still broadcast, which would yield a
    # meaningless value, so reject any shape mismatch explicitly.
    if win1.shape != win2.shape or win1.size == 0:
        return float('inf')
    # Convert to float first so unsigned pixel values do not wrap around.
    delta = win1.astype('float') - win2.astype('float')
    return np.sum(delta ** 2) / float(win1.shape[0] * win1.shape[1])
def parse_video(filename, thr, skip=0.0, every=1.0, bw=True, margin=None, use_mw=True):
    """Sample a video and collect the cropped frames that differ enough.

    The first readable frame after ``skip`` seconds is shown to the user to
    select the region of interest.  Frames are then sampled roughly every
    ``every`` seconds, cropped, and compared with the last kept frame.

    Args:
        filename: path of the video file.
        thr: pair ``(low, high)``; a frame is kept when its MSE against the
            last kept frame lies strictly between the two.
        skip: seconds to skip at the start of the video.
        every: seconds to skip between sampled frames.
        bw: passed to ``Selector.crop``.
        margin: passed to ``Selector.crop``.
        use_mw: use the magic wand selector if true, else a rectangle.

    Returns:
        ``(stack, diffs)``: the kept frames and the MSE of every sampled
        frame after the first.

    Raises:
        OSError: if the video cannot be opened.
        ValueError: if no frame can be read after skipping.
    """
    def skip_seconds(video, t):
        # Grab (without decoding) as many frames as fit in ``t`` seconds.
        for _ in range(int(t * video.get(cv.CAP_PROP_FPS))):
            video.grab()

    video = cv.VideoCapture(filename)
    if not video.isOpened():
        raise OSError('video cannot be opened')
    try:
        skip_seconds(video, skip)
        success, frame = video.read()
        if not success:
            raise ValueError('no frame could be read from the video')
        sel = MagicWand(frame) if use_mw else Rectangle(frame)
        stack = []
        diffs = []
        while success:
            frame = sel.crop(frame, bw, margin)
            if stack:
                diffs.append(mse(stack[-1], frame))
            # The first frame is always kept; later ones only when the
            # change falls inside the threshold band.
            if not diffs or thr[0] < diffs[-1] < thr[1]:
                stack.append(frame)
            skip_seconds(video, every)
            success, frame = video.read()
    finally:
        # Release the capture even if selection or cropping fails.
        video.release()
    return stack, diffs
def write_pdf(filename, stack, thr=True):
    """Lay the images in ``stack`` out on A4 pages and write a PDF.

    Consecutive images are stacked vertically on a page for as long as the
    page's height/width ratio stays below that of A4; the next image then
    starts a new page.

    Args:
        filename: path of the PDF file to write.
        stack: non-empty sequence of images, all grayscale or all color.
        thr: unused; kept so existing callers keep working.

    Raises:
        ValueError: if ``stack`` is empty.
        OSError: if a page image cannot be written to disk.
    """
    # Local import: the module header only imports NamedTemporaryFile.
    from tempfile import TemporaryDirectory

    def concatenate(upper, lower):
        # Stack two images vertically, padding the narrower one with white
        # on the right.  Works for 2-D (grayscale) and 3-D (color) arrays.
        width = max(upper.shape[1], lower.shape[1])

        def widen(img):
            if img.shape[1] == width:
                return img
            canvas = np.full((img.shape[0], width) + img.shape[2:], 255, dtype=img.dtype)
            canvas[:, :img.shape[1]] = img
            return canvas

        return np.concatenate([widen(upper), widen(lower)])

    if not len(stack):
        raise ValueError('no frames to write')

    pages = [stack[0]]
    for img in stack[1:]:
        rel = (pages[-1].shape[0] + img.shape[0]) / pages[-1].shape[1]
        if rel < 297 / 210:  # A4
            pages[-1] = concatenate(pages[-1], img)
        else:
            pages.append(img)

    pdf = FPDF()
    # Page images go through PNG files, which are kept until the PDF has
    # been written and then removed together with the directory.
    with TemporaryDirectory() as tmpdir:
        for number, page in enumerate(pages):
            path = str(Path(tmpdir) / ('page%d.png' % number))
            if not cv.imwrite(path, page):
                raise OSError('cannot write page image %s' % path)
            pdf.add_page()
            pdf.image(path, 10, 20, 190)
        pdf.output(filename)
if __name__ == '__main__':
    # Command-line entry point: parse the options, extract the frames from
    # the video, and write the PDF next to it (same name, .pdf suffix).
    usage_notes = textwrap.dedent('''\
        With the magic wand (default), click a blank pixel in the score and press Enter.
        Without it, make a rectangular selection and press Enter.''')
    cli = argparse.ArgumentParser(
        description=usage_notes,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    cli.add_argument('video', type=Path, help='path to video file')
    cli.add_argument('--skip', type=float, default=0.0, help='seconds to skip')
    cli.add_argument('--every', type=float, default=1.0, help='sample every x seconds')
    cli.add_argument(
        '--crop', metavar='MARGIN', type=float,
        help='crop and add top-bottom specified margin ratio')
    cli.add_argument(
        '--no-mw', dest='mw', action='store_false',
        help='no magic wand, use a rect selector instead')
    cli.add_argument(
        '--no-bw', dest='bw', action='store_false',
        help='no image thresholding')
    cli.add_argument('--mse', action='store_true', help='show MSE graph')
    cli.add_argument(
        '--thr', nargs=2, type=float, default=[5e3, 1e5],
        help='MSE thresholds')
    opts = cli.parse_args()

    video_path = str(opts.video)
    pdf_path = str(opts.video.with_suffix('.pdf'))
    frames, errors = parse_video(
        video_path,
        opts.thr,
        skip=opts.skip,
        every=opts.every,
        bw=opts.bw,
        margin=opts.crop,
        use_mw=opts.mw,
    )
    write_pdf(pdf_path, frames)

    if opts.mse:
        # Plotting libraries are only imported when the graph is requested.
        import pandas as pd
        import matplotlib.pyplot as plt
        pd.DataFrame({'values': errors}).plot()
        plt.show(block=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment