Created
December 5, 2018 10:37
-
-
Save noaione/f224aea818e3196cd98253f238db200c to your computer and use it in GitHub Desktop.
PNG2SUB, OCR PNG + XML File to Subtitle File (.ass or .srt)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import os
import sys
import xml.etree.ElementTree as ET

import numpy as np
import pytesseract
from PIL import Image, ImageOps
# Static preamble written at the top of every generated .ass file: script
# info (1920x1080 play resolution), the single "Default" style, and the
# [Events] section header that the Dialogue lines are appended under.
ASS_HEADERS = """[Script Info]
; File generated by N4O PNG2SUB
Title: PNG2SUB
ScriptType: v4.00+
WrapStyle: 0
ScaledBorderAndShadow: yes
YCbCr Matrix: None
PlayResX: 1920
PlayResY: 1080
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,75,&H00FFFFFF,&H000000FF,&H00000000,&H96000000,-1,0,0,0,100,100,0,0,1,3.5,1,2,15,15,65,1
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
def tesseractImg(IMAGEPATH, TESSERACT_PATH, USE_VERTICAL, LANGUAGE):
    """Run Tesseract on an image and return the recognised text.

    Args:
        IMAGEPATH: Despite the name, an image object (it must expose
            ``.size``); it is handed to ``pytesseract.image_to_string``.
        TESSERACT_PATH: Tesseract command/executable assigned to
            ``pytesseract.pytesseract.tesseract_cmd`` on every call.
        USE_VERTICAL: When truthy, images taller than they are wide are
            OCR'd with the ``<LANGUAGE>_vert`` model.
        LANGUAGE: Tesseract language code.

    Returns:
        The string returned by ``pytesseract.image_to_string``.
    """
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
    dimensions = IMAGEPATH.size

    # Vertical mode needs both the caller's opt-in and a portrait image
    # (size[1] greater than size[0]).
    go_vertical = USE_VERTICAL and int(dimensions[1]) > int(dimensions[0])
    if go_vertical:
        print('##! Using vertical mode')
        ocr_lang = LANGUAGE + '_vert'
    else:
        print('## Using horizontal mode')
        ocr_lang = LANGUAGE

    return pytesseract.image_to_string(IMAGEPATH, lang=ocr_lang)
def magic_formula(img):
    """Pre-process a subtitle image for OCR.

    The image is flattened onto a black background using itself as the
    paste mask, the colours are inverted, and every pixel that is exactly
    (63, 63, 63) after inversion is repainted black.

    Args:
        img: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        A new RGBA ``PIL.Image.Image`` built from the processed pixels.
    """
    # Open inside a context manager so the underlying file handle is closed
    # as soon as the pixel data has been pasted onto the canvas.
    with Image.open(img) as source:
        # Image.paste only accepts these modes as a mask; anything else
        # (e.g. palette "P" or plain "RGB") would raise
        # "bad transparency mask", so convert those to RGBA first.
        if source.mode in ("1", "L", "LA", "RGBA", "RGBa"):
            glyphs = source
        else:
            glyphs = source.convert("RGBA")
        canvas = Image.new("RGB", glyphs.size, (0, 0, 0))
        canvas.paste(glyphs, mask=glyphs)

    inverted = ImageOps.invert(canvas).convert('RGBA')
    pixels = np.array(inverted)

    # Pixels whose R, G and B are all 63 after inversion are forced to
    # black; the alpha channel is left untouched.
    grey = (pixels[..., :3] == 63).all(axis=-1)
    pixels[grey, :3] = 0
    return Image.fromarray(pixels)
def read_xml(filename):
    """Parse a BDN index XML file into a list of subtitle entries.

    Walks ``<Events>`` -> ``<Event>`` -> ``<Graphic>`` (matched by substring
    on the tag name) and emits one entry per ``Graphic`` element.

    Args:
        filename: Path or file object accepted by ``ElementTree.parse``.

    Returns:
        A list of dicts with keys ``"start"``, ``"end"`` and ``"filename"``.
        ``start``/``end`` are the event's ``InTC``/``OutTC`` attributes with
        the last ``:`` replaced by ``,``; ``filename`` is the text of the
        ``Graphic`` element.

    Raises:
        KeyError: If an event lacks the ``InTC`` or ``OutTC`` attribute.
    """
    def to_comma_timecode(timecode):
        # Replace only the final ':' separator; a value without any ':' is
        # returned unchanged instead of being mangled.
        # NOTE(review): the trailing field is copied through verbatim -- if
        # it is a frame count rather than hundredths, confirm that is what
        # the writers expect.
        head, sep, tail = timecode.rpartition(":")
        return head + "," + tail if sep else timecode

    root = ET.parse(filename).getroot()
    entries = []
    for section in root:
        if "Events" not in section.tag:
            continue
        for event in section:
            if "Event" not in event.tag:
                continue
            # Each timecode is split on its own last ':' (previously the
            # end time reused the index computed from the start time).
            start = to_comma_timecode(event.attrib['InTC'])
            end = to_comma_timecode(event.attrib['OutTC'])
            for graphic in event:
                if "Graphic" in graphic.tag:
                    entries.append(
                        {"start": start, "end": end, "filename": graphic.text}
                    )
    return entries
def ass_write(data, tesspath, input, output, vert, lang):
    """OCR every entry and write the result as an .ass subtitle file.

    Args:
        data: List of dicts with ``"start"``, ``"end"`` and ``"filename"``
            keys, as produced by ``read_xml``.
        tesspath: Tesseract command/executable passed to ``tesseractImg``.
        input: Folder that the entries' ``filename`` values are joined onto.
        output: Path of the .ass file to create (overwritten if it exists).
        vert: Whether vertical OCR may be used for portrait images.
        lang: Tesseract language code.
    """
    total = len(data)
    with open(file=output, mode='w', encoding='utf-8') as std:
        std.write(ASS_HEADERS)
        for n, d in enumerate(data, start=1):
            print('Processing: {} out of {}'.format(n, total))
            # [1:] drops the first character of the timecode (the leading
            # hour digit), and the fraction separator must be '.' here:
            # ',' is the field delimiter of a Dialogue line, so leaving it
            # in would shift every following field.
            startpoint = d['start'][1:].replace(',', '.')
            endpoint = d['end'][1:].replace(',', '.')
            image = magic_formula(os.path.join(input, d['filename']))
            # Strip trailing newlines/form feeds from the OCR output, then
            # encode the remaining line breaks as hard breaks (\N); a
            # lowercase \n is only a soft break and is not rendered as a
            # new line with WrapStyle 0.
            ocr_text = tesseractImg(image, tesspath, vert, lang).strip()
            ocr_text = ocr_text.replace('\n', '\\N')
            std.write(
                'Dialogue: 0,{sp},{ep},Default,,0,0,0,,{txt}\n'.format(
                    sp=startpoint, ep=endpoint, txt=ocr_text
                )
            )
def srt_write(data, tesspath, input, output, vert, lang):
    """OCR every entry and write the result as an .srt subtitle file.

    Args:
        data: List of dicts with ``"start"``, ``"end"`` and ``"filename"``
            keys, as produced by ``read_xml``.
        tesspath: Tesseract command/executable passed to ``tesseractImg``.
        input: Folder that the entries' ``filename`` values are joined onto.
        output: Path of the .srt file to create (overwritten if it exists).
        vert: Whether vertical OCR may be used for portrait images.
        lang: Tesseract language code.
    """
    total = len(data)
    with open(file=output, mode='w', encoding='utf-8') as std:
        # No header is written: SRT has none (the ASS header that used to
        # be emitted here does not belong in an .srt file).
        for n, d in enumerate(data, start=1):
            print('Processing: {} out of {}'.format(n, total))
            # Appending '0' pads the two-digit fraction after the ',' to
            # the three digits SRT uses for milliseconds.
            timedata = "{sp} --> {ep}".format(sp=d['start'] + '0', ep=d['end'] + '0')
            image = magic_formula(os.path.join(input, d['filename']))
            ocr_text = tesseractImg(image, tesspath, vert, lang).strip()
            # Cues must be separated by a blank line; write it explicitly
            # rather than relying on trailing newlines in the OCR output.
            std.write("{n}\n{td}\n{txt}\n\n".format(n=n, td=timedata, txt=ocr_text))
# Command-line entry point: read BDN_Index.xml from the input folder, OCR the
# referenced images and write either an .ass or an .srt file.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("inputfolder", help="Folder containing Image and BDN_Index.xml")
    # These two options document a default, so they must not be required;
    # with required=True the defaults could never take effect.
    parser.add_argument("-path", "--tesseract-path", dest="tesseractpath", default="tesseract", help="Full tesseract.exe path (Default: Using PATH)")
    parser.add_argument("-l", "--language", default="eng", help="Language (Default: eng)")
    parser.add_argument("-ass", "--ass-subs", dest="ass", action="store_true", help="Write as .ass")
    parser.add_argument("-o", "--output", default=None, dest='outf', help="output filename")
    args = parser.parse_args()

    # Languages for which the "<lang>_vert" model may be selected for
    # portrait images. 'chi' is kept for backward compatibility; the codes
    # Tesseract actually ships are chi_sim and chi_tra.
    vert_ava = ('jpn', 'kor', 'chi', 'chi_sim', 'chi_tra')
    vert = args.language in vert_ava

    # NOTE(review): doubling the backslashes looks unnecessary for a path
    # received from argv -- kept as-is, confirm before removing.
    if '\\' in args.tesseractpath:
        tesseractpath = args.tesseractpath.replace('\\', '\\\\')
    else:
        tesseractpath = args.tesseractpath

    datafile = read_xml(os.path.join(args.inputfolder, 'BDN_Index.xml'))

    if args.outf is None:
        # abspath normalises the path first, so a trailing separator or "."
        # still yields the folder's name instead of an empty/"." basename.
        out = os.path.basename(os.path.abspath(args.inputfolder))
    else:
        out = args.outf

    if args.ass:
        ass_write(datafile, tesseractpath, args.inputfolder, out + '.ass', vert, args.language)
    else:
        srt_write(datafile, tesseractpath, args.inputfolder, out + '.srt', vert, args.language)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment