Skip to content

Instantly share code, notes, and snippets.

@noaione
Created December 5, 2018 10:37
Show Gist options
  • Save noaione/f224aea818e3196cd98253f238db200c to your computer and use it in GitHub Desktop.
Save noaione/f224aea818e3196cd98253f238db200c to your computer and use it in GitHub Desktop.
PNG2SUB, OCR PNG + XML File to Subtitle File (.ass or .srt)
import pytesseract
import numpy as np
from PIL import Image, ImageOps
import xml.etree.ElementTree as ET
import os, sys
import argparse
# Preamble for generated .ass files: the [Script Info] section (play
# resolution 1920x1080), a single "Default" style under [V4+ Styles], and the
# [Events] column format line. Dialogue lines are appended after this text.
ASS_HEADERS = """[Script Info]
; File generated by N4O PNG2SUB
Title: PNG2SUB
ScriptType: v4.00+
WrapStyle: 0
ScaledBorderAndShadow: yes
YCbCr Matrix: None
PlayResX: 1920
PlayResY: 1080
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,75,&H00FFFFFF,&H000000FF,&H00000000,&H96000000,-1,0,0,0,100,100,0,0,1,3.5,1,2,15,15,65,1
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
def tesseractImg(IMAGEPATH, TESSERACT_PATH, USE_VERTICAL, LANGUAGE):
    """Run Tesseract on an image and return the recognised text.

    IMAGEPATH: despite its name, an already-loaded image object; its
        ``size`` attribute is read and the object itself is handed to
        ``pytesseract.image_to_string``.
    TESSERACT_PATH: value assigned to ``pytesseract.pytesseract.tesseract_cmd``.
    USE_VERTICAL: when true and the image is taller than it is wide, the
        ``<LANGUAGE>_vert`` model is requested instead of ``LANGUAGE``.
    LANGUAGE: Tesseract language code.
    """
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
    picture = IMAGEPATH
    dims = picture.size
    # Vertical mode needs both the caller's opt-in and a portrait-shaped image.
    if USE_VERTICAL and int(dims[1]) > int(dims[0]):
        print('##! Using vertical mode')
        ocr_lang = LANGUAGE + '_vert'
    else:
        print('## Using horizontal mode')
        ocr_lang = LANGUAGE
    return pytesseract.image_to_string(picture, lang=ocr_lang)
def magic_formula(img):
    """Pre-process a subtitle bitmap before OCR.

    The image is flattened onto a black background using its own alpha
    channel as the paste mask, colour-inverted, and every pixel whose
    inverted RGB value is exactly (63, 63, 63) is then set to black.

    img: path or file object accepted by ``PIL.Image.open``.
    Returns: a new RGBA ``PIL.Image.Image``.
    """
    # Converting to RGBA up front guarantees a valid alpha mask for paste();
    # palette or plain RGB PNGs would otherwise raise "bad transparency mask".
    # The context manager closes the underlying file once pixels are copied.
    with Image.open(img) as source:
        rgba = source.convert('RGBA')

    canvas = Image.new("RGB", rgba.size, (0, 0, 0))
    canvas.paste(rgba, mask=rgba)
    inverted = ImageOps.invert(canvas).convert('RGBA')

    data = np.array(inverted)
    red, green, blue = data[..., 0], data[..., 1], data[..., 2]
    grey_63 = (red == 63) & (green == 63) & (blue == 63)
    # Only the colour channels are overwritten; alpha is left untouched.
    data[..., :-1][grey_63] = (0, 0, 0)
    return Image.fromarray(data)
def _last_colon_to_comma(timecode):
    """Return *timecode* with its final ':' replaced by ','.

    A string without any ':' is returned unchanged.
    """
    head, sep, tail = timecode.rpartition(":")
    if not sep:
        return timecode
    return head + "," + tail


def read_xml(filename):
    """Collect the subtitle events listed in an XML index file.

    Walks the children of the document root whose tag contains "Events",
    then each of their children whose tag contains "Event". For every
    sub-element of an event whose tag contains "Graphic", one entry is
    produced.

    filename: path (or file object) accepted by ``ElementTree.parse``.
    Returns: a list of dicts with keys ``"start"`` and ``"end"`` (the
        event's ``InTC`` / ``OutTC`` attributes with the last ':' turned
        into ',') and ``"filename"`` (the text of the Graphic element).
    Raises: ``KeyError`` if an event lacks ``InTC`` or ``OutTC``.
    """
    root = ET.parse(filename).getroot()
    entries = []
    for section in root:
        if "Events" not in section.tag:
            continue
        for event in section:
            if "Event" not in event.tag:
                continue
            # Each timecode is split at its own last colon; reusing the
            # start's index for the end corrupts it when lengths differ.
            start = _last_colon_to_comma(event.attrib['InTC'])
            end = _last_colon_to_comma(event.attrib['OutTC'])
            for graphic in event:
                if "Graphic" in graphic.tag:
                    entries.append({"start": start, "end": end, "filename": graphic.text})
    return entries
def ass_write(data, tesspath, input, output, vert, lang):
    """OCR every entry and write the result as an .ass subtitle file.

    data: list of dicts with ``"start"``, ``"end"`` and ``"filename"`` keys
        (the shape returned by ``read_xml`` in this file).
    tesspath: tesseract executable, forwarded to ``tesseractImg``.
    input: folder that the entry filenames are relative to.
    output: path of the .ass file to create (overwritten if present).
    vert: forwarded to ``tesseractImg`` as its vertical-mode switch.
    lang: Tesseract language code, forwarded to ``tesseractImg``.
    """
    total = len(data)
    with open(file=output, mode='w', encoding='utf-8') as std:
        std.write(ASS_HEADERS)
        for n, d in enumerate(data, start=1):
            print('Processing: {} out of {}'.format(str(n), str(total)))
            # Drop the first character of each time (as before) and use '.'
            # for the fraction: ',' separates fields on an ASS Dialogue line,
            # so a comma inside the timestamp shifts every later column.
            startpoint = d['start'][1:].replace(',', '.')
            endpoint = d['end'][1:].replace(',', '.')
            image = magic_formula(os.path.join(input, d['filename']))
            # Trim the trailing newline/form feed the OCR engine may append.
            ocr_text = tesseractImg(image, tesspath, vert, lang).strip()
            # "\N" is the hard line break; "\n" is only honoured in wrap
            # style 2, while the header above declares WrapStyle 0.
            ocr_text = ocr_text.replace('\n', '\\N')
            std.write('Dialogue: 0,{sp},{ep},Default,,0,0,0,,{txt}\n'.format(
                sp=startpoint, ep=endpoint, txt=ocr_text))
def srt_write(data, tesspath, input, output, vert, lang):
    """OCR every entry and write the result as an .srt subtitle file.

    data: list of dicts with ``"start"``, ``"end"`` and ``"filename"`` keys
        (the shape returned by ``read_xml`` in this file).
    tesspath: tesseract executable, forwarded to ``tesseractImg``.
    input: folder that the entry filenames are relative to.
    output: path of the .srt file to create (overwritten if present).
    vert: forwarded to ``tesseractImg`` as its vertical-mode switch.
    lang: Tesseract language code, forwarded to ``tesseractImg``.
    """
    total = len(data)
    # No ASS header is written here: an .srt file consists of cues only.
    with open(file=output, mode='w', encoding='utf-8') as std:
        for n, d in enumerate(data, start=1):
            print('Processing: {} out of {}'.format(str(n), str(total)))
            # A '0' is appended so the two-digit fraction becomes three digits.
            timedata = "{sp} --> {ep}".format(sp=d['start'] + '0', ep=d['end'] + '0')
            image = magic_formula(os.path.join(input, d['filename']))
            raw_text = tesseractImg(image, tesspath, vert, lang)
            # Blank lines terminate an SRT cue, so none may remain in the text.
            ocr_text = "\n".join(
                line.strip() for line in raw_text.splitlines() if line.strip()
            )
            # The trailing empty line separates this cue from the next one.
            std.write("{n}\n{td}\n{txt}\n\n".format(n=n, td=timedata, txt=ocr_text))
if __name__=="__main__":
    # Command-line entry point: read BDN_Index.xml from the input folder,
    # OCR every referenced image and write an .ass or .srt file.
    parser = argparse.ArgumentParser()
    parser.add_argument("inputfolder", help="Folder containing Image and BDN_Index.xml")
    # Both options have usable defaults, so they are optional; marking them
    # required made the documented defaults unreachable.
    parser.add_argument("-path", "--tesseract-path", dest="tesseractpath", default="tesseract", help="Full tesseract.exe path (Default: Using PATH)")
    parser.add_argument("-l", "--language", default="eng", help="Language (Default: eng)")
    parser.add_argument("-ass", "--ass-subs", dest="ass", action="store_true", help="Write as .ass")
    parser.add_argument("-o", "--output", default=None, dest='outf', help="output filename")
    args = parser.parse_args()

    # Languages for which the "<lang>_vert" model is tried on tall images.
    # 'chi' is kept for compatibility; 'chi_sim' / 'chi_tra' are the codes
    # Tesseract actually ships.
    vert_ava = ('jpn', 'kor', 'chi', 'chi_sim', 'chi_tra')
    vert = args.language in vert_ava

    # The path is used verbatim: argv strings are not escape-processed, and
    # doubling backslashes would corrupt UNC paths.
    tesseractpath = args.tesseractpath

    datafile = read_xml(os.path.join(args.inputfolder, 'BDN_Index.xml'))

    if args.outf is None:
        # abspath() strips a trailing separator and resolves ".", so the
        # derived output name is never empty.
        out = os.path.basename(os.path.abspath(args.inputfolder))
    else:
        out = args.outf

    if args.ass:
        ass_write(datafile, tesseractpath, args.inputfolder, out+'.ass', vert, args.language)
    else:
        srt_write(datafile, tesseractpath, args.inputfolder, out+'.srt', vert, args.language)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment