Skip to content

Instantly share code, notes, and snippets.

@BrianZbr
Created February 13, 2023 23:05
Show Gist options
  • Save BrianZbr/fef0b8b2ecbb978ebdc72a2391060acf to your computer and use it in GitHub Desktop.
Save BrianZbr/fef0b8b2ecbb978ebdc72a2391060acf to your computer and use it in GitHub Desktop.
Python3 script inspired by pyvobsub2srt, using Google Vision API as the OCR engine.
import argparse
import base64
import json
import os
import subprocess
from xml.dom.minidom import parse

import requests
def get_image_from_xml(nodelist):
    """Return the concatenated text of all text nodes in *nodelist*.

    Nodes of any other type (elements, comments, ...) are skipped. An empty
    node list yields an empty string.
    """
    return ''.join(
        child.data
        for child in nodelist
        if child.nodeType == child.TEXT_NODE
    )
def get_api_key_from_env():
    """Return the Google Vision API key from the GV_API_KEY environment variable.

    If the variable is not set, try to load a ``.env`` file through the
    optional ``dotenv`` package and look the variable up again.

    Returns:
        The non-empty value of ``GV_API_KEY``.

    Raises:
        ValueError: if no non-empty key could be found.
    """
    if 'GV_API_KEY' not in os.environ:
        # dotenv is optional: report its absence and fall through to the
        # final check, which explains all the ways to supply a key.
        try:
            from dotenv import load_dotenv
        except ModuleNotFoundError:
            print("Could not import 'dotenv' module!")
        else:
            load_dotenv()

    api_key = os.getenv('GV_API_KEY')
    # Validate before returning so a missing key fails here with actionable
    # guidance instead of surfacing later as an opaque API error.
    if not api_key:
        raise ValueError("Valid API key could not be found! You may either: 1) Provide an API key via command argument, " +
                         "2) set the environment variable GV_API_KEY, or 3) install 'dotenv' module to read a .env file.")
    return api_key
def extract_from_sub(inputfile,
                     forcedonly=False):
    """Run the external ``subp2png`` tool on a VOBsub ``.sub`` file.

    Args:
        inputfile: Path to the ``.sub`` file; must end in ``.sub`` and exist.
        forcedonly: When true, pass ``--forced`` to subp2png.

    Raises:
        ValueError: if *inputfile* does not end in ``.sub`` or does not exist.
        FileNotFoundError: if the ``subp2png`` executable cannot be found.
    """
    if not inputfile.endswith('.sub'):
        raise ValueError("Please supply the path/filename of a .sub input file.")
    if not os.path.exists(inputfile):
        raise ValueError(f"Input file {inputfile} does not exist in working directory or specified path.")

    print("Processing file with subp2png, please be patient...")
    # Pass an argument list without a shell so filenames containing spaces or
    # shell metacharacters are handed to subp2png verbatim.
    command = ["subp2png", "-n"]
    if forcedonly:
        command.append("--forced")
    command.append(inputfile)
    # Only stdout is discarded; stderr stays visible.
    subprocess.call(command, stdout=subprocess.DEVNULL)
def gv_ocr_image(inputfile, api_key):
    """Fetch the text in an image using the Google Vision API.

    The image file is base64-encoded and posted to the ``images:annotate``
    endpoint with a single ``TEXT_DETECTION`` feature.

    Args:
        inputfile: Path of the image file to OCR.
        api_key: Google Vision API key, sent as the ``key`` query parameter.

    Returns:
        The ``description`` of the first text annotation, or an empty string
        when the response carries no text annotations.

    Raises:
        requests.HTTPError: if the API answers with an HTTP error status.
        RuntimeError: if the per-image response contains an ``error`` entry.
    """
    with open(inputfile, 'rb') as img_file:
        img_string = base64.b64encode(img_file.read()).decode('ascii')

    payload = {
        'requests': [{
            'image': {
                'content': img_string
            },
            'features': [{
                'type': 'TEXT_DETECTION',
                'maxResults': 1
            }]
        }]
    }
    response = requests.post(url='https://vision.googleapis.com/v1/images:annotate',
                             params={'key': api_key},
                             headers={'Content-Type': 'application/json'},
                             data=json.dumps(payload),
                             # Without a timeout a stalled connection would hang the whole run.
                             timeout=60)
    response.raise_for_status()

    # Parse the body as JSON rather than eval(): the body is remote data, and
    # JSON literals such as true/false/null are not valid Python expressions.
    result = response.json()['responses'][0]
    if 'error' in result:
        raise RuntimeError(f"Google Vision API error for {inputfile}: {result['error']}")

    annotations = result.get('textAnnotations')
    if not annotations:
        # No annotations in the response (e.g. nothing was recognized).
        return ''
    return annotations[0]['description']
def main():
    """Convert a VOBsub subtitle file to ``.srt`` using the Google Vision API.

    Steps:
      1. Parse the command line and resolve the API key (argument first,
         then the environment).
      2. Reuse the ``.xml`` file next to the input if it exists, otherwise
         run subp2png on the ``.sub`` file to produce it.
      3. Ask for confirmation, then OCR the image of every subtitle that has
         both ``start`` and ``stop`` attributes.
      4. Write the numbered entries to ``<base>.srt`` next to the input.
    """
    parser = argparse.ArgumentParser(description='Convert VOBsub to .srt using Google Vision API')
    parser.add_argument('input_filename', help='Name of input file in VOBsub format (can be the .sub or .idx but both must be '+
                        'present with same base filename)')
    parser.add_argument('--api_key', default='', help='Google Vision API key (default: looks for GV_API_KEY, will set from in .env file if not set')
    args = parser.parse_args()

    api_key = args.api_key or get_api_key_from_env()
    if not api_key:
        parser.error("No API key available: pass --api_key or set GV_API_KEY.")

    # Derive sibling filenames from the extension only. A plain
    # str.replace(".sub", ...) would also rewrite ".sub" occurring elsewhere
    # in the path and would leave a ".idx" name unchanged.
    base_name, extension = os.path.splitext(args.input_filename)
    sub_file = base_name + ".sub" if extension == ".idx" else args.input_filename
    xml_file = base_name + ".xml"
    srt_filename = base_name + ".srt"

    if os.path.exists(xml_file):
        print(f"Found {xml_file}")
    else:
        extract_from_sub(sub_file)

    dom = parse(xml_file)
    subtitles = dom.getElementsByTagName("subtitle")
    user_response = input(f"Query Google Vision API on {len(subtitles)} images? Enter 'Yes' to proceed!: ")
    if user_response != "Yes":
        print("Aborting!")
        return

    entries = []
    for subtitle in subtitles:
        # Entries without both timestamps cannot be expressed in SRT.
        if not (subtitle.hasAttribute("start") and subtitle.hasAttribute("stop")):
            continue
        image_elements = subtitle.getElementsByTagName("image")
        if not image_elements:
            # Nothing to OCR for this subtitle.
            continue
        image = get_image_from_xml(image_elements[0].childNodes)
        image_text = gv_ocr_image(image, api_key)
        # SRT numbers only the emitted entries, starting at 1, and uses a
        # comma as the decimal separator in timestamps.
        entries.append(f"{len(entries) + 1}\n"
                       f"{subtitle.getAttribute('start').replace('.', ',')} --> "
                       f"{subtitle.getAttribute('stop').replace('.', ',')}\n"
                       f"{image_text}\n\n")

    # Write UTF-8 explicitly: OCR output may contain characters the platform's
    # default encoding cannot represent.
    with open(srt_filename, "wt", encoding="utf-8") as srt_file:
        srt_file.write("".join(entries))
# Run the converter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment