Created
January 16, 2020 01:55
-
-
Save Red5d/94b022e527b9ddfe198207e2536e21bf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#
# Author: Red5d
#
# Description: Extract and run OCR on subtitles from a PGS-format .sup file.
#
# Example Usage: python sup2srt.py bd_subtitles.sup bd_subtitles.srt
#
# Dependencies:
# - pytesseract
# - tqdm
# - pysrt
# - pgsreader and imagemaker modules from (https://github.com/SavSanta/pgsreader)
# - numpy (reportedly also needed in a fresh environment)
#
import sys

import pytesseract
from pgsreader import PGSReader
from imagemaker import make_image
from pysrt import SubRipFile, SubRipItem, SubRipTime
from tqdm import tqdm


def main(argv):
    """Convert a PGS .sup subtitle file into an OCR'd .srt file.

    Args:
        argv: Command-line vector: program name, input .sup path,
            output .srt path.

    Returns:
        Process exit status: 0 on success, 2 when the arguments are missing.
    """
    if len(argv) != 3:
        # Fail with a usage hint instead of a bare IndexError.
        print(f"Usage: python {argv[0] if argv else 'sup2srt.py'} "
              "<input.sup> <output.srt>", file=sys.stderr)
        return 2

    supFile, srtFile = argv[1], argv[2]
    pgs = PGSReader(supFile)
    srt = SubRipFile()

    # Materialise the display sets first so the OCR progress bar has a total.
    print("Loading DisplaySets...")
    allsets = list(tqdm(pgs.iter_displaysets()))

    print(f"Running OCR on {len(allsets)} DisplaySets and building SRT file...")
    subText = ""
    subStart = None  # None means "no subtitle is waiting for its end time"
    subIndex = 1     # SubRip cues are conventionally numbered from 1

    for ds in tqdm(allsets):
        if ds.has_image:
            # get Palette Display Segment
            pds = ds.pds[0]
            # get Object Display Segment
            ods = ds.ods[0]

            img = make_image(ods, pds)
            # Drop the trailing newline / form feed that the OCR output
            # may carry so it does not end up inside the cue text.
            subText = pytesseract.image_to_string(img).strip()
            subStart = ods.presentation_timestamp
            # NOTE(review): if two image sets arrive with no image-less set
            # between them, the earlier one is overwritten — confirm whether
            # that can occur in real .sup files.
        elif subStart is not None:
            # A set without an image ends the pending subtitle; its END
            # segment timestamp is used as the cue's end time.
            startTime = SubRipTime(milliseconds=int(subStart))
            endTime = SubRipTime(
                milliseconds=int(ds.end[0].presentation_timestamp))
            srt.append(SubRipItem(subIndex, startTime, endTime, subText))
            subIndex += 1
            # Reset so a repeated image-less set cannot emit a duplicate cue.
            subText = ""
            subStart = None

    # Save first, then report success, so the message is never a lie.
    srt.save(srtFile, encoding='utf-8')
    print(f"Done. SRT file saved as {srtFile}")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
In a brand-new environment, the numpy dependency is needed as well.