-
-
Save syntaxhacker/dae6151cba30cf93a145ab163175a93b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /usr/bin/env python | |
# | |
# Author: Red5d | |
# | |
# Description: Extract and run OCR on subtitles from a PGS-format .sup file. | |
# | |
# Example Usage: python sup2srt.py bd_subtitles.sup bd_subtitles.srt | |
# | |
# Dependencies: | |
# - pytesseract | |
# - tqdm | |
# - pysrt | |
# - pgsreader and imagemaker modules from (https://github.com/SavSanta/pgsreader) | |
# | |
import sys, pytesseract | |
from pgsreader import PGSReader | |
from imagemaker import make_image | |
from pysrt import SubRipFile, SubRipItem, SubRipTime | |
from tqdm import tqdm | |
supFile = sys.argv[1] | |
pgs = PGSReader(supFile) | |
srtFile = sys.argv[2] | |
srt = SubRipFile() | |
# get all DisplaySets that contain an image | |
print("Loading DisplaySets...") | |
allsets = [ds for ds in tqdm(pgs.iter_displaysets())] | |
print(f"Running OCR on {len(allsets)} DisplaySets and building SRT file...") | |
subText = "" | |
subStart = 0 | |
subIndex = 0 | |
for ds in tqdm(allsets): | |
if ds.has_image: | |
# get Palette Display Segment | |
pds = ds.pds[0] | |
# get Object Display Segment | |
ods = ds.ods[0] | |
img = make_image(ods, pds) | |
subText = pytesseract.image_to_string(img) | |
subStart = ods.presentation_timestamp | |
else: | |
startTime = SubRipTime(milliseconds=int(subStart)) | |
endTime = SubRipTime(milliseconds=int(ds.end[0].presentation_timestamp)) | |
srt.append(SubRipItem(subIndex, startTime, endTime, subText)) | |
subIndex += 1 | |
print(f"Done. SRT file saved as {srtFile}") | |
srt.save(srtFile, encoding='utf-8') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment