syntaxhacker · February 4, 2021 18:20
diff --git a/sup2srt.py b/sup2srt.py
 # /usr/bin/env python
 #
 # Author: Red5d
 #
 # Description: Extract and run OCR on subtitles from a PGS-format .sup file.
 #
 # Example Usage: python sup2srt.py bd_subtitles.sup bd_subtitles.srt
 #
 # Dependencies:
 # - pytesseract
 # - tqdm
 # - pysrt
 # - pgsreader and imagemaker modules from (https://github.com/SavSanta/pgsreader)
 #

 import sys, pytesseract
 from pgsreader import PGSReader
 from imagemaker import make_image

 from pysrt import SubRipFile, SubRipItem, SubRipTime

 from tqdm import tqdm

 supFile = sys.argv[1]
 pgs = PGSReader(supFile)

 srtFile = sys.argv[2]

 srt = SubRipFile()

 # get all DisplaySets that contain an image
 print("Loading DisplaySets...")
 allsets = [ds for ds in tqdm(pgs.iter_displaysets())]

 print(f"Running OCR on {len(allsets)} DisplaySets and building SRT file...")
 subText = ""
 subStart = 0
 subIndex = 0
 for ds in tqdm(allsets):
    if ds.has_image:
        # get Palette Display Segment
        pds = ds.pds[0]
        # get Object Display Segment
        ods = ds.ods[0]

        img = make_image(ods, pds)
        subText = pytesseract.image_to_string(img)
        subStart = ods.presentation_timestamp
    else:
        startTime = SubRipTime(milliseconds=int(subStart))
        endTime = SubRipTime(milliseconds=int(ds.end[0].presentation_timestamp))
        srt.append(SubRipItem(subIndex, startTime, endTime, subText))
        subIndex += 1

 print(f"Done. SRT file saved as {srtFile}")
 srt.save(srtFile, encoding='utf-8')
	# /usr/bin/env python
	#
	# Author: Red5d
	#
	# Description: Extract and run OCR on subtitles from a PGS-format .sup file.
	#
	# Example Usage: python sup2srt.py bd_subtitles.sup bd_subtitles.srt
	#
	# Dependencies:
	# - pytesseract
	# - tqdm
	# - pysrt
	# - pgsreader and imagemaker modules from (https://github.com/SavSanta/pgsreader)
	#

	import sys, pytesseract
	from pgsreader import PGSReader
	from imagemaker import make_image

	from pysrt import SubRipFile, SubRipItem, SubRipTime

	from tqdm import tqdm

	supFile = sys.argv[1]
	pgs = PGSReader(supFile)

	srtFile = sys.argv[2]

	srt = SubRipFile()

	# get all DisplaySets that contain an image
	print("Loading DisplaySets...")
	allsets = [ds for ds in tqdm(pgs.iter_displaysets())]

	print(f"Running OCR on {len(allsets)} DisplaySets and building SRT file...")
	subText = ""
	subStart = 0
	subIndex = 0
	for ds in tqdm(allsets):
	if ds.has_image:
	# get Palette Display Segment
	pds = ds.pds[0]
	# get Object Display Segment
	ods = ds.ods[0]

	img = make_image(ods, pds)
	subText = pytesseract.image_to_string(img)
	subStart = ods.presentation_timestamp
	else:
	startTime = SubRipTime(milliseconds=int(subStart))
	endTime = SubRipTime(milliseconds=int(ds.end[0].presentation_timestamp))
	srt.append(SubRipItem(subIndex, startTime, endTime, subText))
	subIndex += 1

	print(f"Done. SRT file saved as {srtFile}")
	srt.save(srtFile, encoding='utf-8')