Skip to content

Instantly share code, notes, and snippets.

@alexsavio
Last active December 25, 2016 23:38
Show Gist options
  • Save alexsavio/2e454f04f196cbb20689 to your computer and use it in GitHub Desktop.
Save alexsavio/2e454f04f196cbb20689 to your computer and use it in GitHub Desktop.
CLI command that reads the time data from one SRT file and the speech text from another, merges them, and saves the result into a new SRT file.
#!/usr/bin/env python
import os
import os.path as path
import argparse
import logging
from chardet.universaldetector import UniversalDetector
from pysrt import SubRipFile, SubRipItem
# Configure the root logger so that messages of level INFO and above are shown.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the functions in this file.
log = logging.getLogger(__name__)
def get_file_encoding(afile):
    """ Guess the character encoding of `afile` with chardet.

    The file is opened in binary mode and fed line by line to a
    `UniversalDetector`; reading stops as soon as the detector reports
    that it is done.

    Parameters
    ----------
    afile: str
        Path to the file to inspect.

    Returns
    -------
    encoding:
        Whatever the detector stored under the 'encoding' key of its
        result.
    """
    sniffer = UniversalDetector()
    sniffer.reset()

    with open(afile, 'rb') as stream:
        for raw_line in stream:
            sniffer.feed(raw_line)
            if sniffer.done:
                # The detector is confident enough; skip the rest of the file.
                break

    # Finalise the detection before reading the result.
    sniffer.close()
    guess = sniffer.result
    return guess['encoding']
def open_subtitles_file(afile, encoding=None):
    """ Parse the subtitles file `afile` with pysrt.

    When no `encoding` is given, it is guessed with `get_file_encoding`
    before the file is opened.

    Parameters
    ----------
    afile: str
        Path to the subtitles file.

    encoding: str, optional
        Encoding of `afile`. Guessed from the file content if None.

    Returns
    -------
    subs: SubRipFile
        The parsed subtitles.

    encoding: str
        The encoding that was used to open the file.

    Raises
    ------
    UnicodeDecodeError:
        If the file cannot be decoded with the (given or guessed)
        encoding. An error is logged before the exception is re-raised.
    """
    used_encoding = get_file_encoding(afile) if encoding is None else encoding

    try:
        parsed = SubRipFile.open(afile, encoding=used_encoding)
    except UnicodeDecodeError:
        log.error('Error detecting the encoding of file {}, guessed {}. '
                  'Please give the encoding as input.'.format(afile,
                                                              used_encoding))
        raise

    return parsed, used_encoding
def merge_sync_text(sync_subs, text_subs):
    """ Return a new SubRipFile object with the timestamps from `sync_subs`
    and the text from `text_subs`.

    Items are paired by position: the i-th item of `sync_subs` receives
    the text of the i-th item of `text_subs`. If `text_subs` has fewer
    items than `sync_subs`, an error is logged for every unmatched item
    and that item is left out of the result.

    Note that the items of `sync_subs` are reused, not copied: their
    `text` attribute is overwritten in place and the very same objects
    are appended to the returned file.

    Parameters
    ----------
    sync_subs: SubRipFile
        Source of the timing information.

    text_subs: SubRipFile
        Source of the speech text.

    Returns
    -------
    subs: SubRipFile
    """
    # Label the text source for error messages from the object itself
    # instead of a module-level variable, so this function also works
    # when the module is imported rather than run as a script.
    # Falls back to a generic label if no `path` is set on `text_subs`.
    text_source = getattr(text_subs, 'path', None) or 'the text subtitles'

    subs = SubRipFile()
    for idx, syncitem in enumerate(sync_subs.data):
        try:
            text = text_subs[idx].text
        except IndexError:
            log.error('Could not read item {} from {}'.format(idx,
                                                              text_source),
                      exc_info=True)
        else:
            syncitem.text = text
            subs.append(syncitem)

    return subs
def is_existing_file(parser, arg):
    """ Check that `arg` is the path of an existing file.

    Meant to be used as an argparse `type` callable.

    Parameters
    ----------
    parser: argparse.ArgumentParser
        Parser used to report the error.

    arg: str
        Path given on the command line.

    Returns
    -------
    arg: str
        The same path, if it points to an existing file.
    """
    # `isfile` rather than `exists`: a directory would pass an existence
    # check and only fail later, when the script tries to open it.
    if not os.path.isfile(arg):
        parser.error("The file %s does not exist!" % arg)
    else:
        return arg  # return the file path
def is_empty_file(fpath):
    """ Return True if the file at `fpath` has a size of zero bytes.

    Parameters
    ----------
    fpath: str
        Path to the file to check.

    Returns
    -------
    bool

    Raises
    ------
    OSError:
        If `fpath` cannot be stat'ed (e.g. it does not exist).
    """
    # `st_size` is the named form of index 6 of the stat result.
    return os.stat(fpath).st_size == 0
def is_not_existing_file(parser, arg):
    """ Check that `arg` can be used as an output path.

    Meant to be used as an argparse `type` callable. The path is
    accepted when nothing exists there yet, or when what exists there
    is empty according to `is_empty_file`.

    Parameters
    ----------
    parser: argparse.ArgumentParser
        Parser used to report the error.

    arg: str
        Path given on the command line.

    Returns
    -------
    arg: str
        The same path, if it is accepted.
    """
    if not os.path.exists(arg) or is_empty_file(arg):
        return arg  # return the file path

    parser.error("The file %s already exists!" % arg)
if __name__ == '__main__':
    # Command line entry point: read the timing file and the text file,
    # merge them and write the result to the output file.
    desc = 'Reads the time data from one SRT file, the speech text '\
           ' from another and saves it into a new srt file.'

    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('-t', '--in_time_srt', metavar="FILE",
                        type=lambda x: is_existing_file(parser, x),
                        help='File to get the sync times.', required=True)
    parser.add_argument('-s', '--in_speech_srt', metavar="FILE",
                        type=lambda x: is_existing_file(parser, x),
                        help='File to get the speech texts.', required=True)
    parser.add_argument('-o', '--out', metavar="FILE",
                        type=lambda x: is_not_existing_file(parser, x),
                        help='Output file.', required=True)
    parser.add_argument('--in_time_enc', type=str, default=None,
                        help='Encoding of the file to get the sync times.')
    parser.add_argument('--in_speech_enc', type=str, default=None,
                        help='Encoding of the file to get the speech texts.')

    # No try/except here: catching everything only to re-raise it has no
    # effect, so any error from parse_args() simply propagates.
    args = parser.parse_args()

    syncf = args.in_time_srt
    synce = args.in_time_enc
    spchf = args.in_speech_srt
    spche = args.in_speech_enc
    outpf = args.out

    # read source files
    syncsubs, syncenc = open_subtitles_file(syncf, synce)
    spchsubs, spchenc = open_subtitles_file(spchf, spche)

    # merge sync and text
    subs = merge_sync_text(syncsubs, spchsubs)

    # save the file with the same encoding as the text file
    log.info('Saving file {}'.format(outpf))
    subs.save(outpf, encoding=spchenc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment