Last active
December 25, 2016 23:38
-
-
Save alexsavio/2e454f04f196cbb20689 to your computer and use it in GitHub Desktop.
CLI command that reads the timing data from one SRT file and the speech text from another, merges them, and saves the result to a new SRT file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import os.path as path | |
import argparse | |
import logging | |
from chardet.universaldetector import UniversalDetector | |
from pysrt import SubRipFile, SubRipItem | |
logging.basicConfig(level=logging.INFO) | |
log = logging.getLogger(__name__) | |
def get_file_encoding(afile):
    """Guess the character encoding of `afile` with chardet.

    The file is opened in binary mode and handed to a `UniversalDetector`
    one line at a time; reading stops as soon as the detector reports
    that it is done.

    Parameters
    ----------
    afile: str
        Path to the file to inspect.

    Returns
    -------
    encoding:
        Whatever the detector stored under the 'encoding' key of its result.
    """
    sniffer = UniversalDetector()
    sniffer.reset()

    with open(afile, 'rb') as stream:
        for raw_line in stream:
            sniffer.feed(raw_line)
            if sniffer.done:
                # The detector has seen enough; skip the rest of the file.
                break

    # close() finalises the detection before the result is read.
    sniffer.close()
    guess = sniffer.result
    return guess['encoding']
def open_subtitles_file(afile, encoding=None):
    """Parse a subtitles file with pysrt, guessing its encoding if needed.

    Parameters
    ----------
    afile: str
        Path to the subtitles file.

    encoding: str, optional
        Encoding of `afile`. When it is None the encoding is guessed with
        `get_file_encoding`.

    Returns
    -------
    subs: SubRipFile
        The parsed subtitles.

    encoding: str
        The encoding that was used to open the file (given or guessed).

    Raises
    ------
    UnicodeDecodeError:
        If the file cannot be decoded with the encoding. The error is
        logged and then re-raised unchanged.
    """
    chosen_enc = get_file_encoding(afile) if encoding is None else encoding

    try:
        parsed = SubRipFile.open(afile, encoding=chosen_enc)
    except UnicodeDecodeError:
        template = ('Error detecting the encoding of file {}, guessed {}. '
                    'Please give the encoding as input.')
        log.error(template.format(afile, chosen_enc))
        raise

    return parsed, chosen_enc
def merge_sync_text(sync_subs, text_subs):
    """ Return a new SubRipFile object with the timestamps from `sync_subs`
    and the text from `text_subs`.

    Items are paired by position: the n-th output item is a copy of the
    n-th item of `sync_subs` whose text is replaced by the text of the
    n-th item of `text_subs`. Neither input is modified.

    If `text_subs` has fewer items than `sync_subs`, every sync item without
    a matching text item is logged as an error and left out of the result.

    Parameters
    ----------
    sync_subs: SubRipFile
        Source of the timing information.

    text_subs: SubRipFile
        Source of the speech text.

    Returns
    -------
    subs: SubRipFile
    """
    # Function-scope import so this block does not depend on a file-level one.
    import copy

    # Name used in log messages. This used to read a global that only exists
    # when the file runs as a script, which raised NameError for importers.
    # NOTE(review): assumes pysrt exposes the source path as `.path` -- the
    # getattr fallback keeps logging working if it does not.
    text_source = getattr(text_subs, 'path', None) or 'text_subs'

    subs = SubRipFile()
    for idx, syncitem in enumerate(sync_subs):
        try:
            text = text_subs[idx].text
        except IndexError:
            log.error('Could not read item {} from {}'.format(idx,
                                                              text_source),
                      exc_info=True)
            continue

        # Shallow copy: only `.text` is reassigned, so the caller's
        # `sync_subs` items keep their original text.
        outitem = copy.copy(syncitem)
        outitem.text = text
        subs.append(outitem)

    return subs
def is_existing_file(parser, arg):
    """argparse `type` helper that accepts only paths that exist.

    Parameters
    ----------
    parser: argparse.ArgumentParser
        Parser whose `error` method is called when the check fails.

    arg: str
        Path given on the command line.

    Returns
    -------
    arg: str
        The same path, when it exists.
    """
    if os.path.exists(arg):
        return arg  # hand the path back unchanged

    parser.error("The file %s does not exist!" % arg)
def is_empty_file(fpath):
    """Return True when the file at `fpath` has a size of zero bytes.

    The size is read with `os.stat`, so a missing or inaccessible path
    raises the same `OSError` that `os.stat` raises.
    """
    size_in_bytes = os.stat(fpath).st_size
    return size_in_bytes == 0
def is_not_existing_file(parser, arg):
    """argparse `type` helper that rejects paths holding a non-empty file.

    A path that does not exist, or that exists but is an empty file, is
    accepted.

    Parameters
    ----------
    parser: argparse.ArgumentParser
        Parser whose `error` method is called when the check fails.

    arg: str
        Path given on the command line.

    Returns
    -------
    arg: str
        The same path, when it is accepted.
    """
    # Short-circuit: the emptiness check only runs for paths that exist.
    occupied = os.path.exists(arg) and not is_empty_file(arg)
    if not occupied:
        return arg  # hand the path back unchanged

    parser.error("The file %s already exists!" % arg)
if __name__ == '__main__':
    # Command-line entry point: take the timestamps from one SRT file and
    # the text from another, then write the merged subtitles to a new file.
    desc = 'Reads the time data from one SRT file, the speech text '\
           ' from another and saves it into a new srt file.'
    parser = argparse.ArgumentParser(description=desc)

    # Both input paths must exist; the output path must not hold a
    # non-empty file. The checks run while the arguments are parsed.
    parser.add_argument('-t', '--in_time_srt', metavar="FILE",
                        type=lambda x: is_existing_file(parser, x),
                        help='File to get the sync times.', required=True)
    parser.add_argument('-s', '--in_speech_srt', metavar="FILE",
                        type=lambda x: is_existing_file(parser, x),
                        help='File to get the speech texts.', required=True)
    parser.add_argument('-o', '--out', metavar="FILE",
                        type=lambda x: is_not_existing_file(parser, x),
                        help='Output file.', required=True)

    # Optional encodings; when omitted they are guessed from the files.
    parser.add_argument('--in_time_enc', type=str, default=None,
                        help='Encoding of the file to get the sync times.')
    parser.add_argument('--in_speech_enc', type=str, default=None,
                        help='Encoding of the file to get the speech texts.')

    # No try/except here: the previous bare `except: raise` only re-raised,
    # so dropping it leaves the behaviour unchanged.
    args = parser.parse_args()

    syncf = args.in_time_srt
    synce = args.in_time_enc
    spchf = args.in_speech_srt
    spche = args.in_speech_enc
    outpf = args.out

    # read source files
    syncsubs, syncenc = open_subtitles_file(syncf, synce)
    spchsubs, spchenc = open_subtitles_file(spchf, spche)

    # merge sync and text
    subs = merge_sync_text(syncsubs, spchsubs)

    # save the file with the same encoding as the text file
    log.info('Saving file {}'.format(outpf))
    subs.save(outpf, encoding=spchenc)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment