-
-
Save cdiazbas/bfc8b27e9eb168c752b3 to your computer and use it in GitHub Desktop.
Video caption conversion script using the PBS pycaption library
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
"""Convert caption files to different formats | |
One or more caption files will be converted to the specified output format, saved next to the input file | |
Requires the pycaption library from PBS: https://pypi.python.org/pypi/pycaption/ | |
""" | |
from __future__ import absolute_import, print_function, unicode_literals | |
import logging | |
import os | |
import sys | |
from argparse import ArgumentParser, FileType | |
try: | |
import pycaption | |
except ImportError: | |
print('Unable to import pycaption: have you installed it?', file=sys.stderr) | |
raise | |
# Accepted --output-format values (always lowercase, because the command-line
# value is lowercased before lookup) mapped to the pycaption writer class
# that is instantiated to produce that format.
SUPPORTED_WRITERS = {
    'dfxp': pycaption.DFXPWriter,
    'sami': pycaption.SAMIWriter,
    'srt': pycaption.SRTWriter,
    'scc': pycaption.SCCWriter,
    'webvtt': pycaption.WebVTTWriter,
}
# File-name suffix (without the leading dot) used for the converted file of
# each output format. Keys mirror those of SUPPORTED_WRITERS; the suffix
# replaces the input file's own extension.
FILE_EXTENSIONS = {
    'dfxp': 'dfxp.xml',
    'sami': 'sami',
    'srt': 'srt',
    'scc': 'scc',
    'webvtt': 'vtt',
}
def convert_file(input_captions, output_writer):
    """Convert a caption document to the format produced by *output_writer*.

    Args:
        input_captions: The complete caption document. Text is used as-is;
            a byte string is first decoded as UTF-8 (a leading byte-order
            mark is dropped).
        output_writer: A pycaption writer *instance*, e.g.
            ``pycaption.SRTWriter()``.

    Returns:
        The converted document, exactly as returned by
        ``pycaption.CaptionConverter.write``.

    Raises:
        RuntimeError: If ``pycaption.detect_format`` does not recognise the
            content.
        UnicodeDecodeError: If a byte string is passed that is not valid
            UTF-8.
    """
    # pycaption readers are reported to reject non-text input
    # ("The content is not a unicode string"), so normalise bytes up front.
    # On Python 2 ``bytes`` is ``str``, which is exactly the case to decode.
    if isinstance(input_captions, bytes):
        input_captions = input_captions.decode('utf-8-sig')

    reader_class = pycaption.detect_format(input_captions)
    if not reader_class:
        raise RuntimeError('Unrecognized format')

    converter = pycaption.CaptionConverter()
    converter.read(input_captions, reader_class())
    return converter.write(output_writer)
def _output_path_for(input_path, output_format):
    """Return the sibling path of *input_path* with the suffix for *output_format*."""
    return '%s.%s' % (os.path.splitext(input_path)[0],
                      FILE_EXTENSIONS[output_format])


def main(argv=None):
    """Convert every caption file named on the command line.

    Each input is read as UTF-8, converted with ``convert_file`` and written,
    UTF-8 encoded, next to the input file. A failure on one file is logged
    and the remaining files are still processed.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        ``0`` when every file was converted, ``1`` when at least one failed.
    """
    parser = ArgumentParser(description=(__doc__ or '').strip())
    # The default is spelled in lowercase so that the help text agrees with
    # the listed choices; user input is lowercased before the choices check,
    # which also makes a separate membership test unnecessary.
    parser.add_argument('--output-format', type=lambda i: i.lower(), metavar='FORMAT',
                        default='webvtt', choices=sorted(SUPPORTED_WRITERS),
                        help='Output format: %(choices)s (default: %(default)s)')
    # Binary mode: the bytes are decoded explicitly below, which behaves the
    # same on Python 2 and 3 and does not depend on the locale encoding.
    parser.add_argument('caption_file', type=FileType('rb'), nargs='+',
                        help='Caption files to convert')
    args = parser.parse_args(argv)

    output_writer_class = SUPPORTED_WRITERS[args.output_format]
    logging.basicConfig()

    failures = 0
    for f in args.caption_file:
        output_file = _output_path_for(f.name, args.output_format)
        try:
            # Refuse to clobber the input (e.g. converting x.vtt to WebVTT).
            if os.path.abspath(output_file) == os.path.abspath(f.name):
                raise RuntimeError('output file would overwrite the input file')
            print(output_file)

            # 'utf-8-sig' also accepts plain UTF-8 and drops a leading BOM.
            text = f.read().decode('utf-8-sig')
            converted = convert_file(text, output_writer_class())
            if not isinstance(converted, bytes):
                converted = converted.encode('utf-8')

            # Only open (and truncate) the output once conversion succeeded,
            # so a failure never leaves an empty file behind.
            with open(output_file, 'wb') as output_f:
                output_f.write(converted)
        except Exception as exc:  # per-file boundary: report and carry on
            failures += 1
            logging.error('Unable to convert %s: %s', f.name, exc)
        finally:
            f.close()

    return 1 if failures else 0


if __name__ == "__main__":
    sys.exit(main())
I'm trying to do the same (read .dfxp using pycaption.DFXPReader) and get the same error: InvalidInputError: The content is not a unicode string. The string that I pass is also of unicode type, and when I convert it to a str, it comes back with CaptionReadNoCaptions: CaptionReadNoCaptions(('empty caption file',)). Could really use some help :)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks a ton for this! I've gotten it to work for .vtt, but seems like I'm having some trouble with .dfxp:
before I have to Ctrl-C out. I'm not sure what's wrong.