Skip to content

Instantly share code, notes, and snippets.

@cdiazbas
Forked from acdha/convert-subtitles.py
Last active December 19, 2020 13:44
Show Gist options
  • Save cdiazbas/bfc8b27e9eb168c752b3 to your computer and use it in GitHub Desktop.
Save cdiazbas/bfc8b27e9eb168c752b3 to your computer and use it in GitHub Desktop.
Video caption conversion script using the PBS pycaption library
#!/usr/bin/env python
# encoding: utf-8
"""Convert caption files to different formats
One or more caption files will be converted to the specified output format, saved next to the input file
Requires the pycaption library from PBS: https://pypi.python.org/pypi/pycaption/
"""
from __future__ import absolute_import, print_function, unicode_literals
import logging
import os
import sys
from argparse import ArgumentParser, FileType
try:
import pycaption
except ImportError:
print('Unable to import pycaption: have you installed it?', file=sys.stderr)
raise
# Output format name -> pycaption writer class used to produce that format.
# The keys double as the accepted values of the --output-format option.
SUPPORTED_WRITERS = {
    'dfxp': pycaption.DFXPWriter,
    'sami': pycaption.SAMIWriter,
    'srt': pycaption.SRTWriter,
    'scc': pycaption.SCCWriter,
    'webvtt': pycaption.WebVTTWriter,
}
# Output format name -> file extension (without the leading dot) that is
# appended to the input file's base name to build the output path.
FILE_EXTENSIONS = {
    'dfxp': 'dfxp.xml',
    'sami': 'sami',
    'srt': 'srt',
    'scc': 'scc',
    'webvtt': 'vtt',
}
def convert_file(input_captions, output_writer):
    """Convert caption content to the format produced by ``output_writer``.

    The input format is detected with ``pycaption.detect_format``; the
    detected reader is instantiated and used to load the captions into a
    ``pycaption.CaptionConverter``.

    Args:
        input_captions: Contents of the caption file to convert.
        output_writer: Writer instance passed to ``CaptionConverter.write``.

    Returns:
        The value returned by ``CaptionConverter.write`` for ``output_writer``.

    Raises:
        RuntimeError: If ``pycaption.detect_format`` finds no matching reader.
    """
    reader_class = pycaption.detect_format(input_captions)
    if reader_class:
        caption_converter = pycaption.CaptionConverter()
        caption_converter.read(input_captions, reader_class())
        return caption_converter.write(output_writer)
    raise RuntimeError('Unrecognized format')
def main(argv=None):
    """Convert every caption file named on the command line.

    Each input file is read as UTF-8, converted with ``convert_file`` and
    written next to the input file, using the extension registered in
    ``FILE_EXTENSIONS`` for the chosen output format. A file that cannot be
    converted is reported through ``logging.error`` and skipped; the
    remaining files are still processed.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.
    """
    parser = ArgumentParser(description=__doc__.strip())
    parser.add_argument('--output-format', type=lambda i: i.lower(), metavar='FORMAT', default='WebVTT',
                        choices=SUPPORTED_WRITERS.keys(),
                        help='Output format: %(choices)s (default: %(default)s)')
    # Binary mode so that read() yields bytes on both Python 2 and Python 3
    # and the explicit UTF-8 decode below is valid on either.
    parser.add_argument('caption_file', type=FileType('rb'), nargs='+',
                        help='Caption files to convert')
    args = parser.parse_args(argv)

    if args.output_format not in SUPPORTED_WRITERS:
        # parser.error() prints the message and exits.
        parser.error('Output format must be one of %s' % ' '.join(SUPPORTED_WRITERS.keys()))
    output_writer_class = SUPPORTED_WRITERS[args.output_format]
    output_extension = FILE_EXTENSIONS[args.output_format]

    logging.basicConfig()

    for f in args.caption_file:
        output_file = '%s.%s' % (os.path.splitext(f.name)[0], output_extension)
        print(output_file)
        try:
            raw = f.read()
            # 'utf-8-sig' decodes plain UTF-8 and also drops a leading BOM,
            # which would otherwise end up at the start of the first line.
            # A stream that already yields text (e.g. stdin on some Python
            # versions) is passed through unchanged.
            captions = raw.decode('utf-8-sig') if isinstance(raw, bytes) else raw

            # Convert before opening the output file so that a failed
            # conversion does not leave an empty output file behind.
            converted = convert_file(captions, output_writer_class())
            if not isinstance(converted, bytes):
                converted = converted.encode('utf-8')

            with open(output_file, 'wb') as output_f:
                output_f.write(converted)
        except Exception as exc:
            # Top-level boundary of a batch tool: report the failure and move
            # on to the next file instead of stopping (or dropping into a
            # debugger prompt).
            logging.error('Unable to convert %s: %s', f.name, exc)
        finally:
            # argparse opened the file; release it on success and on failure.
            f.close()


if __name__ == "__main__":
    main()
@slew06
Copy link

slew06 commented Aug 4, 2016

Thanks a ton for this! I've gotten it to work for .vtt, but seems like I'm having some trouble with .dfxp:

/usr/local/lib/python2.7/site-packages/pycaption/dfxp/base.py(63)read()
-> raise InvalidInputError('The content is not a unicode string.')
(Pdb)

before I have to Ctrl-C out. I'm not sure what's wrong.

@sahanbull
Copy link

I'm trying to do the same (read .dfxp using pycaption.DFXPReader) and get the same error, InvalidInputError: The content is not a unicode string. The string that I pass is of type unicode, and when I convert it to a str, it comes back with CaptionReadNoCaptions: CaptionReadNoCaptions(('empty caption file',)). Could really use some help :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment