Created
November 8, 2018 21:03
-
-
Save nmante/d3c26cb514e49c61321b9c961cb87d2f to your computer and use it in GitHub Desktop.
Caption Converter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python3 | |
import boto3 | |
import json | |
import logging | |
import subprocess as sp | |
import argparse | |
import os | |
def create_parser():
    """Build the command-line parser for the caption conversion script.

    Returns:
        argparse.ArgumentParser: parser exposing ``--config``,
        ``--captions_info``, ``--execute``, ``--output_dir`` and
        ``--remove_files``.
    """
    cli = argparse.ArgumentParser(
        description='Use ffmpeg to convert .sub caption files to .srt files, and upload to s3'
    )

    # (flags, keyword arguments) pairs, registered in this order so that the
    # generated --help output lists the options in the same sequence.
    option_specs = (
        (
            ('-c', '--config'),
            {
                'help': 'JSON Configuration file with s3 access keys',
                'default': 'config.json',
            },
        ),
        (
            ('-i', '--captions_info'),
            {
                'help': 'JSON file containing caption key/filenames on s3',
                'default': 'captions_info.json',
            },
        ),
        (
            ('-e', '--execute'),
            {
                'help': 'Set this flag to actually update the files on s3. If this is not set, s3 files will NOT be updated',
                'action': 'store_true',
            },
        ),
        (
            ('-o', '--output_dir'),
            {
                'help': 'The output directory to store the converted caption files. Directory must exist',
                'default': '/tmp',
            },
        ),
        (
            ('-r', '--remove_files'),
            {
                'help': 'Set this flag to remove the converted caption files from your system',
                'action': 'store_true',
            },
        ),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    return cli
class CaptionFixer:
    """Convert caption files with ffmpeg and optionally write them back to S3.

    For every transcript asset listed in the captions file, the caption is
    fetched over HTTP by ffmpeg, written to a local file in the output
    directory, optionally uploaded to S3 under its original key, and
    optionally removed from the local disk afterwards.
    """

    def __init__(
        self,
        should_execute_updates=False,
        credentials_filename='config.json',
        json_captions_file='captions_info.json',
        s3_url='https://s3.amazonaws.com/video-api-prod',
        output_dir='/tmp',
        should_remove_files=False
    ):
        """Load configuration and prepare the S3 client.

        Args:
            should_execute_updates: When true, converted files are uploaded
                to S3; otherwise S3 is never written to.
            credentials_filename: Path of a JSON file holding the
                ``aws_access_key_id`` and ``aws_secret_access_key`` keys.
            json_captions_file: Path of a file with one JSON object per line,
                each having at least ``filename`` and ``storage_path``.
            s3_url: Base URL used to build the ffmpeg input URL.
            output_dir: Existing directory that receives converted files.
            should_remove_files: When true, converted files are deleted from
                ``output_dir`` after processing.

        Raises:
            ValueError: If the credentials file lacks a required key.
            OSError: If either input file cannot be opened.
        """
        credentials = self._validate_and_load_credentials(credentials_filename)
        transcript_assets_info = self._validate_and_load_transcript_assets(json_captions_file)
        self._credentials = credentials
        self._transcript_assets_info = transcript_assets_info
        self._S3Client = self._create_s3_client(credentials)
        self._should_execute_updates = should_execute_updates
        self._url = s3_url
        self._output_captions_dir = output_dir
        self._should_remove_files = should_remove_files

    def _create_s3_client(self, credentials):
        """Return a boto3 S3 client, or None if it could not be created.

        Failure is logged rather than raised so that a conversion-only run
        (no uploads) can still proceed without a working client.
        """
        try:
            return boto3.client(
                's3',
                aws_access_key_id=credentials['aws_access_key_id'],
                aws_secret_access_key=credentials['aws_secret_access_key'],
            )
        except Exception:
            logging.exception('Could not initialize s3 boto client')
            return None

    def _validate_and_load_credentials(self, credentials_filename):
        """Load the credentials JSON file and check the required keys.

        Returns:
            dict: The parsed credentials.

        Raises:
            ValueError: If a required key is missing.
        """
        # These names must match the keys read in _create_s3_client.
        required_keys = {'aws_access_key_id', 'aws_secret_access_key'}
        with open(credentials_filename) as f:
            credentials = dict(json.load(f))
        missing_keys = required_keys - set(credentials)
        if missing_keys:
            message = 'Must have required keys in config file {}'.format(sorted(missing_keys))
            logging.error(message)
            raise ValueError(message)
        return credentials

    def _validate_and_load_transcript_assets(self, json_captions_file):
        """Load transcript assets from a JSON-lines file.

        Blank lines are ignored. Entries that are not JSON objects or that
        lack ``filename`` / ``storage_path`` are logged and skipped.

        Returns:
            list: The valid transcript asset dicts, in file order.
        """
        transcript_assets = []
        required_keys = {'filename', 'storage_path'}
        with open(json_captions_file) as f:
            for line_number, line in enumerate(f, start=1):
                if not line.strip():
                    continue
                transcript_asset = json.loads(line)
                if not isinstance(transcript_asset, dict):
                    logging.error(
                        'Transcript asset is not a JSON object. Line number {}'
                        .format(line_number)
                    )
                    continue
                missing_keys = required_keys - set(transcript_asset)
                if missing_keys:
                    # 'id' is optional, so use .get() to avoid a KeyError
                    # while reporting a different problem.
                    logging.error(
                        'Transcript asset {} missing required keys {}. Line number {}'
                        .format(transcript_asset.get('id'), sorted(missing_keys), line_number)
                    )
                    continue
                transcript_assets.append(transcript_asset)
        return transcript_assets

    def _convert_sub_caption_to_srt(self, storage_path, bucket='assets'):
        """Run ffmpeg on the remote caption file and store the result locally.

        Args:
            storage_path: Key of the caption file, relative to ``bucket``.
            bucket: Path segment placed between the base URL and the key.

        Returns:
            str or None: Path of the converted file, or None when ffmpeg
            could not be started or exited with a non-zero status.
        """
        # NOTE(review): the output keeps the extension of storage_path, and
        # ffmpeg presumably picks the output format from it - confirm this
        # yields the intended caption format.
        try:
            url = '{}/{}/{}'.format(self._url, bucket, storage_path)
            tmp_caption_filename = '{}/{}'.format(
                self._output_captions_dir, storage_path.replace('/', '-')
            )
            command = [
                'ffmpeg',
                '-i', url,
                tmp_caption_filename
            ]
            # run() waits for ffmpeg to exit, so the status is known before
            # success is reported. stderr stays attached to the terminal.
            completed = sp.run(command, stdout=sp.PIPE, bufsize=1024)
        except Exception:
            logging.exception('Could not convert caption file {}'.format(storage_path))
            return None

        logging.info(completed.stdout)
        if completed.returncode != 0:
            logging.error(
                'Could not convert caption file {}. ffmpeg exited with status {}'
                .format(storage_path, completed.returncode)
            )
            return None
        logging.info('Converted caption file {}. Stored at tmp file'.format(storage_path))
        return tmp_caption_filename

    def _update_previous_sub_file(self, tmp_caption_filename, storage_path, bucket='assets'):
        """Upload the converted file to S3 under ``storage_path``.

        Returns:
            bool: True on success, False if reading or uploading failed.
        """
        try:
            # Read once and upload the bytes: handing S3 a file object that
            # has already been read to EOF would upload an empty body.
            with open(tmp_caption_filename, 'rb') as f:
                body = f.read()
            logging.info(body)
            self._S3Client.put_object(Body=body, Bucket=bucket, Key=storage_path)
            return True
        except Exception:
            logging.exception('Could not update caption file {}'.format(storage_path))
            return False

    def execute(self):
        """Convert every loaded transcript asset, then upload/remove as configured."""
        for transcript_asset in self._transcript_assets_info:
            storage_path = transcript_asset['storage_path'].replace('assets/', '')
            tmp_caption_filename = self._convert_sub_caption_to_srt(storage_path)
            if not tmp_caption_filename:
                continue
            if self._should_execute_updates:
                self._update_previous_sub_file(tmp_caption_filename, storage_path)
            if self._should_remove_files:
                try:
                    os.remove(tmp_caption_filename)
                except OSError:
                    # A cleanup failure should not abort the remaining assets.
                    logging.exception('Could not remove file {}'.format(tmp_caption_filename))
def main():
    """Parse command-line arguments and run the caption conversion.

    Logging is configured at INFO level first. Without this the root logger
    stays at its default WARNING threshold and every progress message
    emitted during conversion and upload is silently dropped.
    """
    logging.basicConfig(level=logging.INFO)

    parser = create_parser()
    args = parser.parse_args()
    fixer = CaptionFixer(
        should_execute_updates=args.execute,
        credentials_filename=args.config,
        json_captions_file=args.captions_info,
        output_dir=args.output_dir,
        should_remove_files=args.remove_files
    )
    fixer.execute()


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment