|
#!/usr/bin/env python3 |
|
# Copyright 2017 The Abseil Authors. |
|
# |
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
|
# you may not use this file except in compliance with the License. |
|
# You may obtain a copy of the License at |
|
# |
|
# http://www.apache.org/licenses/LICENSE-2.0 |
|
# |
|
# Unless required by applicable law or agreed to in writing, software |
|
# distributed under the License is distributed on an "AS IS" BASIS, |
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
# See the License for the specific language governing permissions and |
|
# limitations under the License. |
|
""" |
|
Author: jimmymkude@ / bookman@ |
|
|
|
This script takes in a Cloud VideoIntelligence API Speech Transcription response and converts it to WebVTT.
|
|
|
Modify is_break_point for max words / min words / max time per subtitle logic. |
|
|
|
Before running install google-cloud-videointelligence API: |
|
`pip3 install google-cloud-videointelligence` |
|
|
|
Auth is done through a service account, which needs Video Intelligence & GCS permissions.
|
|
|
Usage: |
|
python3 captioner.py --video=<gcs url> --service_account=service_account.json --out=subtitle.vtt |
|
|
|
Example: |
|
./captioner.py \ |
|
--video=gs://us-central1-cdn-test-files/static-mp4/stay_tuned.mp4 \ |
|
--service_account=service_account.json \ |
|
--out=stay_tuned.vtt |
|
|
|
""" |
|
import argparse |
|
import json |
|
import math |
|
from google.protobuf.json_format import MessageToJson |
|
from google.cloud import videointelligence |
|
|
|
|
|
|
|
def main():
    """Transcribes the video, segments it into captions, and writes WebVTT.

    Reads CLI flags, runs speech transcription against the GCS video,
    prints the generated WebVTT to stdout, and writes it to --out.
    """
    args = arg_parse()

    # Client is not thread safe.
    # However, one should re-use client within the
    # same thread, and avoid creating a new client
    # per user request.
    video_client = (videointelligence
                    .VideoIntelligenceServiceClient
                    .from_service_account_file(args.service_account))

    alternatives = transcribe(video_client, args.video)
    captions = break_down_transcriptions(alternatives)
    vtt = gen_vtt(captions)
    print(vtt)

    # 'with' guarantees the file handle is closed even if write() raises;
    # the original open()/close() pair leaked the handle on error.
    with open(args.out, 'w') as f:
        f.write(vtt)
|
|
|
def arg_parse():
    """Defines and parses the command-line flags.

    Returns:
      The argparse.Namespace with 'video', 'service_account' and 'out'.
    """
    parser = argparse.ArgumentParser(description='WebVTT Caption Video in GCS.')

    # All three flags are required strings; register them from a table
    # to keep the definitions uniform.
    required_flags = (
        ('--video', 'gcs path to video, e.g: gs://mybucket/myvideo.mp4'),
        ('--service_account', 'path to service account, e.g: service_account.json'),
        ('--out', 'where to write vtt subtitles, e.g: subtitles.vtt'),
    )
    for flag, help_text in required_flags:
        parser.add_argument(flag, type=str, required=True, help=help_text)

    return parser.parse_args()
|
|
|
def transcribe(video_client, gcs_video_path, language='en-US'):
    """Transcribes speech in a video file stored in GCS.

    Args:
      video_client: a VideoIntelligenceServiceClient.
      gcs_video_path: gs:// URI of the video to transcribe.
      language: BCP-47 language code for transcription (default 'en-US').

    Returns:
      The transcription alternatives of the first speech transcription
      in the single annotation result.
    """
    features = [videointelligence.enums.Feature.SPEECH_TRANSCRIPTION]

    # Bug fix: the 'language' parameter was previously ignored and
    # 'en-US' was always hard-coded in the config.
    config = videointelligence.types.SpeechTranscriptionConfig(
        language_code=language,
        enable_automatic_punctuation=True)
    video_context = videointelligence.types.VideoContext(
        speech_transcription_config=config)

    operation = video_client.annotate_video(
        gcs_video_path, features=features,
        video_context=video_context)

    # Blocks until the long-running operation completes (up to 10 minutes).
    result = operation.result(timeout=600)

    # There is only one annotation_result since only
    # one video is processed.
    return (result
            .annotation_results[0]
            .speech_transcriptions[0]
            .alternatives)
|
|
|
def duration_to_seconds(duration):
    """Returns a protobuf Duration as a float number of seconds."""
    # Combine the whole-second part with the nanosecond remainder.
    fractional = duration.nanos * 1e-9
    return duration.seconds + fractional
|
|
|
def is_punctuation(char):
    """Returns True iff 'char' is a single sentence-punctuation mark."""
    # Strings longer (or shorter) than one character never qualify.
    return len(char) == 1 and char in '.,!?:;'
|
|
|
|
|
def is_break_point(word, diff, numberOfWordsInSentence):
    """Decides whether the current caption should end after this word.

    Args:
      word: the word just appended to the caption.
      diff: seconds elapsed since the caption started.
      numberOfWordsInSentence: words accumulated in the caption so far.

    Returns:
      True if the caption should be flushed at this word.
    """
    max_time_seconds = 3
    min_words_per_line = 2
    max_words_per_line = 14

    # Caption has been accumulating for too long.
    if diff >= max_time_seconds:
        return True

    # End of sentence, and the line is at least the minimum length.
    ends_sentence = is_punctuation(word[-1])
    if ends_sentence and numberOfWordsInSentence >= min_words_per_line:
        return True

    # Line has reached the maximum word count.
    return numberOfWordsInSentence >= max_words_per_line
|
|
|
def seconds_to_timestring(elapsed):
    """Formats elapsed seconds as a WebVTT 'HH:MM:SS.mmm' timestamp.

    Bug fix: the previous format string '{}:{}:{:.3f}' produced
    unpadded components such as '0:0:1.500'; WebVTT requires
    zero-padded two-digit hours/minutes/seconds, which many players
    enforce strictly.

    Args:
      elapsed: non-negative number of seconds (may be fractional).

    Returns:
      A zero-padded 'HH:MM:SS.mmm' string.
    """
    seconds = elapsed % 60
    elapsed -= seconds

    hours = math.floor(elapsed / 60 / 60)
    elapsed -= hours * 60 * 60

    minutes = math.floor(elapsed / 60)

    # {:06.3f} pads seconds to two integer digits plus '.mmm'.
    return '{:02d}:{:02d}:{:06.3f}'.format(hours, minutes, seconds)
|
|
|
|
|
def gen_vtt(captions):
    """Renders a list of caption dicts as a WebVTT document string.

    Args:
      captions: dicts with 'transcript' and 'videoSegment'
        ('startTimeOffset'/'endTimeOffset' in seconds).

    Returns:
      The full WebVTT file contents.
    """
    cues = []
    for caption in captions:
        segment = caption['videoSegment']
        start = seconds_to_timestring(segment['startTimeOffset'])
        end = seconds_to_timestring(segment['endTimeOffset'])
        cues.append('{} --> {}\n{}\n\n'.format(
            start, end, caption['transcript']))
    # Header first, then every cue; join avoids repeated string +=.
    return 'WEBVTT\n\n' + ''.join(cues)
|
|
|
|
|
def _make_caption(transcript, confidence, start_time, end_time):
    """Builds one caption dict, trimming the trailing separator space."""
    if transcript and transcript[-1] == ' ':
        transcript = transcript[:-1]
    return {
        'transcript': transcript,
        'confidence': confidence,
        'videoSegment': {
            'startTimeOffset': start_time,
            'endTimeOffset': end_time,
        },
    }


def break_down_transcriptions(alternatives):
    """Splits transcription alternatives into caption-sized segments.

    Walks each alternative's timed words, accumulating a transcript
    until is_break_point() fires, then emits a caption dict covering
    the accumulated time range.

    Bug fixes: an alternative with no words no longer raises IndexError,
    and words after the last break point are no longer silently dropped
    (the trailing partial caption is now flushed).

    Args:
      alternatives: iterable of speech transcription alternatives, each
        with timed 'words' and a 'confidence'.

    Returns:
      A list of caption dicts with 'transcript', 'confidence' and
      'videoSegment' keys.
    """
    captions = []
    for alternative in alternatives:
        if not alternative.words:
            # Nothing to caption for this alternative.
            continue

        start_time = duration_to_seconds(alternative.words[0].start_time)
        end_time = start_time
        transcript = ''
        num_words_in_segment = 0
        for word in alternative.words:
            if not transcript:
                # First word of a new caption: restart the clock.
                start_time = duration_to_seconds(word.start_time)
            end_time = duration_to_seconds(word.end_time)
            diff = end_time - start_time
            transcript += word.word + ' '
            num_words_in_segment += 1
            if is_break_point(word.word, diff, num_words_in_segment):
                captions.append(_make_caption(
                    transcript, alternative.confidence, start_time, end_time))
                transcript = ''
                num_words_in_segment = 0

        # Flush any words accumulated after the last break point.
        if transcript:
            captions.append(_make_caption(
                transcript, alternative.confidence, start_time, end_time))
    return captions
|
|
|
|
|
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()