Created
May 27, 2019 01:39
-
-
Save tucan9389/84c1fea7d35fc2c653243b759b51ecb7 to your computer and use it in GitHub Desktop.
This script is a simple audio recognition using google's Cloud Speech-to-Text API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# This script performs simple speech recognition using Google's Cloud Speech-to-Text API.
# It can recognize long audio or video (over 1 minute; in my case a 60-minute video).
# Prerequisite libraries
# - ffmpeg
# - google-cloud-speech
# My test
# - recognize a 60-minute video (.mp4)
# How to run
# 1. install Anaconda and create a virtual env
# 2. install the prerequisites in the virtual env
# 3. configure a credentials key for google-cloud-speech
# 4. run this script
# 5. you can find the result next to the input file path
# Example usage:
# /Users/doyounggwak/anaconda3/envs/tts-env/bin/python main.py
# ================================================ # | |
# ================================================ # | |
# ================================================ # | |
main_audio_file = "audio/21th.mp4" # input video or audio file path
split_durection = "58" # segment length handed to ffmpeg's -segment_time; I recommend not changing this (presumably keeps each chunk under the API's 1-minute limit -- TODO confirm)
# ================================================ # | |
# ================================================ # | |
# ================================================ # | |
# Based on Google's baseline sample code for google-cloud-speech:
# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/speech/cloud-client/transcribe_async.py
# Copyright 2017 Google Inc. All Rights Reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
import io | |
import os | |
# [START speech_transcribe_async] | |
def transcribe_file(speech_file, language_code='ko-KR', timeout=90):
    """Transcribe a local FLAC file with the long-running recognize API.

    Args:
        speech_file: Path to a FLAC audio file; its bytes are sent inline
            with the request.
        language_code: Language tag passed to the recognizer.
        timeout: Seconds to wait for the long-running operation to finish.

    Returns:
        A list holding the top transcript of each recognized portion, in
        order, followed by one empty string (the script writes every entry
        on its own line, so the empty entry becomes a blank separator line).
    """
    from google.cloud import speech

    try:
        # google-cloud-speech 1.x keeps messages and enums in submodules.
        from google.cloud.speech import enums
        from google.cloud.speech import types
        flac_encoding = enums.RecognitionConfig.AudioEncoding.FLAC
        make_audio = types.RecognitionAudio
        make_config = types.RecognitionConfig
    except ImportError:
        # google-cloud-speech 2.x removed `enums`/`types`; the same classes
        # are exposed directly on the `speech` package.
        flac_encoding = speech.RecognitionConfig.AudioEncoding.FLAC
        make_audio = speech.RecognitionAudio
        make_config = speech.RecognitionConfig

    client = speech.SpeechClient()

    with open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = make_audio(content=content)
    # sample_rate_hertz is intentionally left unset, as in the original.
    config = make_config(encoding=flac_encoding, language_code=language_code)

    # Keyword arguments are accepted by both the 1.x and 2.x client; the
    # 2.x client rejects the positional (config, audio) form.
    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result(timeout=timeout)

    my_result = []
    # Each result covers a consecutive portion of the audio.
    for result in response.results:
        if not result.alternatives:
            # Nothing was recognized for this portion; avoid an IndexError.
            continue
        # The first alternative is the most likely one for this portion.
        transcript = result.alternatives[0].transcript
        print(transcript)
        my_result.append(transcript)

    my_result.append("")
    return my_result
# [END speech_transcribe_async] | |
if __name__ == '__main__':
    import glob
    import subprocess

    def run_ffmpeg(arguments, echo=True):
        """Run ffmpeg with an argument list (no shell); abort the script if it fails."""
        command = ["ffmpeg"] + list(arguments)
        if echo:
            print(" ".join(command))
        # A list with no shell keeps paths containing spaces or shell
        # metacharacters intact; check=True surfaces ffmpeg errors instead
        # of continuing with missing files.
        subprocess.run(command, check=True)

    # Step 1: a video input is first reduced to an .mp3 next to the input.
    source_root, source_ext = os.path.splitext(main_audio_file)
    if source_ext.lower() == ".mp4":
        audio_path = source_root + ".mp3"
        run_ffmpeg(["-i", main_audio_file, audio_path])
        main_audio_file = audio_path

    # Step 2: split the audio into fixed-length segments inside a directory
    # named after the input file (extension removed).
    splited_audios_path = os.path.splitext(main_audio_file)[0]
    main_audio_file_name = os.path.basename(splited_audios_path)
    os.makedirs(splited_audios_path, exist_ok=True)  # tolerate re-runs

    segment_prefix = os.path.join(splited_audios_path, main_audio_file_name)
    run_ffmpeg(
        [
            "-i", main_audio_file,
            "-f", "segment",
            "-segment_time", str(split_durection),
            "-c", "copy",
            segment_prefix + "_%03d.mp3",
        ],
        echo=False,
    )

    # Step 3: convert every segment to single-channel FLAC, the encoding
    # that transcribe_file declares in its recognition config.
    # glob.escape keeps special characters in the path from being read as
    # wildcard syntax.
    pattern_prefix = glob.escape(segment_prefix)
    for mp3_path in sorted(glob.glob(pattern_prefix + "_*.mp3")):
        run_ffmpeg(["-i", mp3_path, "-ac", "1", os.path.splitext(mp3_path)[0] + ".flac"])

    # Sorting keeps the zero-padded segments in playback order.
    txtfiles = sorted(glob.glob(pattern_prefix + "_*.flac"))
    print(txtfiles)
    print("\n\n")

    # Step 4: recognize each segment and append its transcripts to
    # "<input path without extension>.txt". UTF-8 is set explicitly so the
    # output does not depend on the platform's default encoding.
    with open(splited_audios_path + ".txt", "w", encoding="utf-8") as text_file:
        for index, flac_path in enumerate(txtfiles, start=1):
            print("--------> " + str(index) + "/" + str(len(txtfiles)) + " <--------", flac_path)
            for line in transcribe_file(flac_path):
                text_file.write(line)
                text_file.write("\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment