Last active
June 26, 2020 17:44
-
-
Save JeremieGomez/68b8acc195404ce96adbaa6ff13619db to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remove the music from an audio file: Speech-to-Text is used to find where
# speech occurs, and only those parts of the audio are written to the output.
from pydub import AudioSegment
from google.cloud import speech_v1p1beta1 as speech
from google.cloud.speech_v1p1beta1 import enums

# Placeholders: replace each one with a real path before running the script.
AUDIO_FILE_PATH_LOCAL = '<PATH TO YOUR FILE WHERE YOU WANT TO REMOVE MUSIC>'
AUDIO_FILE_PATH_GCS = '<PATH TO THE SAME FILE IN GCS IF YOU USE THE FIRST SOLUTION>'
OUTPUT_FILE_PATH_LOCAL = '<PATH TO SAVE THE OUTPUT FILE>'

# Prepare audio files
# show_without_music starts empty and accumulates the slices that are kept.
show_without_music = AudioSegment.empty()
audio_segment = AudioSegment.from_mp3(AUDIO_FILE_PATH_LOCAL)

client = speech.SpeechClient()
speech_config = speech.types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.MP3,
    # Describe the audio as it actually is instead of assuming 44.1 kHz
    # stereo, so mono or differently sampled files are declared correctly.
    sample_rate_hertz=audio_segment.frame_rate,
    language_code="fr-FR",
    audio_channel_count=audio_segment.channels,
    # Per-word timestamps are what the first solution uses to locate speech.
    enable_word_time_offsets=True,
)
#-------First solution: send the whole file to STT (needs a copy in GCS)-----------
# Run STT
recognition_audio = {'uri': AUDIO_FILE_PATH_GCS}
operation = client.long_running_recognize(speech_config, recognition_audio)
print(u"Waiting Speech To Text to complete...")
response = operation.result(timeout=2000)

# Interpret results
# Consecutive high-confidence results are merged into one "speech run".
# first_text_start / last_text_end hold the bounds (in whole seconds) of the
# run currently being built; both are None when no run is open.
first_text_start = None
last_text_end = None
for result in response.results:
    if not result.alternatives:
        # Nothing to inspect for this result; leave the current run untouched.
        continue
    best_alternative = result.alternatives[0]
    print(best_alternative)
    words = best_alternative.words
    if best_alternative.confidence > 0.9 and words:
        # Explicitly compare to None: a start of 0 seconds is valid but falsy.
        if first_text_start is None:
            first_text_start = words[0].start_time.seconds
        last_text_end = words[-1].end_time.seconds
    elif first_text_start is not None:
        # Low confidence (or no word timings) ends the current run: keep the
        # audio it covers. The extra 1000 ms presumably compensates for the
        # end offset being read in whole seconds only.
        show_without_music += audio_segment[first_text_start * 1000:last_text_end * 1000 + 1000]
        first_text_start = None
        last_text_end = None

# Flush the run that was still open when the results ended. This must be an
# "is not None" test, otherwise a run starting at second 0 would be dropped.
if first_text_start is not None:
    show_without_music += audio_segment[first_text_start * 1000:last_text_end * 1000 + 1000]

# Write output file to disk
show_without_music.export(OUTPUT_FILE_PATH_LOCAL, format="mp3")
# Use this to generate smaller files
# audio_segment = AudioSegment.from_mp3(AUDIO_FILE_PATH_LOCAL)
# audio_segment = audio_segment[0:930*1000]
# audio_segment.export(OUTPUT_FILE_PATH_LOCAL, format="mp3")
#-------Second solution: cut file into 5-second chunks and send to the api-----
# NOTE: this block uses io.BytesIO, so `import io` must be added before uncommenting it.
# CHUNKS_SECONDS = 5
#
# speech_config.enable_word_time_offsets = False
# output = io.BytesIO()
# nb_splits = int(len(audio_segment)/1000/CHUNKS_SECONDS)+1
# for i in range(nb_splits):
#     start_second, end_second = i*CHUNKS_SECONDS, (i+1)*CHUNKS_SECONDS
#     print("processing second %d to second %d" % (start_second, end_second))
#     fivesec_audio = audio_segment[start_second*1000:end_second*1000]
#     fivesec_audio.export(output, format="mp3")
#     recognition_audio = {'content': output.read()}
#     output.flush()
#     response = client.recognize(speech_config, recognition_audio)
#     print(response.results)
#     if len(response.results) > 0:
#         print("Speech")
#         show_without_music += fivesec_audio
#     else:
#         print("Music")
#
#
# show_without_music.export(OUTPUT_FILE_PATH_LOCAL, format="mp3")
# print("File without music written at %s" % OUTPUT_FILE_PATH_LOCAL)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment