Skip to content

Instantly share code, notes, and snippets.

@JeremieGomez
Last active June 26, 2020 17:44
Show Gist options
  • Save JeremieGomez/68b8acc195404ce96adbaa6ff13619db to your computer and use it in GitHub Desktop.
Save JeremieGomez/68b8acc195404ce96adbaa6ff13619db to your computer and use it in GitHub Desktop.
from pydub import AudioSegment
from google.cloud import speech_v1p1beta1 as speech
from google.cloud.speech_v1p1beta1 import enums
# --- User-supplied locations (replace the placeholders before running) ---
# Local MP3 to load with pydub.
AUDIO_FILE_PATH_LOCAL = '<PATH TO YOUR FILE WHERE YOU WANT TO REMOVE MUSIC>'
# Location of the same file in Google Cloud Storage; passed as the 'uri' of the
# recognition request by the first solution.
AUDIO_FILE_PATH_GCS = '<PATH TO THE SAME FILE IN GCS IF YOU USE THE FIRST SOLUTION>'
# Destination of the exported MP3.
OUTPUT_FILE_PATH_LOCAL = '<PATH TO SAVE THE OUTPUT FILE>'

# --- Audio buffers ---
# Full input recording, and an empty segment that the kept parts are appended to.
audio_segment = AudioSegment.from_mp3(AUDIO_FILE_PATH_LOCAL)
show_without_music = AudioSegment.empty()

# --- Speech-to-Text client and request settings ---
client = speech.SpeechClient()
speech_config = speech.types.RecognitionConfig(
    language_code="fr-FR",
    encoding=enums.RecognitionConfig.AudioEncoding.MP3,
    # NOTE(review): sample rate and channel count presumably have to match the
    # input file - confirm against the audio actually being processed.
    sample_rate_hertz=44100,
    audio_channel_count=2,
    # Per-word start/end offsets are what the first solution reads to locate speech.
    enable_word_time_offsets=True,
)
#-------First solution: send the whole file to STT (needs a copy in GCS)-----------
# Run STT on the copy referenced by AUDIO_FILE_PATH_GCS and block until the
# long-running operation returns its result (or the timeout expires).
recognition_audio = {'uri': AUDIO_FILE_PATH_GCS}
operation = client.long_running_recognize(speech_config, recognition_audio)
print(u"Waiting Speech To Text to complete...")
response = operation.result(timeout=2000)

# Interpret results.
# Consecutive results whose best alternative has confidence > 0.9 are merged
# into one run [first_text_start, last_text_end] (in whole seconds). A result
# at or below that confidence closes the current run, which is then copied
# from the input audio into show_without_music.
first_text_start = None
last_text_end = None
for result in response.results:
    best_alternative = result.alternatives[0]
    print(best_alternative)
    if best_alternative.confidence > 0.9:
        # Compare explicitly to None: a start offset of 0 seconds is a valid
        # value but is falsy.
        if first_text_start is None:
            first_text_start = best_alternative.words[0].start_time.seconds
        # Every confident result extends the end of the current run.
        last_text_end = best_alternative.words[-1].end_time.seconds
    elif first_text_start is not None:
        # Low-confidence result: flush the run collected so far. Offsets are
        # in seconds while pydub slices in milliseconds; the extra 1000 ms
        # presumably compensates for `.seconds` dropping the sub-second part
        # of the last word's end time.
        show_without_music += audio_segment[first_text_start * 1000:last_text_end * 1000 + 1000]
        first_text_start = None
        last_text_end = None

# Flush the run still open when the results end. This must be an `is not None`
# test: with a plain truthiness check a run starting at second 0 would be
# silently dropped from the output.
if first_text_start is not None:
    show_without_music += audio_segment[first_text_start * 1000:last_text_end * 1000 + 1000]

# Write output file to disk
show_without_music.export(OUTPUT_FILE_PATH_LOCAL, format="mp3")
# Use this to generate smaller files
# audio_segment = AudioSegment.from_mp3(AUDIO_FILE_PATH_LOCAL)
# audio_segment = audio_segment[0:930*1000]
# audio_segment.export(OUTPUT_FILE_PATH_LOCAL, format="mp3")
#-------Second solution: cut the file into 5-second chunks and send each one to the API-----
# (uses io.BytesIO, so `import io` must be added before enabling this code)
# CHUNKS_SECONDS = 5
#
# speech_config.enable_word_time_offsets = False
# output = io.BytesIO()
# nb_splits = int(len(audio_segment)/1000/CHUNKS_SECONDS)+1
# for i in range(nb_splits):
# start_second, end_second = i*CHUNKS_SECONDS, (i+1)*CHUNKS_SECONDS
# print("processing second %d to second %d" % (start_second, end_second))
# fivesec_audio = audio_segment[start_second*1000:end_second*1000]
# fivesec_audio.export(output, format="mp3")
# recognition_audio = {'content': output.read()}
# output.flush()
# response = client.recognize(speech_config, recognition_audio)
# print(response.results)
# if len(response.results) > 0:
# print("Speech")
# show_without_music += fivesec_audio
# else:
# print("Music")
#
#
# show_without_music.export(OUTPUT_FILE_PATH_LOCAL, format="mp3")
# print("File without music written at %s" % OUTPUT_FILE_PATH_LOCAL)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment