Last active
April 25, 2024 23:57
-
-
Save 0187773933/1d5f4f127ec5986c1026788c1aff3e40 to your computer and use it in GitHub Desktop.
Runs Google Media Pipe Yamnet Audio Classification on MP3 File
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from pprint import pprint | |
import librosa | |
import tensorflow as tf | |
# https://storage.googleapis.com/mediapipe-models/audio_classifier/yamnet/float32/latest/yamnet.tflite | |
# https://github.com/tensorflow/models/blob/master/research/audioset/yamnet/yamnet.py | |
# https://research.google.com/audioset/ontology/index.html | |
# https://storage.googleapis.com/mediapipe-tasks/audio_classifier/yamnet_label_list.txt | |
# https://github.com/tensorflow/models/blob/master/research/audioset/yamnet/params.py#L25 | |
# YAMNet feature-extraction / model parameters (values mirror the upstream
# params.py linked above).
sample_rate = 16000.0        # Hz; YAMNet expects 16 kHz mono audio
stft_window_seconds = 0.025  # STFT analysis window length (seconds)
stft_hop_seconds = 0.010     # hop between successive STFT frames (seconds)
mel_bands = 64               # number of mel filterbank bands
mel_min_hz = 125.0           # lowest mel band edge (Hz)
mel_max_hz = 7500.0          # highest mel band edge (Hz)
log_offset = 0.001           # offset added before log compression
# patch_window_seconds = 0.96 # or 0.975 ?
patch_window_seconds = 0.975  # seconds of audio fed to the model per patch
patch_hop_seconds = 0.48      # hop between patches (seconds)

# Load the TFLite YAMNet model from disk and cache its tensor metadata for
# use by process_audio_segment().
interpreter = tf.lite.Interpreter( model_path='./yamnet.tflite' )
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
def read_text( file_path ):
    """Read a text file and return its lines as a list (newlines stripped)."""
    with open( file_path ) as handle:
        content = handle.read()
    return content.splitlines()
def process_audio_segment( segment ):
    """Run one fixed-length waveform patch through the TFLite YAMNet model.

    Casts the patch to float32 (the interpreter's expected input dtype),
    invokes the model, and returns the class scores as a flat 1-D array.
    """
    input_index = input_details[ 0 ][ "index" ]
    output_index = output_details[ 0 ][ "index" ]
    interpreter.set_tensor( input_index , segment.astype( np.float32 ) )
    interpreter.invoke()
    scores = interpreter.get_tensor( output_index )
    return scores.flatten()
if __name__ == "__main__":
    # Human-readable class names, one per model output index.
    model_labels = read_text( "./yamnet_label_list.txt" )

    # classify
    audio_path = './Tradition.mp3'
    # librosa resamples the file to the model's expected 16 kHz rate.
    audio_data , sr = librosa.load( audio_path , sr=sample_rate )

    patch_length = patch_window_seconds * sample_rate  # samples per patch (float)
    hop_length = int( patch_length / 2 )               # 50% overlap between patches
    last_start = int( len( audio_data ) - ( patch_length + 1 ) )

    # Slide a half-overlapping window across the waveform and classify each patch.
    results = []
    for offset in range( 0 , last_start , hop_length ):  # 50% overlap
        patch = audio_data[ offset : offset + int( patch_length ) ]
        results.append( process_audio_segment( patch ) )

    # Report the top-scoring labels (positive scores only) for every patch.
    total_results = len( results )
    for index , scores in enumerate( results ):
        print( f"\nSection [{index+1}] of {total_results}" )
        positive_pairs = [ pair for pair in zip( model_labels , scores ) if pair[ 1 ] > 0 ]
        ranked = sorted( positive_pairs , key=lambda pair: pair[ 1 ] , reverse=True )
        for label , probability in ranked[ : 19 ]:
            print( f"{label}: {probability}" )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment