Skip to content

Instantly share code, notes, and snippets.

@thewh1teagle
Last active August 2, 2024 19:12
Show Gist options
  • Select an option

  • Save thewh1teagle/b3f1002c690ac567b4cef0613e0fbfa8 to your computer and use it in GitHub Desktop.

Select an option

Save thewh1teagle/b3f1002c690ac567b4cef0613e0fbfa8 to your computer and use it in GitHub Desktop.
Audio speech segmentation using pyannote
# python3 -m venv venv
# source venv/bin/activate
# pip3 install onnxruntime numpy librosa
# wget https://github.com/pengzhendong/pyannote-onnx/raw/master/pyannote_onnx/segmentation-3.0.onnx
# wget https://github.com/thewh1teagle/sherpa-rs/releases/download/v0.1.0/motivation.wav -Otest.wav
# python3 main.py
import onnxruntime as ort
import librosa
import numpy as np
def init_session(model_path):
    """Create an ONNX Runtime inference session for the model at *model_path*.

    The session is configured with a single inter-op thread, a single
    intra-op thread, and log severity level 3.
    """
    session_options = ort.SessionOptions()
    session_options.log_severity_level = 3
    session_options.intra_op_num_threads = 1
    session_options.inter_op_num_threads = 1
    return ort.InferenceSession(model_path, sess_options=session_options)
def read_wav(path: str):
    """Load the audio file at *path* with librosa, requesting 16 kHz.

    Returns the pair produced by ``librosa.load``, which the caller unpacks
    as ``(samples, sample_rate)``.
    """
    loaded = librosa.load(path, sr=16000)
    return loaded
def _iter_speech_segments(session, samples, sample_rate, frame_size=270,
                          frame_start=721, window_seconds=10):
    """Yield ``(start, end)`` sample offsets of detected speech regions.

    The audio is fed to *session* in consecutive windows of
    ``window_seconds`` seconds. For every output frame the arg-max class is
    taken; class 0 is treated as "no speech" and any other class as speech.
    A segment opens on the first speech frame and closes on the next
    non-speech frame.

    Args:
        session: object exposing ``run(None, {'input': array})`` like an
            ONNX Runtime ``InferenceSession``.
        samples: 1-D array of audio samples.
        sample_rate: samples per second of *samples*.
        frame_size: number of samples between consecutive output frames.
        frame_start: sample offset attributed to the first frame of a window.
        window_seconds: length of each inference window in seconds.

    Yields:
        Tuples ``(start_sample, end_sample)`` for each speech segment.
    """
    # Conv1d & MaxPool1d & SincNet determine frame_size / frame_start:
    # https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
    # https://pytorch.org/docs/stable/generated/torch.nn.MaxPool1d.html
    # https://github.com/pyannote/pyannote-audio/blob/develop/pyannote/audio/models/blocks/sincnet.py#L50-L71
    window_size = sample_rate * window_seconds  # e.g. 10s of samples
    num_samples = len(samples)

    # Pad the end with one window of silence so that every slice taken below
    # is a full window, including the one holding the tail of the audio.
    padded = np.pad(samples, (0, window_size), 'constant')

    is_speaking = False
    segment_start = 0
    offset = frame_start

    # Stopping at num_samples (inclusive) covers all real audio while never
    # producing a short trailing window made only of padding.
    for window_start in range(0, num_samples + 1, window_size):
        window = padded[window_start:window_start + window_size]
        frames: np.ndarray = session.run(
            None, {'input': window[None, None, :]})[0][0]

        # Rebase to this window's start. Carrying the running offset over
        # from the previous window makes timestamps drift whenever
        # len(frames) * frame_size != window_size.
        # NOTE(review): assumes frame i of a window sits at
        # frame_start + i * frame_size samples from the window start —
        # confirm against the model.
        offset = window_start + frame_start

        for probs in frames:
            if np.argmax(probs) != 0:
                if not is_speaking:
                    segment_start = offset
                    is_speaking = True
            elif is_speaking:
                yield segment_start, offset
                is_speaking = False
            offset += frame_size

    # Report a segment that is still open when the frames run out.
    if is_speaking:
        yield segment_start, offset


if __name__ == '__main__':
    session = init_session('segmentation-3.0.onnx')
    samples, sample_rate = read_wav('test.wav')
    for seg_start, seg_end in _iter_speech_segments(session, samples, sample_rate):
        start_s = round(seg_start / sample_rate, 3)
        end_s = round(seg_end / sample_rate, 3)
        print(f'{start_s}s - {end_s}s')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment