@aqzlpm11
Last active April 20, 2021 13:52
python: vad script
import bob.io.audio
import bob.kaldi  # https://www.idiap.ch/software/bob/docs/bob/docs/stable/bob/doc/install.html
import matplotlib.pyplot as plt
import numpy as np
def vad(sig, sr, vad_energy_mean_scale=0.5, vad_energy_th=9, vad_frames_context=20, vad_proportion_th=0.4):
    """Energy-based Voice Activity Detection (based on Kaldi).

    Param:
        sig: list or np.array
            the signal, list of samples
        sr: int
            sample rate
        vad_energy_mean_scale: :obj:`float`, optional
            If this is set to s, to get the actual threshold we let m be the mean
            log-energy of the file, and use s * m + vad_energy_th
        vad_energy_th: :obj:`float`, optional
            Constant term in the energy threshold for MFCC0 for VAD.
        vad_frames_context: :obj:`int`, optional
            Number of frames of context on each side of the central frame,
            in the window for which energy is monitored
        vad_proportion_th: :obj:`float`, optional
            Parameter controlling the proportion of frames within the window that
            need to have more energy than the threshold

    Return:
        list of [start_sample, end_sample) pairs for the voiced regions
    """
    sig = sig.copy()  # Kaldi's VAD modifies the signal in place
    VAD_labels = bob.kaldi.compute_vad(sig, sr,
                                       vad_energy_mean_scale=vad_energy_mean_scale,
                                       vad_energy_th=vad_energy_th,
                                       vad_frames_context=vad_frames_context,
                                       vad_proportion_th=vad_proportion_th)

    # convert frame-level labels to sample indices
    def _pos_section(arr, th=0.5):
        """
        Yield sections [left, right) where arr >= th (left-closed, right-open intervals).
        """
        cur_i = 0
        while True:
            # skip items below the threshold
            while cur_i < len(arr) and arr[cur_i] < th:
                cur_i += 1
            if cur_i >= len(arr):
                break
            # iterate over items at or above the threshold
            left_i = cur_i
            while cur_i < len(arr) and arr[cur_i] >= th:
                cur_i += 1
            yield left_i, cur_i
            cur_i += 1

    VAD_WINDOW_SHIFT = 0.010  # kaldi default
    ret = []
    for section in _pos_section(VAD_labels):
        ret.append([int(section[0] * sr * VAD_WINDOW_SHIFT), int(section[1] * sr * VAD_WINDOW_SHIFT)])
    return ret
def vad_plot(sig, sr, vad_result):
    """Plot the waveform and shade the detected voiced regions."""
    max_h = np.max(abs(sig)) * 1.1
    for s in vad_result:
        plt.fill_between(np.array(s) / sr, -max_h, max_h, facecolor='green', alpha=0.3)
    plt.plot(np.array(range(len(sig))) / sr, sig)
    plt.xlabel("Time (s)")
wav = 'sample.wav'
data = bob.io.audio.reader(wav)
sig, sr = data.load()[0], data.rate
vad_result = vad(sig, sr)
vad_plot(sig, sr, vad_result)
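# Optional sketch (not part of the original gist): print the detected segments in
# seconds and display the plot; this reuses `vad_result`, `sr`, and `plt` from above.
for start, end in vad_result:
    print("voiced: %.2f s - %.2f s" % (start / sr, end / sr))
plt.show()
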
# Alternative implementation: a Kaldi-like energy VAD using only librosa and numpy
# (no bob dependency).
import librosa
import numpy as np


def vad_kaldi_like(y,
                   sr,
                   frame_length_ms=20,
                   frame_shift_ms=10,
                   energy_threshold=-2.5,
                   frames_context=20,
                   proportion_threshold=0.4,
                   vad_energy_mean_scale=0.6
                   ):
    """Energy-based VAD modeled on Kaldi's implementation. The parameters are
    basically usable but still need tuning.

    y: pcm array
    sr: sample rate
    frame_length_ms: frame length in ms
    frame_shift_ms: frame shift in ms
    energy_threshold: constant term of the log-energy threshold
    vad_energy_mean_scale: scale of the mean log-energy added to energy_threshold
    frames_context: number of context frames on each side of the current frame
    proportion_threshold: if at least this proportion of frames in the context
        window exceed the threshold, the current frame is marked as voiced
    """
    energy = librosa.feature.rms(y=y, frame_length=int(
        sr * frame_length_ms / 1000), hop_length=int(sr * frame_shift_ms / 1000))[0, :]
    log_energy = np.log(energy + 1e-8)
    if vad_energy_mean_scale != 0.0:
        assert vad_energy_mean_scale > 0
        energy_threshold += vad_energy_mean_scale * np.mean(log_energy)
    res = []
    for t in range(len(log_energy)):
        num_count = 0
        den_count = 0
        for t2 in range(t - frames_context, t + frames_context + 1):
            if 0 <= t2 < len(log_energy):
                den_count += 1
                if log_energy[t2] > energy_threshold:
                    num_count += 1
        if num_count >= den_count * proportion_threshold:
            res.append(1)
        else:
            res.append(0)
    return res
def vad(y,
        sr,
        frame_length_ms=20,
        frame_shift_ms=5,
        **kwargs):
    """Run vad_kaldi_like and convert frame-level decisions to [start_sample, end_sample) pairs."""
    frame_vad_info = vad_kaldi_like(y, sr, frame_length_ms=frame_length_ms, frame_shift_ms=frame_shift_ms, **kwargs)

    def _frame_to_sample_point(i):
        return int(i * frame_shift_ms / 1000 * sr)

    result = []

    def _change_point(l, r):
        assert frame_vad_info[l] == frame_vad_info[r]
        if frame_vad_info[l] == 1:  # only keep voiced parts
            result.append([_frame_to_sample_point(l), _frame_to_sample_point(r + 1)])

    pre = 0
    for i in range(len(frame_vad_info) - 1):
        if frame_vad_info[i] != frame_vad_info[i + 1]:
            _change_point(pre, i)
            pre = i + 1
    # flush the final run, otherwise a trailing voiced segment would be dropped
    if len(frame_vad_info) > 0:
        _change_point(pre, len(frame_vad_info) - 1)
    return result
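# A minimal usage sketch (not part of the original gist), assuming a `sample.wav`
# file exists on disk; librosa.load is used here just for illustration.
if __name__ == "__main__":
    y, sr = librosa.load("sample.wav", sr=16000)
    for start, end in vad(y, sr):
        print("voiced: %.2f s - %.2f s" % (start / sr, end / sr))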