Last active
April 20, 2021 13:52
-
-
Save aqzlpm11/403789ef7d4a632e1300b97040b51ed7 to your computer and use it in GitHub Desktop.
python: vad script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import bob.io.audio | |
| import bob.kaldi # https://www.idiap.ch/software/bob/docs/bob/docs/stable/bob/doc/install.html | |
| import matplotlib.pyplot as plt | |
def vad(sig, sr, vad_energy_mean_scale=0.5, vad_energy_th=9, vad_frames_context=20, vad_proportion_th=0.4):
    """ Energy-based Voice Activity Detection (delegates to Kaldi via bob.kaldi).

    Param:
        sig: list or np.array
            the signal, list of samples
        sr: int
            sample rate
        vad_energy_mean_scale: :obj:`float`, optional
            If this is set to s, to get the actual threshold we let m be the mean
            log-energy of the file, and use s*m + vad-energy-th
        vad_energy_th: :obj:`float`, optional
            Constant term in energy threshold for MFCC0 for VAD.
        vad_frames_context: :obj:`int`, optional
            Number of frames of context on each side of central frame,
            in window for which energy is monitored
        vad_proportion_th: :obj:`float`, optional
            Parameter controlling the proportion of frames within the window that
            need to have more energy than the threshold
    Return:
        list of [start_sample, end_sample] pairs, one per detected speech
        section; each pair is half-open (start inclusive, end exclusive).
    """
    # Work on a copy: per the original author's note, the Kaldi VAD call
    # modifies the signal it is given.
    frame_labels = bob.kaldi.compute_vad(
        sig.copy(), sr,
        vad_energy_mean_scale=vad_energy_mean_scale,
        vad_energy_th=vad_energy_th,
        vad_frames_context=vad_frames_context,
        vad_proportion_th=vad_proportion_th)

    def _pos_section(arr, th=0.5):
        """ Yield half-open (left, right) index ranges of consecutive items >= th. """
        total = len(arr)
        pos = 0
        while pos < total:
            if arr[pos] < th:
                # Below threshold: keep scanning for the start of a section.
                pos += 1
                continue
            begin = pos
            # Consume the whole run of items at or above the threshold.
            while pos < total and arr[pos] >= th:
                pos += 1
            yield begin, pos
            # The item at `pos` (if any) just failed the >= test; step over it.
            pos += 1

    # Frame shift in seconds used to map frame indices to sample indices
    # (Kaldi default according to the original author).
    VAD_WINDOW_SHIFT = 0.010
    return [
        [int(left * sr * VAD_WINDOW_SHIFT), int(right * sr * VAD_WINDOW_SHIFT)]
        for left, right in _pos_section(frame_labels)
    ]
def vad_plot(sig, sr, vad_result):
    """ Plot a waveform with the detected speech sections shaded in green.

    Param:
        sig: list or np.array
            the signal, list of samples
        sr: int
            sample rate, used to convert sample indices to seconds
        vad_result: list of [start_sample, end_sample]
            sections to highlight, expressed in sample indices
    Draws on the current matplotlib figure; does not call plt.show().
    """
    # Imported here because this script's import section does not bring
    # numpy into scope, which made every use of `np` below a NameError.
    import numpy as np

    # Accept plain lists too: abs() on a list raises TypeError.
    sig = np.asarray(sig)
    # Shade slightly beyond the peak amplitude so the band covers the waveform.
    max_h = np.max(np.abs(sig)) * 1.1
    for s in vad_result:
        plt.fill_between(np.array(s) / sr, -max_h, max_h, facecolor='green', alpha=0.3)
    plt.plot(np.arange(len(sig)) / sr, sig)
    plt.xlabel("Time(s)")
# Demo: read 'sample.wav', run the VAD on it and plot the detected sections.
wav = 'sample.wav'
data = bob.io.audio.reader(wav)
# Element 0 of the loaded data is used as the signal
# (presumably the first channel -- confirm against bob.io.audio).
sig = data.load()[0]
sr = data.rate
vad_result = vad(sig, sr)
vad_plot(sig, sr, vad_result)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import librosa | |
| import numpy as np | |
def vad_kaldi_like(y,
                   sr,
                   frame_length_ms=20,
                   frame_shift_ms=10,
                   energy_threshold=-2.5,
                   frames_content=20,
                   proportion_threshold=0.4,
                   vad_energy_mean_scale=0.6
                   ):
    """ Frame-level energy VAD modeled on Kaldi's implementation.

    The parameters are broadly usable but have not been tuned.

    y: pcm array
    sr: sample rate
    frame_length_ms: frame_length in ms
    frame_shift_ms: frame_shift in ms
    energy_threshold: constant term of the log-energy threshold
    vad_energy_mean_scale: if non-zero, the threshold becomes
        energy_threshold + vad_energy_mean_scale * mean(log-energy)
    frames_content: number of context frames on each side of the current frame
    proportion_threshold: a frame is marked as detected when at least this
        proportion of the frames in its context window exceed the threshold

    Returns a list with one int per frame: 1 for detected, 0 otherwise.
    Raises ValueError if vad_energy_mean_scale is negative.
    """
    frame_length = int(sr * frame_length_ms / 1000)
    hop_length = int(sr * frame_shift_ms / 1000)
    # `y` must be passed by keyword: recent librosa releases reject
    # positional arguments here.
    energy = librosa.feature.rms(
        y=y, frame_length=frame_length, hop_length=hop_length)[0, :]
    # Small offset keeps log() finite on silent (all-zero) frames.
    log_energy = np.log(energy + 1e-8)

    threshold = energy_threshold
    if vad_energy_mean_scale != 0.0:
        if vad_energy_mean_scale < 0:
            raise ValueError("vad_energy_mean_scale must be non-negative")
        threshold += vad_energy_mean_scale * np.mean(log_energy)

    # For every frame t, count the frames inside [t - context, t + context]
    # (clipped to the signal) whose log-energy exceeds the threshold.
    # A prefix sum gives each window's count in O(1).
    num_frames = len(log_energy)
    above = log_energy > threshold
    prefix = np.concatenate(([0], np.cumsum(above)))
    centers = np.arange(num_frames)
    lo = np.clip(centers - frames_content, 0, num_frames)
    hi = np.clip(centers + frames_content + 1, lo, num_frames)
    num_count = prefix[hi] - prefix[lo]
    den_count = hi - lo
    return (num_count >= den_count * proportion_threshold).astype(int).tolist()
def vad(y,
        sr,
        frame_length_ms=20,
        frame_shift_ms=5,
        **kwargs):
    """ Detect voiced sections of a signal and return them as sample ranges.

    y: pcm array
    sr: sample rate
    frame_length_ms: frame_length in ms
    frame_shift_ms: frame_shift in ms
    kwargs: forwarded unchanged to vad_kaldi_like

    Returns a list of [start_sample, end_sample] pairs, one per run of
    consecutive voiced frames, in order of appearance.
    """
    frame_vad_info = vad_kaldi_like(y, sr, frame_length_ms=frame_length_ms, frame_shift_ms=frame_shift_ms, **kwargs)
    total_samples = len(y)

    def _frame_to_sample_point(i):
        return int(i * frame_shift_ms / 1000 * sr)

    result = []
    num_frames = len(frame_vad_info)
    run_start = 0
    # Iterate one past the last frame so that the run still open at the end
    # of the signal is closed too. Without this, audio that ends while
    # voiced (or is voiced throughout) would lose its final section.
    for i in range(1, num_frames + 1):
        if i < num_frames and frame_vad_info[i] == frame_vad_info[run_start]:
            continue
        if frame_vad_info[run_start] == 1:  # only voiced part
            # Clamp the end so it never points past the signal.
            end = min(_frame_to_sample_point(i), total_samples)
            result.append([_frame_to_sample_point(run_start), end])
        run_start = i
    return result
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment