Created
March 24, 2022 12:58
-
-
Save astellon/8655419236fc63b182fadde3799d09bc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Julia implementation of [rVAD](https://www.sciencedirect.com/science/article/abs/pii/S0885230819300920). | |
# This implementation is based on [the official Python implementation](https://github.com/zhenghuatan/rVAD/blob/master/rVADfast_py_2.0/rVAD_fast.py) | |
# and the original license is [here](https://github.com/zhenghuatan/rVAD/blob/master/rVADfast_py_2.0/LICENSE). | |
using Base.Iterators: partition | |
using DSP, WAV | |
function loadwav(path) | |
xs, fs = wavread(path) | |
xs = reshape(sum(xs, dims=2)./size(xs, 2), :) # make mono | |
return xs, fs | |
end | |
function enframe(xs, fs, window, hop) | |
arraysplit(xs, floor(Int, fs*window), floor(Int, fs*window) - floor(Int, fs*hop)) | |
end | |
function foractivesegment(f, activity) | |
nframes = length(activity) | |
prv = 0 | |
start = 0 | |
for i in 1:nframes | |
# start of a active segment | |
if activity[i] == 1 && prv == 0 | |
prv = 1 | |
start = i | |
# stop of a active segment | |
elseif (i == nframes || activity[i + 1] == 0) && prv == 1 | |
prv = 0 | |
stop = i | |
f(start, stop) | |
end | |
end | |
end | |
function lpf(xs) | |
b = [0.9770, -0.9770] | |
a = [1.0000, -0.9540] | |
return filt(PolynomialRatio(b, a), xs) | |
end | |
# calculate spectral flatness | |
function sft(xs, fs, window, hop, nfft) | |
ϵ = eps(eltype(xs)) | |
nwindow = floor(Int, fs * window) | |
nhop = floor(Int, fs * hop) | |
noverwrap = nwindow - nhop | |
# amplitude spetrogram | |
A = abs.(stft(xs, nwindow, noverwrap, nfft=nfft, window=hamming)) | |
nbin = nfft/2 + 1 | |
numerator = exp.(1/nbin .* sum(log.(A .+ ϵ), dims=1)) | |
denominator = 1/nbin .* sum(A, dims=1) | |
#spectral flatness | |
sft = @. (numerator + ϵ) / (denominator + ϵ) | |
# flatten | |
reshape(sft, :) | |
end | |
function energydifference(xs, fs, window, hop, segmentsize=200, smooth=18, energyfloor=exp(-50)) | |
frames = enframe(xs, fs, window, hop) | |
nframes = length(frames) | |
perframe = floor(Int, fs*window) | |
e = map(x->max(sum(x.^2), energyfloor), frames) | |
supersegments = partition(e, segmentsize) | |
# estimated energy of noise | |
ẽᵥ = zeros(length(supersegments)) | |
for (i, segment) in enumerate(supersegments) | |
segment = sort(segment) | |
eᵥ = segment[max(floor(Int, length(segment) * 0.1), 1)] | |
if i == 1 | |
ẽᵥ[i] = eᵥ | |
else | |
ẽᵥ[i] = 0.9 * ẽᵥ[i-1] + 0.1 * eᵥ | |
end | |
end | |
# noise energy for each frame | |
# TODO: memory efficient | |
ẽᵥ = repeat(ẽᵥ, inner=segmentsize)[1:nframes] | |
postsnr = @. log10(e) - log10(ẽᵥ) | |
d = zeros(nframes) | |
for i in 2:nframes | |
d[i] = sqrt(abs(e[i] - e[i-1]) * max(postsnr[i], 0)) | |
end | |
d[1] = d[2] | |
# extend to both side | |
# TODO: momory efficient | |
dextended = vcat(ones(smooth) * d[1], d[1:end], ones(smooth) * d[end]) | |
# central-smooth the a posteriori SNR weighted energy difference | |
d̄ = | |
map(1:nframes) do i | |
sum(view(dextended, i:i+smooth*2)) | |
end | |
# threshould | |
# TODO: memory efficient | |
θhe = repeat(0.25 * maximum.(supersegments), inner=segmentsize)[1:nframes] | |
e, ẽᵥ, postsnr, d, d̄, θhe | |
end | |
function pitchdetect(sft, θsft) | |
# detect | |
y = @. Int(sft <= θsft) | |
nframes = length(y) | |
# extended from both ends by 60 frames | |
prv = 0 | |
for i in 1:nframes | |
# extend backward | |
if y[i] == 1 && prv == 0 | |
prv = 1 | |
y[max(i-60, 1):i] .= 1 | |
# extend forward | |
elseif y[i] == 0 && prv == 1 | |
prv = 0 | |
y[i:min(i+60, nframes)] .= 1 | |
end | |
end | |
y | |
end | |
# first pass denoising by using segmentation based on a posteriori SNR weighted energy difference | |
function denoise(xs, pitchblock, fs, window, hop, segmentsize = 200, smooth=18, energyfloor=exp=(-50)) | |
frames = enframe(xs, fs, window, hop) | |
nframes = length(frames) | |
perframe = floor(Int, fs*window) | |
e, ẽᵥ, postsnr, d, d̄, θhe = energydifference(xs, fs, window, hop, segmentsize, smooth, energyfloor) | |
# high energy frames | |
highenergy = d̄ .> θhe | |
ys = copy(xs) | |
foractivesegment(highenergy) do start, stop | |
if sum(view(pitchblock, start:stop)) < 2 | |
ys[1+(start-1)*perframe:stop*perframe] .= 0 | |
end | |
end | |
ys | |
end | |
function detect(xs, pitchblock, fs, window, hop, β, segmentsize = 200, smooth=18, energyfloor=exp(-50)) | |
frames = enframe(xs, fs, window, hop) | |
nframes = length(frames) | |
perframe = floor(Int, fs*window) | |
e, ẽᵥ, postsnr, d, d̄, θhe = energydifference(xs, fs, window, hop, segmentsize, smooth, energyfloor) | |
# vad based on a posteriori SNR weighted energy difference | |
activity1 = zeros(Bool, nframes) | |
foractivesegment(pitchblock) do start, stop | |
θvad = β * 1/sum(pitchblock[start:stop]) * sum(d̄[start:stop] .* pitchblock[start:stop]) | |
@. activity1[start:stop] = d̄[start:stop] > θvad | |
end | |
# postprocessing 1: shrink | |
activity2 = copy(activity1) | |
expl, expr = 33, 47 | |
foractivesegment(activity1) do start, stop | |
p = pitchblock[start:stop] | |
if sum(p) > 0 | |
activity2[start:(findfirst(isequal(1), p)+start-1-expl)] .= 0 | |
activity2[(findlast(isequal(1), p)+start-1+expr):stop] .= 0 | |
end | |
end | |
# postprocessing 2: extend | |
activity3 = copy(activity2) | |
expl, expr = 5, 12 | |
foractivesegment(activity2) do start, stop | |
p = view(pitchblock, start:stop) | |
if sum(p) > 0 | |
activity3[(findfirst(isequal(1), p)+start-1-expl):start] .= 1 | |
activity3[stop:min(findlast(isequal(1), p)+start-1+expr, nframes)] .= 1 | |
end | |
end | |
# cut low energy segments | |
ē = sum(e[activity3]) / sum(activity3) | |
foractivesegment(activity2) do start, stop | |
if @views sum(e[start:stop])/length(start:stop) < ē * 0.05 | |
activity3[start:stop] .= 0 | |
end | |
end | |
activity3 | |
end | |
function tosegment(activity, fs, window, hop) | |
nframes = length(activity) | |
perframe = floor(Int, fs * window) | |
nhop = floor(Int, fs * hop) | |
segments = UnitRange{Int}[] | |
foractivesegment(activity) do start, stop | |
push!(segments, ((start-1) * nhop + 1):((stop-1) * nhop + nframes)) | |
end | |
segments | |
end | |
function vad(xs, fs, window = 0.025, hop = 0.01, nfft = 512, θsft = 0.5, β = 0.4) | |
# detect pitch block | |
flatness = sft(xs, fs, window, hop, nfft) | |
pitches = pitchdetect(flatness, θsft) | |
xs = lpf(xs) | |
# first pass denoise | |
xs = denoise(xs, pitches, fs, window, hop) | |
# activity detection | |
activity = detect(xs, pitches, fs, window, hop, β) | |
# segments index | |
segments = tosegment(activity, fs, window, hop) | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment