Skip to content

Instantly share code, notes, and snippets.

@astellon
Created March 24, 2022 12:58
Show Gist options
  • Save astellon/8655419236fc63b182fadde3799d09bc to your computer and use it in GitHub Desktop.
Save astellon/8655419236fc63b182fadde3799d09bc to your computer and use it in GitHub Desktop.
# Julia implementation of [rVAD](https://www.sciencedirect.com/science/article/abs/pii/S0885230819300920).
# This implementation is based on [the official Python implementation](https://github.com/zhenghuatan/rVAD/blob/master/rVADfast_py_2.0/rVAD_fast.py)
# and the original license is [here](https://github.com/zhenghuatan/rVAD/blob/master/rVADfast_py_2.0/LICENSE).
using Base.Iterators: partition
using DSP, WAV
function loadwav(path)
xs, fs = wavread(path)
xs = reshape(sum(xs, dims=2)./size(xs, 2), :) # make mono
return xs, fs
end
function enframe(xs, fs, window, hop)
arraysplit(xs, floor(Int, fs*window), floor(Int, fs*window) - floor(Int, fs*hop))
end
function foractivesegment(f, activity)
nframes = length(activity)
prv = 0
start = 0
for i in 1:nframes
# start of a active segment
if activity[i] == 1 && prv == 0
prv = 1
start = i
# stop of a active segment
elseif (i == nframes || activity[i + 1] == 0) && prv == 1
prv = 0
stop = i
f(start, stop)
end
end
end
function lpf(xs)
b = [0.9770, -0.9770]
a = [1.0000, -0.9540]
return filt(PolynomialRatio(b, a), xs)
end
# calculate spectral flatness
function sft(xs, fs, window, hop, nfft)
ϵ = eps(eltype(xs))
nwindow = floor(Int, fs * window)
nhop = floor(Int, fs * hop)
noverwrap = nwindow - nhop
# amplitude spetrogram
A = abs.(stft(xs, nwindow, noverwrap, nfft=nfft, window=hamming))
nbin = nfft/2 + 1
numerator = exp.(1/nbin .* sum(log.(A .+ ϵ), dims=1))
denominator = 1/nbin .* sum(A, dims=1)
#spectral flatness
sft = @. (numerator + ϵ) / (denominator + ϵ)
# flatten
reshape(sft, :)
end
function energydifference(xs, fs, window, hop, segmentsize=200, smooth=18, energyfloor=exp(-50))
frames = enframe(xs, fs, window, hop)
nframes = length(frames)
perframe = floor(Int, fs*window)
e = map(x->max(sum(x.^2), energyfloor), frames)
supersegments = partition(e, segmentsize)
# estimated energy of noise
ẽᵥ = zeros(length(supersegments))
for (i, segment) in enumerate(supersegments)
segment = sort(segment)
eᵥ = segment[max(floor(Int, length(segment) * 0.1), 1)]
if i == 1
ẽᵥ[i] = eᵥ
else
ẽᵥ[i] = 0.9 * ẽᵥ[i-1] + 0.1 * eᵥ
end
end
# noise energy for each frame
# TODO: memory efficient
ẽᵥ = repeat(ẽᵥ, inner=segmentsize)[1:nframes]
postsnr = @. log10(e) - log10(ẽᵥ)
d = zeros(nframes)
for i in 2:nframes
d[i] = sqrt(abs(e[i] - e[i-1]) * max(postsnr[i], 0))
end
d[1] = d[2]
# extend to both side
# TODO: momory efficient
dextended = vcat(ones(smooth) * d[1], d[1:end], ones(smooth) * d[end])
# central-smooth the a posteriori SNR weighted energy difference
d̄ =
map(1:nframes) do i
sum(view(dextended, i:i+smooth*2))
end
# threshould
# TODO: memory efficient
θhe = repeat(0.25 * maximum.(supersegments), inner=segmentsize)[1:nframes]
e, ẽᵥ, postsnr, d, d̄, θhe
end
function pitchdetect(sft, θsft)
# detect
y = @. Int(sft <= θsft)
nframes = length(y)
# extended from both ends by 60 frames
prv = 0
for i in 1:nframes
# extend backward
if y[i] == 1 && prv == 0
prv = 1
y[max(i-60, 1):i] .= 1
# extend forward
elseif y[i] == 0 && prv == 1
prv = 0
y[i:min(i+60, nframes)] .= 1
end
end
y
end
# first pass denoising by using segmentation based on a posteriori SNR weighted energy difference
function denoise(xs, pitchblock, fs, window, hop, segmentsize = 200, smooth=18, energyfloor=exp=(-50))
frames = enframe(xs, fs, window, hop)
nframes = length(frames)
perframe = floor(Int, fs*window)
e, ẽᵥ, postsnr, d, d̄, θhe = energydifference(xs, fs, window, hop, segmentsize, smooth, energyfloor)
# high energy frames
highenergy = d̄ .> θhe
ys = copy(xs)
foractivesegment(highenergy) do start, stop
if sum(view(pitchblock, start:stop)) < 2
ys[1+(start-1)*perframe:stop*perframe] .= 0
end
end
ys
end
function detect(xs, pitchblock, fs, window, hop, β, segmentsize = 200, smooth=18, energyfloor=exp(-50))
frames = enframe(xs, fs, window, hop)
nframes = length(frames)
perframe = floor(Int, fs*window)
e, ẽᵥ, postsnr, d, d̄, θhe = energydifference(xs, fs, window, hop, segmentsize, smooth, energyfloor)
# vad based on a posteriori SNR weighted energy difference
activity1 = zeros(Bool, nframes)
foractivesegment(pitchblock) do start, stop
θvad = β * 1/sum(pitchblock[start:stop]) * sum(d̄[start:stop] .* pitchblock[start:stop])
@. activity1[start:stop] = d̄[start:stop] > θvad
end
# postprocessing 1: shrink
activity2 = copy(activity1)
expl, expr = 33, 47
foractivesegment(activity1) do start, stop
p = pitchblock[start:stop]
if sum(p) > 0
activity2[start:(findfirst(isequal(1), p)+start-1-expl)] .= 0
activity2[(findlast(isequal(1), p)+start-1+expr):stop] .= 0
end
end
# postprocessing 2: extend
activity3 = copy(activity2)
expl, expr = 5, 12
foractivesegment(activity2) do start, stop
p = view(pitchblock, start:stop)
if sum(p) > 0
activity3[(findfirst(isequal(1), p)+start-1-expl):start] .= 1
activity3[stop:min(findlast(isequal(1), p)+start-1+expr, nframes)] .= 1
end
end
# cut low energy segments
ē = sum(e[activity3]) / sum(activity3)
foractivesegment(activity2) do start, stop
if @views sum(e[start:stop])/length(start:stop) < ē * 0.05
activity3[start:stop] .= 0
end
end
activity3
end
function tosegment(activity, fs, window, hop)
nframes = length(activity)
perframe = floor(Int, fs * window)
nhop = floor(Int, fs * hop)
segments = UnitRange{Int}[]
foractivesegment(activity) do start, stop
push!(segments, ((start-1) * nhop + 1):((stop-1) * nhop + nframes))
end
segments
end
function vad(xs, fs, window = 0.025, hop = 0.01, nfft = 512, θsft = 0.5, β = 0.4)
# detect pitch block
flatness = sft(xs, fs, window, hop, nfft)
pitches = pitchdetect(flatness, θsft)
xs = lpf(xs)
# first pass denoise
xs = denoise(xs, pitches, fs, window, hop)
# activity detection
activity = detect(xs, pitches, fs, window, hop, β)
# segments index
segments = tosegment(activity, fs, window, hop)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment