Last active
October 6, 2016 07:53
-
-
Save muhrifqii/785de7844437e114a24b45c91a1d71f2 to your computer and use it in GitHub Desktop.
RASTA-PLP in MATLAB
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function [aspectrum,wts] = audspec(pspectrum, sr, nfilts, fbtype, minfreq, maxfreq, sumpower, bwidth) | |
%[aspectrum,wts] = audspec(pspectrum, sr, nfilts, fbtype, minfreq, maxfreq, sumpower, bwidth) | |
% | |
% perform critical band analysis (see PLP) | |
% takes power spectrogram as input | |
if nargin < 2; sr = 16000; end | |
if nargin < 3; nfilts = ceil(hz2bark(sr/2))+1; end | |
if nargin < 4; fbtype = 'bark'; end | |
if nargin < 5; minfreq = 0; end | |
if nargin < 6; maxfreq = sr/2; end | |
if nargin < 7; sumpower = 1; end | |
if nargin < 8; bwidth = 1.0; end | |
[nfreqs,nframes] = size(pspectrum); | |
nfft = (nfreqs-1)*2; | |
if strcmp(fbtype, 'bark') | |
wts = fft2barkmx(nfft, sr, nfilts, bwidth, minfreq, maxfreq); | |
elseif strcmp(fbtype, 'mel') | |
wts = fft2melmx(nfft, sr, nfilts, bwidth, minfreq, maxfreq); | |
elseif strcmp(fbtype, 'htkmel') | |
wts = fft2melmx(nfft, sr, nfilts, bwidth, minfreq, maxfreq, 1, 1); | |
elseif strcmp(fbtype, 'fcmel') | |
wts = fft2melmx(nfft, sr, nfilts, bwidth, minfreq, maxfreq, 1, 0); | |
else | |
disp(['fbtype ', fbtype, ' not recognized']); | |
error; | |
end | |
wts = wts(:, 1:nfreqs); | |
% Integrate FFT bins into Mel bins, in abs or abs^2 domains: | |
if (sumpower) | |
aspectrum = wts * pspectrum; | |
else | |
aspectrum = (wts * sqrt(pspectrum)).^2; | |
end | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function y = dolpc(x,modelorder) | |
%y = dolpc(x,modelorder) | |
% | |
% compute autoregressive model from spectral magnitude samples | |
% | |
% rows(x) = critical band | |
% col(x) = frame | |
% | |
% row(y) = lpc a_i coeffs, scaled by gain | |
% col(y) = frame | |
% | |
% modelorder is order of model, defaults to 8 | |
% 2003-04-12 [email protected] after [email protected] | |
[nbands,nframes] = size(x); | |
if nargin < 2 | |
modelorder = 8; | |
end | |
% Calculate autocorrelation | |
r = real(ifft([x;x([(nbands-1):-1:2],:)])); | |
% First half only | |
r = r(1:nbands,:); | |
% Find LPC coeffs by durbin | |
[y,e] = levinson(r, modelorder); | |
% Normalize each poly by gain | |
y = y'./repmat(e',(modelorder+1),1); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function y = lifter(x, lift, invs) | |
% y = lifter(x, lift, invs) | |
% Apply lifter to matrix of cepstra (one per column) | |
% lift = exponent of x i^n liftering | |
% or, as a negative integer, the length of HTK-style sin-curve liftering. | |
% If inverse == 1 (default 0), undo the liftering. | |
% 2005-05-19 [email protected] | |
if nargin < 2; lift = 0.6; end % liftering exponent | |
if nargin < 3; invs = 0; end % flag to undo liftering | |
[ncep, nfrm] = size(x); | |
if lift == 0 | |
y = x; | |
else | |
if lift > 0 | |
if lift > 10 | |
disp(['Unlikely lift exponent of ', num2str(lift),' (did you mean -ve?)']); | |
end | |
liftwts = [1, ([1:(ncep-1)].^lift)]; | |
elseif lift < 0 | |
% Hack to support HTK liftering | |
L = -lift; | |
if (L ~= round(L)) | |
disp(['HTK liftering value ', num2str(L),' must be integer']); | |
end | |
liftwts = [1, (1+L/2*sin([1:(ncep-1)]*pi/L))]; | |
end | |
if (invs) | |
liftwts = 1./liftwts; | |
end | |
y = diag(liftwts)*x; | |
end |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function features = lpc2cep(a,nout) | |
% features = lpc2cep(lpcas,nout) | |
% Convert the LPC 'a' coefficients in each column of lpcas | |
% into frames of cepstra. | |
% nout is number of cepstra to produce, defaults to size(lpcas,1) | |
% 2003-04-11 [email protected] | |
[nin, ncol] = size(a); | |
order = nin - 1; | |
if nargin < 2 | |
nout = order + 1; | |
end | |
c = zeros(nout, ncol); | |
% Code copied from HSigP.c: LPC2Cepstrum | |
% First cep is log(Error) from Durbin | |
c(1,:) = -log(a(1,:)); | |
% Renormalize lpc A coeffs | |
a = a ./ repmat(a(1,:), nin, 1); | |
for n = 2:nout | |
sum = 0; | |
for m = 2:n | |
sum = sum + (n - m) * a(m,:) .* c(n - m + 1, :); | |
end | |
c(n,:) = -(a(n,:) + sum / (n-1)); | |
end | |
features = c; | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function [cepstra,aspectrum,pspectrum] = melfcc(samples, sr, varargin) | |
%[cepstra,aspectrum,pspectrum] = melfcc(samples, sr[, opts ...]) | |
% Calculate Mel-frequency cepstral coefficients by: | |
% - take the absolute value of the STFT | |
% - warp to a Mel frequency scale | |
% - take the DCT of the log-Mel-spectrum | |
% - return the first <ncep> components | |
% This version allows a lot of options to be controlled, as optional | |
% 'name', value pairs from the 3rd argument on: (defaults in parens) | |
% 'wintime' (0.025): window length in sec | |
% 'hoptime' (0.010): step between successive windows in sec | |
% 'numcep' (13): number of cepstra to return | |
% 'lifterexp' (0.6): exponent for liftering; 0 = none; < 0 = HTK sin lifter | |
% 'sumpower' (1): 1 = sum abs(fft)^2; 0 = sum abs(fft) | |
% 'preemph' (0.97): apply pre-emphasis filter [1 -preemph] (0 = none) | |
% 'dither' (0): 1 = add offset to spectrum as if dither noise | |
% 'minfreq' (0): lowest band edge of mel filters (Hz) | |
% 'maxfreq' (4000): highest band edge of mel filters (Hz) | |
% 'nbands' (40): number of warped spectral bands to use | |
% 'bwidth' (1.0): width of aud spec filters relative to default | |
% 'dcttype' (2): type of DCT used - 1 or 2 (or 3 for HTK or 4 for feac) | |
% 'fbtype' ('mel'): frequency warp: 'mel','bark','htkmel','fcmel' | |
% 'usecmp' (0): apply equal-loudness weighting and cube-root compr. | |
% 'modelorder' (0): if > 0, fit a PLP model of this order | |
% 'broaden' (0): flag to retain the (useless?) first and last bands | |
% 'useenergy' (0): overwrite C0 with true log energy | |
% The following non-default values nearly duplicate Malcolm Slaney's mfcc | |
% (i.e. melfcc(d,16000,opts...) =~= log(10)*2*mfcc(d*(2^17),16000) ) | |
% 'wintime': 0.016 | |
% 'lifterexp': 0 | |
% 'minfreq': 133.33 | |
% 'maxfreq': 6855.6 | |
% 'sumpower': 0 | |
% The following non-default values nearly duplicate HTK's MFCC | |
% (i.e. melfcc(d,16000,opts...) =~= 2*htkmelfcc(:,[13,[1:12]])' | |
% where HTK config has PREEMCOEF = 0.97, NUMCHANS = 20, CEPLIFTER = 22, | |
% NUMCEPS = 12, WINDOWSIZE = 250000.0, USEHAMMING = T, TARGETKIND = MFCC_0) | |
% 'lifterexp': -22 | |
% 'nbands': 20 | |
% 'maxfreq': 8000 | |
% 'sumpower': 0 | |
% 'fbtype': 'htkmel' | |
% 'dcttype': 3 | |
% For more detail on reproducing other programs' outputs, see | |
% http://www.ee.columbia.edu/~dpwe/resources/matlab/rastamat/mfccs.html | |
% | |
% 2005-04-19 [email protected] after rastaplp.m. | |
% Uses Mark Paskin's process_options.m from KPMtools | |
% $Header: /Users/dpwe/matlab/rastamat/RCS/melfcc.m,v 1.3 2012/09/03 14:01:26 dpwe Exp dpwe $ | |
if nargin < 2; sr = 16000; end | |
% Parse out the optional arguments | |
[wintime, hoptime, numcep, lifterexp, sumpower, preemph, dither, ... | |
minfreq, maxfreq, nbands, bwidth, dcttype, fbtype, usecmp, modelorder, ... | |
broaden, useenergy] = ... | |
process_options(varargin, 'wintime', 0.025, 'hoptime', 0.010, ... | |
'numcep', 13, 'lifterexp', 0.6, 'sumpower', 1, 'preemph', 0.97, ... | |
'dither', 0, 'minfreq', 0, 'maxfreq', 4000, ... | |
'nbands', 40, 'bwidth', 1.0, 'dcttype', 2, ... | |
'fbtype', 'mel', 'usecmp', 0, 'modelorder', 0, ... | |
'broaden', 0, 'useenergy', 0); | |
if preemph ~= 0 | |
samples = filter([1 -preemph], 1, samples); | |
end | |
% Compute FFT power spectrum | |
[pspectrum,logE] = powspec(samples, sr, wintime, hoptime, dither); | |
aspectrum = audspec(pspectrum, sr, nbands, fbtype, minfreq, maxfreq, sumpower, bwidth); | |
if (usecmp) | |
% PLP-like weighting/compression | |
aspectrum = postaud(aspectrum, maxfreq, fbtype, broaden); | |
end | |
if modelorder > 0 | |
if (dcttype ~= 1) | |
disp(['warning: plp cepstra are implicitly dcttype 1 (not ', num2str(dcttype), ')']); | |
end | |
% LPC analysis | |
lpcas = dolpc(aspectrum, modelorder); | |
% convert lpc to cepstra | |
cepstra = lpc2cep(lpcas, numcep); | |
% Return the auditory spectrum corresponding to the cepstra? | |
% aspectrum = lpc2spec(lpcas, nbands); | |
% else return the aspectrum that the cepstra are based on, prior to PLP | |
else | |
% Convert to cepstra via DCT | |
cepstra = spec2cep(aspectrum, numcep, dcttype); | |
end | |
cepstra = lifter(cepstra, lifterexp); | |
if useenergy | |
cepstra(1,:) = logE; | |
end | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function [y,eql] = postaud(x,fmax,fbtype,broaden) | |
%y = postaud(x, fmax, fbtype) | |
% | |
% do loudness equalization and cube root compression | |
% x = critical band filters | |
% rows = critical bands | |
% cols = frames | |
if nargin < 3 | |
fbtype = 'bark'; | |
end | |
if nargin < 4 | |
% By default, don't add extra flanking bands | |
broaden = 0; | |
end | |
[nbands,nframes] = size(x); | |
% equal loundness weights stolen from rasta code | |
%eql = [0.000479 0.005949 0.021117 0.044806 0.073345 0.104417 0.137717 ... | |
% 0.174255 0.215590 0.263260 0.318302 0.380844 0.449798 0.522813 | |
% 0.596597]; | |
% Include frequency points at extremes, discard later | |
nfpts = nbands+2*broaden; | |
if strcmp(fbtype, 'bark') | |
bandcfhz = bark2hz(linspace(0, hz2bark(fmax), nfpts)); | |
elseif strcmp(fbtype, 'mel') | |
bandcfhz = mel2hz(linspace(0, hz2mel(fmax), nfpts)); | |
elseif strcmp(fbtype, 'htkmel') || strcmp(fbtype, 'fcmel') | |
bandcfhz = mel2hz(linspace(0, hz2mel(fmax,1), nfpts),1); | |
else | |
disp(['unknown fbtype', fbtype]); | |
error; | |
end | |
% Remove extremal bands (the ones that will be duplicated) | |
bandcfhz = bandcfhz((1+broaden):(nfpts-broaden)); | |
% Hynek's magic equal-loudness-curve formula | |
fsq = bandcfhz.^2; | |
ftmp = fsq + 1.6e5; | |
eql = ((fsq./ftmp).^2) .* ((fsq + 1.44e6)./(fsq + 9.61e6)); | |
% weight the critical bands | |
z = repmat(eql',1,nframes).*x; | |
% cube root compress | |
z = z .^ (.33); | |
% replicate first and last band (because they are unreliable as calculated) | |
if (broaden) | |
y = z([1,1:nbands,nbands],:); | |
else | |
y = z([2,2:(nbands-1),nbands-1],:); | |
end | |
%y = z([1,1:nbands-2,nbands-2],:); | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function [y,e] = powspec(x, sr, wintime, steptime, dither) | |
%[y,e] = powspec(x, sr, wintime, steptime, sumlin, dither) | |
% | |
% compute the powerspectrum and frame energy of the input signal. | |
% basically outputs a power spectrogram | |
% | |
% each column represents a power spectrum for a given frame | |
% each row represents a frequency | |
% | |
% default values: | |
% sr = 8000Hz | |
% wintime = 25ms (200 samps) | |
% steptime = 10ms (80 samps) | |
% which means use 256 point fft | |
% hamming window | |
% | |
% $Header: /Users/dpwe/matlab/rastamat/RCS/powspec.m,v 1.3 2012/09/03 14:02:01 dpwe Exp dpwe $ | |
% for sr = 8000 | |
%NFFT = 256; | |
%NOVERLAP = 120; | |
%SAMPRATE = 8000; | |
%WINDOW = hamming(200); | |
if nargin < 2 | |
sr = 8000; | |
end | |
if nargin < 3 | |
wintime = 0.025; | |
end | |
if nargin < 4 | |
steptime = 0.010; | |
end | |
if nargin < 5 | |
dither = 1; | |
end | |
winpts = round(wintime*sr); | |
steppts = round(steptime*sr); | |
NFFT = 2^(ceil(log(winpts)/log(2))); | |
%WINDOW = hamming(winpts); | |
%WINDOW = [0,hanning(winpts)']; | |
WINDOW = [hanning(winpts)']; | |
% hanning gives much less noisy sidelobes | |
NOVERLAP = winpts - steppts; | |
SAMPRATE = sr; | |
% Values coming out of rasta treat samples as integers, | |
% not range -1..1, hence scale up here to match (approx) | |
y = abs(specgram(x*32768,NFFT,SAMPRATE,WINDOW,NOVERLAP)).^2; | |
% imagine we had random dither that had a variance of 1 sample | |
% step and a white spectrum. That's like (in expectation, anyway) | |
% adding a constant value to every bin (to avoid digital zero) | |
if (dither) | |
y = y + winpts; | |
end | |
% ignoring the hamming window, total power would be = #pts | |
% I think this doesn't quite make sense, but it's what rasta/powspec.c does | |
% that's all she wrote | |
% 2012-09-03 Calculate log energy - after windowing, by parseval | |
e = log(sum(y)); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function y = rastafilt(x) | |
% y = rastafilt(x) | |
% | |
% rows of x = critical bands, cols of x = frame | |
% same for y but after filtering | |
% | |
% default filter is single pole at 0.94 | |
% rasta filter | |
numer = [-2:2]; | |
numer = -numer ./ sum(numer.*numer); | |
denom = [1 -0.94]; | |
% Initialize the state. This avoids a big spike at the beginning | |
% resulting from the dc offset level in each band. | |
% (this is effectively what rasta/rasta_filt.c does). | |
% Because Matlab uses a DF2Trans implementation, we have to | |
% specify the FIR part to get the state right (but not the IIR part) | |
%[y,z] = filter(numer, 1, x(:,1:4)'); | |
% 2010-08-12 filter() chooses the wrong dimension for very short | |
% signals (thanks to Benjamin K [email protected]) | |
[y,z] = filter(numer, 1, x(:,1:4)',[],1); | |
% .. but don't keep any of these values, just output zero at the beginning | |
y = 0*y'; | |
size(z) | |
size(x) | |
% Apply the full filter to the rest of the signal, append it | |
y = [y,filter(numer, denom, x(:,5:end)',z,1)']; | |
%y = [y,filter(numer, denom, x(:,5:end)',z)']; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function [cepstra, spectra, pspectrum, lpcas, F, M] = rastaplp(samples, sr, dorasta, modelorder) | |
%[cepstra, spectra, lpcas] = rastaplp(samples, sr, dorasta, modelorder) | |
% | |
% cheap version of log rasta with fixed parameters | |
% | |
% output is matrix of features, row = feature, col = frame | |
% | |
% sr is sampling rate of samples, defaults to 8000 | |
% dorasta defaults to 1; if 0, just calculate PLP | |
% modelorder is order of PLP model, defaults to 8. 0 -> no PLP | |
% | |
% rastaplp(d, sr, 0, 12) is pretty close to the unix command line | |
% feacalc -dith -delta 0 -ras no -plp 12 -dom cep ... | |
% except during very quiet areas, where our approach of adding noise | |
% in the time domain is different from rasta's approach | |
% | |
% 2003-04-12 [email protected] after [email protected]'s version | |
if nargin < 2 | |
sr = 8000; | |
end | |
if nargin < 3 | |
dorasta = 1; | |
end | |
if nargin < 4 | |
modelorder = 8; | |
end | |
% add miniscule amount of noise | |
%samples = samples + randn(size(samples))*0.0001; | |
% first compute power spectrum | |
pspectrum = powspec(samples, sr); | |
% next group to critical bands | |
aspectrum = audspec(pspectrum, sr); | |
nbands = size(aspectrum,1); | |
if dorasta ~= 0 | |
% put in log domain | |
nl_aspectrum = log(aspectrum); | |
% next do rasta filtering | |
ras_nl_aspectrum = rastafilt(nl_aspectrum); | |
% do inverse log | |
aspectrum = exp(ras_nl_aspectrum); | |
end | |
% do final auditory compressions | |
postspectrum = postaud(aspectrum, sr/2); % 2012-09-03 bug: was sr | |
if modelorder > 0 | |
% LPC analysis | |
lpcas = dolpc(postspectrum, modelorder); | |
% convert lpc to cepstra | |
cepstra = lpc2cep(lpcas, modelorder+1); | |
% .. or to spectra | |
[spectra,F,M] = lpc2spec(lpcas, nbands); | |
else | |
% No LPC smoothing of spectrum | |
spectra = postspectrum; | |
cepstra = spec2cep(spectra); | |
end | |
cepstra = lifter(cepstra, 0.6); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function [cep,dctm] = spec2cep(spec, ncep, type) | |
% [cep,dctm] = spec2cep(spec, ncep, type) | |
% Calculate cepstra from spectral samples (in columns of spec) | |
% Return ncep cepstral rows (defaults to 9) | |
% This one does type II dct, or type I if type is specified as 1 | |
% dctm returns the DCT matrix that spec was multiplied by to give cep. | |
% 2005-04-19 [email protected] for mfcc_dpwe | |
if nargin < 2; ncep = 13; end | |
if nargin < 3; type = 2; end % type of DCT | |
[nrow, ncol] = size(spec); | |
% Make the DCT matrix | |
dctm = zeros(ncep, nrow); | |
if type == 2 || type == 3 | |
% this is the orthogonal one, the one you want | |
for i = 1:ncep | |
dctm(i,:) = cos((i-1)*[1:2:(2*nrow-1)]/(2*nrow)*pi) * sqrt(2/nrow); | |
end | |
if type == 2 | |
% make it unitary! (but not for HTK type 3) | |
dctm(1,:) = dctm(1,:)/sqrt(2); | |
end | |
elseif type == 4 % type 1 with implicit repeating of first, last bins | |
% Deep in the heart of the rasta/feacalc code, there is the logic | |
% that the first and last auditory bands extend beyond the edge of | |
% the actual spectra, and they are thus copied from their neighbors. | |
% Normally, we just ignore those bands and take the 19 in the middle, | |
% but when feacalc calculates mfccs, it actually takes the cepstrum | |
% over the spectrum *including* the repeated bins at each end. | |
% Here, we simulate 'repeating' the bins and an nrow+2-length | |
% spectrum by adding in extra DCT weight to the first and last | |
% bins. | |
for i = 1:ncep | |
dctm(i,:) = cos((i-1)*[1:nrow]/(nrow+1)*pi) * 2; | |
% Add in edge points at ends (includes fixup scale) | |
dctm(i,1) = dctm(i,1) + 1; | |
dctm(i,nrow) = dctm(i,nrow) + ((-1)^(i-1)); | |
end | |
dctm = dctm / (2*(nrow+1)); | |
else % dpwe type 1 - same as old spec2cep that expanded & used fft | |
for i = 1:ncep | |
dctm(i,:) = cos((i-1)*[0:(nrow-1)]/(nrow-1)*pi) * 2 / (2*(nrow-1)); | |
end | |
% fixup 'non-repeated' points | |
dctm(:,[1 nrow]) = dctm(:, [1 nrow])/2; | |
end | |
cep = dctm*log(spec); | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment