Skip to content

Instantly share code, notes, and snippets.

@muhrifqii
Last active October 6, 2016 07:53
Show Gist options
  • Save muhrifqii/785de7844437e114a24b45c91a1d71f2 to your computer and use it in GitHub Desktop.
Save muhrifqii/785de7844437e114a24b45c91a1d71f2 to your computer and use it in GitHub Desktop.
RASTA-PLP in MATLAB
function [aspectrum,wts] = audspec(pspectrum, sr, nfilts, fbtype, minfreq, maxfreq, sumpower, bwidth)
%[aspectrum,wts] = audspec(pspectrum, sr, nfilts, fbtype, minfreq, maxfreq, sumpower, bwidth)
%
% perform critical band analysis (see PLP)
% takes power spectrogram as input
if nargin < 2; sr = 16000; end
if nargin < 3; nfilts = ceil(hz2bark(sr/2))+1; end
if nargin < 4; fbtype = 'bark'; end
if nargin < 5; minfreq = 0; end
if nargin < 6; maxfreq = sr/2; end
if nargin < 7; sumpower = 1; end
if nargin < 8; bwidth = 1.0; end
[nfreqs,nframes] = size(pspectrum);
nfft = (nfreqs-1)*2;
if strcmp(fbtype, 'bark')
wts = fft2barkmx(nfft, sr, nfilts, bwidth, minfreq, maxfreq);
elseif strcmp(fbtype, 'mel')
wts = fft2melmx(nfft, sr, nfilts, bwidth, minfreq, maxfreq);
elseif strcmp(fbtype, 'htkmel')
wts = fft2melmx(nfft, sr, nfilts, bwidth, minfreq, maxfreq, 1, 1);
elseif strcmp(fbtype, 'fcmel')
wts = fft2melmx(nfft, sr, nfilts, bwidth, minfreq, maxfreq, 1, 0);
else
disp(['fbtype ', fbtype, ' not recognized']);
error;
end
wts = wts(:, 1:nfreqs);
% Integrate FFT bins into Mel bins, in abs or abs^2 domains:
if (sumpower)
aspectrum = wts * pspectrum;
else
aspectrum = (wts * sqrt(pspectrum)).^2;
end
function y = dolpc(x,modelorder)
%y = dolpc(x,modelorder)
%
% compute autoregressive model from spectral magnitude samples
%
% rows(x) = critical band
% col(x) = frame
%
% row(y) = lpc a_i coeffs, scaled by gain
% col(y) = frame
%
% modelorder is order of model, defaults to 8
% 2003-04-12 [email protected] after [email protected]
[nbands,nframes] = size(x);
if nargin < 2
modelorder = 8;
end
% Calculate autocorrelation
r = real(ifft([x;x([(nbands-1):-1:2],:)]));
% First half only
r = r(1:nbands,:);
% Find LPC coeffs by durbin
[y,e] = levinson(r, modelorder);
% Normalize each poly by gain
y = y'./repmat(e',(modelorder+1),1);
function y = lifter(x, lift, invs)
% y = lifter(x, lift, invs)
% Apply lifter to matrix of cepstra (one per column)
% lift = exponent of x i^n liftering
% or, as a negative integer, the length of HTK-style sin-curve liftering.
% If inverse == 1 (default 0), undo the liftering.
% 2005-05-19 [email protected]
if nargin < 2; lift = 0.6; end % liftering exponent
if nargin < 3; invs = 0; end % flag to undo liftering
[ncep, nfrm] = size(x);
if lift == 0
y = x;
else
if lift > 0
if lift > 10
disp(['Unlikely lift exponent of ', num2str(lift),' (did you mean -ve?)']);
end
liftwts = [1, ([1:(ncep-1)].^lift)];
elseif lift < 0
% Hack to support HTK liftering
L = -lift;
if (L ~= round(L))
disp(['HTK liftering value ', num2str(L),' must be integer']);
end
liftwts = [1, (1+L/2*sin([1:(ncep-1)]*pi/L))];
end
if (invs)
liftwts = 1./liftwts;
end
y = diag(liftwts)*x;
end
function features = lpc2cep(a,nout)
% features = lpc2cep(lpcas,nout)
% Convert the LPC 'a' coefficients in each column of lpcas
% into frames of cepstra.
% nout is number of cepstra to produce, defaults to size(lpcas,1)
% 2003-04-11 [email protected]
[nin, ncol] = size(a);
order = nin - 1;
if nargin < 2
nout = order + 1;
end
c = zeros(nout, ncol);
% Code copied from HSigP.c: LPC2Cepstrum
% First cep is log(Error) from Durbin
c(1,:) = -log(a(1,:));
% Renormalize lpc A coeffs
a = a ./ repmat(a(1,:), nin, 1);
for n = 2:nout
sum = 0;
for m = 2:n
sum = sum + (n - m) * a(m,:) .* c(n - m + 1, :);
end
c(n,:) = -(a(n,:) + sum / (n-1));
end
features = c;
function [cepstra,aspectrum,pspectrum] = melfcc(samples, sr, varargin)
%[cepstra,aspectrum,pspectrum] = melfcc(samples, sr[, opts ...])
% Calculate Mel-frequency cepstral coefficients by:
% - take the absolute value of the STFT
% - warp to a Mel frequency scale
% - take the DCT of the log-Mel-spectrum
% - return the first <ncep> components
% This version allows a lot of options to be controlled, as optional
% 'name', value pairs from the 3rd argument on: (defaults in parens)
% 'wintime' (0.025): window length in sec
% 'hoptime' (0.010): step between successive windows in sec
% 'numcep' (13): number of cepstra to return
% 'lifterexp' (0.6): exponent for liftering; 0 = none; < 0 = HTK sin lifter
% 'sumpower' (1): 1 = sum abs(fft)^2; 0 = sum abs(fft)
% 'preemph' (0.97): apply pre-emphasis filter [1 -preemph] (0 = none)
% 'dither' (0): 1 = add offset to spectrum as if dither noise
% 'minfreq' (0): lowest band edge of mel filters (Hz)
% 'maxfreq' (4000): highest band edge of mel filters (Hz)
% 'nbands' (40): number of warped spectral bands to use
% 'bwidth' (1.0): width of aud spec filters relative to default
% 'dcttype' (2): type of DCT used - 1 or 2 (or 3 for HTK or 4 for feac)
% 'fbtype' ('mel'): frequency warp: 'mel','bark','htkmel','fcmel'
% 'usecmp' (0): apply equal-loudness weighting and cube-root compr.
% 'modelorder' (0): if > 0, fit a PLP model of this order
% 'broaden' (0): flag to retain the (useless?) first and last bands
% 'useenergy' (0): overwrite C0 with true log energy
% The following non-default values nearly duplicate Malcolm Slaney's mfcc
% (i.e. melfcc(d,16000,opts...) =~= log(10)*2*mfcc(d*(2^17),16000) )
% 'wintime': 0.016
% 'lifterexp': 0
% 'minfreq': 133.33
% 'maxfreq': 6855.6
% 'sumpower': 0
% The following non-default values nearly duplicate HTK's MFCC
% (i.e. melfcc(d,16000,opts...) =~= 2*htkmelfcc(:,[13,[1:12]])'
% where HTK config has PREEMCOEF = 0.97, NUMCHANS = 20, CEPLIFTER = 22,
% NUMCEPS = 12, WINDOWSIZE = 250000.0, USEHAMMING = T, TARGETKIND = MFCC_0)
% 'lifterexp': -22
% 'nbands': 20
% 'maxfreq': 8000
% 'sumpower': 0
% 'fbtype': 'htkmel'
% 'dcttype': 3
% For more detail on reproducing other programs' outputs, see
% http://www.ee.columbia.edu/~dpwe/resources/matlab/rastamat/mfccs.html
%
% 2005-04-19 [email protected] after rastaplp.m.
% Uses Mark Paskin's process_options.m from KPMtools
% $Header: /Users/dpwe/matlab/rastamat/RCS/melfcc.m,v 1.3 2012/09/03 14:01:26 dpwe Exp dpwe $
if nargin < 2; sr = 16000; end
% Parse out the optional arguments
[wintime, hoptime, numcep, lifterexp, sumpower, preemph, dither, ...
minfreq, maxfreq, nbands, bwidth, dcttype, fbtype, usecmp, modelorder, ...
broaden, useenergy] = ...
process_options(varargin, 'wintime', 0.025, 'hoptime', 0.010, ...
'numcep', 13, 'lifterexp', 0.6, 'sumpower', 1, 'preemph', 0.97, ...
'dither', 0, 'minfreq', 0, 'maxfreq', 4000, ...
'nbands', 40, 'bwidth', 1.0, 'dcttype', 2, ...
'fbtype', 'mel', 'usecmp', 0, 'modelorder', 0, ...
'broaden', 0, 'useenergy', 0);
if preemph ~= 0
samples = filter([1 -preemph], 1, samples);
end
% Compute FFT power spectrum
[pspectrum,logE] = powspec(samples, sr, wintime, hoptime, dither);
aspectrum = audspec(pspectrum, sr, nbands, fbtype, minfreq, maxfreq, sumpower, bwidth);
if (usecmp)
% PLP-like weighting/compression
aspectrum = postaud(aspectrum, maxfreq, fbtype, broaden);
end
if modelorder > 0
if (dcttype ~= 1)
disp(['warning: plp cepstra are implicitly dcttype 1 (not ', num2str(dcttype), ')']);
end
% LPC analysis
lpcas = dolpc(aspectrum, modelorder);
% convert lpc to cepstra
cepstra = lpc2cep(lpcas, numcep);
% Return the auditory spectrum corresponding to the cepstra?
% aspectrum = lpc2spec(lpcas, nbands);
% else return the aspectrum that the cepstra are based on, prior to PLP
else
% Convert to cepstra via DCT
cepstra = spec2cep(aspectrum, numcep, dcttype);
end
cepstra = lifter(cepstra, lifterexp);
if useenergy
cepstra(1,:) = logE;
end
function [y,eql] = postaud(x,fmax,fbtype,broaden)
%y = postaud(x, fmax, fbtype)
%
% do loudness equalization and cube root compression
% x = critical band filters
% rows = critical bands
% cols = frames
if nargin < 3
fbtype = 'bark';
end
if nargin < 4
% By default, don't add extra flanking bands
broaden = 0;
end
[nbands,nframes] = size(x);
% equal loundness weights stolen from rasta code
%eql = [0.000479 0.005949 0.021117 0.044806 0.073345 0.104417 0.137717 ...
% 0.174255 0.215590 0.263260 0.318302 0.380844 0.449798 0.522813
% 0.596597];
% Include frequency points at extremes, discard later
nfpts = nbands+2*broaden;
if strcmp(fbtype, 'bark')
bandcfhz = bark2hz(linspace(0, hz2bark(fmax), nfpts));
elseif strcmp(fbtype, 'mel')
bandcfhz = mel2hz(linspace(0, hz2mel(fmax), nfpts));
elseif strcmp(fbtype, 'htkmel') || strcmp(fbtype, 'fcmel')
bandcfhz = mel2hz(linspace(0, hz2mel(fmax,1), nfpts),1);
else
disp(['unknown fbtype', fbtype]);
error;
end
% Remove extremal bands (the ones that will be duplicated)
bandcfhz = bandcfhz((1+broaden):(nfpts-broaden));
% Hynek's magic equal-loudness-curve formula
fsq = bandcfhz.^2;
ftmp = fsq + 1.6e5;
eql = ((fsq./ftmp).^2) .* ((fsq + 1.44e6)./(fsq + 9.61e6));
% weight the critical bands
z = repmat(eql',1,nframes).*x;
% cube root compress
z = z .^ (.33);
% replicate first and last band (because they are unreliable as calculated)
if (broaden)
y = z([1,1:nbands,nbands],:);
else
y = z([2,2:(nbands-1),nbands-1],:);
end
%y = z([1,1:nbands-2,nbands-2],:);
function [y,e] = powspec(x, sr, wintime, steptime, dither)
%[y,e] = powspec(x, sr, wintime, steptime, sumlin, dither)
%
% compute the powerspectrum and frame energy of the input signal.
% basically outputs a power spectrogram
%
% each column represents a power spectrum for a given frame
% each row represents a frequency
%
% default values:
% sr = 8000Hz
% wintime = 25ms (200 samps)
% steptime = 10ms (80 samps)
% which means use 256 point fft
% hamming window
%
% $Header: /Users/dpwe/matlab/rastamat/RCS/powspec.m,v 1.3 2012/09/03 14:02:01 dpwe Exp dpwe $
% for sr = 8000
%NFFT = 256;
%NOVERLAP = 120;
%SAMPRATE = 8000;
%WINDOW = hamming(200);
if nargin < 2
sr = 8000;
end
if nargin < 3
wintime = 0.025;
end
if nargin < 4
steptime = 0.010;
end
if nargin < 5
dither = 1;
end
winpts = round(wintime*sr);
steppts = round(steptime*sr);
NFFT = 2^(ceil(log(winpts)/log(2)));
%WINDOW = hamming(winpts);
%WINDOW = [0,hanning(winpts)'];
WINDOW = [hanning(winpts)'];
% hanning gives much less noisy sidelobes
NOVERLAP = winpts - steppts;
SAMPRATE = sr;
% Values coming out of rasta treat samples as integers,
% not range -1..1, hence scale up here to match (approx)
y = abs(specgram(x*32768,NFFT,SAMPRATE,WINDOW,NOVERLAP)).^2;
% imagine we had random dither that had a variance of 1 sample
% step and a white spectrum. That's like (in expectation, anyway)
% adding a constant value to every bin (to avoid digital zero)
if (dither)
y = y + winpts;
end
% ignoring the hamming window, total power would be = #pts
% I think this doesn't quite make sense, but it's what rasta/powspec.c does
% that's all she wrote
% 2012-09-03 Calculate log energy - after windowing, by parseval
e = log(sum(y));
function y = rastafilt(x)
% y = rastafilt(x)
%
% rows of x = critical bands, cols of x = frame
% same for y but after filtering
%
% default filter is single pole at 0.94
% rasta filter
numer = [-2:2];
numer = -numer ./ sum(numer.*numer);
denom = [1 -0.94];
% Initialize the state. This avoids a big spike at the beginning
% resulting from the dc offset level in each band.
% (this is effectively what rasta/rasta_filt.c does).
% Because Matlab uses a DF2Trans implementation, we have to
% specify the FIR part to get the state right (but not the IIR part)
%[y,z] = filter(numer, 1, x(:,1:4)');
% 2010-08-12 filter() chooses the wrong dimension for very short
% signals (thanks to Benjamin K [email protected])
[y,z] = filter(numer, 1, x(:,1:4)',[],1);
% .. but don't keep any of these values, just output zero at the beginning
y = 0*y';
size(z)
size(x)
% Apply the full filter to the rest of the signal, append it
y = [y,filter(numer, denom, x(:,5:end)',z,1)'];
%y = [y,filter(numer, denom, x(:,5:end)',z)'];
function [cepstra, spectra, pspectrum, lpcas, F, M] = rastaplp(samples, sr, dorasta, modelorder)
%[cepstra, spectra, lpcas] = rastaplp(samples, sr, dorasta, modelorder)
%
% cheap version of log rasta with fixed parameters
%
% output is matrix of features, row = feature, col = frame
%
% sr is sampling rate of samples, defaults to 8000
% dorasta defaults to 1; if 0, just calculate PLP
% modelorder is order of PLP model, defaults to 8. 0 -> no PLP
%
% rastaplp(d, sr, 0, 12) is pretty close to the unix command line
% feacalc -dith -delta 0 -ras no -plp 12 -dom cep ...
% except during very quiet areas, where our approach of adding noise
% in the time domain is different from rasta's approach
%
% 2003-04-12 [email protected] after [email protected]'s version
if nargin < 2
sr = 8000;
end
if nargin < 3
dorasta = 1;
end
if nargin < 4
modelorder = 8;
end
% add miniscule amount of noise
%samples = samples + randn(size(samples))*0.0001;
% first compute power spectrum
pspectrum = powspec(samples, sr);
% next group to critical bands
aspectrum = audspec(pspectrum, sr);
nbands = size(aspectrum,1);
if dorasta ~= 0
% put in log domain
nl_aspectrum = log(aspectrum);
% next do rasta filtering
ras_nl_aspectrum = rastafilt(nl_aspectrum);
% do inverse log
aspectrum = exp(ras_nl_aspectrum);
end
% do final auditory compressions
postspectrum = postaud(aspectrum, sr/2); % 2012-09-03 bug: was sr
if modelorder > 0
% LPC analysis
lpcas = dolpc(postspectrum, modelorder);
% convert lpc to cepstra
cepstra = lpc2cep(lpcas, modelorder+1);
% .. or to spectra
[spectra,F,M] = lpc2spec(lpcas, nbands);
else
% No LPC smoothing of spectrum
spectra = postspectrum;
cepstra = spec2cep(spectra);
end
cepstra = lifter(cepstra, 0.6);
function [cep,dctm] = spec2cep(spec, ncep, type)
% [cep,dctm] = spec2cep(spec, ncep, type)
% Calculate cepstra from spectral samples (in columns of spec)
% Return ncep cepstral rows (defaults to 9)
% This one does type II dct, or type I if type is specified as 1
% dctm returns the DCT matrix that spec was multiplied by to give cep.
% 2005-04-19 [email protected] for mfcc_dpwe
if nargin < 2; ncep = 13; end
if nargin < 3; type = 2; end % type of DCT
[nrow, ncol] = size(spec);
% Make the DCT matrix
dctm = zeros(ncep, nrow);
if type == 2 || type == 3
% this is the orthogonal one, the one you want
for i = 1:ncep
dctm(i,:) = cos((i-1)*[1:2:(2*nrow-1)]/(2*nrow)*pi) * sqrt(2/nrow);
end
if type == 2
% make it unitary! (but not for HTK type 3)
dctm(1,:) = dctm(1,:)/sqrt(2);
end
elseif type == 4 % type 1 with implicit repeating of first, last bins
% Deep in the heart of the rasta/feacalc code, there is the logic
% that the first and last auditory bands extend beyond the edge of
% the actual spectra, and they are thus copied from their neighbors.
% Normally, we just ignore those bands and take the 19 in the middle,
% but when feacalc calculates mfccs, it actually takes the cepstrum
% over the spectrum *including* the repeated bins at each end.
% Here, we simulate 'repeating' the bins and an nrow+2-length
% spectrum by adding in extra DCT weight to the first and last
% bins.
for i = 1:ncep
dctm(i,:) = cos((i-1)*[1:nrow]/(nrow+1)*pi) * 2;
% Add in edge points at ends (includes fixup scale)
dctm(i,1) = dctm(i,1) + 1;
dctm(i,nrow) = dctm(i,nrow) + ((-1)^(i-1));
end
dctm = dctm / (2*(nrow+1));
else % dpwe type 1 - same as old spec2cep that expanded & used fft
for i = 1:ncep
dctm(i,:) = cos((i-1)*[0:(nrow-1)]/(nrow-1)*pi) * 2 / (2*(nrow-1));
end
% fixup 'non-repeated' points
dctm(:,[1 nrow]) = dctm(:, [1 nrow])/2;
end
cep = dctm*log(spec);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment