# Packages we're using
import numpy as np
import matplotlib.pyplot as plt
import copy
from scipy.io import wavfile
import soundfile as sf
from utils.snr import residual_snrdb
def overlap(X, window_size, window_step):
    """
    Create an overlapped version of X
    Parameters
    ----------
    X : ndarray, shape=(n_samples,)
        Input signal to window and overlap
    window_size : int
        Size of windows to take
    window_step : int
        Step size between windows
    Returns
    -------
    X_strided : shape=(n_windows, window_size)
        2D array of overlapped X
    """
    if window_size % 2 != 0:
        raise ValueError("Window size must be even!")
    # Make sure there are an even number of windows before stridetricks
    append = np.zeros((window_size - len(X) % window_size))
    X = np.hstack((X, append))
    ws = window_size
    ss = window_step
    a = X
    valid = int(len(a) - ws)
    nw = int(valid // ss)
    out = np.ndarray((nw, ws), dtype=a.dtype)
    for i in range(nw):
        # "slide" the window along the samples
        start = int(i * ss)
        stop = int(start + ws)
        out[i] = a[start:stop]
    return out
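# Quick, hedged sanity check of overlap() (not part of the original gist): windowing
# a short ramp should produce rows that advance by window_step samples each; the
# names _demo/_demo_windows are illustrative only and shapes assume the
# zero-padding behaviour implemented above.
_demo = np.arange(16, dtype='float64')
_demo_windows = overlap(_demo, window_size=8, window_step=4)
assert _demo_windows.shape == (4, 8)                    # (n_windows, window_size)
assert np.allclose(_demo_windows[1][:4], _demo[4:8])    # second window starts 4 samples later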
def stft(X, window_size=128, fftsize=256, step=65, mean_normalize=True, real=False,
         compute_onesided=True, windowtype='hamming'):
    """
    Compute STFT for 1D real valued input X
    """
    if real:
        local_fft = np.fft.rfft
        cut = -1
    else:
        local_fft = np.fft.fft
        cut = None
    if compute_onesided:
        cut = fftsize // 2
    if mean_normalize:
        X -= X.mean()
    X = overlap(X, window_size, step)
    if windowtype == 'hamming':
        win = np.hamming(window_size)
    elif windowtype == 'bartlett':
        win = np.bartlett(window_size)
    elif windowtype == 'blackman':
        win = np.blackman(window_size)
    elif windowtype == 'kaiser':
        # np.kaiser requires a beta parameter; 14 is assumed here as a reasonable default
        win = np.kaiser(window_size, 14)
    else:
        raise ValueError("Unknown windowtype: %s" % windowtype)
    X = X * win[None]
    X = local_fft(X, fftsize)[:, :cut]
    return X
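# Hedged shape example for stft() (not part of the original gist): a one second,
# 8 kHz tone windowed at 256 samples and transformed with a 512-point FFT keeps
# only fftsize // 2 one-sided bins per frame. The names _x and _S are illustrative.
_x = np.sin(2 * np.pi * 440 * np.arange(8000) / 8000.)
_S = stft(_x, window_size=256, fftsize=512, step=64)
assert _S.shape[1] == 512 // 2    # one-sided cut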
def spectrogram(d, log=True, thresh=5, window_size=256, fft_size=512, step_size=64, windowtype='hamming'):
    """
    creates a spectrogram
    log: take the log of the spectrogram
    thresh: threshold minimum power for log spectrogram
    """
    F = stft(d, window_size=window_size, fftsize=fft_size, step=step_size, real=False,
             compute_onesided=True, windowtype=windowtype)
    specgram = np.abs(F)
    phase = np.angle(F)
    if log:
        specgram /= specgram.max()  # volume normalize to max 1
        specgram = np.log10(specgram)  # take log
        specgram[specgram < -thresh] = -thresh  # set anything less than the threshold as the threshold
    else:
        specgram[specgram < thresh] = thresh  # set anything less than the threshold as the threshold
    return specgram, phase
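# Hedged example (not part of the original gist): per the normalization above, a log
# spectrogram of a 440 Hz tone (assuming an 8 kHz sample rate) peaks at exactly 0 and
# is floored at -thresh. The names _tone/_spec/_phase are illustrative only.
_tone = np.sin(2 * np.pi * 440 * np.arange(8192) / 8000.)
_spec, _phase = spectrogram(_tone, log=True, thresh=5, window_size=256, fft_size=512, step_size=64)
assert _spec.max() == 0.0 and _spec.min() >= -5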
# Also mostly modified or taken from https://gist.github.com/kastnerkyle/179d6e9a88202ab0a2fe
def griffin_lim(X_s, log=True, fft_size=512, step_size=512 // 4, n_iter=10, pre_raise=None, windowtype='hamming'):
    if log:
        X_s = np.power(10, X_s)
    if pre_raise:
        X_s = X_s**pre_raise
    # mirror the one-sided magnitude spectrogram back into a full two-sided spectrum
    X_s = np.concatenate([X_s, X_s[:, ::-1]], axis=1)
    X_t = iterate_invert_spectrogram(X_s, fft_size, step_size, n_iter=n_iter, windowtype=windowtype)
    return X_t
def iterate_invert_spectrogram(X_s, fftsize, step, n_iter=10, verbose=False, windowtype='hamming'):
    """
    Under MSR-LA License
    Based on MATLAB implementation from Spectrogram Inversion Toolbox
    References
    ----------
    D. Griffin and J. Lim. Signal estimation from modified
    short-time Fourier transform. IEEE Trans. Acoust. Speech
    Signal Process., 32(2):236-243, 1984.
    Malcolm Slaney, Daniel Naar and Richard F. Lyon. Auditory
    Model Inversion for Sound Separation. Proc. IEEE-ICASSP,
    Adelaide, 1994, II.77-80.
    Xinglei Zhu, G. Beauregard, L. Wyse. Real-Time Signal
    Estimation from Modified Short-Time Fourier Transform
    Magnitude Spectra. IEEE Transactions on Audio Speech and
    Language Processing, 08/2007.
    """
    reg = np.max(X_s) / 1E8
    X_best = copy.deepcopy(X_s)
    for i in range(n_iter):
        if verbose:
            print("Running iter %i" % i)
        if i == 0:
            X_t = invert_spectrogram(X_best, step, calculate_offset=True,
                                     set_zero_phase=True, windowtype=windowtype)
        else:
            # Calculate offset was False in the MATLAB version
            # but in mine it massively improves the result
            # Possible bug in my impl?
            X_t = invert_spectrogram(X_best, step, calculate_offset=True,
                                     set_zero_phase=False, windowtype=windowtype)
        # NOTE: the original line relied on a global `window_size` here; fftsize // 2 is
        # assumed instead so the analysis window matches the synthesis window length
        # used in invert_spectrogram.
        est = stft(X_t, window_size=fftsize // 2, fftsize=fftsize, step=step, compute_onesided=False)
        phase = est / np.maximum(reg, np.abs(est))
        X_best = X_s * phase[:len(X_s)]
    X_t = invert_spectrogram(X_best, step, calculate_offset=True,
                             set_zero_phase=False, windowtype=windowtype)
    return np.real(X_t)
def invert_spectrogram(X_s, step, calculate_offset=True, set_zero_phase=True, windowtype='hamming'):
    """
    Under MSR-LA License
    Based on MATLAB implementation from Spectrogram Inversion Toolbox
    References
    ----------
    D. Griffin and J. Lim. Signal estimation from modified
    short-time Fourier transform. IEEE Trans. Acoust. Speech
    Signal Process., 32(2):236-243, 1984.
    Malcolm Slaney, Daniel Naar and Richard F. Lyon. Auditory
    Model Inversion for Sound Separation. Proc. IEEE-ICASSP,
    Adelaide, 1994, II.77-80.
    Xinglei Zhu, G. Beauregard, L. Wyse. Real-Time Signal
    Estimation from Modified Short-Time Fourier Transform
    Magnitude Spectra. IEEE Transactions on Audio Speech and
    Language Processing, 08/2007.
    """
    size = int(X_s.shape[1] // 2)
    wave = np.zeros(int(X_s.shape[0] * step + size))
    # Getting overflow warnings with 32 bit...
    wave = wave.astype('float64')
    total_windowing_sum = np.zeros(int(X_s.shape[0] * step + size))
    if windowtype == 'hamming':
        win = np.hamming(size)
    elif windowtype == 'bartlett':
        win = np.bartlett(size)
    elif windowtype == 'blackman':
        win = np.blackman(size)
    elif windowtype == 'kaiser':
        # np.kaiser requires a beta parameter; 14 is assumed here as a reasonable default
        win = np.kaiser(size, 14)
    else:
        raise ValueError("Unknown windowtype: %s" % windowtype)
    est_start = int(size // 2) - 1
    est_end = est_start + size
    for i in range(X_s.shape[0]):
        wave_start = int(step * i)
        wave_end = int(wave_start + size)
        if set_zero_phase:
            spectral_slice = X_s[i].real + 0j
        else:
            # already complex
            spectral_slice = X_s[i]
        # Don't need fftshift due to different impl.
        wave_est = np.real(np.fft.ifft(spectral_slice))[::-1]
        if calculate_offset and i > 0:
            offset_size = int(size - step)
            if offset_size <= 0:
                print("WARNING: Large step size >50% detected! "
                      "This code works best with high overlap - try "
                      "with 75% or greater")
                offset_size = step
            offset = xcorr_offset(wave[wave_start:wave_start + offset_size],
                                  wave_est[est_start:est_start + offset_size])
        else:
            offset = 0
        wave[wave_start:wave_end] += win * wave_est[
            est_start - offset:est_end - offset]
        total_windowing_sum[wave_start:wave_end] += win
    wave = np.real(wave) / (total_windowing_sum + 1E-6)
    return wave
def xcorr_offset(x1, x2):
    """
    Under MSR-LA License
    Based on MATLAB implementation from Spectrogram Inversion Toolbox
    References
    ----------
    D. Griffin and J. Lim. Signal estimation from modified
    short-time Fourier transform. IEEE Trans. Acoust. Speech
    Signal Process., 32(2):236-243, 1984.
    Malcolm Slaney, Daniel Naar and Richard F. Lyon. Auditory
    Model Inversion for Sound Separation. Proc. IEEE-ICASSP,
    Adelaide, 1994, II.77-80.
    Xinglei Zhu, G. Beauregard, L. Wyse. Real-Time Signal
    Estimation from Modified Short-Time Fourier Transform
    Magnitude Spectra. IEEE Transactions on Audio Speech and
    Language Processing, 08/2007.
    """
    x1 = x1 - x1.mean()
    x2 = x2 - x2.mean()
    frame_size = len(x2)
    half = frame_size // 2
    corrs = np.convolve(x1.astype('float32'), x2[::-1].astype('float32'))
    corrs[:half] = -1E30
    corrs[-half:] = -1E30
    offset = corrs.argmax() - len(x1)
    return offset
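# Hedged illustration of xcorr_offset (not part of the original gist): it returns the
# lag at which the cross-correlation of the two frames peaks, which is how
# invert_spectrogram aligns each new window with the audio synthesized so far.
# The names _rng/_frame/_lag are illustrative only.
_rng = np.random.RandomState(0)
_frame = _rng.randn(256)
_lag = xcorr_offset(_frame, np.roll(_frame, 8))
# _lag is a small negative value close to the applied 8-sample shift (sign and an
# off-by-one depend on the convolution indexing and the circular wrap of np.roll)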
def spectrogram_to_mel(spectrogram, mel_filter):
    mel_spec = spectrogram.dot(mel_filter)
    return mel_spec
def mel_to_spectrogram(mel_spec, mel_inversion_filter, spec_thresh):
    """
    takes in a mel spectrogram and returns a linear spectrogram for inversion
    """
    mel_spec = mel_spec + spec_thresh
    uncompressed_spec = mel_spec.dot(mel_inversion_filter)
    return uncompressed_spec - spec_thresh
# From https://github.com/jameslyons/python_speech_features
def hz2mel(hz):
    """Convert a value in Hertz to Mels
    :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
    :returns: a value in Mels. If an array was passed in, an identical sized array is returned.
    """
    return 2595 * np.log10(1 + hz / 700.)
def mel2hz(mel):
    """Convert a value in Mels to Hertz
    :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
    :returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
    """
    return 700 * (10**(mel / 2595.0) - 1)
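# Worked example of the mel-scale formulas above (not part of the original gist):
# 1000 Hz maps to roughly 1000 mel, and mel2hz inverts hz2mel up to floating point error.
assert abs(hz2mel(1000.) - 1000.) < 1.0
assert abs(mel2hz(hz2mel(440.)) - 440.) < 1e-6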
def get_filterbanks(nfilt=20, nfft=512, samplerate=16000, lowfreq=0, highfreq=None):
    """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
    to fft bins. The filters are returned as an array of size nfilt * (nfft/2)
    :param nfilt: the number of filters in the filterbank, default 20.
    :param nfft: the FFT size. Default is 512.
    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
    :param lowfreq: lowest band edge of mel filters, default 0 Hz
    :param highfreq: highest band edge of mel filters, default samplerate/2
    :returns: A numpy array of size nfilt * (nfft/2) containing the filterbank. Each row holds 1 filter.
    """
    highfreq = highfreq or samplerate / 2
    assert highfreq <= samplerate / 2, "highfreq is greater than samplerate/2"
    # compute points evenly spaced in mels
    lowmel = hz2mel(lowfreq)
    highmel = hz2mel(highfreq)
    melpoints = np.linspace(lowmel, highmel, nfilt + 2)
    # our points are in Hz, but we use fft bins, so we have to convert
    # from Hz to fft bin number
    bin = np.floor((nfft + 1) * mel2hz(melpoints) / samplerate)
    fbank = np.zeros([nfilt, nfft // 2])
    for j in range(0, nfilt):
        # rising slope of the j-th triangular filter
        for i in range(int(bin[j]), int(bin[j + 1])):
            fbank[j, i] = (i - bin[j]) / (bin[j + 1] - bin[j])
        # falling slope of the j-th triangular filter
        for i in range(int(bin[j + 1]), int(bin[j + 2])):
            fbank[j, i] = (bin[j + 2] - i) / (bin[j + 2] - bin[j + 1])
    return fbank
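# Hedged shape check (not part of the original gist): get_filterbanks returns one
# triangular filter per row over the first nfft//2 FFT bins, each rising to 1.0 at
# its centre bin for well-separated bin edges. The name _fb is illustrative only.
_fb = get_filterbanks(nfilt=20, nfft=512, samplerate=16000)
assert _fb.shape == (20, 512 // 2)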
def create_mel_filter(fft_size, n_freq_components=64, start_freq=300, end_freq=8000, samplerate=44100):
    """
    Creates a mel filterbank matrix to project the spectrogram onto mel bins,
    plus the corresponding (un-normalized) inversion filter to map mel bins back
    """
    mel_inversion_filter = get_filterbanks(nfilt=n_freq_components,
                                           nfft=fft_size, samplerate=samplerate,
                                           lowfreq=start_freq, highfreq=end_freq)
    # Normalize filter
    mel_filter = mel_inversion_filter / mel_inversion_filter.sum(axis=1, keepdims=True)
    return mel_filter.T, mel_inversion_filter
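# Hedged round-trip sketch (not part of the original gist): project a
# (frames x fft_size//2) spectrogram onto mel bins with spectrogram_to_mel, then
# expand it back with mel_to_spectrogram. The mapping is lossy, so only shapes are
# checked; the parameter values and the _mel_filter/_fake_* names are illustrative.
_mel_filter, _mel_inv_filter = create_mel_filter(fft_size=1024, n_freq_components=32,
                                                 start_freq=300, end_freq=8000,
                                                 samplerate=44100)
_fake_spec = np.zeros((10, 1024 // 2))
_fake_mel = spectrogram_to_mel(_fake_spec, _mel_filter)
_fake_rec = mel_to_spectrogram(_fake_mel, _mel_inv_filter, spec_thresh=6)
assert _fake_mel.shape == (10, 32) and _fake_rec.shape == (10, 1024 // 2)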
def get_spectrograms(x, window_size, step_size, spec_thresh=6, n_mel_freq_components=64, start_freq=0, end_freq=None):
    fft_size = window_size
    assert x.shape[0] % fft_size == 0
    assert fft_size % step_size == 0
    cut = int((fft_size // step_size) / 2)
    windowtype = 'hamming'
    mel_filter, mel_inversion_filter = create_mel_filter(fft_size=fft_size,
                                                         n_freq_components=n_mel_freq_components,
                                                         start_freq=start_freq,
                                                         end_freq=end_freq)
    assert not np.isnan(mel_filter.sum()), 'nans in mel filter, try lowering n_mel_freq_components'
    spec_linear, phase = spectrogram(x.astype('float64'), window_size=window_size, fft_size=fft_size,
                                     step_size=step_size, log=True, thresh=spec_thresh, windowtype=windowtype)
    # drop the trailing frames that come from the zero padding in overlap()
    spec_linear = spec_linear[:-cut]
    phase = phase[:-cut]
    spec_mel = spectrogram_to_mel(spec_linear, mel_filter)
    return spec_linear, phase, spec_mel
# SETTINGS
n_mel_freq_components = 80
start_freq = 200
spec_thresh = 6
step_size = 64
window_size = 1024
fft_size = window_size     # get_spectrograms sets fft_size = window_size internally
windowtype = 'hamming'     # matches the window hard-coded in get_spectrograms

######## GET SPECTROGRAM FOR A SINGLE FILE ########
### Discard phase
# `data` is assumed to be a 1-D float audio array whose length is a multiple of
# fft_size (see the asserts in get_spectrograms), e.g. loaded with
# data, rate = sf.read('input.wav')    # 'input.wav' is a placeholder path
spec_linear, phase, spec_mel = get_spectrograms(data, window_size, step_size, spec_thresh=spec_thresh,
                                                n_mel_freq_components=n_mel_freq_components,
                                                start_freq=start_freq, end_freq=None)

######## INVERT THE LINEAR SPECTROGRAM BACK TO AUDIO ########
recovered_audio_orig = griffin_lim(spec_linear, fft_size=fft_size,
                                   step_size=step_size, log=True, n_iter=50,
                                   pre_raise=1.1, windowtype=windowtype)
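
# Hedged sketch (not in the original gist): invert the mel spectrogram back to audio
# by re-creating the same mel filter pair that get_spectrograms built internally,
# expanding spec_mel to a linear spectrogram, and running Griffin-Lim on it.
# Assumes the SETTINGS above; recovered_audio_mel is an illustrative name.
mel_filter, mel_inversion_filter = create_mel_filter(fft_size=fft_size,
                                                     n_freq_components=n_mel_freq_components,
                                                     start_freq=start_freq,
                                                     end_freq=None)
spec_from_mel = mel_to_spectrogram(spec_mel, mel_inversion_filter, spec_thresh)
recovered_audio_mel = griffin_lim(spec_from_mel, fft_size=fft_size,
                                  step_size=step_size, log=True, n_iter=50,
                                  pre_raise=1.1, windowtype=windowtype)
# To listen to the result, write it out with soundfile, e.g.
# sf.write('recovered.wav', recovered_audio_orig / np.abs(recovered_audio_orig).max(), rate)
# where 'recovered.wav' is a placeholder path and `rate` is the sample rate of `data`.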