Created
April 2, 2017 22:03
-
-
Save casperkaae/6c4be96c29ecb9c2bc14efe713df71b8 to your computer and use it in GitHub Desktop.
sound fun
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages we're using | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import copy | |
from scipy.io import wavfile | |
import soundfile as sf | |
from utils.snr import residual_snrdb | |
def overlap(X, window_size, window_step): | |
""" | |
Create an overlapped version of X | |
Parameters | |
---------- | |
X : ndarray, shape=(n_samples,) | |
Input signal to window and overlap | |
window_size : int | |
Size of windows to take | |
window_step : int | |
Step size between windows | |
Returns | |
------- | |
X_strided : shape=(n_windows, window_size) | |
2D array of overlapped X | |
""" | |
if window_size % 2 != 0: | |
raise ValueError("Window size must be even!") | |
# Make sure there are an even number of windows before stridetricks | |
append = np.zeros((window_size - len(X) % window_size)) | |
X = np.hstack((X, append)) | |
ws = window_size | |
ss = window_step | |
a = X | |
valid = int(len(a) - ws) | |
nw = int((valid) // ss) | |
out = np.ndarray((nw,ws),dtype = a.dtype) | |
for i in range(nw): | |
# "slide" the window along the samples | |
start = int(i * ss) | |
stop = int(start + ws) | |
out[i] = a[start : stop] | |
return out | |
def stft(X, window_size=128, fftsize=256, step=65, mean_normalize=True, real=False, | |
compute_onesided=True, windowtype='hamming'): | |
""" | |
Compute STFT for 1D real valued input X | |
""" | |
if real: | |
local_fft = np.fft.rfft | |
cut = -1 | |
else: | |
local_fft = np.fft.fft | |
cut = None | |
if compute_onesided: | |
cut = fftsize // 2 | |
if mean_normalize: | |
X -= X.mean() | |
X = overlap(X, window_size, step) | |
if windowtype == 'hamming': | |
win = np.hamming(window_size) | |
elif windowtype == 'bartlett': | |
win = np.bartlett(window_size) | |
elif windowtype == 'blackman': | |
win = np.blackman(window_size) | |
elif windowtype == 'kaiser': | |
win = np.kaiser(window_size) | |
#win = 0.54 - .46 * np.cos(2 * np.pi * np.arange(window_size) / (window_size - 1)) | |
else: | |
raise ValueError() | |
X = X * win[None] | |
X = local_fft(X,fftsize)[:, :cut] | |
return X | |
def spectrogram(d,log = True, thresh= 5, window_size = 256, fft_size = 512, step_size = 64, windowtype='hamming'): | |
""" | |
creates a spectrogram | |
log: take the log of the spectrgram | |
thresh: threshold minimum power for log spectrogram | |
""" | |
F = stft(d, window_size= window_size, fftsize=fft_size, step=step_size, real=False, | |
compute_onesided=True, windowtype=windowtype) | |
specgram = np.abs(F) | |
phase = np.angle(F) | |
if log == True: | |
specgram /= specgram.max() # volume normalize to max 1 | |
specgram = np.log10(specgram) # take log | |
specgram[specgram < -thresh] = -thresh # set anything less than the threshold as the threshold | |
else: | |
specgram[specgram < thresh] = thresh # set anything less than the threshold as the threshold | |
return specgram, phase | |
# Also mostly modified or taken from https://gist.github.com/kastnerkyle/179d6e9a88202ab0a2fe | |
def griffin_lim(X_s, log = True, fft_size = 512, step_size = 512/4, n_iter = 10, pre_raise=None, windowtype='hamming'): | |
if log == True: | |
X_s = np.power(10, X_s) | |
if pre_raise: | |
X_s = X_s**pre_raise | |
X_s = np.concatenate([X_s, X_s[:, ::-1]], axis=1) | |
X_t = iterate_invert_spectrogram(X_s, fft_size, step_size, n_iter=n_iter,windowtype=windowtype) | |
return X_t | |
def iterate_invert_spectrogram(X_s, fftsize, step, n_iter=10, verbose=False, windowtype='hamming'): | |
""" | |
Under MSR-LA License | |
Based on MATLAB implementation from Spectrogram Inversion Toolbox | |
References | |
---------- | |
D. Griffin and J. Lim. Signal estimation from modified | |
short-time Fourier transform. IEEE Trans. Acoust. Speech | |
Signal Process., 32(2):236-243, 1984. | |
Malcolm Slaney, Daniel Naar and Richard F. Lyon. Auditory | |
Model Inversion for Sound Separation. Proc. IEEE-ICASSP, | |
Adelaide, 1994, II.77-80. | |
Xinglei Zhu, G. Beauregard, L. Wyse. Real-Time Signal | |
Estimation from Modified Short-Time Fourier Transform | |
Magnitude Spectra. IEEE Transactions on Audio Speech and | |
Language Processing, 08/2007. | |
""" | |
reg = np.max(X_s) / 1E8 | |
X_best = copy.deepcopy(X_s) | |
for i in range(n_iter): | |
if verbose: | |
print("Runnning iter %i" % i) | |
if i == 0: | |
X_t = invert_spectrogram(X_best, step, calculate_offset=True, | |
set_zero_phase=True,windowtype=windowtype) | |
else: | |
# Calculate offset was False in the MATLAB version | |
# but in mine it massively improves the result | |
# Possible bug in my impl? | |
X_t = invert_spectrogram(X_best, step, calculate_offset=True, | |
set_zero_phase=False,windowtype=windowtype) | |
est = stft(X_t, window_size=window_size, fftsize=fftsize, step=step, compute_onesided=False) | |
phase = est / np.maximum(reg, np.abs(est)) | |
X_best = X_s * phase[:len(X_s)] | |
X_t = invert_spectrogram(X_best, step, calculate_offset=True, | |
set_zero_phase=False) | |
return np.real(X_t) | |
def invert_spectrogram(X_s, step, calculate_offset=True, set_zero_phase=True, windowtype='hamming'): | |
""" | |
Under MSR-LA License | |
Based on MATLAB implementation from Spectrogram Inversion Toolbox | |
References | |
---------- | |
D. Griffin and J. Lim. Signal estimation from modified | |
short-time Fourier transform. IEEE Trans. Acoust. Speech | |
Signal Process., 32(2):236-243, 1984. | |
Malcolm Slaney, Daniel Naar and Richard F. Lyon. Auditory | |
Model Inversion for Sound Separation. Proc. IEEE-ICASSP, | |
Adelaide, 1994, II.77-80. | |
Xinglei Zhu, G. Beauregard, L. Wyse. Real-Time Signal | |
Estimation from Modified Short-Time Fourier Transform | |
Magnitude Spectra. IEEE Transactions on Audio Speech and | |
Language Processing, 08/2007. | |
""" | |
size = int(X_s.shape[1] // 2) | |
wave = np.zeros(int(X_s.shape[0] * step + size)) | |
# Getting overflow warnings with 32 bit... | |
wave = wave.astype('float64') | |
total_windowing_sum = np.zeros(int(X_s.shape[0] * step + size)) | |
if windowtype == 'hamming': | |
win = np.hamming(size) | |
elif windowtype == 'bartlett': | |
win = np.bartlett(size) | |
elif windowtype == 'blackman': | |
win = np.blackman(size) | |
elif windowtype == 'kaiser': | |
win = np.kaiser(size) | |
#win = 0.54 - .46 * np.cos(2 * np.pi * np.arange(window_size) / (window_size - 1)) | |
else: | |
raise ValueError() | |
est_start = int(size // 2) - 1 | |
est_end = est_start + size | |
for i in range(X_s.shape[0]): | |
wave_start = int(step * i) | |
wave_end = int(wave_start + size) | |
if set_zero_phase: | |
spectral_slice = X_s[i].real + 0j | |
else: | |
# already complex | |
spectral_slice = X_s[i] | |
# Don't need fftshift due to different impl. | |
wave_est = np.real(np.fft.ifft(spectral_slice))[::-1] | |
if calculate_offset and i > 0: | |
offset_size = int(size - step) | |
if offset_size <= 0: | |
print("WARNING: Large step size >50\% detected! " | |
"This code works best with high overlap - try " | |
"with 75% or greater") | |
offset_size = step | |
offset = xcorr_offset(wave[wave_start:wave_start + offset_size], | |
wave_est[est_start:est_start + offset_size]) | |
else: | |
offset = 0 | |
wave[wave_start:wave_end] += win * wave_est[ | |
est_start - offset:est_end - offset] | |
total_windowing_sum[wave_start:wave_end] += win | |
wave = np.real(wave) / (total_windowing_sum + 1E-6) | |
return wave | |
def xcorr_offset(x1, x2): | |
""" | |
Under MSR-LA License | |
Based on MATLAB implementation from Spectrogram Inversion Toolbox | |
References | |
---------- | |
D. Griffin and J. Lim. Signal estimation from modified | |
short-time Fourier transform. IEEE Trans. Acoust. Speech | |
Signal Process., 32(2):236-243, 1984. | |
Malcolm Slaney, Daniel Naar and Richard F. Lyon. Auditory | |
Model Inversion for Sound Separation. Proc. IEEE-ICASSP, | |
Adelaide, 1994, II.77-80. | |
Xinglei Zhu, G. Beauregard, L. Wyse. Real-Time Signal | |
Estimation from Modified Short-Time Fourier Transform | |
Magnitude Spectra. IEEE Transactions on Audio Speech and | |
Language Processing, 08/2007. | |
""" | |
x1 = x1 - x1.mean() | |
x2 = x2 - x2.mean() | |
frame_size = len(x2) | |
half = frame_size // 2 | |
corrs = np.convolve(x1.astype('float32'), x2[::-1].astype('float32')) | |
corrs[:half] = -1E30 | |
corrs[-half:] = -1E30 | |
offset = corrs.argmax() - len(x1) | |
return offset | |
def spectrogram_to_mel(spectrogram, mel_filter): | |
mel_spec = spectrogram.dot(mel_filter) | |
return mel_spec | |
def mel_to_spectrogram(mel_spec, mel_inversion_filter, spec_thresh): | |
""" | |
takes in an mel spectrogram and returns a normal spectrogram for inversion | |
""" | |
mel_spec = mel_spec+spec_thresh | |
uncompressed_spec = mel_spec.dot(mel_inversion_filter) | |
return uncompressed_spec-spec_thresh | |
# From https://github.com/jameslyons/python_speech_features | |
def hz2mel(hz): | |
"""Convert a value in Hertz to Mels | |
:param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise. | |
:returns: a value in Mels. If an array was passed in, an identical sized array is returned. | |
""" | |
return 2595 * np.log10(1+hz/700.) | |
def mel2hz(mel): | |
"""Convert a value in Mels to Hertz | |
:param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise. | |
:returns: a value in Hertz. If an array was passed in, an identical sized array is returned. | |
""" | |
return 700*(10**(mel/2595.0)-1) | |
def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None): | |
"""Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond | |
to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1) | |
:param nfilt: the number of filters in the filterbank, default 20. | |
:param nfft: the FFT size. Default is 512. | |
:param samplerate: the samplerate of the signal we are working with. Affects mel spacing. | |
:param lowfreq: lowest band edge of mel filters, default 0 Hz | |
:param highfreq: highest band edge of mel filters, default samplerate/2 | |
:returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter. | |
""" | |
highfreq= highfreq or samplerate/2 | |
assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2" | |
# compute points evenly spaced in mels | |
lowmel = hz2mel(lowfreq) | |
highmel = hz2mel(highfreq) | |
melpoints = np.linspace(lowmel,highmel,nfilt+2) | |
# our points are in Hz, but we use fft bins, so we have to convert | |
# from Hz to fft bin number | |
bin = np.floor((nfft+1)*mel2hz(melpoints)/samplerate) | |
fbank = np.zeros([nfilt,nfft//2]) | |
for j in range(0,nfilt): | |
for i in range(int(bin[j]), int(bin[j+1])): | |
fbank[j,i] = (i - bin[j]) / (bin[j+1]-bin[j]) | |
for i in range(int(bin[j+1]), int(bin[j+2])): | |
fbank[j,i] = (bin[j+2]-i) / (bin[j+2]-bin[j+1]) | |
return fbank | |
def create_mel_filter(fft_size, n_freq_components = 64, start_freq = 300, end_freq = 8000, samplerate=44100): | |
""" | |
Creates a filter to convolve with the spectrogram to get out mels | |
""" | |
mel_inversion_filter = get_filterbanks(nfilt=n_freq_components, | |
nfft=fft_size, samplerate=samplerate, | |
lowfreq=start_freq, highfreq=end_freq) | |
# Normalize filter | |
mel_filter = mel_inversion_filter / mel_inversion_filter.sum(axis=1,keepdims=True) | |
return mel_filter.T, mel_inversion_filter | |
def get_spectrograms(x, window_size, step_size, spec_thresh=6, n_mel_freq_components = 64, start_freq = 0, end_freq=None): | |
fft_size = window_size | |
assert x.shape[0] % fft_size == 0 | |
assert fft_size % step_size == 0 | |
cut = int((fft_size//step_size)/2) | |
windowtype = 'hamming' | |
mel_filter, mel_inversion_filter = create_mel_filter(fft_size = fft_size, | |
n_freq_components = n_mel_freq_components, | |
start_freq = start_freq, | |
end_freq = end_freq) | |
assert not np.isnan(mel_filter.sum()), 'nans in mel filter, try lowering n_mel_freq_components' | |
spec_linear,phase = spectrogram(x.astype('float64'), window_size=window_size, fft_size = fft_size, | |
step_size = step_size, log = True, thresh = spec_thresh,windowtype=windowtype) | |
spec_linear = spec_linear[:-cut] | |
phase = spec_linear[:-cut] | |
spec_mel = spectrogram_to_mel(spec_linear, mel_filter) | |
return spec_linear, phase, spec_mel | |
#SETTINGS | |
n_mel_freq_components = 80 | |
start_freq = 200 | |
spec_thresh = 6 | |
step_size = 64 | |
window_size = 1024 | |
######## GET SPECTOGRAM FOR A SINGLE FILE ######### | |
### Discard phase | |
spec_linear, phase, spec_mel = get_spectrograms(data, window_size, step_size, spec_thresh=spec_thresh, n_mel_freq_components = n_mel_freq_components, start_freq = start_freq, end_freq=None) | |
####### ###### | |
recovered_audio_orig = griffin_lim(spec_linear, fft_size = fft_size, | |
step_size = step_size, log = True, n_iter = 50, pre_raise=1.1,windowtype=windowtype) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment