Last active
April 2, 2017 22:32
-
-
Save casperkaae/77a507ddf5a83b20a0dc6bf08c9d01b8 to your computer and use it in GitHub Desktop.
sound fun
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages we're using | |
import numpy as np | |
import copy | |
import soundfile as sf | |
def overlap(X, window_size, window_step): | |
""" | |
Create an overlapped version of X | |
Parameters | |
---------- | |
X : ndarray, shape=(n_samples,) | |
Input signal to window and overlap | |
window_size : int | |
Size of windows to take | |
window_step : int | |
Step size between windows | |
Returns | |
------- | |
X_strided : shape=(n_windows, window_size) | |
2D array of overlapped X | |
""" | |
if window_size % 2 != 0: | |
raise ValueError("Window size must be even!") | |
# Make sure there are an even number of windows before stridetricks | |
append = np.zeros((window_size - len(X) % window_size)) | |
X = np.hstack((X, append)) | |
ws = window_size | |
ss = window_step | |
a = X | |
valid = int(len(a) - ws) | |
nw = int((valid) // ss) | |
out = np.ndarray((nw,ws),dtype = a.dtype) | |
for i in range(nw): | |
# "slide" the window along the samples | |
start = int(i * ss) | |
stop = int(start + ws) | |
out[i] = a[start : stop] | |
return out | |
def stft(X, window_size=128, fftsize=256, step=65, mean_normalize=True, real=False, | |
compute_onesided=True, windowtype='hamming'): | |
""" | |
Compute STFT for 1D real valued input X | |
""" | |
if real: | |
local_fft = np.fft.rfft | |
cut = -1 | |
else: | |
local_fft = np.fft.fft | |
cut = None | |
if compute_onesided: | |
cut = fftsize // 2 | |
if mean_normalize: | |
X -= X.mean() | |
X = overlap(X, window_size, step) | |
if windowtype == 'hamming': | |
win = np.hamming(window_size) | |
elif windowtype == 'bartlett': | |
win = np.bartlett(window_size) | |
elif windowtype == 'blackman': | |
win = np.blackman(window_size) | |
elif windowtype == 'kaiser': | |
win = np.kaiser(window_size) | |
#win = 0.54 - .46 * np.cos(2 * np.pi * np.arange(window_size) / (window_size - 1)) | |
else: | |
raise ValueError() | |
X = X * win[None] | |
X = local_fft(X,fftsize)[:, :cut] | |
return X | |
def spectrogram(d,log = True, thresh= 5, window_size = 256, fft_size = 512, step_size = 64, windowtype='hamming'): | |
""" | |
creates a spectrogram | |
log: take the log of the spectrgram | |
thresh: threshold minimum power for log spectrogram | |
""" | |
F = stft(d, window_size= window_size, fftsize=fft_size, step=step_size, real=False, | |
compute_onesided=True, windowtype=windowtype) | |
specgram = np.abs(F) | |
phase = np.angle(F) | |
if log == True: | |
specgram /= specgram.max() # volume normalize to max 1 | |
specgram = np.log10(specgram) # take log | |
specgram[specgram < -thresh] = -thresh # set anything less than the threshold as the threshold | |
else: | |
specgram[specgram < thresh] = thresh # set anything less than the threshold as the threshold | |
return specgram, phase | |
def spectrogram_to_mel(spectrogram, mel_filter): | |
mel_spec = spectrogram.dot(mel_filter) | |
return mel_spec | |
def hz2mel(hz): | |
"""Convert a value in Hertz to Mels | |
:param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise. | |
:returns: a value in Mels. If an array was passed in, an identical sized array is returned. | |
""" | |
return 2595 * np.log10(1+hz/700.) | |
def mel2hz(mel): | |
"""Convert a value in Mels to Hertz | |
:param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise. | |
:returns: a value in Hertz. If an array was passed in, an identical sized array is returned. | |
""" | |
return 700*(10**(mel/2595.0)-1) | |
def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None): | |
"""Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond | |
to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1) | |
:param nfilt: the number of filters in the filterbank, default 20. | |
:param nfft: the FFT size. Default is 512. | |
:param samplerate: the samplerate of the signal we are working with. Affects mel spacing. | |
:param lowfreq: lowest band edge of mel filters, default 0 Hz | |
:param highfreq: highest band edge of mel filters, default samplerate/2 | |
:returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter. | |
""" | |
highfreq= highfreq or samplerate/2 | |
assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2" | |
# compute points evenly spaced in mels | |
lowmel = hz2mel(lowfreq) | |
highmel = hz2mel(highfreq) | |
melpoints = np.linspace(lowmel,highmel,nfilt+2) | |
# our points are in Hz, but we use fft bins, so we have to convert | |
# from Hz to fft bin number | |
bin = np.floor((nfft+1)*mel2hz(melpoints)/samplerate) | |
fbank = np.zeros([nfilt,nfft//2]) | |
for j in range(0,nfilt): | |
for i in range(int(bin[j]), int(bin[j+1])): | |
fbank[j,i] = (i - bin[j]) / (bin[j+1]-bin[j]) | |
for i in range(int(bin[j+1]), int(bin[j+2])): | |
fbank[j,i] = (bin[j+2]-i) / (bin[j+2]-bin[j+1]) | |
return fbank | |
def create_mel_filter(fft_size, n_freq_components = 64, start_freq = 300, end_freq = 8000, samplerate=44100): | |
""" | |
Creates a filter to convolve with the spectrogram to get out mels | |
""" | |
mel_inversion_filter = get_filterbanks(nfilt=n_freq_components, | |
nfft=fft_size, samplerate=samplerate, | |
lowfreq=start_freq, highfreq=end_freq) | |
# Normalize filter | |
mel_filter = mel_inversion_filter / mel_inversion_filter.sum(axis=1,keepdims=True) | |
return mel_filter.T, mel_inversion_filter | |
def get_spectrograms(x, window_size, step_size, spec_thresh=6, n_mel_freq_components = 64, start_freq = 0, end_freq=None, samplerate=22050): | |
fft_size = window_size | |
n = x.shape[0] | |
x = x[:n-n%fft_size] | |
assert x.shape[0] % fft_size == 0 | |
assert fft_size % step_size == 0 | |
cut = int((fft_size//step_size)/2) | |
windowtype = 'hamming' | |
mel_filter, mel_inversion_filter = create_mel_filter(fft_size = fft_size, | |
n_freq_components = n_mel_freq_components, | |
start_freq = start_freq, | |
end_freq = end_freq, | |
samplerate = samplerate) | |
assert not np.isnan(mel_filter.sum()), 'nans in mel filter, try lowering n_mel_freq_components' | |
spec_linear,phase = spectrogram(x.astype('float64'), window_size=window_size, fft_size = fft_size, | |
step_size = step_size, log = True, thresh = spec_thresh,windowtype=windowtype) | |
spec_linear = spec_linear[:-cut] | |
phase = spec_linear[:-cut] | |
spec_mel = spectrogram_to_mel(spec_linear, mel_filter) | |
return x, spec_linear, phase, spec_mel | |
#SETTINGS | |
n_mel_freq_components = 80 | |
start_freq = 200 | |
spec_thresh = 6 | |
step_size = 64 | |
window_size = 1024 | |
samplerate = 22050 | |
######## GET SPECTOGRAM FOR A SINGLE FILE ######### | |
### Discard phase | |
xshortened, spec_linear, phase, spec_mel = get_spectrograms(data, window_size, step_size, spec_thresh=spec_thresh, n_mel_freq_components = n_mel_freq_components, start_freq = start_freq, end_freq=None, samplerate=samplerate) | |
####### ###### | |
recovered_audio_orig = griffin_lim(spec_linear, fft_size = fft_size, | |
step_size = step_size, log = True, n_iter = 50, pre_raise=1.1,windowtype=windowtype) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment