Last active
July 20, 2018 03:26
-
-
Save dallarosa/a2e129a59fa9845940e8e98d958ca603 to your computer and use it in GitHub Desktop.
Spectrogram images from audio for machine learning
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Based on http://www.frank-zalkow.de/en/code-snippets/create-audio-spectrograms-with-python.html
# This work is licensed under Creative Commons Attribution 3.0 International (Unported), according to the original work.
# Original Author: Frank Zalkow
# This Version's Author: Francisco Dalla Rosa Soares
import numpy as np | |
from matplotlib import pyplot as plt | |
import scipy.io.wavfile as wav | |
from numpy.lib import stride_tricks | |
import math | |
""" short time fourier transform of audio signal """ | |
def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
    """Compute the short-time Fourier transform of an audio signal.

    The signal is left-padded with ``frameSize // 2`` zeros so that the first
    window is centred on sample 0, right-padded with ``frameSize`` zeros so
    the last frame is fully populated, cut into overlapping frames, multiplied
    by the window, and transformed frame by frame with ``np.fft.rfft``.

    Args:
        sig: Sequence of samples. ``np.append`` flattens multi-dimensional
            input, so pass one channel at a time.
        frameSize: Number of samples per frame (positive integer).
        overlapFac: Fraction of a frame shared with the following frame. The
            resulting hop size must lie in ``[1, frameSize]``.
        window: Callable that returns the window coefficients for a given
            frame length.

    Returns:
        Complex array with one row per frame and one column per ``rfft``
        output bin.

    Raises:
        ValueError: If ``frameSize`` is not positive, or if ``overlapFac``
            produces a hop size outside ``[1, frameSize]``.
    """
    if frameSize < 1:
        raise ValueError("frameSize must be a positive integer, got %r" % (frameSize,))

    hopSize = int(frameSize - np.floor(overlapFac * frameSize))
    # A hop of zero would divide by zero below; a hop larger than the frame
    # would let the strided view reach past the end of the padded buffer.
    if not 1 <= hopSize <= frameSize:
        raise ValueError(
            "overlapFac=%r gives hop size %d; it must be between 1 and frameSize (%d)"
            % (overlapFac, hopSize, frameSize)
        )

    win = window(frameSize)

    # zeros at beginning (thus center of 1st window should be for sample nr. 0)
    samples = np.append(np.zeros(frameSize // 2), sig)

    # Number of frames; clamped so that a signal too short for a single hop
    # yields an empty result instead of a negative array dimension.
    cols = max(math.ceil((len(samples) - frameSize) / float(hopSize)) + 1, 0)

    # zeros at end (thus samples can be fully covered by frames)
    samples = np.append(samples, np.zeros(frameSize))

    # Overlapping view onto the padded buffer; copied immediately so the
    # in-place windowing below never writes through aliased memory.
    itemStride = samples.strides[0]
    frames = stride_tricks.as_strided(
        samples,
        shape=(cols, frameSize),
        strides=(itemStride * hopSize, itemStride),
    ).copy()
    frames *= win

    return np.fft.rfft(frames)
""" scale frequency axis logarithmically """ | |
def logscale_spec(spec, sr=44100, factor=20.):
    """Merge the frequency bins of a spectrogram onto a warped frequency axis.

    Evenly spaced positions in [0, 1] are raised to the power ``factor``,
    stretched to the bin-index range, rounded, and de-duplicated. The
    surviving indices become the lower edges of the output bands; every band
    is the sum of the input bins it spans.

    Args:
        spec: 2-D array laid out as (time frames, frequency bins).
        sr: Sample rate used to label the bands with frequencies.
        factor: Exponent of the warping (1.0 keeps every input bin).

    Returns:
        Tuple ``(newspec, freqs)``: the complex128 array of summed bands and
        a list with the mean frequency of each band.
    """
    n_frames, n_bins = np.shape(spec)

    # Warped band edges expressed as unique integer bin indices.
    warped = np.linspace(0, 1, n_bins) ** factor
    warped = warped * ((n_bins - 1) / warped.max())
    edges = np.unique(np.round(warped)).astype(int)

    # Pair every lower edge with the next one; the last band is open-ended.
    bounds = list(zip(edges, list(edges[1:]) + [None]))

    # create spectrogram with new freq bins
    newspec = np.zeros((n_frames, len(edges)), dtype=np.complex128)
    for col, (lo, hi) in enumerate(bounds):
        newspec[:, col] = np.sum(spec[:, lo:hi], axis=1)

    # list center freq of bins
    allfreqs = np.abs(np.fft.fftfreq(n_bins * 2, 1. / sr)[:n_bins + 1])
    freqs = [np.mean(allfreqs[lo:hi]) for lo, hi in bounds]

    return newspec, freqs
""" plot spectrogram""" | |
def plotstft(audiopath, binsize=2**10, plotpath=None, colormap="jet"):
    """Render the spectrogram of a WAV file as an axis-free image.

    The file is read with ``scipy.io.wavfile``, transformed with ``stft``,
    passed through ``logscale_spec`` with ``factor=1.0``, converted to
    decibels, and drawn with ``imshow``.

    Args:
        audiopath: Path of the WAV file to read.
        binsize: Frame size handed to ``stft``.
        plotpath: Destination image path. When falsy the figure is shown
            interactively instead of being saved.
        colormap: Matplotlib colormap name used for the image.

    Returns:
        None. The figure is saved or shown and then closed.
    """
    samplerate, samples = wav.read(audiopath)

    # Multi-channel files come back as (n_samples, n_channels). stft would
    # flatten that and interleave the channels, so average down to mono first.
    if np.ndim(samples) > 1:
        samples = np.mean(samples, axis=1)

    s = stft(samples, binsize)
    sshow, _ = logscale_spec(s, factor=1.0, sr=samplerate)

    # amplitude to decibel, relative to 10e-6 (i.e. 1e-5)
    ims = 20. * np.log10(np.abs(sshow) / 10e-6)
    timebins, freqbins = np.shape(ims)

    fig = plt.figure(figsize=(15, 7.5))
    try:
        plt.imshow(np.transpose(ims), origin="lower", aspect="auto",
                   cmap=colormap, interpolation="none")
        plt.xlim([0, timebins - 1])
        plt.ylim([0, freqbins])
        plt.axis('off')

        if plotpath:
            plt.savefig(plotpath, bbox_inches="tight")
        else:
            plt.show()
    finally:
        # clf() would leave the figure registered with pyplot; closing it
        # releases the memory when many files are processed in a loop.
        plt.close(fig)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment