@mthrok
Last active February 19, 2024 08:30
Compare spectrograms of torchaudio and librosa
import torch
import torchaudio
import librosa
import matplotlib.pyplot as plt
from torchaudio.functional import create_fb_matrix
n_fft = 2048
n_mels = 128
sample_rate = 6000
torchaudio_mel = create_fb_matrix(
    int(n_fft // 2 + 1),
    n_mels=n_mels,
    f_min=0.,
    f_max=sample_rate / 2.,
    sample_rate=sample_rate,
    norm='slaney',
)
librosa_mel = librosa.filters.mel(
    sr=sample_rate,
    n_fft=n_fft,
    n_mels=n_mels,
    fmin=0.,
    fmax=sample_rate / 2.,
    norm='slaney',
    htk=True,
).T
mse = ((torchaudio_mel - librosa_mel) ** 2).mean()
print(f'MSE:\t{mse}')
fig, axs = plt.subplots(1, 2, figsize=(10, 10))
fig.suptitle('mel-filter bank')
axs[0].set_title('torchaudio[slaney]')
axs[0].imshow(torchaudio_mel, aspect='auto')
axs[0].set_ylabel('frequency bin')
axs[0].set_xlabel('mel bin')
axs[1].set_title('librosa[htk + slaney]')
axs[1].imshow(librosa_mel, aspect='auto')
axs[1].set_ylabel('frequency bin')
axs[1].set_xlabel('mel bin')
plt.show()
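Note: create_fb_matrix has since been removed from torchaudio in favor of torchaudio.functional.melscale_fbanks. A minimal sketch of the equivalent call under the newer API (assuming torchaudio >= 0.10, where melscale_fbanks is available; mel_scale='htk' mirrors the htk=True setting used for librosa above):

import torchaudio.functional as F

# Equivalent filter bank with the newer torchaudio API (sketch, assumes torchaudio >= 0.10).
torchaudio_mel = F.melscale_fbanks(
    n_freqs=n_fft // 2 + 1,
    f_min=0.,
    f_max=sample_rate / 2.,
    n_mels=n_mels,
    sample_rate=sample_rate,
    norm='slaney',
    mel_scale='htk',
)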
import librosa
import matplotlib.pyplot as plt
import torch
from torchaudio.transforms import MelSpectrogram
n_fft = 2048
win_len = None
hop_len = 512
n_mels = 128
sample_rate = 6000
path = 'test/torchaudio_unittest/assets/steam-train-whistle-daniel_simon.wav'
waveform, sample_rate = librosa.load(path, sr=sample_rate)
waveform = torch.Tensor(waveform)
torchaudio_melspec = MelSpectrogram(
    sample_rate=sample_rate,
    n_fft=n_fft,
    win_length=win_len,
    hop_length=hop_len,
    center=True,
    pad_mode="reflect",
    power=2.0,
    norm='slaney',
    onesided=True,
    n_mels=n_mels,
)(waveform)
librosa_melspec = librosa.feature.melspectrogram(
    y=waveform.numpy(),
    sr=sample_rate,
    n_fft=n_fft,
    hop_length=hop_len,
    win_length=win_len,
    center=True,
    pad_mode="reflect",
    power=2.0,
    n_mels=n_mels,
    norm='slaney',
    htk=True,
)
mse = ((torchaudio_melspec - librosa_melspec) ** 2).mean()
print(f'MSE:\t{mse}')
fig, axs = plt.subplots(1, 2, figsize=(20, 5))
fig.suptitle('Mel Spectrogram')
axs[0].set_title('torchaudio')
axs[0].set_ylabel('mel bin')
axs[0].set_xlabel('frame')
axs[0].imshow(librosa.power_to_db(torchaudio_melspec), aspect='auto')
axs[1].set_title('librosa')
axs[1].set_ylabel('mel bin')
axs[1].set_xlabel('frame')
axs[1].imshow(librosa.power_to_db(librosa_melspec), aspect='auto')
plt.show()
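Beyond printing the MSE, an element-wise tolerance check makes the agreement easier to read at a glance. A minimal sketch (the tolerance value is an illustrative choice, not something documented by torchaudio or librosa):

import numpy as np

# Element-wise comparison of the two mel spectrograms; atol is an arbitrary
# illustrative tolerance, not a guarantee of either library.
print('allclose:', np.allclose(torchaudio_melspec.numpy(), librosa_melspec, atol=1e-5))
print('max abs diff:', np.abs(torchaudio_melspec.numpy() - librosa_melspec).max())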
import librosa.core.spectrum
import matplotlib.pyplot as plt
import torch
from torchaudio.transforms import Spectrogram
n_fft = 2048
win_len = None
hop_len = 512
sample_rate = 6000
path = 'test/torchaudio_unittest/assets/steam-train-whistle-daniel_simon.wav'
waveform, _ = librosa.load(path, sr=sample_rate)
waveform = torch.Tensor(waveform)
torchaudio_spec = Spectrogram(
    n_fft=n_fft,
    win_length=win_len,
    hop_length=hop_len,
    center=True,
    pad_mode="reflect",
    power=2.0,
)(waveform)
librosa_spec, _ = librosa.core.spectrum._spectrogram(
    y=waveform.numpy(),
    n_fft=n_fft,
    hop_length=hop_len,
    win_length=win_len,
    center=True,
    pad_mode="reflect",
    power=2.0,
)
mse = ((torchaudio_spec - librosa_spec) ** 2).mean()
print(f'MSE:\t{mse}')
fig, axs = plt.subplots(1, 2, figsize=(20, 5))
fig.suptitle('Spectrogram')
axs[0].set_title('torchaudio')
axs[0].set_ylabel('frequency bin')
axs[0].set_xlabel('frame')
axs[0].imshow(librosa.power_to_db(torchaudio_spec), aspect='auto')
axs[1].set_title('librosa')
axs[1].set_ylabel('frequency bin')
axs[1].set_xlabel('frame')
axs[1].imshow(librosa.power_to_db(librosa_spec), aspect='auto')
plt.show()
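librosa.core.spectrum._spectrogram is a private helper, so it may move or change between librosa releases. The same power spectrogram can be computed through the public STFT API; a minimal sketch of that alternative (pad_mode="reflect" is passed explicitly to match the snippet above):

import numpy as np

# Public-API equivalent of the private _spectrogram call above:
# the magnitude STFT raised to power 2.0 is the power spectrogram.
stft = librosa.stft(
    waveform.numpy(),
    n_fft=n_fft,
    hop_length=hop_len,
    win_length=win_len,
    center=True,
    pad_mode="reflect",
)
librosa_spec_public = np.abs(stft) ** 2.0
print('max abs diff vs private helper:', np.abs(librosa_spec_public - librosa_spec).max())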
mthrok commented Feb 13, 2022

@jerpint

Not among the set of compatible parameters. Please refer to https://pytorch.org/audio/0.10.0/tutorials/audio_feature_extractions_tutorial.html#mel-filter-bank for a better explanation.
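To make the incompatibility concrete: the snippets above pair torchaudio's HTK mel scale with librosa's htk=True; if the librosa side is left at its default Slaney mel scale (htk=False) instead, the filter banks no longer match. A minimal sketch reusing n_fft, n_mels, and sample_rate from the first snippet (melscale_fbanks assumes a newer torchaudio where that function exists):

import numpy as np
import librosa
import torchaudio.functional as F

# Mismatched mel scales (HTK on the torchaudio side, Slaney on the librosa side)
# are an example of parameters outside the compatible set, so the error is large.
ta_fb = F.melscale_fbanks(
    n_freqs=n_fft // 2 + 1,
    f_min=0.,
    f_max=sample_rate / 2.,
    n_mels=n_mels,
    sample_rate=sample_rate,
    norm='slaney',
    mel_scale='htk',
)
lr_fb = librosa.filters.mel(
    sr=sample_rate,
    n_fft=n_fft,
    n_mels=n_mels,
    fmin=0.,
    fmax=sample_rate / 2.,
    norm='slaney',
    htk=False,
).T
print('MSE (mismatched mel scales):', np.mean((ta_fb.numpy() - lr_fb) ** 2))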
