-
-
Save aFewThings/f4dde48993709ab67e7223e75c749d9d to your computer and use it in GitHub Desktop.
Compare mel spectrograms of torchaudio and librosa
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compare the mel spectrograms produced by torchaudio and librosa for the
# same audio clip: print their mean squared error and plot both side by side.
import librosa
import matplotlib.pyplot as plt
import torch
from torchaudio.transforms import MelSpectrogram

# Analysis parameters shared by both libraries.
n_fft = 2048
win_len = None  # None lets each library fall back to its own default window length
hop_len = 512
n_mels = 128
sample_rate = 8000

path = 'test/torchaudio_unittest/assets/steam-train-whistle-daniel_simon.wav'

# librosa resamples to `sample_rate` on load and returns the rate it used.
waveform, sample_rate = librosa.load(path, sr=sample_rate)
waveform = torch.as_tensor(waveform, dtype=torch.float32)

# torchaudio side: pad_mode, norm and mel_scale are set explicitly so the
# configuration lines up with the librosa call below.
torchaudio_melspec = MelSpectrogram(
    sample_rate=sample_rate,
    n_fft=n_fft,
    win_length=win_len,
    hop_length=hop_len,
    center=True,
    pad_mode="constant",
    power=2.0,
    norm='slaney',
    mel_scale='slaney',
    n_mels=n_mels,
)(waveform)

# librosa side: `htk` is left at its default (False) on purpose; passing
# htk=True would switch the mel scale away from the one requested above.
librosa_melspec = librosa.feature.melspectrogram(
    y=waveform.numpy(),
    sr=sample_rate,
    n_fft=n_fft,
    hop_length=hop_len,
    win_length=win_len,
    center=True,
    pad_mode="constant",
    power=2.0,
    n_mels=n_mels,
    norm='slaney',
)

# Convert to NumPy once so the arithmetic and plotting below do not depend on
# implicit mixing of torch.Tensor and numpy.ndarray operands.
torchaudio_melspec = torchaudio_melspec.numpy()

mse = ((torchaudio_melspec - librosa_melspec) ** 2).mean()
print(f'MSE:\t{mse}')

fig, axs = plt.subplots(1, 2, figsize=(20, 5))
fig.suptitle('Mel Spectrogram')
panels = (
    ('torchaudio', torchaudio_melspec),
    ('librosa', librosa_melspec),
)
for ax, (title, melspec) in zip(axs, panels):
    ax.set_title(title)
    ax.set_ylabel('mel bin')
    ax.set_xlabel('frame')
    # Show power on a dB scale so low-energy bins remain visible.
    ax.imshow(librosa.power_to_db(melspec), aspect='auto')
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
torchaudio's MelSpectrogram with pad_mode="constant", norm="slaney", and mel_scale="slaney" produces results comparable to librosa's melspectrogram (with default parameters).
See pytorch/audio#1058