"""
Let's have everything as functionals first, then wrap them as layers.

Everything should work for both batched and non-batched input, with the
following axis convention enforced:
- freq = height
- time = width
- channel = channel

Least dependency:
- no librosa, no scipy.
"""
import torch
def STFT(src, n_fft, hop_length=None, window=None, **kwargs):
    """Compute a short-time Fourier transform with ``torch.stft`` and presets.

    The returned value keeps both real and imaginary parts.
    For the STFT magnitude, see ``spectrogram``.

    Args:
        src: Input signal tensor, forwarded to ``torch.stft`` as is.
        n_fft: FFT size.
        hop_length: Hop between successive frames. Defaults to ``n_fft // 4``.
        window: Analysis window tensor. Defaults to a Hann window of length
            ``n_fft`` created on ``src``'s device (and with ``src``'s dtype
            when that dtype is floating point).
        **kwargs: Extra keyword arguments forwarded to ``torch.stft``.

    Returns:
        By default, a real tensor whose trailing dimension of size 2 holds
        the real and imaginary parts. If the caller passes ``return_complex``
        explicitly, the output of ``torch.stft`` is returned untouched.

    Note:
        The default path relies on ``torch.stft(..., return_complex=True)``
        and ``torch.view_as_real`` (PyTorch >= 1.7).
    """
    if hop_length is None:
        hop_length = n_fft // 4
    if window is None:
        # Build the window where the signal lives; a CPU window combined with
        # a signal on another device makes torch.stft fail.
        window_dtype = src.dtype if src.is_floating_point() else None
        window = torch.hann_window(n_fft, dtype=window_dtype, device=src.device)

    if "return_complex" in kwargs:
        # The caller chose the output format explicitly; honour it.
        return torch.stft(src, n_fft, hop_length, window=window, **kwargs)

    # Recent PyTorch refuses real input unless ``return_complex`` is given, so
    # request a complex result and expose it in the (real, imag) layout.
    complex_spec = torch.stft(
        src, n_fft, hop_length, window=window, return_complex=True, **kwargs
    )
    return torch.view_as_real(complex_spec)
def spectrogram(src, n_fft, hop_length=None, window=None, power=2.0, **kwargs):
    """Return the STFT magnitude of ``src`` raised to ``power``.

    With the default ``power=2.0`` this is the power spectrogram; pass
    ``power=1.0`` for the plain magnitude.

    Args:
        src: Input signal tensor, forwarded to ``STFT``.
        n_fft: FFT size.
        hop_length: Hop between frames, forwarded to ``STFT``.
        window: Analysis window, forwarded to ``STFT``.
        power: Exponent applied to the magnitude.
        **kwargs: Extra keyword arguments forwarded to ``STFT``.

    Returns:
        ``(real ** 2 + imag ** 2) ** (power / 2)`` of the STFT.
    """
    stft = STFT(src, n_fft, hop_length=hop_length, window=window, **kwargs)
    if torch.is_complex(stft):
        # ``return_complex=True`` was forwarded: there is no trailing
        # (real, imag) axis to reduce, so summing over -1 would collapse the
        # wrong dimension.
        squared_magnitude = stft.real.pow(2) + stft.imag.pow(2)
    else:
        # Trailing axis of size 2 holds the real and imaginary parts.
        squared_magnitude = stft.pow(2).sum(-1)
    return squared_magnitude.pow(power / 2.0)
def melspectrogram(src, n_fft, hop_length, window, sr, power=2.0, **kwargs):
    """Compute a mel spectrogram (unfinished).

    Only the linear-frequency spectrogram is computed so far; the projection
    onto a mel basis is still to be written, so the function currently
    returns ``None``.

    Args:
        src: Input signal tensor, forwarded to ``spectrogram``.
        n_fft: FFT size.
        hop_length: Hop between frames.
        window: Analysis window.
        sr: Sampling rate; not used yet (presumably needed for the mel
            basis -- TODO confirm).
        power: Exponent applied to the STFT magnitude.
        **kwargs: Extra keyword arguments forwarded to ``spectrogram``.
    """
    # Forward the caller's ``power`` instead of a hard-coded 2.0 so the
    # argument is no longer silently ignored.
    specgram = spectrogram(src, n_fft, hop_length, window, power=power, **kwargs)
    # TODO: mel basis matrix
def amplitude_to_db():
    """Placeholder for amplitude-to-dB magnitude scaling.

    Not implemented yet: takes no arguments and returns ``None``.
    """
    return None
def power_to_db():
    """Placeholder for power-to-dB magnitude scaling.

    Not implemented yet: takes no arguments and returns ``None``.
    """
    return None
def pseudo_cqt():
    """Placeholder for computing a pseudo-CQT.

    Not implemented yet: takes no arguments and returns ``None``.
    """
    return None
# To inherit from torchaudio
def compose():
    """Placeholder for ``compose``, planned to be inherited from torchaudio.

    Not implemented yet: takes no arguments and returns ``None``.
    """
def scale():
    """Placeholder for ``scale``, planned to be inherited from torchaudio.

    Not implemented yet: takes no arguments and returns ``None``.
    """
def pad_trim():
    """Placeholder for ``pad_trim``, planned to be inherited from torchaudio.

    Not implemented yet: takes no arguments and returns ``None``.
    """
def downmix():
    """Placeholder for ``downmix``, planned to be inherited from torchaudio.

    Not implemented yet: takes no arguments and returns ``None``.
    """
def mu_law_encoding():
    """Placeholder for ``mu_law_encoding``, planned to be inherited from torchaudio.

    Not implemented yet: takes no arguments and returns ``None``.
    """
def mu_law_decoding():
    """Placeholder for ``mu_law_decoding``, planned to be inherited from torchaudio.

    Not implemented yet: takes no arguments and returns ``None``.
    """
""" | |
wrap up all the functionals | |
""" |