Skip to content

Instantly share code, notes, and snippets.

@jesseengel
Last active May 3, 2021 09:21
Show Gist options
  • Save jesseengel/acc922d6ee96fcc03e2f55c73d4bc42e to your computer and use it in GitHub Desktop.
Save jesseengel/acc922d6ee96fcc03e2f55c73d4bc42e to your computer and use it in GitHub Desktop.
Mel Scaling
# Mel spectrum constants. The conversions in this file use
#   mel = _MEL_HIGH_FREQUENCY_Q * ln(1 + hertz / _MEL_BREAK_FREQUENCY_HERTZ)
# and its inverse.
_MEL_BREAK_FREQUENCY_HERTZ = 700.0
_MEL_HIGH_FREQUENCY_Q = 1127.0
def mel_to_hertz(mel_values):
    """Maps values on the mel scale back to linear frequencies in Hertz.

    Computes `break_freq * (exp(mel / Q) - 1)` element-wise, using the
    module-level mel constants.

    Args:
      mel_values: Scalar or array-like of mel-scale values.

    Returns:
      NumPy scalar or array of frequencies in Hertz, same shape as the input.
    """
    mels = np.array(mel_values)
    growth = np.exp(mels / _MEL_HIGH_FREQUENCY_Q)
    return _MEL_BREAK_FREQUENCY_HERTZ * (growth - 1.0)
def hertz_to_mel(frequencies_hertz):
    """Maps linear frequencies in Hertz onto the mel scale.

    Computes `Q * ln(1 + hertz / break_freq)` element-wise, using the
    module-level mel constants.

    Args:
      frequencies_hertz: Scalar or array-like of frequencies in Hertz.

    Returns:
      NumPy scalar or array of mel-scale values, same shape as the input.
    """
    hertz = np.array(frequencies_hertz)
    ratio = hertz / _MEL_BREAK_FREQUENCY_HERTZ
    return _MEL_HIGH_FREQUENCY_Q * np.log(1.0 + ratio)
def linear_to_mel_weight_matrix(num_mel_bins=20,
                                num_spectrogram_bins=129,
                                sample_rate=16000,
                                lower_edge_hertz=125.0,
                                upper_edge_hertz=3800.0):
    """Returns a matrix to warp linear scale spectrograms to the mel scale.

    Adapted from tf.signal.linear_to_mel_weight_matrix with a minimum
    band width (in Hz scale) of 1.5 * freq_bin. The matrix is built entirely
    with NumPy; there are no Tensor inputs or outputs.

    Args:
      num_mel_bins: Int, number of output frequency dimensions.
      num_spectrogram_bins: Int, number of input frequency dimensions.
      sample_rate: Int, sample rate of the audio.
      lower_edge_hertz: Float, lowest frequency to consider.
      upper_edge_hertz: Float, highest frequency to consider.

    Returns:
      NumPy matrix of shape [num_spectrogram_bins, num_mel_bins]. The first
      row (the DC bin) is all zeros. No dtype cast is applied here, so the
      result keeps whatever dtype NumPy computed (float64 for plain Python
      number arguments).
      NOTE(review): earlier documentation promised a float32 result cast
      from float64; this function performs no such cast, so callers that
      need float32 must cast the result themselves.

    Raises:
      ValueError: Input argument in the wrong range.
    """
    # Validate input arguments
    if num_mel_bins <= 0:
        raise ValueError('num_mel_bins must be positive. Got: %s' % num_mel_bins)
    if num_spectrogram_bins <= 0:
        raise ValueError(
            'num_spectrogram_bins must be positive. Got: %s' % num_spectrogram_bins)
    if sample_rate <= 0.0:
        raise ValueError('sample_rate must be positive. Got: %s' % sample_rate)
    if lower_edge_hertz < 0.0:
        raise ValueError(
            'lower_edge_hertz must be non-negative. Got: %s' % lower_edge_hertz)
    if lower_edge_hertz >= upper_edge_hertz:
        raise ValueError('lower_edge_hertz %.1f >= upper_edge_hertz %.1f' %
                         (lower_edge_hertz, upper_edge_hertz))
    if upper_edge_hertz > sample_rate / 2:
        raise ValueError('upper_edge_hertz must not be larger than the Nyquist '
                         'frequency (sample_rate / 2). Got: %s for sample_rate: %s'
                         % (upper_edge_hertz, sample_rate))
    # HTK excludes the spectrogram DC bin.
    bands_to_zero = 1
    nyquist_hertz = sample_rate / 2.0
    # Column vector [num_spectrogram_bins - bands_to_zero, 1] of bin
    # frequencies in Hertz, with the DC bin sliced off.
    linear_frequencies = np.linspace(
        0.0, nyquist_hertz, num_spectrogram_bins)[bands_to_zero:, np.newaxis]
    # Compute num_mel_bins triples of (lower_edge, center, upper_edge). The
    # center of each band is the lower and upper edge of the adjacent bands.
    # Accordingly, we place num_mel_bins + 2 equally spaced points (in mel)
    # on [lower_edge_hertz, upper_edge_hertz].
    band_edges_mel = np.linspace(
        hertz_to_mel(lower_edge_hertz), hertz_to_mel(upper_edge_hertz),
        num_mel_bins + 2)
    # NOTE(review): these three are overlapping basic slices (views) of
    # band_edges_mel, so the in-place writes in the loop below also change
    # the center/edge values seen by neighbouring bands. Later iterations
    # and the final Hertz conversion read those modified values. Confirm
    # this coupling is intended before refactoring (e.g. adding .copy()).
    lower_edge_mel = band_edges_mel[0:-2]
    center_mel = band_edges_mel[1:-1]
    upper_edge_mel = band_edges_mel[2:]
    # Minimum allowed band width in Hertz: 1.5 frequency bins.
    # NOTE(review): freq_res divides by num_spectrogram_bins, whereas the
    # linspace above spaces bins by nyquist / (num_spectrogram_bins - 1);
    # confirm which spacing is intended.
    freq_res = nyquist_hertz / float(num_spectrogram_bins)
    freq_th = 1.5 * freq_res
    for i in range(0, num_mel_bins):
        center_hz = mel_to_hertz(center_mel[i])
        lower_hz = mel_to_hertz(lower_edge_mel[i])
        upper_hz = mel_to_hertz(upper_edge_mel[i])
        if upper_hz - lower_hz < freq_th:
            # Band is narrower than the threshold: re-center its edges
            # symmetrically (in mel) around the center. dm is
            # Q * asinh(rhs), which makes the widened band span exactly
            # freq_th in Hertz.
            rhs = 0.5 * freq_th / (center_hz + _MEL_BREAK_FREQUENCY_HERTZ)
            dm = _MEL_HIGH_FREQUENCY_Q * np.log(rhs + np.sqrt(1.0 + rhs**2))
            lower_edge_mel[i] = center_mel[i] - dm
            upper_edge_mel[i] = center_mel[i] + dm
    # Convert the (possibly adjusted) edges back to Hertz as row vectors
    # [1, num_mel_bins] so they broadcast against linear_frequencies.
    lower_edge_hz = mel_to_hertz(lower_edge_mel)[np.newaxis, :]
    center_hz = mel_to_hertz(center_mel)[np.newaxis, :]
    upper_edge_hz = mel_to_hertz(upper_edge_mel)[np.newaxis, :]
    # Calculate lower and upper slopes for every spectrogram bin.
    # The triangles are evaluated in Hertz here: both the bin frequencies
    # and the band edges used below are Hertz values.
    lower_slopes = (linear_frequencies - lower_edge_hz) / (
        center_hz - lower_edge_hz)
    upper_slopes = (upper_edge_hz - linear_frequencies) / (
        upper_edge_hz - center_hz)
    # Intersect the line segments with each other and zero.
    mel_weights_matrix = np.maximum(0.0, np.minimum(lower_slopes, upper_slopes))
    # Re-add the zeroed lower bins we sliced out above.
    # [freq, mel]
    mel_weights_matrix = np.pad(mel_weights_matrix, [[bands_to_zero, 0], [0, 0]],
                                'constant')
    return mel_weights_matrix
def _linear_to_mel_matrix(self):
    """Builds the linear-to-mel weight matrix for this object's settings.

    Uses `self._nfft // 2` linear frequency bins, reduces that count by the
    integer factor `self._mel_downscale` to get the number of mel bins, and
    spans the band from 0 Hz up to half of `self._sample_rate`.

    Returns:
      Whatever `spectral_ops.linear_to_mel_weight_matrix` returns for those
      arguments.
    """
    n_linear = self._nfft // 2
    band_low_hz = 0.0
    band_high_hz = self._sample_rate / 2.0
    n_mel = n_linear // self._mel_downscale
    return spectral_ops.linear_to_mel_weight_matrix(
        n_mel, n_linear, self._sample_rate, band_low_hz, band_high_hz)
def _mel_to_linear_matrix(self):
"""Get the inverse mel transformation matrix."""
m = self._linear_to_mel_matrix()
m_t = np.transpose(m)
p = np.matmul(m, m_t)
d = [1.0 / x if np.abs(x) > 1.0e-8 else x for x in np.sum(p, axis=0)]
return np.matmul(m_t, np.diag(d))
def specgrams_to_melspecgrams(self, specgrams):
    """Converts specgrams to melspecgrams.

    The body reads `self._mel_downscale`, `self._linear_to_mel_matrix()` and
    `self._safe_log`, so this is a method and takes `self`.

    Args:
      specgrams: Tensor of log magnitudes and instantaneous frequencies,
        shape [batch, time, freq, 2].

    Returns:
      melspecgrams: Tensor of shape [batch, time, mel_freq, 2]. Channel 0 is
        `self._safe_log` of the mel-warped squared magnitude; channel 1 is
        the instantaneous frequency of the mel-warped phase. If
        `self._mel_downscale` is None the input is returned unchanged, the
        same pass-through that `melspecgrams_to_specgrams` applies.
    """
    # No mel scaling configured: nothing to convert.
    if self._mel_downscale is None:
        return specgrams

    logmag = specgrams[:, :, :, 0]
    p = specgrams[:, :, :, 1]

    # Squared magnitude, and phase recovered by integrating the
    # instantaneous frequency (stored in units of pi) along axis -2.
    mag2 = tf.exp(2.0 * logmag)
    phase_angle = tf.cumsum(p * np.pi, axis=-2)

    # tf.cast replaces the deprecated tf.to_float (same float32 result).
    l2mel = tf.cast(self._linear_to_mel_matrix(), tf.float32)
    logmelmag2 = self._safe_log(tf.tensordot(mag2, l2mel, 1))
    mel_phase_angle = tf.tensordot(phase_angle, l2mel, 1)
    mel_p = spectral_ops.instantaneous_frequency(mel_phase_angle)

    return tf.concat(
        [logmelmag2[:, :, :, tf.newaxis], mel_p[:, :, :, tf.newaxis]],
        axis=-1)
def melspecgrams_to_specgrams(self, melspecgrams):
    """Converts melspecgrams to specgrams.

    The body reads `self._mel_downscale`, `self._mel_to_linear_matrix()` and
    `self._safe_log`, so this is a method and takes `self`.

    Args:
      melspecgrams: Tensor of shape [batch, time, mel_freq, 2], mel scaling
        of frequencies. Channel 0 is read as the log of the mel-warped
        squared magnitude; channel 1 as mel instantaneous frequency.

    Returns:
      specgrams: Tensor of log magnitudes and instantaneous frequencies,
        shape [batch, time, freq, 2]. If `self._mel_downscale` is None the
        input is returned unchanged.
    """
    # No mel scaling configured: nothing to convert.
    if self._mel_downscale is None:
        return melspecgrams

    logmelmag2 = melspecgrams[:, :, :, 0]
    mel_p = melspecgrams[:, :, :, 1]

    # tf.cast replaces the deprecated tf.to_float (same float32 result).
    mel2l = tf.cast(self._mel_to_linear_matrix(), tf.float32)

    # Map squared magnitude back to linear bins; the factor 0.5 turns the
    # log of a squared magnitude into a log magnitude.
    mag2 = tf.tensordot(tf.exp(logmelmag2), mel2l, 1)
    logmag = 0.5 * self._safe_log(mag2)

    # Integrate instantaneous frequency (units of pi) along axis -2 to get
    # phase, warp it back to linear bins, then differentiate again.
    mel_phase_angle = tf.cumsum(mel_p * np.pi, axis=-2)
    phase_angle = tf.tensordot(mel_phase_angle, mel2l, 1)
    p = spectral_ops.instantaneous_frequency(phase_angle)

    return tf.concat(
        [logmag[:, :, :, tf.newaxis], p[:, :, :, tf.newaxis]], axis=-1)
def instantaneous_frequency(phase_angle, time_axis=-2):
    """Transform a fft tensor from phase angle to instantaneous frequency.

    Passes the phase through `unwrap` and then `diff` along `time_axis`,
    prepends the first unwrapped frame so the size along `time_axis` matches
    the input, and divides the result by pi.

    Args:
      phase_angle: Tensor of angles in radians. [Batch, Time, Freqs]
      time_axis: Axis over which to unwrap and take finite difference.

    Returns:
      dphase: Instantaneous frequency (derivative of phase). Same size as
        input.
    """
    unwrapped = unwrap(phase_angle, axis=time_axis)
    delta = diff(unwrapped, axis=time_axis)

    # Take the first frame along time_axis: full extent in every other
    # dimension, length 1 along time_axis, starting at the origin.
    slice_size = unwrapped.get_shape().as_list()
    slice_size[time_axis] = 1
    slice_begin = [0] * len(slice_size)
    first_frame = tf.slice(unwrapped, slice_begin, slice_size)

    # Prepend the initial phase to the differences and rescale by pi.
    return tf.concat([first_frame, delta], axis=time_axis) / np.pi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment