Last active
July 26, 2021 12:56
-
-
Save iver56/02daea5f35db744d21bdad2d25c30e22 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import subprocess | |
import tempfile | |
import uuid | |
from pathlib import Path | |
import numpy as np | |
from scipy.io.wavfile import write | |
def convert_float_samples_to_int16(y, clamp_values=True, dither=True):
    """
    Convert floating-point audio samples to int16.

    Samples are scaled by the int16 maximum (32767), so 1.0 maps to 32767.
    The input is never modified; the work is done on a copy.

    :param y: Floating-point samples, as a numpy array of any shape or as an
        array-like of floats (e.g. a list).
    :param clamp_values: Clip the scaled values to the int16 range
        [-32768, 32767] before the cast. This avoids integer overflow or
        underflow, which results in wrap distortion, which sounds worse than
        clipping distortion.
    :param dither: Whether or not to apply dithering. Dithering alleviates
        quantization noise.
        See https://www.youtube.com/watch?v=zWpWIQw7HWU for an explanation on dither.
        The dithering noise is triangular, spanning (-1, 1) int16 steps, and is
        drawn from numpy's global random state.
    :return: int16 numpy array with the same shape as the input.
    :raises ValueError: If the samples are not floating-point.
    """
    samples = np.asanyarray(y)
    if not issubclass(samples.dtype.type, np.floating):
        raise ValueError("input samples not floating-point")

    int16_info = np.iinfo(np.int16)

    # float16 cannot represent every integer up to 32767 (0.5 * 32767 would
    # come out as 16384 and 1.0 * 32767 as 32768), so scale in at least
    # float32. float32 and wider inputs keep their own precision.
    work_dtype = np.result_type(samples.dtype, np.float32)
    scaled = samples.astype(work_dtype)  # astype copies, so `y` stays untouched
    scaled *= int16_info.max

    if dither:
        scaled += np.random.triangular(-1, 0, 1, size=scaled.shape)

    if clamp_values:
        np.clip(scaled, int16_info.min, int16_info.max, out=scaled)

    # NOTE: the float -> int16 cast truncates toward zero (it does not round).
    return scaled.astype(np.int16)
def calculate_visqol_in_audio_mode(
    degraded_audio: np.ndarray, reference_audio: np.ndarray, sample_rate: int
):
    """
    Given an audio pair (a degraded audio and a reference/target audio),
    return a MOS-LQO (Mean Opinion Score - Listening Quality Objective) score.
    MOS-LQO scores range from 1 (the worst) to 5 (the best).
    This uses VISQOL's "audio mode" (48 kHz), not "speech mode" (16 kHz).

    Both signals are written as 16-bit WAV files into the system temp directory,
    which is mounted at /data in a "jonashaag/visqol:v3" Docker container. The
    score is parsed from the container's output. The temporary WAV files are
    removed afterwards, also when writing them or running the container fails.

    :param degraded_audio: Floating-point samples with shape (1, num_samples).
    :param reference_audio: Floating-point samples with shape (1, num_samples).
    :param sample_rate: Must be 48000.
    :return: The MOS-LQO score as a float.
    :raises AssertionError: If the sample rate or the array shapes are not as
        required above.
    :raises subprocess.CalledProcessError: If the docker command exits with a
        non-zero status.
    :raises subprocess.TimeoutExpired: If the docker command takes longer than
        60 seconds.
    :raises Exception: If the command output does not contain a MOS-LQO value.
    """
    assert sample_rate == 48000
    assert degraded_audio.ndim == 2
    assert reference_audio.ndim == 2
    assert degraded_audio.shape[0] == 1
    assert reference_audio.shape[0] == 1

    tmp_dir = Path(tempfile.gettempdir())
    # Random names so that concurrent calls cannot collide in the shared temp dir.
    degraded_audio_file_path = tmp_dir / (str(uuid.uuid4()) + ".wav")
    reference_audio_file_path = tmp_dir / (str(uuid.uuid4()) + ".wav")

    try:
        # Transpose from (1, num_samples) to (num_samples, 1) for the WAV writer.
        write(
            degraded_audio_file_path,
            sample_rate,
            convert_float_samples_to_int16(degraded_audio).T,
        )
        write(
            reference_audio_file_path,
            sample_rate,
            convert_float_samples_to_int16(reference_audio).T,
        )

        command_args = [
            "docker",
            "run",
            "--rm",
            "-t",
            "-v",
            "{}:/data".format(tmp_dir.as_posix()),
            "jonashaag/visqol:v3",
            "--degraded_file",
            "/data/{}".format(degraded_audio_file_path.name),
            "--reference_file",
            "/data/{}".format(reference_audio_file_path.name),
        ]
        visqol_output = subprocess.check_output(command_args, timeout=60.0).decode(
            "utf-8"
        )
    finally:
        # Always clean up, so a failing or timed-out docker run does not leave
        # WAV files behind. A file may be missing if writing it failed.
        for wav_path in (degraded_audio_file_path, reference_audio_file_path):
            try:
                os.remove(wav_path)
            except FileNotFoundError:
                pass

    match = re.search(r"MOS-LQO:\s*(?P<mos_lqo>[0-9.]+)", visqol_output)
    if match is None:
        raise Exception(
            "Failed to calculate VISQOL - response does not contain MOS-LQO."
            " Actual response: {}".format(visqol_output)
        )
    return float(match.group("mos_lqo"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment