Skip to content

Instantly share code, notes, and snippets.

@iver56
Last active July 26, 2021 12:56
Show Gist options
  • Save iver56/02daea5f35db744d21bdad2d25c30e22 to your computer and use it in GitHub Desktop.
Save iver56/02daea5f35db744d21bdad2d25c30e22 to your computer and use it in GitHub Desktop.
import os
import re
import subprocess
import tempfile
import uuid
from pathlib import Path
import numpy as np
from scipy.io.wavfile import write
def convert_float_samples_to_int16(y, clamp_values=True, dither=True):
    """
    Convert floating-point audio samples to int16.

    Samples are scaled by the largest int16 value (32767), so 1.0 maps to 32767 and
    -1.0 maps to -32767. The scaled values are rounded to the nearest integer. The
    input is never modified.

    :param y: Floating-point samples as a numpy array (or array-like / numpy scalar).
    :param clamp_values: Clip the scaled values to the int16 range [-32768, 32767] before
        the integer conversion. This avoids integer overflow or underflow, which results
        in wrap distortion, which sounds worse than clipping distortion.
    :param dither: Whether or not to apply dithering. Dithering alleviates quantization
        noise. See https://www.youtube.com/watch?v=zWpWIQw7HWU for an explanation on
        dither. The dithering noise is triangular, spanning (-1, 1) in int16 units, and
        is drawn from numpy's global random state.
    :return: int16 samples with the same shape as the input.
    :raises ValueError: If the input dtype is not floating-point.
    """
    y = np.asarray(y)
    if not issubclass(y.dtype.type, np.floating):
        raise ValueError("input samples not floating-point")

    int16_info = np.iinfo(np.int16)

    # The multiplication allocates a new array, so the caller's data stays untouched.
    y_16 = y * int16_info.max

    if dither:
        y_16 = y_16 + np.random.triangular(-1, 0, 1, size=y_16.shape)

    if clamp_values:
        # np.clip (unlike boolean-mask assignment) also works for 0-d input.
        y_16 = np.clip(y_16, int16_info.min, int16_info.max)

    # Round to nearest before casting: a bare astype() truncates toward zero, which
    # makes the quantization bin around zero twice as wide as the others.
    return np.rint(y_16).astype(np.int16)
def calculate_visqol_in_audio_mode(
    degraded_audio: np.ndarray, reference_audio: np.ndarray, sample_rate: int
):
    """
    Given an audio pair (a degraded audio and a reference/target audio),
    return a MOS-LQO (Mean Opinion Score - Listening Quality Objective) score.
    MOS-LQO scores range from 1 (the worst) to 5 (the best).
    This uses VISQOL's "audio mode" (48 kHz), not "speech mode" (16 kHz).

    Both signals are written as 16-bit WAV files into the system temp directory, which
    is mounted into a "jonashaag/visqol:v3" docker container that computes the score.
    The temporary files are removed again, whether or not the computation succeeds.

    :param degraded_audio: Floating-point samples with shape (1, num_samples).
    :param reference_audio: Floating-point samples with shape (1, num_samples).
    :param sample_rate: Must be 48000.
    :return: The MOS-LQO score parsed from the VISQOL output, as a float.
    :raises AssertionError: If the sample rate or the array shapes are not as required.
    :raises subprocess.CalledProcessError: If the docker command exits with a non-zero
        status.
    :raises subprocess.TimeoutExpired: If the docker command takes longer than 60 seconds.
    :raises Exception: If the VISQOL output does not contain a MOS-LQO value.
    """
    assert sample_rate == 48000, "VISQOL audio mode requires a 48 kHz sample rate"
    assert degraded_audio.ndim == 2, "degraded_audio must be 2-dimensional"
    assert reference_audio.ndim == 2, "reference_audio must be 2-dimensional"
    assert degraded_audio.shape[0] == 1, "degraded_audio must have exactly one channel"
    assert reference_audio.shape[0] == 1, "reference_audio must have exactly one channel"

    tmp_dir = Path(tempfile.gettempdir())
    # Random names keep concurrent calls from clobbering each other's files.
    degraded_audio_file_path = tmp_dir / (str(uuid.uuid4()) + ".wav")
    reference_audio_file_path = tmp_dir / (str(uuid.uuid4()) + ".wav")

    try:
        # Transpose (1, num_samples) -> (num_samples, 1), the layout scipy expects.
        write(
            degraded_audio_file_path,
            sample_rate,
            convert_float_samples_to_int16(degraded_audio).T,
        )
        write(
            reference_audio_file_path,
            sample_rate,
            convert_float_samples_to_int16(reference_audio).T,
        )

        command_args = [
            "docker",
            "run",
            "--rm",
            "-t",
            "-v",
            "{}:/data".format(tmp_dir.as_posix()),
            "jonashaag/visqol:v3",
            "--degraded_file",
            "/data/{}".format(degraded_audio_file_path.name),
            "--reference_file",
            "/data/{}".format(reference_audio_file_path.name),
        ]
        visqol_output = subprocess.check_output(command_args, timeout=60.0).decode(
            "utf-8"
        )
    finally:
        # Clean up even when writing or the docker call failed, so that failures do
        # not leave WAV files behind in the temp directory.
        for audio_file_path in (degraded_audio_file_path, reference_audio_file_path):
            try:
                os.remove(audio_file_path)
            except FileNotFoundError:
                # The file was never created because an earlier step failed.
                pass

    regex = re.compile(r"MOS-LQO:\s*(?P<mos_lqo>[0-9.]+)")
    match = regex.search(visqol_output)
    if match:
        return float(match.group("mos_lqo"))

    raise Exception(
        "Failed to calculate VISQOL - response does not contain MOS-LQO."
        " Actual response: {}".format(visqol_output)
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment