CLI wrapper for https://github.com/haoheliu/voicefixer_main
""" | |
VoiceFixer | |
$ python voicefixer_cli.py -i ~/Downloads/download.mp3 | |
# for running on multiple files | |
$ find path/to/folder -name "*.wav" -not -name "*denoised*" |xargs -P 20 -I{} sh -c 'python voicefixer_cli.py -i {}' | |
""" | |
""" | |
# installation: | |
sudo apt-get install libsox-fmt-all libsox-dev sox libsndfile-dev | |
pip install torchaudio | |
pip install ffmpeg-python | |
pip install --upgrade ddsp | |
pip install voicefixer==0.0.9 | |
pip install git+https://github.com/facebookresearch/WavAugment.git | |
pip install note_seq | |
pip install tensorflow | |
""" | |
import argparse
from pathlib import Path

rate = 16000

parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-i', '--input', type=Path, required=True, help='input file path')
parser.add_argument('-o', '--out', default='{input}{mode}{noisified}_denoised.wav', help='denoised output file path template')
parser.add_argument('-n', '--noisify', action='store_true', help='add random distortion/noise before denoising')
parser.add_argument('-m', '--modes', nargs='+', type=int, choices=[0, 1, 2], default=[0, 1, 2], help='voicefixer restoration mode(s) to run')
args = parser.parse_args()
args.input = args.input.as_posix()
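# How the output template expands (illustrative example): with -i clip.wav, mode 0 and no --noisify,
# '{input}{mode}{noisified}_denoised.wav' is filled with input='clip.wav', mode='_mode0', noisified=''
# and becomes 'clip.wav_mode0_denoised.wav' (see the restore loop at the bottom of this script).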
# Heavy imports are deferred until after argument parsing, since they are slow to load.
print('importing')
# import IPython.display as ipd
import augment
import librosa
import librosa.display  # needed for librosa.display.specshow below
import matplotlib.pyplot as plt
import numpy as np
import scipy.io
import soundfile as sf
import torch
from matplotlib import cm
from scipy.io import wavfile
from voicefixer import VoiceFixer
# Helper functions
def uniform_sample(lower, upper):
    if abs(lower - upper) < 1e-5:
        return upper
    return float((upper - lower) * torch.rand(1) + lower)
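# e.g. uniform_sample(0.25, 1.0) draws uniformly from [0.25, 1.0);
# when the bounds (nearly) coincide it simply returns the upper bound.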
def show_spectrogram(file_path):
    samples, _ = librosa.load(file_path, sr=44100)
    plt.figure(figsize=(12, 4))
    plt.subplot(121)
    plt.title("Linear Spectrogram")
    librosa.display.specshow(
        np.log10(np.abs(librosa.stft(samples)) + 1e-8),
        sr=44100,
        x_axis='frames',
        y_axis='linear',
        cmap=cm.jet,
        vmax=2.8,
        vmin=-1.7)
    plt.subplot(122)
    plt.title("Mel Spectrogram")
    librosa.display.specshow(
        np.log10(np.abs(librosa.feature.melspectrogram(y=samples, sr=44100)) + 1e-8),
        sr=44100,
        x_axis='frames',
        y_axis='mel',
        cmap=cm.jet,
        vmax=2.5,
        vmin=-4)
    plt.show()
print('loading voice filter', end=' ... ')
vf = VoiceFixer()  # Initialize a voicefixer.
print('loaded voicefixer')

sr, a = scipy.io.wavfile.read(args.input)
if len(a.shape) == 2:
    a = a[:, 0]  # keep only the first channel
a = librosa.util.normalize(a.astype('float32'))
a = librosa.resample(a, orig_sr=sr, target_sr=rate)
print('sample range:', a.min(), a.max())
if args.noisify:
    # Generate random distortion parameters
    clipping_ratio = uniform_sample(lower=0.25, upper=1.0)
    print("clipping ratio:", clipping_ratio)  # lower>=0.1, upper<=1.0
    lowpass_frequency = uniform_sample(lower=4000, upper=8000)
    print("lowpass cutoff frequency:", lowpass_frequency)  # lower>=1000, no upper limit
    reverb_level = uniform_sample(lower=10, upper=80)  # lower>=0, upper<=100
    damping_factor = uniform_sample(lower=10, upper=50)  # lower>=0, upper<=100
    room_size = uniform_sample(lower=10, upper=50)
    print("reverb level, damping factor, room size =", reverb_level, damping_factor, room_size)  # lower>=0, upper<=100
    highpass = uniform_sample(lower=10, upper=1500)
    print("highpass cutoff frequency:", highpass)  # lower>=0, upper<=2000

    # Apply distortion effects
    effect_chain = augment.EffectChain().clip(clipping_ratio)  # clipping ratio
    effect_chain = effect_chain.lowpass(lowpass_frequency)  # remove high-frequency information
    effect_chain = effect_chain.reverb(reverb_level, damping_factor, room_size).channels(1)  # reverb level, damping factor, room size
    effect_chain = effect_chain.highpass(highpass)  # remove low-frequency information
    noise_generator = lambda: torch.zeros_like(torch.tensor(a)).uniform_()
    effect_chain = effect_chain.additive_noise(noise_generator, snr=15)
    # print(effect_chain)
    y = effect_chain.apply(torch.tensor(a), src_info={'rate': rate}, target_info={'rate': rate})

    out = args.out.format(input=args.input, noisified="_noisified", mode='_distorted')
    print("Distorted speech file name:", out)
    sf.write(out, y[0, ...].numpy(), rate)
    restore_input = out  # restore the distorted file rather than the clean original
    # show_spectrogram(out)
    # ipd.Audio(out, rate=44100)
else:
    restore_input = args.input

for mode in args.modes:
    out = args.out.format(input=args.input, noisified="_noisified" if args.noisify else "", mode=f'_mode{mode}')
    print("Restore:", out)
    vf.restore(input=restore_input, output=out, cuda=torch.cuda.is_available(), mode=mode)
    # show_spectrogram(out)
    # ipd.Audio(out, rate=44100)