Created
February 19, 2019 06:24
-
-
Save tracek/f2e378181c04653e30c3f8a59e511a62 to your computer and use it in GitHub Desktop.
Running librosa parallel for loops with multiprocessing and joblib
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The script illustrates a stunning difference on my machine between processing a signal with multiprocessing and with joblib | |
# The slowness of multiprocessing is likely caused by oversubscription | |
import time | |
import numpy as np | |
import librosa | |
from joblib import Parallel, delayed | |
from functools import partial | |
from multiprocessing import Pool | |
# --- experiment setup ---
n_proc = 4  # number of worker processes used in every parallel variant below
# Load a 60-second audio sample bundled with librosa.
y, sr = librosa.load(librosa.util.example_audio_file(), duration=60)
# Tile the signal 10x so each timed run lasts long enough for a reliable measurement.
y = np.repeat(y, 10)
# MFCCs are computed over short 0.2-second windows.
sample_len = int(0.2 * sr)
def get_mfcc_in_loop(audio, sr, sample_len):
    """Compute MFCCs over consecutive windows of *audio* sequentially.

    The signal is split into chunks of ``sample_len`` samples (the last
    chunk may be shorter) and ``librosa.feature.mfcc`` is run on each.

    Parameters
    ----------
    audio : np.ndarray
        1-D audio signal.
    sr : int
        Sampling rate, passed through to ``librosa.feature.mfcc``.
    sample_len : int
        Window length in samples.

    Returns
    -------
    list of np.ndarray
        One MFCC matrix per window.  The original version discarded each
        result; returning them makes the function useful on its own while
        staying backward-compatible (all existing callers ignore the
        return value).
    """
    # Split the long array into windows of length sample_len.
    y_windowed = np.array_split(audio, np.arange(sample_len, len(audio), sample_len))
    return [librosa.feature.mfcc(y=sample, sr=sr) for sample in y_windowed]
# Baseline: sequential MFCC extraction in a single process.
t0 = time.time()
get_mfcc_in_loop(y, sr, sample_len)
elapsed = time.time() - t0
print('Time single process:', elapsed)
# Feed the many short windows to a pool of n_proc workers.  Each MFCC task
# is tiny, so per-task dispatch/IPC overhead is expected to dominate and
# keep this from being very fast.
start = time.time()
window_bounds = np.arange(sample_len, len(y), sample_len)
windows = np.array_split(y, window_bounds)
# partial (not a lambda) so the callable stays picklable for the pool.
mfcc_at_sr = partial(librosa.feature.mfcc, sr=sr)
with Pool(n_proc) as pool:
    result = pool.map(mfcc_at_sr, windows)
print('Time multiprocessing (many small tasks):', time.time() - start)
# Split the audio into just n_proc large chunks, one per worker, so each
# process does a substantial amount of work.  One would expect this to be
# fast, yet it isn't — possibly oversubscription of CPU threads inside
# each worker.
start = time.time()
quarter_chunks = np.array_split(y, n_proc)
# partial (not a lambda) so the callable stays picklable for the pool.
loop_task = partial(get_mfcc_in_loop, sr=sr, sample_len=sample_len)
with Pool(n_proc) as pool:
    result = pool.map(loop_task, quarter_chunks)
print('Time multiprocessing (a few large tasks):', time.time() - start)
# Same many-small-tasks workload, dispatched via joblib instead of a raw Pool.
start = time.time()
boundaries = np.arange(sample_len, len(y), sample_len)
small_windows = np.array_split(y, boundaries)
runner = Parallel(n_jobs=n_proc, backend='multiprocessing')
runner(delayed(get_mfcc_in_loop)(audio=w, sr=sr, sample_len=sample_len)
       for w in small_windows)
print('Time multiprocessing with joblib (many small tasks):', time.time() - start)
# Few-large-tasks workload via joblib.  Note: the split is done before the
# clock starts here (unlike the Pool variant above), so only dispatch and
# compute are timed.
large_chunks = np.array_split(y, n_proc)
start = time.time()
Parallel(n_jobs=n_proc, backend='multiprocessing')(
    delayed(get_mfcc_in_loop)(audio=chunk, sr=sr, sample_len=sample_len)
    for chunk in large_chunks
)
print('Time multiprocessing with joblib (a few large tasks):', time.time() - start)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment