Skip to content

Instantly share code, notes, and snippets.

Created November 14, 2018 21:08
Show Gist options
  • Save danstowell/3470accc6b0775c4b826c858549a84bd to your computer and use it in GitHub Desktop.
Save danstowell/3470accc6b0775c4b826c858549a84bd to your computer and use it in GitHub Desktop.
# simple audio sync example by Dan Stowell Nov 2018
import librosa # lib... Rosa!
import os
import numpy as np
# Module-level settings read by the functions below.
maxlagsecs = 10 # the maximum offset between two audio files that will be considered (converted to frames in compare_two_audios)
# when true, ingest_a_wav_file prints the path and spectrogram shape of each file it loads
verbose = True
sr = 22050 # enforce a common sample rate: passed to librosa.load for every file
n_fft = 1024 # FFT size used for the spectrogram and for building the mel filterbank
hop_length = 512 # hop between spectrogram frames; also used for the frame <-> time conversions
def ingest_a_wav_file(wavpath):
    """Load a file and return its power-normalised mel spectrogram.

    The audio is loaded at the module-level sample rate ``sr``, converted to
    a magnitude spectrogram using ``n_fft`` and ``hop_length``, projected
    onto a mel filterbank limited to the main speech range (100 Hz - 5000 Hz)
    and finally divided by its total power.

    Args:
        wavpath: path of the audio file to load.

    Returns:
        The mel spectrogram, shape (n_mels, t).
    """
    # The module-level settings are only read here. The sample rate returned
    # by librosa.load is discarded instead of being written back over ``sr``.
    y, _ = librosa.load(wavpath, sr=sr)
    # Magnitude spectrogram via the public STFT API.
    spec = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
    # ``sr`` is passed by keyword: recent librosa releases reject it positionally.
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, fmin=100, fmax=5000)
    melspec = np.dot(mel_basis, spec)
    # Normalise power; skip the division for an all-zero (silent) spectrogram
    # so that it stays zero instead of turning into NaN.
    total_power = np.sum(melspec * melspec)
    if total_power > 0:
        melspec /= total_power
    if verbose:
        print("Loaded %s. Shape %s" % (wavpath, melspec.shape))
    return melspec  # shape (n_mels, t)
def compare_two_audios(spec1, spec2):
    """Cross-correlate two spectrograms over a bounded range of time lags.

    Simple spectrogram cross-correlation with a maximum relative lag (in BOTH
    directions). Each spectrogram is standardised once (mean removed, divided
    by its standard deviation), so every value is the mean product of the
    overlapping, standardised frames at that lag.

    A positive lag compares ``spec1[:, t + lag]`` with ``spec2[:, t]``; a
    negative lag shifts ``spec2`` instead.

    Args:
        spec1: 2-D array whose second axis is time (frames).
        spec2: 2-D array with the same first-axis size as ``spec1``.

    Returns:
        A pair ``(offsetsecs, result)``: the lags in seconds, and a list of
        the correlation value at each lag, in the same order.
    """
    shortestlen = min(spec1.shape[1], spec2.shape[1])

    # A lag is a duration, not a position, so n_fft is deliberately not passed
    # to either conversion: it would add a time offset that is irrelevant here.
    # Arguments go by keyword because recent librosa releases reject them
    # positionally.
    maxlagframes = int(librosa.time_to_frames(maxlagsecs, sr=sr, hop_length=hop_length))
    # Never shift by more than the shorter clip allows: a larger lag leaves an
    # empty overlap, whose mean is NaN and would hijack a later argmax.
    maxlagframes = max(0, min(maxlagframes, shortestlen - 1))
    offsetframes = range(-maxlagframes, maxlagframes + 1)

    # It is slightly more precise to do the mean-and-std normalisation within
    # the sub-spectrograms (on each loop iteration), but here it is done once.
    spec1 = (spec1 - np.mean(spec1)) / np.std(spec1)
    spec2 = (spec2 - np.mean(spec2)) / np.std(spec2)

    result = []
    for lag in offsetframes:
        subspec1 = spec1[:, max(0, lag):shortestlen - max(0, -lag)]
        subspec2 = spec2[:, max(0, -lag):shortestlen - max(0, lag)]
        result.append(np.mean(subspec1 * subspec2))

    offsetsecs = librosa.frames_to_time(np.asarray(offsetframes), sr=sr, hop_length=hop_length)
    return offsetsecs, result
def compare_multi_wav_files(filelist):
    """Load a list of audio files and do all pairwise comparisons.

    Args:
        filelist: paths of the audio files to compare.

    Returns:
        A square matrix (list of lists) in which entry ``[i][j]`` is the tuple
        ``(offset, strength)``: the time offset at which the cross-correlation
        of file ``i`` against file ``j`` peaks, and the correlation value at
        that peak. An empty ``filelist`` gives an empty matrix.
    """
    # NB if you have loooooads of files this will exhaust RAM; instead, you'd
    # only keep 2 in memory each time.
    specs = [ingest_a_wav_file(fpath) for fpath in filelist]
    ret = [[None] * len(specs) for _ in specs]
    for i, spec_i in enumerate(specs):
        for j, spec_j in enumerate(specs):
            offsets, corrs = compare_two_audios(spec_i, spec_j)
            # find the peak, and put it into the results matrix
            peakpos = np.argmax(corrs)
            ret[i][j] = (offsets[peakpos], corrs[peakpos])
    return ret
if __name__ == '__main__':
    import sys

    # Audio files to compare: taken from the command line when given,
    # otherwise the hard-coded example list below.
    # Two files which should match each other somehow, plus another that... shouldn't
    # NOTE(review): only one path is present in this copy of the example;
    # add the other files here (or pass them as arguments).
    filelist = sys.argv[1:] or [
        '/home/dan/birdsong/BL_SoundsOfGardenBirds/17 Feral Pigeon - Song.aiff',
    ]
    results = compare_multi_wav_files(filelist)
    # Report every ordered pair, including each file against itself.
    for i, path_i in enumerate(filelist):
        for j, path_j in enumerate(filelist):
            print("%s vs %s:\n match strength %.1f %%, time offset %f" % (
                os.path.basename(path_i), os.path.basename(path_j),
                results[i][j][1] * 100, results[i][j][0]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment