danstowell · November 14, 2018 21:08
diff --git a/speechwavs_matchingup.py b/speechwavs_matchingup.py
 # simple audio sync example by Dan Stowell Nov 2018

 import librosa  # lib... Rosa!

 import os
 import numpy as np

 ###############################################

 maxlagsecs = 10   # the maximum offset (in seconds) between two audio files that will be considered; lags are searched in BOTH directions

 verbose = True  # when True, ingest_a_wav_file prints each loaded path and the shape of its mel spectrogram

 sr = 22050 # enforce a common sample rate: passed to librosa.load for every file
 n_fft = 1024  # FFT size handed to the spectrogram and to the mel filterbank
 hop_length = 512  # hop between spectrogram frames; also used to convert between frames and seconds

 ###############################################

 def ingest_a_wav_file(wavpath):
 	"""Load an audio file and return its power-normalised mel spectrogram.

 	The audio is loaded at the module-level sample rate ``sr``, turned into a
 	spectrogram using ``n_fft`` and ``hop_length``, and projected onto mel bands
 	limited to the main speech range (100 Hz - 5000 Hz).

 	Args:
 		wavpath: path of the audio file to load.

 	Returns:
 		The mel spectrogram, shape (n_mels, t), scaled so that its squared
 		values sum to 1 (left as all zeros if the input carries no energy).
 	"""
 	# Bind the extra return values to a throwaway local rather than rebinding the
 	# module settings: we asked load() for `sr` and gave _spectrogram() our `n_fft`.
 	y, _ = librosa.load(wavpath, sr=sr)
 	spec, _ = librosa.core.spectrum._spectrogram(y=y, n_fft=n_fft, hop_length=hop_length)
 	# sr is passed by keyword: librosa >= 0.10 no longer accepts it positionally
 	mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, fmin=100, fmax=5000)
 	melspec = np.dot(mel_basis, spec)
 	# Unit total power needs the square ROOT of the summed squares; dividing by the
 	# sum itself would leave a total power of 1/sum instead of 1.
 	norm = np.sqrt(np.sum(melspec * melspec))
 	if norm > 0:  # silent input: avoid 0/0 filling the spectrogram with NaN
 		melspec /= norm
 	if verbose:
 		print("Loaded %s. Shape %s" % (wavpath, melspec.shape))

 	return melspec  # shape (n_mels, t)

 def compare_two_audios(spec1, spec2):
 	"""Cross-correlate two spectrograms over a bounded range of time lags.

 	Both inputs are standardised once (mean removed, divided by the standard
 	deviation, each computed over the whole array). Then, for every lag up to
 	``maxlagsecs`` in BOTH directions, the mean elementwise product of the
 	overlapping frames is taken as the correlation at that lag.

 	Args:
 		spec1: 2-D array indexed (band, frame).
 		spec2: 2-D array indexed (band, frame); expected to have as many bands as spec1.

 	Returns:
 		(offsetsecs, result): the tested lags converted to seconds, and a list with
 		the correlation value at each of those lags (same length, same order).
 	"""
 	# keyword arguments: librosa >= 0.10 rejects sr/hop_length/n_fft given positionally
 	maxlagframes = int(librosa.core.time_to_frames(maxlagsecs, sr=sr, hop_length=hop_length, n_fft=n_fft))

 	shortestlen = min(spec1.shape[1], spec2.shape[1])

 	# A lag of shortestlen frames or more leaves no overlapping frames: the mean of
 	# that empty slice is NaN, which np.argmax would then report as the peak.
 	# The lower bound of 0 keeps at least the zero-lag comparison.
 	maxlagframes = max(0, min(maxlagframes, shortestlen - 1))

 	offsetframes = range(-maxlagframes, maxlagframes + 1)

 	# it's slightly more precise to be doing the mean-and-std normalisation within the subspecs (on each loop iteration) but here we just do it once
 	spec1 = (spec1 - np.mean(spec1)) / np.std(spec1)
 	spec2 = (spec2 - np.mean(spec2)) / np.std(spec2)

 	result = []
 	for lag in offsetframes:
 		# positive lag trims the start of spec1 and the end of spec2; negative lag the reverse
 		subspec1 = spec1[:, max(0,  lag):shortestlen - max(0, -lag)]
 		subspec2 = spec2[:, max(0, -lag):shortestlen - max(0,  lag)]
 		result.append(np.mean(subspec1 * subspec2))

 	# do not pass n_fft to librosa here, else it adds a time-offset that's irrelevant here
 	offsetsecs = librosa.core.frames_to_time(offsetframes, sr=sr, hop_length=hop_length)

 	return offsetsecs, result

 def compare_multi_wav_files(filelist):
 	"""Load every file in filelist and compare all ordered pairs of them.

 	Returns a square list-of-lists in which entry [i][j] is a tuple
 	(offset at the correlation peak, peak correlation strength) for
 	file i against file j.
 	"""
 	nfiles = len(filelist)
 	ret = [[None] * nfiles for _ in range(nfiles)]

 	# NB every spectrogram is held in memory at once; with a very large number of
 	# files you would instead keep only the two being compared.
 	specs = [ingest_a_wav_file(path) for path in filelist]

 	for row, spec_a in enumerate(specs):
 		for col, spec_b in enumerate(specs):
 			lags, strengths = compare_two_audios(spec_a, spec_b)
 			# record the strongest correlation and the lag at which it occurs
 			best = np.argmax(strengths)
 			ret[row][col] = (lags[best], strengths[best])

 	return ret

 ####################################################
 if __name__=='__main__':

 	# The first two recordings are expected to match each other somehow; the third should not
 	filelist = [
 		'/home/dan/audio_misc/sample-audio-files-from-midsummer-nights-dream/original-full-files/d5F2.ogg.wav',
 		'/home/dan/audio_misc/sample-audio-files-from-midsummer-nights-dream/original-full-files/Titania-dTHT.ogg.wav',
 		'/home/dan/birdsong/BL_SoundsOfGardenBirds/17 Feral Pigeon - Song.aiff',
 	]
 	results = compare_multi_wav_files(filelist)

 	# one report line per ordered pair, in the same row-major order as the results matrix
 	for path_a, row in zip(filelist, results):
 		for path_b, (offset, strength) in zip(filelist, row):
 			names = (os.path.basename(path_a), os.path.basename(path_b))
 			print("%s vs %s:\n  match strength %.1f %%, time offset %f" % (names[0], names[1],
 						strength * 100, offset))
	# simple audio sync example by Dan Stowell Nov 2018

	import librosa # lib... Rosa!

	import os
	import numpy as np

	###############################################

	maxlagsecs = 10 # the maximum offset (in seconds) between two audio files that will be considered; lags are searched in BOTH directions

	verbose = True  # when True, ingest_a_wav_file prints each loaded path and the shape of its mel spectrogram

	sr = 22050 # enforce a common sample rate: passed to librosa.load for every file
	n_fft = 1024  # FFT size handed to the spectrogram and to the mel filterbank
	hop_length = 512  # hop between spectrogram frames; also used to convert between frames and seconds

	###############################################

	def ingest_a_wav_file(wavpath):
		"""Load an audio file and return its power-normalised mel spectrogram.

		The audio is loaded at the module-level sample rate ``sr``, turned into a
		spectrogram using ``n_fft`` and ``hop_length``, and projected onto mel bands
		limited to the main speech range (100 Hz - 5000 Hz).

		Args:
			wavpath: path of the audio file to load.

		Returns:
			The mel spectrogram, shape (n_mels, t), scaled so that its squared
			values sum to 1 (left as all zeros if the input carries no energy).
		"""
		# Bind the extra return values to a throwaway local rather than rebinding the
		# module settings: we asked load() for `sr` and gave _spectrogram() our `n_fft`.
		y, _ = librosa.load(wavpath, sr=sr)
		spec, _ = librosa.core.spectrum._spectrogram(y=y, n_fft=n_fft, hop_length=hop_length)
		# sr is passed by keyword: librosa >= 0.10 no longer accepts it positionally
		mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, fmin=100, fmax=5000)
		melspec = np.dot(mel_basis, spec)
		# Unit total power needs the square ROOT of the summed squares; dividing by the
		# sum itself would leave a total power of 1/sum instead of 1.
		norm = np.sqrt(np.sum(melspec * melspec))
		if norm > 0:  # silent input: avoid 0/0 filling the spectrogram with NaN
			melspec /= norm
		if verbose:
			print("Loaded %s. Shape %s" % (wavpath, melspec.shape))

		return melspec # shape (n_mels, t)

	def compare_two_audios(spec1, spec2):
		"""Cross-correlate two spectrograms over a bounded range of time lags.

		Both inputs are standardised once (mean removed, divided by the standard
		deviation, each computed over the whole array). Then, for every lag up to
		``maxlagsecs`` in BOTH directions, the mean elementwise product of the
		overlapping frames is taken as the correlation at that lag.

		Args:
			spec1: 2-D array indexed (band, frame).
			spec2: 2-D array indexed (band, frame); expected to have as many bands as spec1.

		Returns:
			(offsetsecs, result): the tested lags converted to seconds, and a list with
			the correlation value at each of those lags (same length, same order).
		"""
		# keyword arguments: librosa >= 0.10 rejects sr/hop_length/n_fft given positionally
		maxlagframes = int(librosa.core.time_to_frames(maxlagsecs, sr=sr, hop_length=hop_length, n_fft=n_fft))

		shortestlen = min(spec1.shape[1], spec2.shape[1])

		# A lag of shortestlen frames or more leaves no overlapping frames: the mean of
		# that empty slice is NaN, which np.argmax would then report as the peak.
		# The lower bound of 0 keeps at least the zero-lag comparison.
		maxlagframes = max(0, min(maxlagframes, shortestlen - 1))

		offsetframes = range(-maxlagframes, maxlagframes + 1)

		# it's slightly more precise to be doing the mean-and-std normalisation within the subspecs (on each loop iteration) but here we just do it once
		spec1 = (spec1 - np.mean(spec1)) / np.std(spec1)
		spec2 = (spec2 - np.mean(spec2)) / np.std(spec2)

		result = []
		for lag in offsetframes:
			# positive lag trims the start of spec1 and the end of spec2; negative lag the reverse
			subspec1 = spec1[:, max(0, lag):shortestlen - max(0, -lag)]
			subspec2 = spec2[:, max(0, -lag):shortestlen - max(0, lag)]
			result.append(np.mean(subspec1 * subspec2))

		# do not pass n_fft to librosa here, else it adds a time-offset that's irrelevant here
		offsetsecs = librosa.core.frames_to_time(offsetframes, sr=sr, hop_length=hop_length)

		return offsetsecs, result

	def compare_multi_wav_files(filelist):
		"""Load every file in filelist and compare all ordered pairs of them.

		Returns a square list-of-lists in which entry [i][j] is a tuple
		(offset at the correlation peak, peak correlation strength) for
		file i against file j.
		"""
		nfiles = len(filelist)
		ret = [[None] * nfiles for _ in range(nfiles)]

		# NB every spectrogram is held in memory at once; with a very large number of
		# files you would instead keep only the two being compared.
		specs = [ingest_a_wav_file(path) for path in filelist]

		for row, spec_a in enumerate(specs):
			for col, spec_b in enumerate(specs):
				lags, strengths = compare_two_audios(spec_a, spec_b)
				# record the strongest correlation and the lag at which it occurs
				best = np.argmax(strengths)
				ret[row][col] = (lags[best], strengths[best])

		return ret

	####################################################
	if __name__=='__main__':

		# The first two recordings are expected to match each other somehow; the third should not
		filelist = [
			'/home/dan/audio_misc/sample-audio-files-from-midsummer-nights-dream/original-full-files/d5F2.ogg.wav',
			'/home/dan/audio_misc/sample-audio-files-from-midsummer-nights-dream/original-full-files/Titania-dTHT.ogg.wav',
			'/home/dan/birdsong/BL_SoundsOfGardenBirds/17 Feral Pigeon - Song.aiff',
		]
		results = compare_multi_wav_files(filelist)

		# one report line per ordered pair, in the same row-major order as the results matrix
		for i in range(len(filelist)):
			for j in range(0, len(filelist)):
				print("%s vs %s:\n match strength %.1f %%, time offset %f" % (os.path.basename(filelist[i]), os.path.basename(filelist[j]),
					results[i][j][1] * 100, results[i][j][0]))