import sys

import numpy
from pyAudioAnalysis import audioBasicIO as aIO
from pyAudioAnalysis import audioSegmentation as aS
from pydub import AudioSegment
from scipy.io import wavfile
from scipy.signal import fftconvolve
def usage():
    """Abort the program with the command-line usage message.

    Raises:
        SystemExit: always. The usage text is passed to ``sys.exit`` as the
            exit argument, so the interpreter prints it to stderr and exits
            with a failure status.
    """
    sys.exit("Usage: double_ender_sync master.wav sync.wav sync2.wav ...")
# Tunable limits, all in milliseconds (the unit used for every audio slice
# and length below).
MIN_OFFSET_MS = 5 * 60 * 1000       # minimum headroom when locating the needle
SAMPLE_LENGTH_MS = 10 * 60 * 1000   # length of the sync excerpt scanned for speech
MIN_NEEDLE_MS = 2 * 1000            # a needle must be longer than this


def pick_needle_segment(segments, min_length_ms=MIN_NEEDLE_MS):
    """Return the first segment that is long enough to serve as a needle.

    Args:
        segments: iterable of ``[start, end]`` pairs expressed in seconds.
        min_length_ms: a segment qualifies only when it is strictly longer
            than this many milliseconds.

    Returns:
        ``(start_ms, end_ms)`` of the first qualifying segment, or ``None``
        when no segment qualifies.
    """
    for segment in segments:
        start_ms = segment[0] * 1000
        end_ms = segment[1] * 1000
        if end_ms - start_ms > min_length_ms:
            return start_ms, end_ms
    return None


def find_needle_offset(haystack, needle):
    """Return the sample index in *haystack* where *needle* fits best.

    The best fit is the position that minimises the squared difference
    between the needle and the equally long haystack window. For every
    position at once this is evaluated as
    ``|window|^2 + |needle|^2 - 2 * <window, needle>``.

    Args:
        haystack: 1-D sequence of samples to search in.
        needle: 1-D sequence of samples to look for.

    Returns:
        The index (in samples) of the best matching window start.

    Raises:
        ValueError: if the needle is longer than the haystack, or if the
            needle carries almost no energy (it would match anywhere).
    """
    needle = numpy.array(needle, dtype=numpy.float64)
    haystack = numpy.array(haystack, dtype=numpy.float64)
    needle_len = len(needle)
    haystack_len = len(haystack)
    if needle_len > haystack_len:
        raise ValueError("The needle is longer than the search area")

    needle_norm = needle.dot(needle)
    if needle_norm < 1000.0:
        raise ValueError("The needle is almost silent")

    # Prefix sums of the squared samples give the energy of every
    # needle-sized window as a difference of two entries.
    haystack_squared = numpy.hstack(([0.0], haystack * haystack))
    haystack_cum_norm = numpy.cumsum(haystack_squared)
    haystack_norm_at = (
        haystack_cum_norm[needle_len:haystack_len + 1]
        - haystack_cum_norm[0:haystack_len + 1 - needle_len]
    )
    # Convolving with the reversed needle yields the cross-correlation of the
    # needle with every window.
    correlation_at = fftconvolve(haystack, needle[::-1], mode='valid')
    difference_norm_at = haystack_norm_at + needle_norm - 2 * correlation_at
    return int(numpy.argmin(difference_norm_at))


def sync_track(master, master_name, sync_filename, filenumber):
    """Align one recording to the master and write ``synced-track<N>.wav``.

    A stretch of speech (the "needle") is cut from the sync recording and
    located in the beginning of the master; the sync recording is then
    trimmed at its start by the distance between the two positions.

    Intermediate files ``search_area.wav``, ``sample_area.wav`` and
    ``needle.wav`` are written to the current directory.

    Args:
        master: mono master recording (an ``AudioSegment``).
        master_name: file name of the master, used for the progress message.
        sync_filename: path of the recording to align.
        filenumber: number used in the output file name.

    Raises:
        SystemExit: if no usable needle is found, the sample rates differ,
            or the needle cannot be matched.
    """
    print("Syncing %s to %s" % (sync_filename, master_name))
    sync = AudioSegment.from_file(sync_filename).set_channels(1)

    # First reduce file sizes by only looking at relevant areas.
    offset = abs(len(sync) - len(master)) * 1.05
    # If there's less than 5 minutes difference, give us a bit more headroom.
    if offset < MIN_OFFSET_MS:
        offset = MIN_OFFSET_MS
    search_area = master[:offset * 2]
    sample_area = sync[offset:SAMPLE_LENGTH_MS + offset]
    search_area.export("search_area.wav", format="wav")
    sample_area.export("sample_area.wav", format="wav")

    # Segment the sample area into speech bits and use the first sufficiently
    # long one as the needle to locate within the master.
    [Fs, x] = aIO.readAudioFile("sample_area.wav")
    segments = aS.silenceRemoval(x, Fs, 0.05, 0.05, 1.0, 0.8, False)
    picked = pick_needle_segment(segments)
    if picked is None:
        sys.exit("No speech segment longer than 2 seconds found in %s"
                 % sync_filename)
    start, end = picked
    # Segment times are relative to the sample area, which itself starts
    # `offset` ms into the sync recording.
    needle_abs_index = offset + start
    print("Found a needle")
    sample_area[start:end].export("needle.wav", format="wav")

    # Search code adapted from an external snippet (the source link is not
    # preserved in this file).
    needle_rate, needle = wavfile.read("needle.wav")
    haystack_rate, haystack = wavfile.read("search_area.wav")
    if needle_rate != haystack_rate:
        sys.exit("Sample rates are not the same")
    try:
        at = find_needle_offset(haystack, needle)
    except ValueError as err:
        sys.exit(str(err))

    # Calculate diffs and write synced file.
    # Explicit float division so the result does not depend on the
    # interpreter's integer-division rules.
    needle_pos_ms = at / float(haystack_rate) * 1000
    time_offset = abs(needle_abs_index - needle_pos_ms)
    print("Absolute needle pos: %d" % needle_abs_index)
    print("The needle starts at ms: %d" % round(needle_pos_ms))
    print("Time Offset: %d seconds" % round(time_offset / 1000))
    synced = sync[time_offset:]
    synced.export("synced-track%d.wav" % filenumber, format="wav")


def main(argv=None):
    """Sync every file named after the master on the command line.

    Args:
        argv: argument list in ``sys.argv`` layout (program name, master
            file, one or more files to sync). Defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        usage()
    master_name = argv[1]
    master = AudioSegment.from_wav(master_name).set_channels(1)
    for filenumber, sync_filename in enumerate(argv[2:], start=1):
        sync_track(master, master_name, sync_filename, filenumber)


if __name__ == "__main__":
    main()
