Created
December 15, 2015 05:50
-
-
Save kylemcdonald/c8e62ef8cb9515d64df4 to your computer and use it in GitHub Desktop.
Split an audio file into multiple files based on detected onsets from librosa.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import argparse | |
import matplotlib.pyplot as plt | |
import librosa | |
import numpy as np | |
import os | |
from progressbar import ProgressBar | |
# Command-line interface: input audio file, output folder, and the sample
# rate the audio is resampled to when it is loaded.
parser = argparse.ArgumentParser(
    description='Split audio into multiple files and save analysis.')
# Without an input path librosa.load() has nothing to open, so fail early
# with a usage message instead.
parser.add_argument('-i', '--input', type=str, required=True)
parser.add_argument('-o', '--output', type=str, default='transients')
parser.add_argument('-s', '--sr', type=int, default=44100)
args = parser.parse_args()

y, sr = librosa.load(args.input, sr=args.sr)

# Build the onset envelope from a constant-Q spectrogram. onset_strength()
# forwards STFT-style arguments such as n_fft to its `feature` callable, which
# librosa.cqt rejects with a TypeError, so the spectrogram is computed here
# and handed over through S instead.
C = np.abs(librosa.cqt(y=y, sr=sr))
o_env = librosa.onset.onset_strength(
    sr=sr, S=librosa.amplitude_to_db(C, ref=np.max))
onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)
def prepare(y, sr=22050):
    """Return *y* as a mono, normalized signal exactly ``sr`` samples long.

    Args:
        y: Audio samples, as accepted by ``librosa.to_mono``.
        sr: Target length in samples. When it equals the sample rate of *y*
            this corresponds to one second of audio.

    Returns:
        The fixed-length mono signal after ``librosa.util.normalize``.
    """
    y = librosa.to_mono(y)
    # `size` is passed by keyword: librosa >= 0.10 rejects it positionally,
    # while older releases accept the keyword as well.
    y = librosa.util.fix_length(y, size=sr)  # 1 second of audio
    y = librosa.util.normalize(y)
    return y
def get_fingerprint(y, sr=22050):
    """Return a flat constant-Q fingerprint of the audio clip *y*.

    The clip is first passed through ``prepare`` and then analysed with a
    constant-Q transform using a hop length of 2048 samples.

    Args:
        y: Audio samples of one segment.
        sr: Sample rate of *y*; also used as the fixed clip length.

    Returns:
        1-D array with the CQT magnitudes in column-major ('F') order.
    """
    y = prepare(y, sr)
    # Current librosa releases return complex CQT coefficients; keep only the
    # magnitude so the fingerprint is real-valued and can be written with a
    # float format such as '%.5f'. On real, non-negative input this is a no-op.
    cqt = np.abs(librosa.cqt(y, sr=sr, hop_length=2048))
    return cqt.flatten('F')
def normalize(x):
    """Rescale *x* in place to the range 0..1 along axis 0.

    The minimum along axis 0 is subtracted, then the result is divided by its
    maximum along axis 0. Positions whose values are all equal end up as 0
    instead of triggering a division by zero.

    Args:
        x: NumPy array; it is modified in place.

    Returns:
        The same array object, rescaled.
    """
    x -= x.min(axis=0)
    span = x.max(axis=0)
    # A constant column has a span of 0; divide it by 1 so it stays at 0
    # rather than turning into NaN.
    x /= np.where(span == 0, 1, span)
    return x
def basename(file):
    """Return the last path component of *file* without its final extension."""
    stem, _extension = os.path.splitext(os.path.basename(file))
    return stem
vectors = []
words = []
filenames = []

# Segment boundaries in samples: every detected onset plus the end of the
# signal, so the last segment runs from the final onset to the end of the
# audio. np.concatenate's second argument is an axis, not a value to append,
# so np.append is used to add the closing boundary.
onset_samples = np.append(librosa.frames_to_samples(onset_frames), len(y))
starts = onset_samples[0:-1]
stops = onset_samples[1:]

analysis_folder = args.output
samples_folder = os.path.join(args.output, 'samples')
try:
    os.makedirs(samples_folder)
except OSError:
    # An already existing folder is fine; anything else is a real error.
    if not os.path.isdir(samples_folder):
        raise

# On Python 3 zip() returns an iterator without a length, which leaves the
# progress bar without a total (its percentage becomes NaN), so it is given
# a list instead.
segments = list(zip(starts, stops))
pbar = ProgressBar()
for i, (start, stop) in enumerate(pbar(segments)):
    audio = y[start:stop]
    filename = os.path.join(samples_folder, str(i) + '.wav')
    # NOTE: librosa.output only exists in librosa < 0.8; newer releases need
    # a separate writer such as soundfile.write(filename, audio, sr).
    librosa.output.write_wav(filename, audio, sr)
    vector = get_fingerprint(audio, sr=sr)
    word = basename(filename)
    vectors.append(vector)
    words.append(word)
    filenames.append(filename)

# One row per segment: fingerprint vectors, segment names and sample paths.
np.savetxt(os.path.join(analysis_folder, 'vectors'), vectors, fmt='%.5f', delimiter='\t')
np.savetxt(os.path.join(analysis_folder, 'words'), words, fmt='%s')
np.savetxt(os.path.join(analysis_folder, 'filenames.txt'), filenames, fmt='%s')
It works now! thank you very much ^^
Which version of librosa are you using? I'm getting the following error:
librosa/onset.py", line 538, in onset_strength_multi
S = np.abs(feature(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, **kwargs))
TypeError: cqt() got an unexpected keyword argument 'n_fft'
(ppdataenv)
@cri5Castro, I am getting the same error. I'm guessing it is caused by changes in newer versions of librosa. Have you made any progress on this?
@cri5Castro - Update: I did some investigation, and the API has indeed changed a bit. If you want to use the Constant-Q spectrogram, just replace the line that assigns o_env with:
C = np.abs(librosa.cqt(y=y, sr=sr))
o_env = librosa.onset.onset_strength(sr=sr, S=librosa.amplitude_to_db(C, ref=np.max))
Reference to documentation: https://librosa.github.io/librosa/generated/librosa.onset.onset_strength.html
I ran this and got
Traceback (most recent call last):
File "/Users/sam/Documents/Dev/mp3-sound-detection/src/./split_transients.py", line 57, in <module>
for i, (start, stop) in enumerate(pbar(zip(starts, stops))):
File "/usr/local/lib/python3.9/site-packages/progressbar/progressbar.py", line 152, in __next__
self.start()
File "/usr/local/lib/python3.9/site-packages/progressbar/progressbar.py", line 291, in start
self.update(0)
File "/usr/local/lib/python3.9/site-packages/progressbar/progressbar.py", line 262, in update
self.fd.write(self._format_line() + '\r')
File "/usr/local/lib/python3.9/site-packages/progressbar/progressbar.py", line 222, in _format_line
widgets = ''.join(self._format_widgets())
File "/usr/local/lib/python3.9/site-packages/progressbar/progressbar.py", line 202, in _format_widgets
widget = widgets.format_updatable(widget, self)
File "/usr/local/lib/python3.9/site-packages/progressbar/widgets.py", line 39, in format_updatable
if hasattr(updatable, 'update'): return updatable.update(pbar)
File "/usr/local/lib/python3.9/site-packages/progressbar/widgets.py", line 230, in update
return '%3d%%' % pbar.percentage()
ValueError: cannot convert float NaN to integer
The following code works, although I had to ditch the progress bar:
#!/usr/bin/env python
import argparse
import matplotlib.pyplot as plt
import librosa
import numpy as np
import os
import soundfile as sf
# Command-line interface: input audio file, output folder, and the sample
# rate used when the audio is loaded.
parser = argparse.ArgumentParser(
    description='Split audio into multiple files and save analysis.')
# The input path is mandatory: without it args.input is None and loading
# the audio cannot succeed, so argparse reports the problem up front.
parser.add_argument('-i', '--input', type=str, required=True)
parser.add_argument('-o', '--output', type=str, default='transients')
parser.add_argument('-s', '--sr', type=int, default=44100)
args = parser.parse_args()
def prepare(y, sr=22050):
    """Convert *y* to mono, force it to ``sr`` samples and normalize it.

    Args:
        y: Audio samples, as accepted by ``librosa.to_mono``.
        sr: Number of samples to keep; one second of audio when it matches
            the sample rate of *y*.

    Returns:
        The fixed-length mono signal after ``librosa.util.normalize``.
    """
    y = librosa.to_mono(y)
    # Keyword form is required by librosa >= 0.10 and accepted by older
    # releases too.
    y = librosa.util.fix_length(y, size=sr)  # 1 second of audio
    y = librosa.util.normalize(y)
    return y
def get_fingerprint(y, sr=22050):
    """Return a flat constant-Q fingerprint of the audio clip *y*.

    The clip goes through ``prepare`` and is then analysed with a constant-Q
    transform using a hop length of 2048 samples.

    Args:
        y: Audio samples of one segment.
        sr: Sample rate of *y*; also used as the fixed clip length.

    Returns:
        1-D array with the CQT magnitudes in column-major ('F') order.
    """
    y = prepare(y, sr)
    # librosa.cqt yields complex coefficients in current releases; take the
    # magnitude so the fingerprint is real-valued and can be saved with a
    # float format such as '%.5f'.
    cqt = np.abs(librosa.cqt(y, sr=sr, hop_length=2048))
    return cqt.flatten('F')
def normalize(x):
    """Rescale *x* in place to the range 0..1 along axis 0.

    Subtracts the minimum along axis 0 and divides by the resulting maximum
    along axis 0. Where all values are equal the result is 0 rather than the
    outcome of a division by zero.

    Args:
        x: NumPy array; it is modified in place.

    Returns:
        The same array object, rescaled.
    """
    x -= x.min(axis=0)
    span = x.max(axis=0)
    # Replace a zero span by 1 so constant columns stay at 0 instead of NaN.
    x /= np.where(span == 0, 1, span)
    return x
def basename(file):
    """Strip the directory part and the last extension from the path *file*."""
    _directory, leaf = os.path.split(file)
    return os.path.splitext(leaf)[0]
vectors = []
words = []
filenames = []

y, sr = librosa.load(args.input, sr=args.sr)

# Onset envelope computed from a dB-scaled constant-Q spectrogram.
C = np.abs(librosa.cqt(y=y, sr=sr))
o_env = librosa.onset.onset_strength(sr=sr, S=librosa.amplitude_to_db(C, ref=np.max))
onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)

# Segment boundaries in samples: every detected onset plus the end of the
# signal, so the last segment runs from the final onset to the end of the
# audio. np.concatenate's second argument is an axis, not a value to append,
# so np.append is used to add the closing boundary.
onset_samples = np.append(librosa.frames_to_samples(onset_frames), len(y))
starts = onset_samples[0:-1]
stops = onset_samples[1:]

analysis_folder = args.output
samples_folder = os.path.join(args.output, 'samples')
# exist_ok tolerates a folder left over from an earlier run while still
# surfacing real failures such as missing permissions.
os.makedirs(samples_folder, exist_ok=True)

for i, (start, stop) in enumerate(zip(starts, stops)):
    audio = y[start:stop]
    filename = os.path.join(samples_folder, str(i) + '.wav')
    sf.write(filename, audio, sr)
    vector = get_fingerprint(audio, sr=sr)
    word = basename(filename)
    vectors.append(vector)
    words.append(word)
    filenames.append(filename)

# One row per segment: fingerprint vectors, segment names and sample paths.
np.savetxt(os.path.join(analysis_folder, 'vectors'), vectors, fmt='%.5f', delimiter='\t')
np.savetxt(os.path.join(analysis_folder, 'words'), words, fmt='%s')
np.savetxt(os.path.join(analysis_folder, 'filenames.txt'), filenames, fmt='%s')
Here is a working version with ProgressBar and librosa 0.8.1:
#!/usr/bin/env python
import argparse
import soundfile
import librosa
import numpy as np
import os
from progressbar import ProgressBar, Percentage, Bar
# Command-line interface: input audio file, output folder, and the sample
# rate used when the audio is loaded.
parser = argparse.ArgumentParser(
    description='Split audio into multiple files and save analysis.')
# The input path is mandatory: without it args.input is None and loading
# the audio cannot succeed, so argparse reports the problem up front.
parser.add_argument('-i', '--input', type=str, required=True)
parser.add_argument('-o', '--output', type=str, default='transients')
parser.add_argument('-s', '--sr', type=int, default=44100)
args = parser.parse_args()

print(f'Loading {args.input}')
y, sr = librosa.load(args.input, sr=args.sr)

print('Calculating CQT')
C = np.abs(librosa.cqt(y=y, sr=sr))

print('Extracting onsets')
# Only keyword arguments are used: librosa >= 0.10 no longer accepts the
# signal positionally, and with a precomputed spectrogram in S the raw
# signal is not needed here.
o_env = librosa.onset.onset_strength(
    sr=sr, S=librosa.amplitude_to_db(C, ref=np.max))
onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)
def prepare(y, sr=22050):
    """Return a mono, normalized version of *y* that is ``sr`` samples long.

    Args:
        y: Audio samples, as accepted by ``librosa.to_mono``.
        sr: Number of samples in the result; one second of audio when it
            matches the sample rate of *y*.

    Returns:
        The fixed-length mono signal after ``librosa.util.normalize``.
    """
    y = librosa.to_mono(y)
    # Passing `size` by keyword works on old and new librosa alike; the
    # positional form stops working with librosa 0.10.
    y = librosa.util.fix_length(y, size=sr)  # 1 second of audio
    y = librosa.util.normalize(y)
    return y
def get_fingerprint(y, sr=22050):
    """Return a flat constant-Q fingerprint of the audio clip *y*.

    The clip goes through ``prepare`` and is then analysed with a constant-Q
    transform using a hop length of 2048 samples.

    Args:
        y: Audio samples of one segment.
        sr: Sample rate of *y*; also used as the fixed clip length.

    Returns:
        1-D array with the CQT magnitudes in column-major ('F') order.
    """
    y = prepare(y, sr)
    # The CQT is complex-valued in current librosa releases; the magnitude
    # keeps the fingerprint real so it can be saved with a float format
    # such as '%.5f'.
    cqt = np.abs(librosa.cqt(y, sr=sr, hop_length=2048))
    return cqt.flatten('F')
def normalize(x):
    """Rescale *x* in place to the range 0..1 along axis 0.

    Subtracts the minimum along axis 0 and divides by the resulting maximum
    along axis 0. Where all values are equal the result is 0 rather than the
    outcome of a division by zero.

    Args:
        x: NumPy array; it is modified in place.

    Returns:
        The same array object, rescaled.
    """
    x -= x.min(axis=0)
    span = x.max(axis=0)
    # Replace a zero span by 1 so constant columns stay at 0 instead of NaN.
    x /= np.where(span == 0, 1, span)
    return x
def basename(file):
    """Return the file name in *file* with directory and last extension removed."""
    leaf = os.path.basename(file)
    root, _suffix = os.path.splitext(leaf)
    return root
vectors = []
words = []
filenames = []

# Segment boundaries in samples: every detected onset plus the end of the
# signal, so the last segment runs from the final onset to the end of the
# audio. np.concatenate's second argument is an axis, not a value to append,
# so np.append is used to add the closing boundary.
onset_samples = np.append(librosa.frames_to_samples(onset_frames), len(y))
starts = onset_samples[0:-1]
stops = onset_samples[1:]

analysis_folder = args.output
samples_folder = os.path.join(args.output, 'samples')
# There is one segment per (start, stop) pair, i.e. one fewer than the
# number of boundaries.
num_segments = len(starts)
print(f'Writing {num_segments} segments to {samples_folder}')
# exist_ok tolerates a folder left over from an earlier run while still
# surfacing real failures such as missing permissions.
os.makedirs(samples_folder, exist_ok=True)

pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=num_segments).start()
for i, (start, stop) in enumerate(zip(starts, stops)):
    audio = y[start:stop]
    filename = os.path.join(samples_folder, str(i) + '.wav')
    soundfile.write(filename, audio, sr)
    vector = get_fingerprint(audio, sr=sr)
    word = basename(filename)
    vectors.append(vector)
    words.append(word)
    filenames.append(filename)
    pbar.update(i + 1)
pbar.finish()

# One row per segment: fingerprint vectors, segment names and sample paths.
np.savetxt(os.path.join(analysis_folder, 'vectors'), vectors, fmt='%.5f', delimiter='\t')
np.savetxt(os.path.join(analysis_folder, 'words'), words, fmt='%s')
np.savetxt(os.path.join(analysis_folder, 'filenames.txt'), filenames, fmt='%s')
thank you @elgiano !
This works with librosa==0.10.1 and numpy==1.26.4:
#!/usr/bin/env python
import argparse
import soundfile
import librosa
import numpy as np
import os
from progressbar import ProgressBar, Percentage, Bar
# Command-line interface: input audio file, output folder, and the sample
# rate used when the audio is loaded.
parser = argparse.ArgumentParser(
    description='Split audio into multiple files and save analysis.')
# The input path is mandatory: without it args.input is None and loading
# the audio cannot succeed, so argparse reports the problem up front.
parser.add_argument('-i', '--input', type=str, required=True)
parser.add_argument('-o', '--output', type=str, default='transients')
parser.add_argument('-s', '--sr', type=int, default=44100)
args = parser.parse_args()

print(f'Loading {args.input}')
y, sr = librosa.load(args.input, sr=args.sr)

print('Calculating CQT')
C = np.abs(librosa.cqt(y=y, sr=sr))

print('Extracting onsets')
# The onset envelope is derived from the dB-scaled constant-Q spectrogram.
o_env = librosa.onset.onset_strength(
    y=y, sr=sr, S=librosa.amplitude_to_db(C, ref=np.max))
onset_frames = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)
def prepare(y, sr=22050):
    """Return a mono, normalized version of *y* that is ``sr`` samples long.

    Args:
        y: Audio samples, as accepted by ``librosa.to_mono``.
        sr: Number of samples in the result; one second of audio when it
            matches the sample rate of *y*.

    Returns:
        The fixed-length mono signal after ``librosa.util.normalize``.
    """
    mono = librosa.to_mono(y)
    # Force the clip to exactly `sr` samples (1 second of audio).
    one_second = librosa.util.fix_length(mono, size=sr)
    return librosa.util.normalize(one_second)
def get_fingerprint(y, sr=22050):
    """Return a flat constant-Q fingerprint of the audio clip *y*.

    The clip goes through ``prepare`` and is then analysed with a constant-Q
    transform using a hop length of 2048 samples.

    Args:
        y: Audio samples of one segment.
        sr: Sample rate of *y*; also used as the fixed clip length.

    Returns:
        1-D array with the CQT magnitudes in column-major ('F') order.
    """
    y = prepare(y, sr)
    # The CQT is complex-valued in current librosa releases; the magnitude
    # keeps the fingerprint real so it can be saved with a float format
    # such as '%.5f'.
    cqt = np.abs(librosa.cqt(y, sr=sr, hop_length=2048))
    return cqt.flatten('F')
def normalize(x):
    """Rescale *x* in place to the range 0..1 along axis 0.

    Subtracts the minimum along axis 0 and divides by the resulting maximum
    along axis 0. Where all values are equal the result is 0 rather than the
    outcome of a division by zero.

    Args:
        x: NumPy array; it is modified in place.

    Returns:
        The same array object, rescaled.
    """
    x -= x.min(axis=0)
    span = x.max(axis=0)
    # Replace a zero span by 1 so constant columns stay at 0 instead of NaN.
    x /= np.where(span == 0, 1, span)
    return x
def basename(file):
    """Return the bare name of *file*: no directory, no trailing extension."""
    return os.path.splitext(os.path.basename(file))[0]
vectors = []
words = []
filenames = []

# Segment boundaries in samples: every detected onset plus the end of the
# signal, so the last segment runs from the final onset to the end of the
# audio. np.concatenate's second argument is an axis, not a value to append,
# so np.append is used to add the closing boundary.
onset_samples = np.append(librosa.frames_to_samples(onset_frames), len(y))
starts = onset_samples[0:-1]
stops = onset_samples[1:]

analysis_folder = args.output
samples_folder = os.path.join(args.output, 'samples')
# There is one segment per (start, stop) pair, i.e. one fewer than the
# number of boundaries.
num_segments = len(starts)
print(f'Writing {num_segments} segments to {samples_folder}')
# exist_ok tolerates a folder left over from an earlier run while still
# surfacing real failures such as missing permissions.
os.makedirs(samples_folder, exist_ok=True)

pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=num_segments).start()
for i, (start, stop) in enumerate(zip(starts, stops)):
    audio = y[start:stop]
    filename = os.path.join(samples_folder, str(i) + '.wav')
    soundfile.write(filename, audio, sr)
    vector = get_fingerprint(audio, sr=sr)
    word = basename(filename)
    vectors.append(vector)
    words.append(word)
    filenames.append(filename)
    pbar.update(i + 1)
pbar.finish()

# One row per segment: fingerprint vectors, segment names and sample paths.
np.savetxt(os.path.join(analysis_folder, 'vectors'), vectors, fmt='%.5f', delimiter='\t')
np.savetxt(os.path.join(analysis_folder, 'words'), words, fmt='%s')
np.savetxt(os.path.join(analysis_folder, 'filenames.txt'), filenames, fmt='%s')
Usage:
python Scripts/split2.py -i "Voices/Alexine Dreams 24-02-17.mp3"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Get rid of the progress bar and the float issue will go away.
Here's how I wrote the bottom part: