Last active
September 5, 2023 19:16
-
-
Save vadimkantorov/d4037a70e98aad55da4dc03f8867d966 to your computer and use it in GitHub Desktop.
Python script using webrtcvad for splitting an audio file into voice segments
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage: python3 split_audio_by_silence.py -i input_audio.m4a -o segments
# will save segments in mp3 format into the segments directory
# based on https://github.com/mozilla/DeepSpeech/tree/master/examples/vad_transcriber
# Dependencies: webrtcvad
import os | |
import argparse | |
import collections | |
import subprocess | |
import webrtcvad | |
def detect_voice_segments(vad, audio, sample_rate, frame_bytes, frame_duration_ms, triggered_sliding_window_threshold = 0.9):
    """Yield voiced segments of ``audio`` as joined byte strings.

    The audio is cut into consecutive frames of ``frame_bytes`` bytes; a
    trailing partial frame is ignored. Each frame is classified with
    ``vad.is_speech(frame, sample_rate)`` and the decisions are smoothed with
    a sliding window of the 10 most recent frames:

    * while idle, a segment starts once the share of voiced frames in the
      window exceeds ``triggered_sliding_window_threshold``; the frames
      already in the window become the start of the segment;
    * while inside a segment, every frame is collected, and the segment is
      emitted once the share of unvoiced frames in the window exceeds the
      same threshold.

    Args:
        vad: object exposing ``is_speech(frame, sample_rate)``.
        audio: sliceable bytes-like buffer of raw audio.
        sample_rate: forwarded unchanged to ``vad.is_speech``.
        frame_bytes: size of one frame in bytes.
        frame_duration_ms: frame length in milliseconds; only used to derive
            the window length (10 frames' worth of padding).
        triggered_sliding_window_threshold: fraction of the window that must
            agree before the state flips.

    Yields:
        One ``bytes`` object per detected segment. A segment that is still
        open when the audio ends is yielded as well.
    """
    padding_duration_ms = frame_duration_ms * 10
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    vote_threshold = triggered_sliding_window_threshold * ring_buffer.maxlen

    triggered = False
    voiced_frames = []

    # The "+ 1" keeps the last frame when len(audio) is an exact multiple of
    # frame_bytes; only a genuinely incomplete trailing frame is skipped.
    for offset in range(0, len(audio) - frame_bytes + 1, frame_bytes):
        frame = audio[offset:offset + frame_bytes]
        is_speech = vad.is_speech(frame, sample_rate)
        ring_buffer.append((frame, is_speech))

        if not triggered:
            num_voiced = sum(1 for _, speech in ring_buffer if speech)
            if num_voiced > vote_threshold:
                triggered = True
                # The window holds the onset of speech; keep all of it.
                voiced_frames.extend(buffered for buffered, _ in ring_buffer)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            num_unvoiced = sum(1 for _, speech in ring_buffer if not speech)
            if num_unvoiced > vote_threshold:
                triggered = False
                yield b''.join(voiced_frames)
                ring_buffer.clear()
                voiced_frames = []

    # Flush a segment that was still open when the audio ran out.
    if voiced_frames:
        yield b''.join(voiced_frames)
if __name__ == '__main__':
    # Command-line entry point: decode the input with ffmpeg to 16 kHz mono
    # signed 16-bit PCM, detect voiced segments, and encode each sufficiently
    # long segment to its own mp3 file in the output directory.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input-path', required = True)
    parser.add_argument('-o', '--output-dir', default = 'segments')
    parser.add_argument('--aggressive', default = 3, type = int, choices = [0, 1, 2, 3], required = False)
    # type=int is needed: without it a value given on the command line stays a
    # string and can never match the integer choices.
    parser.add_argument('--frame_duration_ms', default = 30, type = int, choices = [10, 20, 30])
    parser.add_argument('--min_segment_duration', type = int, default = 10)
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok = True)

    sample_rate = 16000
    audio = subprocess.check_output(['ffmpeg', '-loglevel', 'fatal', '-hide_banner', '-nostats', '-nostdin', '-i', args.input_path, '-ar', '16000', '-f', 's16le', '-acodec', 'pcm_s16le', '-ac', '1', '-vn', '-'], stderr = subprocess.DEVNULL)

    # 2 bytes per sample (s16le, mono).
    frame_bytes = int(2 * sample_rate * (args.frame_duration_ms / 1000.0))
    segments = detect_voice_segments(webrtcvad.Vad(args.aggressive), audio, sample_rate, frame_bytes, args.frame_duration_ms)

    # Use only the file name of the input so that outputs always land inside
    # output_dir, even when the input path has directories or is absolute.
    output_stem = os.path.basename(args.input_path)

    for i, segment in enumerate(segments):
        # Segment duration in seconds; shorter segments are skipped.
        if len(segment) / (2 * sample_rate) > args.min_segment_duration:
            output_path = os.path.join(args.output_dir, f'{output_stem}.{i:04d}.mp3')
            # Best effort, as before: the encoder's exit status is not checked.
            subprocess.run(['ffmpeg', '-loglevel', 'fatal', '-hide_banner', '-nostats', '-nostdin', '-y', '-f', 's16le', '-ar', '16000', '-ac', '1', '-i', '-', '-acodec', 'mp3', '-vn', '-ar', '16000', '-ac', '1', output_path], input = segment, stdout = subprocess.DEVNULL, stderr = subprocess.DEVNULL)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment