Created
June 25, 2025 15:15
-
-
Save alessaba/872ca49779fd14d4b357c313400004bb to your computer and use it in GitHub Desktop.
Concatena tutti gli audio di un dataset video in un unico file, aggiungendo anche il nome del file nel file audio (usando il comando "say" di macOS)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os
import shutil
import subprocess
from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm
# === Config ===
VIDEO_EXT = ".mp4"  # extension of the input videos (matched case-insensitively)
TEMP_SPEECH = "temp_speech"  # spoken-filename clips
TEMP_AUDIO = "temp_audio"  # audio tracks extracted from the videos
TEMP_LABELED = "temp_labeled"  # spoken name + audio track, one clip per video
OUTPUT_FILE = "output_labeled.m4a"

# Scratch directories are created in the current working directory.
for _tmp_dir in (TEMP_SPEECH, TEMP_AUDIO, TEMP_LABELED):
    os.makedirs(_tmp_dir, exist_ok=True)

# Videos found in the current directory, sorted so that the order of the
# clips in the final file is deterministic. Directories that merely carry
# the extension are ignored, and ".MP4" is accepted as well as ".mp4".
videos = sorted(
    f for f in os.listdir('.')
    if f.lower().endswith(VIDEO_EXT.lower()) and os.path.isfile(f)
)
def process_video(f):
    """Create the spoken-name clip and the extracted audio clip for one video.

    Two files named after the video's base name are written:

    * ``TEMP_SPEECH/<base>.m4a``: the base name read aloud by the ``say``
      command (voice "Samantha"), converted from AIFF to AAC by ffmpeg;
    * ``TEMP_AUDIO/<base>.m4a``: the video's audio, re-encoded by ffmpeg
      with the video stream dropped (``-vn``).

    Both clips are requested as 44100 Hz, 2-channel, 128k AAC so that they
    share the same parameters.

    Args:
        f: Path of the video file to process.

    Returns:
        True if every external command exited with status 0, False
        otherwise. A failure of the speech step does not prevent the audio
        extraction from being attempted.

    Raises:
        OSError: If the ``say`` or ``ffmpeg`` executable cannot be started.
    """
    base = os.path.splitext(f)[0]
    speech_path = os.path.join(TEMP_SPEECH, f"{base}.m4a")
    audio_path = os.path.join(TEMP_AUDIO, f"{base}.m4a")
    quiet = {"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL}

    # Generate spoken filename
    aiff_tmp = os.path.join(TEMP_SPEECH, f"{base}.aiff")
    speech_ok = False
    try:
        spoken = subprocess.run(['say', '-v', 'Samantha', base, '-o', aiff_tmp], **quiet)
        if spoken.returncode == 0:
            converted = subprocess.run(
                ['ffmpeg', '-y', '-i', aiff_tmp, '-ar', '44100', '-c:a', 'aac', '-b:a', '128k', '-ac', '2', speech_path],
                **quiet,
            )
            speech_ok = converted.returncode == 0
    finally:
        # The AIFF is only an intermediate file. It does not exist when
        # `say` failed, and that must not abort the worker.
        try:
            os.remove(aiff_tmp)
        except FileNotFoundError:
            pass

    # Extract audio
    extracted = subprocess.run([
        'ffmpeg', '-y', '-i', f,
        '-vn', '-ac', '2', '-ar', '44100',
        '-c:a', 'aac', '-b:a', '128k',
        audio_path
    ], **quiet)

    return speech_ok and extracted.returncode == 0
# === Parallel Processing ===
# process_video only waits on external processes, so a thread pool is used
# to run several of them at once. list() drains the lazy map so that the
# progress bar advances and any exception raised in a worker surfaces here.
with ThreadPoolExecutor() as executor:
    results = list(tqdm(executor.map(process_video, videos), total=len(videos), desc="Processing videos"))

# Report videos whose processing explicitly signalled failure (a False
# result); any other result value is treated as "nothing to report".
for video, ok in zip(videos, results):
    if ok is False:
        print(f"⚠️  Could not fully process {video}")
# === Merge speech + audio ===
# For every video, join its spoken-name clip and its audio clip (in that
# order) into a single clip under TEMP_LABELED.
labeled_files = []
for f in tqdm(videos, desc="Merging labeled clips"):
    base = os.path.splitext(f)[0]
    speech = os.path.join(TEMP_SPEECH, f"{base}.m4a")
    audio = os.path.join(TEMP_AUDIO, f"{base}.m4a")
    labeled = os.path.join(TEMP_LABELED, f"{base}.m4a")
    merged = subprocess.run([
        'ffmpeg', '-y',
        '-i', speech,
        '-i', audio,
        '-filter_complex', '[0:a][1:a]concat=n=2:v=0:a=1[outa]',
        '-map', '[outa]',
        '-c:a', 'aac', '-b:a', '128k', '-ac', '2', '-ar', '44100',
        labeled
    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    if merged.returncode != 0:
        # ffmpeg failed, e.g. because one of the two input clips is missing.
        # Queuing the clip anyway would make the final concatenation fail
        # for every video, so skip it and say so.
        tqdm.write(f"⚠️  Skipping {f}: could not merge speech and audio")
        continue
    labeled_files.append(labeled)
# === Final Concatenation ===
success = False
if labeled_files:
    with open("master_list.txt", "w", encoding="utf-8") as master:
        for f in labeled_files:
            # Inside a single-quoted path of an ffmpeg concat list, a literal
            # single quote has to be written as '\'' (close, escape, reopen).
            quoted = os.path.abspath(f).replace("'", "'\\''")
            master.write(f"file '{quoted}'\n")

    concat_input = ['ffmpeg', '-y', '-f', 'concat', '-safe', '0', '-i', 'master_list.txt']

    # Try -c copy first, fallback to re-encoding if it fails
    result = subprocess.run(concat_input + ['-c', 'copy', OUTPUT_FILE],
                            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    if result.returncode != 0:
        print("🔁 Re-encoding output due to stream incompatibility...")
        result = subprocess.run(concat_input + ['-c:a', 'aac', '-b:a', '128k', '-ac', '2', OUTPUT_FILE],
                                stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    success = result.returncode == 0

    os.remove("master_list.txt")

# Cleanup: the scratch directories are removed whether or not the
# concatenation succeeded; missing directories are not an error.
for tmp_dir in (TEMP_AUDIO, TEMP_LABELED, TEMP_SPEECH):
    shutil.rmtree(tmp_dir, ignore_errors=True)

if not success:
    # Exit with a non-zero status instead of announcing a file that was
    # not produced.
    reason = "ffmpeg could not concatenate the clips" if labeled_files else "no labeled clips to concatenate"
    raise SystemExit(f"❌ Failed: {reason}")

print(f"✅ Done! Final file: {OUTPUT_FILE}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment