Skip to content

Instantly share code, notes, and snippets.

@sotelo
Created January 18, 2017 01:54
Show Gist options
  • Save sotelo/1f2d3fc1dca7326ec5680681eb539809 to your computer and use it in GitHub Desktop.
Save sotelo/1f2d3fc1dca7326ec5680681eb539809 to your computer and use it in GitHub Desktop.
LibriSpeech initial processing: collect the train-clean-100 transcripts into utts.data and convert the FLAC audio to WAV.
import os
import glob
from shutil import copyfile
from subprocess import Popen
# Root of the local LibriSpeech working tree; every other path is built on it.
base_dir = '/Tmp/sotelo/data/librispeech/'

# Input tree that is scanned for .flac recordings and .txt transcripts.
raw_dir = os.path.join(base_dir, 'downloads/LibriSpeech/train-clean-100')

# Output directory for the converted .wav files.
wav_dir = os.path.join(base_dir, 'wav')
# Collect every .flac recording and every .txt transcript file below raw_dir.
# A single os.walk traversal fills both lists, so the directory tree is only
# read from disk once.
audio_files = []
text_files = []
for root, dirs, files in os.walk(raw_dir):
    for name in files:
        full_path = os.path.join(root, name)
        if name.endswith(".flac"):
            audio_files.append(full_path)
        elif name.endswith(".txt"):
            text_files.append(full_path)

# Sort by full path so the order is deterministic; the later comparison of
# transcript IDs against audio file names relies on both lists being ordered.
audio_files.sort()
text_files.sort()
# Read all transcript files, in sorted order, into one flat list of lines.
# Lines keep their trailing newline here; they are stripped when parsed.
all_text = []
for text_file in text_files:
    with open(text_file, 'r') as f:
        all_text.extend(f.readlines())
# Every transcript line is treated as "<id> <text>": split once on the first
# space so the ID and the remaining text come from the same parse.
parsed_lines = [line.strip().partition(" ") for line in all_text]
files_codes = [parts[0] for parts in parsed_lines]
all_text = [parts[2] for parts in parsed_lines]

# The ID of a recording is its file name up to the first dot.
audio_codes = [
    os.path.basename(audio_path).split(".")[0] for audio_path in audio_files]

# Transcripts and recordings must line up one-to-one and in the same order,
# because later steps pair them positionally. This is an explicit check rather
# than an `assert` so that it still runs under `python -O`.
if files_codes != audio_codes:
    first_diff = next(
        ((t, a) for t, a in zip(files_codes, audio_codes) if t != a), None)
    raise ValueError(
        'transcript IDs do not match audio files: {} transcript lines vs {} '
        '.flac files; first differing (transcript, audio) pair: {!r}'.format(
            len(files_codes), len(audio_codes), first_diff))
# Build one line per utterance in the form `( <id> "<text>" )` and write them
# all to <base_dir>/utts.data.
# NOTE(review): this looks like the Festival/Festvox prompt-file layout --
# confirm against the consumer of utts.data.
proc_txt = [
    '( ' + file_name + ' "' + txt + '" )\n'
    for file_name, txt in zip(files_codes, all_text)]
with open(os.path.join(base_dir, 'utts.data'), 'w') as f:
    f.writelines(proc_txt)
# Convert every FLAC recording to <wav_dir>/<id>.wav by running ffmpeg once per
# file. Requires the `ffmpeg` executable to be on PATH.

# ffmpeg does not create missing output directories, so make sure it exists.
if not os.path.isdir(wav_dir):
    os.makedirs(wav_dir)

failed_conversions = []
for audio_code, audio_file in zip(audio_codes, audio_files):
    wav_file = os.path.join(wav_dir, audio_code + '.wav')
    # Pass an argument list (no shell) so paths containing spaces or shell
    # metacharacters are handed to ffmpeg unchanged.
    return_code = Popen(['ffmpeg', '-i', audio_file, wav_file]).wait()
    if return_code != 0:
        failed_conversions.append(audio_file)

# Report failures after the whole batch so one bad file does not prevent the
# remaining recordings from being converted.
if failed_conversions:
    raise RuntimeError(
        'ffmpeg failed for {} of {} files; first failure: {}'.format(
            len(failed_conversions), len(audio_files), failed_conversions[0]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment