Skip to content

Instantly share code, notes, and snippets.

@seungwonpark
Last active November 29, 2021 15:33
Show Gist options
  • Select an option

  • Save seungwonpark/78dd69730ecee631e16018228c83af89 to your computer and use it in GitHub Desktop.

Select an option

Save seungwonpark/78dd69730ecee631e16018228c83af89 to your computer and use it in GitHub Desktop.
Split CSD (Children's Song Dataset)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "strategic-marble",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import glob\n",
"import tqdm\n",
"import numpy as np\n",
"import pandas as pd\n",
"import soundfile as sf"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "announced-bubble",
"metadata": {},
"outputs": [],
"source": [
"ROOT_PATH = r'D:\\datasets\\CSD\\CSD\\english'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "generic-workshop",
"metadata": {},
"outputs": [],
"source": [
"os.makedirs(os.path.join(ROOT_PATH, 'split'))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "running-hamilton",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"en003a\n"
]
}
],
"source": [
"names = ['en%03da' % i for i in range(1, 51)] + ['en%03db' % i for i in range(1, 51)]\n",
"names.sort()\n",
"print(names[4])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "other-voltage",
"metadata": {},
"outputs": [],
"source": [
"def get_path(name, folder, extension):\n",
" return os.path.join(ROOT_PATH, folder, name + '.' + extension)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "prospective-niger",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:11<00:00, 1.39it/s]\n"
]
}
],
"source": [
"meta = list()\n",
"\n",
"for name in tqdm.tqdm(names):\n",
" with open(get_path(name, 'txt', 'txt'), 'r', encoding='utf-8') as f:\n",
" lines = [x.strip().replace('\\u200b', '') for x in f.readlines()]\n",
" lines = [line for line in lines if line != '']\n",
" \n",
" with open(get_path(name, 'lyric', 'txt'), 'r', encoding='utf-8') as f:\n",
" graphemes = [x.strip().replace('\\u200b', '') for x in f.readlines()]\n",
" graphemes = [line for line in graphemes if line != '']\n",
" \n",
" assert len(lines) == len(graphemes), name\n",
" \n",
" df = pd.read_csv(get_path(name, 'csv', 'csv'))\n",
" wav, sr = sf.read(get_path(name, 'wav', 'wav'))\n",
" assert sr == 44100, name\n",
" \n",
" cnt = 0\n",
" for idx, (line, grap) in enumerate(zip(lines, graphemes)):\n",
" words = len(line.split())\n",
" df_line = df.iloc[cnt:cnt+words]\n",
" start = df_line.iloc[0]['start']\n",
" end = df_line.iloc[-1]['end']\n",
" assert 0 <= int(start*sr) < int(end*sr) <= len(wav), name\n",
" split_wav = wav[int(start*sr):int(end*sr)]\n",
"\n",
" wavname = '%s_%02d.wav' % (name, idx)\n",
" txtname = '%s_%02d.txt' % (name, idx)\n",
" wavpath = os.path.join(ROOT_PATH, 'split', wavname)\n",
" txtpath = os.path.join(ROOT_PATH, 'split', txtname)\n",
" sf.write(wavpath, split_wav, sr)\n",
" \n",
" with open(txtpath, 'w', encoding='utf-8') as f:\n",
" f.write('%s\\n' % grap)\n",
"\n",
" meta.append('%s|%s|%s' % \\\n",
" (os.path.join('split', wavname), grap, \"CSD\"))\n",
" \n",
" cnt += words\n",
" \n",
"\n",
"meta_path = os.path.join(ROOT_PATH, 'csd_english_meta.txt')\n",
"with open(meta_path, 'w', encoding='utf-8') as f:\n",
" for line in meta:\n",
" f.write('%s\\n' % line)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment