Last active
November 29, 2021 15:33
-
-
Save seungwonpark/78dd69730ecee631e16018228c83af89 to your computer and use it in GitHub Desktop.
Split CSD (Children's Song Dataset)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "strategic-marble", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import os\n", | |
| "import glob\n", | |
| "import tqdm\n", | |
| "import numpy as np\n", | |
| "import pandas as pd\n", | |
| "import soundfile as sf" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "announced-bubble", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Root folder that every path in this notebook is joined onto (machine-specific; edit before running).\n", | |
| "ROOT_PATH = r'D:\\datasets\\CSD\\CSD\\english'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "generic-workshop", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "os.makedirs(os.path.join(ROOT_PATH, 'split'))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "running-hamilton", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "en003a\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "names = ['en%03da' % i for i in range(1, 51)] + ['en%03db' % i for i in range(1, 51)]\n", | |
| "names.sort()\n", | |
| "print(names[4])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "other-voltage", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def get_path(name, folder, extension):\n", | |
| " return os.path.join(ROOT_PATH, folder, name + '.' + extension)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "prospective-niger", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:11<00:00, 1.39it/s]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Cut each recording into one clip per non-empty text line, write the clip and\n", | |
| "# its lyric text into ROOT_PATH/split, and collect one metadata entry per clip.\n", | |
| "# NOTE(review): indentation in this dump appears collapsed to a single space;\n", | |
| "# the loop nesting has to be restored before the cell can run.\n", | |
| "meta = list()\n", | |
| "\n", | |
| "for name in tqdm.tqdm(names):\n", | |
| " # Non-empty lines of the txt file, with zero-width spaces (U+200B) removed.\n", | |
| " with open(get_path(name, 'txt', 'txt'), 'r', encoding='utf-8') as f:\n", | |
| " lines = [x.strip().replace('\\u200b', '') for x in f.readlines()]\n", | |
| " lines = [line for line in lines if line != '']\n", | |
| " \n", | |
| " # Same cleanup for the lyric file; these lines are the text written out below.\n", | |
| " with open(get_path(name, 'lyric', 'txt'), 'r', encoding='utf-8') as f:\n", | |
| " graphemes = [x.strip().replace('\\u200b', '') for x in f.readlines()]\n", | |
| " graphemes = [line for line in graphemes if line != '']\n", | |
| " \n", | |
| " # Both files must pair up line for line; the failing id is the assert message.\n", | |
| " assert len(lines) == len(graphemes), name\n", | |
| " \n", | |
| " # The CSV supplies the 'start' and 'end' columns used to cut the audio.\n", | |
| " # Presumably one row per whitespace-separated token of the txt lines, with\n", | |
| " # times in seconds - TODO confirm against the dataset.\n", | |
| " df = pd.read_csv(get_path(name, 'csv', 'csv'))\n", | |
| " wav, sr = sf.read(get_path(name, 'wav', 'wav'))\n", | |
| " assert sr == 44100, name\n", | |
| " \n", | |
| " # cnt is the index of the first CSV row that belongs to the current line.\n", | |
| " cnt = 0\n", | |
| " for idx, (line, grap) in enumerate(zip(lines, graphemes)):\n", | |
| " words = len(line.split())\n", | |
| " df_line = df.iloc[cnt:cnt+words]\n", | |
| " # The clip runs from the first row's start to the last row's end.\n", | |
| " start = df_line.iloc[0]['start']\n", | |
| " end = df_line.iloc[-1]['end']\n", | |
| " # The sample range must be non-empty and lie inside the audio.\n", | |
| " assert 0 <= int(start*sr) < int(end*sr) <= len(wav), name\n", | |
| " split_wav = wav[int(start*sr):int(end*sr)]\n", | |
| "\n", | |
| " wavname = '%s_%02d.wav' % (name, idx)\n", | |
| " txtname = '%s_%02d.txt' % (name, idx)\n", | |
| " wavpath = os.path.join(ROOT_PATH, 'split', wavname)\n", | |
| " txtpath = os.path.join(ROOT_PATH, 'split', txtname)\n", | |
| " sf.write(wavpath, split_wav, sr)\n", | |
| " \n", | |
| " with open(txtpath, 'w', encoding='utf-8') as f:\n", | |
| " f.write('%s\\n' % grap)\n", | |
| "\n", | |
| " # Metadata entry: wav path relative to ROOT_PATH | lyric text | CSD\n", | |
| " meta.append('%s|%s|%s' % \\\n", | |
| " (os.path.join('split', wavname), grap, \"CSD\"))\n", | |
| " \n", | |
| " cnt += words\n", | |
| " \n", | |
| "\n", | |
| "# Write the collected metadata entries, one per line.\n", | |
| "meta_path = os.path.join(ROOT_PATH, 'csd_english_meta.txt')\n", | |
| "with open(meta_path, 'w', encoding='utf-8') as f:\n", | |
| " for line in meta:\n", | |
| " f.write('%s\\n' % line)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.8.7" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment