seungwonpark · November 29, 2021 15:33
diff --git a/split_csd.ipynb b/split_csd.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "strategic-marble",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import glob\n",
    "import tqdm\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import soundfile as sf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "announced-bubble",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset root used by every path in this notebook. Set the CSD_ROOT\n",
    "# environment variable to point at another copy; when it is unset the\n",
    "# original local path is used unchanged.\n",
    "ROOT_PATH = os.environ.get('CSD_ROOT', r'D:\\datasets\\CSD\\CSD\\english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "generic-workshop",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output folder for the per-line clips. exist_ok=True keeps this cell\n",
    "# re-runnable: without it a second run raises FileExistsError.\n",
    "os.makedirs(os.path.join(ROOT_PATH, 'split'), exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "running-hamilton",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "en003a\n"
     ]
    }
   ],
   "source": [
    "# Recording IDs en001a..en050a and en001b..en050b, in sorted order.\n",
    "names = sorted(\n",
    "    'en%03d%s' % (number, suffix)\n",
    "    for suffix in 'ab'\n",
    "    for number in range(1, 51)\n",
    ")\n",
    "print(names[4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "other-voltage",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_path(name, folder, extension):\n",
    "    \"\"\"Return ROOT_PATH/<folder>/<name>.<extension>.\"\"\"\n",
    "    filename = '.'.join((name, extension))\n",
    "    return os.path.join(ROOT_PATH, folder, filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "prospective-niger",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:11<00:00,  1.39it/s]\n"
     ]
    }
   ],
   "source": [
    "def read_nonempty_lines(path):\n",
    "    \"\"\"Return the cleaned, non-empty lines of a UTF-8 text file.\n",
    "\n",
    "    Each line is stripped, then zero-width spaces (U+200B) are removed;\n",
    "    lines that are empty after that are dropped.\n",
    "    \"\"\"\n",
    "    with open(path, 'r', encoding='utf-8') as f:\n",
    "        cleaned = [x.strip().replace('\\u200b', '') for x in f]\n",
    "    return [line for line in cleaned if line != '']\n",
    "\n",
    "\n",
    "meta = list()\n",
    "\n",
    "for name in tqdm.tqdm(names):\n",
    "    # 'txt' supplies the per-line token counts, 'lyric' the text stored with each clip.\n",
    "    lines = read_nonempty_lines(get_path(name, 'txt', 'txt'))\n",
    "    graphemes = read_nonempty_lines(get_path(name, 'lyric', 'txt'))\n",
    "    assert len(lines) == len(graphemes), name\n",
    "\n",
    "    df = pd.read_csv(get_path(name, 'csv', 'csv'))\n",
    "    wav, sr = sf.read(get_path(name, 'wav', 'wav'))\n",
    "    assert sr == 44100, name\n",
    "\n",
    "    cnt = 0  # index of the first CSV row belonging to the current line\n",
    "    for idx, (line, grap) in enumerate(zip(lines, graphemes)):\n",
    "        # One CSV row is consumed per whitespace-separated token of the line.\n",
    "        words = len(line.split())\n",
    "        df_line = df.iloc[cnt:cnt+words]\n",
    "        start = df_line.iloc[0]['start']\n",
    "        end = df_line.iloc[-1]['end']\n",
    "        # start/end are scaled by the sample rate to obtain sample indices.\n",
    "        first, last = int(start*sr), int(end*sr)\n",
    "        assert 0 <= first < last <= len(wav), name\n",
    "        split_wav = wav[first:last]\n",
    "\n",
    "        wavname = '%s_%02d.wav' % (name, idx)\n",
    "        txtname = '%s_%02d.txt' % (name, idx)\n",
    "        wavpath = os.path.join(ROOT_PATH, 'split', wavname)\n",
    "        txtpath = os.path.join(ROOT_PATH, 'split', txtname)\n",
    "        sf.write(wavpath, split_wav, sr)\n",
    "\n",
    "        with open(txtpath, 'w', encoding='utf-8') as f:\n",
    "            f.write('%s\\n' % grap)\n",
    "\n",
    "        # Metadata row: relative wav path | clip text | fixed CSD tag.\n",
    "        meta.append('%s|%s|%s' % (os.path.join('split', wavname), grap, 'CSD'))\n",
    "\n",
    "        cnt += words\n",
    "\n",
    "\n",
    "meta_path = os.path.join(ROOT_PATH, 'csd_english_meta.txt')\n",
    "with open(meta_path, 'w', encoding='utf-8') as f:\n",
    "    for line in meta:\n",
    "        f.write('%s\\n' % line)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
diff --git a/split_trainvaltest_22k.ipynb b/split_trainvaltest_22k.ipynb
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "strategic-marble",
	"metadata": {},
	"outputs": [],
	"source": [
	"import os\n",
	"import glob\n",
	"import tqdm\n",
	"import numpy as np\n",
	"import pandas as pd\n",
	"import soundfile as sf"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "announced-bubble",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Dataset root used by every path in this notebook. Set the CSD_ROOT\n",
	"# environment variable to point at another copy; when it is unset the\n",
	"# original local path is used unchanged.\n",
	"ROOT_PATH = os.environ.get('CSD_ROOT', r'D:\\datasets\\CSD\\CSD\\english')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "generic-workshop",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Output folder for the per-line clips. exist_ok=True keeps this cell\n",
	"# re-runnable: without it a second run raises FileExistsError.\n",
	"os.makedirs(os.path.join(ROOT_PATH, 'split'), exist_ok=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "running-hamilton",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"en003a\n"
	]
	}
	],
	"source": [
	"# Recording IDs en001a..en050a and en001b..en050b, in sorted order.\n",
	"names = sorted(\n",
	"    'en%03d%s' % (number, suffix)\n",
	"    for suffix in 'ab'\n",
	"    for number in range(1, 51)\n",
	")\n",
	"print(names[4])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "other-voltage",
	"metadata": {},
	"outputs": [],
	"source": [
	"def get_path(name, folder, extension):\n",
	"    \"\"\"Return ROOT_PATH/<folder>/<name>.<extension>.\"\"\"\n",
	"    filename = '.'.join((name, extension))\n",
	"    return os.path.join(ROOT_PATH, folder, filename)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "prospective-niger",
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:11<00:00,  1.39it/s]\n"
	]
	}
	],
	"source": [
	"meta = list()\n",
	"\n",
	"for name in tqdm.tqdm(names):\n",
	"    # Read both text files, dropping zero-width spaces (U+200B) and empty lines.\n",
	"    with open(get_path(name, 'txt', 'txt'), 'r', encoding='utf-8') as f:\n",
	"        lines = [x.strip().replace('\\u200b', '') for x in f.readlines()]\n",
	"        lines = [line for line in lines if line != '']\n",
	"\n",
	"    with open(get_path(name, 'lyric', 'txt'), 'r', encoding='utf-8') as f:\n",
	"        graphemes = [x.strip().replace('\\u200b', '') for x in f.readlines()]\n",
	"        graphemes = [line for line in graphemes if line != '']\n",
	"\n",
	"    assert len(lines) == len(graphemes), name\n",
	"\n",
	"    df = pd.read_csv(get_path(name, 'csv', 'csv'))\n",
	"    wav, sr = sf.read(get_path(name, 'wav', 'wav'))\n",
	"    assert sr == 44100, name\n",
	"\n",
	"    cnt = 0  # index of the first CSV row belonging to the current line\n",
	"    for idx, (line, grap) in enumerate(zip(lines, graphemes)):\n",
	"        # One CSV row is consumed per whitespace-separated token of the line.\n",
	"        words = len(line.split())\n",
	"        df_line = df.iloc[cnt:cnt+words]\n",
	"        start = df_line.iloc[0]['start']\n",
	"        end = df_line.iloc[-1]['end']\n",
	"        # start/end are scaled by the sample rate to obtain sample indices.\n",
	"        assert 0 <= int(start*sr) < int(end*sr) <= len(wav), name\n",
	"        split_wav = wav[int(start*sr):int(end*sr)]\n",
	"\n",
	"        wavname = '%s_%02d.wav' % (name, idx)\n",
	"        txtname = '%s_%02d.txt' % (name, idx)\n",
	"        wavpath = os.path.join(ROOT_PATH, 'split', wavname)\n",
	"        txtpath = os.path.join(ROOT_PATH, 'split', txtname)\n",
	"        sf.write(wavpath, split_wav, sr)\n",
	"\n",
	"        with open(txtpath, 'w', encoding='utf-8') as f:\n",
	"            f.write('%s\\n' % grap)\n",
	"\n",
	"        # Metadata row: relative wav path | clip text | fixed CSD tag.\n",
	"        meta.append('%s|%s|%s' % \\\n",
	"            (os.path.join('split', wavname), grap, \"CSD\"))\n",
	"\n",
	"        cnt += words\n",
	"\n",
	"\n",
	"meta_path = os.path.join(ROOT_PATH, 'csd_english_meta.txt')\n",
	"with open(meta_path, 'w', encoding='utf-8') as f:\n",
	"    for line in meta:\n",
	"        f.write('%s\\n' % line)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.7"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}
No results found