-
-
Save gevangelopoulos/e4e18ffbb468072f43f9 to your computer and use it in GitHub Desktop.
This notebook preprocesses the TIMIT dataset using MFCCs in the same way that the paper "LSTM: A Search Space Odyssey" used it.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Prepare the TIMIT dataset\n", | |
"This notebook preprocesses the TIMIT dataset using MFCCs. It also provides the reduced version of TIMIT with only a core test set and the well known train/validation split from [Halberstadt1998]. \n", | |
"\n", | |
"### Dependencies\n", | |
"* numpy\n", | |
"* h5py\n", | |
"* scikits.audiolab (works only on python2) \n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/greff/venv/py2/local/lib/python2.7/site-packages/scikits/audiolab/soundio/play.py:48: UserWarning: Could not import alsa backend; most probably, you did not have alsa headers when building audiolab\n", | |
" warnings.warn(\"Could not import alsa backend; most probably, \"\n" | |
] | |
} | |
], | |
"source": [ | |
"from __future__ import division, absolute_import, print_function, unicode_literals\n", | |
"from random import shuffle\n", | |
"import os\n", | |
"\n", | |
"import h5py\n", | |
"import numpy as np\n", | |
"import scikits.audiolab as al" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## MFCC Extraction" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"###############################################################################\n", | |
"# MFCC extraction\n", | |
"# By Maigo Yun Wang, 02/08/2012 adapted by Klaus Greff 2015\n", | |
"###############################################################################\n", | |
"\n", | |
"def melfb(p, n, fs):\n", | |
"    \"\"\"\n", | |
"    Build a Mel filterbank and return it with the filter centre frequencies.\n", | |
"    Inputs:\n", | |
"        p:  number of filters in the filterbank\n", | |
"        n:  length of fft\n", | |
"        fs: sample rate in Hz\n", | |
"    Returns:\n", | |
"        (M, CF): M has shape (p, 1 + floor(n/2)), CF has shape (p,)\n", | |
"    Ref. www.ifp.illinois.edu/~minhdo/teaching/speaker_recognition/code/melfb.m\n", | |
"    \"\"\"\n", | |
"    f0 = 700.0 / fs\n", | |
"    half = int(np.floor(n/2))\n", | |
"    # Spacing of consecutive filters on the logarithmic frequency axis.\n", | |
"    lr = np.log(1 + 0.5/f0) / (p+1)\n", | |
"    centres = fs * f0 * (np.exp(np.arange(1, p+1) * lr) - 1)\n", | |
"\n", | |
"    # FFT-bin positions belonging to filter indices 0, 1, p and p+1.\n", | |
"    edges = n * f0 * (np.exp(np.array([0, 1, p, p+1]) * lr) - 1)\n", | |
"    bin_first = int(np.floor(edges[0])) + 1\n", | |
"    bin_lo = int(np.ceil(edges[1]))\n", | |
"    bin_hi = int(np.floor(edges[2]))\n", | |
"    bin_last = min(half, int(np.ceil(edges[3]))) - 1\n", | |
"\n", | |
"    # Fractional filter index of every bin in [bin_first, bin_last].\n", | |
"    position = np.log(1 + np.arange(bin_first, bin_last+1) / f0 / n) / lr\n", | |
"    whole = np.floor(position)\n", | |
"    frac = position - whole\n", | |
"\n", | |
"    weights = np.zeros((p, 1+half))\n", | |
"    # Each bin contributes (1 - frac) to the filter below its position ...\n", | |
"    for k in range(bin_lo-1, bin_last):\n", | |
"        weights[int(whole[k] - 1), k+1] += 2 * (1 - frac[k])\n", | |
"    # ... and frac to the filter at its position.\n", | |
"    for k in range(bin_hi):\n", | |
"        weights[int(whole[k]), k+1] += 2 * frac[k]\n", | |
"    return weights, centres\n", | |
"\n", | |
"def dctmtx(n):\n", | |
"    \"\"\"\n", | |
"    Return the DCT-II matrix of order n as a numpy array (one basis vector per row).\n", | |
"    \"\"\"\n", | |
"    index = np.arange(n)\n", | |
"    row = index.reshape(-1, 1)   # basis (frequency) index\n", | |
"    col = index.reshape(1, -1)   # sample index\n", | |
"    scale = np.sqrt(2.0/n)\n", | |
"    D = scale * np.cos(np.pi * (2*col+1) * row / (2*n))\n", | |
"    # The constant first row gets an extra 1/sqrt(2) factor.\n", | |
"    D[0] /= np.sqrt(2)\n", | |
"    return D\n", | |
"\n", | |
"def extract(x):\n", | |
"    \"\"\"\n", | |
"    Extract MFCC coefficients of the sound x in numpy array format.\n", | |
"\n", | |
"    x is treated as audio sampled at 16 kHz. A 2-d input is averaged over\n", | |
"    axis 1 first. Returns an array of shape (frames, 13); a signal shorter\n", | |
"    than one 25 ms frame yields an empty (0, 13) array.\n", | |
"    \"\"\"\n", | |
"    FS = 16000                      # Sampling rate\n", | |
"    FRAME_LEN = int(0.025 * FS)     # Frame length\n", | |
"    FRAME_SHIFT = int(0.01 * FS)    # Frame shift\n", | |
"    FFT_SIZE = 2048                 # How many points for FFT\n", | |
"    WINDOW = np.hamming(FRAME_LEN)  # Window function\n", | |
"    PRE_EMPH = 0.97                 # Pre-emphasis factor\n", | |
"\n", | |
"    BANDS = 40                      # Number of Mel filters\n", | |
"    COEFS = 13                      # Number of Mel cepstra coefficients to keep\n", | |
"    POWER_SPECTRUM_FLOOR = 1e-100   # Flooring for the power to avoid log(0)\n", | |
"    # Only the filterbank matrix is needed; the centre frequencies are discarded.\n", | |
"    M, _ = melfb(BANDS, FFT_SIZE, FS)\n", | |
"    # Rows 0..COEFS-1 of the DCT matrix, i.e. the 0-th coefficient is kept.\n", | |
"    # Use [1:COEFS+1] instead to drop it.\n", | |
"    D = dctmtx(BANDS)[0:COEFS]\n", | |
"\n", | |
"    if x.ndim > 1:\n", | |
"        print(\"INFO: Input signal has more than 1 channel; the channels will be averaged.\")\n", | |
"        x = np.mean(x, axis=1)\n", | |
"\n", | |
"    frames = (len(x) - FRAME_LEN) // FRAME_SHIFT + 1\n", | |
"    if frames <= 0:\n", | |
"        return np.zeros((0, COEFS))\n", | |
"\n", | |
"    # Integer bin count: a float here is rejected as a slice index by newer numpy.\n", | |
"    n_bins = FFT_SIZE // 2 + 1\n", | |
"    feature = []\n", | |
"    for f in range(frames):\n", | |
"        begin = f * FRAME_SHIFT\n", | |
"        # Windowing\n", | |
"        frame = x[begin:begin + FRAME_LEN] * WINDOW\n", | |
"        # Pre-emphasis (applied to the windowed frame, as in the original code)\n", | |
"        frame[1:] -= frame[:-1] * PRE_EMPH\n", | |
"        # Power spectrum\n", | |
"        X = np.abs(np.fft.fft(frame, FFT_SIZE)[:n_bins]) ** 2\n", | |
"        X[X < POWER_SPECTRUM_FLOOR] = POWER_SPECTRUM_FLOOR  # Avoid zero\n", | |
"        # Mel filtering, logarithm, DCT\n", | |
"        feature.append(np.dot(D, np.log(np.dot(M, X))))\n", | |
"    return np.vstack(feature)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Configuration" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Root directory of the TIMIT corpus; files are looked up as\n", | |
"# TIMIT_DIR/<usage>/<dialect>/<speaker>/<sentence>.<ext>\n", | |
"TIMIT_DIR = '../timit'\n", | |
"#filename = 'timit.h5'\n", | |
"# Alternative configuration (disabled): no transformation, raw samples as features\n", | |
"#DTYPE = np.float32\n", | |
"#extractor = lambda x: x.astype(DTYPE).reshape(-1, 1)\n", | |
"#frame_size=1\n", | |
"#frame_shift=1\n", | |
"#derivatives=0\n", | |
"#preprocessing_description = \"Only minimal preprocessing (normalizing to zero mean and unit standard deviation).\"\n", | |
"\n", | |
"# Active configuration: mfcc + 1st and 2nd deriv\n", | |
"filename = 'timit_mfcc.h5'\n", | |
"extractor = extract\n", | |
"# Label window length and step in samples. extract() hard-codes its own\n", | |
"# framing (0.025 * 16000 = 400 and 0.01 * 16000 = 160), so keep these in sync.\n", | |
"frame_size=400\n", | |
"frame_shift=160\n", | |
"# Number of successive time-gradients appended to the extracted features\n", | |
"derivatives=2\n", | |
"# dtype used for the zero padding of features, targets and masks\n", | |
"DTYPE = np.float64\n", | |
"# Free-text description stored in the HDF5 group attributes\n", | |
"preprocessing_description = \"\"\"Extracted 12 MFCCs coefficients + energy with window size of 25ms and 10ms step. \n", | |
"Used hamming window and a pre-emphasis coefficient of 0.97.\n", | |
"Also included 1st and 2nd time-derivative of the signal for a total of 39 feature dimensions.\"\"\"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Phones\n", | |
"TIMIT uses 61 phones (and erroneously calls them phonemes), but some tasks work with a reduced set of only 39 phones." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# The 61 TIMIT phone labels; a label's position in this list is its integer id.\n", | |
"phones = ['aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h', 'axr', 'ay', 'b', 'bcl',\n", | |
"          'ch', 'd', 'dcl', 'dh', 'dx', 'eh', 'el', 'em', 'en', 'eng', 'epi',\n", | |
"          'er', 'ey', 'f', 'g', 'gcl', 'h#', 'hh', 'hv', 'ih', 'ix', 'iy',\n", | |
"          'jh', 'k', 'kcl', 'l', 'm', 'n', 'ng', 'nx', 'ow', 'oy', 'p', 'pau',\n", | |
"          'pcl', 'q', 'r', 's', 'sh', 't', 'tcl', 'th', 'uh', 'uw', 'ux', 'v',\n", | |
"          'w', 'y', 'z', 'zh']\n", | |
"# Id used for silence; also used to pad framewise targets.\n", | |
"silence_label = phones.index('h#')\n", | |
"\n", | |
"# Folding of the 61 phones onto the reduced 39-phone set.\n", | |
"# Every phone maps to itself unless listed below; 'q' has no entry at all.\n", | |
"reduce_phones = {p: p for p in phones if p != 'q'}  # discard q\n", | |
"reduce_phones.update({\n", | |
"    # 'ao' (not 'ae') is the phone folded into 'aa' in the standard mapping\n", | |
"    'ao': 'aa',\n", | |
"    'ax': 'ah', 'ax-h': 'ah',\n", | |
"    'axr': 'er',\n", | |
"    'hv': 'hh',\n", | |
"    'ix': 'ih',\n", | |
"    'el': 'l',\n", | |
"    'em': 'm',\n", | |
"    'en': 'n', 'nx': 'n',\n", | |
"    'eng': 'ng',\n", | |
"    'zh': 'sh',\n", | |
"    # closures, pauses and epenthetic silence all become silence\n", | |
"    'pcl': 'h#', 'tcl': 'h#', 'kcl': 'h#', 'bcl': 'h#', 'dcl': 'h#', 'gcl': 'h#', 'pau': 'h#', 'epi': 'h#',\n", | |
"    'ux': 'uw'\n", | |
"})" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## TIMIT Sample Class\n", | |
"We first write a small class that captures and extracts all important information about a single TIMIT sequence." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"class TimitSample(object):\n", | |
"    \"\"\"\n", | |
"    One TIMIT utterance, identified by usage, dialect, speaker and sentence.\n", | |
"\n", | |
"    Files are looked up below the module-level TIMIT_DIR as\n", | |
"    <usage>/<dialect>/<sex + speaker_id>/<sentence_id>.<ext>.\n", | |
"    An optional start/stop window (in audio samples) restricts the audio,\n", | |
"    words and phones that are returned.\n", | |
"    \"\"\"\n", | |
"\n", | |
"    @classmethod\n", | |
"    def create(cls, directory, name):\n", | |
"        \"\"\"\n", | |
"        Build a sample from a directory and the name of a file inside it.\n", | |
"\n", | |
"        Only the last four path components are used: usage, dialect, speaker\n", | |
"        directory (first character = sex, rest = speaker id) and sentence.\n", | |
"        \"\"\"\n", | |
"        stem = os.path.join(directory, name.split('.')[0])\n", | |
"        # Normalise separators so that paths from os.walk split on any platform.\n", | |
"        parts = os.path.normpath(stem).replace(os.sep, '/').split('/')[-4:]\n", | |
"        return cls(parts[0], parts[1], parts[2][0], parts[2][1:], parts[3])\n", | |
"\n", | |
"    def __init__(self, usage, dialect, sex, speaker_id, sentence_id,\n", | |
"                 start=None, stop=None):\n", | |
"        self.usage = usage\n", | |
"        self.dialect = dialect\n", | |
"        self.sex = sex\n", | |
"        self.speaker_id = speaker_id\n", | |
"        self.sentence_id = sentence_id\n", | |
"        self.start = start\n", | |
"        self.stop = stop\n", | |
"\n", | |
"    def _get_path(self, fileending):\n", | |
"        \"\"\"Return the path of this sample's file with the given extension (leading dot optional).\"\"\"\n", | |
"        if not fileending.startswith('.'):\n", | |
"            fileending = '.' + fileending\n", | |
"        return os.path.join(TIMIT_DIR, self.usage, self.dialect, self.sex +\n", | |
"                            self.speaker_id, self.sentence_id + fileending)\n", | |
"\n", | |
"    def _in_window(self, start, stop):\n", | |
"        \"\"\"Return True if start/stop lie inside the optional start/stop window.\"\"\"\n", | |
"        return ((self.start is None or start >= self.start) and\n", | |
"                (self.stop is None or stop <= self.stop))\n", | |
"\n", | |
"    def _read_annotations(self, fileending):\n", | |
"        \"\"\"Read '<start> <stop> <label>' lines and keep the entries inside the window.\"\"\"\n", | |
"        # open() instead of the Python-2-only file() builtin\n", | |
"        with open(self._get_path(fileending), 'r') as f:\n", | |
"            rows = [line.strip().split(' ', 2) for line in f]\n", | |
"        entries = [(int(start), int(stop), label) for start, stop, label in rows]\n", | |
"        return [e for e in entries if self._in_window(e[0], e[1])]\n", | |
"\n", | |
"    def get_sentence(self):\n", | |
"        \"\"\"Return (start, stop, text) read from the .txt file.\"\"\"\n", | |
"        with open(self._get_path('txt'), 'r') as f:\n", | |
"            content = f.read()\n", | |
"        start, stop, sentence = content.split(' ', 2)\n", | |
"        return int(start), int(stop), sentence.strip()\n", | |
"\n", | |
"    def get_words(self):\n", | |
"        \"\"\"Return a list of (start, stop, word) read from the .wrd file.\"\"\"\n", | |
"        return self._read_annotations('wrd')\n", | |
"\n", | |
"    def get_phones(self):\n", | |
"        \"\"\"Return a list of (start, stop, phone, phone_index) read from the .phn file.\"\"\"\n", | |
"        return [(start, stop, phone, phones.index(phone))\n", | |
"                for start, stop, phone in self._read_annotations('phn')]\n", | |
"\n", | |
"    def get_audio_data(self):\n", | |
"        \"\"\"Return the samples of the .wav file as float64, cut to the start/stop window.\"\"\"\n", | |
"        f = al.Sndfile(self._get_path('wav'), 'r')\n", | |
"        try:\n", | |
"            data = f.read_frames(f.nframes, dtype=np.float64)\n", | |
"        finally:\n", | |
"            f.close()  # do not leak one file handle per utterance\n", | |
"        return data[self.start:self.stop]\n", | |
"\n", | |
"    def get_labels(self, frame_size=1, frame_shift=1):\n", | |
"        \"\"\"\n", | |
"        Return one phone index per frame as a np.byte array.\n", | |
"\n", | |
"        Frames are frame_size samples long and frame_shift samples apart; each\n", | |
"        frame gets the phone index that covers most of its samples. Samples\n", | |
"        before the first phone count as silence. Returns an empty array if\n", | |
"        there are no phones.\n", | |
"        \"\"\"\n", | |
"        segments = self.get_phones()\n", | |
"        if not segments:\n", | |
"            return np.array([], dtype=np.byte)\n", | |
"        begin = self.start if self.start else 0\n", | |
"        per_sample = [silence_label] * (segments[0][0] - begin)\n", | |
"        for seg_start, seg_stop, _, index in segments:\n", | |
"            per_sample += [index] * (seg_stop - seg_start)\n", | |
"        # One array conversion up front instead of one per frame inside bincount.\n", | |
"        per_sample = np.asarray(per_sample, dtype=np.intp)\n", | |
"        end = segments[-1][1]\n", | |
"        frame_starts = range(0, end - begin - frame_size + 1, frame_shift)\n", | |
"        frame_stops = range(frame_size, end - begin + 1, frame_shift)\n", | |
"        labels = [np.bincount(per_sample[a:b]).argmax()\n", | |
"                  for a, b in zip(frame_starts, frame_stops)]\n", | |
"        return np.array(labels, dtype=np.byte)\n", | |
"\n", | |
"    def get_features(self, extractor, frame_size=1, frame_shift=1, derivatives=0):\n", | |
"        \"\"\"\n", | |
"        Return (features, labels) for this sample.\n", | |
"\n", | |
"        features is extractor(audio) followed column-wise by `derivatives`\n", | |
"        successive gradients along axis 0; labels comes from get_labels().\n", | |
"        \"\"\"\n", | |
"        d = self.get_audio_data()\n", | |
"        features = extractor(d)\n", | |
"\n", | |
"        feature_derivs = [features]\n", | |
"        for i in range(derivatives):\n", | |
"            # np.gradient returns one array per axis; [0] is the one along axis 0\n", | |
"            feature_derivs.append(np.gradient(feature_derivs[-1])[0])\n", | |
"\n", | |
"        all_features = np.hstack(feature_derivs)\n", | |
"        labels = self.get_labels(frame_size, frame_shift)\n", | |
"        return all_features, labels\n", | |
"\n", | |
"    def __unicode__(self):\n", | |
"        return '<TimitSample ' + '/'.join([self.usage, self.dialect,\n", | |
"                                           self.sex + self.speaker_id,\n", | |
"                                           self.sentence_id]) + '>'\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def read_all_samples():\n", | |
"    \"\"\"Walk TIMIT_DIR and return a TimitSample for every file ending in '.wav'.\"\"\"\n", | |
"    found = []\n", | |
"    for folder, _subdirs, files in os.walk(TIMIT_DIR):\n", | |
"        for fname in files:\n", | |
"            if fname.endswith('.wav'):\n", | |
"                found.append(TimitSample.create(folder, fname))\n", | |
"    return found\n", | |
"\n", | |
"def filter_samples(samples, usage=None, dialect=None, sex=None, speaker_id=None,\n", | |
"                   sentence_id=None):\n", | |
"    \"\"\"\n", | |
"    Return the samples whose attributes equal every criterion that is not None.\n", | |
"    With no criteria given, all samples are returned (as a new list).\n", | |
"    \"\"\"\n", | |
"    criteria = [('usage', usage), ('dialect', dialect), ('sex', sex),\n", | |
"                ('speaker_id', speaker_id), ('sentence_id', sentence_id)]\n", | |
"    active = [(attr, wanted) for attr, wanted in criteria if wanted is not None]\n", | |
"    kept = []\n", | |
"    for s in samples:\n", | |
"        if all(getattr(s, attr) == wanted for attr, wanted in active):\n", | |
"            kept.append(s)\n", | |
"    return kept\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Extract features" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_features_and_labels_for(samples):\n", | |
"    \"\"\"\n", | |
"    Extract features and framewise labels for all samples and pad them to a\n", | |
"    common length (the longest feature sequence).\n", | |
"\n", | |
"    Uses the module-level extractor, derivatives, frame_size, frame_shift,\n", | |
"    DTYPE and silence_label. Returns (features, labels, masks) shaped\n", | |
"    (time, sample, feature), (time, sample, 1) and (time, sample, 1).\n", | |
"    Features are padded with zeros, labels with silence_label, and masks are\n", | |
"    1 where a label exists and 0 on padding.\n", | |
"    \"\"\"\n", | |
"    per_sample = []\n", | |
"    for s in samples:\n", | |
"        per_sample.append(s.get_features(extractor, derivatives=derivatives,\n", | |
"                                         frame_size=frame_size, frame_shift=frame_shift))\n", | |
"\n", | |
"    maxlen = max(f.shape[0] for f, l in per_sample)\n", | |
"    feature_rows, label_rows, mask_rows = [], [], []\n", | |
"    for f, l in per_sample:\n", | |
"        missing_f = maxlen - f.shape[0]\n", | |
"        missing_l = maxlen - l.shape[0]\n", | |
"\n", | |
"        feature_pad = np.zeros((missing_f, f.shape[1]), dtype=DTYPE)\n", | |
"        label_pad = np.ones(missing_l, dtype=DTYPE) * silence_label\n", | |
"        mask_pad = np.zeros(missing_l, dtype=DTYPE)\n", | |
"\n", | |
"        feature_rows.append(np.vstack((f, feature_pad)))\n", | |
"        label_rows.append(np.hstack((l, label_pad)))\n", | |
"        mask_rows.append(np.hstack((np.ones_like(l), mask_pad)))\n", | |
"\n", | |
"    # (time, feature, sample) -> (time, sample, feature)\n", | |
"    features = np.dstack(feature_rows).swapaxes(1, 2)\n", | |
"    labels = np.vstack(label_rows).T.reshape(maxlen, -1, 1)\n", | |
"    masks = np.vstack(mask_rows).T.reshape(maxlen, -1, 1)\n", | |
"    return features, labels, masks" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_padded_labels(samples, reduced=False):\n", | |
"    \"\"\"\n", | |
"    Return the phone-index sequence of every sample, padded with -1.\n", | |
"\n", | |
"    @param samples: objects providing get_phones() -> [(start, stop, phone, index), ...]\n", | |
"    @param reduced: if True, phones are folded through reduce_phones ('q' is\n", | |
"                    dropped) and indexed into the sorted list of folded names\n", | |
"    @return: np.byte array of shape (longest sequence, number of samples, 1)\n", | |
"    \"\"\"\n", | |
"    if not reduced:\n", | |
"        L = [[p[3] for p in s.get_phones()] for s in samples]\n", | |
"    else:\n", | |
"        # Built here so the function does not rely on a global that is only\n", | |
"        # defined in a later cell.\n", | |
"        folded_names = sorted(set(reduce_phones.values()))\n", | |
"        L = [[folded_names.index(reduce_phones[p[2]])\n", | |
"              for p in s.get_phones() if p[2] != 'q']\n", | |
"             for s in samples]\n", | |
"\n", | |
"    L_len = max([len(l) for l in L]) if L else 0\n", | |
"    L_padded = -np.ones([L_len, len(L), 1], dtype=np.byte)\n", | |
"    for i, l in enumerate(L):\n", | |
"        L_padded[:len(l), i, 0] = l\n", | |
"    return L_padded" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Normalization" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_means(input_data, mask=None):\n", | |
"    \"\"\"\n", | |
"    Per-feature mean of a batch of sequences, optionally restricted to the\n", | |
"    masked-in frames.\n", | |
"    @param input_data: Batch of sequences. shape = (time, sample, feature)\n", | |
"    @param mask: Optional mask for the sequences. shape = (time, sample, 1)\n", | |
"    @return: mean value for each feature. shape = (features, )\n", | |
"    \"\"\"\n", | |
"    if mask is None:\n", | |
"        return input_data.mean((0, 1))\n", | |
"    # One row per (time, sample) frame, then keep the frames where mask == 1.\n", | |
"    frames = input_data.reshape(-1, input_data.shape[2])\n", | |
"    keep = mask.flatten() == 1\n", | |
"    return frames[keep].mean(0)\n", | |
"\n", | |
"\n", | |
"def get_stds(input_data, mask=None, channel_mask=None):\n", | |
"    \"\"\"\n", | |
"    Per-feature standard deviation of a batch of sequences, optionally\n", | |
"    restricted to the masked-in frames.\n", | |
"    @param input_data: Batch of sequences. shape = (time, sample, feature)\n", | |
"    @param mask: Optional mask for the sequences. shape = (time, sample, 1)\n", | |
"    @param channel_mask: accepted but not used\n", | |
"    @return: standard deviation of each feature. shape = (features, )\n", | |
"    \"\"\"\n", | |
"    if mask is None:\n", | |
"        return input_data.std((0, 1))\n", | |
"    # One row per (time, sample) frame, then keep the frames where mask == 1.\n", | |
"    frames = input_data.reshape(-1, input_data.shape[2])\n", | |
"    keep = mask.flatten() == 1\n", | |
"    return frames[keep].std(0)\n", | |
"\n", | |
"\n", | |
"def subtract_means(input_data, means, mask=None):\n", | |
"    \"\"\"\n", | |
"    Subtract the per-feature means from a batch of sequences, in place.\n", | |
"    With a mask, only masked-in frames are changed.\n", | |
"\n", | |
"    @param input_data: Batch of sequences. shape = (time, sample, feature)\n", | |
"    @param means: The means to subtract. shape = (features, )\n", | |
"    @param mask: Optional mask for the sequences. shape = (time, sample, 1)\n", | |
"    \"\"\"\n", | |
"    if mask is None:\n", | |
"        input_data[:, :, :] -= means\n", | |
"        return\n", | |
"    for feature in range(input_data.shape[2]):\n", | |
"        # The basic slice is a view, so the masked update writes through.\n", | |
"        input_data[:, :, feature][mask[:, :, 0] == 1] -= means[feature]\n", | |
"\n", | |
"\n", | |
"def divide_by_stds(input_data, stds, mask=None):\n", | |
"    \"\"\"\n", | |
"    Divide a batch of sequences by the per-feature stds, in place.\n", | |
"    With a mask, only masked-in frames are changed.\n", | |
"\n", | |
"    @param input_data: Batch of sequences. shape = (time, sample, feature)\n", | |
"    @param stds: The standard deviations for every feature. shape = (features, )\n", | |
"    @param mask: Optional mask for the sequences. shape = (time, sample, 1)\n", | |
"    \"\"\"\n", | |
"    if mask is None:\n", | |
"        input_data[:, :, :] /= stds\n", | |
"        return\n", | |
"    for feature in range(input_data.shape[2]):\n", | |
"        # The basic slice is a view, so the masked update writes through.\n", | |
"        input_data[:, :, feature][mask[:, :, 0] == 1] /= stds[feature]\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Playing around" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Index every .wav utterance found below TIMIT_DIR (metadata only, no audio is read here)\n", | |
"all_samples = read_all_samples()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# First ten utterances only, for the quick checks below\n", | |
"samples = all_samples[:10]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/greff/venv/py2/lib/python2.7/site-packages/ipykernel/__main__.py:74: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n" | |
] | |
} | |
], | |
"source": [ | |
"# X: padded features, T: framewise targets, M: padding mask (time is the first axis)\n", | |
"# L: per-utterance phone-index sequences, padded with -1\n", | |
"X, T, M = get_features_and_labels_for(samples)\n", | |
"L = get_padded_labels(samples)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Normalise X in place using masked-in frames only.\n", | |
"# The stds are computed after the means have been removed.\n", | |
"means = get_means(X, M)\n", | |
"subtract_means(X, means, M)\n", | |
"stds = get_stds(X, M)\n", | |
"divide_by_stds(X, stds, M)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ -1.36270510e-01, -2.64865545e-02, -1.88458491e-03,\n", | |
" -5.35061361e-03, -4.14322723e-04, -1.62373580e-03,\n", | |
" -4.65313835e-04, 1.66968832e-04, 1.34837504e-03,\n", | |
" 2.84097840e-04, 3.70963168e-04, 9.10096640e-05,\n", | |
" -4.71527965e-05, -1.85187170e-03, -1.06796464e-03,\n", | |
" 1.74962193e-03, 7.97651630e-04, 6.28126471e-04,\n", | |
" -1.48645415e-03, -6.65244577e-04, -3.86702196e-04,\n", | |
" -1.08519194e-04, 1.22560158e-03, 6.08361830e-04,\n", | |
" 8.05459867e-04, -6.19838093e-04, -3.79369618e-04,\n", | |
" -7.56204580e-04, 6.21650029e-04, 9.32087564e-05,\n", | |
" 7.47136741e-04, -3.67511093e-04, -2.92121960e-04,\n", | |
" -1.71338698e-04, -1.18740905e-04, 3.18499112e-04,\n", | |
" 2.22929043e-04, 5.36611170e-04, -8.22041047e-04])" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"X.mean((0, 1)) # checking the mean (taken over all frames, zero padding included)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.9473095372851984" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Checking the standard deviation over the masked-in frames.\n", | |
"# X is reshaped to one row per (time, sample) frame so that the row count\n", | |
"# matches the flattened mask; reshape(-1, 1) produced feature-times too many rows.\n", | |
"X.reshape(-1, X.shape[2])[M.flatten() == 1].std()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Preparing the full(original) TIMIT dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Open the output file in mode 'w' and add the 'original' group.\n", | |
"# The group's description attribute is filled with preprocessing_description.\n", | |
"with h5py.File(filename, 'w') as f:\n", | |
" orig = f.create_group('original')\n", | |
" orig.attrs['description'] = \"\"\"\n", | |
" TIMIT\n", | |
" =====\n", | |
" \n", | |
" This is the original TIMIT dataset.\n", | |
" \n", | |
" Preprocessing\n", | |
" -------------\n", | |
" {}\n", | |
" \n", | |
" Content\n", | |
" -------\n", | |
" default: All audio data padded to be of equal length\n", | |
" targets: Phone index for each frame (same shape as default)\n", | |
" masks: Binary array indicating for each frame whether it is part of a sequence (1) or just padding (0)\n", | |
" labels: Integer array with all the phone indices for a labelling task (not framewise). Padded with -1\n", | |
" names: list of filenames in the original dataset for each sample\n", | |
" \"\"\".format(preprocessing_description)\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### training data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"4620" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Every utterance whose usage component is 'train'; the cell displays the count\n", | |
"train_samples = filter_samples(all_samples, usage='train')\n", | |
"len(train_samples)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Features, framewise targets, mask and padded label sequences for the full training portion\n", | |
"X, T, M = get_features_and_labels_for(train_samples)\n", | |
"L = get_padded_labels(train_samples)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Normalise the training features in place (masked-in frames only).\n", | |
"# means and stds are kept and reused below to normalise the test data.\n", | |
"means = get_means(X, M)\n", | |
"subtract_means(X, means, M)\n", | |
"\n", | |
"stds = get_stds(X, M)\n", | |
"divide_by_stds(X, stds, M)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Random permutation of the training samples.\n", | |
"# list() is required because shuffle() needs a mutable sequence, which a lazy range is not.\n", | |
"# NOTE: no seed is set, so the order differs from run to run.\n", | |
"shuffling = list(range(len(train_samples)))\n", | |
"shuffle(shuffling)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Apply the same permutation along the sample axis (axis 1) of every array\n", | |
"X = X[:, shuffling, :]\n", | |
"T = T[:, shuffling, :]\n", | |
"M = M[:, shuffling, :]\n", | |
"L = L[:, shuffling, :]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Encoded .txt paths listed in shuffled order, so that they line up with the arrays\n", | |
"timit_train_names = [train_samples[i]._get_path(\".txt\").encode() for i in shuffling]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Append the training arrays to the 'original' group.\n", | |
"# The chunk shape holds one complete sample (all time steps and features) per chunk.\n", | |
"with h5py.File(filename, 'a') as f:\n", | |
" orig = f['original']\n", | |
" train = orig.create_group('training')\n", | |
" train.create_dataset('default', data=X, compression='gzip', chunks=(X.shape[0], 1, X.shape[2]))\n", | |
" train.create_dataset('targets', data=T, compression='gzip', chunks=(T.shape[0], 1, T.shape[2]))\n", | |
" train.create_dataset('masks', data=M, compression='gzip', chunks=(M.shape[0], 1, M.shape[2]))\n", | |
" train.create_dataset('labels', data=L)\n", | |
" train.create_dataset('names', data=np.array(timit_train_names))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Drop the references to the training arrays, which have been written to the file\n", | |
"del X, T, M, L" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### test data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1680" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Every utterance whose usage component is 'test'; the cell displays the count\n", | |
"test_samples = filter_samples(all_samples, usage='test')\n", | |
"len(test_samples)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Features, framewise targets, mask and padded label sequences for the test portion\n", | |
"X_test, T_test, M_test = get_features_and_labels_for(test_samples)\n", | |
"L_test = get_padded_labels(test_samples)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Normalise the test features with the means/stds computed on the training data\n", | |
"subtract_means(X_test, means, M_test)\n", | |
"divide_by_stds(X_test, stds, M_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Encoded .txt paths of the test samples, in the (unshuffled) order of test_samples\n", | |
"timit_test_names = [x._get_path(\".txt\").encode() for x in test_samples]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Append the test arrays to the 'original' group; one complete sample per chunk\n", | |
"with h5py.File(filename, 'a') as f:\n", | |
" orig = f['original']\n", | |
" test = orig.create_group('test')\n", | |
" test.create_dataset('default', data=X_test, compression='gzip', chunks=(X_test.shape[0], 1, X_test.shape[2]))\n", | |
" test.create_dataset('targets', data=T_test, compression='gzip', chunks=(T_test.shape[0], 1, T_test.shape[2]))\n", | |
" test.create_dataset('masks', data=M_test, compression='gzip', chunks=(M_test.shape[0], 1, M_test.shape[2]))\n", | |
" test.create_dataset('labels', data=L_test)\n", | |
" test.create_dataset('names', data=np.array(timit_test_names))\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# The reduced Timit Dataset\n", | |
"\n", | |
"See the PhD thesis of Andrew K. Halberstadt (1998)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Add the 'reduced' group and its description to the output file.\n", | |
"with h5py.File(filename, 'a') as f: \n", | |
"# Mode 'a' keeps what is already in the file; set 'a' to 'w' to start from an\n", | |
"# empty file if you didn't prepare the full (original) TIMIT\n", | |
" orig = f.create_group('reduced')\n", | |
" orig.attrs['description'] = \"\"\"\n", | |
" TIMIT Reduced\n", | |
" =============\n", | |
" \n", | |
" This is the reduced TIMIT dataset. \n", | |
" It only uses a core test set of 24 speakers, discards all the SA samples from training and has a fixed validation set.\n", | |
" (For details see the PhD Thesis of Andrew K. Halberstadt 1998.)\n", | |
" \n", | |
" Preprocessing\n", | |
" -------------\n", | |
" {}\n", | |
" \n", | |
" Content\n", | |
" -------\n", | |
" default: All audio data padded to be of equal length\n", | |
" targets: Phone index for each frame (same shape as default)\n", | |
" masks: Binary array indicating for each frame whether it is part of a sequence (1) or just padding (0)\n", | |
" labels: Integer array with all the phone indices for a labelling task (not framewise). Padded with -1\n", | |
" labels_reduced: Integer array with all the phones mapped to the reduced phone set. (like labels)\n", | |
" names: list of filenames in the original dataset for each sample\n", | |
" \"\"\".format(preprocessing_description)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"24" | |
] | |
}, | |
"execution_count": 30, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Speaker directory names (sex letter + speaker id) that make up the core test set\n", | |
"core_test_speakers={\"mdab0\", \"mwbt0\", \"felc0\", \"mtas1\", \"mwew0\", \"fpas0\",\n", | |
" \"mjmp0\", \"mlnt0\", \"fpkt0\", \"mlll0\", \"mtls0\", \"fjlm0\",\n", | |
" \"mbpm0\", \"mklt0\", \"fnlp0\", \"mcmj0\", \"mjdh0\", \"fmgd0\",\n", | |
" \"mgrt0\", \"mnjm0\", \"fdhc0\", \"mjln0\", \"mpam0\", \"fmld0\"}\n", | |
"len(core_test_speakers) # should be 24" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"192" | |
] | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Utterances of the core test speakers, without the sentences whose id starts with 'sa'\n", | |
"core_test_set= [x for x in all_samples if x.sex + x.speaker_id in core_test_speakers and not x.sentence_id.startswith('sa')]\n", | |
"len(core_test_set) # should be 192" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"462" | |
] | |
}, | |
"execution_count": 32, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Sanity check: distinct speaker ids (without the sex letter) in the training portion\n", | |
"len({x.speaker_id for x in all_samples if x.usage=='train'}) # should be 462" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"3696" | |
] | |
}, | |
"execution_count": 33, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Reduced training set: training utterances without the sentences whose id starts with 'sa'\n", | |
"train_set = [x for x in all_samples if x.usage=='train' and not x.sentence_id.startswith('sa')]\n", | |
"len(train_set) # should be 3696" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"168" | |
] | |
}, | |
"execution_count": 34, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Sanity check: distinct speaker ids (without the sex letter) in the test portion\n", | |
"len({x.speaker_id for x in all_samples if x.usage=='test'}) # should be 168" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"50" | |
] | |
}, | |
"execution_count": 35, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Speaker directory names (sex letter + speaker id) that make up the fixed validation set\n", | |
"val_set_speakers={'faks0', 'mmdb1', 'mbdg0', 'fedw0', 'mtdt0', 'fsem0', 'mdvc0', 'mrjm4', 'mjsw0', 'mteb0',\n", | |
" 'fdac1', 'fjem0', 'mgwt0', 'mmdm2', 'mpdf0', 'fcmh0', 'mbwm0', 'mcsh0', 'fadg0', 'mgjf0',\n", | |
" 'mglb0', 'mrtk0', 'mthc0', 'mwjg0', 'fnmr0', 'mbns0', 'mmjr0', 'mdls0', 'mers0', 'fmah0',\n", | |
" 'fdrw0', 'fcal1', 'mmwh0', 'fjsj0', 'mreb0', 'fgjd0', 'fjmg0', 'mjfc0', 'mrjr0', 'fmml0',\n", | |
" 'mjar0', 'fkms0', 'fdms0', 'mtaa0', 'frew0', 'mdlf0', 'mrcs0', 'majc0', 'mroa0', 'mrws1'}\n", | |
"len(val_set_speakers) # should be 50" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"400" | |
] | |
}, | |
"execution_count": 36, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Utterances of the validation speakers, without the sentences whose id starts with 'sa'.\n", | |
"# The speakers are matched in all of all_samples; usage is not checked here.\n", | |
"val_set= [x for x in all_samples if x.sex + x.speaker_id in val_set_speakers and not x.sentence_id.startswith('sa')]\n", | |
"len(val_set) # should be 400" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"39" | |
] | |
}, | |
"execution_count": 37, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"reduced_phones = sorted({reduce_phones[p] for p in phones if p != 'q'})\n", | |
"len(reduced_phones) # should be 39" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Preparing the training set" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"X_train, T_train, M_train = get_features_and_labels_for(train_set)\n", | |
"L_train = get_padded_labels(train_set)\n", | |
"L_train_reduced = get_padded_labels(train_set, reduced=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"means = get_means(X_train, M_train)\n", | |
"subtract_means(X_train, means, M_train)\n", | |
"\n", | |
"stds = get_stds(X_train, M_train)\n", | |
"divide_by_stds(X_train, stds, M_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"shuffling = range(len(train_set))\n", | |
"shuffle(shuffling)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"X_train = X_train[:, shuffling, :]\n", | |
"T_train = T_train[:, shuffling, :]\n", | |
"M_train = M_train[:, shuffling, :]\n", | |
"L_train = L_train[:, shuffling, :]\n", | |
"L_train_reduced = L_train_reduced[:, shuffling, :]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"rtimit_train_names = [train_set[i]._get_path(\".txt\").encode() for i in shuffling]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"with h5py.File(filename, 'a') as f:\n", | |
" orig = f['reduced']\n", | |
" train = orig.create_group('training')\n", | |
" train.create_dataset('default', data=X_train, compression='gzip', chunks=(X_train.shape[0], 1, X_train.shape[2]))\n", | |
" train.create_dataset('targets', data=T_train, compression='gzip', chunks=(T_train.shape[0], 1, T_train.shape[2]))\n", | |
" train.create_dataset('masks', data=M_train, compression='gzip', chunks=(M_train.shape[0], 1, M_train.shape[2]))\n", | |
" train.create_dataset('labels', data=L_train)\n", | |
" train.create_dataset('labels_reduced', data=L_train_reduced)\n", | |
" train.create_dataset('names', data=np.array(rtimit_train_names))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"del X_train, T_train, M_train, L_train, L_train_reduced" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Preparing the validation set" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"X_val, T_val, M_val = get_features_and_labels_for(val_set)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"subtract_means(X_val, means, M_val)\n", | |
"divide_by_stds(X_val, stds, M_val)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"L_val = get_padded_labels(val_set)\n", | |
"L_val_reduced = get_padded_labels(val_set, reduced=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"rtimit_val_names = [val_set[i]._get_path(\".txt\").encode() for i in range(len(val_set))]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"with h5py.File(filename, 'a') as f:\n", | |
" orig = f['reduced']\n", | |
" validation = orig.create_group('validation')\n", | |
" validation.create_dataset('default', data=X_val, compression='gzip', chunks=(X_val.shape[0], 1, X_val.shape[2]))\n", | |
" validation.create_dataset('targets', data=T_val, compression='gzip', chunks=(T_val.shape[0], 1, T_val.shape[2]))\n", | |
" validation.create_dataset('masks', data=M_val, compression='gzip', chunks=(M_val.shape[0], 1, M_val.shape[2]))\n", | |
" validation.create_dataset('labels', data=L_val)\n", | |
" validation.create_dataset('labels_reduced', data=L_val_reduced)\n", | |
" validation.create_dataset('names', data=np.array(rtimit_val_names))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"del X_val, T_val, M_val, L_val, L_val_reduced" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## preparing the core test set" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 51, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"X_test, T_test, M_test = get_features_and_labels_for(core_test_set)\n", | |
"L_test = get_padded_labels(core_test_set)\n", | |
"L_test_reduced = get_padded_labels(core_test_set, reduced=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 52, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"subtract_means(X_test, means, M_test)\n", | |
"divide_by_stds(X_test, stds, M_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"rtimit_test_names = [core_test_set[i]._get_path(\".txt\").encode() for i in range(len(core_test_set))]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"with h5py.File(filename, 'a') as f:\n", | |
" orig = f['reduced']\n", | |
" validation = orig.create_group('test')\n", | |
" validation.create_dataset('default', data=X_test, compression='gzip', chunks=(X_test.shape[0], 1, X_test.shape[2]))\n", | |
" validation.create_dataset('targets', data=T_test, compression='gzip', chunks=(T_test.shape[0], 1, T_test.shape[2]))\n", | |
" validation.create_dataset('masks', data=M_test, compression='gzip', chunks=(M_test.shape[0], 1, M_test.shape[2]))\n", | |
" validation.create_dataset('labels', data=L_test)\n", | |
" validation.create_dataset('labels_reduced', data=L_test_reduced)\n", | |
" validation.create_dataset('names', data=np.array(rtimit_test_names))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"del X_test, T_test, M_test, L_test, L_test_reduced" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment