pavoque to hdf5
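Converts the PAVOQUE voice data prepared with Merlin into a single Fuel-compatible HDF5 file: per-utterance normalised acoustic features (mgc/lf0/vuv/bap, 63 dimensions), character-encoded transcripts, and a per-utterance emotion index (stored as 'speaker_index'), with contiguous train/valid/test splits.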
# Python 2 script: Merlin utilities (run_merlin, io_funcs) and Fuel are
# imported from the surrounding experiment setup.
from run_merlin import prepare_file_path_list, read_file_list
from io_funcs.binary_io import BinaryIOCollection
import numpy
import h5py
import pickle
from fuel.datasets.hdf5 import H5PYDataset

io_fun = BinaryIOCollection()
n_outs = 63  # per-frame acoustic feature dimension (alternative: 187)

# Input/output locations.
save_dir = '/Tmp/sotelo/data/pavoque/'
base_dir = '/Tmp/sotelo/results/merlin/egs/build_your_own_voice/s1/'
merlin_data_dir = base_dir + \
    'experiments/pavoque/acoustic_model/data/'

# Utterance IDs; the emotion label is the second '-'-separated field of
# each ID and is mapped to a 1-based integer code.
file_list = read_file_list(merlin_data_dir + 'file_id_list.scp')
emotion_set = [x.split('-')[1] for x in file_list]
emotion_set = sorted(list(set(emotion_set)))
emotion_dict = {x: i + 1 for i, x in enumerate(emotion_set)}
# Transcripts from the raw (pre-Merlin) data: each line of utts.data
# holds the utterance ID as its second whitespace-separated token and
# the transcript between the outer double quotes.
raw_dir = base_dir + 'raw_data/pavoque/'
text_file = raw_dir + 'utts.data'
with open(text_file) as f:
    text_data = f.readlines()
id_from_text = [x.split()[1] for x in text_data]
error_files = [
    (i, x) for i, x in enumerate(id_from_text) if x not in file_list]
assert id_from_text == file_list

# Character-level encoding of the transcripts (1-based, lowercase).
text_data = ['"'.join(x.strip().split('"')[1:-1]) for x in text_data]
char_set = sorted(list(set(''.join(text_data).lower())))
char2code = {x: i + 1 for i, x in enumerate(char_set)}
with open(save_dir + 'char2code.pkl', 'w') as f:
    pickle.dump(char2code, f)
# Normalised acoustic feature files (.cmp) produced by Merlin.
audio_files = prepare_file_path_list(
    file_list, merlin_data_dir + 'nn_norm_mgc_lf0_vuv_bap_63', '.cmp')

resulth5 = h5py.File(
    '/Tmp/sotelo/data/pavoque/pavoque.hdf5', mode='w')
num_files = len(file_list)

# Variable-length feature matrices are stored flattened, with a 'shapes'
# dimension scale so that Fuel's H5PYDataset can restore the original
# (time_step, num_feature) shape of each utterance.
features_h5 = resulth5.create_dataset(
    'features', (num_files,),
    dtype=h5py.special_dtype(vlen=numpy.dtype('float32')))
features_shape_h5 = resulth5.create_dataset(
    'features_shapes', (num_files, 2), dtype='int32')
features_h5.dims.create_scale(features_shape_h5, 'shapes')
features_h5.dims[0].attach_scale(features_shape_h5)

features_shape_labels = resulth5.create_dataset(
    'features_shape_labels', (2,), dtype='S11')  # wide enough for 'num_feature'
features_shape_labels[...] = [
    'time_step'.encode('utf8'),
    'num_feature'.encode('utf8')]
features_h5.dims.create_scale(
    features_shape_labels, 'shape_labels')
features_h5.dims[0].attach_scale(features_shape_labels)

# Character codes and per-utterance emotion index.
text_h5 = resulth5.create_dataset(
    'text', (num_files,),
    dtype=h5py.special_dtype(vlen=numpy.dtype('int32')))
speaker_index_h5 = resulth5.create_dataset(
    'speaker_index', (num_files, 1), dtype='uint8')
# Write the utterances in a fixed pseudo-random order so that the
# contiguous train/valid/test splits defined below are already shuffled.
order = range(num_files)
numpy.random.seed(1)
numpy.random.shuffle(order)
for i, idx in enumerate(order):
    if i % 100 == 0:
        print i
    out_features, out_frame_number = io_fun.load_binary_file_frame(
        audio_files[idx], n_outs)
    features_h5[i] = out_features.flatten()
    features_shape_h5[i] = numpy.array(out_features.shape)
    speaker_label = id_from_text[idx].split('-')[1]
    speaker_index_h5[i] = emotion_dict[speaker_label]
    text_h5[i] = numpy.array(
        [char2code[x.lower()] for x in text_data[idx]], dtype='int32')
# Fuel split metadata: 90% train, 5% valid, 5% test.
end_train = int(.9 * num_files)
end_valid = int(.95 * num_files)
end_test = num_files
split_dict = {
    'train': {'features': (0, end_train),
              'text': (0, end_train),
              'speaker_index': (0, end_train)},
    'valid': {'features': (end_train, end_valid),
              'text': (end_train, end_valid),
              'speaker_index': (end_train, end_valid)},
    'test': {'features': (end_valid, end_test),
             'text': (end_valid, end_test),
             'speaker_index': (end_valid, end_test)}}
resulth5.attrs['split'] = H5PYDataset.create_split_array(split_dict)

resulth5.flush()
resulth5.close()
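A minimal sketch of how the resulting file could be read back, assuming a Fuel version whose H5PYDataset understands the 'shapes'/'shape_labels' dimension scales written above; the slice bounds are only illustrative:

from fuel.datasets.hdf5 import H5PYDataset

train_set = H5PYDataset(
    '/Tmp/sotelo/data/pavoque/pavoque.hdf5',
    which_sets=('train',),
    sources=('features', 'text', 'speaker_index'))
handle = train_set.open()
# Each 'features' example comes back reshaped to (time_step, num_feature).
features, text, speaker_index = train_set.get_data(handle, slice(0, 4))
train_set.close(handle)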