Created
February 27, 2018 14:05
-
-
Save iamukasa/cc553a5a4082f77d9e0bee1c74449d12 to your computer and use it in GitHub Desktop.
Preprocessing from the Ubuntu Dialog Corpus to our dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import csv | |
import itertools | |
import functools | |
import tensorflow as tf | |
import numpy as np | |
import array | |
# Both directory flags default to the same absolute "./data" location.
_DEFAULT_DATA_DIR = os.path.abspath("./data")

# Command-line flags controlling vocabulary construction and file locations.
tf.flags.DEFINE_integer(
    "min_word_frequency",
    5,
    "Minimum frequency of words in the vocabulary")
tf.flags.DEFINE_integer(
    "max_sentence_len",
    160,
    "Maximum Sentence Length")
tf.flags.DEFINE_string(
    "input_dir",
    _DEFAULT_DATA_DIR,
    "Input directory containing original CSV data files (default = './data')")
tf.flags.DEFINE_string(
    "output_dir",
    _DEFAULT_DATA_DIR,
    "Output directory for TFrEcord files (default = './data')")

FLAGS = tf.flags.FLAGS

# CSV splits read from the input directory. These are evaluated at module
# level, so FLAGS.input_dir is read as soon as the module is loaded.
TRAIN_PATH, VALIDATION_PATH, TEST_PATH = (
    os.path.join(FLAGS.input_dir, csv_name)
    for csv_name in ("train.csv", "valid.csv", "test.csv"))
def tokenizer_fn(iterator):
    """
    Lazily tokenizes each string of `iterator` by splitting on single spaces.

    Returns a generator yielding one list of tokens per input string.
    Consecutive spaces produce empty-string tokens because the split is on
    the literal " " separator.
    """
    return (sentence.split(" ") for sentence in iterator)
def create_csv_iter(filename):
    """
    Returns an iterator over the rows of a CSV file, skipping the header.

    Args:
        filename: path of the CSV file to read.

    Yields:
        Each data row as a list of strings, as produced by csv.reader.

    An empty file (no header line at all) yields nothing.
    """
    with open(filename) as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header. The default of None matters: a bare next() on an
        # empty file raises StopIteration inside this generator, which
        # Python 3.7+ converts into a RuntimeError (PEP 479).
        next(reader, None)
        for row in reader:
            yield row
def create_vocab(input_iter, min_frequency):
    """
    Builds a VocabularyProcessor and fits it on the given documents.

    Args:
        input_iter: iterable of documents handed to the processor's fit().
        min_frequency: forwarded as the processor's `min_frequency` argument.

    Returns:
        The fitted VocabularyProcessor, constructed with
        FLAGS.max_sentence_len as its first argument and tokenizer_fn as its
        tokenizer.
    """
    processor_cls = tf.contrib.learn.preprocessing.VocabularyProcessor
    processor = processor_cls(
        FLAGS.max_sentence_len,
        min_frequency=min_frequency,
        tokenizer_fn=tokenizer_fn)
    processor.fit(input_iter)
    return processor
def transform_sentence(sequence, vocab_processor):
    """
    Maps a single sentence into the integer vocabulary.

    The sentence is wrapped in a one-element list, passed through
    `vocab_processor.transform`, and the first result is converted with
    `.tolist()`. Returns that Python list.
    """
    encoded_batch = vocab_processor.transform([sequence])
    first_encoded = next(encoded_batch)
    return first_encoded.tolist()
def create_text_sequence_feature(fl, sentence, sentence_len, vocab):
    """
    Appends a sentence to a FeatureList protocol buffer, one Feature per word.

    Args:
        fl: the FeatureList-like object to append to; it is mutated in place.
        sentence: raw sentence text.
        sentence_len: accepted but not used by this function.
        vocab: vocabulary processor used to map the sentence to word ids.

    Returns:
        The same `fl` object that was passed in.
    """
    for word_id in transform_sentence(sentence, vocab):
        # Each word id gets its own Feature entry holding a single int64.
        new_feature = fl.feature.add()
        new_feature.int64_list.value.append(word_id)
    return fl
def create_example_train(row, vocab):
    """
    Creates a training example for the Ubuntu Dialog Corpus dataset.

    `row` must unpack into exactly three values: context, utterance, label.
    Returns a tensorflow.Example protocol buffer with the int64 features
    "context", "utterance", "context_len", "utterance_len" and "label".
    """
    context, utterance, label = row

    def token_count(text):
        # Lengths come from the tokenizer output, not from the transformed
        # id sequence.
        return len(next(vocab._tokenizer([text])))

    int64_features = [
        ("context", transform_sentence(context, vocab)),
        ("utterance", transform_sentence(utterance, vocab)),
        ("context_len", [token_count(context)]),
        ("utterance_len", [token_count(utterance)]),
        # The label is parsed as a float first, then truncated to an int.
        ("label", [int(float(label))]),
    ]

    example = tf.train.Example()
    for key, values in int64_features:
        example.features.feature[key].int64_list.value.extend(values)
    return example
def create_example_test(row, vocab):
    """
    Creates a test/validation example for the Ubuntu Dialog Corpus dataset.

    The first two entries of `row` are the context and the utterance; every
    remaining entry is treated as a distractor. Returns a tensorflow.Example
    protocol buffer with the int64 features "context", "utterance",
    "context_len", "utterance_len" and, for the i-th distractor,
    "distractor_i" and "distractor_i_len".
    """
    context, utterance = row[:2]

    def token_count(text):
        # Lengths come from the tokenizer output, not from the transformed
        # id sequence.
        return len(next(vocab._tokenizer([text])))

    base_features = [
        ("context_len", [token_count(context)]),
        ("utterance_len", [token_count(utterance)]),
        ("context", transform_sentence(context, vocab)),
        ("utterance", transform_sentence(utterance, vocab)),
    ]

    example = tf.train.Example()
    feature_map = example.features.feature
    for key, values in base_features:
        feature_map[key].int64_list.value.extend(values)

    # Distractor sequences: a length feature and a text feature for each.
    for index, distractor in enumerate(row[2:]):
        length_key = "distractor_{}_len".format(index)
        text_key = "distractor_{}".format(index)
        feature_map[length_key].int64_list.value.extend(
            [token_count(distractor)])
        feature_map[text_key].int64_list.value.extend(
            transform_sentence(distractor, vocab))
    return example
def create_tfrecords_file(input_filename, output_filename, example_fn):
    """
    Creates a TFRecords file for the given input data and example
    transformation function.

    Args:
        input_filename: path of the CSV file to read (its header is skipped
            by create_csv_iter).
        output_filename: path of the TFRecords file to write.
        example_fn: callable taking one CSV row and returning an object with
            a SerializeToString() method.
    """
    writer = tf.python_io.TFRecordWriter(output_filename)
    print("Creating TFRecords file at {}...".format(output_filename))
    try:
        for row in create_csv_iter(input_filename):
            writer.write(example_fn(row).SerializeToString())
    finally:
        # Close the writer even when a row fails to convert, so the file
        # handle is not leaked.
        writer.close()
    print("Wrote to {}".format(output_filename))
def write_vocabulary(vocab_processor, outfile):
    """
    Writes the vocabulary to `outfile`, one word per line.

    Words are written in id order, from id 0 up to (but excluding) the
    vocabulary size, using the vocabulary's reverse id-to-word mapping.
    """
    vocabulary = vocab_processor.vocabulary_
    word_count = len(vocabulary)
    with open(outfile, "w") as vocabfile:
        for word_id in range(word_count):
            vocabfile.write(vocabulary._reverse_mapping[word_id] + "\n")
    print("Saved vocabulary to {}".format(outfile))
if __name__ == "__main__":
    print("Creating vocabulary...")
    # The vocabulary is fitted on the first two columns of every training
    # row, joined by a single space.
    train_text = (
        row[0] + " " + row[1] for row in create_csv_iter(TRAIN_PATH))
    vocab = create_vocab(train_text, min_frequency=FLAGS.min_word_frequency)
    print("Total vocabulary size: {}".format(len(vocab.vocabulary_)))

    # Create vocabulary.txt file
    vocabulary_path = os.path.join(FLAGS.output_dir, "vocabulary.txt")
    write_vocabulary(vocab, vocabulary_path)

    # Save vocab processor
    processor_path = os.path.join(FLAGS.output_dir, "vocab_processor.bin")
    vocab.save(processor_path)

    # (input CSV, output file name, example builder), processed in order:
    # validation, test, then train.
    conversions = [
        (VALIDATION_PATH, "validation.tfrecords", create_example_test),
        (TEST_PATH, "test.tfrecords", create_example_test),
        (TRAIN_PATH, "train.tfrecords", create_example_train),
    ]
    for csv_path, record_name, example_builder in conversions:
        create_tfrecords_file(
            input_filename=csv_path,
            output_filename=os.path.join(FLAGS.output_dir, record_name),
            example_fn=functools.partial(example_builder, vocab=vocab))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WNDEF SOPHIE | |
I am a smart chat robot who knows about reproductive health. Ask me any questions about sexual and reproductive health. The Devs have fed me with a lot of knowledge about sexual and reproductive health. I also talk back, but you can shut me up if you so wish. | |
WNDEF YOUR NAME | |
My name is Sophie. I am a smart chat robot who knows about reproductive health. Ask me any questions about sexual and reproductive health. The Devs have fed me with a lot of knowledge about sexual and reproductive health. I also talk back, but you can shut me up if you so wish. | |
WNDEF Unaitwa nani | |
My name is Sophie. I am a smart chat robot who knows about reproductive health. Ask me any questions about sexual and reproductive health. The Devs have fed me with a lot of knowledge about sexual and reproductive health. I also talk back, but you can shut me up if you so wish. | |
WNDEF SEX | |
The state of being male or female. Men or male animals as a group or women or female animals as a group. Physical activity in which people touch each other's bodies, kiss each other, etc. Physical activity that is related to and often includes sexual intercourse. | |
WNDEF WHAT IS SEX | |
The state of being male or female. Men or male animals as a group or women or female animals as a group. Physical activity in which people touch each other's bodies, kiss each other, etc. Physical activity that is related to and often includes sexual intercourse. | |
WNDEF DESCRIBE SEX | |
The state of being male or female. Men or male animals as a group or women or female animals as a group. Physical activity in which people touch each other's bodies, kiss each other, etc. Physical activity that is related to and often includes sexual intercourse. | |
WNDEF EXPLAIN SEX | |
The state of being male or female. Men or male animals as a group or women or female animals as a group. Physical activity in which people touch each other's bodies, kiss each other, etc. Physical activity that is related to and often includes sexual intercourse. | |
WNDEF DEFINE SEX | |
The state of being male or female. Men or male animals as a group or women or female animals as a group. Physical activity in which people touch each other's bodies, kiss each other, etc. Physical activity that is related to and often includes sexual intercourse. | |
WNDEF HEALTH | |
The state of physical, mental and psychological well-being. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment