import numpy as np


class SequenceDataset():

    def __init__(self, sequence_length):
        """Initialises the dataset class that contains the sequences that will be trained on.

        Params:
            sequence_length: int - the length of a sequence that will be fed to the RNN.
        """
        self.sequence_length = sequence_length
        self.pad_length = self.sequence_length - 1

        # Create dictionaries for conversion between chars and tokens and back again.
        self.token_to_char = dict()
        self.char_to_token = dict()
        self.vocabulary_size = 0
        self.counter = 0

        # Special characters to pad, start and end sentences.
        self.pad_char = '<PAD_CHAR>'
        self.start_char = '<START_CHAR>'
        self.end_char = '<END_CHAR>'

        # Add the special characters to the vocabulary.
        self.add_character(self.pad_char)
        self.add_character(self.start_char)
        self.add_character(self.end_char)

        # Get the special tokens needed to pad, start and end sentences of tokens.
        self.pad_token = self.char_to_token[self.pad_char]
        self.start_token = self.char_to_token[self.start_char]
        self.end_token = self.char_to_token[self.end_char]

        # Every sequence begins with pad tokens followed by a single start token.
        self.empty_start_tokens = [self.pad_token for _ in range(self.pad_length)]
        self.empty_start_tokens += [self.start_token]

        self.dataset_x = None
        self.dataset_y = None

    def add_character(self, char):
        """Adds a character to the vocabulary dictionary.

        Only adds the character if it is not already present in the vocabulary dictionary
        that comprises every character used in the dataset.

        Params:
            char: str - the char that will be added to the vocabulary.
        """
        # If the char is not already in the dictionary...
        if char not in self.char_to_token:
            # Create a unique int token for the char.
            self.char_to_token[char] = self.counter
            # Create the token to char conversion dictionary entry.
            self.token_to_char[self.counter] = char
            self.counter += 1
            self.vocabulary_size = len(self.char_to_token)

    def __len__(self):
        """Returns the length of the dataset - a massive list of sequences.

        Example usage:
            dataset = SequenceDataset(10)
            dataset.add_sequences_from_bios(bios)
            # Returns the length of the dataset defined in the function.
            print(len(dataset))
            >> 3201931
        """
        # If the dataset object has only just been constructed.
        if self.dataset_x is None:
            return 0
        # Sanity check: there must be exactly one target per input sequence.
        assert len(self.dataset_x) == len(self.dataset_y)
        return len(self.dataset_x)

    def add_sequences_from_bios(self, bios):
        """Fills this dataset object with sequences from the argument.

        This method takes a list of strings, where each string is a bio from
        a dating profile, and turns it into a list of fixed sequence_length tokens.
        Each bio will be padded and wrapped with start and end tokens respectively.

        Params:
            bios: list(str) - the list of profile bios, which may include unicode emojis.
        """
        # The token sequences that will be created to input into the RNN.
        token_inputs = []
        # The token targets that the RNN will be expected to predict.
        token_targets = []

        # Loop over the biographies passed in.
        for i, bio in enumerate(bios):

            # Create a list to store the tokens for this bio, beginning with
            # the padding and start tokens.
            bio_as_tokens = list(self.empty_start_tokens)

            # For each char in the bio, add the character to the vocabulary and convert it to a token.
            for char in bio:
                self.add_character(char)
                token = self.char_to_token[char]
                bio_as_tokens.append(token)

            # Add the end token to wrap the sentence of tokens.
            bio_as_tokens.append(self.end_token)

            # Slide a window over the sequence of tokens, creating fixed-length
            # inputs for the RNN, each paired with the single following token as its target.
            # The upper bound ensures the final window's target is the end token.
            for start in range(0, len(bio_as_tokens) - self.sequence_length):
                end = start + self.sequence_length
                token_input = bio_as_tokens[start:end]
                token_target = bio_as_tokens[end]
                token_inputs.append(token_input)
                token_targets.append(token_target)

            # Print progress.
            print("{:.2f}% done ".format((i + 1) / len(bios) * 100), end='\r')

        # Create or append the inputs and targets to the main dataset.
        if self.dataset_x is None:
            self.dataset_x = np.array(token_inputs)
            self.dataset_y = np.array(token_targets)
        else:
            self.dataset_x = np.concatenate((self.dataset_x, np.array(token_inputs)),
                                            axis=0)
            self.dataset_y = np.concatenate((self.dataset_y, np.array(token_targets)),
                                            axis=0)

    def convert_tokens_to_words(self, tokens):
        """Takes a sequence of tokens and converts it back to characters.

        Helper function that also filters out the special padding, start and end tokens.

        Params:
            tokens: list(int) - the sequence of tokens that will be returned as a string of words.
        """
        chars = []
        # For each token.
        for token in tokens:
            # If it is not one of the special tokens.
            if token not in [self.start_token, self.pad_token, self.end_token]:
                char = self.token_to_char[token]
                chars.append(char)
        # Return the string of words.
        return ''.join(chars)
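

# A minimal usage sketch (not part of the original gist): it assumes a small list of
# example bios and simply shows the shapes the class produces and the round trip from
# tokens back to text. The bios and variable names here are illustrative only.
if __name__ == '__main__':
    example_bios = [
        "Coffee lover and part-time climber.",
        "Dog person. Will travel for food.",
    ]

    dataset = SequenceDataset(sequence_length=10)
    dataset.add_sequences_from_bios(example_bios)

    # dataset_x has shape (num_sequences, sequence_length) and dataset_y has
    # shape (num_sequences,) - one target token per input sequence.
    print(len(dataset), dataset.dataset_x.shape, dataset.dataset_y.shape)

    # Convert the first input sequence back into readable characters, dropping
    # the special pad/start/end tokens.
    print(dataset.convert_tokens_to_words(dataset.dataset_x[0]))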