@fedden
Created April 21, 2018 16:08
import numpy as np


class SequenceDataset():

    def __init__(self, sequence_length):
        """Initialises the dataset class that contains the sequences that will be trained on.

        Params:
            sequence_length: int - the length of a sequence that will be fed to the RNN.
        """
        self.sequence_length = sequence_length
        self.pad_length = self.sequence_length - 1
        # Creating dictionaries for conversion between chars and tokens and back again.
        self.token_to_char = dict()
        self.char_to_token = dict()
        self.vocabulary_size = 0
        self.counter = 0
        # Special characters to pad, start and end sentences.
        self.pad_char = '<PAD_CHAR>'
        self.start_char = '<START_CHAR>'
        self.end_char = '<END_CHAR>'
        # Add the special characters to the vocabulary.
        self.add_character(self.pad_char)
        self.add_character(self.start_char)
        self.add_character(self.end_char)
        # Get the special tokens needed to pad, start and end sentences of tokens.
        self.pad_token = self.char_to_token[self.pad_char]
        self.start_token = self.char_to_token[self.start_char]
        self.end_token = self.char_to_token[self.end_char]
        # We need to start our sentences with pad tokens and a single start token.
        self.empty_start_tokens = [self.pad_token for _ in range(self.pad_length)]
        self.empty_start_tokens += [self.start_token]
        self.dataset_x = None
        self.dataset_y = None

    def add_character(self, char):
        """Adds a character to the vocabulary dictionary.

        Only adds the character if it is not already present in the vocabulary dictionary
        that comprises every character used in the dataset.

        Params:
            char: str - the char that will be added to the vocabulary.
        """
        # If the char is not already in the dictionary...
        if char not in self.char_to_token:
            # Create a unique int token for the char.
            self.char_to_token[char] = self.counter
            # Create the token to char conversion dictionary entry.
            self.token_to_char[self.counter] = char
            self.counter += 1
            self.vocabulary_size = len(self.char_to_token)

    def __len__(self):
        """Returns the length of the dataset - a massive list of sequences.

        Example usage:
            dataset = SequenceDataset(10)
            dataset.add_sequences_from_bios(bios)
            # Returns the length of the dataset defined in the function.
            print(len(dataset))
            >> 3201931
        """
        # If the dataset object has only just been constructed.
        if self.dataset_x is None:
            return 0
        # Sanity check.
        assert len(self.dataset_x) == len(self.dataset_y)
        return len(self.dataset_x)

    def add_sequences_from_bios(self, bios):
        """Fills this dataset object with sequences from the argument.

        This method takes a list of strings, where each string is a bio from
        a dating profile, and turns it into a list of fixed sequence_length tokens.
        Each bio will be padded and wrapped with start and end tokens respectively.

        Params:
            bios: list(str) - the list of profile bios, which may include unicode emojis.
        """
        # The token sequences that will be created to input into the RNN.
        token_inputs = []
        # The token targets that the RNN will be expected to predict.
        token_targets = []
        # Loop over the biographies passed in.
        for i, bio in enumerate(bios):
            # Create a list to store the tokens for this bio.
            bio_as_tokens = list(self.empty_start_tokens)
            # For each char in the bio, add the character to the vocabulary and convert it to a token.
            for char in bio:
                self.add_character(char)
                token = self.char_to_token[char]
                bio_as_tokens.append(token)
            # Add the end token to wrap the sentence of tokens.
            bio_as_tokens.append(self.end_token)
            # Loop over the sequence of tokens, creating sequences of tokens for input
            # and token targets for the RNN.
            for start in range(0, len(bio_as_tokens) - self.sequence_length - 1):
                end = start + self.sequence_length
                token_input = bio_as_tokens[start:end]
                token_target = bio_as_tokens[end]
                token_inputs.append(token_input)
                token_targets.append(token_target)
            # Print progress.
            print("{:.2f}% done ".format((i + 1) / len(bios) * 100), end='\r')
        # Create or append the inputs and targets to the main dataset.
        if self.dataset_x is None:
            self.dataset_x = np.array(token_inputs)
            self.dataset_y = np.array(token_targets)
        else:
            self.dataset_x = np.concatenate((self.dataset_x, np.array(token_inputs)),
                                            axis=0)
            self.dataset_y = np.concatenate((self.dataset_y, np.array(token_targets)),
                                            axis=0)

    def convert_tokens_to_words(self, tokens):
        """Takes a sequence of tokens and converts it back to characters.

        Helper function that also filters out the special padding, start and end tokens.

        Params:
            tokens: list(int) - the sequence of tokens that will be returned as a string of words.
        """
        chars = []
        # For each token.
        for token in tokens:
            # If it is not a special token, convert it back to its character.
            if token not in [self.start_token, self.pad_token, self.end_token]:
                char = self.token_to_char[token]
                chars.append(char)
        # Return string of words.
        return ''.join(chars)
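
A minimal usage sketch of the class above. The `bios` list and the chosen `sequence_length` of 40 are illustrative assumptions, not part of the gist; in practice `bios` would be the scraped dating-profile strings.

# Minimal usage sketch: `bios` and sequence_length=40 are illustrative assumptions.
bios = ["Coffee lover, hiker, amateur astronomer.",
        "I make synths and train neural networks."]

dataset = SequenceDataset(sequence_length=40)
dataset.add_sequences_from_bios(bios)

print(len(dataset))             # Number of (input, target) pairs built from the bios.
print(dataset.vocabulary_size)  # Distinct characters seen, plus the 3 special tokens.

# Each row of dataset_x is a window of `sequence_length` tokens; dataset_y holds
# the token that immediately follows each window.
first_input, first_target = dataset.dataset_x[0], dataset.dataset_y[0]
print(dataset.convert_tokens_to_words(first_input))     # Decoded input window.
print(dataset.convert_tokens_to_words([first_target]))  # Decoded target character.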