import numpy as np


class SequenceDataset():

    def __init__(self, sequence_length):
        """Initialises the dataset class that contains the sequences that will be trained on.

        Params:
            sequence_length: int - the length of a sequence that will be fed to the RNN.
        """
        self.sequence_length = sequence_length
        self.pad_length = self.sequence_length - 1

        # Create dictionaries for conversion between chars and tokens and back again.
        self.token_to_char = dict()
        self.char_to_token = dict()
        self.vocabulary_size = 0
        self.counter = 0

        # Special characters to pad, start and end sentences.
        self.pad_char = '<PAD_CHAR>'
        self.start_char = '<START_CHAR>'
        self.end_char = '<END_CHAR>'

        # Add the special characters to the vocabulary.
        self.add_character(self.pad_char)
        self.add_character(self.start_char)
        self.add_character(self.end_char)

        # Get the special tokens needed to pad, start and end sentences of tokens.
        self.pad_token = self.char_to_token[self.pad_char]
        self.start_token = self.char_to_token[self.start_char]
        self.end_token = self.char_to_token[self.end_char]

        # Every sequence begins with pad tokens followed by a single start token.
        self.empty_start_tokens = [self.pad_token for _ in range(self.pad_length)]
        self.empty_start_tokens += [self.start_token]

        self.dataset_x = None
        self.dataset_y = None

    def add_character(self, char):
        """Adds a character to the vocabulary dictionary.

        Only adds the character if it is not already present in the vocabulary dictionary
        that comprises every character used in the dataset.

        Params:
            char: str - the char that will be added to the vocabulary.
        """
        # If the char is not already in the dictionary...
        if char not in self.char_to_token:
            # Create a unique int token for the char.
            self.char_to_token[char] = self.counter
            # Create the token to char conversion dictionary entry.
            self.token_to_char[self.counter] = char
            self.counter += 1
            self.vocabulary_size = len(self.char_to_token)

    def __len__(self):
        """Returns the length of the dataset - a massive list of sequences.

        Example usage:
            dataset = SequenceDataset(10)
            dataset.add_sequences_from_bios(bios)
            # Returns the length of the dataset defined in the function.
            print(len(dataset))
            >> 3201931
        """
        # If the dataset object has only just been constructed.
        if self.dataset_x is None:
            return 0
        # Sanity check: there must be exactly one target per input sequence.
        assert len(self.dataset_x) == len(self.dataset_y)
        return len(self.dataset_x)

    def add_sequences_from_bios(self, bios):
        """Fills this dataset object with sequences from the argument.

        This method takes a list of strings, where each string is a bio from
        a dating profile, and turns it into a list of fixed sequence_length tokens.
        Each bio will be padded and wrapped with start and end tokens respectively.

        Params:
            bios: list(str) - the list of profile bios, which may include unicode emojis.
        """
        # The token sequences that will be created to input into the RNN.
        token_inputs = []
        # The token targets that the RNN will be expected to predict.
        token_targets = []

        # Loop over the biographies passed in.
        for i, bio in enumerate(bios):

            # Create a list to store the tokens for this bio, beginning with
            # the padding and start tokens.
            bio_as_tokens = list(self.empty_start_tokens)

            # For each char in the bio, add the character to the vocabulary and convert it to a token.
            for char in bio:
                self.add_character(char)
                token = self.char_to_token[char]
                bio_as_tokens.append(token)

            # Add the end token to wrap the sentence of tokens.
            bio_as_tokens.append(self.end_token)

            # Slide a window over the sequence of tokens, creating fixed-length
            # inputs for the RNN, each paired with the single following token as its target.
            # The upper bound ensures the final window's target is the end token.
            for start in range(0, len(bio_as_tokens) - self.sequence_length):
                end = start + self.sequence_length
                token_input = bio_as_tokens[start:end]
                token_target = bio_as_tokens[end]
                token_inputs.append(token_input)
                token_targets.append(token_target)

            # Print progress.
            print("{:.2f}% done ".format((i + 1) / len(bios) * 100), end='\r')

        # Create or append the inputs and targets to the main dataset.
        if self.dataset_x is None:
            self.dataset_x = np.array(token_inputs)
            self.dataset_y = np.array(token_targets)
        else:
            self.dataset_x = np.concatenate((self.dataset_x, np.array(token_inputs)),
                                            axis=0)
            self.dataset_y = np.concatenate((self.dataset_y, np.array(token_targets)),
                                            axis=0)

    def convert_tokens_to_words(self, tokens):
        """Takes a sequence of tokens and converts it back to characters.

        Helper function that also filters out the special padding, start and end tokens.

        Params:
            tokens: list(int) - the sequence of tokens that will be returned as a string of words.
        """
        chars = []
        # For each token.
        for token in tokens:
            # If it is not one of the special tokens.
            if token not in [self.start_token, self.pad_token, self.end_token]:
                char = self.token_to_char[token]
                chars.append(char)
        # Return the string of words.
        return ''.join(chars)
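

# A minimal usage sketch (not part of the original gist): it assumes a small list of
# example bios and simply shows the shapes the class produces and the round trip from
# tokens back to text. The bios and variable names here are illustrative only.
if __name__ == '__main__':
    example_bios = [
        "Coffee lover and part-time climber.",
        "Dog person. Will travel for food.",
    ]

    dataset = SequenceDataset(sequence_length=10)
    dataset.add_sequences_from_bios(example_bios)

    # dataset_x has shape (num_sequences, sequence_length) and dataset_y has
    # shape (num_sequences,) - one target token per input sequence.
    print(len(dataset), dataset.dataset_x.shape, dataset.dataset_y.shape)

    # Convert the first input sequence back into readable characters, dropping
    # the special pad/start/end tokens.
    print(dataset.convert_tokens_to_words(dataset.dataset_x[0]))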