mineshpatel1 · November 1, 2017 17:31
diff --git a/__init__.py b/__init__.py
 """
 neural_net module. Please create a directory neural_net with a blank __init__.py and Dataset.py in it in order to read
 the pickle archive provided as it requires this module.

 E.g.
 neural_net/__init__.py
 neural_net/Dataset.py
 """
diff --git a/Dataset.py b/Dataset.py
 import numpy as np
 import cv2


 def read_image(path):
 	"""Reads image as grayscale and normalises and flattens the array."""
 	img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)  # Load as grayscale
 	img = img / 255  # Normalise values
 	return img.flatten()  # Flatten to shape [784]


 def convert_to_one_hot(digit):
 	"""Converts an integer between 0-9 into an"""
 	if 0 > digit or digit > 9:
 		raise ValueError('Only accepts integers between 0 and 9')
 	x = [0.] * 10
 	x[int(digit)] = 1.
 	return x


 class Subset:
 	def __init__(self, images, labels):
 		"""Class for a subset of a dataset, e.g. training or test data."""
 		self.num_items = len(images)
 		self.images = images
 		self.labels = labels
 		self.used = []  # Holds all of the indices that have already been used in a previous batch

 	def next_batch(self, n):
 		"""
 		Gets a random batch of images and associated labels to use. Tries not to reuse elements until all elements in
 		the set have been used once. This isn't perfect but due to randomisation should still be useful for training.

 		Args:
 			n (int): Number of items to receive.

 		Returns:
 			tuple: The first is an `np.array` of randomly selected images and the second is and `np.array` of their
 			associated labels.
 		"""
 		# Reset the used array when it is full
 		if len(self.used) + n > self.num_items:
 			self.used = np.array([])

 		# Restricts to only the unused indices
 		possible = range(self.num_items)
 		possible = np.array(list(filter(lambda x: x not in self.used, possible)))

 		rand = np.random.permutation(len(possible))  # Randomises the order
 		batch_idx = rand[:n]  # Takes n elements
 		batch_idx = possible[batch_idx]  # Takes the random elements from the list of possible indices to use

 		self.used = np.concatenate((self.used, batch_idx), axis=0)
 		return self.images[batch_idx], self.labels[batch_idx]


 class Dataset:
 	def __init__(self, images, labels, from_path=False, normalise=True, flatten=True, one_hot=True, training_ratio=0.8,
 	             subsets=False, image_size=28, split=True):
 		"""
 		Class for holding datasets of images and associated labels. Specifically written for square images of digits.

 		Args:
 			images (list|tuple): List of images read directly from `cv2`. Expects images to be grayscale.
 			labels (list|tuple): List of integers, with each integer indicating the label from 0-9.
 			from_path (bool): If True, will load the images from paths given.
 			normalise (bool): If True, will normalise the image so each pixel is measured between 0 and 1.
 			flatten (bool): If True, will flatten each image array so it is one-dimensional.
 			one_hot (bool): If True will convert integer digit labels to lists of length 10 with a 1 indicating the
 				position of the value and 0 in each other position.
 			training_ratio (float): Indicates the percentage that should be used to randomly split the data into training
 				and test segments. E.g. `training=0.8` would mean 80% training data, 20% test.
 			subsets (bool): If True, will expect images and labels to be lists or tuples where the first element is
 				the array for training data and the second is for test data.
 			image_size (int): Length of side of the square image in pixels.
 			split (bool): If True, will automatically split the whole dataset according to the `training_ratio`. Otherwise
 				will accept tuples for `images` and `labels` where the first element is the training data and the second
 				element is the test data.
 		"""

 		def process(imgs, labs):
 			if from_path or normalise or flatten:  # Handles various loading options
 				images_ = []
 				for img in imgs:
 					if from_path:  # Read the image as grayscale
 						img = cv2.imread(img, cv2.IMREAD_GRAYSCALE)
 						if len(img) != image_size ** 2:  # Resize the image if it is not already a square
 							img = cv2.resize(img, (28, 28))

 					if normalise:  # Normalise the data
 						img = img / 255
 					if flatten:  # Flatten the array into 1-D
 						img = img.flatten()
 					images_.append(np.float32(img))
 				imgs = images_
 				del images_

 			if one_hot:
 				labs = [convert_to_one_hot(digit) for digit in labs]

 			if len(imgs) != len(labs):
 				raise AssertionError('Different number of images compared to labels.')

 			if type(imgs) is not np.array:
 				imgs = np.array(imgs)

 			if type(labs) is not np.array:
 				labs = np.array(labs)

 			return imgs, labs

 		if not subsets:
 			if split:
 				images, labels = process(images, labels)
 				self.num_items = len(images)

 				# Randomly shuffle the indices and split into a training and test set based on the input ratio
 				indices = np.random.permutation(images.shape[0])
 				ratio_idx = int(training_ratio * self.num_items)
 				training_idx, test_idx = indices[:ratio_idx], indices[ratio_idx:]

 				# Creates subsets based on the training and test split
 				self.train = Subset(images[training_idx], labels[training_idx])
 				self.test = Subset(images[test_idx], labels[test_idx])
 			else:
 				self.train = Subset(*process(images[0], labels[0]))
 				self.test = Subset(*process(images[1], labels[1]))
 		else:
 			self.train = Subset(images[0], labels[0])
 			self.test = Subset(images[1], labels[1])

 	@property
 	def train_size(self):
 		return len(self.train.images)

 	@property
 	def test_size(self):
 		return len(self.test.images)
	"""
	neural_net module. Please create a directory neural_net with a blank __init__.py and Dataset.py in it in order to read
	the pickle archive provided as it requires this module.

	E.g.
	neural_net/__init__.py
	neural_net/Dataset.py
	"""
	import numpy as np
	import cv2


	def read_image(path):
	"""Reads image as grayscale and normalises and flattens the array."""
	img = cv2.imread(path, cv2.IMREAD_GRAYSCALE) # Load as grayscale
	img = img / 255 # Normalise values
	return img.flatten() # Flatten to shape [784]


	def convert_to_one_hot(digit):
	"""Converts an integer between 0-9 into an"""
	if 0 > digit or digit > 9:
	raise ValueError('Only accepts integers between 0 and 9')
	x = [0.] * 10
	x[int(digit)] = 1.
	return x


	class Subset:
	def __init__(self, images, labels):
	"""Class for a subset of a dataset, e.g. training or test data."""
	self.num_items = len(images)
	self.images = images
	self.labels = labels
	self.used = [] # Holds all of the indices that have already been used in a previous batch

	def next_batch(self, n):
	"""
	Gets a random batch of images and associated labels to use. Tries not to reuse elements until all elements in
	the set have been used once. This isn't perfect but due to randomisation should still be useful for training.

	Args:
	n (int): Number of items to receive.

	Returns:
	tuple: The first is an `np.array` of randomly selected images and the second is and `np.array` of their
	associated labels.
	"""
	# Reset the used array when it is full
	if len(self.used) + n > self.num_items:
	self.used = np.array([])

	# Restricts to only the unused indices
	possible = range(self.num_items)
	possible = np.array(list(filter(lambda x: x not in self.used, possible)))

	rand = np.random.permutation(len(possible)) # Randomises the order
	batch_idx = rand[:n] # Takes n elements
	batch_idx = possible[batch_idx] # Takes the random elements from the list of possible indices to use

	self.used = np.concatenate((self.used, batch_idx), axis=0)
	return self.images[batch_idx], self.labels[batch_idx]


	class Dataset:
	def __init__(self, images, labels, from_path=False, normalise=True, flatten=True, one_hot=True, training_ratio=0.8,
	subsets=False, image_size=28, split=True):
	"""
	Class for holding datasets of images and associated labels. Specifically written for square images of digits.

	Args:
	images (list\|tuple): List of images read directly from `cv2`. Expects images to be grayscale.
	labels (list\|tuple): List of integers, with each integer indicating the label from 0-9.
	from_path (bool): If True, will load the images from paths given.
	normalise (bool): If True, will normalise the image so each pixel is measured between 0 and 1.
	flatten (bool): If True, will flatten each image array so it is one-dimensional.
	one_hot (bool): If True will convert integer digit labels to lists of length 10 with a 1 indicating the
	position of the value and 0 in each other position.
	training_ratio (float): Indicates the percentage that should be used to randomly split the data into training
	and test segments. E.g. `training=0.8` would mean 80% training data, 20% test.
	subsets (bool): If True, will expect images and labels to be lists or tuples where the first element is
	the array for training data and the second is for test data.
	image_size (int): Length of side of the square image in pixels.
	split (bool): If True, will automatically split the whole dataset according to the `training_ratio`. Otherwise
	will accept tuples for `images` and `labels` where the first element is the training data and the second
	element is the test data.
	"""

	def process(imgs, labs):
	if from_path or normalise or flatten: # Handles various loading options
	images_ = []
	for img in imgs:
	if from_path: # Read the image as grayscale
	img = cv2.imread(img, cv2.IMREAD_GRAYSCALE)
	if len(img) != image_size ** 2: # Resize the image if it is not already a square
	img = cv2.resize(img, (28, 28))

	if normalise: # Normalise the data
	img = img / 255
	if flatten: # Flatten the array into 1-D
	img = img.flatten()
	images_.append(np.float32(img))
	imgs = images_
	del images_

	if one_hot:
	labs = [convert_to_one_hot(digit) for digit in labs]

	if len(imgs) != len(labs):
	raise AssertionError('Different number of images compared to labels.')

	if type(imgs) is not np.array:
	imgs = np.array(imgs)

	if type(labs) is not np.array:
	labs = np.array(labs)

	return imgs, labs

	if not subsets:
	if split:
	images, labels = process(images, labels)
	self.num_items = len(images)

	# Randomly shuffle the indices and split into a training and test set based on the input ratio
	indices = np.random.permutation(images.shape[0])
	ratio_idx = int(training_ratio * self.num_items)
	training_idx, test_idx = indices[:ratio_idx], indices[ratio_idx:]

	# Creates subsets based on the training and test split
	self.train = Subset(images[training_idx], labels[training_idx])
	self.test = Subset(images[test_idx], labels[test_idx])
	else:
	self.train = Subset(*process(images[0], labels[0]))
	self.test = Subset(*process(images[1], labels[1]))
	else:
	self.train = Subset(images[0], labels[0])
	self.test = Subset(images[1], labels[1])

	@property
	def train_size(self):
	return len(self.train.images)

	@property
	def test_size(self):
	return len(self.test.images)