This project collects two small machine learning exercises: prototype selection for 1-nearest-neighbor classification, where K-means centroids of each digit class replace randomly sampled training points, and an exploration of count-based word representations built from the NLTK Reuters corpus.
These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. See deployment for notes on how to deploy the project on a live system.
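The snippets below assume train_data and train_labels are already in memory as NumPy arrays. A minimal loading sketch, assuming the MNIST digits fetched through scikit-learn's fetch_openml (the course distributes its own data files, so this is only one way to get equivalent arrays):

import numpy as np
from sklearn.datasets import fetch_openml

# fetch the 70,000 MNIST digits as a flat (n_samples, 784) array
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
train_data = mnist.data[:60000]
train_labels = mnist.target[:60000].astype(int)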
# split the training data by class label
label_0 = train_data[train_labels == 0]
label_1 = train_data[train_labels == 1]
# ... repeat for the intermediate labels ...
label_9 = train_data[train_labels == 9]
from sklearn.cluster import KMeans

# cluster each class into 10 groups and keep the centroids as prototypes
kmeans_0 = KMeans(n_clusters=10, init='k-means++', random_state=0).fit(label_0)
sample_0 = kmeans_0.cluster_centers_
kmeans_1 = KMeans(n_clusters=10, init='k-means++', random_state=0).fit(label_1)
sample_1 = kmeans_1.cluster_centers_
# ... repeat for the intermediate labels ...
kmeans_9 = KMeans(n_clusters=10, init='k-means++', random_state=0).fit(label_9)
sample_9 = kmeans_9.cluster_centers_
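The ten near-identical blocks above can be collapsed into a loop. A sketch with the same KMeans settings, assuming the train_data and train_labels arrays from the loading step:

samples = []
for d in range(10):
    km = KMeans(n_clusters=10, init='k-means++', random_state=0).fit(train_data[train_labels == d])
    samples.append(km.cluster_centers_)
# samples[d] now holds the 10 centroid prototypes for digit d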
# stack the per-class prototypes into a single training matrix
proto_sample = np.concatenate((sample_0, sample_1,
                               sample_2, sample_3,
                               sample_4, sample_5,
                               sample_6, sample_7,
                               sample_8, sample_9), axis=0)
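A quick sanity check on the result (10 classes times 10 centroids each):

print(proto_sample.shape)  # expect (100, n_features)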
# generating the labels for the prototyped training data
sample_0_labels = np.full((10,), 0)
sample_1_labels = np.full((10,), 1)
# ... the intermediate labels are generated the same way ...
sample_9_labels = np.full((10,), 9)

# stacking the labels into a single array
proto_labels = np.concatenate((sample_0_labels, sample_1_labels,
                               sample_2_labels, sample_3_labels,
                               sample_4_labels, sample_5_labels,
                               sample_6_labels, sample_7_labels,
                               sample_8_labels, sample_9_labels), axis=0)
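Since every class contributes exactly 10 prototypes in order, the same label array can also be built in one line:

proto_labels = np.repeat(np.arange(10), 10)  # ten 0s, then ten 1s, ..., then ten 9s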
# the following snippet was provided as starter code by UCSD for the course
# Machine Learning Fundamentals on edX
def rand_prototypes(M):
    """
    Returns M randomly sampled data points and their corresponding labels.
    Parameters:
    M (int): number of data points to be sampled
    Returns:
    the sampled data points and their labels
    """
    # body reconstructed from the docstring: sample M indices without replacement
    indices = np.random.choice(len(train_data), M, replace=False)
    return train_data[indices], train_labels[indices]
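Comparing prototype sets needs an error measure. The helper below is my addition, not part of the course code: it trains a 1-nearest-neighbor classifier on a prototype set and returns its error on held-out data, assuming test_data and test_labels are loaded alongside the training arrays:

from sklearn.neighbors import KNeighborsClassifier

def test_error(data, labels):
    # hypothetical helper: 1-NN error of the given prototypes on the test set
    clf = KNeighborsClassifier(n_neighbors=1).fit(data, labels)
    return 1.0 - clf.score(test_data, test_labels)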
# the following code snippet was provided by UCSD for Machine Learning Fundamentals on edX
from ipywidgets import interact_manual

@interact_manual(M=(100, 2000, 100), rounds=(1, 10))
def comparison(M, rounds):
    """
    Shows the mean error of both prototyping methods. Since the random
    prototypes change with every draw, the selection is repeated several
    times and the mean error is reported.
    Parameters:
    M (int): number of data points to be sampled
    rounds (int): number of times a random prototype set is drawn to compute the mean error
    """
    # body sketched with the test_error helper above; not the original course code
    rand_errors = [test_error(*rand_prototypes(M)) for _ in range(rounds)]
    print("mean error, random prototypes:", np.mean(rand_errors))
    print("error, K-means prototypes:    ", test_error(proto_sample, proto_labels))
import nltk
nltk.download('reuters')
from nltk.corpus import reuters

START_TOKEN, END_TOKEN = '<START>', '<END>'

def read_corpus(category="crude"):
    """ Reads files from the specified Reuters category and adds
    <START> and <END> tokens to the beginning and end of each document.
    Params:
    category (string): category name
    Return:
    list of lists, with the lower-cased words of each document
    """
    files = reuters.fileids(category)
    return [[START_TOKEN] + [w.lower() for w in reuters.words(f)] + [END_TOKEN] for f in files]
import pprint

reuters_corpus = read_corpus()
pprint.pprint(reuters_corpus[:3], compact=True, width=100)
# first three documents from the corpus (output truncated partway through the first):
# [['<START>', 'japan', 'to', 'revise', 'long', '-', 'term', 'energy', 'demand', 'downwards', 'the',
# 'ministry', 'of', 'international', 'trade', 'and', 'industry', '(', 'miti', ')', 'will', 'revise',
# 'its', 'long', '-', 'term', 'energy', 'supply', '/', 'demand', 'outlook', 'by', 'august', 'to',
# 'meet', 'a', 'forecast', 'downtrend', 'in', 'japanese', 'energy', 'demand', ',', 'ministry',
# 'officials', 'said', '.', 'miti', 'is', 'expected', 'to', 'lower', 'the', 'projection', 'for', ...
def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
    Params:
    corpus (list of list of strings): corpus of documents
    Return:
    corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
    num_corpus_words (integer): number of distinct words across the corpus
    """
    # flatten the corpus, deduplicate with a set, and sort the vocabulary
    corpus_words = sorted(set(word for document in corpus for word in document))
    num_corpus_words = len(corpus_words)
    return corpus_words, num_corpus_words
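A quick usage check on the corpus read earlier (the exact vocabulary size depends on the category):

words, num_words = distinct_words(reuters_corpus)
print(num_words, words[:10])  # vocabulary size and the first few sorted words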