This project collects two small machine learning exercises: prototype selection for 1-nearest-neighbor classification, where K-means centroids of each digit class replace randomly sampled training points, and an exploration of count-based word representations built from the NLTK Reuters corpus.
These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. See deployment for notes on how to deploy the project on a live system.
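The snippets below assume train_data and train_labels are already in memory as NumPy arrays. A minimal loading sketch, assuming the MNIST digits fetched through scikit-learn's fetch_openml (the course distributes its own data files, so this is only one way to get equivalent arrays):

import numpy as np
from sklearn.datasets import fetch_openml

# fetch the 70,000 MNIST digits as a flat (n_samples, 784) array
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
train_data = mnist.data[:60000]
train_labels = mnist.target[:60000].astype(int)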
# split the training data by class label
label_0 = train_data[train_labels == 0]
label_1 = train_data[train_labels == 1]
# ... repeat for the intermediate labels ...
label_9 = train_data[train_labels == 9]
from sklearn.cluster import KMeans

# cluster each class into 10 groups and keep the centroids as prototypes
kmeans_0 = KMeans(n_clusters=10, init='k-means++', random_state=0).fit(label_0)
sample_0 = kmeans_0.cluster_centers_
kmeans_1 = KMeans(n_clusters=10, init='k-means++', random_state=0).fit(label_1)
sample_1 = kmeans_1.cluster_centers_
# ... repeat for the intermediate labels ...
kmeans_9 = KMeans(n_clusters=10, init='k-means++', random_state=0).fit(label_9)
sample_9 = kmeans_9.cluster_centers_
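The ten near-identical blocks above can be collapsed into a loop. A sketch with the same KMeans settings, assuming the train_data and train_labels arrays from the loading step:

samples = []
for d in range(10):
    km = KMeans(n_clusters=10, init='k-means++', random_state=0).fit(train_data[train_labels == d])
    samples.append(km.cluster_centers_)
# samples[d] now holds the 10 centroid prototypes for digit d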
# stack the per-class prototypes into a single training matrix
proto_sample = np.concatenate((sample_0, sample_1,
                               sample_2, sample_3,
                               sample_4, sample_5,
                               sample_6, sample_7,
                               sample_8, sample_9), axis=0)
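A quick sanity check on the result (10 classes times 10 centroids each):

print(proto_sample.shape)  # expect (100, n_features)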
# generating the labels for the prototyped training data
sample_0_labels = np.full((10,), 0)
sample_1_labels = np.full((10,), 1)
# ... the intermediate labels are generated the same way ...
sample_9_labels = np.full((10,), 9)

# stacking the labels into a single array
proto_labels = np.concatenate((sample_0_labels, sample_1_labels,
                               sample_2_labels, sample_3_labels,
                               sample_4_labels, sample_5_labels,
                               sample_6_labels, sample_7_labels,
                               sample_8_labels, sample_9_labels), axis=0)
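Since every class contributes exactly 10 prototypes in order, the same label array can also be built in one line:

proto_labels = np.repeat(np.arange(10), 10)  # ten 0s, then ten 1s, ..., then ten 9s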
# the following snippet was provided as starter code by UCSD for the course
# Machine Learning Fundamentals on edX
def rand_prototypes(M):
    """
    Returns M randomly sampled data points and their corresponding labels.
    Parameters:
    M (int): number of data points to be sampled
    Returns:
    the sampled data points and their labels
    """
    # body reconstructed from the docstring: sample M indices without replacement
    indices = np.random.choice(len(train_data), M, replace=False)
    return train_data[indices], train_labels[indices]
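Comparing prototype sets needs an error measure. The helper below is my addition, not part of the course code: it trains a 1-nearest-neighbor classifier on a prototype set and returns its error on held-out data, assuming test_data and test_labels are loaded alongside the training arrays:

from sklearn.neighbors import KNeighborsClassifier

def test_error(data, labels):
    # hypothetical helper: 1-NN error of the given prototypes on the test set
    clf = KNeighborsClassifier(n_neighbors=1).fit(data, labels)
    return 1.0 - clf.score(test_data, test_labels)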
# the following code snippet was provided by UCSD for Machine Learning Fundamentals on edX
from ipywidgets import interact_manual

@interact_manual(M=(100, 2000, 100), rounds=(1, 10))
def comparison(M, rounds):
    """
    Shows the mean error of both prototyping methods. Since the random
    prototypes change with every draw, the selection is repeated several
    times and the mean error is reported.
    Parameters:
    M (int): number of data points to be sampled
    rounds (int): number of times a random prototype set is drawn to compute the mean error
    """
    # body sketched with the test_error helper above; not the original course code
    rand_errors = [test_error(*rand_prototypes(M)) for _ in range(rounds)]
    print("mean error, random prototypes:", np.mean(rand_errors))
    print("error, K-means prototypes:    ", test_error(proto_sample, proto_labels))
import nltk
nltk.download('reuters')
from nltk.corpus import reuters

START_TOKEN, END_TOKEN = '<START>', '<END>'

def read_corpus(category="crude"):
    """ Reads files from the specified Reuters category and adds
    <START> and <END> tokens to the beginning and end of each document.
    Params:
    category (string): category name
    Return:
    list of lists, with the lower-cased words of each document
    """
    files = reuters.fileids(category)
    return [[START_TOKEN] + [w.lower() for w in reuters.words(f)] + [END_TOKEN] for f in files]
import pprint

reuters_corpus = read_corpus()
pprint.pprint(reuters_corpus[:3], compact=True, width=100)
# first three documents from the corpus (output truncated partway through the first):
# [['<START>', 'japan', 'to', 'revise', 'long', '-', 'term', 'energy', 'demand', 'downwards', 'the',
# 'ministry', 'of', 'international', 'trade', 'and', 'industry', '(', 'miti', ')', 'will', 'revise',
# 'its', 'long', '-', 'term', 'energy', 'supply', '/', 'demand', 'outlook', 'by', 'august', 'to',
# 'meet', 'a', 'forecast', 'downtrend', 'in', 'japanese', 'energy', 'demand', ',', 'ministry',
# 'officials', 'said', '.', 'miti', 'is', 'expected', 'to', 'lower', 'the', 'projection', 'for', ...
def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
    Params:
    corpus (list of list of strings): corpus of documents
    Return:
    corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
    num_corpus_words (integer): number of distinct words across the corpus
    """
    # flatten the corpus, deduplicate with a set, and sort the vocabulary
    corpus_words = sorted(set(word for document in corpus for word in document))
    num_corpus_words = len(corpus_words)
    return corpus_words, num_corpus_words
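A quick usage check on the corpus read earlier (the exact vocabulary size depends on the category):

words, num_words = distinct_words(reuters_corpus)
print(num_words, words[:10])  # vocabulary size and the first few sorted words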