qharlie · February 21, 2017 16:54
diff --git a/find_signature.py b/find_signature.py
 #!/usr/bin/python
 ### Data preparation for the overfitting exercise: load the pickled
 ### e-mail words and authors, split them into train/test sets, TF-IDF
 ### vectorize the text, keep the top 10% of features, and shrink the
 ### training set to 150 events.

 import pickle
 import numpy
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.feature_selection import SelectPercentile, f_classif

 ### sklearn.cross_validation was removed in scikit-learn 0.20; use
 ### model_selection when available and fall back for old installs.
 try:
     from sklearn.model_selection import train_test_split
 except ImportError:
     from sklearn.cross_validation import train_test_split

 numpy.random.seed(42)

 ### The words (features) and authors (labels), already largely processed.
 ### These files should have been created from the previous (Lesson 10)
 ### mini-project.
 words_file = "../text_learning/your_word_data.pkl"
 authors_file = "../text_learning/your_email_authors.pkl"
 ### NOTE: unpickling can execute arbitrary code -- only load pickle files
 ### you produced yourself.  The "with" blocks close the file handles.
 with open(words_file, "rb") as words_handle:
     word_data = pickle.load(words_handle)
 with open(authors_file, "rb") as authors_handle:
     authors = pickle.load(authors_handle)

 ### test_size is the fraction of events assigned to the test set (the
 ### remainder go into training)
 features_train, features_test, labels_train, labels_test = train_test_split(
     word_data, authors, test_size=0.1, random_state=42)

 ### feature matrices changed to dense representations for compatibility with
 ### classifier functions in versions 0.15.2 and earlier
 vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                              stop_words='english')
 features_train_transformed = vectorizer.fit_transform(features_train).toarray()
 features_test_transformed = vectorizer.transform(features_test).toarray()

 ### Feature selection has to run on the numeric TF-IDF matrices; the raw
 ### e-mail strings in features_train/features_test cannot be scored by
 ### f_classif.
 selector = SelectPercentile(f_classif, percentile=10)
 selector.fit(features_train_transformed, labels_train)

 features_train_transformed = selector.transform(features_train_transformed)
 features_test_transformed = selector.transform(features_test_transformed)

 ### sum(labels_train) is reported as the CHRIS count, the rest as SARA
 ### (presumably labels are 0/1 -- confirm against the label files).
 chris_count = sum(labels_train)
 print("CHRIS = {}:,SARA= {}".format(chris_count, len(labels_train) - chris_count))

 ### a classic way to overfit is to use a small number
 ### of data points and a large number of features;
 ### train on only 150 events to put ourselves in this regime
 ### (the raw text, the transformed matrix and the labels are all cut to
 ### the same 150 events so they stay aligned)
 features_train = features_train[:150]
 features_train_transformed = features_train_transformed[:150]
 labels_train = labels_train[:150]

 ### your code goes here
	#!/usr/bin/python
	### Data preparation for the overfitting exercise: load the pickled
	### e-mail words and authors, split them into train/test sets, TF-IDF
	### vectorize the text, keep the top 10% of features, and shrink the
	### training set to 150 events.

	import pickle
	import numpy
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.feature_selection import SelectPercentile, f_classif

	### sklearn.cross_validation was removed in scikit-learn 0.20; use
	### model_selection when available and fall back for old installs.
	try:
		from sklearn.model_selection import train_test_split
	except ImportError:
		from sklearn.cross_validation import train_test_split

	numpy.random.seed(42)

	### The words (features) and authors (labels), already largely processed.
	### These files should have been created from the previous (Lesson 10)
	### mini-project.
	words_file = "../text_learning/your_word_data.pkl"
	authors_file = "../text_learning/your_email_authors.pkl"
	### NOTE: unpickling can execute arbitrary code -- only load pickle files
	### you produced yourself.  The "with" blocks close the file handles.
	with open(words_file, "rb") as words_handle:
		word_data = pickle.load(words_handle)
	with open(authors_file, "rb") as authors_handle:
		authors = pickle.load(authors_handle)

	### test_size is the fraction of events assigned to the test set (the
	### remainder go into training)
	features_train, features_test, labels_train, labels_test = train_test_split(
		word_data, authors, test_size=0.1, random_state=42)

	### feature matrices changed to dense representations for compatibility with
	### classifier functions in versions 0.15.2 and earlier
	vectorizer = TfidfVectorizer(
		sublinear_tf=True, max_df=0.5, stop_words='english')
	features_train_transformed = vectorizer.fit_transform(features_train).toarray()
	features_test_transformed = vectorizer.transform(features_test).toarray()

	### Feature selection has to run on the numeric TF-IDF matrices; the raw
	### e-mail strings in features_train/features_test cannot be scored by
	### f_classif.
	selector = SelectPercentile(f_classif, percentile=10)
	selector.fit(features_train_transformed, labels_train)

	features_train_transformed = selector.transform(features_train_transformed)
	features_test_transformed = selector.transform(features_test_transformed)

	### sum(labels_train) is reported as the CHRIS count, the rest as SARA
	### (presumably labels are 0/1 -- confirm against the label files).
	chris_count = sum(labels_train)
	print("CHRIS = {}:,SARA= {}".format(chris_count, len(labels_train) - chris_count))

	### a classic way to overfit is to use a small number
	### of data points and a large number of features;
	### train on only 150 events to put ourselves in this regime
	### (the raw text, the transformed matrix and the labels are all cut to
	### the same 150 events so they stay aligned)
	features_train = features_train[:150]
	features_train_transformed = features_train_transformed[:150]
	labels_train = labels_train[:150]

	### your code goes here