Created
February 21, 2017 16:54
-
-
Save qharlie/466fe80d43e786715172a14af41aabde to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
### Set-up script for the overfitting exercise: loads the pickled e-mail
### word data and author labels, splits them into train/test sets, turns
### the text into TF-IDF features, keeps the top 10% of features as scored
### by f_classif, and finally truncates the training set to 150 events.

import pickle

import numpy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    ### older scikit-learn releases only provide the cross_validation module
    from sklearn.cross_validation import train_test_split

numpy.random.seed(42)

### The words (features) and authors (labels), already largely processed.
### These files should have been created from the previous (Lesson 10)
### mini-project.
words_file = "../text_learning/your_word_data.pkl"
authors_file = "../text_learning/your_email_authors.pkl"

### NOTE: unpickling can execute arbitrary code; only load files that you
### generated yourself. The `with` blocks make sure the handles get closed.
with open(words_file, "rb") as words_handle:
    word_data = pickle.load(words_handle)
with open(authors_file, "rb") as authors_handle:
    authors = pickle.load(authors_handle)

### test_size is the percentage of events assigned to the test set (the
### remainder go into training)
features_train, features_test, labels_train, labels_test = train_test_split(
    word_data, authors, test_size=0.1, random_state=42)

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
### Keep the TF-IDF matrices sparse for now; densifying the full vocabulary
### before feature selection needlessly allocates the whole dense matrix.
features_train_transformed = vectorizer.fit_transform(features_train)
features_test_transformed = vectorizer.transform(features_test)

### Feature selection has to be fitted on (and applied to) the numeric
### TF-IDF matrices, not on the raw e-mail text.
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(features_train_transformed, labels_train)

### feature matrices changed to dense representations for compatibility with
### classifier functions in versions 0.15.2 and earlier
features_train_transformed = selector.transform(features_train_transformed).toarray()
features_test_transformed = selector.transform(features_test_transformed).toarray()

### The message treats the label sum as the number of Chris e-mails
### (presumably labels are 0/1 with 1 == Chris -- confirm against the
### Lesson 10 output).
chris_count = sum(labels_train)
sara_count = len(labels_train) - chris_count
print("CHRIS = {}:,SARA= {}".format(chris_count, sara_count))

### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
### (the transformed matrix is truncated too, so it stays aligned with
### labels_train)
features_train = features_train[:150]
features_train_transformed = features_train_transformed[:150]
labels_train = labels_train[:150]

### your code goes here
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment