Created
December 10, 2016 15:25
-
-
Save manashmandal/e6fb8b63f53f6bce6f996fe07a590edf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import pickle | |
import cPickle | |
import numpy | |
from sklearn import cross_validation | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.feature_selection import SelectPercentile, f_classif | |
def preprocess(words_file="../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """
        Load pickled email texts and author labels and turn them into
        train/test feature matrices.

        Steps performed:
            -- loads the email texts from words_file and the labels from
               authors_file (both pickles)
            -- splits into training/testing sets (10% testing, random_state=42)
            -- vectorizes the texts into a tf-idf matrix (the vectorizer is
               fitted on the training texts only)
            -- keeps the top 5 percent of features as scored by f_classif
               (the selector is fitted on the training set only)

        A short summary (zero / total / non-zero cell counts of the training
        matrix and the per-author number of training emails) is printed to
        stdout as a side effect.

        Returns a 4-tuple:
            -- features_train, features_test: dense numpy arrays
               (the sparse selector output converted with .toarray())
            -- labels_train, labels_test: the label halves as returned by
               train_test_split
    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    ###
    ### NOTE(review): unpickling can execute arbitrary code -- only point this
    ### at trusted files.
    ### NOTE(review): mode "r" is kept from the original because the protocol
    ### the pickles were written with is not visible here; binary-protocol
    ### pickles would need "rb" -- TODO confirm against the data files.
    with open(authors_file, "r") as authors_file_handler:
        authors = pickle.load(authors_file_handler)

    with open(words_file, "r") as words_file_handler:
        word_data = cPickle.load(words_file_handler)

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    ### NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
    ### newer versions expose train_test_split from sklearn.model_selection.
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        word_data, authors, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_vectorized = vectorizer.fit_transform(features_train)
    features_test_vectorized = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=5)
    selector.fit(features_train_vectorized, labels_train)
    features_train_transformed = selector.transform(features_train_vectorized).toarray()
    features_test_transformed = selector.transform(features_test_vectorized).toarray()

    ### sparsity summary of the dense training matrix; counted in numpy
    ### instead of materialising a Python list of every zero cell
    total_cells = features_train_transformed.size
    nonzero_cells = int(numpy.count_nonzero(features_train_transformed))
    zero_cells = total_cells - nonzero_cells

    ### "%s %s" reproduces the exact output of the former
    ### `print label, value` statements and parses on Python 2 and 3 alike
    print("%s %s" % ("Zero feat, ", zero_cells))
    print("%s %s" % ("features, ", total_cells))
    ### this figure is the count of non-zero cells in the training matrix
    print("%s %s" % ("Number of features: ", nonzero_cells))

    ### info on the data
    ### (presumably label 1 marks Chris and 0 marks Sara, given how the
    ### counts are derived -- verify against the data set)
    chris_count = sum(labels_train)
    print("%s %s" % ("no. of Chris training emails:", chris_count))
    print("%s %s" % ("no. of Sara training emails:", len(labels_train) - chris_count))

    return features_train_transformed, features_test_transformed, labels_train, labels_test
# Run the preprocessing pipeline (and print its summary) with the default
# file paths whenever this module is executed or imported.
# NOTE(review): if running on import is not intended, wrap this call in an
# `if __name__ == "__main__":` guard.
preprocess()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Interesting Resources to look at
http://stackoverflow.com/questions/21338090/how-can-i-store-and-print-the-top-20-feature-names-and-scores