@hughdbrown
Last active October 2, 2015 19:23
Simple data science code for a model
from __future__ import print_function
import numpy as np
from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn import metrics
from sklearn.cross_validation import train_test_split # , cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV
# from sklearn.linear_model import LogisticRegression
# from sklearn.naive_bayes import BernoulliNB


def main():
    # Load one document per line, plus a parallel file of numeric labels
    with open('data/train.txt') as f:
        data = [line.strip() for line in f]
    labels = np.loadtxt('data/labels.txt')

    # Stem every token in every document with a single PorterStemmer instance
    stemmer = PorterStemmer()
    pstem_data = [' '.join(stemmer.stem(x) for x in doc.split()) for doc in data]

    # Hyperparameter grid for the random forest
    rf_grid = {
        'n_estimators': [5000],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [3, 7, None],
        'bootstrap': [True],
    }

    # Split documents into training and test sets
    xtrain, xtest, ytrain, ytest = train_test_split(pstem_data, labels)

    # Vectorize: fit TF-IDF on the training set only, then transform the test set
    # (NLTK's stop-word file id is lowercase 'english')
    vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=1000)
    xtrain = vectorizer.fit_transform(xtrain)
    xtest = vectorizer.transform(xtest)

    # Grid-search the model to find the best parameters
    search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rf_grid, n_jobs=-1)
    search.fit(xtrain, ytrain)
    preds = search.best_estimator_.predict(xtest)

    # Print results
    print(metrics.classification_report(ytest, preds))
    print(metrics.accuracy_score(ytest, preds))


if __name__ == '__main__':
    main()
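
For reference, here is a minimal self-contained sketch of the same pipeline, assuming scikit-learn 0.18 or later (where cross_validation and grid_search were merged into model_selection). The tiny corpus and labels below are hypothetical stand-ins for data/train.txt and data/labels.txt, Porter stemming is skipped, and the grid is shrunk so the search finishes in seconds.

from __future__ import print_function

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split

# Hypothetical stand-ins for data/train.txt (one document per line)
# and data/labels.txt (one numeric label per line).
docs = [
    "the cat sat on the mat",
    "a cat chased the mouse",
    "cats purr when they are happy",
    "my cat sleeps all day",
    "the dog barked at the mailman",
    "dogs fetch sticks in the park",
    "a dog wagged its tail",
    "my dog chews on bones",
]
labels = [0, 0, 0, 0, 1, 1, 1, 1]

# Stratified split so both classes appear in train and test
xtrain, xtest, ytrain, ytest = train_test_split(
    docs, labels, test_size=0.25, stratify=labels, random_state=0)

# Built-in English stop-word list instead of NLTK's, to keep the example dependency-free
vectorizer = TfidfVectorizer(stop_words='english')
xtrain = vectorizer.fit_transform(xtrain)
xtest = vectorizer.transform(xtest)

# Much smaller grid than the gist's so the search runs quickly
rf_grid = {
    'n_estimators': [10, 50],
    'max_depth': [3, None],
}
search = GridSearchCV(RandomForestClassifier(random_state=0), rf_grid, cv=2)
search.fit(xtrain, ytrain)

preds = search.predict(xtest)
print(metrics.classification_report(ytest, preds))
print(metrics.accuracy_score(ytest, preds))

Apart from the toy corpus, the skipped stemming, and the smaller grid, the flow is identical to the gist: TF-IDF vectorize, grid-search a RandomForestClassifier, then report classification metrics on the held-out split.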