@hughdbrown
Last active October 2, 2015 19:23
Simple data science code for a model
from __future__ import print_function
import numpy as np
from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn import metrics
from sklearn.cross_validation import train_test_split # , cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV
# from sklearn.linear_model import LogisticRegression
# from sklearn.naive_bayes import BernoulliNB


def main():
    # Load one document per line, plus a parallel file of numeric labels
    with open('data/train.txt') as f:
        data = [line.strip() for line in f]
    labels = np.loadtxt('data/labels.txt')

    # Stem every token in every document with a single PorterStemmer instance
    stemmer = PorterStemmer()
    pstem_data = [' '.join(stemmer.stem(x) for x in doc.split()) for doc in data]

    # Hyperparameter grid for the random forest
    rf_grid = {
        'n_estimators': [5000],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [3, 7, None],
        'bootstrap': [True],
    }

    # Split documents into training and test sets
    xtrain, xtest, ytrain, ytest = train_test_split(pstem_data, labels)

    # Vectorize: fit TF-IDF on the training set only, then transform the test set
    # (NLTK's stop-word file id is lowercase 'english')
    vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=1000)
    xtrain = vectorizer.fit_transform(xtrain)
    xtest = vectorizer.transform(xtest)

    # Grid-search the model to find the best parameters
    search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rf_grid, n_jobs=-1)
    search.fit(xtrain, ytrain)
    preds = search.best_estimator_.predict(xtest)

    # Print results
    print(metrics.classification_report(ytest, preds))
    print(metrics.accuracy_score(ytest, preds))


if __name__ == '__main__':
    main()
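
For reference, here is a minimal self-contained sketch of the same pipeline, assuming scikit-learn 0.18 or later (where cross_validation and grid_search were merged into model_selection). The tiny corpus and labels below are hypothetical stand-ins for data/train.txt and data/labels.txt, Porter stemming is skipped, and the grid is shrunk so the search finishes in seconds.

from __future__ import print_function

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split

# Hypothetical stand-ins for data/train.txt (one document per line)
# and data/labels.txt (one numeric label per line).
docs = [
    "the cat sat on the mat",
    "a cat chased the mouse",
    "cats purr when they are happy",
    "my cat sleeps all day",
    "the dog barked at the mailman",
    "dogs fetch sticks in the park",
    "a dog wagged its tail",
    "my dog chews on bones",
]
labels = [0, 0, 0, 0, 1, 1, 1, 1]

# Stratified split so both classes appear in train and test
xtrain, xtest, ytrain, ytest = train_test_split(
    docs, labels, test_size=0.25, stratify=labels, random_state=0)

# Built-in English stop-word list instead of NLTK's, to keep the example dependency-free
vectorizer = TfidfVectorizer(stop_words='english')
xtrain = vectorizer.fit_transform(xtrain)
xtest = vectorizer.transform(xtest)

# Much smaller grid than the gist's so the search runs quickly
rf_grid = {
    'n_estimators': [10, 50],
    'max_depth': [3, None],
}
search = GridSearchCV(RandomForestClassifier(random_state=0), rf_grid, cv=2)
search.fit(xtrain, ytrain)

preds = search.predict(xtest)
print(metrics.classification_report(ytest, preds))
print(metrics.accuracy_score(ytest, preds))

Apart from the toy corpus, the skipped stemming, and the smaller grid, the flow is identical to the gist: TF-IDF vectorize, grid-search a RandomForestClassifier, then report classification metrics on the held-out split.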