# Gist by @dansbecker, forked from tyarkoni/predict_from_text.py
# Last active July 19, 2017
"""Simple example predicting a binary outcome from text features with sklearn
(with extra comments for Alon)."""
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# A pipeline "stitches together" the steps of a modeling process into a single object.
# Pipelines are pretty cool in general, but they are one more thing to learn; you can
# either read up on them separately or do without them.
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
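# To demystify the Pipeline used below: pipeline.fit simply runs each transformer's
# fit_transform in order, then fits the final estimator on the result. A minimal
# sketch on made-up toy texts (the data here is illustrative, not from the gist):
toy_texts = ["the car engine roared", "the rocket reached orbit"]
toy_labels = [0, 1]
toy_vec = TfidfVectorizer()
toy_clf = LogisticRegression()
toy_matrix = toy_vec.fit_transform(toy_texts)  # step 1: raw text -> tf-idf matrix
toy_clf.fit(toy_matrix, toy_labels)            # step 2: fit on the transformed features
# At predict time, the same (already fitted) transform is re-applied first:
toy_pred = toy_clf.predict(toy_vec.transform(["cars and engines"]))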
# Grab just two categories from the 20 newsgroups dataset
# sklearn can scale this up to more than two categories, typically via a
# one-vs-rest (also called one-vs-all) strategy
categories = ['sci.space', 'rec.autos']
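# A quick illustration of one-vs-rest (not part of the original example): sklearn's
# OneVsRestClassifier wraps a binary model and trains one copy per class. The toy
# numeric data below is made up purely to show the mechanics:
from sklearn.multiclass import OneVsRestClassifier
ovr_X = np.array([[0, 1], [1, 0], [1, 1], [0, 0], [2, 1], [1, 2]])
ovr_y = np.array([0, 1, 2, 0, 1, 2])
ovr = OneVsRestClassifier(LogisticRegression()).fit(ovr_X, ovr_y)
# After fitting, ovr.estimators_ holds one binary classifier per class (three here).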
# Get training data
ngd = fetch_20newsgroups(subset='train', categories=categories)
# it's probably worth running this locally and printing out what X_train looks like
X_train = ngd.data
# ...and likewise what y_train looks like
y_train = ngd.target
# Use pipeline for easy extensibility
steps = [
    ('vectorizer', TfidfVectorizer()),
    # penalty='l1' needs a solver that supports it; recent sklearn defaults (lbfgs) do not
    ('classifier', LogisticRegression(penalty='l1', C=10, solver='liblinear'))
]
pipeline = Pipeline(steps)
### Fit and assess training performance
## Fitting is where the model learns its parameters from the training data
pipeline.fit(X_train, y_train)
# this step is just to see how accurate the model is on the data that was used to build it.
pred = pipeline.predict(X_train)
print("Classification accuracy on training data: %.2f" % pipeline.score(X_train, y_train))
# Performance on test data: this shows how accurate the model is on data it has never seen before.
ngd = fetch_20newsgroups(subset='test', categories=categories)
test_score = pipeline.score(ngd.data, ngd.target)
print("Classification accuracy on test data: %.2f" % test_score)
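# For the record, .score on a classifier is plain accuracy: the fraction of
# predictions that match the labels. A self-contained check on toy data (the
# texts below are made up for illustration):
acc_texts = ["car engine wheels", "rocket orbit launch", "engine car road", "orbit space launch"]
acc_labels = np.array([0, 1, 0, 1])
acc_pipe = Pipeline([('vectorizer', TfidfVectorizer()), ('classifier', LogisticRegression())])
acc_pipe.fit(acc_texts, acc_labels)
manual_accuracy = np.mean(acc_pipe.predict(acc_texts) == acc_labels)
# manual_accuracy equals acc_pipe.score(acc_texts, acc_labels)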
### Get a list of words that indicate what category a message is from.
# Print largest coefficients
# use the name vec for the vectorizer and clf for the classifier
vec, clf = pipeline.named_steps['vectorizer'], pipeline.named_steps['classifier']
# coefficients of the model; each one indicates how a word shifts the probability
# of belonging to the positive class (target == 1)
# store them in a pandas Series; a dictionary would also work, but a Series is handy here
# (on sklearn < 1.0, get_feature_names_out was called get_feature_names)
coefs = pd.Series(clf.coef_[0], index=vec.get_feature_names_out())
print("\n20 most discriminating words:")
print(coefs[coefs.abs().sort_values(ascending=False).index][:20])
# the line above does some clever pandas indexing:
# coefs.abs().sort_values(ascending=False).index is the word index sorted by
# absolute coefficient value, largest first
# for the 20 words with the most positive coefficients, it would be
# coefs[coefs.sort_values(ascending=False).index][:20]
# for the 20 words with the most negative coefficients, it would be
# coefs[coefs.sort_values(ascending=True).index][:20]
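# pandas also has shortcuts for this kind of sorting: nlargest/nsmallest avoid
# the manual index dance. A self-contained sketch with made-up coefficients:
demo_coefs = pd.Series([2.5, -3.1, 0.2, 1.0], index=['car', 'orbit', 'the', 'engine'])
# same result as demo_coefs[demo_coefs.abs().sort_values(ascending=False).index]:
top_by_magnitude = demo_coefs.reindex(demo_coefs.abs().sort_values(ascending=False).index)
most_positive = demo_coefs.nlargest(2)   # strongest pull toward the positive class
most_negative = demo_coefs.nsmallest(2)  # strongest pull toward the negative class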