-
-
Save dansbecker/f1a39119168573de4acc4f29176ea286 to your computer and use it in GitHub Desktop.
simple example predicting binary outcome from text features with sklearn (with extra comments for Alon)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# A pipeline "stitches together" the various steps of a modeling process into a
# single piece. You should either try to get a separate explanation of this, or
# try to do without it. In general pipelines are pretty cool. But, one more
# thing to learn.
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

# Grab just two categories from the 20 newsgroups dataset.
# sklearn has facilities for expanding this to more than two categories,
# especially using something called a one-vs-all methodology.
categories = ['sci.space', 'rec.autos']

# Get training data.
ngd = fetch_20newsgroups(subset='train', categories=categories)
# X_train is a list of raw message strings; worth printing out locally to see
# what it looks like.
X_train = ngd.data
# y_train is an array of integer category labels (0/1 here); also worth
# printing out locally.
y_train = ngd.target

# Use a pipeline for easy extensibility.
steps = [
    ('vectorizer', TfidfVectorizer()),
    # FIX: the 'l1' penalty requires a solver that supports it ('liblinear' or
    # 'saga'). sklearn's default solver changed to lbfgs in 0.22, which raises
    # a ValueError for l1, so the solver must now be named explicitly.
    # 'liblinear' reproduces the behavior this script was written against.
    ('classifier', LogisticRegression(penalty='l1', C=10, solver='liblinear'))
]
pipeline = Pipeline(steps)

### Fit and assess training performance
## This makes the model consider your data.
pipeline.fit(X_train, y_train)
# See how accurate the model is on the data that was used to build it.
# (pipeline.score computes accuracy directly; a separate predict() call
# whose result was never used has been removed.)
print("Classification accuracy on training data: %.2f" % pipeline.score(X_train, y_train))

# Performance on test data. That tests how accurate the model is on data it
# has never seen before.
ngd = fetch_20newsgroups(subset='test', categories=categories)
test_score = pipeline.score(ngd.data, ngd.target)
print("Classification accuracy on test data: %.2f" % test_score)

### Get a list of words that indicate what category a message is from.
# Print largest coefficients.
# vec refers to the fitted vectorizer, clf to the fitted classifier.
vec, clf = pipeline.named_steps['vectorizer'], pipeline.named_steps['classifier']
# Coefficients of the model indicate how each word affects the probability of
# being from the positive class. We store this information in a pandas Series
# so the word labels stay attached while sorting; a dictionary would also
# work, but a Series makes the sorting below easy.
# FIX: get_feature_names() was deprecated in sklearn 1.0 and removed in 1.2;
# get_feature_names_out() is the current API and returns the same vocabulary.
coefs = pd.Series(clf.coef_[0], index=vec.get_feature_names_out())
print("\n20 most discriminating words:")
print(coefs[coefs.abs().sort_values(ascending=False).index][:20])
# Some clever pandas in the line above:
# coefs.abs().sort_values(ascending=False).index is the list of words ordered
# by highest absolute coefficient value.
# For the 20 words with the most positive coefficients:
#   coefs[coefs.sort_values(ascending=False).index][:20]
# For the 20 words with the most negative coefficients:
#   coefs[coefs.sort_values(ascending=True).index][:20]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment