An example of how to use yhat's cloud server to "predict" summaries of news articles.
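
For reference, the summarize function in the file below can also be exercised locally, without going through the yhat server, provided the script's dependencies (nltk, numpy, boilerpipe, and the yhat package for the imports at the top) are installed and the NLTK "punkt" and "stopwords" data have been downloaded. A minimal sketch, assuming the file is saved as summarizer.py (the URL is one of the samples mentioned in the script):

    from summarizer import summarize
    result = summarize(url='http://radar.oreilly.com/2013/10/mining-the-social-web-again.html')
    print '\n'.join(result['top_n_summary'])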
########################################################################
#
# An example of how to deploy a custom predictive model to yhat
# and "predict" the summary for a news article.
#
# Input: URL for a web page containing a news article
#
# Output: Summary of the "story" in the web page for the URL
#
# Example usage: $ python summarizer.py <username> <apikey> <url>
#
########################################################################

# For getting command line args
import sys

# For pretty-printing API responses
import json

# These libs are provided by yhat on the cloud server and used
# in the summarize function below
import nltk
import numpy

# Had to ask for this to be specially installed on the cloud server
from boilerpipe.extract import Extractor

# Get this via "pip install yhat"
from yhat import Yhat, BaseModel

# This summarize function is taken from Mining the Social Web - http://bit.ly/1dIqdNd

def summarize(url=None, html=None, n=100, cluster_threshold=5, top_sentences=5):

    # Adapted from "The Automatic Creation of Literature Abstracts" by H.P. Luhn
    #
    # Parameters:
    # * n - Number of words to consider
    # * cluster_threshold - Distance between words to consider
    # * top_sentences - Number of sentences to return for a "top n" summary

    # Begin - nested helper function
    def score_sentences(sentences, important_words):
        scores = []
        sentence_idx = -1

        for s in [nltk.tokenize.word_tokenize(s) for s in sentences]:

            sentence_idx += 1
            word_idx = []

            # For each word in the word list...
            for w in important_words:
                try:
                    # Compute an index for important words in each sentence
                    word_idx.append(s.index(w))
                except ValueError, e: # w not in this particular sentence
                    pass

            word_idx.sort()

            # It is possible that some sentences may not contain any important words
            if len(word_idx) == 0: continue

            # Using the word index, compute clusters with a max distance threshold
            # for any two consecutive words
            clusters = []
            cluster = [word_idx[0]]
            i = 1
            while i < len(word_idx):
                if word_idx[i] - word_idx[i - 1] < cluster_threshold:
                    cluster.append(word_idx[i])
                else:
                    clusters.append(cluster[:])
                    cluster = [word_idx[i]]
                i += 1
            clusters.append(cluster)

            # Score each cluster. The max score for any given cluster is the score
            # for the sentence.
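            # For example, with the default cluster_threshold of 5, important
            # words at positions [2, 5, 7] in a sentence form a single cluster:
            # 3 significant words spanning positions 2..7 (6 words total), so
            # that cluster scores 1.0 * 3 * 3 / 6 = 1.5.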
            max_cluster_score = 0
            for c in clusters:
                significant_words_in_cluster = len(c)
                total_words_in_cluster = c[-1] - c[0] + 1
                score = 1.0 * significant_words_in_cluster \
                    * significant_words_in_cluster / total_words_in_cluster

                if score > max_cluster_score:
                    max_cluster_score = score

            # Use the best cluster's score as the score for the sentence
            scores.append((sentence_idx, max_cluster_score))

        return scores

    # End - nested helper function

    extractor = Extractor(extractor='ArticleExtractor', url=url, html=html)

    # It's entirely possible that this "clean page" will be a big mess. YMMV.
    # The good news is that the summarize algorithm inherently accounts for handling
    # a lot of this noise.
    txt = extractor.getText()

    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]

    words = [w.lower() for sentence in normalized_sentences for w in
             nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    top_n_words = [w[0] for w in fdist.items()
                   if w[0] not in nltk.corpus.stopwords.words('english')][:n]

    scored_sentences = score_sentences(normalized_sentences, top_n_words)

    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter
    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences
    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-top_sentences:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    # Return both summaries
    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])

# Create a model by overriding yhat's BaseModel
class MySummarizerModel(BaseModel):

    def require(self):
        from boilerpipe.extract import Extractor
        import nltk
        import numpy

    def transform(self, d):
        return (d, summarize(url=d),)

    def predict(self, d):
        return { 'url' : d[0], 'summary': d[1] }
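
# Given a URL, MySummarizerModel.predict ultimately returns a dict of the form
# { 'url': <the input url>,
#   'summary': { 'top_n_summary': [...], 'mean_scored_summary': [...] } }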

if __name__ == '__main__':

    # Get the username, api key, and url from the command line
    USERNAME, APIKEY = sys.argv[1], sys.argv[2]

    # Some sample URLs to try:
    # http://radar.oreilly.com/2013/10/mining-the-social-web-again.html or
    # http://radar.oreilly.com/2013/06/phishing-in-facebooks-pond.html
    URL = sys.argv[3]

    # Create a model to use
    my_summarizer_model = MySummarizerModel(udfs=[summarize])

    # Create an API connection
    yh = Yhat(USERNAME, APIKEY)

    # Upload the model to the yhat server and print the results.
    # Commented out, because it's already uploaded.
    #print yh.upload("MySummarizerModel", my_summarizer_model)

    # How to list available models and versions. (Would be convenient
    # to not have to provide a specific version, which would call the
    # latest available model.)
    #print yh.show_models()

    # Make a prediction with the uploaded model and display the result.
    prediction = yh.raw_predict("MySummarizerModel", 1, URL)
    print json.dumps(prediction, indent=2)

    # You can also use yhat's API to predict with a RESTful endpoint.
    # (BTW, it's unfortunate that you can't GET /predict with REST. This
    # particular model is a great example of why that would be super-useful.)
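    #
    # A rough sketch of what a raw HTTP call might look like with the requests
    # package follows. The endpoint URL and parameter names below are guesses
    # for illustration only (not yhat's documented API), so check yhat's docs
    # for the real URL scheme before using them:
    #
    #   import requests
    #   resp = requests.post('http://api.yhathq.com/predict',  # hypothetical endpoint
    #                        params={'username': USERNAME, 'apikey': APIKEY,
    #                                'model': 'MySummarizerModel', 'version': 1},
    #                        data=json.dumps({'data': URL}))
    #   print json.dumps(resp.json(), indent=2)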