Wikipedia glossary generator
#! /usr/bin/python
import bs4
import json
import urllib2
import html2text
import nltk
import collections
# to do: let's turn this into a full class (that can be imported)
# and use argparse (which would invoke serializing it to a file)
# might be nice to be able to kick this thing off in 3 ways:
#   a full url
#   a properly formed title
#   a search term -- could use Google's site:en.wikipedia.org [term]
# (it should be able to parse and figure this out for you -- see the
#  resolve_title sketch below)
# given a url, get me a definition
# given a url, get me a list of links, and then a definition for each of them
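# A rough sketch (not part of the original script) of the three input modes
# described above; the helper name, the url/title heuristics, and the search
# fallback are all assumptions. The Glossary class below still expects a title.
import urllib  # only needed by this sketch, for quote_plus
def resolve_title(term):
    if term.startswith('http://') or term.startswith('https://'):
        # a full url: take the last path component as the title
        return term.rsplit('/', 1)[-1]
    if ' ' not in term:
        # already looks like a properly formed title, e.g. 'Link_farm'
        return term
    # a free-text search term: ask Wikipedia's own search API for the best match
    search_url = ("http://en.wikipedia.org/w/api.php?action=query&format=json"
                  "&list=search&srlimit=1&srsearch=" + urllib.quote_plus(term))
    results = json.load(urllib2.urlopen(search_url))['query']['search']
    return results[0]['title'].replace(' ', '_') if results else term
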
class Glossary():
    def __init__(self, term):
        # to do -- check term and handle the 3 input cases above to create a url.
        # self.url = term
        self.title = term
        self.definitions = collections.OrderedDict()
        # map an anchor tag for each linked page to that page's first sentence
        for title, url in self.get_links().iteritems():
            anchor_tag = "<a href='{url}'>{title}</a>".format(url=url, title=title)
            self.definitions[anchor_tag] = self.get_definition(url)
    def get_definition(self, url):
        # fetch the article and keep only the first paragraph of the body text
        request = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})  # Wikipedia blocks the default Python user agent
        soup = bs4.BeautifulSoup(urllib2.urlopen(request))  # .read()
        article_html = str(soup.find('div', attrs={"id": "mw-content-text"}).p)
        h = html2text.HTML2Text()
        h.body_width = 0  # no line wrapping
        h.ignore_links = True
        h.ignore_emphasis = True
        h.ignore_images = True
        # h.unicode_snob = True
        article_text = h.handle(article_html.decode('utf-8'))
        # While nltk offers a more concise tag stripper, it inserts extra spaces:
        # article_text = nltk.clean_html(article_html)
        # split into sentences and return the first one as the definition
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        article_sentences = tokenizer.tokenize(article_text)
        return article_sentences[0]
    def get_links(self, limit=3):
        # Do we want to use the page title or the page id? Not sure it actually makes sense to use the id at this point...
        url = "http://en.wikipedia.org/w/api.php?action=query&format=json&titles={title}&generator=links&prop=info&inprop=url&gpllimit={limit}".format(title=self.title, limit=limit)
        # print urllib2.urlopen(url).read()
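        # Assumed response shape (a note added here, not in the original script):
        # with generator=links and prop=info&inprop=url, 'pages' is keyed by pageid:
        #   {"query": {"pages": {"<pageid>": {"title": "...", "fullurl": "http://...", ...}, ...}}}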
        raw_links = json.load(urllib2.urlopen(url))['query']['pages']
        links = {}
        for pageid, values in raw_links.iteritems():
            links[values['title']] = values['fullurl']
        return links
    def __str__(self):
        s = ''
        for title, definition in self.definitions.iteritems():
            s += "\n{}\n\t{}".format(title, definition)
        return s
def main():
    g = Glossary('Link_farm')
    print g

if __name__ == '__main__':
    main()
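
# Sketch of the argparse entry point mentioned in the to-do at the top. This is
# hypothetical (the flag names and the output-file behaviour are assumptions, and
# it would need 'import argparse' with the other imports), so it is left commented out:
# def main():
#     parser = argparse.ArgumentParser(description='Wikipedia glossary generator')
#     parser.add_argument('term', help='a url, a properly formed title, or a search term')
#     parser.add_argument('-o', '--outfile', help='serialize the glossary to this file instead of printing it')
#     args = parser.parse_args()
#     g = Glossary(args.term)
#     if args.outfile:
#         with open(args.outfile, 'w') as f:
#             f.write(str(g))
#     else:
#         print g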