Created June 14, 2011 17:48
Path to Philosophy code
aliasdict.py
class Alias(object):
    """A set of page titles that all refer to the same article (redirects)."""
    def __init__(self, initial):
        self._set = {initial}
        self.initial = initial  # the canonical title of the group
    def add(self, alias):
        self._set.add(alias)
    def merge(self, other):
        self._set.update(other._set)
    def __iter__(self):
        return iter(self._set)

class AliasDict(object):
    """Maps every known title to the Alias group it belongs to."""
    def __init__(self):
        self._dict = {}
    def add(self, one, other):
        if one in self._dict:
            if other in self._dict:  # both known: merge the two groups
                self._dict[one].merge(self._dict[other])
                for k in self._dict[other]:  # repoint members of the old group
                    self._dict[k] = self._dict[one]
            else:
                self._dict[one].add(other)
        elif other in self._dict:
            self._dict[other].add(one)
        else:  # neither known: start a new group with `one` as canonical title
            self._dict[one] = self._dict[other] = Alias(one)
            self._dict[one].add(other)
    def get(self, n):
        return self._dict.get(n)
    def __contains__(self, s):
        return s in self._dict
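A minimal sketch of how the alias groups behave; the titles here are made up purely for illustration:

# hypothetical titles, just to illustrate the merging behaviour
d = AliasDict()
d.add('Colour', 'Color')  # new group, canonical title 'Colour'
d.add('Hue', 'Tint')      # separate group
d.add('Color', 'Hue')     # merges both groups into one
assert 'Tint' in d
assert d.get('Tint').initial == 'Colour'
assert sorted(d.get('Hue')) == ['Color', 'Colour', 'Hue', 'Tint']

Note that whichever title founded the surviving group stays canonical: after the merge, every member resolves to 'Colour'.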
parser.py
from html.parser import HTMLParser

class LinkFound(Exception):
    """Raised to abort parsing as soon as the first usable link is seen."""
    def __init__(self, link):
        self.linkname = link

class WikipediaParser(HTMLParser):
    bad_namespaces = {'File', 'File_talk', 'Wikipedia', 'Wikipedia_talk',
                      'Template', 'Template_talk', 'Talk', 'User',
                      'User_talk', 'Help'}
    def __init__(self):
        HTMLParser.__init__(self)
        self.inside_italics = False
        self.open_parens = 0
        self.open_divs = 0
        self.started = False  # True once we are inside the article body
        self.inside_table = False
    def handle_starttag(self, tag, attrs):
        if tag in ('i', 'em'):
            self.inside_italics = True
        elif tag == 'div':
            for key, value in attrs:
                if key == 'id' and value == 'bodyContent':
                    self.started = True
                    self.open_divs = 0
                    return
            self.open_divs += 1  # nested divs (hatnotes, infoboxes) are skipped
        elif tag == 'table':
            self.inside_table = True
        elif tag == 'a':
            # only count links in the body text proper: not italicized, not in
            # a table or nested div, and not inside parentheses
            if (self.started and not self.inside_italics and not self.inside_table
                    and self.open_parens <= 0 and self.open_divs <= 0):
                for key, value in attrs:
                    if key == 'href' and value.startswith('/wiki/'):
                        value = value[6:]
                        if value.split(':', 1)[0] not in self.bad_namespaces:
                            raise LinkFound(value)
    def handle_endtag(self, tag):
        if tag in ('i', 'em'):
            self.inside_italics = False
        elif tag == 'div':
            self.open_divs -= 1
        elif tag == 'table':
            self.inside_table = False
    def handle_data(self, data):
        # track unbalanced parentheses in the article text
        self.open_parens += data.count('(') - data.count(')')
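A minimal sketch of the parser in action on a hand-written fragment (only loosely shaped like real Wikipedia markup): the parenthesized and italicized links are passed over, and the first plain body link surfaces through the LinkFound exception.

# hand-written fragment, not real Wikipedia output
page = '''<div id="bodyContent">
<p>(<a href="/wiki/Skipped_in_parens">no</a>) <i><a href="/wiki/Skipped_italic">no</a></i>
An <a href="/wiki/Animal">animal</a> is ...</p>
</div>'''
wpp = WikipediaParser()
try:
    wpp.feed(page)
except LinkFound as e:
    print(e.linkname)  # prints: Animal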
main script
from walk import Walker
from urllib.parse import unquote

def check_endpoint(go_from, endpoint):
    w = Walker(endpoint)
    w.start(go_from)
    return w

def quote(n):  # make a page title fit for printing
    return '"' + unquote(n).replace('_', ' ') + '"'

if __name__ == '__main__':
    import sys
    g = sys.argv[1] if len(sys.argv) > 1 else 'Special:Random'
    e = 'Philosophy'  # change for another end point
    w = check_endpoint(g, e)
    l = [w.aliases.get(g).initial]
    while len(l) <= len(w.cache):
        key = l[-1]
        if key not in w.cache:  # a redirect: resolve to the canonical title
            key = w.aliases.get(key).initial
        l.append(w.cache[key])
    print('I', 'could' if e in w else 'could not', 'reach', quote(e),
          'from', quote(w.aliases.get(g).initial), 'in', len(w.cache), 'steps')
    print(' -> '.join(quote(x) for x in l))
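The script's filename isn't shown above, so as a hedged sketch, the same run can also be driven from Python directly (assuming the functions above are importable; 'Banana' is an arbitrary example start page):

w = check_endpoint('Banana', 'Philosophy')
print('reached endpoint:', 'Philosophy' in w)
print('pages visited:', len(w.cache))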
walk.py
from parser import *
from aliasdict import *
from urllib.request import urlopen, Request

class Walker(object):
    def __init__(self, stop_at=None, lang='en'):
        self.lang = lang
        self.build_url()
        self.cache = {}  # canonical title -> first link found on that page
        self.stop_at = stop_at
        self.aliases = AliasDict()
    def start(self, url='Special:Random'):
        while url:
            url = self.walk_from(url)
    def build_url(self):
        # https: Wikipedia now redirects plain http, which would break the
        # geturl() slicing in walk_from below
        self.built_url = 'https://' + self.lang + '.wikipedia.org/wiki/'
    def walk_from(self, url):
        if url in self.aliases:  # already visited: we are looping, stop here
            return
        wpp = WikipediaParser()
        # Wikipedia doesn't like Python's default user agent, so I spoof it.
        resp = urlopen(Request(self.built_url + url, headers={'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'}))
        try:
            text = resp.read().decode('utf-8')  # assuming UTF-8; awful, I know
        except Exception as e:
            # for some reason, it receives garbage on some articles
            # if you know why this is, please let me know
            print('Ignored', e, 'for', url)
            text = ''
        try:
            wpp.feed(text)
        except LinkFound as e:
            # geturl() reflects redirects, so n_url is the canonical title
            n_url = resp.geturl()[len(self.built_url):]
            self.aliases.add(n_url, url)
            if self.stop_at in self.aliases:
                return
            self.cache[n_url] = e.linkname.split('#', 1)[0]  # drop the fragment
            return self.cache[n_url]
    def __contains__(self, s):
        return s in self.aliases
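Walker isn't tied to the Philosophy endpoint; a minimal sketch (assuming network access; the Dutch titles are arbitrary examples) of pointing it at another language edition through the lang parameter:

w = Walker(stop_at='Filosofie', lang='nl')  # walk the Dutch Wikipedia instead
w.start('Banaan')                           # arbitrary example start page
for page, first_link in w.cache.items():
    print(page, '->', first_link)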
Damn, fixed. Some stray "z" got in the way, somehow.