gvx · June 14, 2011 17:48 · qubodup · Jun 14, 2011 · gvx · Jun 14, 2011
diff --git a/aliasdict.py b/aliasdict.py
 class Alias(object):
 	"""A group of page names that all denote the same article; ``initial`` is the name the group was created with."""
 	def __init__(self, initial):
 		self.initial = initial
 		self._set = set()
 		self._set.add(initial)
 	def add(self, alias):
 		"""Register ``alias`` as one more name of this group."""
 		self._set |= {alias}
 	def merge(self, other):
 		"""Copy every name held by ``other`` into this group (``other`` is left untouched)."""
 		self._set |= other._set
 	def __iter__(self):
 		"""Iterate over the names of the group, in arbitrary set order."""
 		return iter(self._set)

 class AliasDict(object):
 	"""Maps every known page name to the Alias group it belongs to."""
 	def __init__(self):
 		self._dict = {}
 	def add(self, one, other):
 		"""Declare that ``one`` and ``other`` are names of the same page.
 		If both names are already known their groups are merged into the
 		group of ``one``; if only one is known the new name joins that group;
 		otherwise a fresh Alias is created whose ``initial`` is ``one``.
 		Afterwards both names are always registered as keys, so ``in`` and
 		``get`` work for either of them."""
 		has_one = one in self._dict
 		has_other = other in self._dict
 		if has_one and has_other: #merge!
 			keep = self._dict[one]
 			absorbed = self._dict[other]
 			if keep is not absorbed:
 				keep.merge(absorbed)
 				# re-point every name of the absorbed group at the surviving one
 				for k in absorbed:
 					self._dict[k] = keep
 		elif has_one:
 			group = self._dict[one]
 			group.add(other)
 			# the new name must be a key too, otherwise lookups by it fail
 			self._dict[other] = group
 		elif has_other:
 			group = self._dict[other]
 			group.add(one)
 			self._dict[one] = group
 		else:
 			group = Alias(one)
 			group.add(other)
 			self._dict[one] = self._dict[other] = group
 	def get(self, n):
 		"""Return the Alias group registered for ``n``, or None if ``n`` is unknown."""
 		return self._dict.get(n)
 	def __contains__(self, s):
 		"""True if ``s`` is a registered name."""
 		return s in self._dict
diff --git a/parser.py b/parser.py
 from html.parser import HTMLParser

 class LinkFound(Exception):
 	"""Raised to carry a found link target out of the parser; the target is stored in ``linkname``."""
 	def __init__(self, link):
 		Exception.__init__(self, link)
 		self.linkname = link

 class WikipediaParser(HTMLParser):
 	"""HTML parser that raises LinkFound for the first acceptable article link.
 	A link is acceptable once the ``bodyContent`` div has been opened and the
 	parser is not inside italics, a table, parentheses or a nested div, and
 	the target is a ``/wiki/`` path outside of ``bad_namespaces``.
 	NOTE(review): italics and tables are tracked with plain booleans, so a
 	nested closing tag clears the flag early."""
 	bad_namespaces = {'File', 'File_talk', 'Wikipedia', 'Wikipedia_talk', 'Template', 'Template_talk', 'Talk', 'User', 'User_talk', 'Help'}
 	def __init__(self):
 		HTMLParser.__init__(self)
 		self.inside_italics = False
 		self.open_parens = 0
 		self.open_divs = 0
 		self.started = False
 		self.inside_table = False
 	def _link_allowed(self):
 		"""True when the current parser state permits accepting a link."""
 		if not self.started or self.inside_italics or self.inside_table:
 			return False
 		return self.open_parens <= 0 and self.open_divs <= 0
 	def handle_starttag(self, tag, attrs):
 		"""Update the state for an opening tag; raise LinkFound on the first acceptable link."""
 		if tag in ('i', 'em'):
 			self.inside_italics = True
 		elif tag == 'div':
 			for key, value in attrs:
 				if key == 'id' and value == 'bodyContent':
 					# article body begins: start looking and reset div nesting
 					self.started = True
 					self.open_divs = 0
 					return
 			self.open_divs += 1
 		elif tag == 'table':
 			self.inside_table = True
 		elif tag == 'a':
 			if self._link_allowed():
 				for key, value in attrs:
 					# value is None for a bare attribute such as <a href>
 					if key == 'href' and value and value.startswith('/wiki/'):
 						target = value[6:]
 						if target.split(':',1)[0] not in self.bad_namespaces:
 							raise LinkFound(target)
 	def handle_endtag(self, tag):
 		"""Undo the state change made by the matching opening tag."""
 		if tag in ('i', 'em'):
 			self.inside_italics = False
 		elif tag == 'div':
 			self.open_divs -= 1
 		elif tag == 'table':
 			self.inside_table = False
 	def handle_data(self, data):
 		"""Track the parenthesis balance of the text seen so far."""
 		self.open_parens += data.count('(') - data.count(')')
diff --git a/philosophy.py b/philosophy.py
 from walk import Walker
 from urllib.parse import unquote

 def check_endpoint(go_from, endpoint):
 	"""Create a Walker that stops at ``endpoint``, run it from ``go_from`` and return it."""
 	walker = Walker(stop_at=endpoint)
 	walker.start(url=go_from)
 	return walker

 def quote(n): #make page title fit for printing
 	"""Percent-decode ``n``, turn underscores into spaces and wrap the result in double quotes."""
 	title = unquote(n)
 	title = ' '.join(title.split('_'))
 	return '"{}"'.format(title)

 if __name__ == '__main__':
 	import sys
 	# start page: first command line argument if present and non-empty, else a random article
 	g = sys.argv[1] if len(sys.argv) > 1 and sys.argv[1] else 'Special:Random'
 	e = 'Philosophy' #change for another end point
 	w = check_endpoint(g, e)
 	# the start name may be unknown to the walker if its page yielded no link at all
 	start_alias = w.aliases.get(g)
 	start = start_alias.initial if start_alias is not None else g
 	# rebuild the visited chain by following the walker's cache from the start page
 	l = [start]
 	while len(l) <= len(w.cache):
 		key = l[-1]
 		if key not in w.cache:
 			# the link name may be an alias of the name the cache is keyed by
 			alias = w.aliases.get(key)
 			key = next((k for k in (alias or ()) if k in w.cache), None)
 			if key is None:
 				break
 		l.append(w.cache[key])
 	print('I', 'could' if e in w else 'could not', 'reach', quote(e), 'from', quote(start), 'in', len(w.cache), 'steps')
 	print(' -> '.join(quote(x) for x in l))
diff --git a/walk.py b/walk.py
 from parser import *
 from aliasdict import *
 from urllib.request import urlopen, Request

 class Walker(object):
 	"""Follows links from one Wikipedia page to the next.
 	``cache`` maps each visited page name to the link name found on it and
 	``aliases`` records which names refer to the same page. Walking stops
 	when a page was already seen, when ``stop_at`` becomes known, or when a
 	page yields no acceptable link."""
 	def __init__(self, stop_at=None, lang='en'):
 		self.lang = lang
 		self.build_url()
 		self.cache = {}
 		self.stop_at = stop_at
 		self.aliases = AliasDict()
 	def start(self, url='Special:Random'):
 		"""Keep walking from ``url`` until walk_from returns nothing to follow."""
 		while url:
 			url = self.walk_from(url)
 	def build_url(self):
 		"""Set ``built_url`` to the article URL prefix for ``self.lang``."""
 		self.built_url = 'http://' + self.lang + '.wikipedia.org/wiki/'
 	def _title_from_url(self, final_url):
 		"""Return the page name part of ``final_url`` (the URL after redirects)."""
 		if final_url.startswith(self.built_url):
 			return final_url[len(self.built_url):]
 		# the scheme or host changed during redirection (e.g. http -> https),
 		# so slicing by the prefix length would cut at the wrong place
 		return final_url.split('/wiki/', 1)[-1]
 	def walk_from(self, url):
 		"""Fetch page ``url`` and return the name of the next page to visit.
 		Returns None if ``url`` is already known, if no link is found, or if
 		``stop_at`` is known after registering this page."""
 		if url in self.aliases:
 			return None
 		wpp = WikipediaParser()
 		request = Request(self.built_url + url, headers={'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'}) #It doesn't like Python's default user agent, so I spoof it.
 		# the with block closes the connection once the body has been read
 		with urlopen(request) as resp:
 			final_url = resp.geturl()
 			try:
 				text = resp.read().decode('utf-8') #Assuming UTF-8. Awful, I know.
 			except Exception as e:
 				print('Ignored', e, 'for', url) #for some reason, it receives garbage on some articles
 				                                #if you know why this is, please let me know
 				text = ''
 		try:
 			wpp.feed(text)
 		except LinkFound as e:
 			n_url = self._title_from_url(final_url)
 			self.aliases.add(n_url, url)
 			if self.stop_at in self.aliases:
 				return None
 			self.cache[n_url] = e.linkname.split('#',1)[0]
 			return self.cache[n_url]
 		return None
 	def __contains__(self, s):
 		"""True if ``s`` is a page name the walker has registered."""
 		return s in self.aliases
	class Alias(object):
		"""A group of page names that all denote the same article; ``initial`` is the name the group was created with."""
		def __init__(self, initial):
			self._set = {initial}
			self.initial = initial
		def add(self, alias):
			"""Register ``alias`` as one more name of this group."""
			self._set.add(alias)
		def merge(self, other):
			"""Copy every name held by ``other`` into this group (``other`` is left untouched)."""
			self._set.update(other._set)
		def __iter__(self):
			"""Iterate over the names of the group, in arbitrary set order."""
			return iter(self._set)

	class AliasDict(object):
		"""Maps every known page name to the Alias group it belongs to."""
		def __init__(self):
			self._dict = {}
		def add(self, one, other):
			"""Declare that ``one`` and ``other`` are names of the same page.
			If both names are already known their groups are merged into the
			group of ``one``; if only one is known the new name joins that group;
			otherwise a fresh Alias is created whose ``initial`` is ``one``.
			Afterwards both names are always registered as keys, so ``in`` and
			``get`` work for either of them."""
			has_one = one in self._dict
			has_other = other in self._dict
			if has_one and has_other: #merge!
				keep = self._dict[one]
				absorbed = self._dict[other]
				if keep is not absorbed:
					keep.merge(absorbed)
					# re-point every name of the absorbed group at the surviving one
					for k in absorbed:
						self._dict[k] = keep
			elif has_one:
				group = self._dict[one]
				group.add(other)
				# the new name must be a key too, otherwise lookups by it fail
				self._dict[other] = group
			elif has_other:
				group = self._dict[other]
				group.add(one)
				self._dict[one] = group
			else:
				group = Alias(one)
				group.add(other)
				self._dict[one] = self._dict[other] = group
		def get(self, n):
			"""Return the Alias group registered for ``n``, or None if ``n`` is unknown."""
			return self._dict.get(n)
		def __contains__(self, s):
			"""True if ``s`` is a registered name."""
			return s in self._dict
	from html.parser import HTMLParser

	class LinkFound(Exception):
		"""Raised to carry a found link target out of the parser; the target is stored in ``linkname``."""
		def __init__(self, link):
			Exception.__init__(self, link)
			self.linkname = link

	class WikipediaParser(HTMLParser):
		"""HTML parser that raises LinkFound for the first acceptable article link.
		A link is acceptable once the ``bodyContent`` div has been opened and the
		parser is not inside italics, a table, parentheses or a nested div, and
		the target is a ``/wiki/`` path outside of ``bad_namespaces``.
		NOTE(review): italics and tables are tracked with plain booleans, so a
		nested closing tag clears the flag early."""
		bad_namespaces = {'File', 'File_talk', 'Wikipedia', 'Wikipedia_talk', 'Template', 'Template_talk', 'Talk', 'User', 'User_talk', 'Help'}
		def __init__(self):
			HTMLParser.__init__(self)
			self.inside_italics = False
			self.open_parens = 0
			self.open_divs = 0
			self.started = False
			self.inside_table = False
		def _link_allowed(self):
			"""True when the current parser state permits accepting a link."""
			if not self.started or self.inside_italics or self.inside_table:
				return False
			return self.open_parens <= 0 and self.open_divs <= 0
		def handle_starttag(self, tag, attrs):
			"""Update the state for an opening tag; raise LinkFound on the first acceptable link."""
			if tag in ('i', 'em'):
				self.inside_italics = True
			elif tag == 'div':
				for key, value in attrs:
					if key == 'id' and value == 'bodyContent':
						# article body begins: start looking and reset div nesting
						self.started = True
						self.open_divs = 0
						return
				self.open_divs += 1
			elif tag == 'table':
				self.inside_table = True
			elif tag == 'a':
				if self._link_allowed():
					for key, value in attrs:
						# value is None for a bare attribute such as <a href>
						if key == 'href' and value and value.startswith('/wiki/'):
							target = value[6:]
							if target.split(':',1)[0] not in self.bad_namespaces:
								raise LinkFound(target)
		def handle_endtag(self, tag):
			"""Undo the state change made by the matching opening tag."""
			if tag in ('i', 'em'):
				self.inside_italics = False
			elif tag == 'div':
				self.open_divs -= 1
			elif tag == 'table':
				self.inside_table = False
		def handle_data(self, data):
			"""Track the parenthesis balance of the text seen so far."""
			self.open_parens += data.count('(') - data.count(')')
	from walk import Walker
	from urllib.parse import unquote

	def check_endpoint(go_from, endpoint):
		"""Create a Walker that stops at ``endpoint``, run it from ``go_from`` and return it."""
		w = Walker(endpoint)
		w.start(go_from)
		return w

	def quote(n): #make page title fit for printing
		"""Percent-decode ``n``, turn underscores into spaces and wrap the result in double quotes."""
		return '"'+unquote(n).replace('_', ' ')+'"'

	if __name__ == '__main__':
		import sys
		# start page: first command line argument if present and non-empty, else a random article
		g = sys.argv[1] if len(sys.argv) > 1 and sys.argv[1] else 'Special:Random'
		e = 'Philosophy' #change for another end point
		w = check_endpoint(g, e)
		# the start name may be unknown to the walker if its page yielded no link at all
		start_alias = w.aliases.get(g)
		start = start_alias.initial if start_alias is not None else g
		# rebuild the visited chain by following the walker's cache from the start page
		l = [start]
		while len(l) <= len(w.cache):
			key = l[-1]
			if key not in w.cache:
				# the link name may be an alias of the name the cache is keyed by
				alias = w.aliases.get(key)
				key = next((k for k in (alias or ()) if k in w.cache), None)
				if key is None:
					break
			l.append(w.cache[key])
		print('I', 'could' if e in w else 'could not', 'reach', quote(e), 'from', quote(start), 'in', len(w.cache), 'steps')
		print(' -> '.join(quote(x) for x in l))
	from parser import *
	from aliasdict import *
	from urllib.request import urlopen, Request

	class Walker(object):
		"""Follows links from one Wikipedia page to the next.
		``cache`` maps each visited page name to the link name found on it and
		``aliases`` records which names refer to the same page. Walking stops
		when a page was already seen, when ``stop_at`` becomes known, or when a
		page yields no acceptable link."""
		def __init__(self, stop_at=None, lang='en'):
			self.lang = lang
			self.build_url()
			self.cache = {}
			self.stop_at = stop_at
			self.aliases = AliasDict()
		def start(self, url='Special:Random'):
			"""Keep walking from ``url`` until walk_from returns nothing to follow."""
			while url:
				url = self.walk_from(url)
		def build_url(self):
			"""Set ``built_url`` to the article URL prefix for ``self.lang``."""
			self.built_url = 'http://' + self.lang + '.wikipedia.org/wiki/'
		def _title_from_url(self, final_url):
			"""Return the page name part of ``final_url`` (the URL after redirects)."""
			if final_url.startswith(self.built_url):
				return final_url[len(self.built_url):]
			# the scheme or host changed during redirection (e.g. http -> https),
			# so slicing by the prefix length would cut at the wrong place
			return final_url.split('/wiki/', 1)[-1]
		def walk_from(self, url):
			"""Fetch page ``url`` and return the name of the next page to visit.
			Returns None if ``url`` is already known, if no link is found, or if
			``stop_at`` is known after registering this page."""
			if url in self.aliases:
				return None
			wpp = WikipediaParser()
			request = Request(self.built_url + url, headers={'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'}) #It doesn't like Python's default user agent, so I spoof it.
			# the with block closes the connection once the body has been read
			with urlopen(request) as resp:
				final_url = resp.geturl()
				try:
					text = resp.read().decode('utf-8') #Assuming UTF-8. Awful, I know.
				except Exception as e:
					print('Ignored', e, 'for', url) #for some reason, it receives garbage on some articles
					#if you know why this is, please let me know
					text = ''
			try:
				wpp.feed(text)
			except LinkFound as e:
				n_url = self._title_from_url(final_url)
				self.aliases.add(n_url, url)
				if self.stop_at in self.aliases:
					return None
				self.cache[n_url] = e.linkname.split('#',1)[0]
				return self.cache[n_url]
			return None
		def __contains__(self, s):
			"""True if ``s`` is a page name the walker has registered."""
			return s in self.aliases