OzTamir · August 29, 2015 14:08 · OzTamir · Nov 4, 2014
diff --git a/wikigame.py b/wikigame.py
 import urllib2
 import sys
 from bs4 import BeautifulSoup
 import re

 # Base URL that an article title is appended to when building a page address
 wiki_endpoint = "http://en.wikipedia.org/wiki/"
 # Matches an anchor tag whose href points at /wiki/<title>.
 # NOTE(review): [^:"] only constrains the first character of the title, so a
 # namespaced target such as /wiki/File:... still matches - confirm intended.
 link_patt = re.compile(r'<a href="/wiki/[^:"].*?".*?>')
 # Running total of articles fetched during the walk (incremented by nextPage)
 counter = 0
 # Titles reached so far; nextPage consults it to detect a loop
 visited = list()
 def remove_parentheses(data):
 	''' Return the first wiki link in `data` that is not inside parentheses.

 	`data` is an HTML fragment (str). It is split on 'href="', so every
 	candidate is the text running from one link target up to the next link.
 	The value returned is that raw chunk, starting with '/wiki/'; the caller
 	is expected to cut it at the closing quote.

 	Links whose target contains 'File:' are skipped.

 	Raises IndexError when no suitable link exists (including the case where
 	the parentheses never balance, which used to yield an implicit None).
 	'''
 	chunks = data.split('href="')
 	# Number of '(' minus number of ')' in all text before the current chunk
 	depth = chunks[0].count('(') - chunks[0].count(')')
 	for chunk in chunks[1:]:
 		target = chunk.split('"')[0]
 		# A link is usable only when every parenthesis seen so far is closed
 		# and it points at an article rather than a file page
 		if depth == 0 and target.startswith('/wiki/') and 'File:' not in target:
 			return chunk
 		# The text following this link may open or close parentheses
 		depth += chunk.count('(') - chunk.count(')')
 	raise IndexError('no wiki link outside parentheses was found')

 def get_next(url):
 	''' Fetch the article at `url` and return the title of its first link.

 	The first direct child <p> of the element with id 'mw-content-text' that
 	contains a wiki link is used; if there is none, the first <li> with a
 	wiki link inside a direct child <ul>/<ol> is used instead.

 	Returns the link target with the leading '/wiki/' removed, any
 	'#fragment' dropped and percent-escapes decoded, so that next_url can
 	quote it again without encoding it twice.

 	Errors from the fetch and from remove_parentheses propagate to the caller.
 	'''
 	# Get the HTML, closing the connection even if the read fails
 	response = urllib2.urlopen(url)
 	try:
 		html = response.read()
 	finally:
 		response.close()
 	soup = BeautifulSoup(html)
 	content = soup.find(id='mw-content-text').contents
 	# Prefer the first top-level paragraph that contains a wiki link
 	paragraph = ''
 	for node in content:
 		if node.name == u'p' and re.search(link_patt, str(node)):
 			paragraph = str(node)
 			break
 	# Sometimes we have to deal with a 'may refer to' page, which only has links in list items
 	if paragraph == '':
 		# Flatten all list items first so one break stops at the very first
 		# match instead of letting a later list overwrite it
 		items = [item
 			for lst in content if lst.name in [u'ul', u'ol']
 			for item in lst.contents if item.name == 'li']
 		for item in items:
 			if re.search(link_patt, str(item)):
 				paragraph = str(item)
 				break
 	target = remove_parentheses(paragraph).split('"')[0]
 	# Strip the '/wiki/' prefix rather than taking the last path segment, so
 	# titles that themselves contain '/' are kept whole
 	link = target[len('/wiki/'):].split('#')[0]
 	return urllib2.unquote(link)

 def next_url(title):
 	''' Build the article URL for `title`: percent-quote it, then turn encoded spaces into underscores '''
 	quoted = urllib2.quote(title)
 	slug = quoted.replace('%20', '_')
 	return wiki_endpoint + slug

 def nextPage(name):
 	''' Follow first links from the article `name` until Philosophy or a loop.

 	For every article fetched, the global `counter` is incremented, the
 	title of the next article is printed and appended to `visited`. The walk
 	stops when the current title is "Philosophy" or when the next title is
 	already in `visited`. Returns None.

 	Implemented as a loop rather than recursion so a long chain cannot hit
 	the interpreter's recursion limit.
 	'''
 	global counter
 	# Record the starting article as well (in the underscore form used by
 	# link titles), so a chain leading back to it is caught immediately
 	start = name.replace(' ', '_')
 	if start not in visited:
 		visited.append(start)
 	while name != "Philosophy":
 		counter += 1
 		# Format the current title to a URL
 		url = next_url(name)
 		# Get the title of the next article
 		title = get_next(url)
 		# Log to the user
 		print("{0} reached.".format(title.replace('_', ' ')))
 		# Check if we were here already and declare a loop if we were
 		if title in visited:
 			print('Stuck in a loop, terminating.')
 			return
 		# Add it to the list of visited articles and continue from there
 		visited.append(title)
 		name = title


 def main():
 	''' Get the first article's title from the command line and run the walk.

 	All command-line arguments are joined with spaces to form the title.
 	Prints a usage hint when no argument is given, the number of articles
 	visited on success, and a one-line message on Ctrl-C or on any other
 	exception. Always returns None.
 	'''
 	if len(sys.argv) < 2:
 		print('Please enter a title.')
 		return
 	title = ' '.join(sys.argv[1:])
 	try:
 		nextPage(title)
 		print('Number of articles visited: %s' % str(counter))
 	except KeyboardInterrupt:
 		print('User has stopped.')
 	# 'as' form is accepted by Python 2.6+ and 3; the comma form is 2-only
 	except Exception as e:
 		print('ERROR: {0}'.format(str(e)))

 # Run the walk only when executed as a script, not when imported as a module
 if __name__ == '__main__':
 	main()
	import urllib2
	import sys
	from bs4 import BeautifulSoup
	import re

	# Base URL that an article title is appended to when building a page address
	wiki_endpoint = "http://en.wikipedia.org/wiki/"
	# Count how many links were visited on the way
	counter = 0
	# Matches an anchor tag whose href points at /wiki/<title>. The lazy
	# '.*?' quantifiers were lost from this copy, leaving a pattern that could
	# only match one- or two-character titles; they are restored here.
	link_patt = re.compile(r'<a href="/wiki/[^:"].*?".*?>')
	# Keep a list of visited articles to detect loops
	visited = []
	def remove_parentheses(data):
		''' Return the first wiki link in `data` that is not inside parentheses.

		`data` is an HTML fragment (str). It is split on 'href="', so every
		candidate is the text running from one link target up to the next link.
		The value returned is that raw chunk, starting with '/wiki/'; the caller
		is expected to cut it at the closing quote.

		Links whose target contains 'File:' are skipped.

		Raises IndexError when no suitable link exists (including the case where
		the parentheses never balance, which used to yield an implicit None).
		'''
		chunks = data.split('href="')
		# Number of '(' minus number of ')' in all text before the current chunk
		depth = chunks[0].count('(') - chunks[0].count(')')
		for chunk in chunks[1:]:
			target = chunk.split('"')[0]
			# A link is usable only when every parenthesis seen so far is closed
			# and it points at an article rather than a file page
			if depth == 0 and target.startswith('/wiki/') and 'File:' not in target:
				return chunk
			# The text following this link may open or close parentheses
			depth += chunk.count('(') - chunk.count(')')
		raise IndexError('no wiki link outside parentheses was found')

	def get_next(url):
		''' Fetch the article at `url` and return the title of its first link.

		The first direct child <p> of the element with id 'mw-content-text' that
		contains a wiki link is used; if there is none, the first <li> with a
		wiki link inside a direct child <ul>/<ol> is used instead.

		Returns the link target with the leading '/wiki/' removed, any
		'#fragment' dropped and percent-escapes decoded, so that next_url can
		quote it again without encoding it twice.

		Errors from the fetch and from remove_parentheses propagate to the caller.
		'''
		# Get the HTML, closing the connection even if the read fails
		response = urllib2.urlopen(url)
		try:
			html = response.read()
		finally:
			response.close()
		soup = BeautifulSoup(html)
		content = soup.find(id='mw-content-text').contents
		# Prefer the first top-level paragraph that contains a wiki link
		paragraph = ''
		for node in content:
			if node.name == u'p' and re.search(link_patt, str(node)):
				paragraph = str(node)
				break
		# Sometimes we have to deal with a 'may refer to' page, which only has links in list items
		if paragraph == '':
			# Flatten all list items first so one break stops at the very first
			# match instead of letting a later list overwrite it
			items = [item
				for lst in content if lst.name in [u'ul', u'ol']
				for item in lst.contents if item.name == 'li']
			for item in items:
				if re.search(link_patt, str(item)):
					paragraph = str(item)
					break
		target = remove_parentheses(paragraph).split('"')[0]
		# Strip the '/wiki/' prefix rather than taking the last path segment, so
		# titles that themselves contain '/' are kept whole
		link = target[len('/wiki/'):].split('#')[0]
		return urllib2.unquote(link)

	def next_url(title):
		''' Build the article URL for `title`: percent-quote it, then turn encoded spaces into underscores '''
		return wiki_endpoint + urllib2.quote(title).replace('%20', '_')

	def nextPage(name):
		''' Follow first links from the article `name` until Philosophy or a loop.

		For every article fetched, the global `counter` is incremented, the
		title of the next article is printed and appended to `visited`. The walk
		stops when the current title is "Philosophy" or when the next title is
		already in `visited`. Returns None.

		Implemented as a loop rather than recursion so a long chain cannot hit
		the interpreter's recursion limit.
		'''
		global counter
		# Record the starting article as well (in the underscore form used by
		# link titles), so a chain leading back to it is caught immediately
		start = name.replace(' ', '_')
		if start not in visited:
			visited.append(start)
		while name != "Philosophy":
			counter += 1
			# Format the current title to a URL
			url = next_url(name)
			# Get the title of the next article
			title = get_next(url)
			# Log to the user
			print("{0} reached.".format(title.replace('_', ' ')))
			# Check if we were here already and declare a loop if we were
			if title in visited:
				print('Stuck in a loop, terminating.')
				return
			# Add it to the list of visited articles and continue from there
			visited.append(title)
			name = title


	def main():
		''' Get the first article's title from the command line and run the walk.

		All command-line arguments are joined with spaces to form the title.
		Prints a usage hint when no argument is given, the number of articles
		visited on success, and a one-line message on Ctrl-C or on any other
		exception. Always returns None.
		'''
		if len(sys.argv) < 2:
			print('Please enter a title.')
			return
		title = ' '.join(sys.argv[1:])
		try:
			nextPage(title)
			print('Number of articles visited: %s' % str(counter))
		except KeyboardInterrupt:
			print('User has stopped.')
		# 'as' form is accepted by Python 2.6+ and 3; the comma form is 2-only
		except Exception as e:
			print('ERROR: {0}'.format(str(e)))

	# Run the walk only when executed as a script, not when imported as a module
	if __name__ == '__main__':
		main()
No results found