Skip to content

Instantly share code, notes, and snippets.

@OzTamir
Last active August 29, 2015 14:08
Show Gist options
  • Select an option

  • Save OzTamir/4fd0ca8ff8ef8783412b to your computer and use it in GitHub Desktop.

Select an option

Save OzTamir/4fd0ca8ff8ef8783412b to your computer and use it in GitHub Desktop.
Wikigame - Get to Philosophy from any given article
import urllib2
import sys
from bs4 import BeautifulSoup
import re
# Base URL that article titles are appended to when building page URLs
wiki_endpoint = "http://en.wikipedia.org/wiki/"
# Number of articles fetched so far (incremented by nextPage)
counter = 0
# Regular expression for an anchor tag whose href (its first attribute) starts
# with /wiki/ followed by a character other than ':' or '"'
link_patt = re.compile(r'<a href="/wiki/[^:"].*?".*?>')
# Titles of the articles already reached, kept to detect loops
visited = []
def remove_parentheses(data):
    ''' Return the first article link in data that is not inside parentheses.

    data is an HTML fragment (a paragraph or a list item). It is split on
    'href="', so every piece after the first one starts with a link target.

    Returns the piece that starts with the chosen '/wiki/...' target; the
    caller is expected to cut the target out at the closing quote.

    Raises IndexError when the fragment holds no eligible link.
    '''
    pieces = data.split('href="')
    # Number of '(' minus number of ')' in everything that precedes the
    # current link target; zero means the link is outside any parentheses.
    depth = pieces[0].count('(') - pieces[0].count(')')
    for piece in pieces[1:]:
        # Only the target itself decides whether this is an article link;
        # the text that follows it must not influence the 'File:' check.
        target = piece.split('"', 1)[0]
        is_article = target.startswith('/wiki/') and 'File:' not in target
        if depth == 0 and is_article:
            return piece
        depth += piece.count('(') - piece.count(')')
    # Fail loudly instead of returning None and crashing in the caller
    raise IndexError('no article link found outside parentheses')
def _first_linked_markup(nodes, names):
    ''' Return the markup of the first node whose tag name is in names and
    which contains an article link, or '' when there is no such node '''
    for node in nodes:
        if node.name in names:
            markup = str(node)
            if re.search(link_patt, markup):
                return markup
    return ''


def get_next(url):
    ''' Get the title of the first eligible link in the article at url.

    The first paragraph that contains an article link is used; when no
    paragraph does (e.g. a 'may refer to' page) the first list item with a
    link is used instead.

    Returns the linked article's title, percent-decoded and without any
    '#section' fragment, ready to be passed to next_url.

    Raises ValueError when the page has no 'mw-content-text' element, and
    propagates whatever urllib2.urlopen and remove_parentheses raise.
    '''
    # Get the HTML, making sure the connection is released
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()
    # Name the parser so the result does not depend on what is installed
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find(id='mw-content-text')
    if content is None:
        raise ValueError('No article content found at {0}'.format(url))
    # Newer MediaWiki output nests the article body in a 'mw-parser-output'
    # div; use the container itself when that wrapper is absent
    body = content.find('div', attrs={'class': 'mw-parser-output'},
                        recursive=False)
    if body is None:
        body = content
    # Get the first paragraph with an article link
    paragraph = _first_linked_markup(body.contents, (u'p',))
    # Sometimes we have to deal with a 'may refer to' page, which only has
    # links in list items
    if paragraph == '':
        for lst in body.contents:
            if lst.name in (u'ul', u'ol'):
                paragraph = _first_linked_markup(lst.contents, (u'li',))
                # Stop at the first list that yields a link so that later
                # lists cannot overwrite it
                if paragraph != '':
                    break
    href = remove_parentheses(paragraph).split('"')[0]
    # A '#section' fragment is not part of the title
    href = href.split('#')[0]
    # Strip only the '/wiki/' prefix: titles may themselves contain '/'
    title = href[len('/wiki/'):]
    # hrefs are percent-encoded; decode so next_url does not encode twice
    return urllib2.unquote(title)
def next_url(title):
    ''' Build the article URL for a title: percent-encode it, turn encoded
    spaces into underscores and append the result to the wiki endpoint '''
    encoded = urllib2.quote(title)
    slug = encoded.replace('%20', '_')
    return wiki_endpoint + slug
def nextPage(name):
    ''' Follow first links from the given article until Philosophy is reached.

    Every fetched article increments the global counter and is logged to the
    user. The walk stops when the title is "Philosophy" or when an article
    that was already reached shows up again (a loop).
    '''
    global counter
    if name == "Philosophy":
        return
    # Record the starting article too, so that a chain leading back to it is
    # reported on the first return instead of one full cycle later.
    # Link titles use underscores, so the typed title is normalised the same
    # way (best effort: capitalisation is not adjusted).
    start = name.replace(' ', '_')
    if start not in visited:
        visited.append(start)
    # Iterate instead of recursing so long chains do not grow the call stack
    while True:
        counter += 1
        # Format the current title to a URL
        url = next_url(name)
        # Get the title of the next article
        title = get_next(url)
        # Log to the user
        print("{0} reached.".format(title.replace('_', ' ')))
        # Check if we were here already and declare a loop if we were
        if title in visited:
            print('Stuck in a loop, terminating.')
            return
        # Add it to the list of visited articles
        visited.append(title)
        if title == "Philosophy":
            return
        name = title
def main():
    ''' Read the starting article's title from the command line, walk the
    links from it and report how many articles were visited '''
    words = sys.argv[1:]
    if not words:
        print('Please enter a title.')
        return
    # A title may be typed as several words without quoting
    title = ' '.join(words)
    try:
        nextPage(title)
        print('Number of articles visited: %s' % str(counter))
    except KeyboardInterrupt:
        print('User has stopped.')
    except Exception as e:
        # Top-level boundary: report any failure instead of a traceback
        print('ERROR: {0}'.format(str(e)))


if __name__ == '__main__':
    main()
@OzTamir
Copy link
Author

OzTamir commented Nov 4, 2014

For a less reliable version that uses only pure Python, see:
https://gist.github.com/OzTamir/4fd0ca8ff8ef8783412b/3b01e3d317a569235e909e7fca96ade50027c34f

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment