Skip to content

Instantly share code, notes, and snippets.

@OzTamir
Last active August 29, 2015 14:08
Show Gist options
  • Select an option

  • Save OzTamir/4fd0ca8ff8ef8783412b to your computer and use it in GitHub Desktop.

Select an option

Save OzTamir/4fd0ca8ff8ef8783412b to your computer and use it in GitHub Desktop.
Wikigame - Get to Philosophy from any given article
import urllib2
import sys
# Base URL that next_url() prepends to an article title
wiki_endpoint = "http://en.wikipedia.org/wiki/"
# Number of articles visited so far; nextPage() increments it and main() reports it
counter = 0
# Opening tag of the article-content <div>; get_next() splits the downloaded
# page on this marker and keeps what follows it
c_html = '<div id="mw-content-text" lang="en" dir="ltr" class="mw-content-ltr">'
def remove_parentheses(data):
    ''' Return the first article link in `data` that is not inside parentheses.

    `data` is the HTML of one paragraph. It is cut at every `<a href="`, so
    each piece after the first begins with a link target. The value returned
    is the whole piece belonging to the chosen link (target first, then the
    rest of the tag and any text up to the next link); the target itself is
    everything before the first double quote.

    A link qualifies when its target starts with `/wiki/`, does not contain
    `File:`, and every "(" that appears before it has a matching ")".

    Raises IndexError when no link qualifies.
    '''
    pieces = data.split('<a href="')
    # Number of "(" minus number of ")" in all the text preceding the current
    # piece; zero means the link is not inside a parenthesised aside.
    balance = 0
    for position, piece in enumerate(pieces):
        # pieces[0] is the text before the first link, never a link itself
        if position > 0 and balance == 0:
            target = piece.split('"', 1)[0]
            if target.startswith('/wiki/') and 'File:' not in target:
                return piece
        balance += piece.count('(') - piece.count(')')
    # IndexError is what the previous list-indexing implementation raised
    raise IndexError('no wiki link outside parentheses was found')
def get_next(url):
    ''' Return the title of the first eligible link on the article at `url`.

    The page is downloaded, everything before the article-content marker
    (`c_html`) is discarded, leading tables are skipped, and the first
    paragraph that contains a link and opens with plain text or with a tag
    starting "<b", "<i" or "<a" is handed to remove_parentheses(). The
    title returned is the last path component of the chosen link's target,
    exactly as it appears in the href.

    Raises IndexError when the content marker or a suitable paragraph is
    missing, in addition to whatever urllib2.urlopen() raises.
    '''
    response = urllib2.urlopen(url)
    try:
        page = response.read()
    finally:
        # Release the connection even if the read fails
        response.close()

    sections = page.split(c_html)
    if len(sections) < 2:
        raise IndexError('article content marker not found in {0}'.format(url))
    body = sections[1]

    # When the page has an "infobox biota" table, skip past the first closing
    # table tag. Everything after it is kept (not just the text up to the
    # next closing tag) so a later table cannot cut the lead paragraph off.
    if '<table class="infobox biota"' in body:
        body = body.partition('</table>')[2]
    # Skip any further tables the remaining text opens with
    while body.startswith('<table'):
        body = body.partition('</table>')[2]

    paragraph = None
    for chunk in body.split('<p>')[1:]:
        candidate = chunk.split('</p>')[0]
        if '<a href' not in candidate:
            continue
        if candidate[:2] in ('<b', '<i', '<a') or not candidate.startswith('<'):
            paragraph = candidate
            break
    if paragraph is None:
        raise IndexError('no paragraph with a link found in {0}'.format(url))

    # Keep only the href value, then only its last path component
    return remove_parentheses(paragraph).split('"')[0].split('/')[-1]
def next_url(title):
    ''' Return the article URL for `title`.

    Spaces become underscores and other unsafe characters are
    percent-encoded before the title is appended to `wiki_endpoint`.
    '''
    # get_next() copies titles verbatim from href attributes, so they may
    # already be percent-encoded; decode first so that quote() does not
    # escape the "%" signs a second time.
    encoded = urllib2.quote(urllib2.unquote(title))
    return wiki_endpoint + encoded.replace('%20', '_')
def nextPage(name):
    ''' Follow first links from the article `name` until Philosophy is reached.

    Each hop increments the module-level `counter` and prints the title of
    the article that was reached. Returns None.

    Raises RuntimeError when an article is reached a second time, because
    the chain would then repeat forever without getting to Philosophy.
    '''
    global counter
    seen = set()
    # Iterating (rather than recursing) keeps long chains clear of the
    # interpreter's recursion limit.
    while name != "Philosophy":
        if name in seen:
            raise RuntimeError(
                'Link loop detected at "{0}".'.format(name.replace('_', ' ')))
        seen.add(name)
        counter += 1
        # Build the article URL and fetch the title of its first link
        name = get_next(next_url(name))
        # Log to the user
        print("{0} reached.".format(name.replace('_', ' ')))
def main():
    ''' Walk from the article named on the command line to Philosophy.

    All command-line arguments are joined with spaces to form the starting
    title. On success the number of visited articles is printed; Ctrl-C and
    any other error are reported as a one-line message instead of a
    traceback.
    '''
    title = ' '.join(sys.argv[1:])
    if not title:
        # Without a title the walk would start from the bare endpoint URL
        print('Usage: {0} <article title>'.format(sys.argv[0]))
        return
    try:
        nextPage(title)
    except KeyboardInterrupt:
        print('User has stopped.')
    except Exception as e:
        # Top-level boundary: report the failure instead of a traceback
        print('ERROR: {0}'.format(str(e)))
    else:
        print('Number of articles visited: %s' % counter)


if __name__ == '__main__':
    main()
@OzTamir
Copy link
Author

OzTamir commented Nov 4, 2014

For a less reliable version that uses only pure Python, please go here:
https://gist.github.com/OzTamir/4fd0ca8ff8ef8783412b/3b01e3d317a569235e909e7fca96ade50027c34f

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment