Last active
August 29, 2015 14:08
-
-
Save OzTamir/4fd0ca8ff8ef8783412b to your computer and use it in GitHub Desktop.
Wikigame - Get to Philosophy from any given article
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import urllib2 | |
| import sys | |
# Base URL of English Wikipedia articles; next_url() appends the quoted title to it
wiki_endpoint = "http://en.wikipedia.org/wiki/"
# Count how many links were visited on the way (incremented by nextPage())
counter = 0
# Opening tag of the article content container; get_next() splits the page HTML on it
c_html = '<div id="mw-content-text" lang="en" dir="ltr" class="mw-content-ltr">'
def remove_parentheses(data):
    ''' Return the first article link in `data` that is not inside parentheses.

    `data` is the HTML of a single paragraph. It is split on '<a href="', so
    every piece after the first one begins with a link target. The pieces are
    scanned left to right; as soon as the text preceding a link contains as
    many '(' as ')', the first link from there on whose target starts with
    '/wiki/' and does not contain 'File:' is chosen.

    Returns:
        The piece of `data` that begins with the chosen target, i.e.
        '/wiki/Title"' followed by the remainder of that piece.

    Raises:
        ValueError: if the paragraph holds no such link.
    '''
    pieces = data.split('<a href="')
    lead = pieces[0]
    # Running '(' minus ')' count of the text seen so far; updating it
    # incrementally avoids recounting the whole prefix for every link.
    balance = lead.count('(') - lead.count(')')
    outside = False
    for piece in pieces[1:]:
        # Once the parentheses have balanced we stop counting for good
        if not outside and balance == 0:
            outside = True
        if outside:
            # Only inspect the href itself, not the text that follows it
            target = piece.split('"', 1)[0]
            if target.startswith('/wiki/') and 'File:' not in target:
                return piece
        else:
            balance += piece.count('(') - piece.count(')')
    raise ValueError('No article link found outside parentheses')
def get_next(url):
    ''' Fetch the article at `url` and return the title of its first link.

    The page is cut at the article content marker (`c_html`), leading tables
    are skipped, and the first paragraph that contains a link and starts
    with plain text or a '<b', '<i' or '<a' tag is handed to
    remove_parentheses().

    Returns:
        The last path component of the chosen link target.

    Raises:
        ValueError: if the content marker or a usable paragraph is missing.
        Errors raised by urllib2.urlopen() and remove_parentheses() propagate.
    '''
    # Get the HTML, releasing the connection even when reading fails
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()
    parts = html.split(c_html)
    if len(parts) < 2:
        raise ValueError('Article content not found in {0}'.format(url))
    body = parts[1]
    # For pages with a "biota" infobox keep only the text between the first
    # and the second closing table tag
    if '<table class="infobox biota"' in body:
        body = body.split('</table>')[1]
    # Drop any tables the remaining content starts with
    while body.startswith('<table'):
        body = body.partition('</table>')[2]
    # Find the first paragraph with a link in it
    paragraph = ''
    for chunk in body.split('<p>')[1:]:
        candidate = chunk.split('</p>')[0]
        if '<a href' not in candidate:
            continue
        # candidate is non-empty here because it contains '<a href'
        if candidate[:2] in ('<b', '<i', '<a') or candidate[0] != '<':
            paragraph = candidate
            break
    if not paragraph:
        raise ValueError('No paragraph with a link found in {0}'.format(url))
    # Get the next article's title: the href up to its closing quote,
    # reduced to its last path component
    link = remove_parentheses(paragraph).split('"')[0].split('/')[-1]
    return link
def next_url(title):
    ''' Build the article URL for `title`.

    The title is percent-quoted, every quoted space ('%20') is replaced by
    an underscore, and the result is appended to `wiki_endpoint`.
    '''
    quoted = urllib2.quote(title)
    underscored = quoted.replace('%20', '_')
    return wiki_endpoint + underscored
def nextPage(name):
    ''' Follow first links from the article `name` until "Philosophy" is reached.

    Each visited article increments the module-level `counter`, and the
    title of every article reached is printed. The walk is iterative, so a
    long chain cannot exhaust the interpreter's recursion limit.

    Raises:
        RuntimeError: if an article is reached a second time, which means
            the chain loops and can never arrive at "Philosophy".
        Errors raised by get_next() propagate.
    '''
    global counter
    seen = set()
    while name != "Philosophy":
        if name in seen:
            raise RuntimeError(
                'Loop detected: "{0}" was already visited.'.format(name))
        seen.add(name)
        counter += 1
        # Format the current title to a URL
        url = next_url(name)
        # Get the title of the next article
        name = get_next(url)
        # Log to the user
        print("{0} reached.".format(name.replace('_', ' ')))
def main():
    ''' Get the first article's title from the command line and run the walk.

    All command-line arguments are joined with spaces to form the title.
    When no title is given a usage line is printed and nothing is fetched.
    Any error raised during the walk is reported on one line instead of a
    traceback; Ctrl-C is reported as a user stop.
    '''
    title = ' '.join(sys.argv[1:])
    if not title:
        print('Usage: {0} <article title>'.format(sys.argv[0]))
        return
    try:
        nextPage(title)
        print('Number of articles visited: %s' % str(counter))
    except KeyboardInterrupt:
        print('User has stopped.')
    except Exception as e:
        # Top-level boundary: report the failure to the user
        print('ERROR: {0}'.format(str(e)))


# Run the walk only when executed as a script, not when imported
if __name__ == '__main__':
    main()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For a less reliable version which only uses pure Python, please go here:
https://gist.github.com/OzTamir/4fd0ca8ff8ef8783412b/3b01e3d317a569235e909e7fca96ade50027c34f