Last active
August 29, 2015 14:08
-
-
Save OzTamir/4fd0ca8ff8ef8783412b to your computer and use it in GitHub Desktop.
Wikigame - Get to Philosophy from any given article
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import urllib2 | |
| import sys | |
| from bs4 import BeautifulSoup | |
| import re | |
# Base URL that article titles are appended to (HTTPS, so pages are fetched over TLS)
wiki_endpoint = "https://en.wikipedia.org/wiki/"
# Counts how many articles were fetched on the way
counter = 0
# Regular expression matching an opening <a> tag whose href starts with /wiki/
# (the first character of the title may not be ':' or '"')
link_patt = re.compile(r'<a href="/wiki/[^:"].*?".*?>')
# Titles of the articles visited so far, used to detect loops
visited = []
def remove_parentheses(data):
    ''' Return the first article link in `data` that is not inside parentheses.

    `data` is an HTML fragment. It is split on 'href="', so every piece after
    the first one begins with a link target. The return value is the whole
    piece that begins with the chosen '/wiki/...' target: the target itself,
    followed by whatever markup and text come before the next link.

    Links whose target contains 'File:' are skipped.

    Raises IndexError if no suitable link exists.
    '''
    pieces = data.split('href="')
    # Number of '(' minus number of ')' in everything before the current piece;
    # zero means the link that starts the piece is outside parentheses.
    depth = pieces[0].count('(') - pieces[0].count(')')
    for piece in pieces[1:]:
        # Only the link target itself decides whether this is an article link
        target = piece.split('"', 1)[0]
        if depth == 0 and target.startswith('/wiki/') and 'File:' not in target:
            return piece
        depth += piece.count('(') - piece.count(')')
    raise IndexError('no wiki link found outside parentheses')
def _first_with_link(nodes):
    ''' Return the markup of the first node that contains an article link, or '' '''
    for node in nodes:
        markup = str(node)
        if link_patt.search(markup):
            return markup
    return ''


def get_next(url):
    ''' Get the title of the first article link on the page at `url`.

    The direct child paragraphs of the 'mw-content-text' element are searched
    first. If none of them contains a link, the items of its direct child
    lists are searched instead (for 'may refer to' pages, which only have
    links in list items).

    Returns the last path component of the chosen link's href.
    Raises ValueError if the page has no content element or no article link.
    '''
    # Get the HTML, making sure the connection is closed even if reading fails
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()
    soup = BeautifulSoup(html)
    content = soup.find(id='mw-content-text')
    if content is None:
        raise ValueError('No article content found at {0}'.format(url))
    children = content.contents
    # First choice: the first paragraph that contains a link
    paragraph = _first_with_link(x for x in children if x.name == u'p')
    if paragraph == '':
        # Fall back to list items; stop at the very first match across all lists
        items = (item
                 for lst in children if lst.name in [u'ul', u'ol']
                 for item in lst.contents if item.name == 'li')
        paragraph = _first_with_link(items)
    if paragraph == '':
        raise ValueError('No article link found at {0}'.format(url))
    # Keep only the href value, then only its last path component
    return remove_parentheses(paragraph).split('"')[0].split('/')[-1]
def next_url(title):
    ''' Build the article URL for `title`.

    `title` may be typed by the user (possibly with spaces) or taken from an
    href, in which case it can already contain percent-escapes. It is
    unquoted before being quoted so that existing escapes are not escaped a
    second time ('%C3' must not become '%25C3'). Spaces end up as underscores.
    '''
    return wiki_endpoint + urllib2.quote(urllib2.unquote(title)).replace('%20', '_')
def nextPage(name):
    ''' Follow first links from the given article until Philosophy is reached.

    Increments the global `counter` once per fetched article, prints every
    title that is reached and records it in the global `visited` list.
    Stops when the current article is "Philosophy" or when a title shows up
    a second time (a loop). Returns None.
    '''
    global counter
    current = name
    if current != "Philosophy":
        # Remember the starting article as well, in the same underscore form
        # that link titles use, so that coming back to it counts as a loop
        visited.append(current.replace(' ', '_'))
    while current != "Philosophy":
        counter += 1
        # Format the current title to a URL
        url = next_url(current)
        # Get the title of the next article
        title = get_next(url)
        # Log to the user
        print("{0} reached.".format(title.replace('_', ' ')))
        # Check if we were here already and declare a loop if we were
        if title in visited:
            print('Stuck in a loop, terminating.')
            return
        # Add it to the list of visited articles
        visited.append(title)
        current = title
def main():
    ''' Get the first article's title from the command line and run the search.

    All command line arguments are joined with spaces to form the title.
    Prints a hint and returns if no argument was given. Any error raised
    during the search is reported as a one-line message instead of a traceback.
    '''
    if len(sys.argv) < 2:
        print('Please enter a title.')
        return
    title = ' '.join(sys.argv[1:])
    try:
        nextPage(title)
        print('Number of articles visited: %s' % str(counter))
    except KeyboardInterrupt:
        print('User has stopped.')
    except Exception as e:
        print('ERROR: {0}'.format(str(e)))


if __name__ == '__main__':
    main()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For a less reliable version which only uses pure Python, please go here:
https://gist.github.com/OzTamir/4fd0ca8ff8ef8783412b/3b01e3d317a569235e909e7fca96ade50027c34f