Using Python and BeautifulSoup, decipher untold connections to Kevin Bacon with this mesmerizing Wikipedia crawler.
# Adapted from the example in Ch. 3 of "Web Scraping with Python, Second Edition" by Ryan Mitchell
# Make a tax-deductible donation to the Wikimedia Foundation at https://wikimediafoundation.org/wiki/Ways_to_Give
# Takeaway from this program: recursion is at the heart of web crawling. Crawlers retrieve the page contents
# for a URL, examine that page for another URL, and retrieve that page, ad infinitum.

import re
import random
import requests
from bs4 import BeautifulSoup
from datetime import datetime as dt

# Match internal article links: paths starting with /wiki/ that contain no ":",
# which filters out namespace pages such as Category:, File:, and Talk:
pattern = re.compile("^(/wiki/)((?!:).)*$")

# Seed the random number generator with the current system time to ensure a new path through articles
# (seeding with a datetime object directly is deprecated in recent Python versions, so use the timestamp)
random.seed(dt.now().timestamp())

def get_links(article_url):
    html = requests.get(f"https://en.wikipedia.org{article_url}").text  # f-strings require Python 3.6+
    soup = BeautifulSoup(html, "html.parser")
    return soup.find("div", {"id": "bodyContent"}).find_all("a", href=pattern)  # returns a list of link tags

links = get_links("/wiki/Kevin_Bacon")
while len(links) > 0:
    new_article = links[random.randint(0, len(links) - 1)].attrs["href"]
    print(new_article)
    links = get_links(new_article)
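
The comments above describe crawling as inherently recursive, while the script itself uses an iterative while loop. As a minimal sketch of the recursive form (reusing get_links from the script, with a hypothetical max_depth parameter added so the recursion terminates), the same random walk could be written as:

import random

def random_walk(article_url, max_depth=10):
    """Recursively follow one random /wiki/ link per page, up to max_depth hops.
    Assumes get_links() from the script above; max_depth is an illustrative bound."""
    if max_depth == 0:
        return
    links = get_links(article_url)
    if not links:
        return
    next_article = random.choice(links).attrs["href"]
    print(next_article)
    random_walk(next_article, max_depth - 1)

random_walk("/wiki/Kevin_Bacon")

Bounding the depth also keeps a long chain of articles from hitting Python's recursion limit, which is why the iterative loop is the more practical form for an open-ended crawl.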