Using Python and BeautifulSoup, decipher untold connections to Kevin Bacon with this mesmerizing Wikipedia crawler.
# Adapted from the example in Ch. 3 of "Web Scraping with Python, Second Edition" by Ryan Mitchell
# Make a tax-deductible donation to the Wikimedia Foundation at https://wikimediafoundation.org/wiki/Ways_to_Give
# Takeaway from this program: recursion is at the heart of web crawling. Crawlers retrieve the page contents
# for a URL, examine that page for another URL, and retrieve that page, ad infinitum.

import re
import random
import requests
from bs4 import BeautifulSoup
from datetime import datetime as dt

# Match internal article links: paths starting with /wiki/ that contain no ":",
# which filters out namespace pages such as Category:, File:, and Talk:
pattern = re.compile("^(/wiki/)((?!:).)*$")

# Seed the random number generator with the current system time to ensure a new path through articles
# (seeding with a datetime object directly is deprecated in recent Python versions, so use the timestamp)
random.seed(dt.now().timestamp())

def get_links(article_url):
    html = requests.get(f"https://en.wikipedia.org{article_url}").text  # f-strings require Python 3.6+
    soup = BeautifulSoup(html, "html.parser")
    return soup.find("div", {"id": "bodyContent"}).find_all("a", href=pattern)  # returns a list of link tags

links = get_links("/wiki/Kevin_Bacon")
while len(links) > 0:
    new_article = links[random.randint(0, len(links) - 1)].attrs["href"]
    print(new_article)
    links = get_links(new_article)
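
The comments above describe crawling as inherently recursive, while the script itself uses an iterative while loop. As a minimal sketch of the recursive form (reusing get_links from the script, with a hypothetical max_depth parameter added so the recursion terminates), the same random walk could be written as:

import random

def random_walk(article_url, max_depth=10):
    """Recursively follow one random /wiki/ link per page, up to max_depth hops.
    Assumes get_links() from the script above; max_depth is an illustrative bound."""
    if max_depth == 0:
        return
    links = get_links(article_url)
    if not links:
        return
    next_article = random.choice(links).attrs["href"]
    print(next_article)
    random_walk(next_article, max_depth - 1)

random_walk("/wiki/Kevin_Bacon")

Bounding the depth also keeps a long chain of articles from hitting Python's recursion limit, which is why the iterative loop is the more practical form for an open-ended crawl.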