RobbieClarken · September 10, 2016 08:33
diff --git a/wikicrawler.py b/wikicrawler.py
 from urllib.request import urlopen
 from urllib.parse import urljoin, unquote_plus
 import re
 from random import choice

 from bs4 import BeautifulSoup


 # Site root; the relative /wiki/... hrefs are joined onto this with urljoin().
 BASE_URL = 'https://en.wikipedia.org/'


 def get_links(page):
    """Return the hrefs of the article links found in a page.

    Args:
        page: HTML markup (a string or an open file-like object) that
            ``BeautifulSoup`` can parse.

    Returns:
        A list of ``/wiki/...`` href strings taken from the ``<a>`` tags
        inside the ``<div id="bodyContent">`` element, or an empty list
        when the page has no such element.
    """
    soup = BeautifulSoup(page, 'html.parser')
    content = soup.find('div', {'id': 'bodyContent'})
    if content is None:
        # find() returns None when nothing matches; report "no links"
        # instead of failing with AttributeError on content.find_all.
        return []
    # Keep only hrefs of the form /wiki/<name> that contain no colon
    # (this skips links such as /wiki/File:... or /wiki/Help:...).
    link_regex = re.compile(r'^/wiki/[^:]*$')
    return [link.attrs['href'] for link in content.find_all('a', href=link_regex)]


 if __name__ == '__main__':
    # Random walk: start from one article, print its URL, then hop to a
    # randomly chosen link from that page until a page yields no links.
    links = ['/wiki/Kevin_Bacon']
    while links:
        url = urljoin(BASE_URL, choice(links))
        # Flush so each visited URL shows up immediately, even when piped.
        print(unquote_plus(url), flush=True)
        # Close the HTTP response once it has been parsed rather than
        # leaking one open connection per visited page.
        with urlopen(url) as response:
            links = get_links(response)
	from urllib.request import urlopen
	from urllib.parse import urljoin, unquote_plus
	import re
	from random import choice

	from bs4 import BeautifulSoup


	# Site root; the relative /wiki/... hrefs are joined onto this with urljoin().
	BASE_URL = 'https://en.wikipedia.org/'


	def get_links(page):
	    """Return the hrefs of the article links found in a page.

	    Args:
	        page: HTML markup (a string or an open file-like object) that
	            ``BeautifulSoup`` can parse.

	    Returns:
	        A list of ``/wiki/...`` href strings taken from the ``<a>`` tags
	        inside the ``<div id="bodyContent">`` element, or an empty list
	        when the page has no such element.
	    """
	    soup = BeautifulSoup(page, 'html.parser')
	    content = soup.find('div', {'id': 'bodyContent'})
	    if content is None:
	        # find() returns None when nothing matches; report "no links"
	        # instead of failing with AttributeError on content.find_all.
	        return []
	    # Keep only hrefs of the form /wiki/<name> that contain no colon
	    # (this skips links such as /wiki/File:... or /wiki/Help:...).
	    link_regex = re.compile(r'^/wiki/[^:]*$')
	    return [link.attrs['href'] for link in content.find_all('a', href=link_regex)]


	if __name__ == '__main__':
	    # Random walk: start from one article, print its URL, then hop to a
	    # randomly chosen link from that page until a page yields no links.
	    links = ['/wiki/Kevin_Bacon']
	    while links:
	        url = urljoin(BASE_URL, choice(links))
	        # Flush so each visited URL shows up immediately, even when piped.
	        print(unquote_plus(url), flush=True)
	        # Close the HTTP response once it has been parsed rather than
	        # leaking one open connection per visited page.
	        with urlopen(url) as response:
	            links = get_links(response)