Skip to content

Instantly share code, notes, and snippets.

@RobbieClarken
Last active September 10, 2016 08:33
Show Gist options
  • Save RobbieClarken/be321a2c6b20780603783f61e147917e to your computer and use it in GitHub Desktop.
Save RobbieClarken/be321a2c6b20780603783f61e147917e to your computer and use it in GitHub Desktop.
from urllib.request import urlopen
from urllib.parse import urljoin, unquote_plus
import re
from random import choice
from bs4 import BeautifulSoup
# Root URL that site-relative hrefs (e.g. '/wiki/Kevin_Bacon') are joined
# onto with urljoin before being fetched.
BASE_URL = 'https://en.wikipedia.org/'
def get_links(page):
    """Return the hrefs of the '/wiki/...' links inside a page's body content.

    Args:
        page: Markup handed straight to BeautifulSoup. The script below passes
            the response object returned by ``urlopen``.

    Returns:
        A list of href strings, taken from every ``<a>`` inside the
        ``<div id="bodyContent">`` element whose href starts with ``/wiki/``
        and contains no colon. The list is empty when the page has no such
        div or no matching links.
    """
    soup = BeautifulSoup(page, 'html.parser')
    content = soup.find('div', {'id': 'bodyContent'})
    if content is None:
        # find() yields None when the div is absent; report "no links"
        # instead of failing with AttributeError on content.find_all.
        return []
    # Hrefs containing ':' (e.g. '/wiki/File:...') are rejected by [^:]*.
    link_regex = re.compile(r'^/wiki/[^:]*$')
    return [link.attrs['href'] for link in content.find_all('a', href=link_regex)]
if __name__ == '__main__':
    # Random walk over pages: start from a fixed article, print the URL being
    # visited, then jump to a randomly chosen link found on that page. The
    # loop ends only when a page yields no matching links.
    links = ['/wiki/Kevin_Bacon']
    while links:
        url = urljoin(BASE_URL, choice(links))
        # Decode percent-escapes so the printed URL is human-readable; flush
        # so each hop shows up immediately even when stdout is piped.
        print(unquote_plus(url), flush=True)
        # Close the HTTP response as soon as it has been parsed; the loop may
        # run indefinitely, so unclosed responses would pile up.
        with urlopen(url) as response:
            links = get_links(response)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment