Last active
September 10, 2017 21:12
-
-
Save dlrobertson/7774cd757f912b052d86cffb534fd6c1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Skeleton of a simple web crawler: fetch a page, collect every link on it,
# and queue those links so they are fetched in turn.
#
# Every `...` below a TODO is a placeholder that must be replaced with real
# code before the script does anything useful.

# Used to transform relative links to absolute links
# and should already be installed (standard library)
from urllib.parse import urljoin

# Must install requests first
import requests
# Must install beautifulsoup4 first
from bs4 import BeautifulSoup

# Seconds to wait for a server before giving up. Without a timeout a single
# unresponsive host would block the crawl forever; when the limit is hit
# requests raises requests.exceptions.Timeout.
REQUEST_TIMEOUT_SECONDS = 10

# TODO: Initial population of sites to visit
to_visit = ...

# TODO: After we visit a site we need to add it to our history.
#       When we start this will be empty, but as we go this will be
#       populated
history = ...

# TODO: As long as there are still sites to visit, keep scraping
while ...:
    # TODO: Take the next site off the list of sites we're visiting
    next_url = ...

    # Send an HTTP GET request to the url and return the webpage
    req = requests.get(next_url, timeout=REQUEST_TIMEOUT_SECONDS)

    # TODO: Append the url to our history here
    ...

    # TODO: Ensure the status code is OK (200)
    if ...:
        # Parse the html into something we can work with. The parser is
        # named explicitly so the result does not depend on which optional
        # parsers happen to be installed (and bs4 does not warn about
        # having to guess one).
        soup = BeautifulSoup(req.text, "html.parser")

        # TODO: Do something more interesting than
        #       printing the url of the site we just
        #       fetched data from here
        print(next_url)

        # Find all the links or "a" tags in the html that carry an href
        # e.g. <a href="https://www.google.com">Google</a>
        for link in soup.find_all("a", href=True):
            # Snag the link from the link tag
            href = link["href"]

            # The link may be a relative link (e.g. /).
            # So we urljoin this site (the variable next_url)
            # and the new link (the variable href)
            url = urljoin(next_url, href)

            # TODO: Add the variable url to the to_visit
            #       variable here
            ...
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment