Created
March 29, 2018 01:54
-
-
Save batemapf/8f424370776ff555ffb6cfd841867059 to your computer and use it in GitHub Desktop.
Link Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Link Scraper: fetch a web page and print the `href` of every <a> tag.
import requests
from bs4 import BeautifulSoup

# Set the variable `url` to a URL of your choice.
url = 'https://www.crummy.com/software/BeautifulSoup/bs4/doc/'

# Send a request to the URL and save what we get back in the variable `response`.
response = requests.get(url)
# Fail fast on HTTP errors (4xx/5xx) rather than silently scraping an error page.
response.raise_for_status()

# Get the raw text of the response. This will be a whole bunch of HTML.
raw_text = response.text

# Make some soup! That is, create an instance of the BeautifulSoup class and
# feed it the raw text of the response. Name the parser explicitly: omitting it
# raises a GuessedAtParserWarning and can pick different parsers on different
# machines, producing inconsistent results.
soup = BeautifulSoup(raw_text, 'html.parser')

# Now collect all of the link addresses in the page, which is the value
# contained in the `href` attribute of each `<a>` tag.
#
# `find_all('a')` returns a list of every <a> tag on the page.
# `tag.get('href')` returns None (instead of raising KeyError) when an anchor
# has no href attribute, so those anchors are filtered out.
hrefs = [
    tag.get('href')
    for tag in soup.find_all('a')
    if tag.get('href') is not None
]

# Check out your work! Guard against an empty result so the index lookups
# below can't raise IndexError.
print(len(hrefs))
if hrefs:
    print(hrefs[-1])
    print(hrefs[0])
for h in hrefs:
    if h.count('/') > 1:
        print(h)
# Your result should look something like this:
"""
347
http://sphinx-doc.org/
genindex.html
http://www.crummy.com/software/BeautifulSoup/
http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
http://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
http://kondou.com/BS4/
http://coreapython.hosting.paran.com/etc/beautifulsoup4.html
https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup
http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
http://www.crummy.com/software/BeautifulSoup/download/4.x/
http://lxml.de/
http://code.google.com/p/html5lib/
http://example.com/elsie
http://example.com/lacie
http://www.w3.org/TR/html5/syntax.html#syntax
http://wiki.python.org/moin/PrintFails
http://lxml.de/
http://pypi.python.org/pypi/cchardet/
http://www.crummy.com/software/BeautifulSoup/bs3/download/3.x/BeautifulSoup-3.2.0.tar.gz
http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
http://www.python.org/dev/peps/pep-0008/
http://sphinx-doc.org/
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment