Created
March 29, 2018 01:54
-
-
Save batemapf/8f424370776ff555ffb6cfd841867059 to your computer and use it in GitHub Desktop.
Link Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Link Scraper: fetch a web page and print the `href` of every <a> tag.
import requests
from bs4 import BeautifulSoup

# Set the variable `url` to a URL of your choice.
url = 'https://www.crummy.com/software/BeautifulSoup/bs4/doc/'

# Send a request to the URL and save what we get back in the variable `response`.
response = requests.get(url)
# Fail fast on HTTP errors (4xx/5xx) rather than silently scraping an error page.
response.raise_for_status()

# Get the raw text of the response. This will be a whole bunch of HTML.
raw_text = response.text

# Make some soup! That is, create an instance of the BeautifulSoup class and
# feed it the raw text of the response. Name the parser explicitly: omitting it
# raises a GuessedAtParserWarning and can pick different parsers on different
# machines, producing inconsistent results.
soup = BeautifulSoup(raw_text, 'html.parser')

# Now collect all of the link addresses in the page, which is the value
# contained in the `href` attribute of each `<a>` tag.
#
# `find_all('a')` returns a list of every <a> tag on the page.
# `tag.get('href')` returns None (instead of raising KeyError) when an anchor
# has no href attribute, so those anchors are filtered out.
hrefs = [
    tag.get('href')
    for tag in soup.find_all('a')
    if tag.get('href') is not None
]

# Check out your work! Guard against an empty result so the index lookups
# below can't raise IndexError.
print(len(hrefs))
if hrefs:
    print(hrefs[-1])
    print(hrefs[0])
for h in hrefs:
    if h.count('/') > 1:
        print(h)
# Your result should look something like this:
"""
347
http://sphinx-doc.org/
genindex.html
http://www.crummy.com/software/BeautifulSoup/
http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
http://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
http://kondou.com/BS4/
http://coreapython.hosting.paran.com/etc/beautifulsoup4.html
https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup
http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
http://www.crummy.com/software/BeautifulSoup/download/4.x/
http://lxml.de/
http://code.google.com/p/html5lib/
http://example.com/elsie
http://example.com/lacie
http://www.w3.org/TR/html5/syntax.html#syntax
http://wiki.python.org/moin/PrintFails
http://lxml.de/
http://pypi.python.org/pypi/cchardet/
http://www.crummy.com/software/BeautifulSoup/bs3/download/3.x/BeautifulSoup-3.2.0.tar.gz
http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html
http://www.python.org/dev/peps/pep-0008/
http://sphinx-doc.org/
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment