Created
December 11, 2012 13:23
-
-
Save beauvais/4258514 to your computer and use it in GitHub Desktop.
extracting and soupifying
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests
from sys import argv
from bs4 import BeautifulSoup

# CLI contract: exactly one argument, the first URI this script crawls.
# argv[0] is the script name; a missing/extra argument raises ValueError.
script, landing = argv  # landing is the first URI for this script
def extractor(landing):  # extractor uses requests to GET pages
    """GET *landing* and return its body parsed as a BeautifulSoup.

    Prints the HTTP status code first so the user can confirm the
    server is responding (200 is good).

    :param landing: URL string to fetch.
    :returns: BeautifulSoup of the response text (previously the soup
        was built and then discarded, so linksearch could never use it).
    """
    r = requests.get(landing)
    # print() keeps this consistent with linksearch (Python 3 syntax);
    # the original Python 2 `print response` is a SyntaxError on 3.x.
    print(r.status_code)  # Making sure the server's responding (200 is good)
    # Name the parser explicitly: bs4 otherwise guesses the best
    # installed parser, which warns and can vary between machines.
    return BeautifulSoup(r.text, "html.parser")
def linksearch(soup):  # Looking for links in the soup from extractor
    """Print the href attribute of every anchor tag in *soup*.

    :param soup: a BeautifulSoup (anything with find_all('a')).
    """
    anchors = soup.find_all('a')
    for anchor in anchors:
        href = anchor.get('href')
        print(href)  # Shows the links
# The idea is to feed the items found on a landing page into
# BeautifulSoup, and output them flexibly (e.g. into a separate
# file for each page, sometimes prettified).
# Need a way to take each link, and run it through extractor. | |
# A link should only be extractorified if it is from the landing | |
# domain (so if 'href' is "http" and contains the domain from argv)? | |
# This would crawl through the site, ignoring links to other domains | |
# for extraction. | |
# r = requests.get(landing) | |
# response = r.status_code | |
# c = r.text | |
# soup = BeautifulSoup(c) | |
# pretty = soup.prettify() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment