Created
December 11, 2012 13:23
-
-
Save beauvais/4258514 to your computer and use it in GitHub Desktop.
extracting and soupifying
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests
from sys import argv
from bs4 import BeautifulSoup

# CLI contract: exactly one argument, the first URI this script crawls.
# argv[0] is the script name; a missing/extra argument raises ValueError.
script, landing = argv  # landing is the first URI for this script
def extractor(landing):  # extractor uses requests to GET pages
    """GET *landing* and return its body parsed as a BeautifulSoup.

    Prints the HTTP status code first so the user can confirm the
    server is responding (200 is good).

    :param landing: URL string to fetch.
    :returns: BeautifulSoup of the response text (previously the soup
        was built and then discarded, so linksearch could never use it).
    """
    r = requests.get(landing)
    # print() keeps this consistent with linksearch (Python 3 syntax);
    # the original Python 2 `print response` is a SyntaxError on 3.x.
    print(r.status_code)  # Making sure the server's responding (200 is good)
    # Name the parser explicitly: bs4 otherwise guesses the best
    # installed parser, which warns and can vary between machines.
    return BeautifulSoup(r.text, "html.parser")
def linksearch(soup):  # Looking for links in the soup from extractor
    """Print the href attribute of every anchor tag in *soup*.

    :param soup: a BeautifulSoup (anything with find_all('a')).
    """
    anchors = soup.find_all('a')
    for anchor in anchors:
        href = anchor.get('href')
        print(href)  # Shows the links
# The idea is to feed the items found on a landing page into
# BeautifulSoup, and output them flexibly (e.g. into a separate
# file for each page, sometimes prettified).
# Need a way to take each link, and run it through extractor. | |
# A link should only be extractorified if it is from the landing | |
# domain (so if 'href' is "http" and contains the domain from argv)? | |
# This would crawl through the site, ignoring links to other domains | |
# for extraction. | |
# r = requests.get(landing) | |
# response = r.status_code | |
# c = r.text | |
# soup = BeautifulSoup(c) | |
# pretty = soup.prettify() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment