Created
January 31, 2012 14:14
-
-
Save anonymous/1710695 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib | |
from BeautifulSoup import BeautifulSoup | |
# Page to scrape. NOTE(review): presumably the FDA device-recall search form,
# judging by the field names in the notes below -- confirm.
URL = 'http://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfRES/res.cfm'

# Open the page. The handle is named `web_cnx` (lowercase) so that it matches
# the read() call; binding it as `Web_cnx` made the read() raise NameError.
web_cnx = urllib.urlopen(URL)
try:
    html = web_cnx.read()
finally:
    # Release the connection even if the read fails.
    web_cnx.close()

# Parse the downloaded HTML so its elements can be searched.
soup = BeautifulSoup(html)
### Below are just the page elements, written as placeholder pseudo-SQL. I have no idea how to proceed: | |
FIND input name = 'postdatefrom' | |
FIND input name = 'postdateto' | |
(NEED to enter 2-month increments beginning Nov. 2002 BUT months range 28-31 days, so…?) | |
FIND name = 'pagenum' | |
ENTER '500' | |
FIND name = 'search' | |
PUSH SEARCH | |
###ON next page: | |
links = soup.find(href WHERE color = 23238e) | |
for link in links[1:]: | |
data = link.findAll('href') | |
link = data[0].text | |
print '\t'.join([link]) | |
### NOW, once all links have been obtained, PUSH each link. Within each link: | |
FIND table WHERE cellpadding='2' AND cellspacing='14' | |
Input = soup.find('th') | |
Response = soup.find('td') | |
th = 'Date Posted' ### Do I add underscore when I define those with multiple words? | |
th = 'RecallNumber' | |
th = 'Product' | |
th = 'Code Information' | |
th = 'recalling firm/' <br> 'Manufacturer' | |
th = 'Consumer Instructions' | |
th = 'For Additional Information Contact' | |
th = 'reason for' <br> 'Recall' | |
th = 'Action' | |
th = 'Quantity in Commerce' | |
th = 'Distribution' | |
print '\t'.join([……..]) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment