Created
July 15, 2013 03:16
-
-
Save seanherron/5997278 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import hashlib
import md5
import string

import lxml.etree
import lxml.html
import requests
import scraperwiki
# Letters of the browse index: the scraping loop requests one drug listing
# page per uppercase ASCII letter (passed as the DrugInitial query parameter).
index_categories = string.ascii_uppercase
def DataTableScrape(descriptor, keyword, root=None):
    """Return the values from the table cell that follows the *keyword* label.

    The page is searched for the first ``<td>`` whose text contains
    *keyword*; the text of the ``<td>`` immediately after it is cleaned of
    bullet characters and layout whitespace and split on ``;``.

    Args:
        descriptor: Unused. The original implementation overwrote it before
            reading it; it is kept only so existing calls keep working.
        keyword: Label text to look for, e.g. ``"Drug Name"``. It is
            interpolated into a CSS selector, so it must not contain a
            double quote.
        root: Parsed page exposing ``cssselect()``. When omitted, the
            module-level ``drug_page_root`` set by the scraping loop is used.

    Returns:
        A list of stripped, non-empty strings, or ``[]`` when no matching
        cell exists.
    """
    if root is None:
        # Backward-compatible default: the page most recently parsed by the
        # top-level scraping loop.
        root = drug_page_root

    cells = root.cssselect('td:contains("%s") + td' % keyword)
    if not cells:
        return []

    text = cells[0].text_content().strip()
    # Drop bullet markers and the CR/LF/tab runs the page uses for layout.
    for junk in (u"\u2022", u"\r\n\t\t", u"\r\n\t"):
        text = text.replace(junk, u"")

    # Strip each entry and skip empty ones so "A; B;" gives ["A", "B"].
    values = []
    for part in text.split(";"):
        part = part.strip()
        if part:
            values.append(part)
    return values
# Directory that the browse pages and the relative drug-detail links live under.
base_url = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/'
browse_url = base_url + 'index.cfm?fuseaction=Search.SearchResults_Browse&DrugInitial=%s'
# Same browse page, asking for the whole listing in a single response.
listing_url = browse_url + '&StartRow=1&StepSize=1000000'

# Column(s) identifying a record for the (currently disabled) save call below.
unique_keys = ['id']

# Next, we'll set a session up to ensure we have the right cookies. We grab
# these from a drug index page.
session = requests.Session()
session.get(browse_url % 'A')

# Next, we'll loop through the index and open up each listing of drugs.
for letter in index_categories:
    page = session.get(listing_url % letter)

    # Use lxml to grab all the links to drug detail pages.
    root = lxml.html.fromstring(page.content)
    links = root.cssselect('td.product_table li a')

    for link in links:
        href = link.get('href')
        if not href:
            # An anchor without a target cannot be followed; skip it rather
            # than fail on the string concatenation below.
            continue

        url = base_url + href
        drug_page = session.get(url)
        # DataTableScrape reads this module-level name, so it must be
        # rebound before the calls below.
        drug_page_root = lxml.html.fromstring(drug_page.content)

        # Record id: hex MD5 of the detail-page URL. hexdigest() gives
        # printable text instead of the raw bytes digest() returns.
        key = hashlib.md5(url.encode('utf-8')).hexdigest()

        # Drug name, active ingredients, company and FDA application number,
        # each returned as a list of strings.
        name = DataTableScrape(descriptor="name", keyword="Drug Name")
        active_ingredients = DataTableScrape(descriptor="active_ingredient", keyword="Active Ingredient")
        company = DataTableScrape(descriptor="company", keyword="Company")
        application_num = DataTableScrape(descriptor="application_num", keyword="FDA Application")

        data = {
            'id': key,
            'name': name,
            'active_ingredients': active_ingredients,
            'company': company,
            'application_num': application_num,
        }

        # Debug output: show the type stored under each field. Iterating the
        # dict directly would only yield its keys, which are always strings.
        for field, value in sorted(data.items()):
            print("%s: %s" % (field, type(value).__name__))

        # NOTE(review): saving was disabled in the original; most values are
        # lists, which presumably need flattening first - confirm.
        #scraperwiki.sql.save(unique_keys, data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment