seanherron · July 15, 2013 03:16
diff --git a/drugs@fda scraper b/drugs@fda scraper
 #!/usr/bin/env python

 #!/usr/bin/env python
 import scraperwiki
 import requests
 import lxml.html
 import lxml.etree
 import string
 import md5

 # The initials "A".."Z"; the scraping loop requests one drug index page per
 # letter (it is substituted into the DrugInitial query parameter).
 index_categories = string.ascii_uppercase

 def DataTableScrape(descriptor, keyword, root=None):
    """Return the values listed in the table cell that follows a label cell.

    The first ``td`` that directly follows a ``td`` containing *keyword* is
    located and its text is split on ``;``.

    Args:
        descriptor: Unused. The original implementation overwrote it before
            reading it; it is kept so existing calls keep working.
        keyword: Label text to look for, e.g. ``"Drug Name"``.
        root: Optional parsed page to search (any object with a ``cssselect``
            method). Defaults to the module-level ``drug_page_root`` that the
            scraping loop rebinds for every drug page.

    Returns:
        A list of non-empty strings with surrounding whitespace removed, or
        an empty list when no matching cell exists.
    """
    if root is None:
        # Preserve the original behaviour of reading the current page from
        # the module-level name.
        root = drug_page_root

    cells = root.cssselect('td:contains("%s") + td' % keyword)
    if not cells:
        return []

    text = cells[0].text_content().strip()
    # Drop bullet characters and the line-break/tab runs embedded in the
    # cell text, in the same order as the original chained replace() calls.
    for junk in (u"\u2022", u'\r\n\t\t', u'\r\n\t'):
        text = text.replace(junk, '')

    # split(';') alone leaves padded and empty fragments (for example after
    # a trailing ';'), so trim every piece and discard the blanks.
    pieces = (piece.strip() for piece in text.split(';'))
    return [piece for piece in pieces if piece]

    # Next, we'll set a session up to ensure we have the right cookies. We grab these from a drug index page.
 session = requests.Session()
 # The response is discarded: this request exists only so that the session
 # picks up the site's cookies before the real requests are made.
 session.get('http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.SearchResults_Browse&DrugInitial=A')

 # Column that identifies a saved record; constant, so it is built once
 # instead of on every loop iteration.
 unique_keys = ['id']

 # Walk the index page for every initial, then each drug detail page that
 # the index links to.
 for letter in index_categories:
    page = session.get('http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.SearchResults_Browse&DrugInitial=%s&StartRow=1&StepSize=1000000' % letter)
    # Parse the index page so the links to the drug detail pages can be
    # selected.
    root = lxml.html.fromstring(page.content)

    # select all the relevant product links
    links = root.cssselect('td.product_table li a')

    for link in links:
        href = link.get('href')
        if href is None:
            # An anchor without an href cannot be followed, and would make
            # the concatenation below raise TypeError.
            continue

        url = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/' + href
        drug_page = session.get(url)
        # DataTableScrape reads this module-level name, so it has to be
        # rebound before the calls below.
        drug_page_root = lxml.html.fromstring(drug_page.content)

        # Record key: MD5 of the detail-page URL. hexdigest() yields
        # printable text, whereas digest() produced raw bytes.
        key = md5.new(url).hexdigest()

        # Drug name
        name = DataTableScrape(descriptor="name", keyword="Drug Name")

        # Active ingredients
        active_ingredients = DataTableScrape(descriptor="active_ingredient", keyword="Active Ingredient")

        # Company
        company = DataTableScrape(descriptor="company", keyword="Company")

        # The FDA NDA (Application Number)
        application_num = DataTableScrape(descriptor="application_num", keyword="FDA Application")

        data = {
            'id': key,
            'name': name,
            'active_ingredients': active_ingredients,
            'company': company,
            'application_num': application_num,
        }

        # Debug aid: show the type of every value in the record. Iterating
        # the dict itself only yields its keys, which are always str.
        # print() with one parenthesised argument works on Python 2 and 3.
        for field, value in data.items():
            print("%s: %s" % (field, type(value)))
        #scraperwiki.sql.save(unique_keys, data)
	#!/usr/bin/env python

	#!/usr/bin/env python
	import scraperwiki
	import requests
	import lxml.html
	import lxml.etree
	import string
	import md5

	# The initials "A".."Z"; the scraping loop requests one drug index page per
	# letter (it is substituted into the DrugInitial query parameter).
	index_categories = string.ascii_uppercase

	def DataTableScrape(descriptor, keyword, root=None):
	    """Return the values listed in the table cell that follows a label cell.

	    The first ``td`` that directly follows a ``td`` containing *keyword* is
	    located and its text is split on ``;``.

	    Args:
	        descriptor: Unused. The original implementation overwrote it before
	            reading it; it is kept so existing calls keep working.
	        keyword: Label text to look for, e.g. ``"Drug Name"``.
	        root: Optional parsed page to search (any object with a
	            ``cssselect`` method). Defaults to the module-level
	            ``drug_page_root`` that the scraping loop rebinds for every
	            drug page.

	    Returns:
	        A list of non-empty strings with surrounding whitespace removed, or
	        an empty list when no matching cell exists.
	    """
	    if root is None:
	        # Preserve the original behaviour of reading the current page from
	        # the module-level name.
	        root = drug_page_root

	    cells = root.cssselect('td:contains("%s") + td' % keyword)
	    if not cells:
	        return []

	    text = cells[0].text_content().strip()
	    # Drop bullet characters and the line-break/tab runs embedded in the
	    # cell text, in the same order as the original chained replace() calls.
	    for junk in (u"\u2022", u'\r\n\t\t', u'\r\n\t'):
	        text = text.replace(junk, '')

	    # split(';') alone leaves padded and empty fragments (for example after
	    # a trailing ';'), so trim every piece and discard the blanks.
	    pieces = (piece.strip() for piece in text.split(';'))
	    return [piece for piece in pieces if piece]

	# Next, we'll set a session up to ensure we have the right cookies. We grab these from a drug index page.
	session = requests.Session()
	# The response is discarded: this request exists only so that the session
	# picks up the site's cookies before the real requests are made.
	session.get('http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.SearchResults_Browse&DrugInitial=A')

	# Column that identifies a saved record; constant, so it is built once
	# instead of on every loop iteration.
	unique_keys = ['id']

	# Walk the index page for every initial, then each drug detail page that
	# the index links to.
	for letter in index_categories:
	    page = session.get('http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.SearchResults_Browse&DrugInitial=%s&StartRow=1&StepSize=1000000' % letter)
	    # Parse the index page so the links to the drug detail pages can be
	    # selected.
	    root = lxml.html.fromstring(page.content)

	    # select all the relevant product links
	    links = root.cssselect('td.product_table li a')

	    for link in links:
	        href = link.get('href')
	        if href is None:
	            # An anchor without an href cannot be followed, and would make
	            # the concatenation below raise TypeError.
	            continue

	        url = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/' + href
	        drug_page = session.get(url)
	        # DataTableScrape reads this module-level name, so it has to be
	        # rebound before the calls below.
	        drug_page_root = lxml.html.fromstring(drug_page.content)

	        # Record key: MD5 of the detail-page URL. hexdigest() yields
	        # printable text, whereas digest() produced raw bytes.
	        key = md5.new(url).hexdigest()

	        # Drug name
	        name = DataTableScrape(descriptor="name", keyword="Drug Name")

	        # Active ingredients
	        active_ingredients = DataTableScrape(descriptor="active_ingredient", keyword="Active Ingredient")

	        # Company
	        company = DataTableScrape(descriptor="company", keyword="Company")

	        # The FDA NDA (Application Number)
	        application_num = DataTableScrape(descriptor="application_num", keyword="FDA Application")

	        data = {
	            'id': key,
	            'name': name,
	            'active_ingredients': active_ingredients,
	            'company': company,
	            'application_num': application_num,
	        }

	        # Debug aid: show the type of every value in the record. Iterating
	        # the dict itself only yields its keys, which are always str.
	        # print() with one parenthesised argument works on Python 2 and 3.
	        for field, value in data.items():
	            print("%s: %s" % (field, type(value)))
	        #scraperwiki.sql.save(unique_keys, data)
No results found