zstumgoren · January 31, 2012 14:24
diff --git a/FdaMedDeviceRecalls b/FdaMedDeviceRecalls
 """
 A few notes:
 * Lines starting with "#" signs are standard Python code comments
 * Multi-line comments can be wrapped in triple-quotes (""")
 """
 import calendar
 import datetime
 from itertools import islice, product

 from BeautifulSoup import BeautifulSoup
 import requests

 # Base URL for POST request?
 URL = 'http://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfRES/res.cfm'

 """
 All the below "input" variables represent the form field's name-value pairs, which you must build
 dynamically and pass as input to the POST request. 

 To do so, you must first gather the name-value pairs into a dictionary.
 An example of this technique can be found in the below tutorial:

 https://github.com/PythonJournos/LearningPython/blob/master/tutorials/webscraping101/fec_efiles_scrape.py

 """
 ###Below are just elements - and excuse the substitute SQL. No idea how to proceed:
 # FIND input name = 'postdatefrom' 
 # FIND input name = 'postdateto'

 # (NEED to enter 2-month increments beginning Nov. 2002 BUT months range  28-31 days, so…?)
 # See below for code related to dynamically calculating days in each month


 # To figure out the start and end dates for two-month periods, you can dynamically generate combinations of 
 # years and months for your target years, and then loop through each of them

 # Pair each odd-numbered month with the even-numbered month that follows it:
 # (1, 2), (3, 4), ... (11, 12)
 month_pairs = zip(range(1, 13, 2), range(2, 13, 2))

 # Combine every year from 2002 through 2012 with every month pair:
 # (2002, (1, 2)), (2002, (3, 4)), ... (2012, (11, 12))
 target_date_ranges = product(range(2002, 2013), month_pairs)

 # We can now loop through the above list of target date ranges and dynamically construct POST requests for 
 # each two-month period.


 # Today's date is the cut-off: two-month periods that end after the current
 # month are never requested.
 today = datetime.date.today()

 # For each (year, (start_month, end_month)) combination, build the form data,
 # POST it to URL and parse the returned HTML.
 for year, months in target_date_ranges:
    start_month, end_month = months

    # Skip periods that end in the future. Requesting them may or may not cause
    # an error, depending on how the FDA set up their search functionality.
    if (year, end_month) > (today.year, today.month):
        continue

    # calendar.monthrange returns (weekday of the first day, number of days in
    # the month); the second value is the last day of the end month.
    last_day_of_end_month = calendar.monthrange(year, end_month)[1]

    # BUILD YOUR FORM DICTIONARY HERE
    # You'll need to format the dates to match the form expected by the FDA script.
    # To format the dates, you will need to create a datetime object and then format it.
    # See the datetime documentation, especially the section on format operators:
    # http://docs.python.org/library/datetime.html#strftime-and-strptime-behavior

    # Below creates datetime objects. I'll leave it as an exercise for you to
    # figure out how to reformat them to strings.
    start_date = datetime.datetime(year, start_month, 1)
    end_date = datetime.datetime(year, end_month, last_day_of_end_month)

    # Create a dictionary to pass to your POST request; it's currently empty, so
    # you need to populate it with the start and end dates (after you've converted
    # them to strings) and any other relevant form values.
    post_dict = {}

    # UPDATE post_dict WITH APPROPRIATE FORM VALUES

    # MAKE POST REQUEST AND PROCESS THE HTML
    # The requests library sends post_dict as the form data. There is sample code
    # for this in our webscraping tutorials:
    # https://github.com/PythonJournos/LearningPython/blob/master/tutorials/webscraping101/fec_efiles_scrape.py
    response = requests.post(URL, data=post_dict)
    html = response.content

    # FEED RESULTS OF POST REQUEST TO BEAUTIFULSOUP
    soup = BeautifulSoup(html)

    # PULL OUT APPROPRIATE LINKS using soup.findAll or some other BeautifulSoup method

    # LOOP THROUGH LINKS AND EXTRACT DATA FROM TARGET PAGES (once again, you'll use
    # requests to fetch the page and BeautifulSoup to parse it and extract the
    # target data points)
 	
    
 # FIND name = 'pagenum'

 #ENTER '500'

 # FIND name = 'search'

 # PUSH SEARCH  # Did you mean POST here?


 ###ON next page:

 # NOTE(review): the next line is pseudo-code ("substitute SQL"), not valid Python.
 # It has to be replaced with a real BeautifulSoup query before this section can run.
 links = soup.find(href WHERE color = 23238e)

 # Skip the first entry in links; for each remaining entry, print the text of the
 # first match found inside it.
 # NOTE(review): findAll('href') searches for tags *named* "href"; presumably
 # anchor tags and their href attribute were intended -- confirm.
 for link  in links[1:]:
    data = link.findAll('href')
    link = data[0].text
    print '\t'.join([link])



 	### NOW, when all links obtained, PUSH each link. Within Each link:

 FIND table WHERE cellpadding='2' AND cellspacing='14'

 Input = soup.find('th')
 Response = soup.find('td')


 	th = 'Date Posted'  ### Do I add underscore when I define those with multiple words?
 	
 	th = 'RecallNumber'
 	
 	th = 'Product'
 	
 	th = 'Code Information'

 	th = 'recalling firm/' <br> 'Manufacturer'

 	th = 'Consumer Instructions'

 	th = 'For Additional Information Contact'

 	th = 'reason for' <br> 'Recall'

 	th = 'Action'

 	th = 'Quantity in Commerce'

 	th = 'Distribution'


 print '\t'.join([……..])
	"""
	A few notes:
	* Lines starting with "#" signs are standard Python code comments
	* Multi-line comments can be wrapped in triple-quotes (""")
	"""
	from itertools import islice, product
	import calendar

	from BeautifulSoup import BeautifulSoup
	import requests

	# Base URL for POST request?
	URL = 'http://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfRES/res.cfm'

	"""
	All the below "input" variables represent the form field's name-value pairs, which you must build
	dynamically and pass as input to the POST request.

	To do so, you must first gather the name-value pairs into a dictionary.
	An example of this technique can be found in the below tutorial:

	https://github.com/PythonJournos/LearningPython/blob/master/tutorials/webscraping101/fec_efiles_scrape.py

	"""
	###Below are just elements - and excuse the substitute SQL. No idea how to proceed:
	# FIND input name = 'postdatefrom'
	# FIND input name = 'postdateto'

	# (NEED to enter 2-month increments beginning Nov. 2002 BUT months range 28-31 days, so…?)
	# See below for code related to dynamically calculating days in each month


	# To figure out the start and end dates for two-month periods, you can dynamically generate combinations of
	# years and months for your target years, and then loop through each of them

	# Pair each odd-numbered month with the even-numbered month that follows it:
	# (1, 2), (3, 4), ... (11, 12)
	month_pairs = zip(range(1, 13, 2), range(2, 13, 2))

	# Combine every year from 2002 through 2012 with every month pair:
	# (2002, (1, 2)), (2002, (3, 4)), ... (2012, (11, 12))
	target_date_ranges = product(range(2002, 2013), month_pairs)

	# We can now loop through the above list of target date ranges and dynamically construct POST requests for
	# each two-month period.


	# datetime is used below but is not among this script's imports, so it is
	# brought into scope here.
	import datetime

	# Today's date is the cut-off: two-month periods that end after the current
	# month are never requested.
	today = datetime.date.today()

	# For each (year, (start_month, end_month)) combination, build the form data,
	# POST it to URL and parse the returned HTML.
	for year, months in target_date_ranges:
		start_month, end_month = months

		# Skip periods that end in the future. Requesting them may or may not cause
		# an error, depending on how the FDA set up their search functionality.
		if (year, end_month) > (today.year, today.month):
			continue

		# calendar.monthrange returns (weekday of the first day, number of days in
		# the month); the second value is the last day of the end month.
		last_day_of_end_month = calendar.monthrange(year, end_month)[1]

		# BUILD YOUR FORM DICTIONARY HERE
		# You'll need to format the dates to match the form expected by the FDA script.
		# To format the dates, you will need to create a datetime object and then format it.
		# See the datetime documentation, especially the section on format operators:
		# http://docs.python.org/library/datetime.html#strftime-and-strptime-behavior

		# Below creates datetime objects. I'll leave it as an exercise for you to
		# figure out how to reformat them to strings.
		start_date = datetime.datetime(year, start_month, 1)
		end_date = datetime.datetime(year, end_month, last_day_of_end_month)

		# Create a dictionary to pass to your POST request; it's currently empty, so
		# you need to populate it with the start and end dates (after you've converted
		# them to strings) and any other relevant form values.
		post_dict = {}

		# UPDATE post_dict WITH APPROPRIATE FORM VALUES

		# MAKE POST REQUEST AND PROCESS THE HTML
		# The requests library sends post_dict as the form data. There is sample code
		# for this in our webscraping tutorials:
		# https://github.com/PythonJournos/LearningPython/blob/master/tutorials/webscraping101/fec_efiles_scrape.py
		response = requests.post(URL, data=post_dict)
		html = response.content

		# FEED RESULTS OF POST REQUEST TO BEAUTIFULSOUP
		soup = BeautifulSoup(html)

		# PULL OUT APPROPRIATE LINKS using soup.findAll or some other BeautifulSoup method

		# LOOP THROUGH LINKS AND EXTRACT DATA FROM TARGET PAGES (once again, you'll use
		# requests to fetch the page and BeautifulSoup to parse it and extract the
		# target data points)


	# FIND name = 'pagenum'

	#ENTER '500'

	# FIND name = 'search'

	# PUSH SEARCH # Did you mean POST here?


	###ON next page:

	# NOTE(review): the next line is pseudo-code ("substitute SQL"), not valid Python.
	# It has to be replaced with a real BeautifulSoup query before this section can run.
	links = soup.find(href WHERE color = 23238e)

	# Skip the first entry in links; for each remaining entry, print the text of the
	# first match found inside it.
	# NOTE(review): the three statements after the "for" line are not indented here,
	# so as written they are not part of the loop body -- confirm the intended nesting.
	# NOTE(review): findAll('href') searches for tags *named* "href"; presumably
	# anchor tags and their href attribute were intended -- confirm.
	for link in links[1:]:
	data = link.findAll('href')
	link = data[0].text
	print '\t'.join([link])



	### NOW, when all links obtained, PUSH each link. Within Each link:

	FIND table WHERE cellpadding='2' AND cellspacing='14'

	Input = soup.find('th')
	Response = soup.find('td')


	th = 'Date Posted' ### Do I add underscore when I define those with multiple words?

	th = 'RecallNumber'

	th = 'Product'

	th = 'Code Information'

	th = 'recalling firm/' <br> 'Manufacturer'

	th = 'Consumer Instructions'

	th = 'For Additional Information Contact'

	th = 'reason for' <br> 'Recall'

	th = 'Action'

	th = 'Quantity in Commerce'

	th = 'Distribution'


	print '\t'.join([……..])