zstumgoren · June 8, 2012 16:26
diff --git a/georgia_leg_calendar.py b/georgia_leg_calendar.py
 from BeautifulSoup import BeautifulSoup
 import re
 import urllib2

 def main():
    """Fetch the Georgia House calendar page and print its postback links.

    Downloads the calendar page for the House chamber, parses it with
    BeautifulSoup and prints every anchor whose href contains a
    __doPostBack call.
    """
    base_url = "http://webmail.legis.ga.gov/Calendar/"
    program_url = base_url + "?Chamber=house"

    # Close the HTTP response explicitly rather than leaving the socket
    # open until the object happens to be garbage collected.
    response = urllib2.urlopen(program_url)
    try:
        html = response.read()
    finally:
        response.close()
    soup = BeautifulSoup(html)

    # Grab all links with __doPostBack in the href attribute.
    # You could then use these to build links to the pages that need to be
    # scraped.
    postback_links = soup.findAll('a', href=re.compile(r'.*?__doPostBack.*'))
    print(postback_links)

    # postback_links now contains a list of anchor entities like below.
    # You can then extract the 'calMain', DDDD arguments from each anchor to
    # build your urls for the target event pages that you want to scrape.
    #
    # [<a href="javascript:__doPostBack('calMain','4530')" style="color:#999999" title="May 27">27</a>,
    #  <a href="javascript:__doPostBack('calMain','4531')" style="color:#999999" title="May 28">28</a>,
    #  <a href="javascript:__doPostBack('calMain','4532')" style="color:#999999" title="May 29">29</a>,
    # ...]

 # Run the scraper only when this file is executed as a script, not when
 # it is imported as a module.
 if __name__ == '__main__':
    main()
	from BeautifulSoup import BeautifulSoup
	import re
	import urllib2

	def main():
	    """Fetch the Georgia House calendar page and print its postback links.

	    Downloads the calendar page for the House chamber, parses it with
	    BeautifulSoup and prints every anchor whose href contains a
	    __doPostBack call.
	    """
	    base_url = "http://webmail.legis.ga.gov/Calendar/"
	    program_url = base_url + "?Chamber=house"

	    # Close the HTTP response explicitly rather than leaving the socket
	    # open until the object happens to be garbage collected.
	    response = urllib2.urlopen(program_url)
	    try:
	        html = response.read()
	    finally:
	        response.close()
	    soup = BeautifulSoup(html)

	    # Grab all links with __doPostBack in the href attribute.
	    # You could then use these to build links to the pages that need to be
	    # scraped.
	    postback_links = soup.findAll('a', href=re.compile(r'.*?__doPostBack.*'))
	    print(postback_links)

	    # postback_links now contains a list of anchor entities like below.
	    # You can then extract the 'calMain', DDDD arguments from each anchor to
	    # build your urls for the target event pages that you want to scrape.
	    #
	    # [<a href="javascript:__doPostBack('calMain','4530')" style="color:#999999" title="May 27">27</a>,
	    #  <a href="javascript:__doPostBack('calMain','4531')" style="color:#999999" title="May 28">28</a>,
	    #  <a href="javascript:__doPostBack('calMain','4532')" style="color:#999999" title="May 29">29</a>,
	    # ...]

	# Run the scraper only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == '__main__':
	    main()