Skip to content

Instantly share code, notes, and snippets.

@zstumgoren
Created June 8, 2012 16:26
Show Gist options
  • Save zstumgoren/2896635 to your computer and use it in GitHub Desktop.
Save zstumgoren/2896635 to your computer and use it in GitHub Desktop.
Scrape the Georgia state legislature calendar site
from BeautifulSoup import BeautifulSoup
import re
import urllib2
def main():
    """Fetch the Georgia House calendar page and print its postback links.

    Downloads the calendar page for the House chamber, parses it with
    BeautifulSoup, collects every anchor whose ``href`` matches the
    ``__doPostBack`` pattern, and prints the resulting list.

    The printed list contains anchor elements like these::

        [<a href="javascript:__doPostBack('calMain','4530')" style="color:#999999" title="May 27">27</a>,
         <a href="javascript:__doPostBack('calMain','4531')" style="color:#999999" title="May 28">28</a>,
         <a href="javascript:__doPostBack('calMain','4532')" style="color:#999999" title="May 29">29</a>,
         ...]

    Author's note: the ('calMain', DDDD) arguments can be extracted from
    each anchor to build the URLs of the event pages to be scraped; this
    function itself only prints the anchors.
    """
    # Function-scope import keeps this block self-contained.
    from contextlib import closing

    base_url = "http://webmail.legis.ga.gov/Calendar/"
    program_url = base_url + "?Chamber=house"

    # closing() guarantees the HTTP response is released even if read()
    # raises; the Python 2 urllib2 response cannot be used directly in a
    # ``with`` statement.
    with closing(urllib2.urlopen(program_url)) as response:
        html = response.read()

    soup = BeautifulSoup(html)

    # Grab all links with __doPostBack in the href attribute.
    postback_links = soup.findAll('a', href=re.compile(r'.*?__doPostBack.*'))

    # With a single argument the parenthesized form prints exactly what the
    # Python 2 print statement would.
    print(postback_links)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment