Created
June 8, 2012 16:26
-
-
Save zstumgoren/2896635 to your computer and use it in GitHub Desktop.
Scrape the Georgia state legislature calendar site
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import urllib2
from contextlib import closing

from BeautifulSoup import BeautifulSoup
def main():
    """Fetch the Georgia legislature calendar page and collect postback links.

    Downloads the calendar page (requested with ``?Chamber=house``), parses
    it with BeautifulSoup, and gathers every ``<a>`` tag whose ``href``
    contains ``__doPostBack``. The resulting list is printed and returned.

    Returns:
        The list of matching anchor tags found by ``soup.findAll``. Sample
        output recorded by the original author::

            [<a href="javascript:__doPostBack('calMain','4530')" style="color:#999999" title="May 27">27</a>,
             <a href="javascript:__doPostBack('calMain','4531')" style="color:#999999" title="May 28">28</a>,
             <a href="javascript:__doPostBack('calMain','4532')" style="color:#999999" title="May 29">29</a>,
             ...]

    Raises:
        urllib2.URLError: propagated unchanged if the page cannot be fetched.
    """
    base_url = "http://webmail.legis.ga.gov/Calendar/"
    program_url = base_url + "?Chamber=house"

    # urllib2 responses are not context managers in Python 2, so wrap the
    # handle in closing() to guarantee the socket is released even if
    # read() raises.
    with closing(urllib2.urlopen(program_url)) as response:
        html = response.read()
    soup = BeautifulSoup(html)

    # Grab all links with __doPostBack in the href attribute.
    # Author's note: the ('calMain', DDDD) arguments of each anchor can be
    # extracted to build the URLs of the target event pages to scrape.
    postback_links = soup.findAll('a', href=re.compile(r'.*?__doPostBack.*'))

    # Parenthesised single-argument form prints identically under Python 2
    # and stays valid syntax under Python 3.
    print(postback_links)
    return postback_links
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment