Created
June 8, 2012 16:27
-
-
Save zstumgoren/2896636 to your computer and use it in GitHub Desktop.
Scrape Georgia state leg calendar site
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import urllib2
from contextlib import closing

from BeautifulSoup import BeautifulSoup
def main():
    """Fetch the Georgia House calendar page and print its postback links.

    Downloads the calendar page for the ``house`` chamber, parses it with
    BeautifulSoup, and prints every anchor whose ``href`` attribute matches
    the ``__doPostBack`` pattern. Nothing is returned.
    """
    base_url = "http://webmail.legis.ga.gov/Calendar/"
    program_url = base_url + "?Chamber=house"

    # Wrap the response in closing() so the connection is released even if
    # read() raises; the urllib2 response object is closed explicitly rather
    # than being left for garbage collection.
    with closing(urllib2.urlopen(program_url)) as response:
        html = response.read()

    soup = BeautifulSoup(html)

    # Grab all links with __doPostBack in the href attribute
    postback_links = soup.findAll('a', href=re.compile(r'.*?__doPostBack.*'))

    # Parenthesised single-argument form: prints the same text under the
    # Python 2 print statement this script targets.
    print(postback_links)

    # postback_links now contains a list of anchor entities like the sample
    # below. You can then extract the 'calMain', DDDD arguments from each
    # anchor to build your POST request for the target event pages that you
    # want to scrape.
    #
    #   [<a href="javascript:__doPostBack('calMain','4530')" style="color:#999999" title="May 27">27</a>,
    #    <a href="javascript:__doPostBack('calMain','4531')" style="color:#999999" title="May 28">28</a>,
    #    <a href="javascript:__doPostBack('calMain','4532')" style="color:#999999" title="May 29">29</a>,
    #    ...]


# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment