Created
September 26, 2013 18:39
-
-
Save choffee/6718636 to your computer and use it in GitHub Desktop.
Scraper for the UK river levels.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import scraperwiki | |
| import requests | |
| import lxml.html | |
| import sys | |
| from datetime import datetime | |
| base_url='http://www.environment-agency.gov.uk' | |
| def get_level(station_url): | |
| """Get the level from a given station id""" | |
| html = requests.get(station_url).content | |
| dom = lxml.html.fromstring(html) | |
| level_el = dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text div#station-detail-wrapper div#station-detail-right div.chart-wrapper div.chart-top h3') | |
| try: | |
| level = level_el[0].text_content().split(":")[1].rstrip("m") | |
| for data_el in dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#rightcol div.box div ul li'): | |
| data = data_el.text_content() | |
| if isinstance(data, basestring): | |
| data = data.strip() | |
| else: | |
| continue | |
| if data.startswith("Site id:"): | |
| id = data.split(":")[1].strip() | |
| if data.startswith("Station name:"): | |
| name = data.split(":")[1].strip() | |
| for main_el in dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text div#station-detail-wrapper div#station-detail-left div.panels div.theme1 div.panel div.t div.r div.b div.l div.tr div.tl div.br div.bl div.plain_text p'): | |
| text = main_el.text_content().strip() | |
| if text.startswith("This measurement"): | |
| date = datetime.strptime("%s %s" % (text[42:52], text[33:38]), "%d/%m/%Y %H:%M") | |
| data = { | |
| 'name' : name, | |
| 'id' : id, | |
| 'level': level, | |
| 'date' : date | |
| } | |
| scraperwiki.sql.save(['level'],data) | |
| except: | |
| print "Bad url: %s" % station_url | |
| # Grab the regions | |
| html = requests.get("http://www.environment-agency.gov.uk/homeandleisure/floods/riverlevels/120485.aspx").content | |
| dom = lxml.html.fromstring(html) | |
| # Fetch the regions | |
| for entry in dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text ul.textnavlink li'): | |
| url = entry.cssselect('a')[0].get('href') | |
| region_url = "%s%s" % (base_url, url) | |
| region_html = requests.get(region_url).content | |
| region_dom = lxml.html.fromstring(region_html) | |
| # For each region fetch the areas | |
| for area_entry in region_dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text ul.textnavlink li'): | |
| area_url = "%s%s" % ( base_url, area_entry.cssselect('a')[0].get('href')) | |
| area_html = requests.get(area_url).content | |
| area_dom = lxml.html.fromstring(area_html) | |
| # For each catchment fetch the | |
| for catchment_entry in area_dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text ul.textnavlink li a#CatchmentList1_catchmentListRepeater_ctl02_hypCatchmentDetails'): | |
| catchment_url = "%s%s" % (base_url, catchment_entry.cssselect('a')[0].get('href')) | |
| catchment_html = requests.get(catchment_url).content | |
| catchment_dom = lxml.html.fromstring(catchment_html) | |
| # print catchment_html | |
| for site_entry in catchment_dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text table.river-levels-table'): | |
| for stations in site_entry.cssselect('a'): | |
| station_url = "%s%s" % (base_url, stations.get('href')) | |
| get_level(station_url) | |
| print "done" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It is a good job I put my tea down (even though I don't drink tea) — but only because of how silly Python looks in general.
This comment might have made more sense on G+.