@choffee
Created September 26, 2013 18:39
Scraper for the UK river levels.
#!/usr/bin/env python
import scraperwiki
import requests
import lxml.html
from datetime import datetime

base_url = 'http://www.environment-agency.gov.uk'


def get_level(station_url):
    """Get the level from a given station page and save it to the datastore."""
    html = requests.get(station_url).content
    dom = lxml.html.fromstring(html)
    level_el = dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text div#station-detail-wrapper div#station-detail-right div.chart-wrapper div.chart-top h3')
    try:
        # The chart heading reads like "Current level: 1.23m".
        level = level_el[0].text_content().split(":")[1].rstrip("m")
        # The right-hand column lists "Site id: ..." and "Station name: ...".
        for data_el in dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#rightcol div.box div ul li'):
            data = data_el.text_content()
            if isinstance(data, basestring):
                data = data.strip()
            else:
                continue
            if data.startswith("Site id:"):
                id = data.split(":")[1].strip()
            if data.startswith("Station name:"):
                name = data.split(":")[1].strip()
        # The timestamp sits at fixed offsets in the paragraph that starts
        # "This measurement ..." (time at chars 33:38, date at chars 42:52).
        for main_el in dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text div#station-detail-wrapper div#station-detail-left div.panels div.theme1 div.panel div.t div.r div.b div.l div.tr div.tl div.br div.bl div.plain_text p'):
            text = main_el.text_content().strip()
            if text.startswith("This measurement"):
                date = datetime.strptime("%s %s" % (text[42:52], text[33:38]),
                                         "%d/%m/%Y %H:%M")
        data = {
            'name': name,
            'id': id,
            'level': level,
            'date': date,
        }
        # 'level' is used as the unique key column for the saved row.
        scraperwiki.sql.save(['level'], data)
    except:
        # Any station page that does not match the selectors above ends up here.
        print "Bad url: %s" % station_url


# Grab the regions index page.
html = requests.get("http://www.environment-agency.gov.uk/homeandleisure/floods/riverlevels/120485.aspx").content
dom = lxml.html.fromstring(html)
# Fetch the regions.
for entry in dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text ul.textnavlink li'):
    url = entry.cssselect('a')[0].get('href')
    region_url = "%s%s" % (base_url, url)
    region_html = requests.get(region_url).content
    region_dom = lxml.html.fromstring(region_html)
    # For each region fetch the areas.
    for area_entry in region_dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text ul.textnavlink li'):
        area_url = "%s%s" % (base_url, area_entry.cssselect('a')[0].get('href'))
        area_html = requests.get(area_url).content
        area_dom = lxml.html.fromstring(area_html)
        # For each area fetch the catchments.
        for catchment_entry in area_dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text ul.textnavlink li a#CatchmentList1_catchmentListRepeater_ctl02_hypCatchmentDetails'):
            catchment_url = "%s%s" % (base_url, catchment_entry.cssselect('a')[0].get('href'))
            catchment_html = requests.get(catchment_url).content
            catchment_dom = lxml.html.fromstring(catchment_html)
            # For each catchment visit every station linked from its levels table.
            for site_entry in catchment_dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text table.river-levels-table'):
                for stations in site_entry.cssselect('a'):
                    station_url = "%s%s" % (base_url, stations.get('href'))
                    get_level(station_url)
print "done"
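For anyone reusing the scraped rows, here is a minimal read-back sketch. It assumes the ScraperWiki default table name "swdata" (what scraperwiki.sql.save writes to when no table is named) and the column names saved above:

import scraperwiki

# Pull the ten most recent readings back out of the datastore.
for row in scraperwiki.sql.select("* from swdata order by date desc limit 10"):
    print "%s (site %s): %sm at %s" % (row['name'], row['id'], row['level'], row['date'])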
@lexbailey

It's a good job I put my tea down (even though I don't drink tea), but only because of how silly Python looks in general.
This comment might have made more sense on G+.
