Created
September 26, 2013 18:39
-
-
Save choffee/6718636 to your computer and use it in GitHub Desktop.
Scraper for the UK river levels.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import scraperwiki | |
| import requests | |
| import lxml.html | |
| import sys | |
| from datetime import datetime | |
| base_url='http://www.environment-agency.gov.uk' | |
| def get_level(station_url): | |
| """Get the level from a given station id""" | |
| html = requests.get(station_url).content | |
| dom = lxml.html.fromstring(html) | |
| level_el = dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text div#station-detail-wrapper div#station-detail-right div.chart-wrapper div.chart-top h3') | |
| try: | |
| level = level_el[0].text_content().split(":")[1].rstrip("m") | |
| for data_el in dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#rightcol div.box div ul li'): | |
| data = data_el.text_content() | |
| if isinstance(data, basestring): | |
| data = data.strip() | |
| else: | |
| continue | |
| if data.startswith("Site id:"): | |
| id = data.split(":")[1].strip() | |
| if data.startswith("Station name:"): | |
| name = data.split(":")[1].strip() | |
| for main_el in dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text div#station-detail-wrapper div#station-detail-left div.panels div.theme1 div.panel div.t div.r div.b div.l div.tr div.tl div.br div.bl div.plain_text p'): | |
| text = main_el.text_content().strip() | |
| if text.startswith("This measurement"): | |
| date = datetime.strptime("%s %s" % (text[42:52], text[33:38]), "%d/%m/%Y %H:%M") | |
| data = { | |
| 'name' : name, | |
| 'id' : id, | |
| 'level': level, | |
| 'date' : date | |
| } | |
| scraperwiki.sql.save(['level'],data) | |
| except: | |
| print "Bad url: %s" % station_url | |
| # Grab the regions | |
| html = requests.get("http://www.environment-agency.gov.uk/homeandleisure/floods/riverlevels/120485.aspx").content | |
| dom = lxml.html.fromstring(html) | |
| # Fetch the regions | |
| for entry in dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text ul.textnavlink li'): | |
| url = entry.cssselect('a')[0].get('href') | |
| region_url = "%s%s" % (base_url, url) | |
| region_html = requests.get(region_url).content | |
| region_dom = lxml.html.fromstring(region_html) | |
| # For each region fetch the areas | |
| for area_entry in region_dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text ul.textnavlink li'): | |
| area_url = "%s%s" % ( base_url, area_entry.cssselect('a')[0].get('href')) | |
| area_html = requests.get(area_url).content | |
| area_dom = lxml.html.fromstring(area_html) | |
| # For each catchment fetch the | |
| for catchment_entry in area_dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text ul.textnavlink li a#CatchmentList1_catchmentListRepeater_ctl02_hypCatchmentDetails'): | |
| catchment_url = "%s%s" % (base_url, catchment_entry.cssselect('a')[0].get('href')) | |
| catchment_html = requests.get(catchment_url).content | |
| catchment_dom = lxml.html.fromstring(catchment_html) | |
| # print catchment_html | |
| for site_entry in catchment_dom.cssselect('html body div#wrapper div#innerWrapper div#contentWrapper form#form1 div#contentInnerWrapper div#contentColWrapper div#content.theme1 div.plain_text table.river-levels-table'): | |
| for stations in site_entry.cssselect('a'): | |
| station_url = "%s%s" % (base_url, stations.get('href')) | |
| get_level(station_url) | |
| print "done" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It is a good job I put my tea down (even though I don't drink tea) — but only because of how silly Python looks in general.
This comment might have made more sense on G+.