Last active
September 3, 2015 22:44
-
-
Save flother/91bc218212a4ce7783cb to your computer and use it in GitHub Desktop.
Scrape earthquakes measured by the Icelandic Meteorological Office (Veðurstofa Íslands) from 1991 to 2014.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: UTF-8 | |
""" | |
Scrape earthquakes measured by the Icelandic Meteorological Office (Veðurstofa | |
Íslands) from 1991 to 2015. | |
The data is collected from zipped KML files downloaded from vedur.is. | |
""" | |
from collections import OrderedDict | |
from cStringIO import StringIO | |
import datetime | |
import re | |
import sys | |
import zipfile | |
from csvkit.unicsv import UnicodeCSVWriter | |
import lxml.etree | |
import requests | |
YEAR_KML = "http://hraun.vedur.is/ja/google/kmls/{year}/{year}list.kmz" | |
YEARS = range(1991, 2016) | |
# Regular expression to parse date, time, depth in km and Richter magnitude | |
# from the text description in the Placemark elements. | |
DESC_RE = re.compile(r""" | |
Time:\s(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})\s | |
(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})\.(?P<fraction>\d+) | |
<br> | |
Depth:\s(?P<depth>[\d\.]+)km\s<br>\sSize:\s(?P<size>\-?[\d\.]+|\*+)Ml | |
<br> | |
""", re.X) | |
# Elements of interest (i.e. those that contain data) in the KML files. | |
PLACEMARK_ELEMENT = "{http://earth.google.com/kml/2.0}Placemark" | |
DESC_ELEMENT = "{http://earth.google.com/kml/2.0}description" | |
COORDS_ELEMENT = "{http://earth.google.com/kml/2.0}coordinates" | |
ELEMENTS = (PLACEMARK_ELEMENT, DESC_ELEMENT, COORDS_ELEMENT) | |
ATTRS = ["date_time", "latitude", "longitude", "altitude", "depth", "size"] | |
# Output data as a CSV to stdout. | |
csv = UnicodeCSVWriter(sys.stdout) | |
# Write header row. | |
csv.writerow(ATTRS) | |
# Download a zipped KML for each year. | |
for year in YEARS: | |
response = requests.get(YEAR_KML.format(year=year)) | |
kml_zip = zipfile.ZipFile(StringIO(response.content)) | |
# Unzip and parse each KML file found in the ZIP. Uncompressed KML files | |
# are available for download too, but the data for 2008 is truncated there, | |
# so the ZIPs are used instead. | |
for kml in kml_zip.infolist(): | |
earthquake = None | |
# Iteratively parse each element of interest. | |
for action, elem in lxml.etree.iterparse(kml_zip.open(kml), | |
tag=ELEMENTS, | |
events=("start", "end",)): | |
if elem.tag == PLACEMARK_ELEMENT: | |
if action == "start": | |
# If it's a new Placemark element, it's a new earthquake, | |
# so reset the data dictionary. | |
earthquake = dict(zip(ATTRS, [None] * len(ATTRS))) | |
elif action == "end": | |
# If it's the end of a Placemark element, it's the end of | |
# the data on a single earthquake. | |
if None not in earthquake.values(): | |
# At this point all required data is available for a | |
# single earthquake, so output the earthquake to the | |
# CSV. | |
csv.writerow([earthquake[k] for k in ATTRS]) | |
else: | |
# We're missing at least one value for a single | |
# earthquake. Print an error and don't add the | |
# earthquake to the CSV. | |
missing_values = ", ".join((k for k, v in | |
earthquake.iteritems() | |
if v is None)) | |
print >> sys.stderr, ("Discarding placemark missing " | |
"{}".format(missing_values)) | |
elif earthquake is not None and action == "end": | |
if elem.tag == DESC_ELEMENT: | |
# Parse the date and time of the earthquake, along with its | |
# depth below the surface and its magnitude. Date and time | |
# are available in an XML element but depth and magnitude | |
# aren't, so it's all collected from the description using | |
# a regular expression. | |
match = DESC_RE.search(elem.text) | |
if match: | |
timetuple = map(int, match.groups()[:7]) | |
# Multiply second fraction by 10,000 to make it | |
# microseconds. | |
timetuple[6] = timetuple[6] * 10000 | |
earthquake["date_time"] = datetime.datetime(*timetuple) | |
earthquake["depth"] = match.groupdict()["depth"] | |
size = match.groupdict()["size"] | |
# Size can, in a couple of cases, be "****". In that | |
# case store an empty string instead. | |
try: | |
float(size) | |
except ValueError: | |
print >> sys.stderr, ("Size '{}' not a number; " | |
"discarding.".format(size)) | |
size = "" | |
earthquake["size"] = size | |
else: | |
print >> sys.stderr, "Can't parse '{}'.".format( | |
elem.text) | |
elif elem.tag == COORDS_ELEMENT: | |
# Longitude, latitude, and altitude are stored in a | |
# comma-separated string in the coordinates element. | |
try: | |
coords = elem.text.split(",") | |
# A couple of longitudes have multiple decimal places, | |
# e.g. 12.3456.789. Fortunately in these cases it's | |
# always the first decimal place that's the correct | |
# one, so we can strip out the others to make it a | |
# valid floating-point number. | |
if len(coords) != 3: | |
print >> sys.stderr, ("Can't parse coords from " | |
"'{}'.".format(elem.text)) | |
else: | |
lon, lat, alt = coords | |
if lon.count(".") > 1: | |
lon_parts = lon.split(".", 1) | |
lon = lon_parts[0] + lon_parts[1].replace(".", | |
"") | |
earthquake["latitude"] = float(lat) | |
earthquake["longitude"] = float(lon) | |
earthquake["altitude"] = float(alt) | |
except ValueError: | |
print >> sys.stderr, ("Can't parse coordinates from " | |
"'{}'.".format(elem.text)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment