Created
August 2, 2012 09:54
-
-
Save andrewharvey/3236004 to your computer and use it in GitHub Desktop.
Download and index 250k scanned geological maps from Geoscience Australia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# This script is licensed CC0 by Andrew Harvey <[email protected]> | |
# | |
# To the extent possible under law, the person who associated CC0 | |
# with this work has waived all copyright and related or neighboring | |
# rights to this work. | |
# http://creativecommons.org/publicdomain/zero/1.0/ | |
import re | |
import sys | |
from bs4 import BeautifulSoup | |
def assignFirstIfExists(i):
    """Return capture group 1 of a regex match, or "" when nothing matched.

    Parameters:
        i: the value returned by ``re.search`` (or ``re.match``): either a
           match object for a pattern with at least one group, or None.

    Returns:
        ``i.group(1)`` when ``i`` is a match object, otherwise the empty
        string, so callers can treat "not found" as an empty field.
    """
    # Identity test: None is a singleton, so ``is`` is the correct comparison.
    if i is None:
        return ""
    return i.group(1)
def parseHtmlDocument(html):
    """Print one tab-delimited index row for a single HTML document.

    The row has five columns: the .jpg file name taken from the href of the
    link whose anchor text is "250 DPI", followed by the Map ID(s),
    Map Tile(s), Edition and Publication Year values read from the text of
    the <menu> element.  Any value that cannot be found is written as an
    empty column.

    Parameters:
        html: the markup of one HTML document, as a string.

    Returns:
        None.  The row is written to stdout.
    """
    # Name the parser explicitly; otherwise Beautiful Soup picks whichever
    # parser happens to be installed and emits a warning.
    soup = BeautifulSoup(html, "html.parser")

    # NOTE(review): the loop nesting of the original could not be recovered
    # from the copy under review.  This version keeps the last <menu> and the
    # last "250 DPI" link and prints one row per document -- confirm against
    # a page that contains more than one of either.

    # Text inside the <menu> element; stays empty when there is no <menu>.
    menuText = ""
    for menu in soup.find_all("menu"):
        menuText = menu.get_text()

    # href of the link whose anchor text is "250 DPI"; stays empty when the
    # link (or its href attribute) is missing, so re.search never sees None.
    imageURL = ""
    for link in soup.find_all("a", text="250 DPI"):
        imageURL = link.get('href') or ""

    # Extract just the .jpg file name from the URL.  Raw string: the pattern
    # contains a backslash.
    jpgFile = assignFirstIfExists(
        re.search(r'^.*250dpi\/(.*)&slowConnection', imageURL))

    # Extract the labelled values from the menu text.
    mapId = assignFirstIfExists(
        re.search(r'Map ID\(s\):\s*(.*)', menuText)).strip()
    mapTile = assignFirstIfExists(
        re.search(r'Map Tile\(s\):\s*(.*)', menuText)).strip()
    edition = assignFirstIfExists(
        re.search(r'Edition:\s*(.*)\s*', menuText)).strip()
    publicationYear = assignFirstIfExists(
        re.search(r'Publication Year:\s*(.*)\s*', menuText)).strip()

    # Print the result as tab delimited values.
    print('\t'.join([jpgFile, mapId, mapTile, edition, publicationYear]))
# main
def iterHtmlDocuments(lines):
    """Yield each complete HTML document found in an iterable of lines.

    The input may hold many HTML files concatenated together.  A document
    ends with the first line that starts with '</html>'; that line is
    included in the yielded text.  Lines after the last such terminator are
    never yielded.

    Parameters:
        lines: an iterable of text lines, such as ``sys.stdin``.

    Yields:
        One string per document, made of its lines joined unchanged.
    """
    pending = []  # lines of the document currently being collected
    for line in lines:
        pending.append(line)
        if line.startswith('</html>'):
            # Join once per document instead of growing a string per line.
            yield "".join(pending)
            pending = []


def main():
    """Parse every HTML document arriving on stdin, printing one row each."""
    for document in iterHtmlDocuments(sys.stdin):
        parseHtmlDocument(document)


# Only consume stdin when executed as a script, not when imported.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## About | |
# This script will download the maps and associated metadata for the 250k
# Geological Maps at http://www.geoscience.gov.au/geoportal-geologicalmaps/
# | |
# Running `make all` shall suffice to run this script. | |
# ## License | |
# This script is licensed CC0 by Andrew Harvey <[email protected]> | |
# | |
# To the extent possible under law, the person who associated CC0 | |
# with this work has waived all copyright and related or neighboring | |
# rights to this work. | |
# http://creativecommons.org/publicdomain/zero/1.0/ | |
# Every target below names a step, not a file that the step creates, so mark
# them phony: a stray file called e.g. "clean" must not stop the step running.
.PHONY: all clean download-html-index parse-jpg-links build-index download-jpg rename-jpg

# The prerequisites of `all` must run one after another in the listed order
# (clean first, downloads before the steps that read them), so never run
# this Makefile in parallel.
.NOTPARALLEL:

# Run the whole pipeline, starting from a clean directory.
all: clean download-html-index parse-jpg-links build-index download-jpg rename-jpg

# Fetch the sheet index page and save it as 250k_index.html.
download-html-index:
	wget -O 250k_index.html 'http://www.geoscience.gov.au/cgi-bin/mapserv?mapsize=450+450&mapext=-2200000.000000+-5250000.000000+2100000.000000+-950000.00000&map=%2Fnas%2Fweb%2Fops%2Fprod%2Fapps_www-c%2Fmapserver%2Fgeoportal-geologicalmaps%2Findex.map&mode=itemnquery&layer=map250&qlayer=map250&qitem=qmapname&map_map250_query_template=sheetindex.html&qstring=%2F%2F'

# Pull the 250dpi download paths out of the index page and write them, as
# absolute URLs, one per line to 250dpi.txt.
parse-jpg-links: 250k_index.html
	grep 'download?map' $< | grep -o '/geoportal-geologicalmaps/download?map=250dpi/.*.jpg' | sed 's/^/http:\/\/www.geoscience.gov.au/' > 250dpi.txt

# Turn the index page into a tab-separated table, 250k_index.tsv.
build-index: 250k_index.html
	./index_parser.py < $< > 250k_index.tsv

# Download every URL listed in 250dpi.txt into the 250dpi/ directory.
download-jpg: 250dpi.txt
	wget --directory-prefix=250dpi -i $<

# Strip everything up to and including the last "%2F" from each downloaded
# file name, leaving only the part after it.
rename-jpg:
	rename 's/^250dpi\/.*%2F/250dpi\//' 250dpi/*.jpg

# Remove every file and directory the other targets create.
clean:
	rm -f 250dpi.txt 250k_index.html 250k_index.tsv
	rm -rf 250dpi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment