andrewharvey · August 2, 2012 09:54
diff --git a/index_parser.py b/index_parser.py
 #!/usr/bin/python

 # This script is licensed CC0 by Andrew Harvey <[email protected]>
 #
 # To the extent possible under law, the person who associated CC0
 # with this work has waived all copyright and related or neighboring
 # rights to this work.
 # http://creativecommons.org/publicdomain/zero/1.0/

 import re
 import sys
 from bs4 import BeautifulSoup

 # define function to return the first match of a re.search
 def assignFirstIfExists(i):
     """Return capture group 1 of a regex match, or "" when nothing matched.

     i -- the result of a re.search() call: a match object, or None.
     """
     # "is None" is the idiomatic identity test and cannot be fooled by __eq__
     if i is None:
         return ""
     return i.group(1)

 def parseHtmlDocument(html):
     """Parse one HTML document and print its map metadata as one TSV row.

     html -- the full text of a single HTML document.

     Prints five tab-separated fields: jpg file name, map ID(s), map tile(s),
     edition and publication year.  A field that cannot be found is printed
     as an empty string instead of aborting the whole run.
     """
     # set up BS
     # NOTE(review): no parser is named, so bs4 picks whichever one is
     # installed -- consider pinning one for reproducible output
     soup = BeautifulSoup(html)

     # get the text within the menu tag (the last one wins if there are
     # several); start from "" so a page without <menu> still yields a row
     menuText = ""
     for menu in soup.find_all("menu"):
         menuText = menu.get_text()

     # find the href value for the link which has anchor text of "250 DPI";
     # a missing link or a link without href leaves the URL empty
     imageURL = ""
     for link in soup.find_all("a", text="250 DPI"):
         imageURL = link.get('href') or ""

     # extract just the .jpg file name from the URL
     # (raw string: "\/" is not a valid escape in a normal string literal)
     jpgFile = assignFirstIfExists(
         re.search(r'^.*250dpi\/(.*)&slowConnection', imageURL))

     # extract the elements within the menu, in output column order
     menuPatterns = (
         r'Map ID\(s\):\s*(.*)',
         r'Map Tile\(s\):\s*(.*)',
         r'Edition:\s*(.*)\s*',
         r'Publication Year:\s*(.*)\s*',
     )
     fields = [jpgFile]
     for pattern in menuPatterns:
         found = re.search(pattern, menuText)
         fields.append(assignFirstIfExists(found).strip())

     # print the result as tab delimited values
     print ('\t'.join(fields))

 # main
 currentHtmlDocument = "" # stores the HTML content of the "current" document as stdin can contain many HTML files concatenated together
 for line in sys.stdin: # read lines from stdin
    currentHtmlDocument = currentHtmlDocument + line # add to the current HTML document
    if (line.startswith('</html>')): # if we reach the end of the html document then...
        parseHtmlDocument(currentHtmlDocument) # parse it
        currentHtmlDocument = "" # and reset the current HTML document


diff --git a/Makefile b/Makefile
 ## About
 # This script will download the maps and associated metadata for 250k
 # Geological Maps at http://www.geoscience.gov.au/geoportal-geologicalmaps/
 #
 # Running `make all` shall suffice to run this script.

 # ## License
 # This script is licensed CC0 by Andrew Harvey <[email protected]>
 #
 # To the extent possible under law, the person who associated CC0
 # with this work has waived all copyright and related or neighboring
 # rights to this work.
 # http://creativecommons.org/publicdomain/zero/1.0/

 # None of the targets below names a file that its recipe creates, so they
 # are declared phony: a stray file called e.g. `clean` cannot mask them.
 .PHONY: all clean download-html-index parse-jpg-links build-index download-jpg rename-jpg

 # Full pipeline.  The prerequisites are listed in the order they must run
 # (each step consumes files written by an earlier one), so this relies on
 # make processing them left to right -- do not run it with `make -j`.
 all: clean download-html-index parse-jpg-links build-index download-jpg rename-jpg

 # Fetch the map-sheet index page from the geoscience.gov.au MapServer CGI and
 # save it as 250k_index.html, the input of parse-jpg-links and build-index.
 # qstring=%2F%2F decodes to `//`; presumably an empty pattern that selects
 # every sheet of the map250 layer -- TODO confirm.
 download-html-index:
 	wget -O 250k_index.html 'http://www.geoscience.gov.au/cgi-bin/mapserv?mapsize=450+450&mapext=-2200000.000000+-5250000.000000+2100000.000000+-950000.00000&map=%2Fnas%2Fweb%2Fops%2Fprod%2Fapps_www-c%2Fmapserver%2Fgeoportal-geologicalmaps%2Findex.map&mode=itemnquery&layer=map250&qlayer=map250&qitem=qmapname&map_map250_query_template=sheetindex.html&qstring=%2F%2F'

 # Extract the 250 DPI download links from the index page and write them, as
 # absolute URLs, to 250dpi.txt.  grep reads the prerequisite directly, and
 # the dot in `\.jpg` is escaped so that it only matches a literal dot.
 parse-jpg-links: 250k_index.html
 	grep 'download?map' $< | grep -o '/geoportal-geologicalmaps/download?map=250dpi/.*\.jpg' | sed 's/^/http:\/\/www.geoscience.gov.au/' > 250dpi.txt

 # Feed the downloaded index page to index_parser.py on stdin and store the
 # tab-delimited rows it prints in 250k_index.tsv.
 build-index : 250k_index.html
 	./index_parser.py < $< > 250k_index.tsv

 # Download every URL listed in 250dpi.txt into the 250dpi/ directory.
 download-jpg : 250dpi.txt
 	wget --directory-prefix=250dpi -i $<

 # Shorten the downloaded file names: everything between `250dpi/` and the
 # last `%2F` (a URL-encoded `/`) in each name is removed.
 # NOTE(review): the s/// argument assumes the Perl-expression `rename`
 # utility, not the util-linux one -- confirm on the target system.
 rename-jpg :
 	rename 's/^250dpi\/.*%2F/250dpi\//' 250dpi/*.jpg

 # Remove everything the other targets generate: the link list, the index
 # page, the TSV index and the whole 250dpi/ download directory.
 clean :
 	rm -f 250dpi.txt 250k_index.html 250k_index.tsv
 	rm -rf 250dpi
	#!/usr/bin/python

	# This script is licensed CC0 by Andrew Harvey <[email protected]>
	#
	# To the extent possible under law, the person who associated CC0
	# with this work has waived all copyright and related or neighboring
	# rights to this work.
	# http://creativecommons.org/publicdomain/zero/1.0/

	import re
	import sys
	from bs4 import BeautifulSoup

	# define function to return the first match of a re.search
	def assignFirstIfExists(i):
	    """Return capture group 1 of a regex match, or "" when nothing matched.

	    i -- the result of a re.search() call: a match object, or None.
	    """
	    # "is None" is the idiomatic identity test and cannot be fooled by __eq__
	    if i is None:
	        return ""
	    return i.group(1)

	def parseHtmlDocument(html):
	    """Parse one HTML document and print its map metadata as one TSV row.

	    html -- the full text of a single HTML document.

	    Prints five tab-separated fields: jpg file name, map ID(s), map tile(s),
	    edition and publication year.  A field that cannot be found is printed
	    as an empty string instead of aborting the whole run.
	    """
	    # set up BS
	    # NOTE(review): no parser is named, so bs4 picks whichever one is
	    # installed -- consider pinning one for reproducible output
	    soup = BeautifulSoup(html)

	    # get the text within the menu tag (the last one wins if there are
	    # several); start from "" so a page without <menu> still yields a row
	    menuText = ""
	    for menu in soup.find_all("menu"):
	        menuText = menu.get_text()

	    # find the href value for the link which has anchor text of "250 DPI";
	    # a missing link or a link without href leaves the URL empty
	    imageURL = ""
	    for link in soup.find_all("a", text="250 DPI"):
	        imageURL = link.get('href') or ""

	    # extract just the .jpg file name from the URL; the capture group must
	    # be "(.*)" to take the whole file name rather than a single character
	    jpgFile = assignFirstIfExists(
	        re.search(r'^.*250dpi\/(.*)&slowConnection', imageURL))

	    # extract the elements within the menu, in output column order; each
	    # "(.*)" captures the rest of the line after the label
	    menuPatterns = (
	        r'Map ID\(s\):\s*(.*)',
	        r'Map Tile\(s\):\s*(.*)',
	        r'Edition:\s*(.*)\s*',
	        r'Publication Year:\s*(.*)\s*',
	    )
	    fields = [jpgFile]
	    for pattern in menuPatterns:
	        found = re.search(pattern, menuText)
	        fields.append(assignFirstIfExists(found).strip())

	    # print the result as tab delimited values
	    print ('\t'.join(fields))

	# main
	# stdin can contain many HTML files concatenated together, so lines are
	# collected until a line starting with </html> and each complete document is
	# handed to the parser.  The lines are buffered in a list and joined once per
	# document, which avoids re-copying a growing string on every line read.
	# Anything after the last </html> line is never parsed.
	currentHtmlLines = [] # lines of the "current" HTML document
	for line in sys.stdin: # read lines from stdin
	    currentHtmlLines.append(line) # add to the current HTML document
	    if line.startswith('</html>'): # end of the html document reached
	        parseHtmlDocument("".join(currentHtmlLines)) # parse it
	        currentHtmlLines = [] # and reset the current HTML document
	## About
	# This script will download the maps and associated metadata for 250k
	# Geological Maps at http://www.geoscience.gov.au/geoportal-geologicalmaps/
	#
	# Running `make all` shall suffice to run this script.

	# ## License
	# This script is licensed CC0 by Andrew Harvey <[email protected]>
	#
	# To the extent possible under law, the person who associated CC0
	# with this work has waived all copyright and related or neighboring
	# rights to this work.
	# http://creativecommons.org/publicdomain/zero/1.0/

	# None of the targets below names a file that its recipe creates, so they
	# are declared phony: a stray file called e.g. `clean` cannot mask them.
	.PHONY: all clean download-html-index parse-jpg-links build-index download-jpg rename-jpg

	# Full pipeline.  The prerequisites are listed in the order they must run
	# (each step consumes files written by an earlier one), so this relies on
	# make processing them left to right -- do not run it with `make -j`.
	all: clean download-html-index parse-jpg-links build-index download-jpg rename-jpg

	# Fetch the map-sheet index page from the geoscience.gov.au MapServer CGI and
	# save it as 250k_index.html, the input of parse-jpg-links and build-index.
	# qstring=%2F%2F decodes to `//`; presumably an empty pattern that selects
	# every sheet of the map250 layer -- TODO confirm.
	download-html-index:
		wget -O 250k_index.html 'http://www.geoscience.gov.au/cgi-bin/mapserv?mapsize=450+450&mapext=-2200000.000000+-5250000.000000+2100000.000000+-950000.00000&map=%2Fnas%2Fweb%2Fops%2Fprod%2Fapps_www-c%2Fmapserver%2Fgeoportal-geologicalmaps%2Findex.map&mode=itemnquery&layer=map250&qlayer=map250&qitem=qmapname&map_map250_query_template=sheetindex.html&qstring=%2F%2F'

	# Extract the 250 DPI download links from the index page and write them, as
	# absolute URLs, to 250dpi.txt.  The stages are joined by real shell pipes
	# (an escaped `\|` would be passed on as a literal argument), grep reads the
	# prerequisite directly, and the dot in `\.jpg` only matches a literal dot.
	parse-jpg-links: 250k_index.html
		grep 'download?map' $< | grep -o '/geoportal-geologicalmaps/download?map=250dpi/.*\.jpg' | sed 's/^/http:\/\/www.geoscience.gov.au/' > 250dpi.txt

	# Feed the downloaded index page to index_parser.py on stdin and store the
	# rows it prints in 250k_index.tsv.
	build-index : 250k_index.html
		./index_parser.py < $< > 250k_index.tsv

	# Download every URL listed in 250dpi.txt into the 250dpi/ directory.
	download-jpg : 250dpi.txt
		wget --directory-prefix=250dpi -i $<

	# Shorten the downloaded file names: everything between `250dpi/` and the
	# last `%2F` (a URL-encoded `/`) in each name is removed.  The `.*` is needed
	# to span the whole encoded path, and the `*.jpg` glob selects every
	# downloaded image rather than a single file literally named `.jpg`.
	# NOTE(review): the s/// argument assumes the Perl-expression `rename`
	# utility, not the util-linux one -- confirm on the target system.
	rename-jpg :
		rename 's/^250dpi\/.*%2F/250dpi\//' 250dpi/*.jpg

	# Remove everything the other targets generate: the link list, the index
	# page, the TSV index and the whole 250dpi/ download directory.
	clean :
		rm -f 250dpi.txt 250k_index.html 250k_index.tsv
		rm -rf 250dpi