Fills out and submits a form on the NODC.NOAA website and extracts download links from the resulting page. The links can then be downloaded using download_from_url.py
"""Fetch WOA09 files from NODC.NOAA website via WOAselect""" | |
import re | |
import mechanize | |
import os | |
import urllib2 | |
import shutil | |
import time | |

def abs2relftp(url):
    """Replaces leading slash in absolute ftp url with '%2f'

    http://stackoverflow.com/questions/1162053/any-way-to-specify-absolute-paths-in-ftp-urls
    http://tools.ietf.org/html/draft-casey-url-ftp-00"""
    return '/'.join(['/'.join(url.split('/')[:2]),
                     re.sub('/', '/%2f/', '/'.join(url.split('/')[2:]), count=1)])
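
# Illustrative example of the substitution above (hypothetical host and path):
#   abs2relftp('ftp://ftp.example.gov/pub/data/file.csv.gz')
#   -> 'ftp://ftp.example.gov/%2f/pub/data/file.csv.gz'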

def get_woa_links(parameters=dict(T=1, S=2)):
    """Fetch WOA09 files from NODC.NOAA website via WOAselect

    Parameters
    ----------
    parameters : dict
        Ocean parameters to download,
        mapping a parameter ID (used in file names etc.)
        to the parameter number in the web interface,
        e.g. {'T': 1} for temperature

    Functioning
    -----------
    Fills out and submits a form on the NODC.NOAA website
    http://www.nodc.noaa.gov/cgi-bin/OC5/SELECT/woaselect.pl
    and extracts download links on the resulting page. Stores
    the links together with target file names for later download."""
    # run through parameters (Temp, Sal, etc.) to download
    for parID, parN in parameters.iteritems():
        # create empty files
        with open('urls_{}_csv.txt'.format(parID), 'w') as f:
            pass
        with open('urls_{}_shp.txt'.format(parID), 'w') as f:
            pass
        # open browser
        br = mechanize.Browser()
        br.open('http://www.nodc.noaa.gov/cgi-bin/OC5/SELECT/woaselect.pl?parameter={}'.format(parN))
        #br.addheaders = [('User-agent','Mozilla/5.0')]
        # run through months and levels
        for mon in xrange(1, 12+1):
            for lev in xrange(1, 33+1):
                print 'Parameter {}, month {}, level {}'.format(parID, mon, lev)
                # fill data selection form
                print 'submitting values ...'
                br.select_form(name='submitform')
                br.form.set_all_readonly(False)
                br['north'] = '90'
                br['south'] = '-90'
                br['east'] = '180'
                br['west'] = '-180'
                br['grid'] = ['2']  # 1/4 grid
                br['figure_type'] = ['0']
                br['time_period'] = ['{}'.format(mon)]  # increment this from 1 through 12
                br['depth'] = ['{}'.format(lev)]  # increment from 1 through 33
                # submit values
                print 'waiting for online processing ...'
                datasite = br.submit()  #; print datasite.read()
                # define base file name
                fname = '{}_{:02d}_{:02d}'.format(parID, mon, lev)
                # store url to ASCII file
                url = br.find_link(text='ASCII').url
                fnm = fname + '.csv.gz'
                with open('urls_{}_csv.txt'.format(parID), 'a') as f:
                    f.write(','.join([url, fnm]) + '\n')
                # store url to ArcGIS file
                url = br.find_link(text='ArcGIS').url
                fnm = fname + '.tar.gz'
                with open('urls_{}_shp.txt'.format(parID), 'a') as f:
                    f.write(','.join([url, fnm]) + '\n')
                # return to previous page
                br.back()
        # close browser when done
        br.close()
    print '... done'
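
# The 'urls_*_csv.txt' and 'urls_*_shp.txt' files written above (and read by
# download_files below) contain one comma-separated pair per line:
# '<download url>,<target file name>'. Illustrative line (the actual URL is
# generated by the WOAselect server at request time):
#   http://www.nodc.noaa.gov/.../<server-generated file>.csv.gz,T_01_01.csv.gz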

def download(url, localFileName):
    req = urllib2.Request(url)
    r = urllib2.urlopen(req)
    with open(localFileName, 'wb') as f:
        shutil.copyfileobj(r, f)

def download_urls(urls_lst, dataDir, maxt=60, dt=10):
    print 'processing {} URLs'.format(len(urls_lst))
    done_url = []
    for k in xrange(len(urls_lst)):
        url = urls_lst[k][0]
        fnm = urls_lst[k][1].rstrip()
        fname = os.path.join(dataDir, fnm)
        if os.path.isfile(fname):
            print 'file {} exists'.format(fname)
            continue
        else:
            t = 0
            while t <= maxt:
                print 't={} {} --> {} ...'.format(t, url, fnm)
                try:
                    download(url, fname)
                except Exception:
                    print 'not successful. Waiting {}s'.format(dt)
                    time.sleep(dt)
                    t += dt
                else:
                    done_url.append(k)
                    break
            else:
                print 'URL {} could not be retrieved.'.format(url)
                raise IOError('could not retrieve {}'.format(url))
    return done_url
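
# Minimal usage sketch for download_urls (hypothetical URL and file name):
#   done = download_urls([['http://example.org/T_01_01.csv.gz', 'T_01_01.csv.gz']], '.')
# The return value lists the indices of the entries that were downloaded successfully.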

def download_files(dataDir='.', maxn=1, parIDs=['S', 'T']):
    """Download the files to *dataDir*, trying *maxn* times"""
    n = 0
    while n < maxn:
        for parID in parIDs:
            # read lines from URL files into a list
            urls_csv = []
            with open('urls_{}_csv.txt'.format(parID), 'r') as f:
                for line in f.readlines():
                    urls_csv.append(line.split(','))
            urls_shp = []
            with open('urls_{}_shp.txt'.format(parID), 'r') as f:
                for line in f.readlines():
                    urls_shp.append(line.split(','))
            # download ASCII files
            kdone = download_urls(urls_csv, dataDir)
            # download ArcGIS files
            kdone = download_urls(urls_shp, dataDir)
        n += 1
        if maxn > 1:
            # wait before next attempt
            print 'waiting for 2 hours ...'
            time.sleep(7200)

if __name__ == "__main__":
    get_woa_links(parameters=dict(T=1, S=2))
    download_files(parIDs=['S', 'T'])
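    # Illustrative alternative (not part of the original workflow): fetch and
    # download only temperature, storing the files in a 'data' subdirectory:
    #   get_woa_links(parameters=dict(T=1))
    #   download_files(dataDir='data', parIDs=['T'])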