Last active
May 22, 2019 21:46
-
-
Save tgherzog/57f316b1ad01021d2d0777a739105bd8 to your computer and use it in GitHub Desktop.
terre-biodiv data uploader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Upload files to a terre-biodiv s3 bucket | |
The first form uploads files from a local directory to an S3 bucket | |
The second form uses an S3 bucket as the source | |
Either BUCKET or SRCBUCKET can include a path prefix to control the | |
copy operation, e.g., wbg-terre-biodiv/data | |
Usage: | |
terre-biodiv.py [--config=YAML] [--test] [--report] [--no-warnings] [--profile=NAME] [--root=DIRECTORY] BUCKET | |
terre-biodiv.py [--config=YAML] [--test] [--report] [--no-warnings] [--profile=NAME] SRCBUCKET BUCKET | |
Options: | |
--config=YAML Config file [default: terre-biodiv.yaml] | |
--test Report operations only: don't upload | |
--no-warnings Don't warn if non-matching files are encountered | |
--profile=NAME AWS profile from .aws/credentials [default: default] | |
--root=DIRECTORY Root directory to traverse [default: .] | |
--report Provide a detailed report | |
""" | |
import os | |
import yaml | |
import boto3 | |
import re | |
import sys | |
from docopt import docopt | |
# Parse the command line against the usage text in the module docstring.
# The resulting option dict is read (and partly rewritten) by main() and
# read by s3key().
config = docopt(__doc__)

# file_pattern_screen determines which files/keys are recognized and copied.
# Groups: (1) three word characters, returned by country_id();
#         (2) three digits;
#         (3) underscore-free text that s3key() looks up in the YAML map;
#         (4) an optional "_<digits>" suffix.
# The dot before "tif" is escaped so that only a literal ".tif" extension
# matches (an unescaped dot would accept any character in that position).
file_pattern_screen = r'^(\w{3}) (\d{3}) ([^_]+)(_\d+)?\.tif$'
def main():
    """Copy recognized .tif files into the destination bucket and summarize.

    Reads the filename map from the YAML file named by --config, then either
    scans SRCBUCKET (bucket-to-bucket copy) or walks --root (local upload).
    Each file name is screened and renamed by s3key(); names it rejects are
    skipped. With --test, nothing is transferred but files are still counted.
    A one-line summary is always printed; --report adds per-country counts.

    Side effects: rebinds the module-level ``filemap`` and replaces
    config['BUCKET'] / config['SRCBUCKET'] with the dicts from bucket_info().
    """
    global config, filemap

    # First load the filename remapping matrix.
    # NOTE: safe_load is used because the config only needs plain mappings;
    # yaml.load without an explicit Loader can construct arbitrary objects and
    # is rejected by recent PyYAML releases. An empty file yields an empty map.
    with open(config['--config'], 'r') as fd:
        filemap = yaml.safe_load(fd) or {}

    # Convert to bucket name and path prefix.
    config['BUCKET'] = bucket_info(config['BUCKET'])
    if config['SRCBUCKET']:
        config['SRCBUCKET'] = bucket_info(config['SRCBUCKET'])

    # A dry run over a local directory never talks to AWS, so no client is
    # needed; a dry run over a source bucket still has to list its keys.
    if config['--test'] and not config['SRCBUCKET']:
        s3 = None
    else:
        session = boto3.session.Session(profile_name=config['--profile'])
        s3 = session.client('s3')

    status = {'files': 0, 'size': 0, 'transferred': 0, 'errors': 0, 'countries': {}}
    dest_bucket = config['BUCKET']['bucket']

    if config['SRCBUCKET']:
        # Scan a bucket, one page of keys at a time.
        src_bucket = config['SRCBUCKET']['bucket']
        params = {'Bucket': src_bucket, 'Prefix': config['SRCBUCKET']['prefix'], 'MaxKeys': 100}
        while True:
            response = s3.list_objects_v2(**params)
            for elem in response.get('Contents') or []:
                key = elem['Key']
                filename = os.path.basename(key)
                aws_key = s3key(filename)
                if not aws_key:
                    continue

                status['files'] += 1
                status['size'] += elem['Size']
                print('Copying s3://{}/{} to s3://{}/{}'.format(src_bucket, key, dest_bucket, aws_key))
                iso3 = country_id(filename)
                if config['--test']:
                    tracker(status, iso3)
                    continue

                try:
                    # Pass CopySource as a dict rather than a hand-built
                    # "bucket/key" string: the expected file names contain
                    # spaces, and the dict form leaves encoding to boto3.
                    s3.copy_object(Bucket=dest_bucket,
                                   CopySource={'Bucket': src_bucket, 'Key': key},
                                   Key=aws_key)
                except Exception as err:
                    # Best effort: report the failure and keep going.
                    sys.stderr.write(str(err) + '\n')
                    status['errors'] += 1
                else:
                    status['transferred'] += elem['Size']
                    tracker(status, iso3)

            if not response.get('IsTruncated'):
                break
            # Only request another page when the service says there is one.
            params['ContinuationToken'] = response['NextContinuationToken']
    else:
        # Scan a local directory.
        for curdir, subdirs, files in os.walk(config['--root']):
            for key in files:
                aws_key = s3key(key)
                if not aws_key:
                    continue

                fullpath = os.path.join(curdir, key)
                size = os.path.getsize(fullpath)
                status['files'] += 1
                status['size'] += size
                print('Uploading {} to s3://{}/{}'.format(fullpath, dest_bucket, aws_key))
                iso3 = country_id(key)
                if config['--test']:
                    tracker(status, iso3)
                    continue

                try:
                    s3.upload_file(fullpath, dest_bucket, aws_key)
                except boto3.exceptions.S3UploadFailedError as err:
                    sys.stderr.write(str(err) + '\n')
                    status['errors'] += 1
                else:
                    status['transferred'] += size
                    tracker(status, iso3)

    # Summary. Single-argument print(...) behaves the same on Python 2 and 3.
    print('Done: files: {}, size: {}, transferred: {}, errors: {}'.format(
        status['files'], hsz(status['size']), hsz(status['transferred']), status['errors']))

    if config['--report']:
        print('File counts by country:')
        for iso3 in sorted(status['countries']):
            print(' {} {}'.format(iso3, status['countries'][iso3]))
def hsz(size):
    """Format a byte count as a human-readable string.

    The value is divided by 1024 until it drops below 1024 and is rendered
    with one decimal place followed by the unit ('' for plain bytes, then
    Kb, Mb, Gb, Tb, Pb). Values too large for the unit table are expressed in
    the largest unit, so the result is always a string.
    """
    units = ['', 'Kb', 'Mb', 'Gb', 'Tb', 'Pb']
    for unit in units[:-1]:
        if abs(size) < 1024.0:
            return '{:.1f}{}'.format(size, unit)
        size /= 1024.0
    # Anything left over, however large, is reported in the largest unit
    # instead of falling back to the raw unformatted number.
    return '{:.1f}{}'.format(size, units[-1])
def s3key(filename):
    '''Map a file name to its destination S3 key.

    Returns the key string, or None when the file should not be processed:
    either it is not a .tif file (silently skipped), or its name does not fit
    file_pattern_screen / has no entry in the YAML filename map (reported on
    stderr unless --no-warnings was given).
    '''
    extension = os.path.splitext(filename)[1]
    if extension.lower() != '.tif':
        return None

    # Example filename: "XXX YYY Amphibians Result-indstwisaac.tif"
    #              or: "XXX YYY Amphibians Result-indstwisaac_3.tif"
    # XXX and YYY should be a 3-character ASCII and numeric segment
    # respectively; the remainder must match a pattern in the yaml file.
    match = re.match(file_pattern_screen, filename)
    mapped = filemap.get(match.group(3).lower()) if match else None
    if not mapped:
        if not config['--no-warnings']:
            sys.stderr.write('Unrecognized file name pattern: {}\n'.format(filename))
        return None

    # Key layout: <prefix><XXX>/<YYY>-<mapped name><optional _N suffix>.tif
    return '{}{}/{}-{}{}.tif'.format(
        config['BUCKET']['prefix'],
        match.group(1),
        match.group(2),
        mapped,
        match.group(4) or '',
    )
def country_id(filename):
    '''Return the leading three-character segment of a recognized file name.

    The segment is the first group of file_pattern_screen; None is returned
    when the name does not match that pattern.
    '''
    match = re.match(file_pattern_screen, filename)
    if match is None:
        return None
    return match.group(1)
def tracker(status, iso3):
    '''Count one more processed file for the country code iso3.

    status is the running-totals dict built in main(); its 'countries' entry
    maps each code to the number of files seen so far and is updated in place.
    '''
    counts = status['countries']
    # A missing (or falsy) entry counts as zero, so the first file yields 1.
    counts[iso3] = (counts.get(iso3) or 0) + 1
def bucket_info(bucket):
    '''Split a "bucket[/path/prefix]" argument into its two parts.

    Returns {'bucket': <name>, 'prefix': <prefix>}. The prefix is '' when no
    path was given; otherwise it always ends with '/', so a key can be
    appended to it directly. A leading "s3://" scheme is accepted and ignored.
    '''
    scheme = 's3://'
    if bucket.startswith(scheme):
        # Bucket names cannot contain ':' so this can only be a URI scheme.
        bucket = bucket[len(scheme):]

    name, _, prefix = bucket.partition('/')
    if prefix and not prefix.endswith('/'):
        prefix += '/'
    return {'bucket': name, 'prefix': prefix}
# Run the uploader only when this file is executed as a script.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
all specie total count: allspecies-totalcount | |
amphibians.total count: amphibians-totalcount | |
birds.total count: birds-totalcount | |
mammals.total count: mammals-totalcount | |
reptiles.total count: reptiles-totalcount | |
all specie encr count: allspecies-encrcount | |
amphibians.encr count: amphibians-encrcount | |
birds.encr count: birds-encrcount | |
mammals.encr count: mammals-encrcount | |
reptiles.encr count: reptiles-encrcount | |
result-indstwisaac: allspecies-extisaac | |
amphibians result-indstwisaac: amphibians-extisaac | |
birds result-indstwisaac: birds-extisaac | |
mammals result-indstwisaac: mammals-extisaac | |
reptiles result-indstwisaac: reptiles-extisaac | |
result-indstwi50: allspecies-extmooers50 | |
amphibians result-indstwi50: amphibians-extmooers50 | |
birds result-indstwi50: birds-extmooers50 | |
mammals result-indstwi50: mammals-extmooers50 | |
reptiles result-indstwi50: reptiles-extmooers50 | |
result-indstwi100: allspecies-extmooers100 | |
amphibians result-indstwi100: amphibians-extmooers100 | |
birds result-indstwi100: birds-extmooers100 | |
mammals result-indstwi100: mammals-extmooers100 | |
reptiles result-indstwi100: reptiles-extmooers100 | |
result-indstwi500: allspecies-extmooers500 | |
amphibians result-indstwi500: amphibians-extmooers500 | |
birds result-indstwi500: birds-extmooers500 | |
mammals result-indstwi500: mammals-extmooers500 | |
reptiles result-indstwi500: reptiles-extmooers500 | |
all specie total endem: allspecies-endemicity | |
amphibians.total endem: amphibians-endemicity | |
birds.total endem: birds-endemicity | |
mammals.total endem: mammals-endemicity | |
reptiles.total endem: reptiles-endemicity | |
ecoregions.total endem: ecoregion-vulnerability |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment