Last active
August 29, 2015 14:14
-
-
Save tingletech/89e40d59c07e717f7c2a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
from pprint import pprint as pp | |
import sys | |
import os | |
import re | |
import scandir | |
# each file needs to be in its own directory
# BASE1: the flat directory whose entries main() scans and links from.
BASE1 = u'/home/ec2-user/merge'
# BASE2: the root under which s3_key_to_fullpath() builds one directory per
# file.
BASE2 = u'/home/ec2-user/merge2'
def main(argv=None):
    """Hard-link each regular file in BASE1 into its own directory under BASE2.

    For every file entry found directly in BASE1, the destination directory
    and path are computed by s3_key_to_fullpath() and the link is made by
    create_link().

    argv is accepted for the usual main(argv) calling convention but is not
    used. Returns None.
    """
    # find all the files in the directory
    for dir_entry in scandir.scandir(BASE1):
        # is_file is a method: it has to be called. The bare attribute is a
        # bound method object, which is always truthy, so subdirectories
        # used to fall through to os.link().
        if not dir_entry.is_file():
            continue
        subdir, hardlink = s3_key_to_fullpath(dir_entry.name)
        linksource = os.path.join(BASE1, dir_entry.name)
        create_link(subdir, linksource, hardlink)
def s3_key_to_fullpath(name):
    """Return (subdir, fullpath) for a file name, both rooted at BASE2.

    The stem is name without its final extension. subdir is
    BASE2/<last two characters of the stem>/<stem>, and fullpath is name
    inside that directory.
    """
    ark_name = os.path.splitext(name)[0]
    subdir = os.path.join(BASE2, ark_name[-2:], ark_name)
    # subdir already begins with BASE2. Joining BASE2 in front of it again
    # only worked because an absolute second component makes os.path.join
    # discard the first; with a relative BASE2 the root would be doubled.
    fullpath = os.path.join(subdir, name)
    return subdir, fullpath
def create_link(subdir, linksource, hardlink):
    """Hard-link linksource to hardlink unless hardlink is already a file.

    subdir is created first (parents included) with _mkdir(). After the
    link is made, the three paths are printed on one line separated by
    spaces.
    """
    if os.path.isfile(hardlink):
        return
    _mkdir(subdir)
    os.link(linksource, hardlink)
    # NOTE(review): the report is assumed to belong to the "link created"
    # branch - confirm against the original indentation.
    # A single pre-formatted argument prints the same text whether print is
    # the Python 2 statement or the Python 3 function.
    print("%s %s %s" % (subdir, linksource, hardlink))
# http://code.activestate.com/recipes/82465-a-friendly-mkdir/
def _mkdir(newdir):
    """Create newdir, building any missing parent directories first.

    - an existing directory is left untouched
    - a regular file at that path raises OSError
    - parents that do not exist yet are created recursively
    """
    if os.path.isdir(newdir):
        return
    if os.path.isfile(newdir):
        raise OSError("a file with the same name as the desired " \
                      "dir, '%s', already exists." % newdir)
    parent, leaf = os.path.split(newdir)
    if parent and not os.path.isdir(parent):
        _mkdir(parent)
    # A path ending in a separator splits to an empty leaf; its directory
    # was already created by the recursive call above.
    if leaf:
        os.mkdir(newdir)
# Script entry point: run main() and hand its return value to sys.exit().
if __name__ == "__main__":
    sys.exit(main())
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
from pprint import pprint as pp | |
import sys | |
import os | |
import re | |
import argparse | |
import urlparse | |
import boto | |
import re | |
def main(argv=None):
    """Download EAC XML files from an S3 bucket into a local directory tree.

    Command line: bucket local_dir [infile] [--pull_all]

    - bucket: s3://bucket[/optional/path]
    - local_dir: root of the local tree the files are written under
    - infile: file with one ARK (or file name) per line; defaults to stdin
    - --pull_all: ignore infile and fetch every key under the bucket path

    argv may be None (arguments are read from sys.argv), a list of argument
    strings, or an already-parsed argparse.Namespace. Returns None.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'bucket',
        nargs=1,
        help="s3://bucket[/optional/path] where the EAC XML are"
    )
    parser.add_argument('local_dir', nargs=1)
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin)
    parser.add_argument('--pull_all', dest='all', action='store_true')

    if isinstance(argv, argparse.Namespace):
        # Kept for callers that pass pre-parsed arguments.
        args = argv
    else:
        # parse_args(None) reads sys.argv[1:]. An explicit list used to be
        # left unparsed and then failed on the first attribute access.
        args = parser.parse_args(argv)

    bucket_url = args.bucket[0]
    local_dir = args.local_dir[0]

    parts = urlparse.urlsplit(bucket_url)
    # SplitResult
    # (scheme='s3', netloc='test.pdf', path='/dkd', query='', fragment='')
    s3 = boto.connect_s3()
    bucket = s3.get_bucket(parts.netloc)

    if args.all:
        pull_all(bucket, bucket_url, local_dir)
        return

    for line in args.infile:
        info = get_info(bucket_url, local_dir, line.strip("\n"))
        if not info:
            # the line did not parse as an ARK
            continue
        key_name = os.path.join(parts.path, info['filename']).strip("/")
        key = bucket.get_key(key_name)
        if key:
            _mkdir(info['subdir'])
            key.get_contents_to_filename(info['localpath'])
def get_info(bucket, localdir, string):
    """Describe where the ARK named in string lives remotely and locally.

    Returns a dict with the keys "filename" (<naan>-<part>.xml), "subdir"
    and "localpath" (both from parse_to_fullpath under localdir), or None
    when string cannot be parsed. bucket is accepted but not used.
    """
    try:
        naan, part = parse_ark(string)
    except TypeError:
        # parse_ark() gives None when nothing matches, and unpacking None
        # raises TypeError.
        return None
    subdir, localpath = parse_to_fullpath(naan, part, localdir)
    return dict(
        filename='{0}-{1}.xml'.format(naan, part),
        subdir=subdir,
        localpath=localpath
    )
def pull_all(bucket, bucketurl, localdir):
    """Download every parseable key under the bucket URL's path.

    bucket is the bucket object to list, bucketurl the s3://bucket/path URL
    whose path (minus the leading slash) selects the keys, and localdir the
    root of the local tree. Keys whose names do not parse as an ARK are
    skipped. Returns None.
    """
    prefix = urlparse.urlsplit(bucketurl).path[1:]
    # Passing the prefix to list() replaces the startswith() test that used
    # to run on every key in the bucket.
    for key in bucket.list(prefix=prefix):
        info = get_info(bucketurl, localdir, key.name)
        if not info:
            # get_info() returns None for names without an ARK; indexing
            # that None used to raise TypeError and abort the whole run.
            continue
        _mkdir(info['subdir'])
        key.get_contents_to_filename(info['localpath'])
# 99166-w600735z.xml
# ark:/99166/w600735z
#
def parse_ark(string):
    """Extract (naan, part) from an ARK or an ARK-derived file name.

    naan is five digits followed by "-" or "/"; part is the run of
    lowercase letters and digits after that separator (possibly empty).
    When several candidates exist the last one in string is used. Returns
    None when nothing matches.
    """
    found = re.match(r'.*(\d\d\d\d\d)(?:-|/)([a-z0-9]*)', string)
    if found is None:
        return None
    return found.group(1, 2)
def parse_to_fullpath(naan, part, BASE2):
    """Return (subdir, fullpath) for an ARK under the root directory BASE2.

    The ARK name is "<naan>-<part>". subdir is
    BASE2/<last two characters of the ARK name>/<ARK name>, and fullpath is
    "<ARK name>.xml" inside that directory.
    """
    ark_name = '-'.join([naan, part])
    subdir = os.path.join(BASE2, ark_name[-2:], ark_name)
    # subdir already starts with BASE2. Prefixing BASE2 again doubled a
    # relative root (out/out/...), so the file path pointed outside the
    # directory that gets created.
    fullpath = os.path.join(subdir, '{0}.xml'.format(ark_name))
    return subdir, fullpath
# http://code.activestate.com/recipes/82465-a-friendly-mkdir/
def _mkdir(newdir):
    """Make the directory newdir, creating missing ancestors on the way.

    - nothing happens when newdir is already a directory
    - OSError is raised when a regular file occupies the path
    - ancestors that are missing are made before newdir itself
    """
    if os.path.isdir(newdir):
        return
    if os.path.isfile(newdir):
        raise OSError("a file with the same name as the desired " \
                      "dir, '%s', already exists." % newdir)
    ancestor, basename = os.path.split(newdir)
    needs_ancestor = bool(ancestor) and not os.path.isdir(ancestor)
    if needs_ancestor:
        _mkdir(ancestor)
    # An empty basename means newdir ended in a separator; the recursive
    # call above has already made that directory.
    if basename:
        os.mkdir(newdir)
# Script entry point: run main() and hand its return value to sys.exit().
if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment