Last active
April 6, 2020 08:51
-
-
Save Ajnasz/1bf40f2f5cea57fef03a9899e29ffb99 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Copy flickr photos to folder | |
options: | |
-s upload to s3; note you will need to set AWS_SECRET_ACCESS_KEY,
AWS_ACCESS_KEY_ID and AWS_ENDPOINT_URL environment variables if you want to do
this | |
-d directoryName The directory where the photos listed | |
""" | |
import time | |
import json | |
import mimetypes | |
from os import listdir, makedirs, environ | |
from os.path import isfile, join, getsize | |
import re | |
from shutil import copyfile | |
import sys | |
import getopt | |
import boto3 | |
import progressbar | |
# Captures the numeric flickr id from names like "title_123456_o.jpg".
# Fixed: "(:?_o)" was a typo for the non-capturing group "(?:_o)" — the
# original would also have accepted a stray ":" before "_o".
ID_REGEXP = re.compile(r'^.+_([0-9]+)(?:_o)?\.(?:jpg|png|jpeg|mp4|gif)$')
CACHE = dict()
# AWS settings are only needed for the -s/--s3 mode; use .get() so the
# script no longer crashes with KeyError at import time when someone just
# wants a plain directory copy and has none of these variables set.
AWS_SECRET_ACCESS_KEY = environ.get('AWS_SECRET_ACCESS_KEY')
AWS_ACCESS_KEY_ID = environ.get('AWS_ACCESS_KEY_ID')
ENDPOINT_URL = environ.get('AWS_ENDPOINT_URL')
# Mutable run-time configuration, filled in by main() from the CLI options.
CONFIGURATION = {
    'use_s3': False,            # -s: upload to s3 instead of copying locally
    'config_folder': './data',  # -c: folder holding the flickr JSON exports
    'directories': list(),      # -d: photo source directories (repeatable)
    'bucket': 'ajnasz-photos',  # -b: target s3 bucket
    'last_upload': 0,           # timestamp of the previous upload (rate limit)
}
def limit_request_rate():
    """
    Throttle uploads: ensure at least 5 seconds pass between two uploads.

    Fixes two defects in the original: it compared against 6 while the
    docstring promised 5, and it slept for the *elapsed* time instead of
    the *remaining* time (so a very recent upload slept almost nothing).
    """
    elapsed = time.time() - CONFIGURATION['last_upload']
    if elapsed < 5:
        # Sleep only for the part of the 5-second window still left.
        time.sleep(5 - elapsed)
    CONFIGURATION['last_upload'] = time.time()
def get_flickr_id(file_name):
    """
    Extract the flickr photo id from a filename.

    Returns the id as a string, or None when the name does not match.
    Fixed: the original tested ``id is None`` — the *builtin* ``id``,
    which is never None — so unmatched files fell through and crashed on
    ``id_match.group(1)``.
    """
    id_match = ID_REGEXP.search(file_name)
    if id_match is None:
        return None
    return id_match.group(1)
def get_file_config(flicker_id):
    """
    Load the flickr metadata JSON for a photo id from the config folder.

    Reads ``<config_folder>/photo_<id>.json`` and returns the parsed dict.
    The explicit ``close()`` inside the ``with`` block was redundant (the
    context manager already closes the file) and has been removed.
    """
    path = '%s/photo_%s.json' % (CONFIGURATION['config_folder'], flicker_id)
    with open(path) as photo_file:
        return json.load(photo_file)
def is_photo_of_album(photo_id, album):
    """
    Tell whether the photo with the given id belongs to the album.

    The album dict is expected to carry its member ids under 'photos'.
    """
    album_photos = album['photos']
    return photo_id in album_photos
def get_albums():
    """
    Return the list of all albums, loading and caching it on first use.

    Fixed: the path was hard-coded to 'data/albums.json', silently
    ignoring the -c/--config-folder option; it now reads from
    CONFIGURATION['config_folder'] like get_file_config does (the default
    './data' resolves to the same file as before). The redundant
    ``close()`` inside the ``with`` block was also dropped.
    """
    albums_key = 'albums'
    if albums_key not in CACHE:
        path = '%s/albums.json' % CONFIGURATION['config_folder']
        with open(path) as albums:
            CACHE[albums_key] = json.load(albums)['albums']
    return CACHE[albums_key]
def get_album_of_photo(flickr_photo_id):
    """
    Return the first album that contains the photo, or None if no album
    lists it.
    """
    for candidate in get_albums():
        if is_photo_of_album(flickr_photo_id, candidate):
            return candidate
    return None
def get_album_name(album):
    """
    Return the album's title, or 'NoAlbum' for photos outside any album.
    """
    return 'NoAlbum' if album is None else album['title']
def copy_to_directory(destination_directory, directory, data):
    """
    Copy one photo file into its album's sub-directory.

    :param destination_directory: root folder the album folders go under
    :param directory: source folder the photo currently lives in
    :param data: a (file_name, album) pair as produced by get_file_list;
                 album may be None (mapped to the 'NoAlbum' folder)
    """
    file_name, album = data
    album_path = '%s/%s' % (destination_directory, get_album_name(album))
    # exist_ok replaces the previous try/except FileExistsError dance.
    makedirs(album_path, exist_ok=True)
    file_path = join(directory, file_name)
    dest_path = join(album_path, file_name)
    copyfile(file_path, dest_path)
    print("copy %s to %s" % (file_path, dest_path))
def get_file_list(directory):
    """
    Pair every photo file in the directory with the album it belongs to.

    Returns a list of (file_name, album) tuples; album is None when the
    photo is in no album.
    """
    file_names = [entry for entry in listdir(directory)
                  if isfile(join(directory, entry))]
    albums = [
        get_album_of_photo(get_file_config(get_flickr_id(name))['id'])
        for name in file_names
    ]
    return list(zip(file_names, albums))
def _key_existing_size__list(client, bucket, key): | |
"""return the key's size if it exist, else None""" | |
response = client.list_objects_v2( | |
Bucket=bucket, | |
Prefix=key, | |
) | |
for obj in response.get('Contents', []): | |
if obj['Key'] == key: | |
return obj['Size'] | |
return 0 | |
def is_existing_file_on_s3(s3_client, file_name, file_size):
    """
    Return True when an object of the same key *and* size is already on s3.

    A size mismatch counts as "not existing" so a changed file gets
    re-uploaded.
    """
    remote_size = _key_existing_size__list(
        s3_client, CONFIGURATION['bucket'], file_name)
    return remote_size == file_size
def copy_to_s3(s3_client, directory_name, data):
    """
    copies files to s3

    Uploads one photo to the configured bucket under
    flickr/<sanitized album name>/<file name>, skipping files that are
    already present with the same size, rate-limiting uploads and showing
    a progress bar while the transfer runs.

    :param s3_client: a boto3 s3 client
    :param directory_name: local folder the photo lives in
    :param data: a (file_name, album) pair as produced by get_file_list
    """
    file_name = data[0]
    album = data[1]
    # Sanitize the album title: anything non-alphanumeric becomes '_'
    # so it is safe as an s3 key segment.
    album_name = re.sub(r'[^a-z0-9A-Z]', '_', get_album_name(album))
    album_path = '%s/%s' % ('flickr', album_name)
    dest_path = join(album_path, file_name)
    local_file_name = join(directory_name, file_name)
    # float, so the size comparison and the progress bar's maxval agree.
    file_size = float(getsize(local_file_name))
    # Same key + same size on s3 => assume already uploaded and skip.
    if is_existing_file_on_s3(s3_client, dest_path, file_size):
        print("skip file %s" % dest_path)
        return
    # NOTE(review): guess_type may return (None, None) for an unknown
    # extension, which would send ContentType=None below — confirm the
    # endpoint tolerates that.
    mime_type = mimetypes.guess_type(local_file_name)
    # Enforce the minimum delay between uploads before starting this one.
    limit_request_rate()
    widgets = [
        file_name,
        progressbar.widgets.Percentage(),
        progressbar.widgets.Bar(),
        progressbar.widgets.FileTransferSpeed(),
    ]
    up_proc = progressbar.ProgressBar(maxval=file_size, widgets=widgets)
    def progress_file_upload(sent_size):
        # boto3 calls this with the bytes sent since the last callback,
        # so accumulate onto the bar's current value.
        up_proc.update(up_proc.currval + sent_size)
    up_proc.start()
    s3_client.upload_file(local_file_name,
                          CONFIGURATION['bucket'],
                          dest_path,
                          Callback=progress_file_upload,
                          ExtraArgs={
                              'ContentType': mime_type[0]
                          }
                          )
def copy_file(is_s3, directory_name, data):
    """
    Dispatch one photo: upload it to s3 when is_s3 is set, otherwise copy
    it into the local 'albums' folder.
    """
    if not is_s3:
        copy_to_directory('albums', directory_name, data)
        return
    s3_client = boto3.client(
        's3',
        endpoint_url=ENDPOINT_URL,
        region_name='ams3',
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        aws_access_key_id=AWS_ACCESS_KEY_ID)
    copy_to_s3(s3_client, directory_name, data)
def main(args):
    """
    Parse command line options into CONFIGURATION, then copy every photo
    found in the requested directories (to s3 with -s, locally otherwise).

    :param args: the full argv list, program name included
    Exits with status 2 on an unrecognised option.
    """
    try:
        # Fixed: 'bucket' lacked the trailing '=' even though -b takes an
        # argument, so the long form --bucket=NAME raised GetoptError.
        opts, args = getopt.getopt(args[1:], 'sd:c:b:', [
            's3', 'directories=', 'config-folder=', 'bucket='
        ])
    except getopt.GetoptError as err:
        print(err)
        sys.exit(2)
    for option, argument in opts:
        if option in ("-s", "--s3"):
            CONFIGURATION['use_s3'] = True
        elif option in ("-d", "--directories"):
            CONFIGURATION['directories'].append(argument)
        elif option in ("-c", "--config-folder"):
            CONFIGURATION['config_folder'] = argument
        elif option in ("-b", "--bucket"):
            CONFIGURATION['bucket'] = argument
        else:
            assert False, "unhandled option"
    for directory_name in CONFIGURATION['directories']:
        for file_data in get_file_list(directory_name):
            copy_file(CONFIGURATION['use_s3'], directory_name, file_data)


if __name__ == '__main__':
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment