Skip to content

Instantly share code, notes, and snippets.

@Ajnasz
Last active April 6, 2020 08:51
Show Gist options
  • Save Ajnasz/1bf40f2f5cea57fef03a9899e29ffb99 to your computer and use it in GitHub Desktop.
Save Ajnasz/1bf40f2f5cea57fef03a9899e29ffb99 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Copy flickr photos to folder

options:
-s upload to s3; note: you will need to set the AWS_SECRET_ACCESS_KEY,
AWS_ACCESS_KEY_ID and AWS_ENDPOINT_URL environment variables if you want
to do this
-d directoryName The directory where the photos are listed
"""
import time
import json
import mimetypes
from os import listdir, makedirs, environ
from os.path import isfile, join, getsize
import re
from shutil import copyfile
import sys
import getopt
import boto3
import progressbar
# Matches flickr file names such as "holiday_12345_o.jpg" and captures the
# numeric photo id.  "(?:_o)?" optionally swallows the "_o" (original-size)
# suffix.  The original spelled this group "(:?_o)?" -- a typo for the
# non-capturing form; group(1), the id, is unaffected by the fix.
ID_REGEXP = re.compile(r'^.+_([0-9]+)(?:_o)?\.(?:jpg|png|jpeg|mp4|gif)$')

# Process-wide memo cache (currently holds only the parsed albums.json).
CACHE = dict()

# AWS settings are only needed for -s/--s3 mode; use .get() so plain
# directory mode works without them (environ[...] raised KeyError at import
# time even when s3 was never requested, contradicting the module docstring).
AWS_SECRET_ACCESS_KEY = environ.get('AWS_SECRET_ACCESS_KEY')
AWS_ACCESS_KEY_ID = environ.get('AWS_ACCESS_KEY_ID')
ENDPOINT_URL = environ.get('AWS_ENDPOINT_URL')

# Runtime configuration, mutated by the command line parsing in main().
CONFIGURATION = {
    'use_s3': False,
    'config_folder': './data',
    'directories': list(),
    'bucket': 'ajnasz-photos',
    'last_upload': 0,  # unix timestamp of the last s3 upload (rate limiting)
}
def limit_request_rate(min_interval=6):
    """
    Rate-limit uploads: block until at least *min_interval* seconds have
    passed since the upload recorded in CONFIGURATION['last_upload'].

    Bug fix: the original slept for the already-*elapsed* time instead of
    the *remaining* time, so a request made right after the previous one
    barely waited at all; sleep the rest of the window instead.  The
    threshold keeps the original code's 6-second value (its docstring said
    5 -- the code always compared against 6).
    """
    now = time.time()
    diff = now - CONFIGURATION['last_upload']
    if diff < min_interval:
        time.sleep(min_interval - diff)
    CONFIGURATION['last_upload'] = time.time()
def get_flickr_id(file_name):
    """
    Extract the numeric flickr photo id from a file name such as
    "holiday_12345_o.jpg"; return None when the name carries no id.

    Bug fix: the original tested the *builtin* ``id`` against None instead
    of the match object, so the None-guard never triggered and non-matching
    names crashed with AttributeError on ``.group(1)``.
    """
    id_match = ID_REGEXP.search(file_name)
    if id_match is None:
        return None
    return id_match.group(1)
def get_file_config(flicker_id):
    """
    Load the flickr metadata json ("photo_<id>.json") for the given photo
    id from CONFIGURATION['config_folder'] and return the parsed dict.
    """
    path = '%s/photo_%s.json' % (CONFIGURATION['config_folder'], flicker_id)
    # The context manager closes the file; the explicit close() in the
    # original was redundant.
    with open(path) as photo_file:
        return json.load(photo_file)
def is_photo_of_album(photo_id, album):
    """
    Return True when the given photo id appears in the album's photo list.
    """
    album_photos = album['photos']
    return photo_id in album_photos
def get_albums():
    """
    Return the list of all albums, loading data/albums.json on first use
    and serving the parsed result from CACHE afterwards.
    """
    albums_key = 'albums'
    if albums_key not in CACHE:
        # NOTE(review): this path ignores CONFIGURATION['config_folder'],
        # unlike get_file_config -- presumably it should honour -c; confirm.
        # The context manager closes the file; the original's explicit
        # close() was redundant.
        with open('data/albums.json') as albums:
            CACHE[albums_key] = json.load(albums)['albums']
    return CACHE[albums_key]
def get_album_of_photo(flickr_photo_id):
    """
    Return the first album that contains the photo, or None when the photo
    belongs to no album.
    """
    for candidate in get_albums():
        if is_photo_of_album(flickr_photo_id, candidate):
            return candidate
    return None
def get_album_name(album):
    """
    Return the album's title, or the placeholder 'NoAlbum' for photos that
    are not part of any album (album is None).
    """
    return 'NoAlbum' if album is None else album['title']
def copy_to_directory(destination_directory, directory, data):
    """
    Copy one photo file from *directory* into
    <destination_directory>/<album name>/, creating the album directory on
    demand.

    data is a (file_name, album) pair as produced by get_file_list().
    """
    file_name, album = data
    album_name = get_album_name(album)
    album_path = '%s/%s' % (destination_directory, album_name)
    # exist_ok replaces the original try/except FileExistsError dance.
    makedirs(album_path, exist_ok=True)
    file_path = join(directory, file_name)
    dest_path = join(album_path, file_name)
    copyfile(file_path, dest_path)
    print("copy %s to %s" % (file_path, dest_path))
def get_file_list(directory):
    """
    Return [(file_name, album), ...] for every regular file in *directory*,
    resolving each file's flickr id, then its json config, then its album.
    """
    file_names = [entry for entry in listdir(directory)
                  if isfile(join(directory, entry))]
    # Lazy pipeline: each file is resolved id -> config -> album as zip
    # consumes it, exactly as the original chained map() calls did.
    flickr_ids = (get_flickr_id(name) for name in file_names)
    configs = (get_file_config(flickr_id) for flickr_id in flickr_ids)
    albums = (get_album_of_photo(config['id']) for config in configs)
    return list(zip(file_names, albums))
def _key_existing_size__list(client, bucket, key):
"""return the key's size if it exist, else None"""
response = client.list_objects_v2(
Bucket=bucket,
Prefix=key,
)
for obj in response.get('Contents', []):
if obj['Key'] == key:
return obj['Size']
return 0
def is_existing_file_on_s3(s3_client, file_name, file_size):
    """
    Return True when *file_name* already exists in the configured bucket
    with exactly *file_size* bytes (i.e. the upload can be skipped).
    """
    remote_size = _key_existing_size__list(
        s3_client, CONFIGURATION['bucket'], file_name)
    return remote_size == file_size
def copy_to_s3(s3_client, directory_name, data):
    """
    Upload one photo to s3 under flickr/<sanitized album name>/<file name>,
    skipping objects already present with the same size, rate limiting
    between uploads, and showing a progress bar during the transfer.

    data is a (file_name, album) pair as produced by get_file_list().
    """
    file_name = data[0]
    album = data[1]
    # Sanitize the album title so the s3 key segment contains only
    # [a-zA-Z0-9_] characters.
    album_name = re.sub(r'[^a-z0-9A-Z]', '_', get_album_name(album))
    album_path = '%s/%s' % ('flickr', album_name)
    dest_path = join(album_path, file_name)
    local_file_name = join(directory_name, file_name)
    file_size = float(getsize(local_file_name))
    # Same key with the same byte count already on s3: nothing to do.
    if is_existing_file_on_s3(s3_client, dest_path, file_size):
        print("skip file %s" % dest_path)
        return
    # guess_type returns a (type, encoding) tuple; [0] may be None for
    # unknown extensions -- NOTE(review): that would pass ContentType=None
    # to upload_file below; confirm this is intended.
    mime_type = mimetypes.guess_type(local_file_name)
    limit_request_rate()
    widgets = [
        file_name,
        progressbar.widgets.Percentage(),
        progressbar.widgets.Bar(),
        progressbar.widgets.FileTransferSpeed(),
    ]
    up_proc = progressbar.ProgressBar(maxval=file_size, widgets=widgets)

    def progress_file_upload(sent_size):
        # boto3 reports incremental byte counts; accumulate them so the bar
        # reflects total bytes sent so far.
        up_proc.update(up_proc.currval + sent_size)

    up_proc.start()
    s3_client.upload_file(local_file_name,
                          CONFIGURATION['bucket'],
                          dest_path,
                          Callback=progress_file_upload,
                          ExtraArgs={
                              'ContentType': mime_type[0]
                          })
def copy_file(is_s3, directory_name, data):
    """
    Dispatch one photo either to s3 (building a fresh client from the
    module-level AWS settings) or to the local 'albums' directory.
    """
    if not is_s3:
        copy_to_directory('albums', directory_name, data)
        return
    s3_client = boto3.client('s3',
                             endpoint_url=ENDPOINT_URL,
                             region_name='ams3',
                             aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                             aws_access_key_id=AWS_ACCESS_KEY_ID
                             )
    copy_to_s3(s3_client, directory_name, data)
def main(args):
    """
    Parse the command line options into CONFIGURATION, then copy every
    photo of every -d directory to its destination (s3 or local folder).

    args is the full sys.argv; argv[0] is skipped here.
    """
    try:
        # Bug fix: 'bucket' was listed without a trailing '=', so the long
        # form --bucket rejected the argument that -b ('b:') requires.
        opts, args = getopt.getopt(args[1:], 'sd:c:b:', [
            's3', 'directories=', 'config-folder=', 'bucket='
        ])
    except getopt.GetoptError as err:
        print(err)
        sys.exit(2)
    for option, argument in opts:
        if option in ("-s", "--s3"):
            CONFIGURATION['use_s3'] = True
        elif option in ("-d", "--directories"):
            CONFIGURATION['directories'].append(argument)
        elif option in ("-c", "--config-folder"):
            CONFIGURATION['config_folder'] = argument
        elif option in ("-b", "--bucket"):
            CONFIGURATION['bucket'] = argument
        else:
            assert False, "unhandled option"
    for directory_name in CONFIGURATION['directories']:
        for file_data in get_file_list(directory_name):
            copy_file(CONFIGURATION['use_s3'], directory_name, file_data)
# Script entry point: pass the full argv (main() skips argv[0] itself).
if __name__ == '__main__':
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment