Flickr Archive Utilities - MIT License
config.py:
""" | |
Configuration for Flickr Backup & Download and Google Upload. | |
""" | |
from collections import namedtuple | |
# fill in after first run, each will be loggeed | |
flickr_oauth_token = None | |
flickr_oauth_secret = None | |
google_refresh_token = None # only available at first run | |
google_access_token = None | |
# | |
# Fill in keys and secrets before first run | |
# | |
# API Keys from https://www.flickr.com/services/apps/by/me. | |
Flickr = namedtuple('Flickr', ('key', 'secret', 'oauth_token', 'oauth_secret')) | |
flickr = Flickr( | |
key='API_KEY', | |
secret='API_SECRET', | |
oauth_token=flickr_oauth_token, | |
oauth_secret=flickr_oauth_secret, | |
) | |
# id and secret from https://console.cloud.google.com/apis/credentials | |
Google = namedtuple('Google', ('client_id', 'secret', 'access_token', 'refresh_token')) | |
google = Google( | |
client_id='CLIENT_ID', | |
secret='SECRET', | |
access_token=google_access_token, | |
refresh_token=google_refresh_token, | |
) |
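
On the first run, flickr_backup.py and google_upload.py walk through their OAuth dances and print the resulting tokens (the oauth_token="..." / access_token="..." lines their get_session functions emit). Pasting those values back into the variables at the top of config.py makes later runs non-interactive. A minimal sketch with placeholder values, not real credentials:

flickr_oauth_token = '72157...'    # the value printed as oauth_token="..."
flickr_oauth_secret = 'abc123...'  # the value printed as oauth_secret="..."
google_access_token = 'ya29....'   # printed as access_token="..."
google_refresh_token = '1/....'    # printed as refresh_token="..." (first run only)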
flickr_backup.py:
#!/usr/bin/env python3
"""
Dump information about all of the authenticated user's photos to backup-$date.json:

[
    {
        "id": "12345",
        "owner": "12345@N00",
        "secret": "9876",
        "server": "119",
        "farm": 1,
        "title": "",
        "ispublic": 1,
        "isfriend": 0,
        "isfamily": 0,
        "description": {
            "_content": ""
        },
        "dateupload": "1160371553",
        "datetaken": "2006-10-08 18:34:44",
        "datetakengranularity": "0",
        "datetakenunknown": 0,
        "media": "photo",
        "media_status": "ready",
        "url_o": "https://farm1.staticflickr.com/119/12345_9876_o.jpg",
        "height_o": "2112",
        "width_o": "2816"
    }
]
"""
import datetime
import json

from requests_oauthlib import OAuth1Session

from config import flickr

authorization_base_url = 'https://www.flickr.com/services/oauth/authorize'
request_token_url = 'https://www.flickr.com/services/oauth/request_token'
access_token_url = 'https://www.flickr.com/services/oauth/access_token'


def time_from(ts):
    return datetime.datetime.fromtimestamp(int(ts)).strftime('%Y-%m-%d %H:%M:%S')


def auth1():
    session = OAuth1Session(flickr.key, client_secret=flickr.secret, callback_uri='http://127.0.0.1:8000')
    session.fetch_request_token(request_token_url, params={'perms': 'read'})
    print('authorize:', session.authorization_url(authorization_base_url, perms='read'))
    redirect_response = input('Paste the full redirect URL here:')
    session.parse_authorization_response(redirect_response)
    print(session.fetch_access_token(access_token_url))
    return session


def get_session():
    if not flickr.oauth_token:
        session = auth1()
        print(f'oauth_token="{session.auth.client.resource_owner_key}"')
        print(f'oauth_secret="{session.auth.client.resource_owner_secret}"')
    else:
        session = OAuth1Session(
            flickr.key,
            client_secret=flickr.secret,
            resource_owner_key=flickr.oauth_token,
            resource_owner_secret=flickr.oauth_secret,
        )
    return session


def search(session, page):
    """ https://www.flickr.com/services/api/flickr.photos.search.html """
    r = session.get(
        'https://api.flickr.com/services/rest',
        params={
            'nojsoncallback': '1',
            'format': 'json',
            'method': 'flickr.photos.search',
            'sort': 'date-posted-asc',
            'per_page': 500,
            'page': page,
            'extras': 'description,url_o,date_upload,date_taken,media',
            'user_id': 'me',
        }
    )
    return r.json()


def main():
    session = get_session()
    photos = []
    page_cnt = page_num = 1
    while page_num <= page_cnt:
        print("page", page_num, "of", page_cnt)
        page = search(session, page_num)
        photos.extend(page['photos']['photo'])
        page_num = page['photos']['page'] + 1
        page_cnt = page['photos']['pages']

    with open("backup-{:%Y%m%d_%H%M}.json".format(datetime.datetime.now()), mode="w") as f:
        json.dump(photos, f)
    photo_ids = {p['id'] for p in photos}
    print("{:d} photos found".format(len(photo_ids)))


if __name__ == '__main__':
    main()
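
The backup file is plain JSON in the shape shown in the docstring, so it can be sanity-checked before downloading anything. A quick sketch (the filename here is hypothetical; use whatever flickr_backup.py actually wrote):

import collections
import json

with open('backup-20180425_1445.json') as f:  # hypothetical filename
    photos = json.load(f)

# Tally by media type; 'media' comes from the extras requested in search().
print(len(photos), 'items:', collections.Counter(p.get('media', 'photo') for p in photos))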
flickr_download.py:
#!/usr/bin/env python3
"""
Download all of the photos and videos previously queried with flickr_backup.py:

    $ ./flickr_download.py backup-$date.json

Photos/videos will be stored by date taken in photos/YYYY/MM/, 3 files each:

    photos/2006/04/${photo_id}.info.json
    photos/2006/04/${photo_id}.meta.json
    photos/2006/04/${basename of url_o}   (e.g. 12345_9876_o.jpg; videos get a .mp4 suffix)

This script is resumable - it tries to minimize the number of API calls if a
photo has already been downloaded.
"""
import json
import os
import queue
import sys
import threading
import time

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from flickr_backup import get_session


def requests_retry_session(retries=10, backoff_factor=0.3, status_forcelist=(500, 502, 504), session=None):
    """https://www.peterbe.com/plog/best-practice-with-retries-with-requests"""
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def info(session, photo_id, secret):
    """ https://www.flickr.com/services/api/flickr.photos.getInfo.html """
    r = session.get(
        'https://api.flickr.com/services/rest',
        params={
            'nojsoncallback': '1',
            'format': 'json',
            'method': 'flickr.photos.getInfo',
            'photo_id': photo_id,
            'secret': secret,
        }
    )
    return r.json()


def sizes(session, photo_id):
    """ https://www.flickr.com/services/api/flickr.photos.getSizes.html """
    r = session.get(
        'https://api.flickr.com/services/rest',
        params={
            'nojsoncallback': '1',
            'format': 'json',
            'method': 'flickr.photos.getSizes',
            'photo_id': photo_id,
        }
    )
    return r.json()


def video(session, photo_id):
    """ Return the source url of the best downloadable video. """
    sz = sizes(session, photo_id)
    json.dump(sz, sys.stderr)  # log the raw sizes response for debugging

    def dims(x):
        if not x:
            return 0
        return int(x['width']) * int(x['height'])

    url = None
    for s in sz['sizes']['size']:
        # consider only downloadable videos
        if s['media'] == 'video' and s['label'] != 'Video Player':
            # source sometimes doesn't exist ?!
            if requests.head(s['source']).status_code < 400:
                # take the biggest video or the one labeled original
                if dims(s) > dims(url) or s['label'] == 'Video Original':
                    url = s
    return url['source']


def download(session, item):
    # create photo working directory
    if 'datetaken' in item:
        taken = time.strptime(item['datetaken'], '%Y-%m-%d %H:%M:%S')
    else:
        taken = time.gmtime(0)
    photo_dir = os.path.join('photos', "{:04d}".format(taken.tm_year), "{:02d}".format(taken.tm_mon))
    os.makedirs(photo_dir, exist_ok=True)

    pid = item['id']
    original = os.path.basename(item['url_o'])
    url_o = item['url_o']
    if item.get('media', '') == 'video':
        original = original + '.mp4'
        # defer url_o lookup until after checking if original is already downloaded

    # skip already downloaded
    photo_path = os.path.join(photo_dir, original)
    info_path = os.path.join(photo_dir, '{}.info.json'.format(pid))
    photo_downloaded = os.path.isfile(photo_path) and os.path.getsize(photo_path) > 0
    info_downloaded = os.path.isfile(info_path) and os.path.getsize(info_path) > 0
    if all((photo_downloaded, info_downloaded)):
        print('already downloaded', pid, photo_path)
        return

    # save metadata per photo
    meta_path = os.path.join(photo_dir, '{}.meta.json'.format(pid))
    with open(meta_path, mode='w') as m:
        json.dump(item, m)

    # download media
    if not photo_downloaded:
        # delayed until here to save API calls
        if item.get('media', '') == 'video':
            print('looking up video info for', pid)
            url_o = video(session, pid)
        print('downloading', pid)
        r = requests_retry_session().get(url_o, stream=True, timeout=10)
        r.raise_for_status()
        with open(photo_path, mode='wb') as p:
            for chunk in r.iter_content(chunk_size=4096):
                p.write(chunk)

    # save info per photo
    if not info_downloaded:
        print('getting info for', pid)
        i = info(session, pid, item['secret'])
        with open(info_path, mode='w') as m:
            json.dump(i, m)


def worker(q, session):
    while True:
        item = q.get()
        if item is None:
            break
        try:
            download(session, item)
        except Exception as e:
            print("error with {}:".format(item), e)
        q.task_done()


def main(source, num_worker_threads=4):
    q = queue.Queue()
    threads = []
    for i in range(num_worker_threads):
        session = get_session()
        t = threading.Thread(target=worker, args=(q, session))
        t.start()
        threads.append(t)

    for item in source:
        q.put(item)

    # block until all tasks are done
    q.join()

    # stop workers
    for i in range(num_worker_threads):
        q.put(None)
    for t in threads:
        t.join()


if __name__ == '__main__':
    with open(sys.argv[1], 'r') as f:
        main(json.load(f), num_worker_threads=10)
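
Because download() returns early when both the media file and the .info.json already exist and are non-empty, re-running after an interruption only re-fetches what is missing. A small sketch, assuming only the three-files-per-item layout described in the docstring, to gauge resume progress:

import os

counts = {'info': 0, 'meta': 0, 'media': 0}
for root, dirs, files in os.walk('photos'):
    for name in files:
        if name.endswith('.info.json'):
            counts['info'] += 1
        elif name.endswith('.meta.json'):
            counts['meta'] += 1
        else:
            counts['media'] += 1
print(counts)  # all three counts match once every item is fully downloaded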
google_upload.py:
#!/usr/bin/env python3
"""
Upload photos [but not videos] previously downloaded with flickr_download.py.

Titles set in Flickr will be preserved; otherwise they will be set to 'Untitled ' + the date taken.
Descriptions set in Flickr will be preserved. Tags and other metadata will be included as the last line of the description.
Photos will be added to the 'default' album aka Drop Box. There is a 2000 item limit per album.
Creates a ${photo_id}.google_upload.json for each item uploaded.

To upload all jpgs in photos/:

    $ find photos -name '*jpg' | xargs -n 500 ./google_upload.py
"""
import io
import json
import os
import sys
import time
import xml.etree.ElementTree as ET

from requests_oauthlib import OAuth2Session
from urllib3.filepost import choose_boundary

from config import google

authorization_base_url = "https://accounts.google.com/o/oauth2/v2/auth"
token_url = "https://www.googleapis.com/oauth2/v4/token"
scope = [
    "https://www.googleapis.com/auth/userinfo.email",
    "https://www.googleapis.com/auth/userinfo.profile",
    "https://picasaweb.google.com/data/",
]


def auth2():
    session = OAuth2Session(client_id=google.client_id, scope=scope, redirect_uri="https://localhost:8000/")
    authorization_url, state = session.authorization_url(authorization_base_url, access_type="offline", approval_prompt='force')
    print('authorize:', authorization_url)
    redirect_response = input('Paste the full redirect URL here:')
    token = session.fetch_token(token_url, client_secret=google.secret, authorization_response=redirect_response)
    with open('google-token', 'w') as g:
        g.write(str(token))
    return session


def get_session():
    if google.access_token:
        def token_updater(token):
            print("token updated", token, file=sys.stderr)

        session = OAuth2Session(
            client_id=google.client_id,
            token={
                'access_token': google.access_token,
                'expires_at': time.time() - 10,  # already expired, forces an immediate refresh
                'refresh_token': google.refresh_token,
            },
            auto_refresh_kwargs={
                'client_id': google.client_id,
                'client_secret': google.secret,
            },
            auto_refresh_url=token_url,
            token_updater=token_updater,
        )
    else:
        session = auth2()
        print(f'access_token="{session.access_token}"', file=sys.stderr)
        print(f'refresh_token="{session.refresh_token}"', file=sys.stderr)
    return session


def upload_with_metadata(session, media, content_type, title, description=None, datetaken=None, album='default'):
    """ https://developers.google.com/picasa-web/docs/3.0/developers_guide_protocol#PostPhotos """
    term = 'photo'  # if content_type.startswith('image') else 'video'
    entry = ET.Element('entry', xmlns='http://www.w3.org/2005/Atom')
    te = ET.SubElement(entry, 'title')
    te.text = title
    if description:
        summ = ET.SubElement(entry, 'summary')
        summ.text = description
    ET.SubElement(
        entry, 'category',
        scheme="http://schemas.google.com/g/2005#kind",
        term=f"http://schemas.google.com/photos/2007#{term}",
    )

    # if content_type.startswith('image/'):
    boundary = choose_boundary()
    body = io.BytesIO()
    body.write(f'Media multipart posting\n--{boundary}\n'.encode('utf-8'))
    body.write('Content-Type: application/atom+xml\n\n'.encode('utf-8'))
    body.write(ET.tostring(entry))
    body.write(f'\n--{boundary}\n'.encode('utf-8'))
    body.write(f'Content-Type: {content_type}\n\n'.encode('utf-8'))
    body.write(media.read())
    body.write(f'\n--{boundary}--\n'.encode('utf-8'))
    body.seek(0)

    headers = {
        'Content-Type': f'multipart/related; boundary="{boundary}"',
        'GData-Version': '3',
        'MIME-version': '1.0',
    }
    return session.post(
        f'https://picasaweb.google.com/data/feed/api/user/default/albumid/{album}?alt=json',
        headers=headers,
        data=body,
    )


def upload(session, photo):
    photo_dir = os.path.dirname(photo)
    flickr_pid = os.path.basename(photo).split("_", 1)[0]
    goog_path = os.path.join(photo_dir, f'{flickr_pid}.google_upload.json')
    if os.path.exists(goog_path):
        print(f"already uploaded {photo}")
        return

    with open(os.path.join(photo_dir, f'{flickr_pid}.info.json')) as i:
        info = json.load(i)
    if info["stat"] != "ok":
        raise Exception(f"bad stat for {photo}")
    info = info['photo']

    # https://developers.google.com/picasa-web/docs/3.0/developers_guide_protocol#PostVideo
    if info['media'] == 'video':
        content_type = 'video/mp4'
        ### temporarily skip videos ###
        print(f"skipped video {photo}")
        return
        ### ###
    elif info['originalformat'] == 'jpg':
        content_type = 'image/jpeg'
    elif info['originalformat'] == 'png':
        content_type = 'image/png'
    else:
        raise Exception(f"failed to detect content type for {photo}")

    title = info['title']['_content'] or 'Untitled ' + info["dates"]["taken"]
    description = info['description']['_content']
    if description != "":
        description += "\n\n"
    description += f'flickr:id={flickr_pid}'
    for k, v in info["dates"].items():
        if k == 'lastupdate':
            continue
        elif k in ('takengranularity', 'takenunknown') and v in ('', '0', 0):
            continue
        description += f'|date_{k}={v}'
    if info['tags']['tag']:
        description += '|tags=' + ",".join([t['_content'] for t in info['tags']['tag']])
    if 'location' in info:
        lat = info['location'].get('latitude', '')
        lon = info['location'].get('longitude', '')
        description += f'|location={lat},{lon}'

    print(f"uploading {photo} title: {title} description: {description}")
    with open(photo, 'rb') as p:
        rsp = upload_with_metadata(session, p, content_type, title, description=description)
    if rsp.status_code != 201:
        print(rsp.status_code, rsp.content.decode('utf-8'))
        rsp.raise_for_status()
    with open(goog_path, mode='wb') as g:
        g.write(rsp.content)


def main():
    session = get_session()
    for photo in sys.argv[1:]:
        upload(session, photo)


if __name__ == '__main__':
    main()
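
Since upload() packs all the Flickr metadata into the final line of each Google description (flickr:id=...|date_...=...|tags=...|location=...), that line can be recovered later. A hypothetical helper, under the format described above; parse_flickr_line is my name, not part of the gist:

def parse_flickr_line(description):
    # Split the last description line back into the key=value fields
    # that upload() joined with '|'.
    last = description.rstrip().splitlines()[-1]
    return dict(part.split('=', 1) for part in last.split('|'))

print(parse_flickr_line('A caption\n\nflickr:id=12345|date_taken=2006-10-08 18:34:44|tags=beach'))
# -> {'flickr:id': '12345', 'date_taken': '2006-10-08 18:34:44', 'tags': 'beach'}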
LICENSE:
Copyright 2018 Gil Raphaelli

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
requirements.txt:
appnope==0.1.0
bleach==2.1.3
certifi==2018.1.18
chardet==3.0.4
decorator==4.2.1
entrypoints==0.2.3
html5lib==1.0.1
idna==2.6
ipdb==0.11
ipykernel==4.8.2
ipython==6.2.1
ipython-genutils==0.2.0
ipywidgets==7.1.2
jedi==0.11.1
Jinja2==2.10
jsonschema==2.6.0
jupyter==1.0.0
jupyter-client==5.2.3
jupyter-console==5.2.0
jupyter-core==4.4.0
MarkupSafe==1.0
mistune==0.8.3
nbconvert==5.3.1
nbformat==4.4.0
notebook==5.4.1
oauthlib==2.0.7
pandocfilters==1.4.2
parso==0.1.1
pexpect==4.4.0
pickleshare==0.7.4
prompt-toolkit==1.0.15
ptyprocess==0.5.2
Pygments==2.2.0
python-dateutil==2.7.1
python-oauth2==1.1.0
pyzmq==17.0.0
qtconsole==4.3.1
requests==2.18.4
requests-oauthlib==0.8.0
Send2Trash==1.5.0
simplegeneric==0.8.1
six==1.11.0
terminado==0.8.1
testpath==0.3.1
tornado==5.0.1
traitlets==4.3.2
urllib3==1.22
wcwidth==0.1.7
webencodings==0.5.1
widgetsnbextension==3.1.4