@graphaelli
Last active April 25, 2018 14:45
Flickr Archive Utilities - MIT License
"""
Configuration for Flickr Backup & Download and Google Upload.
"""
from collections import namedtuple

# Fill in after the first run; each value will be logged to the console.
flickr_oauth_token = None
flickr_oauth_secret = None
google_refresh_token = None  # only issued on the first authorization
google_access_token = None
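# For example, after the first run flickr_backup.py prints lines like
# oauth_token="..." and oauth_secret="...", and google_upload.py prints
# access_token="..." and refresh_token="..."; paste those values into the
# variables above.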
#
# Fill in keys and secrets before first run
#
# API Keys from https://www.flickr.com/services/apps/by/me.
Flickr = namedtuple('Flickr', ('key', 'secret', 'oauth_token', 'oauth_secret'))
flickr = Flickr(
    key='API_KEY',
    secret='API_SECRET',
    oauth_token=flickr_oauth_token,
    oauth_secret=flickr_oauth_secret,
)
# id and secret from https://console.cloud.google.com/apis/credentials
Google = namedtuple('Google', ('client_id', 'secret', 'access_token', 'refresh_token'))
google = Google(
    client_id='CLIENT_ID',
    secret='SECRET',
    access_token=google_access_token,
    refresh_token=google_refresh_token,
)
flickr_backup.py
#!/usr/bin/env python3
"""
Dump information about all of the authenticated user's photos to backup-$date.json:
[
  {
    "id": "12345",
    "owner": "12345@N00",
    "secret": "9876",
    "server": "119",
    "farm": 1,
    "title": "",
    "ispublic": 1,
    "isfriend": 0,
    "isfamily": 0,
    "description": {
      "_content": ""
    },
    "dateupload": "1160371553",
    "datetaken": "2006-10-08 18:34:44",
    "datetakengranularity": "0",
    "datetakenunknown": 0,
    "media": "photo",
    "media_status": "ready",
    "url_o": "https://farm1.staticflickr.com/119/12345_9876_o.jpg",
    "height_o": "2112",
    "width_o": "2816"
  }
]
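
The first run walks through the Flickr OAuth 1.0a flow interactively and
prints the oauth_token/oauth_secret values to paste into config.py:

    $ ./flickr_backup.py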
"""
import datetime
import json

from requests_oauthlib import OAuth1Session

from config import flickr
authorization_base_url = 'https://www.flickr.com/services/oauth/authorize'
request_token_url = 'https://www.flickr.com/services/oauth/request_token'
access_token_url = 'https://www.flickr.com/services/oauth/access_token'
def time_from(ts):
    """Convert a unix timestamp to a 'YYYY-MM-DD HH:MM:SS' string."""
    return datetime.datetime.fromtimestamp(int(ts)).strftime('%Y-%m-%d %H:%M:%S')


def auth1():
    """Interactive OAuth 1.0a flow: print the authorization URL, then read back the redirect."""
    session = OAuth1Session(flickr.key, client_secret=flickr.secret, callback_uri='http://127.0.0.1:8000')
    session.fetch_request_token(request_token_url, params={'perms': 'read'})
    print('authorize:', session.authorization_url(authorization_base_url, perms='read'))
    redirect_response = input('Paste the full redirect URL here:')
    session.parse_authorization_response(redirect_response)
    print(session.fetch_access_token(access_token_url))
    return session
def get_session():
    if not flickr.oauth_token:
        # first run: do the interactive flow, then print the tokens to paste into config.py
        session = auth1()
        print(f'oauth_token="{session.auth.client.resource_owner_key}"')
        print(f'oauth_secret="{session.auth.client.resource_owner_secret}"')
    else:
        session = OAuth1Session(
            flickr.key,
            client_secret=flickr.secret,
            resource_owner_key=flickr.oauth_token,
            resource_owner_secret=flickr.oauth_secret,
        )
    return session
def search(session, page):
    """ https://www.flickr.com/services/api/flickr.photos.search.html """
    r = session.get(
        'https://api.flickr.com/services/rest',
        params={
            'nojsoncallback': '1',
            'format': 'json',
            'method': 'flickr.photos.search',
            'sort': 'date-posted-asc',
            'per_page': 500,
            'page': page,
            'extras': 'description,url_o,date_upload,date_taken,media',
            'user_id': 'me',
        }
    )
    return r.json()
def main():
    session = get_session()
    photos = []
    # walk all pages; each response reports the current page and the total page count
    page_cnt = page_num = 1
    while page_num <= page_cnt:
        print("page", page_num, "of", page_cnt)
        page = search(session, page_num)
        photos.extend(page['photos']['photo'])
        page_num = page['photos']['page'] + 1
        page_cnt = page['photos']['pages']
    with open("backup-{:%Y%m%d_%H%M}.json".format(datetime.datetime.now()), mode="w") as f:
        json.dump(photos, f)
    photo_ids = {p['id'] for p in photos}
    print("{:d} photos found".format(len(photo_ids)))


if __name__ == '__main__':
    main()
flickr_download.py
#!/usr/bin/env python3
"""
Download all of the photos and videos previously queried with flickr_backup.py.
Photos/videos are stored by date taken in photos/YYYY/MM/, three files each:
    photos/2006/04/${photo_id}.info.json
    photos/2006/04/${photo_id}.meta.json
    photos/2006/04/${photo_url_o}.jpg
This script is resumable: it minimizes the number of API calls when a photo
has already been downloaded.
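
Usage, given a backup file produced by flickr_backup.py:

    $ ./flickr_download.py backup-YYYYMMDD_HHMM.json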
"""
import json
import os
import queue
import sys
import threading
import time

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from flickr_backup import get_session
def requests_retry_session(retries=10, backoff_factor=0.3, status_forcelist=(500, 502, 504), session=None):
    """ https://www.peterbe.com/plog/best-practice-with-retries-with-requests """
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session
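
# For example, requests_retry_session().get(url, timeout=10) retries up to 10
# times on connection/read errors and on 500/502/504 responses, sleeping
# roughly backoff_factor * 2**n between attempts.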
def info(session, photo_id, secret):
    """ https://www.flickr.com/services/api/flickr.photos.getInfo.html """
    r = session.get(
        'https://api.flickr.com/services/rest',
        params={
            'nojsoncallback': '1',
            'format': 'json',
            'method': 'flickr.photos.getInfo',
            'photo_id': photo_id,
            'secret': secret,
        }
    )
    return r.json()
def sizes(session, photo_id):
    """ https://www.flickr.com/services/api/flickr.photos.getSizes.html """
    r = session.get(
        'https://api.flickr.com/services/rest',
        params={
            'nojsoncallback': '1',
            'format': 'json',
            'method': 'flickr.photos.getSizes',
            'photo_id': photo_id,
        }
    )
    return r.json()
def video(session, photo_id):
    """ return the source url of the best downloadable video """
    sz = sizes(session, photo_id)
    # debug: dump the raw sizes response to stderr
    json.dump(sz, sys.stderr)

    def dims(x):
        if not x:
            return 0
        return int(x['width']) * int(x['height'])

    url = None
    for s in sz['sizes']['size']:
        # consider only downloadable videos
        if s['media'] == 'video' and s['label'] != 'Video Player':
            # source sometimes doesn't exist ?!
            if requests.head(s['source']).status_code < 400:
                # take the biggest video, or the one labeled original
                if dims(s) > dims(url) or s['label'] == "Video Original":
                    url = s
    if url is None:
        raise ValueError('no downloadable video found for {}'.format(photo_id))
    return url['source']
def download(session, item):
    # create photo working directory
    if 'datetaken' in item:
        taken = time.strptime(item['datetaken'], '%Y-%m-%d %H:%M:%S')
    else:
        taken = time.gmtime(0)
    photo_dir = os.path.join('photos', "{:04d}".format(taken.tm_year), "{:02d}".format(taken.tm_mon))
    os.makedirs(photo_dir, exist_ok=True)

    pid = item['id']
    original = os.path.basename(item['url_o'])
    url_o = item['url_o']
    if item.get('media', '') == 'video':
        original = original + '.mp4'
        # defer url_o lookup until after checking if original is already downloaded

    # skip already downloaded
    photo_path = os.path.join(photo_dir, original)
    info_path = os.path.join(photo_dir, '{}.info.json'.format(pid))
    photo_downloaded = os.path.isfile(photo_path) and os.path.getsize(photo_path) > 0
    info_downloaded = os.path.isfile(info_path) and os.path.getsize(info_path) > 0
    if all((photo_downloaded, info_downloaded)):
        print('already downloaded', pid, photo_path)
        return

    # save metadata per photo
    meta_path = os.path.join(photo_dir, '{}.meta.json'.format(pid))
    with open(meta_path, mode='w') as m:
        json.dump(item, m)

    # download media
    if not photo_downloaded:
        # delayed until here to save API calls
        if item.get('media', '') == 'video':
            print('looking up video info for', pid)
            url_o = video(session, pid)
        print('downloading', pid)
        r = requests_retry_session().get(url_o, stream=True, timeout=10)
        r.raise_for_status()
        with open(photo_path, mode='wb') as p:
            for chunk in r.iter_content(chunk_size=4096):
                p.write(chunk)

    # save info per photo
    if not info_downloaded:
        print('getting info for', pid)
        i = info(session, pid, item['secret'])
        with open(info_path, mode='w') as m:
            json.dump(i, m)
def worker(q, session):
    while True:
        item = q.get()
        if item is None:
            # sentinel value: shut this worker down
            break
        try:
            download(session, item)
        except Exception as e:
            print("error with {}:".format(item), e)
        q.task_done()
def main(source, num_worker_threads=4):
    q = queue.Queue()
    threads = []
    for i in range(num_worker_threads):
        session = get_session()
        t = threading.Thread(target=worker, args=(q, session))
        t.start()
        threads.append(t)

    for item in source:
        q.put(item)

    # block until all tasks are done
    q.join()

    # stop workers
    for i in range(num_worker_threads):
        q.put(None)
    for t in threads:
        t.join()


if __name__ == '__main__':
    with open(sys.argv[1], 'r') as f:
        main(json.load(f), num_worker_threads=10)
google_upload.py
#!/usr/bin/env python3
"""
Upload photos [but not videos] previously downloaded with flickr_download.py.
Titles set in Flickr will be preserved; otherwise they will be set to 'Untitled' plus the date taken.
Descriptions set in Flickr will be preserved. Tags and other metadata will be included as the last line of the description.
Photos will be added to the 'default' album, aka Drop Box. There is a 2000-item limit per album.
Creates a ${photo_id}.google_upload.json for each item uploaded.
To upload all jpgs in photos/:
    $ find photos -name '*jpg' | xargs -n 500 ./google_upload.py
"""
import io
import json
import os
import sys
import time
import xml.etree.ElementTree as ET

from requests_oauthlib import OAuth2Session
from urllib3.filepost import choose_boundary

from config import google
authorization_base_url = "https://accounts.google.com/o/oauth2/v2/auth"
token_url = "https://www.googleapis.com/oauth2/v4/token"
scope = [
    "https://www.googleapis.com/auth/userinfo.email",
    "https://www.googleapis.com/auth/userinfo.profile",
    "https://picasaweb.google.com/data/",
]
def auth2():
    """Interactive OAuth 2.0 flow; requests offline access so a refresh token is issued."""
    session = OAuth2Session(client_id=google.client_id, scope=scope, redirect_uri="https://localhost:8000/")
    authorization_url, state = session.authorization_url(authorization_base_url, access_type="offline", approval_prompt='force')
    print('authorize:', authorization_url)
    redirect_response = input('Paste the full redirect URL here:')
    token = session.fetch_token(token_url, client_secret=google.secret, authorization_response=redirect_response)
    with open('google-token', 'w') as g:
        g.write(str(token))
    return session
def get_session():
    if google.access_token:
        def token_updater(token):
            print("token updated", token, file=sys.stderr)

        session = OAuth2Session(
            client_id=google.client_id,
            token={
                'access_token': google.access_token,
                'expires_at': time.time() - 10,  # mark as expired to force a refresh on first use
                'refresh_token': google.refresh_token,
            },
            auto_refresh_kwargs={
                'client_id': google.client_id,
                'client_secret': google.secret,
            },
            auto_refresh_url=token_url,
            token_updater=token_updater,
        )
    else:
        session = auth2()
        print(f'access_token="{session.access_token}"', file=sys.stderr)
        print(f'refresh_token="{session.refresh_token}"', file=sys.stderr)
    return session
def upload_with_metadata(session, media, content_type, title, description=None, datetaken=None, album='default'):
    """ https://developers.google.com/picasa-web/docs/3.0/developers_guide_protocol#PostPhotos """
    term = 'photo'  # if content_type.startswith('image') else 'video'
    entry = ET.Element('entry', xmlns='http://www.w3.org/2005/Atom')
    te = ET.SubElement(entry, 'title')
    te.text = title
    if description:
        summ = ET.SubElement(entry, 'summary')
        summ.text = description
    ET.SubElement(entry, 'category',
                  scheme="http://schemas.google.com/g/2005#kind",
                  term=f"http://schemas.google.com/photos/2007#{term}")
    # if content_type.startswith('image/'):
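    # The body below follows the multipart/related layout from the protocol
    # guide linked above, roughly:
    #
    #   Media multipart posting
    #   --<boundary>
    #   Content-Type: application/atom+xml
    #
    #   <entry>...photo metadata...</entry>
    #   --<boundary>
    #   Content-Type: image/jpeg
    #
    #   <binary image data>
    #   --<boundary>--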
    boundary = choose_boundary()
    body = io.BytesIO()
    body.write(f'Media multipart posting\n--{boundary}\n'.encode('utf-8'))
    body.write('Content-Type: application/atom+xml\n\n'.encode('utf-8'))
    body.write(ET.tostring(entry))
    body.write(f'\n--{boundary}\n'.encode('utf-8'))
    body.write(f'Content-Type: {content_type}\n\n'.encode('utf-8'))
    body.write(media.read())
    body.write(f'\n--{boundary}--\n'.encode('utf-8'))
    body.seek(0)
    headers = {
        'Content-Type': f'multipart/related; boundary="{boundary}"',
        'GData-Version': '3',
        'MIME-version': '1.0',
    }
    return session.post(
        f'https://picasaweb.google.com/data/feed/api/user/default/albumid/{album}?alt=json',
        headers=headers,
        data=body,
    )
def upload(session, photo):
    photo_dir = os.path.dirname(photo)
    flickr_pid = os.path.basename(photo).split("_", 1)[0]
    goog_path = os.path.join(photo_dir, f'{flickr_pid}.google_upload.json')
    if os.path.exists(goog_path):
        print(f"already uploaded {photo}")
        return

    with open(os.path.join(photo_dir, f'{flickr_pid}.info.json')) as i:
        info = json.load(i)
    if info["stat"] != "ok":
        raise Exception(f"bad stat for {photo}")
    info = info['photo']

    # https://developers.google.com/picasa-web/docs/3.0/developers_guide_protocol#PostVideo
    if info['media'] == 'video':
        content_type = 'video/mp4'
        ### temporarily skip videos ###
        print(f"skipped video {photo}")
        return
        ### ###
    elif info['originalformat'] == 'jpg':
        content_type = 'image/jpeg'
    elif info['originalformat'] == 'png':
        content_type = 'image/png'
    else:
        raise Exception(f"failed to detect content type for {photo}")

    title = info['title']['_content'] or 'Untitled ' + info["dates"]["taken"]
    description = info['description']['_content']
    if description != "":
        description += "\n\n"
    # append flickr metadata as the last line of the description
    description += f'flickr:id={flickr_pid}'
    for k, v in info["dates"].items():
        if k == 'lastupdate':
            continue
        elif k in ('takengranularity', 'takenunknown') and v in ('', '0', 0):
            continue
        description += f'|date_{k}={v}'
    if info['tags']['tag']:
        description += '|tags=' + ",".join([t['_content'] for t in info['tags']['tag']])
    if 'location' in info:
        lat = info['location'].get('latitude', '')
        lon = info['location'].get('longitude', '')
        description += f'|location={lat},{lon}'
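    # The appended metadata line ends up looking like, e.g. (hypothetical
    # tag and location values):
    #   flickr:id=12345|date_posted=1160371553|date_taken=2006-10-08 18:34:44|tags=beach,sunset|location=40.689,-74.044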
print(f"uploading {photo} title: {title} description: {description}")
with open(photo, 'rb') as p:
rsp = upload_with_metadata(session, p, content_type, title, description=description)
if rsp.status_code != 201:
print(rsp.status_code, rsp.content.decode('utf-8'))
rsp.raise_for_status()
with open(goog_path, mode='wb') as g:
g.write(rsp.content)
def main():
    session = get_session()
    for photo in sys.argv[1:]:
        upload(session, photo)


if __name__ == '__main__':
    main()
LICENSE
Copyright 2018 Gil Raphaelli
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
requirements.txt
appnope==0.1.0
bleach==2.1.3
certifi==2018.1.18
chardet==3.0.4
decorator==4.2.1
entrypoints==0.2.3
html5lib==1.0.1
idna==2.6
ipdb==0.11
ipykernel==4.8.2
ipython==6.2.1
ipython-genutils==0.2.0
ipywidgets==7.1.2
jedi==0.11.1
Jinja2==2.10
jsonschema==2.6.0
jupyter==1.0.0
jupyter-client==5.2.3
jupyter-console==5.2.0
jupyter-core==4.4.0
MarkupSafe==1.0
mistune==0.8.3
nbconvert==5.3.1
nbformat==4.4.0
notebook==5.4.1
oauthlib==2.0.7
pandocfilters==1.4.2
parso==0.1.1
pexpect==4.4.0
pickleshare==0.7.4
prompt-toolkit==1.0.15
ptyprocess==0.5.2
Pygments==2.2.0
python-dateutil==2.7.1
python-oauth2==1.1.0
pyzmq==17.0.0
qtconsole==4.3.1
requests==2.18.4
requests-oauthlib==0.8.0
Send2Trash==1.5.0
simplegeneric==0.8.1
six==1.11.0
terminado==0.8.1
testpath==0.3.1
tornado==5.0.1
traitlets==4.3.2
urllib3==1.22
wcwidth==0.1.7
webencodings==0.5.1
widgetsnbextension==3.1.4