Last active
August 29, 2015 14:19
-
-
Save mhermans/05e0c537f13073044dc4 to your computer and use it in GitHub Desktop.
Python script to match Zotero group library metadata with Box-hosted ebook files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os.path
import re

from boxsdk import Client, OAuth2  # https://github.com/box/box-python-sdk
from boxsdk.exception import BoxAPIException
from pyzotero import zotero  # https://github.com/urschrei/pyzotero
""" | |
This script scans a Box input-folder and a Zotero group-library, and matches the ebook PDFs
in the Box-folder with the metadata in the Zotero-library.
Matches are based on the SHA1 file checksum, which Box calculates and tracks automatically | |
for all files and which is added manually to the "Extra"-field of the Zotero-items. | |
Linking items between Box and Zotero happens by adding the public url for the Zotero item in | |
the Box file metadata, and adding the (company-restricted) public link to the url-field of the | |
Zotero item. | |
Additionally the ebook-file in the Box input-folder is renamed based on the Zotero-metadata, | |
and moved to a permanent Box ebook-folder. | |
End result is that ebooks and their metadata can be managed through Zotero, with the files
themselves hosted and accessible through Box and the Zotero web or offline interface.
Maarten Hermans | www.mhermans.net | maarten AT mhermans DOT net | |
""" | |
def zitem_ebook_hash(item):
    """Return the SHA1 ebook hash stored in a Zotero item's "Extra" field.

    The hash is expected on its own line in the form ``ebook_sha1:HASH``.
    Other lines in the "Extra" field are ignored, as is whitespace around
    the key and the value.

    Args:
        item: Zotero item dict; the text is read from
            ``item['data']['extra']``.

    Returns:
        The hash string, or ``None`` when the item has no "Extra" text, no
        ``ebook_sha1:`` line, or an empty value on that line.
    """
    extra = item['data'].get('extra')
    if not extra:
        return None
    for line in extra.splitlines():
        key, sep, value = line.partition(':')
        if sep and key.strip() == 'ebook_sha1':
            # Take this line's value only: splitting the whole field on ':'
            # would pull in text from any following "key: value" line.
            return value.strip() or None
    return None
def construct_filename(zotero_item, extention=None):
    """Construct a new filename based on Zotero item metadata.

    The result has the shape ``AUTHORS - YEAR - TITLE`` with an optional
    ``.EXT`` suffix. Parts that are missing from the metadata are left out
    instead of raising.

    Args:
        zotero_item: Zotero item dict; ``title``, ``date`` and ``creators``
            are read from ``zotero_item['data']``.
        extention: Optional file extension, with or without a leading dot.
            (The misspelled parameter name is kept for existing callers.)

    Returns:
        The constructed filename as a string.
    """
    data = zotero_item['data']

    # Titles are capped at 55 characters before the overall budget applies.
    title = (data.get('title') or '')[:55].strip().strip('.')

    # Use the first stand-alone four-digit number in the date as the year;
    # if there is none, fall back to the raw date text.
    raw_date = (data.get('date') or '').strip()
    year_match = re.search(r'(?<!\d)\d{4}(?!\d)', raw_date)
    yr = year_match.group(0) if year_match else raw_date

    # Creators carry either 'lastName' or a single-field 'name'
    # (presumably institutions -- verify against the Zotero item schema).
    names = []
    for creator in data.get('creators') or []:
        name = creator.get('lastName') or creator.get('name')
        if name:
            names.append(name)

    if len(names) == 1:
        authors = names[0]
    elif len(names) == 2:
        authors = ' '.join([names[0], 'and', names[1]])
    elif names:
        authors = ' '.join([names[0], 'et al.'])
    else:
        authors = ''

    head = ' - '.join(part for part in (authors, yr) if part)
    if head:
        # Title budget is 80 characters minus the "HEAD -" prefix; clamp at
        # zero so a very long prefix cannot produce a negative slice bound.
        room = max(0, 80 - len(head) - 2)
        title = title[:room]
    title = title.strip(' .:')

    fn = ' - '.join(part for part in (head, title) if part)

    # Path separators cannot be part of a file name.
    # NOTE(review): Box is understood to reject '/' and '\\' -- confirm.
    fn = fn.replace('/', '-').replace('\\', '-')

    ext = extention.strip('. ') if extention else ''
    if ext:
        fn = '.'.join([fn, ext])
    return fn
def link_items(zotero_client, zot_item, box_item):
    """Link/set metadata between matching Zotero and Box items.

    Performs three remote updates, in this order:

    1. Stores the company-restricted Box shared link in the Zotero item's
       url-field and pushes the item to Zotero.
    2. Stores the Zotero web url in the Box file metadata under the key
       ``zotero_web_url``, unless that key already holds a value.
    3. Renames the Box file to the name built by ``construct_filename``
       (keeping the original extension) when it differs from the current
       name.

    Args:
        zotero_client: Zotero client; only ``update_item`` is called on it.
        zot_item: Zotero item dict; its ``data.url`` is modified in place.
        box_item: Box file object matching ``zot_item``.

    Raises:
        BoxAPIException: if creating the metadata or renaming fails.
    """
    # set zotero url-field to company-restricted url
    # ----------------------------------------------
    zot_item['data']['url'] = box_item.get_shared_link('company')
    zotero_client.update_item(zot_item)

    # add box-file metadata-field for zotero-url
    # ------------------------------------------
    zotero_web_url = zot_item['links']['alternate']['href']
    box_metadata = box_item.metadata()
    try:
        # Use the box_item argument here, not a module-level variable, so
        # the function works for any caller.
        existing_metadata = box_metadata.get()
    except BoxAPIException:
        # apparently get() errors when no metadata is present -> create below
        existing_metadata = None

    # => only set if not present (TODO update?)
    # NOTE(review): if metadata exists but lacks the key, create() may be
    # rejected by Box and an update call may be needed instead -- confirm.
    if not existing_metadata or not existing_metadata.get('zotero_web_url'):
        box_metadata.create({'zotero_web_url': zotero_web_url})

    # rename box-file based on zotero metadata
    # ----------------------------------------
    extension = os.path.splitext(box_item['name'])[1]  # get original ext.
    fn_new = construct_filename(zot_item, extension)
    if box_item['name'] != fn_new:
        # TODO check if file already exists (currently throws box-error)
        box_item.rename(fn_new)
# Read in Zotero, Box authentication credentials
# ==============================================
# Each setting can be supplied through the environment variable named in its
# os.environ.get() call; the literal is used only when that variable is unset,
# so existing runs behave as before.
# NOTE(review): the literal fallbacks are credentials committed to source --
# rotate them and drop the fallbacks once the environment is set up.

# Zotero API credentials
zotero_api_key = os.environ.get('ZOTERO_API_KEY', 'Eme6vXpAaJd0p0')
zotero_user_id = os.environ.get('ZOTERO_USER_ID', '216')
zotero_group_id = os.environ.get('ZOTERO_GROUP_ID', '39')

# Box OAuth credentials (not SSO)
# https://developers.box.com/
# https://kuleuven.app.box.com/developers/services/edit/123613l
box_access_token = os.environ.get(
    'BOX_ACCESS_TOKEN', 'lfCPYI1z2t06llPWFHI6')  # expires in 1h
box_client_id = os.environ.get('BOX_CLIENT_ID', '66gutzmxttjneszyvzx')
box_client_secret = os.environ.get('BOX_CLIENT_SECRET', 'SalwRViciDWVH3RGPZp')
box_input_folder_id = os.environ.get('BOX_INPUT_FOLDER_ID', '33282')
box_ebooks_folder_id = os.environ.get('BOX_EBOOKS_FOLDER_ID', '33230')
# Set up the API clients from the credentials above
# =================================================
# Zotero: the group library is addressed by its id plus an API key.
zot = zotero.Zotero(
    library_id=zotero_group_id,
    library_type='group',
    api_key=zotero_api_key,
)

# Box: OAuth2 session seeded with an existing access token, wrapped in a client.
oauth = OAuth2(box_client_id, box_client_secret, access_token=box_access_token)
box = Client(oauth)
# Fetch items and their SHA1-file hash
# ====================================
# lists -> sha1-keyed dicts for zotero and box items
# NOTE(review): zot.top() is called without paging arguments -- confirm that
# it returns the whole library and not just a first page.
zot_items = zot.top()
zot_items_dict = {}
for item in zot_items:
    item_hash = zitem_ebook_hash(item)  # parse the "Extra" field once per item
    if item_hash:
        zot_items_dict[item_hash] = item

box_input_folder = box.folder(folder_id=box_input_folder_id)
box_ebooks_folder = box.folder(folder_id=box_ebooks_folder_id)

# fetch items, and fetch additional file-data for box items
# NOTE: only the first 100 entries of the input folder are requested.
box_items = box_input_folder.get_items(limit=100, offset=0)
box_items = [item.get() for item in box_items]
box_items_dict = {}
for item in box_items:
    try:
        box_items_dict[item['sha1']] = item
    except KeyError:
        # Entry without a checksum (presumably a sub-folder): it cannot be
        # matched against Zotero, so leave it out instead of aborting.
        continue
# Match and link Zotero and Box items
# ===================================
# get matches on sha1-hash between box and zotero
matched_hashes = set(box_items_dict).intersection(zot_items_dict)
print(len(matched_hashes))

# iterate over all matches, and link up items between zotero and box
for item_hash in matched_hashes:
    matched_zot_item = zot_items_dict[item_hash]
    matched_box_item = box_items_dict[item_hash]

    # progress output; print() with one argument behaves the same on
    # Python 2 and Python 3
    print(matched_zot_item['data']['title'])
    print(matched_box_item['name'])

    # link both items: set box-url, zotero-url, etc.
    link_items(zot, matched_zot_item, matched_box_item)

    # move renamed file from input folder to ebooks folder
    matched_box_item.move(box_ebooks_folder)
# list unmatched files (sha1 hash + filename) in input folder
# ===========================================================
# Each line starts with "ebook_sha1:HASH", the marker that is looked up in the
# "Extra"-field of the Zotero items.
for file_hash, item in box_items_dict.items():
    if file_hash in matched_hashes:
        continue  # already linked and moved to the ebooks folder above
    # joined into one string so the output is identical on Python 2 and 3
    print(' '.join([':'.join(['ebook_sha1', file_hash]), item['name']]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment