Created
February 7, 2019 17:39
-
-
Save phette23/9bec679b7b677af7e396e8a40e7a7047 to your computer and use it in GitHub Desktop.
upload a taxonomy to openEQUELLA using a specially-formatted CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
# ################################# # | |
# UploadTaxonomy.py # | |
# # | |
# added CLI by Eric Phetteplace # | |
# California College of the Arts # | |
# vault.cca.edu | libraries.cca.edu # | |
# 2014-07-32 # | |
# # | |
# Pearson's notes below # | |
# ################################# # | |
# UploadTaxonomy.py | |
# Author: Jim Kurian, Pearson | |
# Date: November 15, 2012 | |
# | |
# This script populates a taxonomy in EQUELLA 4.1 and higher. | |
# It requires Python 2.7. | |
# | |
# USAGE: | |
# It requires a CSV file to be formatted as follows: | |
# | |
# term0\term00\term000 | |
# term0\term00\term001 | |
# term1 | |
# term1\term10 | |
# term1\term11 | |
# etc... | |
# | |
# If keys are required then following format should be used: | |
# | |
# term0\term00\term000, key1, value1, key2, value2, key3, value3 | |
# term0\term00\term001, key1, value1, , , , | |
# term1 , key1, value1, key2, value2, , | |
# term1\term10 , LONG_DATA, value1, key2, value2, , | |
# term1\term11 , LONG_DATA, value1, , , , | |
# etc... | |
# | |
# Any number of key/value pairs can be added to each row however | |
# each row of the csv must have the same number of columns. Value | |
# fields with commas or quotes in them should be surrounded by | |
# double quotes. MS Excel does this for you automatically. | |
# Use the key "LONG_DATA" to add a HTML Fragments to a term for | |
# display in the pop-view view of the term selector control. | |
# Set the EQUELLA connection settings in settings.py and the | |
# location of the CSV file and the taxonomy ID in the variables | |
# below. | |
# | |
# NOTE: | |
# This script requires the deleteTerm() method to be implemented | |
# in equellaSoap.py module. This function is not present in the | |
# equellaSoap.py module in the integration pack that ships with | |
# EQUELLA 5.0 or earlier. Use the module provided with this | |
# script. | |
import csv | |
import codecs | |
import sys | |
import traceback | |
import time | |
import equellasoap | |
import argparse | |
parser = argparse.ArgumentParser(description='Upload a taxonomy to VAULT.') | |
parser.add_argument('--un', nargs='?', action='store', help='EQUELLA Username') | |
parser.add_argument('--pw', nargs='?', action='store', required=True, | |
help='EQUELLA Password') | |
parser.add_argument('--tid', nargs='?', action='store', required=True, | |
help='Taxonomy ID') | |
parser.add_argument('--csv', nargs='?', action='store', | |
help='Path to taxonomy CSV file') | |
parser.add_argument('--clear', action='store_true', | |
help='Clear the taxonomy (delete all prior terms)') | |
parser.add_argument('--start', default=1, type=int, | |
help='Row of the CSV to start at, e.g. use 2 to skip header row') | |
args = parser.parse_args() | |
csvFileName = args.csv # relative path to CSV file to load | |
# Pearson's code below except for calls to args | |
# ---- SETTINGS ---- # | |
# EQUELLA parameters | |
institutionUrl = 'https://vault.cca.edu' | |
username = args.un | |
password = args.pw | |
# CSV unicode encoding | |
# typically "utf-8" or "latin1" (Excel often saves non-Ascii CSV files as "latin1") | |
encoding = "utf-8" | |
# id of taxonomy to update (retrieve from Admin Console) | |
taxonID = args.tid | |
# set clearTaxonomy to True if taxonomy is to be cleared first | |
clearTaxonomy = args.clear | |
# which row in the CSV to start processing at (e.g. 1 to process entire CSV) | |
startRow = args.start | |
# set to true to only unlock taxonomy (no uploading) | |
unlockOnly = False | |
# set to true to only clear taxonomy (no uploading) | |
clearOnly = False | |
# set to true to cause script to retry on error (useful for poor network connections) | |
retryOnError = False | |
# optional EQUELLA parameters: | |
proxyUrl = None | |
useTokens = False | |
tokenUser = 'tokenUsername' | |
sharedSecretId = 'tokenSecretId' | |
sharedSecretValue = 'tokenSecretValue' | |
# --- END SETTINGS --- # | |
def unicode_csv_reader(utf8_data, encoding, dialect=csv.excel, **kwargs): | |
csv_reader = csv.reader(utf8_data, dialect=dialect, **kwargs) | |
firstRow = True | |
for row in csv_reader: | |
# remove BOM for utf-8 | |
if firstRow: | |
if row[0].startswith(codecs.BOM_UTF8): | |
row[0] = row[0].decode("utf-8")[1:] | |
firstRow = False | |
yield [cell.decode(encoding) for cell in row] | |
if not institutionUrl.endswith("/"): | |
institutionUrl += "/" | |
# test logging in and logging out of EQUELLA | |
equella = equellasoap.EquellaSoap(institutionUrl, username, password, proxyUrl) | |
equella.logout() | |
# test unlocking taxonomy | |
try: | |
equella.unlockTaxonomy(taxonID, force=1) | |
except: | |
raise "Unable to unlock taxonomy, check taxonomy ID" | |
# test opening and closing CSV | |
csvFile = open(csvFileName, "rb") | |
csvFile.close() | |
rownum = 0 | |
lastAttemptedRow = 0 | |
lastAttemptedTerm = "" | |
incomplete = True | |
retrying = False | |
while incomplete: | |
try: | |
# instantiate an EquellaSoap object and lock the taxonomy for editing | |
print "Logging into EQUELLA..." | |
equella = equellasoap.EquellaSoap(institutionUrl, username, password, proxyUrl) | |
equella.unlockTaxonomy(taxonID, force=1) | |
if unlockOnly: | |
print "Taxonomy successfully unlocked." | |
sys.exit("Unlock taxonomy only") | |
print "Locking taxonomy for editing..." | |
equella.lockTaxonomyForEditing(taxonID) | |
# clear the taxonomy in EQUELLA | |
if clearTaxonomy or clearOnly: | |
print "Clearing taxonomy..." | |
rootTerms = equella.listTerms(taxonID, "") | |
for rootTerm in rootTerms: | |
equella.deleteTerm(taxonID, rootTerm) | |
if clearOnly: | |
print "Taxonomy successfully cleared." | |
sys.exit("Clear taxonomy only") | |
# read CSV file | |
print "Importing taxonomy from CSV starting from row %s..." % (startRow) | |
csvFile = open(csvFileName, "rb") | |
reader = csv.reader(csvFile, dialect=csv.excel) | |
# get row count | |
rowcount = len(list(reader)) | |
csvFile.close() | |
csvFile = open(csvFileName, "rbU") | |
reader = unicode_csv_reader(csvFile, encoding) | |
# remove last term in case it was only partially completed | |
if lastAttemptedRow > 0: | |
try: | |
if len(equella.listTerms(taxonID, lastAttemptedTerm)) == 0: | |
print "Deleting " + lastAttemptedTerm + " to reimport..." | |
equella.deleteTerm(taxonID, lastAttemptedTerm) | |
except: | |
pass | |
# iterate through csv rows | |
for row in reader: | |
rownum += 1 | |
if rownum >= startRow: | |
lastAttemptedRow = rownum | |
print "Processing row %s of %s" % (rownum, rowcount) | |
terms = row[0].split("\\") | |
parentTerm = "" | |
# iterate through terms of path | |
termsAdded = False | |
for i in range(0, len(terms)): | |
term = terms[i].strip() | |
fullTermPath = "" | |
if parentTerm == "": | |
fullTermPath = term | |
else: | |
fullTermPath = parentTerm + "\\" + term | |
lastAttemptedTerm = fullTermPath | |
# if term deosn't exist add it | |
siblingTerms = equella.listTerms(taxonID, parentTerm) | |
if term not in siblingTerms: | |
termsAdded = True | |
print " adding " + fullTermPath | |
equella.insertTerm(taxonID, parentTerm, term) | |
# Add data to term if it is a leaf node | |
for j in range(1, len(row), 2): | |
key = row[j].strip() | |
if i == len(terms) - 1 and key != "": | |
print " adding data key: " + key | |
equella.setTermData(taxonID, fullTermPath, key, row[j + 1].strip()) | |
# set current term as parent for next row | |
# (i.e. term) in the CSV | |
parentTerm = fullTermPath | |
if not termsAdded: | |
print " no new terms to be added" | |
incomplete = False | |
except: | |
exceptionType, exceptionValue, exceptionTraceback = sys.exc_info() | |
error = ''.join(traceback.format_exception(exceptionType, exceptionValue, exceptionTraceback)) | |
if "Clear taxonomy only" in error or "Unlock taxonomy only" in error: | |
incomplete = False | |
elif retryOnError: | |
# retry | |
print "Error occured: " + str(exceptionValue) | |
print " Last Attempted Term: " + lastAttemptedTerm | |
print " Last Attempted Row: " + str(lastAttemptedRow) | |
print " Retrying..." | |
time.sleep(2) | |
clearTaxonomy = False | |
startRow = lastAttemptedRow | |
rownum = 0 | |
else: | |
raise Exception, error | |
# unlock the taxonomy, logout of EQUELLA and close the csv file | |
equella.unlockTaxonomy(taxonID) | |
equella.logout() | |
if not clearOnly and not unlockOnly: | |
csvFile.close() | |
print "-- PROCESSING COMPLETE --" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment