Skip to content

Instantly share code, notes, and snippets.

@ngeraci
Last active January 31, 2019 23:47
Show Gist options
  • Save ngeraci/7a571f20d439920409cf0310b9079dcb to your computer and use it in GitHub Desktop.
Save ngeraci/7a571f20d439920409cf0310b9079dcb to your computer and use it in GitHub Desktop.
A script to get a list of Nuxeo paths for missing items (and write them to a text file) when there is a discrepancy between Calisphere test and production.
""" A script to get the Nuxeo paths for missing items (and write them
to a text file) when there is a discrepancy between the number of items in
Calisphere test and production.
"""
import requests
from bs4 import BeautifulSoup
from pynux import utils
def __main__(registry_id=26943,
             test_count=5033,
             prod_count=5933,
             nuxeo_path="/asset-library/UCR/SCUA/Archival/Klein/",
             outfile_path="missing_items.txt"):
    """ Find the items missing from Calisphere test and write their
    Nuxeo paths to a text file, one path per line.

    The defaults reproduce the values that were previously hard-coded,
    so calling the function with no arguments behaves as before.

    registry_id  -- numeric id from the CDL collections registry
    test_count   -- number of objects in test
    prod_count   -- number of objects in production
    nuxeo_path   -- path to collection in Nuxeo
    outfile_path -- text file to write out to (overwritten if it exists)

    NOTE(review): nothing in the visible file calls this function; an
    `if __name__ == "__main__":` guard at the end of the file appears
    to be missing -- confirm how the script is meant to be launched.
    """
    missing_paths = calisphere_compare(
        registry_id, test_count, prod_count, nuxeo_path
    )
    with open(outfile_path, "w") as outfile:
        outfile.writelines("{}\n".format(path) for path in missing_paths)
def calisphere_compare(registry_id, test_count, prod_count, nuxeo_path):
    """ Compare ARKs from the two sites.

    Fetch the collection page from Calisphere test and from Calisphere
    production, work out which ARKs appear in production only, and
    return a list of the corresponding Nuxeo paths.

    registry_id -- collection id used in the Calisphere URL
    test_count  -- number of rows to request from the test site
    prod_count  -- number of rows to request from the production site
    nuxeo_path  -- Nuxeo directory in which to look up the missing ARKs
    """
    # process HTML from test site and extract ARKs
    test_arks = get_arks_from_html(
        get_calisphere_html(registry_id, test_count, "test")
    )
    # process HTML from production site and extract ARKs
    prod_arks = get_arks_from_html(
        get_calisphere_html(registry_id, prod_count, "prod")
    )
    # ARKs that only appear in production; sorted so the list does not
    # depend on set iteration order
    in_prod_only = sorted(set(prod_arks) - set(test_arks))
    # query the Nuxeo API to get the Nuxeo paths for the missing items
    return nx_paths(nuxeo_path, in_prod_only)
def get_calisphere_html(collection_id, item_count, test_or_prod):
    """ Use requests library to get HTML from Calisphere.

    collection_id -- collection id placed in the URL path
    item_count    -- value sent as the `rows` query parameter
    test_or_prod  -- "test" for calisphere-test.cdlib.org,
                     "prod" for calisphere.org

    Returns the response body as text.
    Raises ValueError if test_or_prod is neither "test" nor "prod", and
    requests.HTTPError if the server answers with an error status.
    """
    # No trailing slash here: the URL template below supplies the
    # separator, so the path no longer starts with "//collections".
    base_urls = {
        "test": "http://calisphere-test.cdlib.org",
        "prod": "https://calisphere.org",
    }
    if test_or_prod not in base_urls:
        raise ValueError(
            'test_or_prod must be "test" or "prod", got {!r}'.format(test_or_prod)
        )
    url = "{}/collections/{}/?q=&rows={}".format(
        base_urls[test_or_prod], collection_id, item_count
    )
    # Without a timeout a stalled connection would hang the script forever.
    response = requests.get(url, timeout=120)
    # Fail loudly rather than parsing an error page as "zero items".
    response.raise_for_status()
    return response.text
def get_arks_from_html(calisphere_html):
    """ Takes Calisphere HTML, returns list of ARKs.

    Every tag carrying a "data-item_id" attribute contributes one ARK,
    in document order.
    """
    soup = BeautifulSoup(calisphere_html, "lxml")
    # rstrip("/") trims the trailing slash on the ARKs without eating the
    # last character of a value that has no trailing slash.
    return [
        tag["data-item_id"].rstrip("/")
        for tag in soup.find_all(data_item_id)
    ]
def data_item_id(tag):
    """ BeautifulSoup filter function: return True for tags that have a
    "data-item_id" attribute (the attribute holding the ARK).

    Intended to be passed to soup.find_all().
    """
    return tag.has_attr("data-item_id")
def nx_paths(path, ark_list):
    """ Given a path to a Nuxeo directory and a list of ARKs, query the
    Nuxeo API and return a list of Nuxeo paths for the objects in that
    directory whose "ucldc_schema:identifier" matches an ARK in the list.

    path     -- Nuxeo directory to search (matched with STARTSWITH)
    ark_list -- iterable of ARK strings to look for

    Paths are returned in the order the query yields the documents.
    """
    # A set makes each membership test O(1) instead of scanning the list
    # once per document.
    wanted = set(ark_list)
    if not wanted:
        # Nothing to look up, so skip the Nuxeo query altogether.
        return []
    nx_utils = utils.Nuxeo()
    # NOTE(review): only SampleCustomPicture documents are queried, so
    # objects of any other document type would never be reported --
    # confirm this is intended for the collection being checked.
    documents = nx_utils.nxql(
        u"""
        SELECT * FROM SampleCustomPicture
        WHERE ecm:path STARTSWITH "{}"
        AND ecm:currentLifeCycleState != "deleted"
        """.format(path)
    )
    # .get(): a document without an identifier cannot match any ARK, so
    # it is skipped instead of raising KeyError.
    return [
        doc["path"]
        for doc in documents
        if doc["properties"].get("ucldc_schema:identifier") in wanted
    ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment