Last active
January 31, 2019 23:47
-
-
Save ngeraci/7a571f20d439920409cf0310b9079dcb to your computer and use it in GitHub Desktop.
A script to get a list of Nuxeo paths for missing items (and write them to a text file) when there is a discrepancy between Calisphere test and production.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" A script to get the Nuxeo paths for missing items (and write them | |
to a text file) when there is a discrepancy between the number of items in
Calisphere test and production.
""" | |
import requests | |
from bs4 import BeautifulSoup | |
from pynux import utils | |
def __main__():
    """Write the Nuxeo paths of items missing from Calisphere test to a file.

    All settings are hard-coded below; edit them for the collection being
    checked. One path is written per line to ``outfile_path``.
    """
    # text file that receives the list of missing paths
    outfile_path = "missing_items.txt"
    # path to the collection in Nuxeo
    nuxeo_path = "/asset-library/UCR/SCUA/Archival/Klein/"
    # numeric id of the collection in the CDL collections registry
    registry_id = 26943
    # number of objects in production and in test
    prod_count = 5933
    test_count = 5033

    missing_paths = calisphere_compare(registry_id, test_count, prod_count, nuxeo_path)
    lines = ["{}\n".format(path) for path in missing_paths]
    with open(outfile_path, "w") as outfile:
        outfile.writelines(lines)
def calisphere_compare(registry_id, test_count, prod_count, nuxeo_path):
    """Find the Nuxeo paths of items listed in production but not in test.

    Fetches the collection page from each Calisphere site, extracts the ARKs
    listed on each, and passes the ARKs seen only in production to
    ``nx_paths`` so they can be resolved under ``nuxeo_path``.

    Returns whatever ``nx_paths`` returns for the production-only ARKs.
    """
    # ARKs listed on the test site
    arks_on_test = get_arks_from_html(
        get_calisphere_html(registry_id, test_count, "test")
    )
    # ARKs listed on the production site
    arks_on_prod = get_arks_from_html(
        get_calisphere_html(registry_id, prod_count, "prod")
    )

    # whatever production has that test lacks is what went missing
    missing_from_test = set(arks_on_prod) - set(arks_on_test)

    # ask Nuxeo for the paths of the missing items
    return nx_paths(nuxeo_path, list(missing_from_test))
def get_calisphere_html(collection_id, item_count, test_or_prod):
    """Fetch a Calisphere collection page and return its HTML as text.

    Args:
        collection_id: collection id, placed in the URL path.
        item_count: value sent as the ``rows`` query parameter.
        test_or_prod: ``"test"`` or ``"prod"``; selects which site is queried.

    Returns:
        The body of the HTTP response, decoded as text.

    Raises:
        ValueError: if ``test_or_prod`` is neither ``"test"`` nor ``"prod"``.
        requests.exceptions.HTTPError: if the site answers with a 4xx/5xx
            status.
        requests.exceptions.Timeout: if the site stops responding.
    """
    base_urls = {
        "test": "http://calisphere-test.cdlib.org/",
        "prod": "https://calisphere.org/",
    }
    # Reject unknown environments up front instead of failing later on an
    # unbound local variable.
    if test_or_prod not in base_urls:
        raise ValueError(
            "test_or_prod must be 'test' or 'prod', got {!r}".format(test_or_prod)
        )

    # The base URLs end in "/", and the template adds its own, so strip one
    # to avoid requesting "//collections/...".
    url = "{}/collections/{}/?q=&rows={}".format(
        base_urls[test_or_prod].rstrip("/"), collection_id, item_count
    )

    # Generous timeout: a page listing thousands of rows can be slow, but the
    # script should not hang forever on a dead connection.
    response = requests.get(url, timeout=120)
    # An error page parsed as a result page would yield zero ARKs and a
    # silently wrong comparison, so fail loudly instead.
    response.raise_for_status()
    return response.text
def get_arks_from_html(calisphere_html):
    """Extract the ARKs listed on a Calisphere page.

    Every tag that carries a ``data-item_id`` attribute contributes one entry.

    Args:
        calisphere_html: HTML text of a Calisphere page.

    Returns:
        A list of the ``data-item_id`` values, each with its trailing slash
        (if any) removed.
    """
    soup = BeautifulSoup(calisphere_html, "lxml")
    arks = []
    for tag in soup.find_all(data_item_id):
        item_id = tag["data-item_id"]
        # Trim only when the slash is really there, so an id without one
        # does not lose its last character.
        if item_id.endswith("/"):
            item_id = item_id[:-1]
        arks.append(item_id)
    return arks
def data_item_id(tag):
    """Filter for BeautifulSoup's ``find_all``: does ``tag`` carry an ARK?

    Returns the result of ``tag.has_attr`` for the "data-item_id" attribute.
    """
    attribute_name = "data-item_id"
    return tag.has_attr(attribute_name)
def nx_paths(path, ark_list):
    """Return the Nuxeo paths of documents under ``path`` whose ARK is wanted.

    Queries the Nuxeo API for every non-deleted ``SampleCustomPicture``
    document below ``path`` and keeps those whose
    ``ucldc_schema:identifier`` property appears in ``ark_list``.

    Args:
        path: Nuxeo directory path to search under.
        ark_list: ARKs to look for.

    Returns:
        A list of the ``path`` values of the matching documents, in the order
        the query yields them. Empty when ``ark_list`` is empty.
    """
    # A set makes each membership test constant-time instead of a scan of
    # the whole list for every document.
    wanted_arks = set(ark_list)
    if not wanted_arks:
        # nothing to look for, so skip the Nuxeo query entirely
        return []

    nx_utils = utils.Nuxeo()
    documents = nx_utils.nxql(
        u"""
        SELECT * FROM SampleCustomPicture
        WHERE ecm:path STARTSWITH "{}"
        AND ecm:currentLifeCycleState != "deleted"
        """.format(path)
    )
    return [
        doc["path"]
        for doc in documents
        if doc["properties"]["ucldc_schema:identifier"] in wanted_arks
    ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment