ngeraci · January 31, 2019 23:47
diff --git a/calisphere_compare.py b/calisphere_compare.py
 """ A script to get the Nuxeo paths for missing items (and write them
 to a text file) when there is a discrepancy between number of items in
 Calisphere test and production.
 """

 import requests
 from bs4 import BeautifulSoup
 from pynux import utils


 def __main__():
    """Write the Nuxeo paths of items found only in production to a text file.

    The collection to check, the object counts on each site, the Nuxeo
    folder and the output filename are all hard-coded below.

    NOTE(review): nothing in the visible part of this file calls this
    function -- confirm how the script is meant to be started.
    """
    # text file to write out to
    report_filename = "missing_items.txt"

    # path to collection in Nuxeo
    collection_folder = "/asset-library/UCR/SCUA/Archival/Klein/"

    # numeric id from the CDL collections registry
    collection_id = 26943

    # number of objects in test, then number of objects in production
    objects_in_test, objects_in_prod = 5033, 5933

    only_in_prod = calisphere_compare(
        collection_id, objects_in_test, objects_in_prod, collection_folder
    )

    # one Nuxeo path per line
    with open(report_filename, "w") as report:
        report.writelines("{}\n".format(item) for item in only_in_prod)


 def calisphere_compare(registry_id, test_count, prod_count, nuxeo_path):
    """Find the Nuxeo paths of items that are in production but not in test.

    The collection page is fetched from the test site first and then from
    production; the ARKs present only on production are then looked up in
    Nuxeo under ``nuxeo_path``.

    Args:
        registry_id: numeric collection id from the CDL collections registry.
        test_count: number of objects in the collection on the test site.
        prod_count: number of objects in the collection on production.
        nuxeo_path: path to the collection in Nuxeo.

    Returns:
        The list of Nuxeo paths that ``nx_paths`` returns for those ARKs.
    """

    def arks_on(site, object_count):
        # download the collection page for one site and pull out its ARKs
        page = get_calisphere_html(registry_id, object_count, site)
        return get_arks_from_html(page)

    test_site_arks = arks_on("test", test_count)
    prod_site_arks = arks_on("prod", prod_count)

    # ARKs that appear in production only
    missing_from_test = set(prod_site_arks) - set(test_site_arks)

    # ask Nuxeo which documents carry those ARKs
    return nx_paths(nuxeo_path, list(missing_from_test))


 def get_calisphere_html(collection_id, item_count, test_or_prod):
    """Download a Calisphere collection page and return its HTML.

    Args:
        collection_id: numeric collection id placed in the collection URL.
        item_count: value sent as the ``rows`` query parameter.
        test_or_prod: ``"test"`` or ``"prod"``; selects which site to query.

    Returns:
        The response body as text.

    Raises:
        ValueError: if ``test_or_prod`` is neither ``"test"`` nor ``"prod"``.
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with an HTTP error status.
    """
    if test_or_prod == "test":
        base_url = "http://calisphere-test.cdlib.org"
    elif test_or_prod == "prod":
        base_url = "https://calisphere.org"
    else:
        # any other value used to leave base_url unbound and fail further
        # down with an UnboundLocalError
        raise ValueError(
            "test_or_prod must be 'test' or 'prod', got {!r}".format(test_or_prod)
        )

    # the base URLs carry no trailing slash, so the path below is joined with
    # exactly one "/" (it used to come out as ".org//collections/...")
    url = "{}/collections/{}/?q=&rows={}".format(base_url, collection_id, item_count)

    # without a timeout an unresponsive server would hang the script forever
    response = requests.get(url, timeout=120)

    # an error page contains no ARKs; fail loudly instead of letting the
    # caller treat it as an empty collection
    response.raise_for_status()

    return response.text


 def get_arks_from_html(calisphere_html):
    """Extract the ARKs from Calisphere HTML.

    Every tag that has a ``data-item_id`` attribute contributes one entry,
    in the order ``find_all`` returns them.

    Args:
        calisphere_html: HTML text of a Calisphere page.

    Returns:
        List of ``data-item_id`` values, each with its trailing slash removed.
    """
    soup = BeautifulSoup(calisphere_html, "lxml")

    arks = []
    for tag in soup.find_all(data_item_id):
        item_id = tag["data-item_id"]
        # trim the trailing slash on the ARK, but only when it is really
        # there: an unconditional [:-1] would cut the last character off a
        # value that has no slash
        if item_id.endswith("/"):
            item_id = item_id[:-1]
        arks.append(item_id)

    return arks


 def data_item_id(tag):
    """Filter for BeautifulSoup's ``find_all``: does ``tag`` have a
    "data-item_id" attribute (the attribute that holds the ARK)?

    Returns whatever ``tag.has_attr`` reports for that attribute name.
    """
    wanted_attribute = "data-item_id"
    return tag.has_attr(wanted_attribute)


 def nx_paths(path, ark_list):
    """Return the Nuxeo paths of documents under ``path`` whose ARK is in
    ``ark_list``.

    Queries the Nuxeo API for every ``SampleCustomPicture`` document below
    ``path`` whose lifecycle state is not "deleted", and keeps those whose
    ``ucldc_schema:identifier`` property is one of the given ARKs.

    Args:
        path: path to a Nuxeo directory.
        ark_list: ARKs to look for.

    Returns:
        List of matching Nuxeo paths, in the order the query yields them;
        an empty list when ``ark_list`` is empty.
    """
    # a set gives a constant-time membership test per document instead of
    # scanning the whole list each time
    wanted_arks = set(ark_list)
    if not wanted_arks:
        # nothing to look up, so skip the Nuxeo query altogether
        return []

    nx_utils = utils.Nuxeo()

    documents = nx_utils.nxql(
        u"""
 SELECT * FROM SampleCustomPicture
 WHERE ecm:path STARTSWITH "{}"
 AND ecm:currentLifeCycleState != "deleted"
 """.format(path)
    )

    return [
        doc["path"]
        for doc in documents
        if doc["properties"]["ucldc_schema:identifier"] in wanted_arks
    ]
	""" A script to get the Nuxeo paths for missing items (and write them
	to a text file) when there is a discrepancy between number of items in
	Calisphere test and production.
	"""

	import requests
	from bs4 import BeautifulSoup
	from pynux import utils


	def __main__():
	    """Write the Nuxeo paths of items found only in production to a text file.

	    The collection to check, the object counts on each site, the Nuxeo
	    folder and the output filename are all hard-coded below.

	    NOTE(review): nothing in the visible part of this file calls this
	    function -- confirm how the script is meant to be started.
	    """

	    # numeric id from the CDL collections registry
	    registry_id = 26943

	    # number of objects in test
	    test_count = 5033

	    # number of objects in production
	    prod_count = 5933

	    # path to collection in Nuxeo
	    nuxeo_path = "/asset-library/UCR/SCUA/Archival/Klein/"

	    # text file to write out to
	    outfile_path = "missing_items.txt"

	    missing_paths = calisphere_compare(registry_id, test_count, prod_count, nuxeo_path)

	    # one Nuxeo path per line
	    with open(outfile_path, "w") as outfile:
	        for path in missing_paths:
	            outfile.write("{}\n".format(path))


	def calisphere_compare(registry_id, test_count, prod_count, nuxeo_path):
	    """Compare ARKs from the two sites.

	    Get a list of ARKs that only appear in production,
	    and return a list of their corresponding Nuxeo paths.

	    Args:
	        registry_id: numeric collection id from the CDL collections registry.
	        test_count: number of objects in the collection on the test site.
	        prod_count: number of objects in the collection on production.
	        nuxeo_path: path to the collection in Nuxeo.

	    Returns:
	        The list of Nuxeo paths that ``nx_paths`` returns for those ARKs.
	    """

	    # process HTML from test site and extract ARKs
	    test_html = get_calisphere_html(registry_id, test_count, "test")
	    test_arks = get_arks_from_html(test_html)

	    # process HTML from production site and extract ARKs
	    prod_html = get_calisphere_html(registry_id, prod_count, "prod")
	    prod_arks = get_arks_from_html(prod_html)

	    # get a list of the ARKs that only appear in production
	    in_prod_only = list(set(prod_arks) - set(test_arks))

	    # query the Nuxeo API to get the Nuxeo paths for the missing items
	    paths = nx_paths(nuxeo_path, in_prod_only)

	    return paths


	def get_calisphere_html(collection_id, item_count, test_or_prod):
	    """Download a Calisphere collection page and return its HTML.

	    Args:
	        collection_id: numeric collection id placed in the collection URL.
	        item_count: value sent as the ``rows`` query parameter.
	        test_or_prod: ``"test"`` or ``"prod"``; selects which site to query.

	    Returns:
	        The response body as text.

	    Raises:
	        ValueError: if ``test_or_prod`` is neither ``"test"`` nor ``"prod"``.
	        requests.exceptions.RequestException: if the request fails, times
	            out, or the server answers with an HTTP error status.
	    """
	    if test_or_prod == "test":
	        base_url = "http://calisphere-test.cdlib.org"
	    elif test_or_prod == "prod":
	        base_url = "https://calisphere.org"
	    else:
	        # any other value used to leave base_url unbound and fail further
	        # down with an UnboundLocalError
	        raise ValueError(
	            "test_or_prod must be 'test' or 'prod', got {!r}".format(test_or_prod)
	        )

	    # the base URLs carry no trailing slash, so the path below is joined with
	    # exactly one "/" (it used to come out as ".org//collections/...")
	    url = "{}/collections/{}/?q=&rows={}".format(base_url, collection_id, item_count)

	    # without a timeout an unresponsive server would hang the script forever
	    response = requests.get(url, timeout=120)

	    # an error page contains no ARKs; fail loudly instead of letting the
	    # caller treat it as an empty collection
	    response.raise_for_status()

	    return response.text


	def get_arks_from_html(calisphere_html):
	    """Extract the ARKs from Calisphere HTML.

	    Every tag that has a ``data-item_id`` attribute contributes one entry,
	    in the order ``find_all`` returns them.

	    Args:
	        calisphere_html: HTML text of a Calisphere page.

	    Returns:
	        List of ``data-item_id`` values, each with its trailing slash removed.
	    """
	    soup = BeautifulSoup(calisphere_html, "lxml")

	    arks = []
	    for tag in soup.find_all(data_item_id):
	        item_id = tag["data-item_id"]
	        # trim the trailing slash on the ARK, but only when it is really
	        # there: an unconditional [:-1] would cut the last character off a
	        # value that has no slash
	        if item_id.endswith("/"):
	            item_id = item_id[:-1]
	        arks.append(item_id)

	    return arks


	def data_item_id(tag):
	    """BeautifulSoup filter function: report whether ``tag`` has a
	    "data-item_id" attribute (the attribute that holds the ARK).

	    Returns whatever ``tag.has_attr`` reports for that attribute name.
	    """
	    return tag.has_attr("data-item_id")


	def nx_paths(path, ark_list):
	    """Return the Nuxeo paths of documents under ``path`` whose ARK is in
	    ``ark_list``.

	    Queries the Nuxeo API for every ``SampleCustomPicture`` document below
	    ``path`` whose lifecycle state is not "deleted", and keeps those whose
	    ``ucldc_schema:identifier`` property is one of the given ARKs.

	    Args:
	        path: path to a Nuxeo directory.
	        ark_list: ARKs to look for.

	    Returns:
	        List of matching Nuxeo paths, in the order the query yields them;
	        an empty list when ``ark_list`` is empty.
	    """
	    # a set gives a constant-time membership test per document instead of
	    # scanning the whole list each time
	    wanted_arks = set(ark_list)
	    if not wanted_arks:
	        # nothing to look up, so skip the Nuxeo query altogether
	        return []

	    nx_utils = utils.Nuxeo()

	    documents = nx_utils.nxql(
	        u"""
	SELECT * FROM SampleCustomPicture
	WHERE ecm:path STARTSWITH "{}"
	AND ecm:currentLifeCycleState != "deleted"
	""".format(path)
	    )

	    return [
	        doc["path"]
	        for doc in documents
	        if doc["properties"]["ucldc_schema:identifier"] in wanted_arks
	    ]