shawngraham · January 6, 2025 13:51
diff --git a/search.py b/search.py
 %%capture
 !python3 -m pip install git+https://github.com/p-lod/plodlib
 !pip install requests_cache
 !pip install rdflib

 import plodlib
 import json
 import pandas as pd
 from string import Template
 import rdflib as rdf
 import requests_cache
 import requests

 def search_images(search_term):
     """
     Search the P-LOD triplestore for labelled resources and their images.

     Runs a SPARQL query for every concept whose ``rdfs:label`` matches
     ``search_term`` (case-insensitive ``regex`` filter), taking the best
     image of the concept itself or of any component that depicts it. Each
     result row is then enriched with an image URL and description fetched
     from the LUNA image service.

     Args:
         search_term (str): The term to search for within labels. It is
             passed to the SPARQL ``regex`` filter, so regex metacharacters
             keep their meaning; quotes, backslashes and line breaks are
             escaped so they cannot break out of the query's string literal.

     Returns:
         list: A list of dictionaries, one per result row, containing the
         SPARQL bindings (``concept``, ``concept_label``, ``img_urn``,
         ``l_record``, ``l_media``, ``l_batch``, ``l_description``) plus
         ``img_url`` and ``img_current_description`` (both ``None`` when
         LUNA returns no record). Returns an empty list if no matching
         results are found.

     Raises:
         Errors from the triplestore query, the LUNA HTTP request (including
         a timeout) or decoding LUNA's JSON are not caught here and
         propagate to the caller.
     """

     # LUNA URL attributes in order of preference: size 2 is preferred and
     # size 4 is only used when there is nothing else.
     url_size_preference = ('urlSize2', 'urlSize3', 'urlSize1', 'urlSize4')
     # Seconds to wait on LUNA so one stalled request cannot hang the search.
     luna_timeout = 60

     # The term is substituted inside a double-quoted SPARQL string literal;
     # escape the characters that would otherwise end or corrupt it.
     safe_term = (search_term.replace('\\', '\\\\')
                             .replace('"', '\\"')
                             .replace('\n', '\\n')
                             .replace('\r', '\\r'))

     # Connect to the remote triplestore with read-only connection
     store = rdf.plugins.stores.sparqlstore.SPARQLStore(
         query_endpoint="http://52.170.134.25:3030/plod_endpoint/query",
         context_aware=False,
         returnFormat='json')
     g = rdf.Graph(store)

     qt = Template("""
      PREFIX p-lod: <urn:p-lod:id:>
      PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
      SELECT DISTINCT ?concept ?concept_label ?img_urn ?l_record ?l_media ?l_batch ?l_description
      WHERE {
        ?concept rdfs:label ?concept_label .
        FILTER regex(?concept_label, "$search_term", "i")
        
        { ?component p-lod:depicts ?concept .
          ?component p-lod:best-image ?img_urn .
         }
         UNION
         {
           ?concept p-lod:best-image ?img_urn .
         }

        ?img_urn p-lod:x-luna-record-id ?l_record .
        ?img_urn p-lod:x-luna-media-id  ?l_media .
        ?img_urn p-lod:x-luna-batch-id  ?l_batch .
        ?img_urn p-lod:x-luna-description ?l_description .
      }
  """)

     results = g.query(qt.substitute(search_term=safe_term))
     results_df = pd.DataFrame(results, columns=results.json['head']['vars'])

     if results_df.empty:
         return []

     def add_luna_info_search(row):
         """Add ``img_url`` and ``img_current_description`` from LUNA to a row."""
         # Defaults if LUNA has no record (probably means LUNA doesn't have
         # the image though the triplestore thinks it does).
         img_src = None
         img_description = None
         tilde_val = plodlib.luna_tilde_val(row['img_urn'])

         luna_url = f'https://umassamherst.lunaimaging.com/luna/servlet/as/fetchMediaSearch?mid=umass~{tilde_val}~{tilde_val}~{row["l_record"]}~{row["l_media"]}&fullData=true'
         luna_json = json.loads(requests.get(luna_url, timeout=luna_timeout).text)

         if luna_json:
             first_hit = luna_json[0]
             # 'attributes' is itself a JSON-encoded string inside the record.
             img_attributes = json.loads(first_hit['attributes'])

             if 'image_description_english' in img_attributes:
                 img_description = img_attributes['image_description_english']
             else:
                 # Fall back to a positional field whose index depends on the
                 # collection; best-effort, so lookup failures become a
                 # placeholder string rather than an exception.
                 try:
                     if tilde_val == '14':
                         img_description = json.loads(first_hit['fieldValues'])[2]['value']
                     elif tilde_val == '16':
                         img_description = json.loads(first_hit['fieldValues'])[1]['value']
                     else:
                         img_description = f"unrecognized collection {tilde_val}"
                 except (KeyError, IndexError, TypeError, ValueError):
                     img_description = "Trying to get description failed"

             # First available size wins; stays None when LUNA lists no URL.
             for size_key in url_size_preference:
                 if size_key in img_attributes:
                     img_src = img_attributes[size_key]
                     break

         row['img_url'] = img_src
         row['img_current_description'] = img_description
         return row

     results_df = results_df.apply(add_luna_info_search, axis=1)

     return results_df.to_dict(orient='records')

 # Example usage: look up one term and print every image found for it,
 # or a notice when the search comes back empty.
 search_term = "theseus"
 image_results = search_images(search_term)

 if not image_results:
     print(f"No images found for '{search_term}'")
 else:
     for item in image_results:
         print(f"Concept: {item['concept_label']}")
         print(f"Image URL: {item['img_url']}")
         print(f"Image Description: {item['img_current_description']}")
         print("---")
	%%capture
	!python3 -m pip install git+https://github.com/p-lod/plodlib
	!pip install requests_cache
	!pip install rdflib

	import plodlib
	import json
	import pandas as pd
	from string import Template
	import rdflib as rdf
	import requests_cache
	import requests

	def search_images(search_term):
	    """
	    Search the P-LOD triplestore for labelled resources and their images.

	    Runs a SPARQL query for every concept whose ``rdfs:label`` matches
	    ``search_term`` (case-insensitive ``regex`` filter), taking the best
	    image of the concept itself or of any component that depicts it. Each
	    result row is then enriched with an image URL and description fetched
	    from the LUNA image service.

	    Args:
	        search_term (str): The term to search for within labels. It is
	            passed to the SPARQL ``regex`` filter, so regex metacharacters
	            keep their meaning; quotes, backslashes and line breaks are
	            escaped so they cannot break out of the query's string literal.

	    Returns:
	        list: A list of dictionaries, one per result row, containing the
	        SPARQL bindings (``concept``, ``concept_label``, ``img_urn``,
	        ``l_record``, ``l_media``, ``l_batch``, ``l_description``) plus
	        ``img_url`` and ``img_current_description`` (both ``None`` when
	        LUNA returns no record). Returns an empty list if no matching
	        results are found.

	    Raises:
	        Errors from the triplestore query, the LUNA HTTP request (including
	        a timeout) or decoding LUNA's JSON are not caught here and
	        propagate to the caller.
	    """

	    # LUNA URL attributes in order of preference: size 2 is preferred and
	    # size 4 is only used when there is nothing else.
	    url_size_preference = ('urlSize2', 'urlSize3', 'urlSize1', 'urlSize4')
	    # Seconds to wait on LUNA so one stalled request cannot hang the search.
	    luna_timeout = 60

	    # The term is substituted inside a double-quoted SPARQL string literal;
	    # escape the characters that would otherwise end or corrupt it.
	    safe_term = (search_term.replace('\\', '\\\\')
	                            .replace('"', '\\"')
	                            .replace('\n', '\\n')
	                            .replace('\r', '\\r'))

	    # Connect to the remote triplestore with read-only connection
	    store = rdf.plugins.stores.sparqlstore.SPARQLStore(
	        query_endpoint="http://52.170.134.25:3030/plod_endpoint/query",
	        context_aware=False,
	        returnFormat='json')
	    g = rdf.Graph(store)

	    qt = Template("""
	PREFIX p-lod: <urn:p-lod:id:>
	PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
	SELECT DISTINCT ?concept ?concept_label ?img_urn ?l_record ?l_media ?l_batch ?l_description
	WHERE {
	?concept rdfs:label ?concept_label .
	FILTER regex(?concept_label, "$search_term", "i")

	{ ?component p-lod:depicts ?concept .
	?component p-lod:best-image ?img_urn .
	}
	UNION
	{
	?concept p-lod:best-image ?img_urn .
	}

	?img_urn p-lod:x-luna-record-id ?l_record .
	?img_urn p-lod:x-luna-media-id ?l_media .
	?img_urn p-lod:x-luna-batch-id ?l_batch .
	?img_urn p-lod:x-luna-description ?l_description .
	}
	""")

	    results = g.query(qt.substitute(search_term=safe_term))
	    results_df = pd.DataFrame(results, columns=results.json['head']['vars'])

	    if results_df.empty:
	        return []

	    def add_luna_info_search(row):
	        """Add ``img_url`` and ``img_current_description`` from LUNA to a row."""
	        # Defaults if LUNA has no record (probably means LUNA doesn't have
	        # the image though the triplestore thinks it does).
	        img_src = None
	        img_description = None
	        tilde_val = plodlib.luna_tilde_val(row['img_urn'])

	        luna_url = f'https://umassamherst.lunaimaging.com/luna/servlet/as/fetchMediaSearch?mid=umass~{tilde_val}~{tilde_val}~{row["l_record"]}~{row["l_media"]}&fullData=true'
	        luna_json = json.loads(requests.get(luna_url, timeout=luna_timeout).text)

	        if luna_json:
	            first_hit = luna_json[0]
	            # 'attributes' is itself a JSON-encoded string inside the record.
	            img_attributes = json.loads(first_hit['attributes'])

	            if 'image_description_english' in img_attributes:
	                img_description = img_attributes['image_description_english']
	            else:
	                # Fall back to a positional field whose index depends on the
	                # collection; best-effort, so lookup failures become a
	                # placeholder string rather than an exception.
	                try:
	                    if tilde_val == '14':
	                        img_description = json.loads(first_hit['fieldValues'])[2]['value']
	                    elif tilde_val == '16':
	                        img_description = json.loads(first_hit['fieldValues'])[1]['value']
	                    else:
	                        img_description = f"unrecognized collection {tilde_val}"
	                except (KeyError, IndexError, TypeError, ValueError):
	                    img_description = "Trying to get description failed"

	            # First available size wins; stays None when LUNA lists no URL.
	            for size_key in url_size_preference:
	                if size_key in img_attributes:
	                    img_src = img_attributes[size_key]
	                    break

	        row['img_url'] = img_src
	        row['img_current_description'] = img_description
	        return row

	    results_df = results_df.apply(add_luna_info_search, axis=1)

	    return results_df.to_dict(orient='records')

	# Example usage: look up one term and print every image found for it,
	# or a notice when the search comes back empty.
	search_term = "theseus"
	image_results = search_images(search_term)

	if not image_results:
	    print(f"No images found for '{search_term}'")
	else:
	    for item in image_results:
	        print(f"Concept: {item['concept_label']}")
	        print(f"Image URL: {item['img_url']}")
	        print(f"Image Description: {item['img_current_description']}")
	        print("---")