rdhyee · November 9, 2012 14:19
diff --git a/dpla_query.py b/dpla_query.py
 # Goal:  feed a bunch of search terms to try to get at some collections
 # something to compare to:  https://gist.github.com/4046626

 # API doc: https://github.com/dpla/platform/wiki
 # data sources: http://dp.la/wiki/Platform_test_data_sources

 import requests
 import json
 import urllib
 from itertools import islice

 # Retrieve an item by ID
 # http://api.dp.la/v1/items/a4e2346032cae75b0832abe064c14bcb

 # Retrieve multiple items by ID
 # http://api.dp.la/v1/items/a4e2346032cae75b0832abe0644e9b26,a4e2346032cae75b0832abe064c14bcb


 def dpla_query(**kw_input):
    """Lazily iterate over DPLA item-search results, one page per request.

    Every keyword argument is sent as a query-string parameter to
    http://api.dp.la/v1/items.  Defaults are page_size=20, page=1 and
    sort_order="asc"; passing None for a parameter drops it from the
    request.

    The special keyword ``extras`` takes a dict of parameters whose names
    cannot be written as Python keywords (e.g. ``spatial.coordinates``).
    Entries in ``extras`` override same-named keyword arguments.

    Yields:
        (doc, count) tuples: ``doc`` is one entry of the response's "docs"
        list and ``count`` is the "count" value of the response that
        contained it.

    Paging stops at the first response whose "docs" list is empty.  With
    sort_order="desc" the page number is stepped downwards (as in the
    original code), and iteration also stops once it would fall below 1
    instead of requesting page 0, -1, ...
    """
    params = {"page_size": 20, "page": 1, "sort_order": "asc"}

    # fudgy -- 'extras' carries parameter names that cannot be keyword
    # arguments; None is tolerated and treated as "no extras".
    extras = kw_input.pop('extras', None) or {}
    params.update(kw_input)
    params.update(extras)

    # A value of None means "do not send this parameter at all".
    params = dict((k, v) for (k, v) in params.items() if v is not None)

    # available text search fields:
    #   title, description, dplaContributor, creator, type, publisher,
    #   format, rights, contributor, spatial
    # fields expected on a returned doc:
    #   title, description, creator, type, publisher, format, rights,
    #   contributor, created, spatial, temporal, source

    # temporal fields
    # http://api.dp.la/v1/items?temporal.after=1963-11-01&temporal.before=1963-11-30

    # location available...not implemented here

    # .get() because sort_order may have been removed by passing None.
    step = -1 if params.get('sort_order') == 'desc' else 1

    # Each response also carries content["count"], content["start"],
    # content["limit"].
    while True:
        # Let requests build and escape the query string.
        r = requests.get("http://api.dp.la/v1/items", params=params)
        content = json.loads(r.content)

        docs = content["docs"]
        if not docs:
            break

        count = content["count"]
        for doc in docs:
            yield (doc, count)

        # If the caller suppressed 'page' (page=None), step as if the
        # first request had been page 1.
        params['page'] = params.get('page', 1) + step
        if params['page'] < 1:
            # Walked backwards past the first page: nothing left to fetch.
            break


 # search terms to feed in 

 SEARCH_TERMS = ["Bach", "tree", "horse", "cow", "Gore"]
 # Titles of the collections seen across all searches.  None ends up in the
 # set when a doc has no 'isPartOf' entry or that entry has no 'title'.
 collections = set()

 for term in SEARCH_TERMS:
    # Sample at most the first 100 hits per term; islice stops pulling from
    # the generator (and therefore stops issuing requests) after that.
    for (doc, count) in islice(dpla_query(q=term), 100):
        # 'or {}' also covers an explicit null 'isPartOf' in the response.
        collections.add((doc.get('isPartOf') or {}).get('title'))

 # Parenthesised single-argument print behaves the same under the Python 2
 # print statement and the Python 3 print function.
 print(len(collections))

 for collection in collections:
    print(collection)
	# Goal: feed a bunch of search terms to try to get at some collections
	# something to compare to: https://gist.github.com/4046626

	# API doc: https://github.com/dpla/platform/wiki
	# data sources: http://dp.la/wiki/Platform_test_data_sources

	import requests
	import json
	import urllib
	from itertools import islice

	# Retrieve an item by ID
	# http://api.dp.la/v1/items/a4e2346032cae75b0832abe064c14bcb

	# Retrieve multiple items by ID
	# http://api.dp.la/v1/items/a4e2346032cae75b0832abe0644e9b26,a4e2346032cae75b0832abe064c14bcb


	def dpla_query(**kw_input):
	    """Lazily iterate over DPLA item-search results, one page per request.

	    Every keyword argument is sent as a query-string parameter to
	    http://api.dp.la/v1/items.  Defaults are page_size=20, page=1 and
	    sort_order="asc"; passing None for a parameter drops it from the
	    request.

	    The special keyword ``extras`` takes a dict of parameters whose names
	    cannot be written as Python keywords (e.g. ``spatial.coordinates``).
	    Entries in ``extras`` override same-named keyword arguments.

	    Yields:
	        (doc, count) tuples: ``doc`` is one entry of the response's "docs"
	        list and ``count`` is the "count" value of the response that
	        contained it.

	    Paging stops at the first response whose "docs" list is empty.  With
	    sort_order="desc" the page number is stepped downwards, and iteration
	    also stops once it would fall below 1 instead of requesting page 0,
	    -1, ...
	    """
	    params = {"page_size": 20, "page": 1, "sort_order": "asc"}

	    # fudgy -- 'extras' carries parameter names that cannot be keyword
	    # arguments; None is tolerated and treated as "no extras".
	    extras = kw_input.pop('extras', None) or {}
	    params.update(kw_input)
	    params.update(extras)

	    # A value of None means "do not send this parameter at all".
	    params = dict((k, v) for (k, v) in params.items() if v is not None)

	    # available text search fields:
	    #   title, description, dplaContributor, creator, type, publisher,
	    #   format, rights, contributor, spatial
	    # fields expected on a returned doc:
	    #   title, description, creator, type, publisher, format, rights,
	    #   contributor, created, spatial, temporal, source

	    # temporal fields
	    # http://api.dp.la/v1/items?temporal.after=1963-11-01&temporal.before=1963-11-30

	    # location available...not implemented here

	    # .get() because sort_order may have been removed by passing None.
	    step = -1 if params.get('sort_order') == 'desc' else 1

	    # Each response also carries content["count"], content["start"],
	    # content["limit"].
	    while True:
	        # Let requests build and escape the query string.
	        r = requests.get("http://api.dp.la/v1/items", params=params)
	        content = json.loads(r.content)

	        docs = content["docs"]
	        if not docs:
	            break

	        count = content["count"]
	        for doc in docs:
	            yield (doc, count)

	        # If the caller suppressed 'page' (page=None), step as if the
	        # first request had been page 1.
	        params['page'] = params.get('page', 1) + step
	        if params['page'] < 1:
	            # Walked backwards past the first page: nothing left to fetch.
	            break


	# search terms to feed in

	SEARCH_TERMS = ["Bach", "tree", "horse", "cow", "Gore"]
	# Titles of the collections seen across all searches.  None ends up in
	# the set when a doc has no 'isPartOf' entry or that entry has no 'title'.
	collections = set()

	for term in SEARCH_TERMS:
	    # Sample at most the first 100 hits per term; islice stops pulling
	    # from the generator (and so stops issuing requests) after that.
	    for (doc, count) in islice(dpla_query(q=term), 100):
	        # 'or {}' also covers an explicit null 'isPartOf' in the response.
	        collections.add((doc.get('isPartOf') or {}).get('title'))

	# Parenthesised single-argument print behaves the same under the Python 2
	# print statement and the Python 3 print function.
	print(len(collections))

	for collection in collections:
	    print(collection)