Skip to content

Instantly share code, notes, and snippets.

@rdhyee
Created November 9, 2012 14:19
Show Gist options
  • Save rdhyee/4045946 to your computer and use it in GitHub Desktop.
Save rdhyee/4045946 to your computer and use it in GitHub Desktop.
dp.la API query method
# Goal: feed a bunch of search terms to try to get at some collections
# something to compare to: https://gist.github.com/4046626
# API doc: https://github.com/dpla/platform/wiki
# data sources: http://dp.la/wiki/Platform_test_data_sources
import requests
import json
import urllib
from itertools import islice
# Retrieve an item by ID
# http://api.dp.la/v1/items/a4e2346032cae75b0832abe064c14bcb
# Retrieve multiple items by ID
# http://api.dp.la/v1/items/a4e2346032cae75b0832abe0644e9b26,a4e2346032cae75b0832abe064c14bcb
def dpla_query(**kw_input):
    """Lazily iterate over DPLA item-search results, one page per request.

    Keyword arguments are sent as query-string parameters to
    ``http://api.dp.la/v1/items``.  Defaults are ``page_size=20``,
    ``page=1`` and ``sort_order="asc"``.  A parameter whose value is
    ``None`` is left out of the request (``page=None`` falls back to 1,
    because the page number is needed to advance).

    Parameter names that are not valid Python identifiers (for example
    ``spatial.coordinates`` or ``temporal.after``) can be passed through
    the special ``extras`` keyword, a dict that is merged over the other
    parameters::

        dpla_query(q="Bach", extras={"temporal.after": "1963-11-01"})

    Yields:
        ``(doc, count)`` tuples: one entry of the response's ``"docs"``
        list paired with that response's ``"count"`` value.

    Nothing is requested until the generator is first advanced.  Iteration
    stops at the first response whose ``"docs"`` list is empty.
    """
    # urlencode lives in ``urllib`` on Python 2 and ``urllib.parse`` on Python 3.
    try:
        from urllib import urlencode
    except ImportError:
        from urllib.parse import urlencode

    kwargs = {"page_size": 20, "page": 1, "sort_order": "asc"}

    # fudgy -- ``extras`` carries parameter names that cannot be written as
    # Python keywords, e.g. spatial.coordinates.  Tolerate extras=None.
    extras = kw_input.pop('extras', None) or {}
    kw_input.update(extras)
    kwargs.update(kw_input)

    # A value of None means "do not send this parameter".
    kwargs = {k: v for (k, v) in kwargs.items() if v is not None}
    # The page counter is required below to move between pages.
    kwargs.setdefault('page', 1)

    # Notes carried over from the original author (not used by the code):
    #   text search fields: title, description, dplaContributor, creator,
    #     type, publisher, format, rights, contributor, spatial
    #   expected doc fields: title, description, creator, type, publisher,
    #     format, rights, contributor, created, spatial, temporal, source
    #   temporal fields, e.g.
    #     http://api.dp.la/v1/items?temporal.after=1963-11-01&temporal.before=1963-11-30
    #   location search is available in the API but not implemented here.
    #   a response also carries content["count"], content["start"], content["limit"]

    while True:
        r = requests.get("http://api.dp.la/v1/items?" + urlencode(kwargs))
        content = json.loads(r.content)

        docs = content["docs"]
        if not docs:
            # An empty page marks the end of the result set.
            break

        count = content["count"]
        for doc in docs:
            yield (doc, count)

        # .get() because sort_order may have been dropped by the None filter.
        # NOTE(review): with sort_order == 'desc' the page number counts
        # DOWN (1 -> 0 -> -1 ...).  Kept as in the original; confirm this is
        # what the API expects before relying on it.
        if kwargs.get('sort_order') == 'desc':
            kwargs['page'] -= 1
        else:
            kwargs['page'] += 1
# Terms to search for, one query per term.
SEARCH_TERMS = ["Bach", "tree", "horse", "cow", "Gore"]

# Distinct collection titles seen across all searches
# (None is recorded when a doc offers no 'isPartOf' title).
collections = set()
for term in SEARCH_TERMS:
    # Take at most the first 100 hits for this term.
    results = list(islice(dpla_query(q=term), 100))
    collections.update(
        doc.get('isPartOf', {'title': None}).get('title')
        for (doc, _count) in results
    )

# Report how many distinct collections were found, then list them.
print(len(collections))
for collection in collections:
    print(collection)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment