Created
November 9, 2012 14:19
-
-
Save rdhyee/4045946 to your computer and use it in GitHub Desktop.
dp.la API query method
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Goal: feed a bunch of search terms to try to get at some collections
# something to compare to: https://gist.github.com/4046626
# API doc: https://github.com/dpla/platform/wiki
# data sources: http://dp.la/wiki/Platform_test_data_sources
import requests | |
import json | |
import urllib | |
from itertools import islice | |
# Retrieve an item by ID
# http://api.dp.la/v1/items/a4e2346032cae75b0832abe064c14bcb
# Retrieve multiple items by ID
# http://api.dp.la/v1/items/a4e2346032cae75b0832abe0644e9b26,a4e2346032cae75b0832abe064c14bcb
def dpla_query(**kw_input):
    """Iterate over DPLA items matching a query, following pagination.

    Keyword arguments are sent as query-string parameters to
    ``http://api.dp.la/v1/items``.  Defaults are ``page_size=20``,
    ``page=1`` and ``sort_order="asc"``; any parameter whose value is
    ``None`` is left out of the request.

    The special keyword ``extras`` takes a dict of additional parameters
    whose names cannot be written as Python keyword arguments (e.g.
    ``"spatial.coordinates"`` or ``"temporal.after"``).  Its entries
    override other keyword arguments of the same name.

    Text search fields noted for this API: title, description,
    dplaContributor, creator, type, publisher, format, rights,
    contributor, spatial.  Temporal filters look like
    ``temporal.after=1963-11-01`` / ``temporal.before=1963-11-30``.
    Location search is available in the API but not specifically
    supported here.

    Yields:
        ``(doc, count)`` tuples, where ``doc`` is one entry of the
        response's ``"docs"`` list and ``count`` is that response's
        ``"count"`` value.  Iteration stops at the first page whose
        ``"docs"`` list is empty.
    """
    params = {"page_size": 20, "page": 1, "sort_order": "asc"}

    # ``extras`` may be omitted or passed as None; treat both as "nothing".
    extras = kw_input.pop("extras", None) or {}
    kw_input.update(extras)
    params.update(kw_input)

    # A value of None means "do not send this parameter at all".
    params = {k: v for k, v in params.items() if v is not None}

    # Use .get(): "sort_order" / "page" are gone from params if the caller
    # passed them as None, and indexing would raise KeyError.
    # NOTE(review): stepping the page *backwards* for "desc" is kept from
    # the original code; sort_order presumably only controls result
    # ordering, so always stepping forward may be what is wanted -- confirm
    # against the API documentation.
    step = -1 if params.get("sort_order") == "desc" else 1
    page = params.get("page", 1)

    while True:
        params["page"] = page
        # Let requests build the query string; this avoids the Python 2
        # only urllib.urlencode and works unchanged on Python 3.
        r = requests.get("http://api.dp.la/v1/items", params=params)
        content = json.loads(r.content)

        docs = content["docs"]
        if not docs:
            break

        for doc in docs:
            yield (doc, content["count"])

        page += step
        if page < 1:
            # Walking backwards ran off the first page; page numbers
            # below 1 are not requested.
            break
# search terms to feed in
SEARCH_TERMS = ["Bach", "tree", "horse", "cow", "Gore"]

# Distinct collection titles seen across all searches.  A doc without an
# 'isPartOf' entry contributes None.
collections = set()

for term in SEARCH_TERMS:
    # Look at no more than the first 100 hits per search term.
    for doc, count in islice(dpla_query(q=term), 100):
        collections.add(doc.get('isPartOf', {'title': None}).get('title'))

# Single-argument print(...) behaves identically on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(len(collections))
for collection in collections:
    print(collection)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment