-
-
Save ixtel/4d334d4672d6d3b67e8a to your computer and use it in GitHub Desktop.
Google Book Service
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib import urlencode | |
from datetime import date, datetime | |
from httplib import HTTPSConnection, HTTPException | |
try: | |
from django.utils import simplejson as json | |
except ImportError: | |
import json | |
def capitalize(word):
    """
    Capitalize a word, treating every hyphen-separated part on its own.

    "well-known" becomes "Well-Known". A word without a hyphen is just
    run through ``str.capitalize`` (first character upper-cased, the
    remaining characters lower-cased).
    """
    # Splitting a hyphen-free word yields a single part, so this one
    # expression covers both the hyphenated and the plain case.
    return '-'.join(part.capitalize() for part in word.split('-'))
def title_case(string):
    """
    Title-case a string that is split on single spaces.

    The first and last words are always passed through
    ``str.capitalize``. Every word in between is:

    * left untouched when it is entirely upper case (treated as an
      acronym),
    * lower-cased when it is one of the minor words listed below,
    * otherwise lower-cased and capitalized with ``capitalize`` (which
      also capitalizes each part of a hyphenated word).

    NOTE(review): because the first/last word always goes through
    ``str.capitalize``, an acronym in either of those positions is
    lower-cased after its first letter ("NASA" -> "Nasa"). Confirm
    whether that is intended before changing it; ``make_title`` relies
    on the last word being capitalized.
    """
    minor_words = frozenset((
        "the", "a", "an", "of", "at", "on",
        "to", "over", "and", "but", "or", "nor",
    ))

    words = string.split(' ')
    last = len(words) - 1

    for i, word in enumerate(words):
        if i == 0 or i == last:
            # First word and last word should be capitalized.
            words[i] = word.capitalize()
        elif not word.isupper():
            # Not an acronym: normalize the case of the word.
            lowered = word.lower()
            if lowered in minor_words:
                words[i] = lowered
            else:
                words[i] = capitalize(lowered)
        # else: acronym in the middle of the title, keep it as-is.

    return ' '.join(words)
def make_title(t):
    """
    Normalize a raw title string for display.

    Steps, in order:

    1. Strip surrounding whitespace and apply ``title_case``.
    2. Replace the HTML ampersand entity (in either casing) with "&".
    3. If the title ends with a comma-appended article, e.g.
       "Hobbit, The", move the article to the front: "The Hobbit".

    Returns the normalized title string.
    """
    articles = ("The", "A", "An", "Of", "At", "On")

    # Make words title-cased and remove surrounding whitespace.
    t = title_case(t.strip())

    # Replace the ampersand entity. Both spellings are handled: the
    # lower-case form is the real entity, the capitalized form covers
    # text that has been through a capitalizing step.
    t = t.replace('&Amp;', '&')
    t = t.replace('&amp;', '&')

    # Check for a comma-appended article. At least one comma is required;
    # otherwise a title that consists solely of an article ("The") would
    # be rewritten to itself plus a trailing space.
    parts = t.split(',')
    if len(parts) > 1 and parts[-1].strip() in articles:
        article = parts.pop().strip()
        t = "%s %s" % (article, ','.join(parts))

    return t
class GoogleBooksException(Exception):
    """
    Error type used throughout this module for Google Books failures.

    It is raised directly (it is not abstract) for invalid query
    keywords, connection and response-parsing problems, error payloads
    returned by the API, and items that lack required fields.
    """
class QueryBuilder(object):
    """
    Helper to dynamically construct a Google Books search query, both
    from init and at runtime.

    A query is made of plain search terms and keyword search terms.
    Plain terms are emitted as-is; keyword terms are emitted as
    ``keyword:term`` and restrict the search to a particular field.
    Only the keywords listed in the KEYWORDS class variable are
    accepted. All parts are joined with "+".
    """

    KEYWORDS = (
        'intitle',
        'inauthor',
        'inpublisher',
        'subject',
        'isbn',
        'lccn',
        'oclc',
    )

    def __init__(self, *terms, **kwterms):
        """
        Construct a query from terms and keyword terms.

        Raises GoogleBooksException if a keyword argument is not one of
        KEYWORDS.
        """
        self.terms = []      # plain terms, unique, in insertion order
        self.kwterms = {}    # keyword -> "+"-joined terms for that keyword

        for term in terms:
            self.add_term(term)

        for keyword, term in kwterms.items():
            self.add_keyword_term(keyword, term)

    def add_term(self, term):
        """
        Add a term to the query; a term already present is ignored.
        """
        if term not in self.terms:
            self.terms.append(term)

    def add_keyword_term(self, keyword, term):
        """
        Add a keyword term to the query.

        A second term for the same keyword is appended to the first,
        separated by "+". Raises GoogleBooksException when the keyword
        is not listed in KEYWORDS.
        """
        if keyword not in QueryBuilder.KEYWORDS:
            raise GoogleBooksException('%s is not a valid keyword' % keyword)

        if keyword in self.kwterms:
            self.kwterms[keyword] = '+'.join([self.kwterms[keyword], term])
        else:
            self.kwterms[keyword] = term

    # The following are aliases for constructing the query string.

    @property
    def query(self):
        """
        Property alias for get_query_string().
        """
        return self.get_query_string()

    def get_query_string(self):
        """
        Construct the query string and return it.

        Plain terms come first, followed by one ``keyword:term`` part
        per keyword, all joined with "+". An empty builder yields "".
        """
        joined_terms = "+".join(self.terms)
        # Skip the plain-term part entirely when it is empty so the
        # result never starts with a dangling "+".
        parts = [joined_terms] if joined_terms != '' else []
        parts.extend("%s:%s" % (k, t) for k, t in self.kwterms.items())
        return "+".join(parts)

    def __str__(self):
        return self.query

    def __unicode__(self):
        return self.query
class GoogleBooks(object):
    """
    Minimal client for the Google Books volumes endpoint.

    Issues an HTTPS request to HOST + URI with the query passed as the
    "q" parameter and turns the JSON answer into BookDict objects.
    """

    VERB = "GET"
    HOST = "www.googleapis.com"
    URI = "/books/v1/volumes"

    def __init__(self, **kwargs):
        """
        Every keyword argument is set as an instance attribute, which
        allows overriding the class-level defaults (VERB, HOST, URI).
        """
        # items() instead of iteritems(): same effect, and it is not
        # restricted to Python 2.
        for name, value in kwargs.items():
            setattr(self, name, value)

    def lookup(self, query):
        """
        Executes a lookup request for the given QueryBuilder query.

        Returns one of the following:
        - None (nothing found, or no returned item could be parsed)
        - A single BookDict (when the reported total is exactly one)
        - A list of BookDicts (when the reported total is more than one)

        Raises GoogleBooksException when the response carries an
        "error" member, or lacks "totalItems" / "items".

        Note: GoogleBooksExceptions raised while building a BookDict
        are caught and the item is skipped (or None is returned for a
        single item). Construct BookDict directly to debug.
        """
        response = self.execute({'q': query})

        # Check for an error payload first.
        if "error" in response:
            detail = response["error"]
            errors = detail["errors"] if "errors" in detail else []
            if len(errors) > 0 and "message" in errors[0]:
                raise GoogleBooksException(errors[0]["message"])
            raise GoogleBooksException("Unknown error response from Google")

        if "totalItems" not in response:
            raise GoogleBooksException("Unkown API error: no total item count in response.")

        count = response["totalItems"]
        if count <= 0:
            return None

        if "items" not in response:
            raise GoogleBooksException("Unknown API error: could not find items in response.")

        items = response["items"]
        if not items:
            # A positive total with an empty page: nothing usable here.
            return None

        if count == 1:
            # Return a single item, some queries expect this, e.g. isbn queries
            try:
                return BookDict(items[0])
            except GoogleBooksException:
                return None

        # Return a list of the items returned.
        # TODO: case where there are multiple pages of results
        books = []
        for item in items:
            try:
                books.append(BookDict(item))
            except GoogleBooksException:
                continue
        return books if books else None

    def get_required_params(self):
        """
        Returns a dictionary of the parameters that must be added to
        every query. A fresh (empty) dictionary is returned on each
        call, so callers may mutate it.
        """
        return {}

    def execute(self, params):
        """
        Execute a request with the given parameters and return the
        decoded JSON response.

        Raises GoogleBooksException when an HTTPException occurs while
        connecting, sending or reading, or when the body is not valid
        JSON.
        """
        # Required params first, then the caller's params on top.
        request = self.get_required_params()
        request.update(params)
        # Percent escape and encode the request dictionary.
        target = '?'.join([self.URI, urlencode(request)])

        conn = None
        try:
            conn = HTTPSConnection(self.HOST)
            conn.request(self.VERB, target)
            payload = conn.getresponse().read()
        except HTTPException as e:
            raise GoogleBooksException("Could not connect to Google: %s" % str(e))
        finally:
            # Always release the connection, including on failure paths.
            if conn is not None:
                conn.close()

        try:
            return json.loads(payload)
        except ValueError as e:
            raise GoogleBooksException("Could not parse response from Google: %s" % str(e))
class BookDict(object):
    """
    Read-only, dictionary-like wrapper around one item of a Google
    Books response. The constructor extracts the fields this module
    cares about from the item's "volumeInfo"; the properties expose
    them in a more convenient form.
    """

    def __init__(self, item_data):
        """
        Requires a python dictionary converted from Google Books JSON,
        specifically the item (not the items list or whole JSON).

        Raises GoogleBooksException if "volumeInfo" or a required field
        is missing (see parse).
        """
        self.raw_data = item_data
        self.item = self.parse(item_data)

    @property
    def isbn(self):
        """
        Searches the "industryIdentifiers" field for the most relevant
        ISBN -- looks for ISBN_13, but returns ISBN_10 if it can't find
        it. None when neither is present.
        """
        # Order matters here, the first one is the priority!
        searchkeys = ("ISBN_13", "ISBN_10")
        return self.isbn_search(self["industryIdentifiers"], searchkeys)

    @property
    def shortisbn(self):
        """
        Searches for ISBN_10 in "industryIdentifiers".
        """
        return self.isbn_search(self["industryIdentifiers"], ("ISBN_10",))

    @property
    def title(self):
        """
        The item's title after normalization with make_title.
        """
        return make_title(self['title'])

    @property
    def authors(self):
        """
        Generator that yields every entry of the "authors" field.

        Yields nothing when the item has no authors ("authors" is an
        optional field and is stored as None when absent).

        Note: Currently yields full name strings.
        """
        for author in self['authors'] or ():
            yield author

    @property
    def publisher(self):
        """
        Alias for self['publisher'].
        """
        return self['publisher']

    @property
    def pubdate(self):
        """
        Converts "publishedDate" into a python date.

        The formats YYYY-MM-DD, YYYY-MM and YYYY are tried in that
        order. Returns None if the value is missing or matches none of
        them.
        """
        published = self["publishedDate"]
        for fmt in ("%Y-%m-%d", "%Y-%m", "%Y"):
            try:
                return datetime.strptime(published, fmt).date()
            except ValueError:
                # This format didn't work, try the next.
                continue
            except TypeError:
                # published is not a string (probably None): give up.
                return None
        return None

    @property
    def pages(self):
        """
        Alias for self["pageCount"]
        """
        return self["pageCount"]

    @property
    def description(self):
        """
        Alias for self["description"]
        """
        return self["description"]

    @property
    def language(self):
        """
        Alias for self["language"]
        """
        return self["language"]

    @property
    def thumbnail_url(self):
        """
        Returns the "thumbnail" image link, falling back to
        "smallThumbnail"; None when the item has no image links.

        NOTE: To get bigger images out of Google, you have to do a
        request with the Google specific identifier.
        TODO: Strip weirdness out of URL
        """
        # "imageLinks" is optional and stored as None when absent.
        links = self["imageLinks"] or {}
        for size in ("thumbnail", "smallThumbnail"):
            if size in links:
                return links[size]
        return None

    def parse(self, data):
        """
        Parses out the relevant data from the raw data, and returns it
        as a plain dictionary.

        Required fields must be present in data["volumeInfo"]; optional
        fields that are absent are stored as None.

        Raises GoogleBooksException when "volumeInfo" or a required
        field is missing.

        Modify this method for required vs. optional fields.
        """
        if "volumeInfo" not in data:
            raise GoogleBooksException("Expected to find volumeInfo, could not.")
        info = data["volumeInfo"]

        required = (
            "title",
            "industryIdentifiers",
        )
        optional = (
            "authors",
            "publisher",
            "publishedDate",
            "description",
            "pageCount",
            "imageLinks",
            "language",
        )

        book = {}
        for key in required:
            if key not in info:
                # Interpolate the key into the message (it used to be
                # passed as a separate, unformatted exception argument).
                raise GoogleBooksException("Required key, %s, was not found" % key)
            book[key] = info[key]

        for key in optional:
            book[key] = info[key] if key in info else None

        return book

    def isbn_search(self, space, keys):
        """
        Performs a "priority search" on the search space.

        space -- iterable of dictionary-like objects that have "type"
                 and "identifier" entries (here: the Google Books
                 industryIdentifiers list).
        keys  -- ordered tuple or list of type names; earlier entries
                 take priority.

        Returns the "identifier" of the first entry whose "type" equals
        the highest-priority key present in the space, or None when no
        entry matches any key.

        TODO: Make more generic
        """
        # Materialize once so the space can be scanned once per key.
        entries = list(space)
        for key in keys:
            for entry in entries:
                if entry['type'] == key:
                    return entry['identifier']
        return None

    def serialize(self):
        """
        Returns a dictionary compatible to the Book model
        """
        return {
            'isbn': self.isbn,
            'shortisbn': self.shortisbn,
            'title': self.title,
            'authors': list(self.authors),
            'publisher': {'name': self.publisher, 'location': None},
            'pubdate': self.pubdate,
            'pages': self.pages,
            'description': self.description,
            'language': self.language,
        }

    def __getitem__(self, name):
        """
        Provides read-only access to the item dictionary through the
        normal dictionary access mechanism. Unknown names yield None
        instead of raising KeyError.
        """
        return self.item[name] if name in self else None

    def __contains__(self, name):
        return name in self.item

    def __str__(self):
        authors = self['authors']
        if not authors:
            # No author list available: leave out the "by ..." part.
            return "<BookDict: %s>" % self['title']
        return "<BookDict: %s by %s>" % (self['title'], ', '.join(authors))
if __name__ == "__main__":
    # Small demo: look up one ISBN and print what came back.
    # Alternative example query: QueryBuilder('flowers', inauthor="keyes")
    query = QueryBuilder(isbn="9780441001866")
    books = GoogleBooks()
    response = books.lookup(query)

    # Single-argument print(...) calls produce the same output as the
    # print statement on Python 2 and are also valid on Python 3.
    if response is None:
        print("Could not find query: %s" % query)
    elif isinstance(response, list):
        # Several matches: one line per book.
        for book in response:
            print(book)
    else:
        # Exactly one match: show it, then dump its serialized fields.
        print(response)
        for k, v in response.serialize().items():
            print("%s: %s" % (k, v))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment