Skip to content

Instantly share code, notes, and snippets.

@bbengfort
Created April 7, 2015 16:12
Show Gist options
  • Save bbengfort/047b778915131969a5dd to your computer and use it in GitHub Desktop.
Save bbengfort/047b778915131969a5dd to your computer and use it in GitHub Desktop.
Google Book Service
from urllib import urlencode
from datetime import date, datetime
from httplib import HTTPSConnection, HTTPException
try:
from django.utils import simplejson as json
except ImportError:
import json
def capitalize(word):
    """
    Capitalize a word, treating every hyphen-separated part on its own.

    Each part gets ``str.capitalize`` applied to it (upper-case first
    character, lower-case remainder), so "spider-man" becomes "Spider-Man".
    A word without a hyphen is simply capitalized.
    """
    # Splitting a hyphen-free word yields exactly one part, so a single
    # code path covers both cases and the parameter is never rebound.
    return '-'.join(part.capitalize() for part in word.split('-'))
def title_case(string):
    """
    Title-case a string that is split on single spaces.

    The first and last word are always capitalized with ``str.capitalize``
    (which also lower-cases the rest of the word). For every word in
    between:

    - an all-upper-case word is treated as an acronym and left untouched;
    - a minor word (article, short preposition or conjunction, see
      ``minor_words``) is lower-cased;
    - anything else is lower-cased and passed to ``capitalize``, which
      capitalizes each hyphen-separated part.
    """
    minor_words = frozenset([
        "the", "a", "an", "of", "at", "on",
        "to", "over", "and", "but", "or", "nor",
    ])

    words = string.split(' ')
    last = len(words) - 1

    for i, word in enumerate(words):
        if i == 0 or i == last:
            # Edge words are capitalized unconditionally, even if they are
            # minor words or acronyms.
            words[i] = word.capitalize()
        elif not word.isupper():
            lowered = word.lower()
            if lowered in minor_words:
                words[i] = lowered
            else:
                words[i] = capitalize(lowered)
        # else: all-caps interior word, assumed to be an acronym; keep it.

    return ' '.join(words)
def make_title(t):
    """
    Normalize a raw book title for display.

    Steps, in order:

    1. strip surrounding whitespace and apply ``title_case``;
    2. replace HTML-escaped ampersands with a plain ``&``;
    3. if the title ends in a comma-appended article ("Great Gatsby, The"),
       move that article to the front ("The Great Gatsby").

    Returns the normalized title string.
    """
    leading_articles = ("The", "A", "An", "Of", "At", "On")

    # Make words title case and remove extra whitespace.
    t = title_case(t.strip())

    # Replace escaped ampersands. title_case lower-cases everything after a
    # word's first character, so the entity normally arrives as '&amp;';
    # the '&Amp;' spelling is still handled for backward compatibility.
    for entity in ('&Amp;', '&amp;'):
        t = t.replace(entity, '&')

    # Check for a comma-appended article. At least one comma is required:
    # without that guard a title consisting only of an article ("The")
    # would be rebuilt with a trailing space.
    parts = t.split(',')
    if len(parts) > 1 and parts[-1].strip() in leading_articles:
        article = parts.pop().strip()
        t = "%s %s" % (article, ','.join(parts))

    return t
class GoogleBooksException(Exception):
    """
    Error type shared by the Google Books helpers in this module.

    It is raised for invalid query keywords, for connection and response
    parsing failures, for error payloads returned by the API, and for
    volume data that lacks required fields.
    """
class QueryBuilder(object):
    """
    Helper to dynamically construct a Google Books search query, both
    from init and at runtime.

    A query is made of plain search terms and keyword search terms.
    Plain terms are searched throughout the whole search space (book,
    title, author, etc.). Keyword terms restrict the search to one
    particular field; the accepted keywords are listed in the KEYWORDS
    class variable.

    All pieces are joined with a literal '+' when the query string is
    built, e.g. ``flowers+inauthor:keyes``.
    """

    KEYWORDS = (
        'intitle',
        'inauthor',
        'inpublisher',
        'subject',
        'isbn',
        'lccn',
        'oclc',
    )

    def __init__(self, *terms, **kwterms):
        """
        Construct a query from plain terms and keyword terms.

        Raises GoogleBooksException if a keyword is not in KEYWORDS.
        """
        self.terms = []
        self.kwterms = {}

        for term in terms:
            self.add_term(term)

        for keyword, term in kwterms.items():
            self.add_keyword_term(keyword, term)

    def add_term(self, term):
        """
        Add a plain term to the query; a term already present is ignored.
        """
        if term not in self.terms:
            self.terms.append(term)

    def add_keyword_term(self, keyword, term):
        """
        Add a term restricted to the given keyword field.

        Adding a second term for the same keyword appends it to the
        existing one with a '+'. Raises GoogleBooksException if the
        keyword is not in KEYWORDS.
        """
        # Looked up through the instance so a subclass may extend KEYWORDS.
        if keyword not in self.KEYWORDS:
            raise GoogleBooksException('%s is not a valid keyword' % keyword)

        if keyword in self.kwterms:
            term = '+'.join([self.kwterms[keyword], term])
        self.kwterms[keyword] = term

    # The following are aliases for constructing the query string.

    @property
    def query(self):
        """
        Property alias for get_query_string().
        """
        return self.get_query_string()

    def get_query_string(self):
        """
        Build and return the query string.

        Plain terms come first, followed by every ``keyword:term`` pair,
        all joined with '+'. An empty builder yields an empty string.
        """
        plain = "+".join(self.terms)
        keyed = "+".join("%s:%s" % (k, t) for k, t in self.kwterms.items())
        # Drop whichever half is empty so no stray '+' is produced.
        return "+".join(piece for piece in (plain, keyed) if piece)

    def __str__(self):
        return self.query

    def __unicode__(self):
        return self.query
class GoogleBooks(object):
    """
    Minimal client for the Google Books volumes endpoint.

    The request is described by the VERB, HOST and URI class attributes;
    any of them (or any other attribute) can be overridden per instance
    through keyword arguments to the constructor.
    """

    VERB = "GET"
    HOST = "www.googleapis.com"
    URI = "/books/v1/volumes"

    def __init__(self, **kwargs):
        """
        Store every keyword argument as an instance attribute, which adds
        extra settings or overrides the class-level defaults.
        """
        # items() instead of iteritems() so this also runs on Python 3.
        for k, v in kwargs.items():
            setattr(self, k, v)

    def lookup(self, query):
        """
        Executes a lookup request for the given QueryBuilder query.

        Returns one of the following:
        - None (for nothing found)
        - A single BookDict (when the API reports exactly one item)
        - A list of BookDicts (when the API reports several items)

        Raises GoogleBooksException when the API returns an error payload
        or the response lacks the expected "totalItems"/"items" fields.

        Note: GoogleBooksExceptions raised while parsing an individual
        item are caught and that item is skipped (None is returned if
        nothing usable is left). Construct a BookDict directly to debug.
        """
        response = self.execute({'q': query})

        # Error payload: surface Google's first message when there is one.
        if "error" in response:
            error = response["error"]
            if "errors" in error:
                errors = error["errors"]
                if len(errors) > 0 and "message" in errors[0]:
                    raise GoogleBooksException(errors[0]["message"])
            raise GoogleBooksException("Unknown error response from Google")

        if "totalItems" not in response:
            raise GoogleBooksException("Unkown API error: no total item count in response.")

        count = response["totalItems"]
        if count == 0:
            return None

        if "items" not in response:
            raise GoogleBooksException("Unknown API error: could not find items in response.")
        items = response["items"]

        if count == 1:
            # Return a single item, some queries expect this, e.g. isbn queries.
            if not items:
                # The count promised one item but none was delivered.
                return None
            try:
                return BookDict(items[0])
            except GoogleBooksException:
                return None

        if count > 1:
            # TODO: case where there are multiple pages of results
            books = []
            for item in items:
                try:
                    books.append(BookDict(item))
                except GoogleBooksException:
                    continue
            return books or None

        # Total count was negative.
        return None

    def get_required_params(self):
        """
        Returns a new dictionary of the parameters that must be added to
        every request. Empty by default; subclasses may override this.
        """
        return {}

    def execute(self, params):
        """
        Execute a request with the given parameters and return the decoded
        JSON response.

        The required parameters are merged with ``params`` (``params``
        wins on conflict) and percent-encoded into the query string.

        Raises GoogleBooksException if the connection fails or the
        response body is not valid JSON.
        """
        request = self.get_required_params()
        request.update(params)
        # NOTE(review): urlencode percent-escapes the '+' separators that
        # QueryBuilder emits -- confirm that is what the API expects.
        path = '?'.join([self.URI, urlencode(request)])

        conn = None
        try:
            conn = HTTPSConnection(self.HOST)
            conn.request(self.VERB, path)
            body = conn.getresponse().read()
        except (HTTPException, IOError) as e:
            # IOError also covers socket-level failures (DNS, refused
            # connection, TLS), which are not HTTPException subclasses.
            raise GoogleBooksException("Could not connect to Google: %s" % str(e))
        finally:
            # Always release the connection, including on failure.
            if conn is not None:
                conn.close()

        try:
            return json.loads(body)
        except ValueError as e:
            raise GoogleBooksException("Could not parse response from Google: %s" % str(e))
class BookDict(object):
    """
    Read-only, dictionary-like wrapper around one item of a Google Books
    lookup response.

    The constructor extracts the fields this application cares about from
    the item's "volumeInfo" (see ``parse``). They are reachable through
    ``book[key]`` and through convenience properties. Optional fields that
    Google did not supply are stored as None.
    """

    def __init__(self, item_data):
        """
        Requires a python dictionary converted from Google Books JSON,
        specifically one item (not the items list or the whole JSON).

        Raises GoogleBooksException if required data is missing.
        """
        self.raw_data = item_data
        self.item = self.parse(item_data)

    @property
    def isbn(self):
        """
        Searches the "industryIdentifiers" field for the most relevant
        ISBN -- prefers ISBN_13, falls back to ISBN_10, else None.
        """
        # Order matters here, the first one is the priority!
        searchkeys = ("ISBN_13", "ISBN_10")
        return self.isbn_search(self["industryIdentifiers"], searchkeys)

    @property
    def shortisbn(self):
        """
        Searches for ISBN_10 in "industryIdentifiers"; None if absent.
        """
        return self.isbn_search(self["industryIdentifiers"], ("ISBN_10",))

    @property
    def title(self):
        """
        The book title, normalized with make_title().
        """
        return make_title(self['title'])

    @property
    def authors(self):
        """
        Generator that yields every entry of the "authors" field.

        Yields nothing when the field is missing. Currently yields the
        full name strings as supplied by Google.
        """
        # "authors" is optional and stored as None when absent.
        for author in self['authors'] or ():
            yield author

    @property
    def publisher(self):
        """
        Alias for self['publisher']; None when not supplied.
        """
        return self['publisher']

    @property
    def pubdate(self):
        """
        Converts "publishedDate" into a python date.

        Accepts "YYYY-MM-DD", "YYYY-MM" and "YYYY". Returns None if the
        field is missing or matches none of these formats.
        """
        published = self["publishedDate"]
        for fmt in ("%Y-%m-%d", "%Y-%m", "%Y"):
            try:
                return datetime.strptime(published, fmt).date()
            except ValueError:
                # This format didn't match, try the next one.
                continue
            except TypeError:
                # published is not a string (probably None): give up.
                return None
        return None

    @property
    def pages(self):
        """
        Alias for self["pageCount"]
        """
        return self["pageCount"]

    @property
    def description(self):
        """
        Alias for self["description"]
        """
        return self["description"]

    @property
    def language(self):
        """
        Alias for self["language"]
        """
        return self["language"]

    @property
    def thumbnail_url(self):
        """
        Returns the URL of the biggest available thumbnail image, or None
        if the book has no image links.

        NOTE: To get bigger images out of Google, you have to do a
        request with the Google specific identifier.
        TODO: Strip weirdness out of URL
        """
        # "imageLinks" is optional and stored as None when absent.
        links = self["imageLinks"] or {}
        for size in ("thumbnail", "smallThumbnail"):
            if size in links:
                return links[size]
        return None

    def parse(self, data):
        """
        Extracts the relevant fields from the raw item data and returns
        them as a plain dictionary.

        Required fields must be present in data["volumeInfo"]; optional
        fields default to None. Modify the two tuples below to change
        which fields are required vs. optional.

        Raises GoogleBooksException if "volumeInfo" or a required field
        is missing.
        """
        if "volumeInfo" not in data:
            raise GoogleBooksException("Expected to find volumeInfo, could not.")
        info = data["volumeInfo"]

        required = (
            "title",
            "industryIdentifiers",
        )
        optional = (
            "authors",
            "publisher",
            "publishedDate",
            "description",
            "pageCount",
            "imageLinks",
            "language",
        )

        book = {}
        for key in required:
            if key not in info:
                # Interpolate the key into the message instead of passing
                # it as a second exception argument.
                raise GoogleBooksException("Required key, %s, was not found" % key)
            book[key] = info[key]

        for key in optional:
            book[key] = info.get(key)

        return book

    def isbn_search(self, space, keys):
        """
        Performs a "priority search" on the search space.

        ``space`` is an iterable of dictionary-like objects that have a
        'type' and an 'identifier' entry (in this case, the Google Books
        industryIdentifiers list). ``keys`` is an ordered tuple or list
        of types, highest priority first.

        Returns the identifier whose type ranks best in ``keys`` (the
        first one encountered on ties), or None if no type matches.

        TODO: Make more generic
        """
        best = None       # Identifier of the best match so far
        best_rank = None  # Position of that match's type within keys

        for item in space:
            kind = item['type']
            if kind not in keys:
                continue
            rank = keys.index(kind)
            if rank == 0:
                # Nothing can outrank the first key, stop searching.
                return item['identifier']
            if best_rank is None or rank < best_rank:
                best = item['identifier']
                best_rank = rank

        return best

    def serialize(self):
        """
        Returns a dictionary compatible to the Book model
        """
        return {
            'isbn': self.isbn,
            'shortisbn': self.shortisbn,
            'title': self.title,
            'authors': list(self.authors),
            'publisher': {'name': self.publisher, 'location': None},
            'pubdate': self.pubdate,
            'pages': self.pages,
            'description': self.description,
            'language': self.language,
        }

    def __getitem__(self, name):
        """
        Provides read-only access to the item dictionary through the
        normal dictionary access mechanism. Unknown names yield None
        instead of raising KeyError.
        """
        return self.item.get(name)

    def __contains__(self, name):
        return name in self.item

    def __str__(self):
        # "authors" may be None; fall back to an empty author list.
        authors = ', '.join(self['authors'] or ())
        return "<BookDict: %s by %s>" % (self['title'], authors)
if __name__ == "__main__":
    # Demo: look up one ISBN against the live API and print the result.
    # Alternative example query: QueryBuilder('flowers', inauthor="keyes")
    query = QueryBuilder(isbn="9780441001866")
    books = GoogleBooks()
    response = books.lookup(query)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    if response is None:
        print("Could not find query: %s" % query)
    elif isinstance(response, list):
        for book in response:
            print(book)
    else:
        print(response)
        for k, v in response.serialize().items():
            print("%s: %s" % (k, v))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment