Skip to content

Instantly share code, notes, and snippets.

@eyeseast
Created December 31, 2010 15:12
Show Gist options
  • Save eyeseast/761073 to your computer and use it in GitHub Desktop.
Save eyeseast/761073 to your computer and use it in GitHub Desktop.
Download a search result full of documents as their original PDFs
#!/usr/bin/env python
# encoding: utf-8
"""
Download a search result full of documents as their original PDFs
"""
import sys
import os
import httplib2
import urllib
try:
import json
except ImportError:
import simplejson as json
# Module-level httplib2 client shared by every request in this script so that
# connections can be reused between calls. '.cache' is handed to httplib2 as
# its cache argument (per the httplib2 docs, a string names a directory used
# for an on-disk response cache; being relative, it resolves against the
# current working directory).
http = httplib2.Http('.cache')
def search(q, page=1, sections=False, annotations=False):
    """
    Run one search request against DocumentCloud and return its documents.

    q is the query string and page the result page to request; sections
    and annotations are forwarded as query parameters of the same name.

    Returns whatever the decoded JSON response holds under the 'documents'
    key, or None when that key is absent.
    """
    query_string = urllib.urlencode({
        'q': q,
        'page': page,
        'sections': sections,
        'annotations': annotations,
    })
    url = "http://www.documentcloud.org/api/search.json?" + query_string
    # only the response body is used; the headers are discarded
    _, body = http.request(url)
    return json.loads(body).get('documents')
def save_docs(documents):
    """
    Download the PDF of each document into a local 'documents' directory.

    The directory is created in the current working directory when it does
    not exist yet. Each entry of documents must provide
    doc['resources']['pdf'], the URL of the PDF to fetch.

    Returns the list of local file paths, in the order they were saved.
    """
    if not os.path.isdir('documents'):
        os.mkdir('documents')
    target_dir = os.path.abspath('documents')

    saved_paths = []
    for document in documents:
        pdf_url = document['resources']['pdf']
        # the local file is named after the last path segment of the URL
        filename = pdf_url.rsplit('/', 1)[1]
        destination = os.path.join(target_dir, filename)
        local_path, _headers = urllib.urlretrieve(pdf_url, destination)
        saved_paths.append(local_path)
    return saved_paths
def main():
    """
    Search DocumentCloud for the command-line query and download every hit.

    All command-line arguments are joined with spaces to form the query.
    Result pages are requested one after another, starting at page 1, until
    a page comes back empty; the collected documents are then saved with
    save_docs() and a summary plus the saved file paths are printed.

    Exits with status 1 (after writing a hint to stderr) when no query was
    given on the command line.
    """
    if not sys.argv[1:]:
        sys.stderr.write("\nPlease enter a search query\n\n")
        # a missing query is a usage error, so signal failure to the caller
        sys.exit(1)

    query = " ".join(sys.argv[1:])
    page = 1
    documents = []
    while True:
        results = search(query, page=page)
        if not results:
            break
        documents += results
        page += 1

    # download first, so the summary reports what was actually saved
    saved = save_docs(documents)
    # `page` now points at the first empty page, so only page - 1 pages
    # actually contained results
    print("Downloaded %s documents from %s pages" % (len(saved), page - 1))
    print('\n'.join(saved))
# run the downloader only when executed as a script, not when imported
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment