Skip to content

Instantly share code, notes, and snippets.

@Radcliffe
Created December 30, 2014 05:56
Show Gist options
  • Save Radcliffe/eba734e22d39eb01f6d8 to your computer and use it in GitHub Desktop.
Save Radcliffe/eba734e22d39eb01f6d8 to your computer and use it in GitHub Desktop.
Download all USPTO patent grants 1976 - present
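# Download every USPTO patent grant .zip archive linked from the Google
# bulk-download index page (1976 - 2014 at the time of writing).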
import os
import urllib
import urllib2
import lxml.html
# Create the download root; every archive is stored below data/<year>/.
if not os.path.exists('data'):
    os.mkdir('data')

# Index page whose anchors point at the bulk-download .zip archives.
URL = "http://www.google.com/googlebooks/uspto-patents-grants-text.html"
doc = lxml.html.parse(URL)
body = doc.find("body")

# Running number of archives handled so far; used only in the progress output.
count = 0

# NOTE(review): findall("a") matches only <a> elements that are direct
# children of <body>; anchors nested deeper would be missed -- confirm that
# this matches the layout of the index page.
for link in body.findall("a"):
    href = link.get("href")
    if not (href and href.endswith(".zip")):
        continue

    # The last two components of the link are expected to be <year>/<file>.zip.
    parts = href.rsplit("/", 2)
    if len(parts) != 3:
        # Without a <year>/ component there is no target directory to derive,
        # so skip the link instead of failing on the tuple unpacking.
        continue
    year, fname = parts[1], parts[2]

    count += 1

    # One sub-directory per year, created on first use.
    year_dir = "data/%s" % year
    if not os.path.exists(year_dir):
        os.mkdir(year_dir)

    fullname = "%s/%s" % (year_dir, fname)
    # Single-argument parenthesised print behaves the same as the Python 2
    # print statement and keeps the line parseable by Python 3.
    print("%04d - Retrieving %s" % (count, fullname))
    urllib.urlretrieve(href, fullname)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment