@aniemerg
Created August 4, 2012 21:07
Retrieves forward cites for a sample of patents. Randomly samples patent numbers with replacement in the range from *begin* to *end* and counts forward cites using the USPTO's PATFT. Repeats *number* times.
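For each sampled patent number, the script hits PATFT's advanced search with a ref/ query, which matches every patent whose list of references cites the sampled one. As an illustration (6123456 is a hypothetical patent number), the request URL the script builds looks like:

http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=0&p=1&f=S&l=50&Query=ref%2F6123456&d=PTXT

The hit count on the results page is then parsed out with a regular expression.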
# CountForwardCites(begin, end, number)
# by Allan Niemerg
# Randomly samples patent numbers with replacement in the range
# from *begin* to *end* and counts forward cites using the
# USPTO's PATFT. Repeats *number* times.
# *IMPORTANT NOTE*
# The USPTO PATFT policies limit the number of database accesses
# per IP address. To avoid violating their terms of use and
# getting banned, please limit the number of patents you
# examine using this script. I would suggest no more than 100
# patents per day.
import urllib2
import re
import time
import random
import sys
import httplib

def CountForwardCites(begin=6000000, end=6500000, number=100):
    # Randomly pick *number* patent numbers in the range, with replacement
    patentNumbers = []
    for num in range(number):
        patentNumbers.append(random.randrange(begin, end))
    FCites = {}
    TestResults = {}
    # Get forward cites for each patent
    for patent in patentNumbers:
        # Let's not flood the patent office
        time.sleep(5)
        # Create the search address with the correct search options;
        # "ref/<patent>" asks PATFT for every patent that cites <patent>
        search_address = "http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=0&p=1&f=S&l=50"
        options = "&Query=ref%%2F%s&d=PTXT" % (patent,)
        the_add = search_address + options
        # Query the USPTO search
        print the_add
        h = urllib2.HTTPHandler(debuglevel=1)
        opener = urllib2.build_opener(h)
        request = urllib2.Request(the_add)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6')
        try:
            feeddata = opener.open(request).read()
        except urllib2.HTTPError, e:
            print e.code
            continue
        except urllib2.URLError, e:
            print e.args
            continue
        except httplib.BadStatusLine, e:
            print "Bad status line for %s" % (str(patent),)
            continue
        # Extract the forward-cite count; the results page reports
        # "REF/<patent></B>: <n> patents."
        searchstring = r"REF/" + str(patent) + r"</B>: (\d+) patents\."
        Number = re.findall(searchstring, feeddata)
        if len(Number) == 0:
            # No hit count appears when PATFT jumps straight to a single
            # matching document, so a missing count is treated as one
            # forward cite
            forwardCites = 1
        else:
            forwardCites = int(Number[0])
        print "forward cites for " + str(patent) + ": " + str(forwardCites)
        FCites[forwardCites] = FCites.get(forwardCites, 0) + 1
        TestResults[patent] = forwardCites
    # Save the per-patent data points to a file
    out1 = "TestResultsJuly20.txt"
    outf = open(out1, 'w')
    for patent_num, cites in TestResults.items():
        outf.write("%s, %s \n" % (patent_num, cites))
    outf.close()
    # Save the distribution of forward-cite counts to a timestamped file
    thetime = time.strftime("%Y%m%d_%H%M%S")
    outputfile = "ForwardCites_" + thetime + ".txt"
    outfile = open(outputfile, 'w')
    results = FCites.items()
    results.sort()
    for result in results:
        outfile.write("['%s', %s],\n" % (result[0], result[1]))
    outfile.close()
if __name__ == "__main__":
    if len(sys.argv) == 4:
        CountForwardCites(int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3]))
    else:
        CountForwardCites()
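Usage (a minimal sketch; the filename CountForwardCites.py is an assumption, since the gist does not name the file):

# Sample 100 patent numbers from the default range 6,000,000-6,500,000:
#     python CountForwardCites.py
# Sample 50 patent numbers from the range 7,000,000-7,500,000:
#     python CountForwardCites.py 7000000 7500000 50

Each run writes two files: TestResultsJuly20.txt, with one "patent number, forward cites" line per sampled patent, and ForwardCites_<timestamp>.txt, with the distribution of counts as "['<count>', <frequency>]," lines.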