Retrieves forward citations for a sample of patents. Randomly samples patents with replacement in the range from *begin* to *end* and counts forward cites using the USPTO's PATFT. Repeats *number* times.
# CountForwardCites(begin, end, number)
# by Allan Niemerg
# Randomly samples patents with replacement in the range
# from *begin* to *end* and counts forward cites using the
# USPTO's PATFT. Repeats *number* times.
# *IMPORTANT NOTE*
# The USPTO PATFT policies limit the number of database accesses
# per IP address. To avoid violating their terms of use and
# getting banned, please limit the number of patents you
# examine using this script. I would suggest no more than 100
# patents per day.
import urllib2
import re
import time
import random
import sys
import httplib
def CountForwardCites(begin=6000000, end=6500000, number=100):
    # Randomly pick *number* patent numbers in the range (with replacement)
    patentNumbers = []
    for _ in range(number):
        patentNumbers.append(random.randrange(begin, end))
    FCites = {}
    TestResults = {}
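    # FCites is a histogram mapping a forward-cite count to how often it
    # occurred; TestResults maps each sampled patent number to its count.
    # Because sampling is with replacement, a patent drawn twice simply
    # overwrites its own entry in TestResults.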
    # Get forward cites for each patent
    for patent in patentNumbers:
        # Let's not flood the patent office
        time.sleep(5)
        # Create the search address with the correct search options
        search_address = "http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=0&p=1&f=S&l=50"
        options = "&Query=ref%%2F%s&d=PTXT" % (patent)
        the_add = search_address + options
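        # The encoded query decodes to "ref/<patent>", an advanced search
        # for patents whose references cite this number; d=PTXT selects
        # the issued-patent full-text database.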
        # Query USPTO search
        print the_add
        h = urllib2.HTTPHandler(debuglevel=1)
        opener = urllib2.build_opener(h)
        request = urllib2.Request(the_add)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6')
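        # A desktop-browser User-Agent is spoofed above, presumably because
        # PATFT rejected requests from the default urllib2 agent.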
        urllib2.install_opener(opener)
        try:
            feeddata = opener.open(request).read()
            #print feeddata
        except urllib2.HTTPError, e:
            print e.code
            continue
        except urllib2.URLError, e:
            print e.args
            continue
        except httplib.BadStatusLine, e:
            print "Bad Status line for %s" % (str(patent))
            continue
        # Extract the forward-cite count from the results page
        forwardCites = 0
        searchstring = r"REF/%s</B>: (\d+) patents\." % patent
        Number = re.findall(searchstring, feeddata)
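        # When the hit-count line is absent, PATFT has presumably jumped
        # straight to the single matching document, so the count is taken
        # as 1. (A zero-hit page would also lack this line, so 0 and 1
        # are conflated here.)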
        if len(Number) == 0:
            forwardCites = 1
        else:
            forwardCites = int(Number[0])
        print "forward cites for " + str(patent) + ": " + str(forwardCites)
        FCites[forwardCites] = FCites.get(forwardCites, 0) + 1
        TestResults[patent] = forwardCites
    # Save individual data points to file
    out1 = "TestResultsJuly20.txt"
    outf = open(out1, 'w')
    for patent_num, cites in TestResults.items():
        line = "%s, %s \n" % (patent_num, cites)
        outf.write(line)
    outf.close()
    # Save the histogram of forward-cite counts, sorted by count
    thetime = time.strftime("%Y%m%d_%H%M%S")
    outputfile = "ForwardCites_" + thetime + ".txt"
    outfile = open(outputfile, 'w')
    results = FCites.items()
    results.sort()
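    # Each histogram row is written as ['count', frequency], which looks
    # intended for pasting straight into a JavaScript array literal
    # (e.g. for a charting library).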
    for result in results:
        line = "['%s', %s],\n" % (result[0], result[1])
        outfile.write(line)
    outfile.close()
if __name__ == "__main__":
    if len(sys.argv) > 3:
        CountForwardCites(int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3]))
    else:
        CountForwardCites()
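Usage sketch (assuming Python 2.x; the filename count_forward_cites.py is hypothetical):

python count_forward_cites.py 6000000 6500000 50

Run with no arguments to use the defaults (100 samples between patents 6,000,000 and 6,500,000). Each run writes the raw patent/count pairs to TestResultsJuly20.txt and a sorted histogram of counts to a timestamped ForwardCites_*.txt file.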