Retrieves forward citations for a sample of patents. Randomly samples patents with replacement in the range from *begin* to *end* and counts forward cites using the USPTO's PATFT. Repeats *number* times.
# CountForwardCites(begin, end, number)
# by Allan Niemerg
# Randomly samples patents with replacement in the range
# from *begin* to *end* and counts forward cites using the
# USPTO's PATFT. Repeats *number* times.
# *IMPORTANT NOTE*
# The USPTO PATFT policies limit the number of database accesses
# per IP address. To avoid violating their terms of use and
# getting banned, please limit the number of patents you
# examine using this script. I would suggest no more than 100
# patents per day.
import urllib2
import re
import time
import random
import sys
import httplib
def CountForwardCites(begin=6000000, end=6500000, number=100):
    # Randomly pick *number* patent numbers in the range (with replacement)
    patentNumbers = []
    for _ in range(number):
        patentNumbers.append(random.randrange(begin, end))
    FCites = {}
    TestResults = {}
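    # FCites is a histogram mapping a forward-cite count to how often it
    # occurred; TestResults maps each sampled patent number to its count.
    # Because sampling is with replacement, a patent drawn twice simply
    # overwrites its own entry in TestResults.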
    # Get forward cites for each patent
    for patent in patentNumbers:
        # Let's not flood the patent office
        time.sleep(5)
        # Create the search address with the correct search options
        search_address = "http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=0&p=1&f=S&l=50"
        options = "&Query=ref%%2F%s&d=PTXT" % (patent)
        the_add = search_address + options
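        # The encoded query decodes to "ref/<patent>", an advanced search
        # for patents whose references cite this number; d=PTXT selects
        # the issued-patent full-text database.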
        # Query USPTO search
        print the_add
        h = urllib2.HTTPHandler(debuglevel=1)
        opener = urllib2.build_opener(h)
        request = urllib2.Request(the_add)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6')
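        # A desktop-browser User-Agent is spoofed above, presumably because
        # PATFT rejected requests from the default urllib2 agent.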
        urllib2.install_opener(opener)
        try:
            feeddata = opener.open(request).read()
            #print feeddata
        except urllib2.HTTPError, e:
            print e.code
            continue
        except urllib2.URLError, e:
            print e.args
            continue
        except httplib.BadStatusLine, e:
            print "Bad Status line for %s" % (str(patent))
            continue
        # Extract the forward-cite count from the results page
        forwardCites = 0
        searchstring = r"REF/%s</B>: (\d+) patents\." % patent
        Number = re.findall(searchstring, feeddata)
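        # When the hit-count line is absent, PATFT has presumably jumped
        # straight to the single matching document, so the count is taken
        # as 1. (A zero-hit page would also lack this line, so 0 and 1
        # are conflated here.)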
        if len(Number) == 0:
            forwardCites = 1
        else:
            forwardCites = int(Number[0])
        print "forward cites for " + str(patent) + ": " + str(forwardCites)
        FCites[forwardCites] = FCites.get(forwardCites, 0) + 1
        TestResults[patent] = forwardCites
    # Save individual data points to file
    out1 = "TestResultsJuly20.txt"
    outf = open(out1, 'w')
    for patent_num, cites in TestResults.items():
        line = "%s, %s \n" % (patent_num, cites)
        outf.write(line)
    outf.close()
    # Save the histogram of forward-cite counts, sorted by count
    thetime = time.strftime("%Y%m%d_%H%M%S")
    outputfile = "ForwardCites_" + thetime + ".txt"
    outfile = open(outputfile, 'w')
    results = FCites.items()
    results.sort()
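    # Each histogram row is written as ['count', frequency], which looks
    # intended for pasting straight into a JavaScript array literal
    # (e.g. for a charting library).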
    for result in results:
        line = "['%s', %s],\n" % (result[0], result[1])
        outfile.write(line)
    outfile.close()
if __name__ == "__main__":
    if len(sys.argv) > 3:
        CountForwardCites(int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3]))
    else:
        CountForwardCites()
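Usage sketch (assuming Python 2.x; the filename count_forward_cites.py is hypothetical):

python count_forward_cites.py 6000000 6500000 50

Run with no arguments to use the defaults (100 samples between patents 6,000,000 and 6,500,000). Each run writes the raw patent/count pairs to TestResultsJuly20.txt and a sorted histogram of counts to a timestamped ForwardCites_*.txt file.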