Requirements

You'll need to download a copy of the script.

You can download it here: https://gist.github.com/Taiiwo/1cdf439c0cf46fe8add6/archive/044e6ac059ce37afbc132ef05eb3f6549ebe8381.zip

Instructions

You're going to need a CSV file containing just the first and last names, so make a copy of the spreadsheet, delete all of the other columns (leaving first name and last name as the first two columns), and save the file as a .csv (under "Download as" in Google Sheets).
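For example, the trimmed-down file should end up looking something like this (the names here are made up; the script only reads the first two comma-separated fields of each line):

Jane,Doe
John,Smith
Maria,Garcia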

Unzip the download, then put the CSV and the script into your home folder. To locate the home folder on a Mac, open Finder, press Command+Shift+G, then type ~/.

Run the script (double-clicking it should work if Python is set up to open .py files; otherwise run it from Terminal as shown below). A black box (terminal window) should open up saying "Enter the filename of the input file:". Type in the filename of the input file and leave it to finish. Once it's done, it will save a CSV file called "output_file.csv" in the same directory as everything else. You can open this file in Google Sheets and copy the columns across.
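If you'd rather run it from Terminal, the script also accepts the input and output filenames as command-line arguments ("the_script.py" below is just a stand-in for whatever the downloaded script file is called):

python the_script.py input.csv output_file.csv

Each line of output_file.csv is the first name, the last name, the number of matching open studies, and the number of matching closed studies, for example (made-up counts):

Jane, Doe, 3, 1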

I tried to make the program run as fast as I could, as I knew it was going to be used for around 100 thousand records (which means about 200,000 searches, since each name is looked up once for open studies and once for closed studies). I capped the number of simultaneous threads (the threads variable near the bottom of the script, set to 10 in the version below); any more than that and I feared you'd get kicked off the API, but if you really need to kick it up a notch you can change that number to whatever you want (maybe 50?).
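If you want to sanity-check a single name before starting the full run, the snippet below does one lookup using the same URL template and XML parsing as the script. The name is just a placeholder, and it assumes the clinicaltrials.gov XML search endpoint still responds the way it did when the script was written:

import urllib2
import xml.etree.ElementTree as xml

base_uri = "https://clinicaltrials.gov/search?term=%%28%%22Principal+Investigator%%22+OR+Contact+OR+Investigator+OR+%%22Study+Chair%%22+OR+%%22Study+Director%%22%%29+%%22%s+%s%%22&recr=%s&displayxml=true"

# "Jane" and "Doe" are placeholders -- substitute a real investigator name.
uri = (base_uri % ("Jane", "Doe", "Open")).replace(' ', '+')
data = urllib2.urlopen(uri).read()
print "Open studies:", xml.fromstring(data).get('count')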

import sys
import xml.etree.ElementTree as xml
import urllib2
import time
import thread

# Filenames can be passed on the command line; otherwise ask for the input
# file and default the output to output_file.csv in the current directory.
if len(sys.argv) < 3:
    input_file = raw_input("Enter the filename of the input file: ")
    output_file = "output_file.csv"
else:
    input_file = sys.argv[1]
    output_file = sys.argv[2]

input_file = open(input_file, "r").read()
input_file = input_file.splitlines()

# Read any previous output so names that are already done can be skipped,
# then open the output for appending so a resumed run keeps the old results.
try:
    output_r = open(output_file, "r").read()
except IOError:
    output_r = ""
output_w = open(output_file, "a")

input_length = len(input_file)
out = ""


def threadf(cur):
    # Look up one name (row `cur` of the input) and write its counts.
    global out, input_file, nthreads
    nthreads += 1
    line = input_file[cur].split(",")
    fname = line[0].strip()
    lname = line[1].strip()
    # Skip names that already appear in a previous output file.
    if fname + ', ' + lname in output_r:
        nthreads -= 1
        return False
    # Search term: (Principal Investigator OR Contact OR Investigator OR
    # Study Chair OR Study Director) "First Last", queried once for open and
    # once for closed studies, with the results returned as XML.
    base_uri = "https://clinicaltrials.gov/search?term=%%28%%22Principal+Investigator%%22+OR+Contact+OR+Investigator+OR+%%22Study+Chair%%22+OR+%%22Study+Director%%22%%29+%%22%s+%s%%22&recr=%s&displayxml=true"
    uri1 = base_uri % (fname, lname, "Open")
    uri2 = base_uri % (fname, lname, "Closed")
    ret = [fname, lname]
    for uri in (uri1, uri2):
        uri = uri.replace(' ', '+')
        tries = 0
        while tries < 5:
            try:
                data = urllib2.urlopen(uri).read()
                break
            except urllib2.URLError:
                # Retry up to five times, a second apart, before giving up.
                tries += 1
                print "http error for %s:%s #%s" % (fname, lname, str(tries))
                time.sleep(1)
        if tries >= 5:
            nthreads -= 1
            return False
        # The number of matching studies is the count attribute of <search_results>.
        data = xml.fromstring(data)
        count = data.get('count')
        ret.append(count)
    output_w.write((', ').join(ret) + '\n')
    output_w.flush()
    out += (', ').join(ret) + '\n'
    # Progress line: "First, Last, open_count, closed_count,(done/total)"
    print (', ').join(ret) + ',(%d/%d)' % (out.count('\n'), input_length)
    nthreads -= 1


threads = 10   # maximum number of simultaneous lookups
nthreads = 0
cur = 0
while cur < len(input_file):
    if nthreads < threads:
        thread.start_new_thread(threadf, (cur,))
        cur += 1
    # Pause between launches so the API isn't hammered, and so this loop
    # doesn't spin while all the worker threads are busy.
    time.sleep(0.3)

# Keep the main thread alive so the workers can finish; stop it with Ctrl+C
# once the progress counter reaches the end.
while 1:
    time.sleep(5)