A generalized script to gather data from DOE's College Scorecard site using JSON
'''
A generalized script to gather data from
DOE's College Scorecard site using JSON
USAGE
python collegescorecard_v2.py 'https://api.data.gov/ed/collegescorecard/v1/schools.json?school.ownership=2&fields=id,school.name,2014.student.share_firstgeneration&page=0&per_page=100&api_key=MY_API_KEY_HIDDEN'
'''
#import libraries
#the time library allows us to pause for a second
from time import sleep
#these libraries expand the functionality
#of your existing code
import csv, json, urllib2, math, argparse
#main scraper function
#accepts a College Scorecard API query URL as a parameter
def scrape_function(query):
    #find where the field list starts in the query string
    start = query.index('fields=') + len('fields=')
    #find where the field list ends in the query string
    end = query.index('&',start)
    #store the fields from the query to a header variable
    header = query[start:end].split(',')
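    #e.g. for the query in the USAGE note above, header would be
    #['id', 'school.name', '2014.student.share_firstgeneration']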
    #open a csv file and define the headers
    with open('scorecard-data.csv','wb') as f:
        #whenever we want to write to our csv,
        #we'll call 'writer' and tell it what to do
        writer = csv.writer(f)
        #write the header to the first row in our csv
        writer.writerow(header)
        #report back to us if it's successful
        print 'New file created...'
        #load initial json_data
        #report that we're loading the data
        print 'Loading JSON data...'
        #use urllib2 library to open the url
        #and store the contents into a file
        json_file = urllib2.urlopen(query).read()
        #use json library to parse the file
        #into a structured format readable in python
        json_data = json.loads(json_file)
        #get the number of entries in the data
        #which we can think of as rows in a spreadsheet
        total_rows = json_data['metadata']['total']
        per_page = json_data['metadata']['per_page']
        current_page = json_data['metadata']['page']
        #calculate the number of pages we'll need
        #to go through to capture all the data
        last_page = int(math.ceil(total_rows/float(per_page))) - 1
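        #e.g. 1,250 total rows at 100 per page means ceil(1250/100.0) = 13 pages,
        #numbered 0 through 12, so last_page = 12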
        print 'Metadata captured ...'
        #loop through the pages and write the data to your csv
        while current_page <= last_page:
            #report that we're loading the data (again)
            print 'Loading JSON data...'
            #use urllib2 library to open the url
            #and store the contents into a file (again)
            json_file = urllib2.urlopen(query).read()
            #use json library to parse the file
            #into a structured format readable in python (again)
            json_data = json.loads(json_file)
            #loop through the results on each page and write each to a row
            for result in json_data['results']:
                #create a blank list called 'row'
                row = []
                #loop through the header list and use it to match
                #each field in the data to the field in the spreadsheet
                for field in header:
                    #add each datapoint in the proper order to our row list
                    row.append(result[field])
                #write the row to our spreadsheet
                writer.writerow(row)
            #report back to us
            print "Page",current_page,"written to CSV..."
            #increment the page counter
            current_page += 1
            #queue up the next page of the query
            query += '&page='+str(current_page)
            #report that we're done with this step
            #and we're taking a short break
            print 'Sleeping...'
            #sleep for 3 seconds, because we want to be
            #sensitive to the computing demands of automated
            #scraping of data (we're so nice!)
            sleep(3)

#this is what executes when you run collegescorecard_v2.py
if __name__ == '__main__':
    #use a built-in parser to accept the query as a command-line argument
    parser = argparse.ArgumentParser(description='Download the JSON data from CollegeScorecard and save as CSV')
    parser.add_argument('query',help='Enter a valid API query')
    args = parser.parse_args()
    #run our main scraping function, feeding in a query
    scrape_function(args.query)
    print '...done'
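For reference, running the script against the example query in the USAGE note (with your own api.data.gov key in place of the hidden placeholder) should produce a scorecard-data.csv whose first row matches the fields parsed from the query string:

id,school.name,2014.student.share_firstgeneration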