A generalized script to gather data from DOE's College Scorecard site using JSON
'''
A generalized script to gather data from
DOE's College Scorecard site using JSON
USAGE
python collegescorecard_v2.py 'https://api.data.gov/ed/collegescorecard/v1/schools.json?school.ownership=2&fields=id,school.name,2014.student.share_firstgeneration&page=0&per_page=100&api_key=MY_API_KEY_HIDDEN'
'''
#import libraries
#the time library allows us to pause for a second
from time import sleep
#these libraries expand the functionality
#of your existing code
import csv, json, urllib2, math, argparse
#main scraper function
#accepts a College Scorecard API query URL as a parameter
def scrape_function(query):
    #find where the field list starts in the query string
    start = query.index('fields=') + len('fields=')
    #find where the field list ends in the query string
    end = query.index('&',start)
    #store the fields from the query to a header variable
    header = query[start:end].split(',')
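    #e.g. for the query in the USAGE note above, header would be
    #['id', 'school.name', '2014.student.share_firstgeneration']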
    #open a csv file and define the headers
    with open('scorecard-data.csv','wb') as f:
        #whenever we want to write to our csv,
        #we'll call 'writer' and tell it what to do
        writer = csv.writer(f)
        #write the header to the first row in our csv
        writer.writerow(header)
        #report back to us if it's successful
        print 'New file created...'
        #load initial json_data
        #report that we're loading the data
        print 'Loading JSON data...'
        #use urllib2 library to open the url
        #and store the contents into a file
        json_file = urllib2.urlopen(query).read()
        #use json library to parse the file
        #into a structured format readable in python
        json_data = json.loads(json_file)
        #get the number of entries in the data
        #which we can think of as rows in a spreadsheet
        total_rows = json_data['metadata']['total']
        per_page = json_data['metadata']['per_page']
        current_page = json_data['metadata']['page']
        #calculate the number of pages we'll need
        #to go through to capture all the data
        last_page = int(math.ceil(total_rows/float(per_page))) - 1
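        #e.g. 1,250 total rows at 100 per page means ceil(1250/100.0) = 13 pages,
        #numbered 0 through 12, so last_page = 12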
        print 'Metadata captured ...'
        #loop through the pages and write the data to your csv
        while current_page <= last_page:
            #report that we're loading the data (again)
            print 'Loading JSON data...'
            #use urllib2 library to open the url
            #and store the contents into a file (again)
            json_file = urllib2.urlopen(query).read()
            #use json library to parse the file
            #into a structured format readable in python (again)
            json_data = json.loads(json_file)
            #loop through the results on each page and write each to a row
            for result in json_data['results']:
                #create a blank list called 'row'
                row = []
                #loop through the header list and use it to match
                #each field in the data to the field in the spreadsheet
                for field in header:
                    #add each datapoint in the proper order to our row list
                    row.append(result[field])
                #write the row to our spreadsheet
                writer.writerow(row)
            #report back to us
            print "Page",current_page,"written to CSV..."
            #increment the page counter
            current_page += 1
            #queue up the next page of the query
            query += '&page='+str(current_page)
            #report that we're done with this step
            #and we're taking a short break
            print 'Sleeping...'
            #sleep for 3 seconds, because we want to be
            #sensitive to the computing demands of automated
            #scraping of data (we're so nice!)
            sleep(3)

#this is what executes when you run collegescorecard_v2.py
if __name__ == '__main__':
    #use a built-in parser to accept the query as a command-line argument
    parser = argparse.ArgumentParser(description='Download the JSON data from CollegeScorecard and save as CSV')
    parser.add_argument('query',help='Enter a valid API query')
    args = parser.parse_args()
    #run our main scraping function, feeding in a query
    scrape_function(args.query)
    print '...done'
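For reference, running the script against the example query in the USAGE note (with your own api.data.gov key in place of the hidden placeholder) should produce a scorecard-data.csv whose first row matches the fields parsed from the query string:

id,school.name,2014.student.share_firstgeneration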