Created
May 30, 2013 15:07
-
-
Save jczaplew/5678587 to your computer and use it in GitHub Desktop.
EIA Nuclear Powerplant data scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Script to scrape all nuclear powerplant data from the EIA database
## and create a CSV.
## Also requires an EIA API key, which you can register for here: http://www.eia.gov/beta/api/register.cfm
## John J Czaplewski | [email protected] | May, 2013
import urllib2
import json
import csv

# Open the output CSV and write the header row.
# NOTE(review): the file handle is never closed explicitly; for a one-shot
# script the OS closes it at exit, but a `with` block would be safer if this
# were ever imported as a module.
csvWriter = csv.writer(open('results.csv', 'wb+'))
# Columns: station metadata first, then one annual-net-generation column per
# year, newest (2012) to oldest (2001) — same order the EIA API returns data.
csvWriter.writerow(['name', 'fuel', 'generator', 'series_id', 'lat', 'lon', 'source'] +
                   ['data_%d' % year for year in range(2012, 2000, -1)])
# First query: fetch the EIA category that lists every available powerplant
# (category_id=1018). Each child category is one powerplant.
# SECURITY NOTE(review): the API key is hard-coded and committed to source;
# consider reading it from an environment variable instead.
request1 = urllib2.urlopen('http://api.eia.gov/category/?api_key=BA406DCCD54844844DC10C573C176695&category_id=1018')
powerplants = json.load(request1)

# Collect the category_id of every powerplant for the per-plant queries below.
powerplantIds = [child["category_id"]
                 for child in powerplants["category"]["childcategories"]]
# Start looping through the list of powerplants that was just created... | |
j = 0 | |
while (j < len(powerplantIds)): | |
#Start navigating down the JSON...for each powerplant, find all available categories of data | |
request2 = urllib2.urlopen('http://api.eia.gov/category/?api_key=BA406DCCD54844844DC10C573C176695&category_id=' + str(powerplantIds[j])) | |
powerplantNetGen = json.load(request2) | |
x = 0 | |
#Loop through all available categories for each powerplant, and grab only the ones that are annual data for a specific type of fuel source | |
keepers = [] | |
while (x < len(powerplantNetGen["category"]["childseries"])): | |
name = powerplantNetGen["category"]["childseries"][x]["name"] | |
name = name.split(" : ") | |
if name[2] == "Nuclear" and powerplantNetGen["category"]["childseries"][x]["f"] == "A" : | |
keepers.append(powerplantNetGen["category"]["childseries"][x]["series_id"]) | |
x += 1 | |
#Now that we know all the fuel types for a powerplant, we're going to request that type's most specific data | |
y = 0 | |
while (y < len(keepers)) : | |
request3 = urllib2.urlopen('http://api.eia.gov/series/?series_id=' + keepers[y] + '&api_key=BA406DCCD54844844DC10C573C176695') | |
stationData = json.load(request3) | |
name = stationData["series"][0]["name"] | |
nameSanitized = name.replace("'", "") | |
nameSanitized = nameSanitized.split(" : ") | |
#We have the data, just have to clean it up before inserting into the database | |
years = ["2012", "2011", "2010", "2009", "2008", "2007", "2006", "2005", "2004", "2003", "2002", "2001"] | |
yearData = [] | |
z = 0 | |
while z < len(years): | |
try: | |
if stationData["series"][0]["data"][z][0] == years[z] : | |
yearData.append(stationData["series"][0]["data"][z][1]) | |
else : | |
yearData.append("0") | |
except IndexError: | |
yearData.append("0") | |
z += 1 | |
# Finally write a new row to the CSV | |
csvWriter.writerow([nameSanitized[1], nameSanitized[2], nameSanitized[3], stationData["series"][0]["series_id"], stationData["series"][0]["lat"], stationData["series"][0]["lon"], stationData["series"][0]["source"], yearData[0], yearData[1], yearData[2], yearData[3], yearData[4], yearData[5], yearData[6], yearData[7], yearData[8], yearData[9], yearData[10], yearData[11]]) | |
print "Inserted a row" | |
y += 1 | |
#Now do it again, and again | |
print "Done with " + str(j) + " of " + str(len(powerplantIds)) | |
j += 1 | |
print "Finished!" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment