Created
March 29, 2013 13:34
-
-
Save thom-nic/5270881 to your computer and use it in GitHub Desktop.
Script to load EnerNOC Open data (see http://open.enernoc.com/data) into a relational database. Note that this resulted in a ~500 MB sqlite file for all 100 sites in the 2012 dataset. Once the data is loaded, you can make standard relational queries over the data, such as: select r.* from readings r join sites s on r.site_id = s.site_id where s.industry = 'Education'
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'sqlite3'
require 'csv'

module ENOC
  ##
  # Functions to load EnerNOC Open Data (http://open.enernoc.com/data)
  # into a sqlite database for relational access. This should be easy
  # to migrate to Postgres or MariaDB if you want a 'real' database.
  #
  module Sql
    # Path of the sqlite file that will be created and populated.
    DB_FILE = 'anon/sql/all_data.db'

    ##
    # Create the +sites+ and +readings+ tables plus their unique indexes.
    # Idempotent: every statement uses `if not exists`, so re-running is safe.
    #
    # NOTE: the original schema declared `tzo varchar int` — a typo. Plain
    # `int` is used here; under SQLite's affinity rules both forms give the
    # column INTEGER affinity, so stored data is unaffected.
    def self.create_schema db
      db.execute_batch "pragma foreign_keys=ON;
        create table if not exists sites (
          site_id int not null,
          industry varchar(25),
          sub_industry varchar(40),
          lat float,
          lng float,
          sq_ft int,
          timezone varchar(20),
          tzo int );
        create unique index if not exists site_site_id on sites( site_id asc );
        create table if not exists readings (
          id integer primary key,
          site_id int not null
            references sites( site_id ) on delete cascade,
          timestamp int not null,
          value int not null,
          estimated bit default 0,
          anomaly_type varchar(20) );
        create unique index if not exists readings_site_id_timestamp
          on readings( site_id, timestamp asc );"
      puts "Created tables"
    end

    ##
    # (Re)load the +sites+ table from the site metadata CSV.
    # Existing rows are deleted first; the whole load runs inside one
    # transaction which is rolled back if any row fails.
    #
    # csv_file:: path to `all_sites.csv` (first row is a header row)
    # db::       an open SQLite3::Database
    def self.import_metadata csv_file, db
      db.transaction
      counter = 0
      db.execute "delete from sites"
      db.prepare(
        "insert into sites (
          site_id,
          industry, sub_industry,
          sq_ft,
          lat, lng,
          timezone, tzo
        ) values (?,?,?,?,?,?,?,?)") { |stmt|
        CSV.open( csv_file, 'r', :headers => true ).each { |row|
          stmt.execute(
            row['SITE_ID'].to_i,
            row['INDUSTRY'],
            row['SUB_INDUSTRY'],
            row['SQ_FT'].to_i,
            row['LAT'].to_f,
            row['LNG'].to_f,
            row['TIME_ZONE'],
            # e.g. '-5:00' -> -5: keep only the signed hour component
            row['TZ_OFFSET'].split(':')[0].to_i )
          counter += 1
        }
      }
      db.commit
      puts "Inserted #{counter} meta rows"
    ensure
      # Roll back a partially-applied load on ANY failure (the exception
      # still propagates). After a successful commit no transaction is
      # active, so this is a no-op. Replaces the old `rescue Exception`.
      db.rollback if db.transaction_active?
    end

    ##
    # Load every per-site readings CSV found in +dir+ into the +readings+
    # table. The site id comes from the file name (e.g. `anon/csv/123.csv`
    # -> site 123). Each file is loaded in its own transaction, so a bad
    # file only rolls back its own rows.
    #
    # dir:: directory containing `<site_id>.csv` files
    # db::  an open SQLite3::Database
    def self.import_from_csv dir, db
      db.execute "delete from readings"
      Dir.glob( "#{dir}/*.csv" ).each { |file|
        puts "Reading #{file}..."
        counter = 0
        db.transaction
        site_id = File.basename( file, '.csv' ).to_i
        # Prepare once per file instead of re-parsing the SQL per row.
        db.prepare(
          "insert into readings (
            site_id,
            timestamp,
            value,
            estimated,
            anomaly_type
          ) values (?,?,?,?,?)") { |stmt|
          CSV.open( file, 'r', :headers => true ).each { |row|
            anomaly = row['anomaly']
            # Blank CSV fields parse as nil; the original called
            # anomaly.size and raised NoMethodError on nil. Store NULL
            # for both missing and empty anomaly values.
            anomaly = nil if anomaly.nil? || anomaly.empty?
            stmt.execute(
              site_id,
              row['timestamp'].to_i,
              row['value'].to_f,
              row['estimated'].to_i,
              anomaly )
            counter += 1
          }
        }
        db.commit
        puts "wrote #{counter} rows"
      }
    ensure
      # Same rollback-on-failure contract as import_metadata.
      db.rollback if db.transaction_active?
    end

    ##
    # Entry point: create the schema, then load the site metadata and all
    # per-site readings from the `anon/` data directory.
    def self.main
      db = SQLite3::Database.new DB_FILE
      create_schema db
      import_metadata 'anon/meta/all_sites.csv', db
      import_from_csv 'anon/csv', db
    ensure
      # Close even when an import raises (the original leaked the handle
      # on error).
      db.close if db
    end
  end
end

ENOC::Sql.main if __FILE__ == $0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Yay Ruby!!