Created
February 5, 2012 16:28
-
-
Save yannski/1746376 to your computer and use it in GitHub Desktop.
Simple ETL system written with rake
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# procedure : | |
# in development : rake db:restore etl:extract:all db:drop db:create db:migrate etl:load:all | |
# in production : rake db:backup etl:extract:all db:drop db:create db:migrate etl:load:all | |
namespace :etl do | |
namespace :extract do | |
task :all => :environment do | |
FileUtils.rm_rf 'db/extracted_data/' | |
FileUtils.mkdir_p 'db/extracted_data/' | |
Rake::Task['etl:extract:cities'].invoke | |
Rake::Task['etl:extract:addresses'].invoke | |
Rake::Task['etl:extract:users'].invoke | |
Rake::Task['etl:extract:products'].invoke | |
Rake::Task['etl:extract:enterprises'].invoke | |
# etc etc etc | |
end | |
task :cities => :environment do | |
sql = "SELECT * FROM cities" | |
ary = ActiveRecord::Base.connection.execute(sql) | |
FasterCSV.open("db/extracted_data/cities.csv", "w") do |csv| | |
ary.each{|row| | |
location_id = row[0] | |
# do your own thing so the variable row is filled with the corrected data | |
csv << row.cleanup! | |
} | |
end | |
end | |
end | |
namespace :load do | |
task :all => :environment do | |
Dir.glob("db/extracted_data/*csv").each{|filename| | |
table_name = FileUtils.basename filename, ".csv" | |
filename = Dir.pwd + "/" + filename | |
sql = "LOAD DATA LOCAL INFILE '#{filename}' INTO TABLE #{table_name} CHARACTER SET 'utf8' FIELDS TERMINATED BY ',' ENCLOSED BY '\"'" | |
begin | |
ary = ActiveRecord::Base.connection.execute(sql) | |
rescue | |
ActiveRecord::Base.connection.reconnect! | |
retry | |
end | |
} | |
end | |
end | |
end | |
class Time | |
def to_mysql | |
strftime("%Y-%m-%d %H:%M:%S %z") | |
end | |
end | |
def untouched new_table_name, old_table_name = nil | |
old_table_name ||= new_table_name | |
sql = "SELECT * FROM #{old_table_name}" | |
ary = ActiveRecord::Base.connection.execute(sql) | |
FasterCSV.open("db/extracted_data/#{new_table_name}.csv", "w") do |csv| | |
ary.each{|row| | |
csv << row.cleanup! | |
} | |
end | |
end | |
class Array | |
def to_h(&block) | |
Hash[*self.collect { |v| | |
[v, block.call(v)] | |
}.flatten] | |
end | |
def cleanup | |
self.map{|x| | |
if x.nil? | |
"\\N" | |
elsif x.is_a?(Time) | |
x.to_mysql | |
else | |
x | |
end | |
} | |
end | |
def cleanup! | |
self.map!{|x| | |
if x.nil? | |
"\\N" | |
elsif x.is_a?(Time) | |
x.to_mysql | |
elsif x.is_a?(TrueClass) | |
1 | |
elsif x.is_a?(FalseClass) | |
0 | |
else | |
x | |
end | |
} | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment