-
-
Save jackrg/76ade1724bd816292e4e to your computer and use it in GitHub Desktop.
class ActiveRecord::Base | |
def self.import!(record_list) | |
raise ArgumentError "record_list not an Array of Hashes" unless record_list.is_a?(Array) && record_list.all? {|rec| rec.is_a? Hash } | |
return record_list if record_list.empty? | |
(1..record_list.count).step(1000).each do |start| | |
key_list, value_list = convert_record_list(record_list[start-1..start+999]) | |
sql = "INSERT INTO #{self.table_name} (#{key_list.join(", ")}) VALUES #{value_list.map {|rec| "(#{rec.join(", ")})" }.join(" ,")}" | |
self.connection.insert_sql(sql) | |
end | |
return record_list | |
end | |
def self.convert_record_list(record_list) | |
# Build the list of keys | |
key_list = record_list.map(&:keys).flatten.map(&:to_s).uniq.sort | |
value_list = record_list.map do |rec| | |
list = [] | |
key_list.each {|key| list << ActiveRecord::Base.connection.quote(rec[key] || rec[key.to_sym]) } | |
list | |
end | |
# If table has standard timestamps and they're not in the record list then add them to the record list | |
time = ActiveRecord::Base.connection.quote(Time.now) | |
for field_name in %w(created_at updated_at) | |
if self.column_names.include?(field_name) && !(key_list.include?(field_name)) | |
key_list << field_name | |
value_list.each {|rec| rec << time } | |
end | |
end | |
return [key_list, value_list] | |
end | |
end |
I think this code double-includes some rows. Why start at 1 and then go from start-1
to start+999
? This means your first range is from 0..1000, then 1000..2000, so you include row 1000 twice. How about something more like this: (0...record_list.count).step(1000).each do |start|
and then record_list[start...start+1000]}
You are indeed double-including counting rows...
Try:
(0..record_list.count).step(1000).each do |start| key_list, value_list = convert_record_list(record_list[start..(start+999)])
You are forgetting to use default values if they are set, instead you would just set it to 'NULL'
I added:
default = columns.map { |c| [c.name, c.default] }.to_h
default['created_at'] = Time.now.utc
default['updated_at'] = Time.now.utc
and modified the line :
key_list.each { |key| list << ActiveRecord::Base.connection.quote(rec[key] || rec[key.to_sym] || default[key]) }
I honestly disagree with baking in the step functionality to that method anyways. Why should that method care about doing anything but preparing the bulk insert statement? It shouldn't be throttling the input, the user shouldn't be passing things to it that slow it down, and if they do, they should be adjusting their input size.
The cleaner implementation is to avoid adding that logic into the method in the first place, for all those suggesting he simply fix the resulting logic bug.
@jackrg how might you write this if you wanted to selectively insert columns from your raw data (e.g. in a csv, say you only want column 1, 3, and 4 to save to your ActiveRecord table instead of all columns 1-4)? I know how to do that using CSV.foreach, but it's taking 5 minutes to import 20,000 rows.
This is vulnerable to sql injection - you need to escape values!
http://stackoverflow.com/a/15317838/873650
Thank you for your gist. I'm using AR-SQLServer-adapter and I had to change self.connection.insert_sql(sql) TO self.connection.execute(sql)