Created August 2, 2012 18:12
csv export and bulk load any rails scope
# Bulk merge a bunch of records
#
# We expect most of the records to import to represent updates,
# so this method tries to be efficient with database queries
# by using a single query to find any existing records matching the key_attributes,
# updating those records, and finally inserting each remaining record.
#
# It DOES NOT use a transaction or lock. It is the caller's responsibility to
# add that if desired.
#
# records_to_merge: an array of objects (often ActiveRecord) representing
# the new records. Each record should respond to record[attribute] with
# the new value.
# key_attributes: an array of attribute names that together uniquely identify
# the records to be merged in, e.g. the candidate key for the join
# attributes_to_update: an array of attribute names to update
#
# Returns an array of the updated or created objects.
# Tip: you can use id_changed? to see if the object was created by the merge
def self.bulk_merge(records_to_merge, key_attributes, attributes_to_update)
  # TODO: this would be a lot faster/safer if we somehow used mysql insert or update,
  # SQL 2003 merge, or other http://en.wikipedia.org/wiki/Upsert models
  return [] if records_to_merge.empty?
  raise ArgumentError.new "key_attributes must be a non-empty array" if key_attributes.empty?
  raise ArgumentError.new "attributes_to_update must be a non-empty array" if attributes_to_update.empty?
  key_attributes.map!(&:to_sym)
  attributes_to_update.map!(&:to_sym)
  # loop over the incoming records building a query to find them in the
  # database as well as a hash lookup table to quickly find them by their key
  conditions = []
  incoming_records_by_key = {}
  records_to_merge.each do |incoming_record|
    conditions << sanitize_sql_for_conditions(Hash[key_attributes.map { |a| [a, incoming_record[a]] }])
    key_value = key_attributes.map { |a| incoming_record[a] }
    incoming_records_by_key[key_value] = incoming_record
  end
  # update all the existing records
  select = [:id] + key_attributes + attributes_to_update
  updates = scoped(:select => select.join(','), :conditions => '(' + conditions.join(")\nOR(") + ')').each do |existing_record|
    key_value = key_attributes.map { |a| existing_record[a] }
    if incoming_record = incoming_records_by_key.delete(key_value)
      attributes_to_update.each do |a|
        existing_record[a] = incoming_record[a]
      end
      existing_record.save
    end
  end
  # add new ones
  inserts = incoming_records_by_key.values.map do |incoming_record|
    self.create do |new_record|
      (key_attributes + attributes_to_update).each do |a|
        new_record[a] = incoming_record[a]
      end
    end
  end
  updates + inserts
end
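The update/insert split above can be seen in isolation with plain Ruby objects. Below is a minimal, database-free sketch of the merge semantics, using hashes in place of ActiveRecord rows and `select`/`delete` in place of the scoped query; the `existing` and `incoming` arrays and their attributes are hypothetical.

```ruby
existing = [
  { id: 1, email: 'a@example.com', name: 'Alice' },
  { id: 2, email: 'b@example.com', name: 'Bob' }
]
incoming = [
  { email: 'b@example.com', name: 'Bobby' },  # matches an existing row -> update
  { email: 'c@example.com', name: 'Carol' }   # no match -> insert
]
key_attributes = [:email]
attributes_to_update = [:name]

# index incoming records by their key, exactly as bulk_merge does
incoming_by_key = incoming.each_with_object({}) do |rec, h|
  h[key_attributes.map { |a| rec[a] }] = rec
end

# "update" pass: matches are consumed (deleted) from the lookup table
updates = existing.select do |row|
  if rec = incoming_by_key.delete(key_attributes.map { |a| row[a] })
    attributes_to_update.each { |a| row[a] = rec[a] }
    true
  end
end

# "insert" pass: whatever is left in the lookup table is new
inserts = incoming_by_key.values
```

After running, `updates` holds Bob's row with the new name `'Bobby'` and `inserts` holds only Carol; deleting matches from the hash as they are consumed is what lets a single pass partition the incoming records.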
class ActiveRecord::Base
  # Export the current scope using FasterCSV to an IO object.
  # See FasterCSV#new(io, options), #find_each
  #
  # e.g.
  #
  #   scope.export(File.open('dump.csv','w'))
  #   scope.export(File.open('dump.csv','w'), :columns => %w(name address email))
  #
  # In addition to the FasterCSV options, the following options are supported:
  #
  # * :columns - an Array of method names called with #send on each
  #   instance to generate row data
  # * :batch_size - passed to #find_each to control the batch size
  # * :headers - defaults to options[:columns]
  # * :write_headers - defaults to true
  # * :return_headers - defaults to true
  # * :skip_blanks - defaults to true
  def self.export(io, options = {})
    options[:columns] ||= column_names
    columns = options.delete(:columns)
    batch_size = options.delete(:batch_size)
    options[:headers] ||= columns
    # use key? so an explicit false passed by the caller is not clobbered
    options[:write_headers] = true unless options.key?(:write_headers)
    options[:return_headers] = true unless options.key?(:return_headers)
    options[:skip_blanks] = true unless options.key?(:skip_blanks)
    csv = FasterCSV.new(io, options)
    find_each(:batch_size => batch_size) do |instance|
      row = columns.map do |column_method|
        instance.send(column_method)
      end
      yield(row, instance, csv) if block_given?
      csv << row
    end
    return csv
  end
end
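For context, FasterCSV was merged into Ruby's standard library as the CSV class in 1.9, so the same export flow can be sketched without Rails or the gem. The `Person` struct and `people` array below are hypothetical stand-ins for a scope, and the loop mirrors the `find_each` body above.

```ruby
require 'csv'
require 'stringio'

Person = Struct.new(:name, :email)
people = [Person.new('Alice', 'a@example.com'),
          Person.new('Bob',   'b@example.com')]

io      = StringIO.new
columns = %w(name email)
# :write_headers plus :headers makes CSV emit a header row before the first <<
csv     = CSV.new(io, headers: columns, write_headers: true)

people.each do |instance|
  # mirror of the find_each body: one row per instance, built via #send
  csv << columns.map { |column_method| instance.send(column_method) }
end

puts io.string
# name,email
# Alice,a@example.com
# Bob,b@example.com
```

Writing through an in-memory StringIO like this is also a convenient way to unit-test the column list before pointing the export at a real file.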