Skip to content

Instantly share code, notes, and snippets.

@barmstrong
Created October 28, 2011 23:50
Show Gist options
  • Select an option

  • Save barmstrong/1323865 to your computer and use it in GitHub Desktop.

Select an option

Save barmstrong/1323865 to your computer and use it in GitHub Desktop.
processing large csv files in ruby
class ZendeskTicketsJob
extend Resque::Plugins::ExponentialBackoff
@queue = :low
FIELDS = ['zendesk_id', 'requester_id', 'assignee_id', 'group', 'subject', 'tags', 'status', 'priority', 'via', 'ticket_type', 'created_at', 'assigned_at', 'solved_at', 'resolution_time', 'satisfaction', 'group_stations', 'assignee_stations', 'reopens', 'replies', 'first_reply_time_in_minutes', 'first_reply_time_in_minutes_within_business_hours', 'first_resolution_time_in_minutes', 'first_resolution_time_in_minutes_within_business_hours', 'full_resolution_time_in_minutes', 'full_resolution_time_in_minutes_within_business_hours', 'agent_wait_time_in_minutes', 'agent_wait_time_in_minutes_within_business_hours', 'requester_wait_time_in_minutes', 'requester_wait_time_in_minutes_within_business_hours', 'reservation_code', 'requires_manual_closing']
def self.perform(url)
`rm /tmp/zendesk_tickets*`
`wget #{url} -O /tmp/zendesk_tickets.csv.zip`
`unzip -p /tmp/zendesk_tickets.csv.zip > /tmp/zendesk_tickets.csv`
# IO.foreach doesn't read the entire file into memory at once, which is good since a standard FasterCSV.parse on this file can take an hour or more
lines = []
IO.foreach('/tmp/zendesk_tickets.csv') do |line|
lines << line
if lines.size >= 1000
lines = FasterCSV.parse(lines.join) rescue next
store lines
lines = []
end
end
store lines
end
def self.store lines
return if lines.blank?
puts "Storing #{lines.size} rows..."
connection.execute("
INSERT INTO zendesk_tickets (#{fields_to_sql FIELDS})
VALUES #{data_to_sql lines}
ON DUPLICATE KEY
UPDATE #{fields_to_update_sql FIELDS[1..-1]}
")
end
def self.fields_to_sql fields
fields.collect{|f| connection.quote_column_name(f) }.join(',')
end
def self.data_to_sql lines
data = lines.collect do |v|
# ... some manipulation of the data into our own formats
v
end.compact
data.collect do |row|
"(#{row.collect{|f| connection.quote(f)}.join(',')})"
end.join(', ')
end
def self.connection
@connection ||= ZendeskTicket.new.connection
end
def self.fields_to_update_sql fields
fields.collect do |f|
qcn = connection.quote_column_name(f)
"#{qcn} = VALUES(#{qcn})"
end.join(', ')
end
end
@sgringwe
Copy link
Copy Markdown

What kind of performance did this yield for you?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment