-
-
Save leomao10/2021915 to your computer and use it in GitHub Desktop.
processing large csv files in ruby
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ZendeskTicketsJob | |
extend Resque::Plugins::ExponentialBackoff | |
@queue = :low | |
FIELDS = ['zendesk_id', 'requester_id', 'assignee_id', 'group', 'subject', 'tags', 'status', 'priority', 'via', 'ticket_type', 'created_at', 'assigned_at', 'solved_at', 'resolution_time', 'satisfaction', 'group_stations', 'assignee_stations', 'reopens', 'replies', 'first_reply_time_in_minutes', 'first_reply_time_in_minutes_within_business_hours', 'first_resolution_time_in_minutes', 'first_resolution_time_in_minutes_within_business_hours', 'full_resolution_time_in_minutes', 'full_resolution_time_in_minutes_within_business_hours', 'agent_wait_time_in_minutes', 'agent_wait_time_in_minutes_within_business_hours', 'requester_wait_time_in_minutes', 'requester_wait_time_in_minutes_within_business_hours', 'reservation_code', 'requires_manual_closing'] | |
def self.perform(url) | |
`rm /tmp/zendesk_tickets*` | |
`wget #{url} -O /tmp/zendesk_tickets.csv.zip` | |
`unzip -p /tmp/zendesk_tickets.csv.zip > /tmp/zendesk_tickets.csv` | |
# IO.foreach doesn't read the entire file into memory at once, which is good since a standard FasterCSV.parse on this file can take an hour or more | |
lines = [] | |
IO.foreach('/tmp/zendesk_tickets.csv') do |line| | |
lines << line | |
if lines.size >= 1000 | |
lines = FasterCSV.parse(lines.join) rescue next | |
store lines | |
lines = [] | |
end | |
end | |
store lines | |
end | |
def self.store lines | |
return if lines.blank? | |
puts "Storing #{lines.size} rows..." | |
connection.execute(" | |
INSERT INTO zendesk_tickets (#{fields_to_sql FIELDS}) | |
VALUES #{data_to_sql lines} | |
ON DUPLICATE KEY | |
UPDATE #{fields_to_update_sql FIELDS[1..-1]} | |
") | |
end | |
def self.fields_to_sql fields | |
fields.collect{|f| connection.quote_column_name(f) }.join(',') | |
end | |
def self.data_to_sql lines | |
data = lines.collect do |v| | |
# ... some manipulation of the data into our own formats | |
v | |
end.compact | |
data.collect do |row| | |
"(#{row.collect{|f| connection.quote(f)}.join(',')})" | |
end.join(', ') | |
end | |
def self.connection | |
@connection ||= ZendeskTicket.new.connection | |
end | |
def self.fields_to_update_sql fields | |
fields.collect do |f| | |
qcn = connection.quote_column_name(f) | |
"#{qcn} = VALUES(#{qcn})" | |
end.join(', ') | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment