Last active
August 21, 2018 11:51
-
-
Save henrikj242/d3c423afc964a08a05526d60bb4facd5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## ------------------------------------------------------------------------------------- | |
# This script parses a csv file, groups customers (whom may occur more than once), based on their email. | |
# It attempts to include the most meaningful stem data (first name, last name etc) for each customer | |
# - This means taking the last value after sorting so to not use empty values, but it the same customer | |
# exists more than once with actual values in his stem data, it may become invalid. | |
# It generates a row for each customer, with their purchases listed in the same way as the input file: | |
# - All skus in one column, separated by a comma, | |
# - All order dates in one column, separated by a comma | |
# ... etc | |
# Certain columns (7-13) are skipped | |
# FirstName,LastName,Email,CountryCode,ShippingAddress.postal_code,ShippingAddress.city,BoughtItems, | |
# skip elements at index 7 - 13 | |
# RegisteredItems, RegisteredItems.Sku, RegisteredItems.item_name, RegisteredItems.PurchaseDate, RegisteredItems.SerialNumber, | |
# RegisteredItems.RegisteredDate, RegisteredItems.BoughtFrom, | |
# BoughtItems.sku,BoughtItems.OrderId,BoughtItems.tax_amount,BoughtItems.name,BoughtItems.CurrencyCode,BoughtItems.OrderDate,BoughtItems.price, | |
# BoughtItems.ShippingDate,BoughtItems.quantity,BoughtItems.UsedVoucher | |
# group purchases by lower case email | |
# output conflicting rows in a separate file | |
require 'csv' | |
class Parser | |
def initialize | |
@debug = true | |
@source = 'in.csv' | |
# @customers = {} | |
@customers = { | |
'[email protected]' => { | |
first_name: 'John', | |
last_name: 'Doe', | |
email: '[email protected]', | |
country_code: 'US', | |
postal_code: '1234', | |
city: 'Los Angeles', | |
bought_items: '-', | |
items: [ | |
{ | |
sku: 'abcd-1234', | |
order_id: '12345678', | |
tax_amount: '100', | |
product_name: 'Beoplay H2', | |
currency_code: 'US', | |
order_date: 'Jan 1 1907', | |
price: '200', | |
shipping_date: 'Jan 3 1907', | |
quantity: '42', | |
used_voucher: 'x' | |
} | |
] | |
} | |
} | |
end | |
def customer_values(first_name, last_name, email, country_code, postal_code, city, bought_items) | |
existing = @customers[email&.downcase] | |
if existing.nil? | |
{ | |
first_name: first_name, | |
last_name: last_name, | |
email: email, | |
country_code: country_code, | |
postal_code: postal_code, | |
city: city, | |
bought_items: bought_items, | |
items: [] | |
} | |
else | |
{ | |
first_name: [first_name, existing[:first_name]].sort.last, | |
last_name: [last_name, existing[:last_name]].sort.last, | |
email: [email, existing[:email]].sort.last, | |
country_code: [country_code, existing[:country_code]].sort.last, | |
postal_code: [postal_code, existing[:postal_code]].sort.last, | |
city: [city, existing[:city]].sort.last, | |
bought_items: [bought_items, existing[:bought_items]].sort.last, | |
items: existing[:items] | |
} | |
end | |
end | |
def parse | |
i = 0 | |
CSV.foreach(@source, quote_char: '"', col_sep: ',') do |row| | |
i += 1 | |
# next if i < 185500 | |
row.slice!(7,7) | |
first_name, last_name, email, country_code, postal_code, city, bought_items, | |
skus, order_ids, tax_amounts, product_names, currency_codes, | |
order_dates, prices, shipping_dates, quantities, vouchers = row.map(&:to_s) | |
customer = {email => customer_values(first_name, last_name, email, country_code, postal_code, city, bought_items)} | |
if skus | |
order_ids = order_ids.split(',') | |
tax_amounts = tax_amounts.split(',') | |
product_names = product_names.split(',') | |
currency_codes = currency_codes.split(',') | |
order_date = order_dates.split(',') | |
prices = prices.split(',') | |
shipping_dates = shipping_dates.split(',') | |
quantity = quantities.split(',') | |
voucher = vouchers.split(',') | |
skus.split(',').each_with_index do |sku, index| | |
customer[email][:items] << { | |
sku: sku, | |
order_id: (order_ids[index] if index < order_ids.size).to_s, | |
tax_amount: (tax_amounts[index] if index < tax_amounts.size).to_s, | |
product_name: (product_names[index] if index < product_names.size).to_s, | |
currency_code: (currency_codes[index] if index < currency_codes.size).to_s, | |
order_date: (order_dates[index] if index < order_dates.size).to_s, | |
price: (prices[index] if index < prices.size).to_s, | |
shipping_date: (shipping_dates[index] if index < shipping_dates.size).to_s, | |
quantity: (quantities[index] if index < quantities.size).to_s, | |
voucher: (vouchers[index] if index < vouchers.size).to_s | |
} | |
end | |
end | |
puts "DEBUG line #{i}; customer: #{customer}" if @debug | |
@customers[email.downcase] = customer[email] | |
break if i > 1000000 # first 150000 in less than 5 minutes, 10:25 | |
end | |
end | |
def customers | |
@customers | |
end | |
end | |
def items_to_csv(items) | |
skus = [] | |
order_id = [] | |
tax_amount = [] | |
product_name = [] | |
currency_code = [] | |
order_date = [] | |
price = [] | |
shipping_date = [] | |
quantity = [] | |
voucher = [] | |
if (items.is_a?(Array) && items.size > 0) | |
items.each do |item| | |
skus << item[:sku] | |
order_id << item[:order_id] | |
tax_amount << item[:tax_amount] | |
product_name << item[:product_name] | |
currency_code << item[:currency_code] | |
order_date << item[:order_date] | |
price << item[:price] | |
shipping_date << item[:shipping_date] | |
quantity << item[:quantity] | |
voucher << item[:voucher] | |
end | |
end | |
'"' + skus.join(',') + '",' + | |
'"' + order_id.join(',') + '",' + | |
'"' + tax_amount.join(',') + '",' + | |
'"' + product_name.join(',') + '",' + | |
'"' + currency_code.join(',') + '",' + | |
'"' + order_date.join(',') + '",' + | |
'"' + price.join(',') + '",' + | |
'"' + shipping_date.join(',') + '",' + | |
'"' + quantity.join(',') + '",' + | |
'"' + voucher.join(',') + '"' | |
end | |
p = Parser.new | |
p.parse | |
File.open('out.csv', 'w') do |f| | |
p.customers.each do |k, v| | |
f.puts [ | |
v[:first_name], v[:last_name], v[:email], v[:country_code], | |
v[:postal_code], v[:city], v[:bought_items] | |
].map{ |e| e.include?(',') ? "\"#{e}\"" : e }.join(',') + ',' + items_to_csv(v[:items]) | |
end | |
end |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script opens a csv file where some field contains several values (sku's). | |
# It generates a new file where each of these values determine a new row, | |
# in which the remaining fields are just copied. | |
# In this specific case, some other fields contain the same number of values, | |
# corresponding with the values in field[4] (purchase dates and order numbers) | |
# The output is formatted with a tab separator and all fields are quoted. | |
require 'csv' | |
def get_item(items, index) | |
items = items.split(',') | |
items[index] | |
end | |
File.open('destination.csv', 'w') do |file| | |
CSV.foreach('source.csv') do |row| | |
lines = [] | |
skus = row[4].split(',').each_with_index do |sku, index| | |
line = [ | |
row[0], row[1], row[2], row[3], sku, get_item(row[5], index), get_item(row[6], index), get_item(row[7], index) | |
] | |
lines << line | |
end | |
lines.each do |line| | |
file.puts line.map{|item| "\"#{item}\""}.join("\t") | |
end | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment