Last active
December 30, 2015 07:49
-
-
Save KernelPanicAUS/7798112 to your computer and use it in GitHub Desktop.
CSV parser in Ruby, spits out CSV record count within a time frame (3 month increments)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# encoding: UTF-8 | |
require 'benchmark'
require 'csv'
require 'date'
require 'fileutils'
require 'progress_bar'
# Reopens the standard-library CSV class to add an opt-in reader variant that
# updates a console progress bar while rows are being iterated.
class CSV
  # Mixin that wraps CSV#each so that, after every row, a progress bar is set
  # to the current read position of the underlying IO (@io).
  module ProgressBar
    # Builds the bar from the size of the underlying IO plus the meters to show.
    # The leading `::` is required: inside CSV the bare name `ProgressBar`
    # resolves to this module, not to the progress_bar gem's class.
    def progress_bar
      ::ProgressBar.new(@io.size, :bar, :percentage, :elapsed, :eta)
    end

    # Yields each row exactly like CSV#each, refreshing the bar after each one.
    # Without a block it returns an Enumerator (as CSV#each does) instead of
    # failing with LocalJumpError on the first row.
    def each
      return to_enum(__method__) unless block_given?

      bar = progress_bar
      super do |row|
        yield row
        bar.count = pos
        # Incrementing by zero leaves the count untouched; it appears to be
        # used only to make the bar redraw at the position assigned above.
        bar.increment!(0)
      end
    end
  end

  # CSV reader whose #each reports progress through the ProgressBar mixin.
  class WithProgressBar < CSV
    include ProgressBar
  end

  # Returns the progress-reporting reader class, so callers can write
  # CSV.with_progress_bar.foreach(path, ...) in place of CSV.foreach(path, ...).
  def self.with_progress_bar
    WithProgressBar
  end
end
################
# Initialisers #
################
# Eight consecutive calendar quarters, 2012-01-01 through 2013-12-31, as
# inclusive Date ranges. Date.new(year, month, -1) is the last day of that month.
ranges = [2012, 2013].flat_map do |year|
  [1, 4, 7, 10].map do |first_month|
    Date.new(year, first_month, 1)..Date.new(year, first_month + 2, -1)
  end
end

rule = "##########################################################################################\n"

# Rows are streamed one at a time because the CSV files are quite large.
# For each row, the date part of the 'CreatedDate' column is matched against the
# quarters in `ranges`; per-quarter hit counts are kept in `results` and written
# to the log once the file has been read.
#
# The block form of File.open closes the log even if parsing raises.
File.open("./summary.log", "a+") do |log|
  log.write(rule)
  log.write("#{Time.now}\n")
  log.write(rule)

  # CSV files in the current working directory, sorted so the log order is stable.
  filelist = Dir.entries(".").select { |file| File.extname(file) == '.csv' }.sort

  filelist.each do |spreadsheet|
    puts "Parsing : #{spreadsheet}, File size : #{File.size(spreadsheet)}"
    log.write("Parsing : #{spreadsheet}, File size : #{File.size(spreadsheet)} \n")

    results = Hash.new(0)
    row_total = 0

    # Only the parse is timed. The summary is written after the block returns,
    # because `time` has no value until Benchmark.realtime has finished.
    time = Benchmark.realtime do
      CSV.with_progress_bar.foreach(spreadsheet, headers: true, encoding: 'iso-8859-1:utf-8') do |row|
        row_total += 1

        # Keep only the first whitespace-separated token of the field.
        stamp = row['CreatedDate'].to_s.split.first
        created = begin
          stamp && Date.parse(stamp)
        rescue ArgumentError
          nil # unparseable date: the row is still counted, but in no quarter
        end
        next unless created

        # The quarters do not overlap, so the first match is the only match.
        # cover? compares against the endpoints instead of walking every day.
        quarter = ranges.find { |range| range.cover?(created) }
        results[quarter] += 1 if quarter
      end
    end

    log.write("Time elapsed to process this file #{time} seconds \n\n")
    results.each do |range, count|
      log.write("#{count} lines found between #{range.begin} and #{range.end} .\n")
    end
    log.write("\n")
    log.write("******* End of File : #{spreadsheet} *******\n")
    log.write("\n")
    log.write("#{row_total} lines in #{spreadsheet} \n")
  end

  log.write(rule)
  log.write("Parse ended at #{Time.now}\n")
  log.write("\n\n\n\n")
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment