Skip to content

Instantly share code, notes, and snippets.

@labocho
Last active August 29, 2015 14:03
Show Gist options
  • Select an option

  • Save labocho/d623e15cb1878fa6f91b to your computer and use it in GitHub Desktop.

Select an option

Save labocho/d623e15cb1878fa6f91b to your computer and use it in GitHub Desktop.
CSV Converter
#!/usr/bin/env ruby
# Example:
# # UTF-8 CSV to Excel Compatible TSV
# $ csvconv --output-encoding="bom|utf-16le" --output-col-sep="\t" data.csv > data.txt
# # Excel Compatible TSV to UTF-8 CSV
# $ csvconv --input-encoding="bom|utf-16" --input-col-sep="\t" --output-encoding="utf-8" --output-col-sep="," data.txt > data.csv
require "optparse"
require "csv"
# Conversion settings; every entry can be overridden from the command line.
# A nil entry means "not given" and is resolved at conversion time.
options = {
  # Encoding used to read each input file.
  input_encoding: "utf-8",
  # Input separators: nil lets the row separator be auto-detected and makes
  # the column separator a comma.
  input_row_sep: nil,
  input_col_sep: nil,
  # nil falls back to the input encoding. A "bom|" prefix asks for a byte
  # order mark in front of the output.
  output_encoding: nil,
  # Output separators: nil reuses the separators of the parsed input.
  output_row_sep: nil,
  output_col_sep: nil,
  # Every CR LF, CR or LF in the input is replaced by this before parsing.
  new_line: "\n",
}

# Byte order mark (U+FEFF), prepended to the output for "bom|..." encodings.
BOM = "\u{feff}"
# Command-line switches. Each one takes a mandatory argument and stores it
# under the matching key of the options hash; parse! removes the switches it
# recognises from ARGV, leaving the remaining arguments behind.
OptionParser.new do |parser|
  {
    "--input-encoding INPUT_ENCODING" => :input_encoding,
    "--input-row-sep INPUT_ROW_SEPARATOR" => :input_row_sep,
    "--input-col-sep INPUT_COLUMN_SEPARATOR" => :input_col_sep,
    "--output-encoding OUTPUT_ENCODING" => :output_encoding,
    "--output-row-sep OUTPUT_ROW_SEPARATOR" => :output_row_sep,
    "--output-col-sep OUTPUT_COLUMN_SEPARATOR" => :output_col_sep,
    "--new-line NEW_LINE_CHARACTER" => :new_line,
  }.each do |switch, key|
    parser.on(switch) { |value| options[key] = value }
  end
  parser.parse!(ARGV)
end
# "\\n" to LF, "\\r" to CR, "\\t" to TAB
# Two-character escape (backslash + letter) => the control character it names.
CONTROL_CHARS = {
  "\\n" => "\n",
  "\\r" => "\r",
  "\\t" => "\t",
}

# Expand the escapes in place in every option that has a value, so that
# separators can be typed on the command line as \n, \r or \t.
options.each_value do |value|
  next unless value

  CONTROL_CHARS.each_pair { |escaped, char| value.gsub!(escaped, char) }
end
# Reads the file at +path+ in binary mode and returns its content tagged with
# +encoding+ (an encoding name as accepted in an open mode, e.g. "utf-8" or
# "bom|utf-16").
#
# File.open is used rather than Kernel#open so that an argument starting with
# "|" is always treated as a file name and never run as a command.
def read_source(path, encoding)
  File.open(path, "rb:#{encoding}") { |f| f.read }
end

# Converts the CSV text +src+ as described by +options+ (the hash built from
# the command line) and returns the result as a String in the output encoding.
#
# +src+ is not modified. Raises whatever String#encode or CSV raise for
# unconvertible or malformed input.
def convert_csv(src, options)
  text = src.encode("utf-8")

  # Unify new line. The block form inserts the replacement literally, so a
  # backslash in the new-line option is not read as a back-reference.
  new_line = options[:new_line]
  text = text.gsub(/\r\n|\r|\n/) { new_line }

  # Convert format: parse with the input separators, re-emit with the output
  # ones (falling back to whatever the reader used or detected).
  reader = CSV.new(
    text,
    row_sep: options[:input_row_sep] || :auto,
    col_sep: options[:input_col_sep] || ","
  )
  buffer = "".dup
  writer = CSV.new(
    buffer,
    row_sep: options[:output_row_sep] || reader.row_sep,
    col_sep: options[:output_col_sep] || reader.col_sep
  )
  reader.each { |row| writer << row }

  # A "bom|" prefix on the encoding name requests a byte order mark.
  output_encoding = options[:output_encoding] || options[:input_encoding]
  bom_request = /\Abom\|(.+)\z/i.match(output_encoding)
  output_encoding = bom_request[1] if bom_request

  encoded = buffer.encode(output_encoding)
  return encoded unless bom_request
  # Converting to plain UTF-16 / UTF-32 (no LE/BE suffix) already writes a
  # BOM; adding ours as well would put two marks at the start of the output.
  return encoded if [Encoding::UTF_16, Encoding::UTF_32].include?(encoded.encoding)

  BOM.encode(output_encoding) + encoded
end

ARGV.each do |file|
  src = read_source(file, options[:input_encoding])
  # write, not puts: puts appends a one-byte "\n" whenever the text does not
  # already end in a new line, which is a stray byte in UTF-16 output.
  $stdout.write(convert_csv(src, options))
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment