Created
November 29, 2012 17:25
-
-
Save radaniba/4170563 to your computer and use it in GitHub Desktop.
Excel Spreadsheet Reader with Ruby
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### IMPORTS | |
require 'roo' | |
require 'pp' | |
### IMPLEMENTATION ### | |
# A Excel spreadsheet reader that can clean up column names and convert data. | |
# | |
# Assumptions: The data is read off the first sheet of the workbook. The sheet | |
# has column headers. Column names are cleaned up, so as to tolerate small | |
# errors (see "read_headers"). Each row has the same number of entries / | |
# columns. | |
# | |
# This happens Excel95 (xls) and Excel XML (xlsx). | |
# | |
# @example How to use this class | |
# rdr = XlsReader('my-excel.xls') | |
# rdr.read() { |rec| print rec } | |
# | |
class ExcelReader | |
attr_accessor :wbook, :syn_dict, :headers | |
# Initialise the reader. | |
# | |
# @param [String] infile the path to the spreadsheet to be read in. | |
# @param [Hash] syn_dict a remapping of column names, to allow synonyms | |
# | |
# @example How to create a reader | |
# rdr = XlsReader('my-excel.xls') | |
# rdr = XlsReader('my-excel.xls', {'foo_bar'=> 'foobar'}) | |
# | |
def initialize(infile, file_type) | |
# NOTE: roo insists that files end with the right extension | |
tmpfile = Tempfile.new(['excel', ".#{file_type}"]) | |
tmpfile_hndl = tmpfile.open() | |
tmpfile_hndl.write (open(infile, 'r').read) | |
tmpfile_hndl.close() | |
if file_type == 'xls' | |
@wbook = Excel.new(tmpfile.path) | |
elsif file_type == 'xlsx' | |
@wbook = Excelx.new(tmpfile.path) | |
else | |
raise ArgumentError("unrecognised filetype '#{file_type}'") | |
end | |
# NOTE: in roo, you don't select a worksheet, you name the current one | |
@wbook.default_sheet = wbook.sheets.first | |
@syn_dict = {} | |
end | |
# Read the sheet, and return each non-header row to the passed block. | |
# | |
# This first reads the headers and cleans them up (see "read_headers"). When | |
# each row is read, the values are stored in a hash with the cleaned up | |
# column names as keys. If a method exists called "method_", this is | |
# used to convert the value. Spreadsheet seems to convert floats but dates | |
# are returned as Excel serial values. | |
# | |
# @param [Block] blk An executable block | |
# | |
def read(&blk) | |
## Preconditions: | |
# NOTE: you can't grab a row or just read a line in roo, you have to ask | |
# about the bounds and explcitly iter over the cell contents. | |
row_start = @wbook.first_row | |
row_stop = @wbook.last_row | |
col_start = @wbook.first_column | |
col_stop = @wbook.last_column | |
if row_start != 1 | |
raise RuntimeError, "data must start on the first row not #{row_start}" | |
end | |
if col_start != 1 | |
raise RuntimeError, "data must start in the first column not #{col_start}" | |
end | |
# Main: | |
# grab and parse headers | |
@headers = read_headers(col_stop) | |
# read each row | |
(2..row_stop).each { |i| | |
row_data = (1..col_stop).collect { |j| @wbook.cell(i,j) } | |
row_zip = @headers.zip(row_data).flatten() | |
row_hash = Hash[*row_zip] | |
row_hash.each_pair { |k,v| | |
meth_name = "convert_#{k}" | |
if (self.respond_to?(meth_name)) | |
row_hash[k] = self.send(meth_name, v) | |
else | |
row_hash[k] = convert(v) | |
end | |
} | |
blk.call(row_hash) | |
} | |
end | |
# Clean up the title of a column to something reasonable | |
# | |
# If the header is recognised or a synonym, return it in the canonical | |
# i.e. a symbol. Otherwise send back the raw form (a string with space | |
# stripped from flanks) | |
# | |
def clean_col_header (hdr) | |
# Conversion is: | |
# - strip flanking space | |
# - convert all internal non-alphanumerics to underscores | |
# reduce consequetive underscores to a single one | |
# TODO: strip flanking underscores | |
clean_str = hdr.downcase.strip.gsub(/\W+/, '_') | |
return clean_str.gsub(/_+/, '_') | |
end | |
# Return the canonical set of headers. | |
# | |
# This makes everything lowercase, strips flankning space, and substitutes | |
# underscores for spaces. Override to validate or for further process headers. | |
# | |
def read_headers (col_stop) | |
# collect header row | |
headers = (1..col_stop).collect { |j| | |
@wbook.cell(1,j) | |
} | |
# drop case, strip flanking spaces, replace gaps with underscores | |
return headers.collect { |h| | |
raw_val = h.strip() | |
h_str = clean_col_header (raw_val) | |
@syn_dict.fetch(h_str, raw_val) | |
} | |
end | |
# General conversion of spreadsheet cell values | |
# | |
def convert(val) | |
# clean up strings and return nil for | |
if val.class == String | |
val.strip!() | |
if ["?", "unknown"].member?(val.downcase()) | |
return nil | |
end | |
end | |
return val | |
end | |
end | |
======================================================================= | |
======================================================================= | |
No description given | |
COLNAMES = { | |
"name" => :name, | |
"isolate" => :isolate_name, | |
"isolate_name" => :isolate_name, | |
"collected" => :collected, | |
"date_collected" => :collected, | |
"comment" => :comment, | |
"note" => :note, | |
"year" => :season, | |
"season" => :season, | |
"pathogen" => :pathogen_type, | |
"pathogen_type" => :pathogen_type, | |
"subtype" => :pathogen_type, | |
} | |
======================================================================= | |
======================================================================= | |
No description given | |
class ExcelReader < SpreadsheetReader::ExcelReader | |
def initialize(infile, file_type) | |
super(infile, file_type) | |
@syn_dict = COLNAMES | |
end | |
def convert_season (raw_arg) | |
season = raw_arg.to_i() | |
return season | |
end | |
def convert_patient_gender (raw_val) | |
if ['', nil].member?(raw_val) | |
return nil | |
end | |
val = raw_val.strip.downcase() | |
return { | |
:m => :male, | |
:f => :female, | |
:male => :male, | |
:female => :female | |
}.fetch(val.to_sym(), val) | |
end | |
def convert_patient_antivirals(raw_val) | |
if ['', nil].member?(raw_val) | |
return nil | |
end | |
val = raw_val.strip.downcase().gsub(/\s/, '') | |
return { | |
:patienttreated => :patientTreated, | |
:patientpostexposureprophylaxis => :patientPostExposureProphylaxis, | |
:contacttreated => :contactTreated, | |
:contactpostexposureprophylaxis => :contactPostExposureProphylaxis, | |
:n => :no, | |
:no => :no, | |
:unknown => :unknown, | |
:"?" => :unknown, | |
:"" => :unknown, | |
}.fetch(val.to_sym(), val) | |
end | |
... | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment