radaniba · November 29, 2012 17:25
diff --git a/readexcel.rb b/readexcel.rb
 ### IMPORTS

 require 'roo'
 require 'pp'

 ### IMPLEMENTATION ###

 # A Excel spreadsheet reader that can clean up column names and convert data.
 #
 # Assumptions: The data is read off the first sheet of the workbook. The sheet
 # has column headers. Column names are cleaned up, so as to tolerate small
 # errors (see "read_headers"). Each row has the same number of entries /
 # columns.
 #
 # This happens Excel95 (xls) and Excel XML (xlsx).
 #
 # @example How to use this class
 #    rdr = XlsReader('my-excel.xls')
 #    rdr.read() { |rec| print rec }
 #
 class ExcelReader

   attr_accessor :wbook, :syn_dict, :headers

   # Initialise the reader.
   #
   # @param [String] infile   the path to the spreadsheet to be read in.
   # @param [Hash] syn_dict   a remapping of column names, to allow synonyms
   #
   # @example How to create a reader
   #    rdr = XlsReader('my-excel.xls')
   #    rdr = XlsReader('my-excel.xls', {'foo_bar'=> 'foobar'})
   #
   def initialize(infile, file_type)
      # NOTE: roo insists that files end with the right extension
      tmpfile = Tempfile.new(['excel', ".#{file_type}"])
      tmpfile_hndl = tmpfile.open()
      tmpfile_hndl.write (open(infile, 'r').read)
      tmpfile_hndl.close()

      if file_type == 'xls'
         @wbook = Excel.new(tmpfile.path)
      elsif file_type == 'xlsx'
         @wbook = Excelx.new(tmpfile.path)
      else
         raise ArgumentError("unrecognised filetype '#{file_type}'")
      end
      # NOTE: in roo, you don't select a worksheet, you name the current one
      @wbook.default_sheet = wbook.sheets.first
      @syn_dict = {}
   end

   # Read the sheet, and return each non-header row to the passed block.
   #
   # This first reads the headers and cleans them up (see "read_headers"). When
   # each row is read, the values are stored in a hash with the cleaned up
   # column names as keys. If a method exists called "method_", this is
   # used to convert the value. Spreadsheet seems to convert floats but dates
   # are returned as Excel serial values.
   #
   # @param [Block] blk  An executable block
   #
   def read(&blk)
      ## Preconditions:
      # NOTE: you can't grab a row or just read a line in roo, you have to ask
      # about the bounds and explcitly iter over the cell contents.
      row_start = @wbook.first_row
      row_stop = @wbook.last_row
      col_start = @wbook.first_column
      col_stop = @wbook.last_column
      if row_start != 1
         raise RuntimeError, "data must start on the first row not #{row_start}"
      end
      if col_start != 1
         raise RuntimeError, "data must start in the first column not #{col_start}"
      end
      # Main:
      # grab and parse headers
      @headers = read_headers(col_stop)
      # read each row
      (2..row_stop).each { |i|
         row_data = (1..col_stop).collect { |j| @wbook.cell(i,j) }
         row_zip = @headers.zip(row_data).flatten()
         row_hash = Hash[*row_zip]
         row_hash.each_pair { |k,v|
            meth_name = "convert_#{k}"
            if (self.respond_to?(meth_name))
               row_hash[k] = self.send(meth_name, v)
            else
               row_hash[k] = convert(v)
            end
         }
         blk.call(row_hash)
      }
   end

   # Clean up the title of a column to something reasonable
   #
   # If the header is recognised or a synonym, return it in the canonical
   # i.e. a symbol. Otherwise send back the raw form (a string with space
   # stripped from flanks)
   #
   def clean_col_header (hdr)
      # Conversion is:
      # - strip flanking space
      # - convert all internal non-alphanumerics to underscores
      # reduce consequetive underscores to a single one
      # TODO: strip flanking underscores
      clean_str = hdr.downcase.strip.gsub(/\W+/, '_')
      return clean_str.gsub(/_+/, '_')
   end

   # Return the canonical set of headers.
   #
   # This makes everything lowercase, strips flankning space, and substitutes
   # underscores for spaces. Override to validate or for further process headers.
   #
   def read_headers (col_stop)
      # collect header row
      headers = (1..col_stop).collect { |j|
         @wbook.cell(1,j)
      }
      # drop case, strip flanking spaces, replace gaps with underscores
      return headers.collect { |h|
         raw_val = h.strip()
         h_str = clean_col_header (raw_val)
         @syn_dict.fetch(h_str, raw_val)
      }
   end

   # General conversion of spreadsheet cell values
   #
   def convert(val)
      # clean up strings and return nil for
      if val.class == String
         val.strip!()
         if ["?", "unknown"].member?(val.downcase())
            return nil
         end
      end
      return val
   end

 end

 =======================================================================
 =======================================================================
 No description given

 COLNAMES = {
 "name" => :name,
 "isolate" => :isolate_name,
 "isolate_name" => :isolate_name,
 "collected" => :collected,
 "date_collected" => :collected,
 "comment" => :comment,
 "note" => :note,
 "year" => :season,
 "season" => :season,
 "pathogen" => :pathogen_type,
 "pathogen_type" => :pathogen_type,
 "subtype" => :pathogen_type,
 }

 =======================================================================
 =======================================================================
 No description given

 class ExcelReader < SpreadsheetReader::ExcelReader
    def initialize(infile, file_type)
       super(infile, file_type)
       @syn_dict = COLNAMES
    end

    def convert_season (raw_arg)
       season = raw_arg.to_i()
       return season
    end

    def convert_patient_gender (raw_val)
       if ['', nil].member?(raw_val)
          return nil
       end
       val = raw_val.strip.downcase()
       return {
         :m => :male,
         :f => :female,
         :male => :male,
         :female => :female
      }.fetch(val.to_sym(), val)
   end

   def convert_patient_antivirals(raw_val)
      if ['', nil].member?(raw_val)
         return nil
      end
      val = raw_val.strip.downcase().gsub(/\s/, '')
      return {
         :patienttreated => :patientTreated,
         :patientpostexposureprophylaxis => :patientPostExposureProphylaxis,
         :contacttreated => :contactTreated,
         :contactpostexposureprophylaxis => :contactPostExposureProphylaxis,
         :n => :no,
         :no => :no,
         :unknown => :unknown,
         :"?" => :unknown,
         :"" => :unknown,
      }.fetch(val.to_sym(), val)
   end

   ...

 end
	### IMPORTS

	require 'roo'
	require 'pp'

	### IMPLEMENTATION ###

	# A Excel spreadsheet reader that can clean up column names and convert data.
	#
	# Assumptions: The data is read off the first sheet of the workbook. The sheet
	# has column headers. Column names are cleaned up, so as to tolerate small
	# errors (see "read_headers"). Each row has the same number of entries /
	# columns.
	#
	# This happens Excel95 (xls) and Excel XML (xlsx).
	#
	# @example How to use this class
	# rdr = XlsReader('my-excel.xls')
	# rdr.read() { \|rec\| print rec }
	#
	class ExcelReader

	attr_accessor :wbook, :syn_dict, :headers

	# Initialise the reader.
	#
	# @param [String] infile the path to the spreadsheet to be read in.
	# @param [Hash] syn_dict a remapping of column names, to allow synonyms
	#
	# @example How to create a reader
	# rdr = XlsReader('my-excel.xls')
	# rdr = XlsReader('my-excel.xls', {'foo_bar'=> 'foobar'})
	#
	def initialize(infile, file_type)
	# NOTE: roo insists that files end with the right extension
	tmpfile = Tempfile.new(['excel', ".#{file_type}"])
	tmpfile_hndl = tmpfile.open()
	tmpfile_hndl.write (open(infile, 'r').read)
	tmpfile_hndl.close()

	if file_type == 'xls'
	@wbook = Excel.new(tmpfile.path)
	elsif file_type == 'xlsx'
	@wbook = Excelx.new(tmpfile.path)
	else
	raise ArgumentError("unrecognised filetype '#{file_type}'")
	end
	# NOTE: in roo, you don't select a worksheet, you name the current one
	@wbook.default_sheet = wbook.sheets.first
	@syn_dict = {}
	end

	# Read the sheet, and return each non-header row to the passed block.
	#
	# This first reads the headers and cleans them up (see "read_headers"). When
	# each row is read, the values are stored in a hash with the cleaned up
	# column names as keys. If a method exists called "method_", this is
	# used to convert the value. Spreadsheet seems to convert floats but dates
	# are returned as Excel serial values.
	#
	# @param [Block] blk An executable block
	#
	def read(&blk)
	## Preconditions:
	# NOTE: you can't grab a row or just read a line in roo, you have to ask
	# about the bounds and explcitly iter over the cell contents.
	row_start = @wbook.first_row
	row_stop = @wbook.last_row
	col_start = @wbook.first_column
	col_stop = @wbook.last_column
	if row_start != 1
	raise RuntimeError, "data must start on the first row not #{row_start}"
	end
	if col_start != 1
	raise RuntimeError, "data must start in the first column not #{col_start}"
	end
	# Main:
	# grab and parse headers
	@headers = read_headers(col_stop)
	# read each row
	(2..row_stop).each { \|i\|
	row_data = (1..col_stop).collect { \|j\| @wbook.cell(i,j) }
	row_zip = @headers.zip(row_data).flatten()
	row_hash = Hash[*row_zip]
	row_hash.each_pair { \|k,v\|
	meth_name = "convert_#{k}"
	if (self.respond_to?(meth_name))
	row_hash[k] = self.send(meth_name, v)
	else
	row_hash[k] = convert(v)
	end
	}
	blk.call(row_hash)
	}
	end

	# Clean up the title of a column to something reasonable
	#
	# If the header is recognised or a synonym, return it in the canonical
	# i.e. a symbol. Otherwise send back the raw form (a string with space
	# stripped from flanks)
	#
	def clean_col_header (hdr)
	# Conversion is:
	# - strip flanking space
	# - convert all internal non-alphanumerics to underscores
	# reduce consequetive underscores to a single one
	# TODO: strip flanking underscores
	clean_str = hdr.downcase.strip.gsub(/\W+/, '_')
	return clean_str.gsub(/_+/, '_')
	end

	# Return the canonical set of headers.
	#
	# This makes everything lowercase, strips flankning space, and substitutes
	# underscores for spaces. Override to validate or for further process headers.
	#
	def read_headers (col_stop)
	# collect header row
	headers = (1..col_stop).collect { \|j\|
	@wbook.cell(1,j)
	}
	# drop case, strip flanking spaces, replace gaps with underscores
	return headers.collect { \|h\|
	raw_val = h.strip()
	h_str = clean_col_header (raw_val)
	@syn_dict.fetch(h_str, raw_val)
	}
	end

	# General conversion of spreadsheet cell values
	#
	def convert(val)
	# clean up strings and return nil for
	if val.class == String
	val.strip!()
	if ["?", "unknown"].member?(val.downcase())
	return nil
	end
	end
	return val
	end

	end

	=======================================================================
	=======================================================================
	No description given

	COLNAMES = {
	"name" => :name,
	"isolate" => :isolate_name,
	"isolate_name" => :isolate_name,
	"collected" => :collected,
	"date_collected" => :collected,
	"comment" => :comment,
	"note" => :note,
	"year" => :season,
	"season" => :season,
	"pathogen" => :pathogen_type,
	"pathogen_type" => :pathogen_type,
	"subtype" => :pathogen_type,
	}

	=======================================================================
	=======================================================================
	No description given

	class ExcelReader < SpreadsheetReader::ExcelReader
	def initialize(infile, file_type)
	super(infile, file_type)
	@syn_dict = COLNAMES
	end

	def convert_season (raw_arg)
	season = raw_arg.to_i()
	return season
	end

	def convert_patient_gender (raw_val)
	if ['', nil].member?(raw_val)
	return nil
	end
	val = raw_val.strip.downcase()
	return {
	:m => :male,
	:f => :female,
	:male => :male,
	:female => :female
	}.fetch(val.to_sym(), val)
	end

	def convert_patient_antivirals(raw_val)
	if ['', nil].member?(raw_val)
	return nil
	end
	val = raw_val.strip.downcase().gsub(/\s/, '')
	return {
	:patienttreated => :patientTreated,
	:patientpostexposureprophylaxis => :patientPostExposureProphylaxis,
	:contacttreated => :contactTreated,
	:contactpostexposureprophylaxis => :contactPostExposureProphylaxis,
	:n => :no,
	:no => :no,
	:unknown => :unknown,
	:"?" => :unknown,
	:"" => :unknown,
	}.fetch(val.to_sym(), val)
	end

	...

	end