drio · July 2, 2009 15:35
diff --git a/gistfile1.rb b/gistfile1.rb
 #!/usr/bin/env ruby
 #
 # Author::    David Rio Deiros  (mailto:[email protected])
 # Copyright:: Copyright (c) 2009 David Rio Deiros
 # License::   BSD
 #
 # vim: tw=80 ts=2 sw=2
 #
 # genome_hasher.rb: Hashes a whole genome and allows you to query it for k-mers
 #  sequence
 #
 # Usage example:
 #
 # require 'genome_hasher'
 # puts Genome.subgen('M', 47, 49) # CAT

 require 'find'

 # Hash a entire genome so you can query for kmers
 # Uses around 4.4G and takes around 3/4 Minutes to load.
 # Singleton class
 class Genome

  private_class_method :new
  attr_reader :hash_gen
  #@@genome_dir = "/data/bucket15/xdget_genomes/Hsap"
  @@genome_dir = "/Users/drio/src/gbm_pipeline/test/genomes.test/hsap"
  @@genome     = nil

  # Singleton Method: to requets for a kmer in the genome
  def Genome.subgen(chrm, start_pos, end_pos)
    @@genome = new unless @@genome
    raise "no chrm #{chrm} @ #{@@genome_dir} ?" unless @@genome.hash_gen[chrm]
    # Since the sequence starts at 0 (array), we want to pos - 1
    @@genome.hash_gen[chrm][:seq][(start_pos-1)..(end_pos-1)].upcase
  end

  # Singleton Method: Rehash the genome (perhaps you want to work against other genome)
  def Genome.rehash(g_dir)
    @@genome_dir = g_dir
    hash_it
  end

  # Singleton Method: Number of chrms we have hashed
  def Genome.n_chrms
    number_chrms
  end

  def initialize
    raise "Dir not found: #{@@genome_dir}" unless File.exists?(@@genome_dir)
    @g_dir    = @@genome_dir
    @hash_gen = {}
    hash_it
  end

  private

  def number_chrms
    @hash_gen.size
  end

  def hash_it
    Logger.log "Hashing the genome ..."
    # Prepare the keys, the chrms
    prepare_chrms
    # values: sequence per each chrm
    hash_each_chrm
  end

  def hash_each_chrm
    @hash_gen.each do |c, value|
      Logger.log "Chrm: " + c
      path = value[:path]
      each_seq_line(path) { |sl| value[:seq] << sl }
    end
  end

  def each_seq_line(file)
    File.open(file).each_line {|l| yield l.chomp unless l =~ /^>/}
  end

  def prepare_chrms
    each_chr do |cp| # cp = chromosome path
      @hash_gen[chrm_only(cp)] = { :path => cp, :seq => ""}
    end
  end

  def each_chr
    Find.find(@g_dir) do |path|
      yield path if path =~ /chr[0-9MXY]+\.fa/
    end
  end

  # only the chrm number
  def chrm_only(path)
    path.match(/chr([0-9MXY]+)\.fa/)[1]
  end
 end

 #
 # Main
 #
 if !$test && __FILE__ == $0
  puts Genome.subgen('M', 47, 49) # CAT

 # Tests
 elsif __FILE__ == $0
 require 'test/unit'
  class TestGenome < Test::Unit::TestCase
    def test_hash_chr
      assert_equal(1, Genome.n_chrms)
    end

    def test_hash_chr
      assert_equal('CAT', Genome.subgen('M', 48, 50))
      assert_equal('TTG', Genome.subgen('M', 51, 53))
      assert_equal('AACCCTAACC', Genome.subgen('1', 211, 220))
      assert_equal('GCCCGCGCAGGCGCAGAGAGGCGC', Genome.subgen('2', 911, 934))
    end
  end
 end
	#!/usr/bin/env ruby
	#
	# Author:: David Rio Deiros (mailto:[email protected])
	# Copyright:: Copyright (c) 2009 David Rio Deiros
	# License:: BSD
	#
	# vim: tw=80 ts=2 sw=2
	#
	# genome_hasher.rb: Hashes a whole genome and allows you to query it for k-mers
	# sequence
	#
	# Usage example:
	#
	# require 'genome_hasher'
	# puts Genome.subgen('M', 47, 49) # CAT

	require 'find'

	# Hash a entire genome so you can query for kmers
	# Uses around 4.4G and takes around 3/4 Minutes to load.
	# Singleton class
	class Genome

	private_class_method :new
	attr_reader :hash_gen
	#@@genome_dir = "/data/bucket15/xdget_genomes/Hsap"
	@@genome_dir = "/Users/drio/src/gbm_pipeline/test/genomes.test/hsap"
	@@genome = nil

	# Singleton Method: to requets for a kmer in the genome
	def Genome.subgen(chrm, start_pos, end_pos)
	@@genome = new unless @@genome
	raise "no chrm #{chrm} @ #{@@genome_dir} ?" unless @@genome.hash_gen[chrm]
	# Since the sequence starts at 0 (array), we want to pos - 1
	@@genome.hash_gen[chrm][:seq][(start_pos-1)..(end_pos-1)].upcase
	end

	# Singleton Method: Rehash the genome (perhaps you want to work against other genome)
	def Genome.rehash(g_dir)
	@@genome_dir = g_dir
	hash_it
	end

	# Singleton Method: Number of chrms we have hashed
	def Genome.n_chrms
	number_chrms
	end

	def initialize
	raise "Dir not found: #{@@genome_dir}" unless File.exists?(@@genome_dir)
	@g_dir = @@genome_dir
	@hash_gen = {}
	hash_it
	end

	private

	def number_chrms
	@hash_gen.size
	end

	def hash_it
	Logger.log "Hashing the genome ..."
	# Prepare the keys, the chrms
	prepare_chrms
	# values: sequence per each chrm
	hash_each_chrm
	end

	def hash_each_chrm
	@hash_gen.each do \|c, value\|
	Logger.log "Chrm: " + c
	path = value[:path]
	each_seq_line(path) { \|sl\| value[:seq] << sl }
	end
	end

	def each_seq_line(file)
	File.open(file).each_line {\|l\| yield l.chomp unless l =~ /^>/}
	end

	def prepare_chrms
	each_chr do \|cp\| # cp = chromosome path
	@hash_gen[chrm_only(cp)] = { :path => cp, :seq => ""}
	end
	end

	def each_chr
	Find.find(@g_dir) do \|path\|
	yield path if path =~ /chr[0-9MXY]+\.fa/
	end
	end

	# only the chrm number
	def chrm_only(path)
	path.match(/chr([0-9MXY]+)\.fa/)[1]
	end
	end

	#
	# Main
	#
	if !$test && __FILE__ == $0
	puts Genome.subgen('M', 47, 49) # CAT

	# Tests
	elsif __FILE__ == $0
	require 'test/unit'
	class TestGenome < Test::Unit::TestCase
	def test_hash_chr
	assert_equal(1, Genome.n_chrms)
	end

	def test_hash_chr
	assert_equal('CAT', Genome.subgen('M', 48, 50))
	assert_equal('TTG', Genome.subgen('M', 51, 53))
	assert_equal('AACCCTAACC', Genome.subgen('1', 211, 220))
	assert_equal('GCCCGCGCAGGCGCAGAGAGGCGC', Genome.subgen('2', 911, 934))
	end
	end
	end