Skip to content

Instantly share code, notes, and snippets.

@drio
Created July 2, 2009 15:35
Show Gist options
  • Save drio/139530 to your computer and use it in GitHub Desktop.
Save drio/139530 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
#
# Author:: David Rio Deiros (mailto:[email protected])
# Copyright:: Copyright (c) 2009 David Rio Deiros
# License:: BSD
#
# vim: tw=80 ts=2 sw=2
#
# genome_hasher.rb: Hashes a whole genome and allows you to query it for k-mers
# sequence
#
# Usage example:
#
# require 'genome_hasher'
# puts Genome.subgen('M', 47, 49) # CAT
require 'find'
# Hash a entire genome so you can query for kmers
# Uses around 4.4G and takes around 3/4 Minutes to load.
# Singleton class
class Genome
private_class_method :new
attr_reader :hash_gen
#@@genome_dir = "/data/bucket15/xdget_genomes/Hsap"
@@genome_dir = "/Users/drio/src/gbm_pipeline/test/genomes.test/hsap"
@@genome = nil
# Singleton Method: to requets for a kmer in the genome
def Genome.subgen(chrm, start_pos, end_pos)
@@genome = new unless @@genome
raise "no chrm #{chrm} @ #{@@genome_dir} ?" unless @@genome.hash_gen[chrm]
# Since the sequence starts at 0 (array), we want to pos - 1
@@genome.hash_gen[chrm][:seq][(start_pos-1)..(end_pos-1)].upcase
end
# Singleton Method: Rehash the genome (perhaps you want to work against other genome)
def Genome.rehash(g_dir)
@@genome_dir = g_dir
hash_it
end
# Singleton Method: Number of chrms we have hashed
def Genome.n_chrms
number_chrms
end
def initialize
raise "Dir not found: #{@@genome_dir}" unless File.exists?(@@genome_dir)
@g_dir = @@genome_dir
@hash_gen = {}
hash_it
end
private
def number_chrms
@hash_gen.size
end
def hash_it
Logger.log "Hashing the genome ..."
# Prepare the keys, the chrms
prepare_chrms
# values: sequence per each chrm
hash_each_chrm
end
def hash_each_chrm
@hash_gen.each do |c, value|
Logger.log "Chrm: " + c
path = value[:path]
each_seq_line(path) { |sl| value[:seq] << sl }
end
end
def each_seq_line(file)
File.open(file).each_line {|l| yield l.chomp unless l =~ /^>/}
end
def prepare_chrms
each_chr do |cp| # cp = chromosome path
@hash_gen[chrm_only(cp)] = { :path => cp, :seq => ""}
end
end
def each_chr
Find.find(@g_dir) do |path|
yield path if path =~ /chr[0-9MXY]+\.fa/
end
end
# only the chrm number
def chrm_only(path)
path.match(/chr([0-9MXY]+)\.fa/)[1]
end
end
#
# Main
#
if !$test && __FILE__ == $0
puts Genome.subgen('M', 47, 49) # CAT
# Tests
elsif __FILE__ == $0
require 'test/unit'
class TestGenome < Test::Unit::TestCase
def test_hash_chr
assert_equal(1, Genome.n_chrms)
end
def test_hash_chr
assert_equal('CAT', Genome.subgen('M', 48, 50))
assert_equal('TTG', Genome.subgen('M', 51, 53))
assert_equal('AACCCTAACC', Genome.subgen('1', 211, 220))
assert_equal('GCCCGCGCAGGCGCAGAGAGGCGC', Genome.subgen('2', 911, 934))
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment