Created
July 2, 2009 15:35
-
-
Save drio/139529 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# | |
# Author:: David Rio Deiros (mailto:[email protected]) | |
# Copyright:: Copyright (c) 2009 David Rio Deiros | |
# License:: BSD | |
# | |
# vim: tw=80 ts=2 sw=2 | |
# | |
# genome_hasher.rb: Hashes a whole genome and allows you to query it for k-mers | |
# sequence | |
# | |
# Usage example: | |
# | |
# require 'genome_hasher' | |
# puts Genome.subgen('M', 47, 49) # CAT | |
require 'find' | |
# Hash a entire genome so you can query for kmers | |
# Uses around 4.4G and takes around 3/4 Minutes to load. | |
# Singleton class | |
class Genome | |
private_class_method :new | |
attr_reader :hash_gen | |
#@@genome_dir = "/data/bucket15/xdget_genomes/Hsap" | |
@@genome_dir = "/Users/drio/src/gbm_pipeline/test/genomes.test/hsap" | |
@@genome = nil | |
# Singleton Method: to requets for a kmer in the genome | |
def Genome.subgen(chrm, start_pos, end_pos) | |
@@genome = new unless @@genome | |
raise "no chrm #{chrm} @ #{@@genome_dir} ?" unless @@genome.hash_gen[chrm] | |
# Since the sequence starts at 0 (array), we want to pos - 1 | |
@@genome.hash_gen[chrm][:seq][(start_pos-1)..(end_pos-1)].upcase | |
end | |
# Singleton Method: Rehash the genome (perhaps you want to work against other genome) | |
def Genome.rehash(g_dir) | |
@@genome_dir = g_dir | |
hash_it | |
end | |
# Singleton Method: Number of chrms we have hashed | |
def Genome.n_chrms | |
number_chrms | |
end | |
def initialize | |
raise "Dir not found: #{@@genome_dir}" unless File.exists?(@@genome_dir) | |
@g_dir = @@genome_dir | |
@hash_gen = {} | |
hash_it | |
end | |
private | |
def number_chrms | |
@hash_gen.size | |
end | |
def hash_it | |
Logger.log "Hashing the genome ..." | |
# Prepare the keys, the chrms | |
prepare_chrms | |
# values: sequence per each chrm | |
hash_each_chrm | |
end | |
def hash_each_chrm | |
@hash_gen.each do |c, value| | |
Logger.log "Chrm: " + c | |
path = value[:path] | |
each_seq_line(path) { |sl| value[:seq] << sl } | |
end | |
end | |
def each_seq_line(file) | |
File.open(file).each_line {|l| yield l.chomp unless l =~ /^>/} | |
end | |
def prepare_chrms | |
each_chr do |cp| # cp = chromosome path | |
@hash_gen[chrm_only(cp)] = { :path => cp, :seq => ""} | |
end | |
end | |
def each_chr | |
Find.find(@g_dir) do |path| | |
yield path if path =~ /chr[0-9MXY]+\.fa/ | |
end | |
end | |
# only the chrm number | |
def chrm_only(path) | |
path.match(/chr([0-9MXY]+)\.fa/)[1] | |
end | |
end | |
# | |
# Main | |
# | |
if !$test && __FILE__ == $0 | |
puts Genome.subgen('M', 47, 49) # CAT | |
# Tests | |
elsif __FILE__ == $0 | |
require 'test/unit' | |
class TestGenome < Test::Unit::TestCase | |
def test_hash_chr | |
assert_equal(1, Genome.n_chrms) | |
end | |
def test_hash_chr | |
assert_equal('CAT', Genome.subgen('M', 48, 50)) | |
assert_equal('TTG', Genome.subgen('M', 51, 53)) | |
assert_equal('AACCCTAACC', Genome.subgen('1', 211, 220)) | |
assert_equal('GCCCGCGCAGGCGCAGAGAGGCGC', Genome.subgen('2', 911, 934)) | |
end | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment