delagoya · October 12, 2010 20:11 · delagoya · Oct 12, 2010
diff --git a/two_bit_format_parsing_example.rb b/two_bit_format_parsing_example.rb
 require 'rubygems'
 require "bindata"

 # A .2bit file stores multiple DNA sequences (up to 4 Gb total) in a compact
 # randomly-accessible format. The file contains masking information as well as
 # the DNA itself.
 # 
 # The file begins with a 16-byte header containing the following fields:
 # 
 # signature - the number 0x1A412743 in the architecture of the machine that
 # created the file version - zero for now. Readers should abort if they see a
 # version number higher than 0. sequenceCount - the number of sequences in the
 # file. reserved - always zero for now All fields are 32 bits unless noted. If
 # the signature value is not as given, the reader program should byte-swap the
 # signature and check if the swapped version matches. If so, all multiple-byte
 # entities in the file will have to be byte-swapped. This enables these binary
 files to be used unchanged on different architectures.
 class TwoBitHeader < BinData::Record
 	endian :little 
 	uint32 :sig
 	uint32 :ver
 	uint32 :num
 	uint32 :reserved_flag
 end

 # The header is followed by a file index, which contains one entry for each
 # sequence. Each index entry contains three fields:
 # 
 # nameSize - a byte containing the length of the name field name - the sequence
 # name itself, of variable length depending on nameSize offset - the 32-bit
 # offset of the sequence data relative to the start of the file
 class TwoBitIndexEntry < BinData::Record
  endian :little
  int8 :len
  string :name, :read_length => :len
  uint32 :seq_offset
 end

 # The index is followed by the sequence records, which contain nine fields:
 # 
 #   dnaSize - number of bases of DNA in the sequence
 #   nBlockCount - the number of blocks of Ns in the file 
 #                 (representing unknown sequence)
 #   nBlockStarts - the starting position for each block of Ns
 #   nBlockSizes - the size of each block of Ns
 #   maskBlockCount - the number of masked (lower-case) blocks
 #   maskBlockStarts - the starting position for each masked block
 #   maskBlockSizes - the size of each masked block
 #   reserved - always zero for now
 #   packedDna - the DNA packed to two bits per base, represented as so: T - 00, 
 #               C- 01, A - 10, G - 11. The first base is in the 
 #               most significant 2-bit byte; the last base is in the least 
 #               significant 2 bits. For example, the sequence TCAG is 
 #               represented as 00011011. The packedDna field is padded with 0
 #               bits as necessary to take an even multiple of 32 bits 
 #               in the file, which improves I/O performance on some machines.

 class TwoBitArray < BinData::Wrapper
  endian :little
  mandatory_parameter :len
  array :type => :uint32, :initial_length => :len
 end

 class TwoBitSequence < BinData::Wrapper
    endian :little
    mandatory_parameter :len
    array :type => :bit2, :initial_length => :len
 end

 class TwoBitSequenceEntry < BinData::Record
  endian :little
  uint32 :dna_size

  # N blocks
  uint32 :n_block_count
  two_bit_array :n_block_starts, :len => :n_block_count
  two_bit_array :n_block_sizes, :len => :n_block_count


  # masked blocks
  uint32 :m_block_count
  two_bit_array :m_block_starts, :len => :m_block_count
  two_bit_array :m_block_sizes, :len => :m_block_count

  uint32 :reserved
  # two_bit_sequence :sequence, :len => :dna_size
 end

 io = File.open(ARGV[0])

 h = TwoBitHeader.read(io)
 puts "HEADER : #{h.sig} | #{h.ver} | #{h.num} | #{h.reserved_flag }"

 puts "So we have #{h.num} sequences here. Let's get the second sequence..."

 first_entry = TwoBitIndexEntry.read(io)
 puts "First sequence entry is #{first_entry.name} and is at file offset #{first_entry.seq_offset} ."
 second_entry = TwoBitIndexEntry.read(io)
 puts "Second sequence entry is #{second_entry.name} and is at file offset #{second_entry.seq_offset} ."


 # Set the read position to the second entry's start position
 io.pos = second_entry.seq_offset
 # This is big, probably not a good way to do this...
 seq_entry = TwoBitSequenceEntry.read(io)

 puts "Second sequence is #{seq_entry.dna_size} bases." 
 puts "There are #{seq_entry.n_block_count} \"N\" blocks."
 puts "The largest \"N\" block is #{seq_entry.n_block_sizes.max}"
 puts "There are #{seq_entry.m_block_count} masked blocks."
 puts "The largest masked block is #{seq_entry.m_block_sizes.max}"

 puts "Reading first 10 bases. "
 seq = TwoBitSequence.new(:len => 10)
 seq.read(io)

 puts "Sequence in 2bit format is: #{seq.map {|e| "%0.2d" % e.to_i.to_s(2)}.join " "}"

 def convert_to_ascii(arr)
  bases = {0 => "T", 1 => "C", 2 => "A", 3 => "G" }
  arr.collect {|b| bases[b]}.join("")
 end

 puts "Converting that to ASCII is: " + convert_to_ascii(seq)
	require 'rubygems'
	require "bindata"

	# A .2bit file stores multiple DNA sequences (up to 4 Gb total) in a compact
	# randomly-accessible format. The file contains masking information as well as
	# the DNA itself.
	#
	# The file begins with a 16-byte header containing the following fields:
	#
	# signature - the number 0x1A412743 in the architecture of the machine that
	# created the file version - zero for now. Readers should abort if they see a
	# version number higher than 0. sequenceCount - the number of sequences in the
	# file. reserved - always zero for now All fields are 32 bits unless noted. If
	# the signature value is not as given, the reader program should byte-swap the
	# signature and check if the swapped version matches. If so, all multiple-byte
	# entities in the file will have to be byte-swapped. This enables these binary
	files to be used unchanged on different architectures.
	class TwoBitHeader < BinData::Record
	endian :little
	uint32 :sig
	uint32 :ver
	uint32 :num
	uint32 :reserved_flag
	end

	# The header is followed by a file index, which contains one entry for each
	# sequence. Each index entry contains three fields:
	#
	# nameSize - a byte containing the length of the name field name - the sequence
	# name itself, of variable length depending on nameSize offset - the 32-bit
	# offset of the sequence data relative to the start of the file
	class TwoBitIndexEntry < BinData::Record
	endian :little
	int8 :len
	string :name, :read_length => :len
	uint32 :seq_offset
	end

	# The index is followed by the sequence records, which contain nine fields:
	#
	# dnaSize - number of bases of DNA in the sequence
	# nBlockCount - the number of blocks of Ns in the file
	# (representing unknown sequence)
	# nBlockStarts - the starting position for each block of Ns
	# nBlockSizes - the size of each block of Ns
	# maskBlockCount - the number of masked (lower-case) blocks
	# maskBlockStarts - the starting position for each masked block
	# maskBlockSizes - the size of each masked block
	# reserved - always zero for now
	# packedDna - the DNA packed to two bits per base, represented as so: T - 00,
	# C- 01, A - 10, G - 11. The first base is in the
	# most significant 2-bit byte; the last base is in the least
	# significant 2 bits. For example, the sequence TCAG is
	# represented as 00011011. The packedDna field is padded with 0
	# bits as necessary to take an even multiple of 32 bits
	# in the file, which improves I/O performance on some machines.

	class TwoBitArray < BinData::Wrapper
	endian :little
	mandatory_parameter :len
	array :type => :uint32, :initial_length => :len
	end

	class TwoBitSequence < BinData::Wrapper
	endian :little
	mandatory_parameter :len
	array :type => :bit2, :initial_length => :len
	end

	class TwoBitSequenceEntry < BinData::Record
	endian :little
	uint32 :dna_size

	# N blocks
	uint32 :n_block_count
	two_bit_array :n_block_starts, :len => :n_block_count
	two_bit_array :n_block_sizes, :len => :n_block_count


	# masked blocks
	uint32 :m_block_count
	two_bit_array :m_block_starts, :len => :m_block_count
	two_bit_array :m_block_sizes, :len => :m_block_count

	uint32 :reserved
	# two_bit_sequence :sequence, :len => :dna_size
	end

	io = File.open(ARGV[0])

	h = TwoBitHeader.read(io)
	puts "HEADER : #{h.sig} \| #{h.ver} \| #{h.num} \| #{h.reserved_flag }"

	puts "So we have #{h.num} sequences here. Let's get the second sequence..."

	first_entry = TwoBitIndexEntry.read(io)
	puts "First sequence entry is #{first_entry.name} and is at file offset #{first_entry.seq_offset} ."
	second_entry = TwoBitIndexEntry.read(io)
	puts "Second sequence entry is #{second_entry.name} and is at file offset #{second_entry.seq_offset} ."


	# Set the read position to the second entry's start position
	io.pos = second_entry.seq_offset
	# This is big, probably not a good way to do this...
	seq_entry = TwoBitSequenceEntry.read(io)

	puts "Second sequence is #{seq_entry.dna_size} bases."
	puts "There are #{seq_entry.n_block_count} \"N\" blocks."
	puts "The largest \"N\" block is #{seq_entry.n_block_sizes.max}"
	puts "There are #{seq_entry.m_block_count} masked blocks."
	puts "The largest masked block is #{seq_entry.m_block_sizes.max}"

	puts "Reading first 10 bases. "
	seq = TwoBitSequence.new(:len => 10)
	seq.read(io)

	puts "Sequence in 2bit format is: #{seq.map {\|e\| "%0.2d" % e.to_i.to_s(2)}.join " "}"

	def convert_to_ascii(arr)
	bases = {0 => "T", 1 => "C", 2 => "A", 3 => "G" }
	arr.collect {\|b\| bases[b]}.join("")
	end

	puts "Converting that to ASCII is: " + convert_to_ascii(seq)