Created
April 25, 2012 11:17
-
-
Save lomereiter/2489032 to your computer and use it in GitHub Desktop.
BGZF block range in D
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module bgzfrange; | |
import std.stream; | |
struct BgzfBlock { | |
// field types are as in the specification | |
// ushort ~ uint16_t, char ~ uint8_t, uint ~ uint32_t | |
public ushort bsize; | |
public char[] compressed_data = void; | |
public uint crc32; | |
public uint input_size; | |
} | |
class BgzfRange { | |
private Stream _stream; | |
private size_t _start_offset; | |
private BgzfBlock _current_block; | |
private bool _load_next_block() { | |
_start_offset = _stream.position; | |
try { | |
auto bgzf_magic = _stream.readString(4); | |
if (bgzf_magic != x"1F 8B 08 04") { | |
return false; | |
} | |
uint gzip_mod_time; | |
ubyte gzip_extra_flags; | |
ubyte gzip_os; | |
ushort gzip_extra_length; | |
_stream.read(gzip_mod_time); | |
_stream.read(gzip_extra_flags); | |
_stream.read(gzip_os); | |
_stream.read(gzip_extra_length); | |
ushort bsize; // total Block SIZE minus 1 | |
bool found_block_size = false; | |
// read extra subfields | |
size_t len = 0; | |
while (len < gzip_extra_length) { | |
ubyte si1; // Subfield Identifier1 | |
ubyte si2; // Subfield Identifier2 | |
ushort slen; // Subfield LENgth | |
_stream.read(si1); | |
_stream.read(si2); | |
_stream.read(slen); | |
if (si1 == 66 && si2 == 67) { | |
// found 'BC' as subfield identifier | |
if (slen != 2) { | |
return false; // wrong subfield length | |
} | |
if (found_block_size) { | |
return false; // duplicate field | |
} | |
// read block size | |
_stream.read(bsize); | |
found_block_size = true; | |
// read the rest | |
_stream.readString(slen - bsize.sizeof); | |
} else { | |
// this field has nothing to do with block size, | |
// just skip | |
_stream.readString(slen); | |
} | |
len += si1.sizeof + si2.sizeof + slen.sizeof + slen; | |
} | |
if (len != gzip_extra_length) { | |
return false; | |
} | |
if (!found_block_size) { | |
return false; | |
} | |
// read compressed data | |
auto cdata_size = bsize - gzip_extra_length - 19; | |
_current_block.bsize = bsize; | |
_current_block.compressed_data = _stream.readString(cdata_size); | |
_stream.read(_current_block.crc32); | |
_stream.read(_current_block.input_size); | |
return true; | |
} catch (ReadException e) { | |
return false; // TODO: better error handling | |
} | |
return false; | |
} | |
public this(Stream stream) { | |
_stream = stream; | |
_load_next_block(); | |
} | |
@property public size_t start_offset() { return _start_offset; } | |
public bool empty() { | |
return _stream.eof(); | |
} | |
public void popFront() { | |
_load_next_block(); | |
} | |
public BgzfBlock front() { | |
return _current_block; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bgzfrange; | |
import std.stream; | |
import std.stdio; | |
void main() { | |
BufferedFile file = new BufferedFile("HG00125.chrom20.ILLUMINA.bwa.GBR.low_coverage.20111114.bam"); | |
uint blocks = 0; | |
uint[] block_sizes; | |
uint[] block_input_sizes; | |
foreach(block; new BgzfRange(file)) { | |
blocks++; | |
block_sizes ~= block.bsize; // get some stats | |
// shows that average size is ~20k | |
block_input_sizes ~= block.input_size; | |
// average input size is ~65k (almost max for ushort) | |
// i.e compression is about 3.2x | |
} | |
file.close(); | |
writefln("total blocks: %d", blocks); | |
auto out_file = new BufferedFile("block_size_distribution.dat", FileMode.OutNew); | |
foreach(sz; block_sizes) { out_file.writefln("%d", sz); } | |
out_file.close(); | |
out_file = new BufferedFile("block_input_size_distribution.dat", FileMode.OutNew); | |
foreach(sz; block_input_sizes) { out_file.writefln("%d", sz); } | |
out_file.close(); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment