Skip to content

Instantly share code, notes, and snippets.

@lomereiter
Created April 25, 2012 11:17
Show Gist options
  • Save lomereiter/2489032 to your computer and use it in GitHub Desktop.
Save lomereiter/2489032 to your computer and use it in GitHub Desktop.
BGZF block range in D
module bgzfrange;
import std.stream;
struct BgzfBlock {
// field types are as in the specification
// ushort ~ uint16_t, char ~ uint8_t, uint ~ uint32_t
public ushort bsize;
public char[] compressed_data = void;
public uint crc32;
public uint input_size;
}
class BgzfRange {
private Stream _stream;
private size_t _start_offset;
private BgzfBlock _current_block;
private bool _load_next_block() {
_start_offset = _stream.position;
try {
auto bgzf_magic = _stream.readString(4);
if (bgzf_magic != x"1F 8B 08 04") {
return false;
}
uint gzip_mod_time;
ubyte gzip_extra_flags;
ubyte gzip_os;
ushort gzip_extra_length;
_stream.read(gzip_mod_time);
_stream.read(gzip_extra_flags);
_stream.read(gzip_os);
_stream.read(gzip_extra_length);
ushort bsize; // total Block SIZE minus 1
bool found_block_size = false;
// read extra subfields
size_t len = 0;
while (len < gzip_extra_length) {
ubyte si1; // Subfield Identifier1
ubyte si2; // Subfield Identifier2
ushort slen; // Subfield LENgth
_stream.read(si1);
_stream.read(si2);
_stream.read(slen);
if (si1 == 66 && si2 == 67) {
// found 'BC' as subfield identifier
if (slen != 2) {
return false; // wrong subfield length
}
if (found_block_size) {
return false; // duplicate field
}
// read block size
_stream.read(bsize);
found_block_size = true;
// read the rest
_stream.readString(slen - bsize.sizeof);
} else {
// this field has nothing to do with block size,
// just skip
_stream.readString(slen);
}
len += si1.sizeof + si2.sizeof + slen.sizeof + slen;
}
if (len != gzip_extra_length) {
return false;
}
if (!found_block_size) {
return false;
}
// read compressed data
auto cdata_size = bsize - gzip_extra_length - 19;
_current_block.bsize = bsize;
_current_block.compressed_data = _stream.readString(cdata_size);
_stream.read(_current_block.crc32);
_stream.read(_current_block.input_size);
return true;
} catch (ReadException e) {
return false; // TODO: better error handling
}
return false;
}
public this(Stream stream) {
_stream = stream;
_load_next_block();
}
@property public size_t start_offset() { return _start_offset; }
public bool empty() {
return _stream.eof();
}
public void popFront() {
_load_next_block();
}
public BgzfBlock front() {
return _current_block;
}
}
import bgzfrange;
import std.stream;
import std.stdio;
void main() {
BufferedFile file = new BufferedFile("HG00125.chrom20.ILLUMINA.bwa.GBR.low_coverage.20111114.bam");
uint blocks = 0;
uint[] block_sizes;
uint[] block_input_sizes;
foreach(block; new BgzfRange(file)) {
blocks++;
block_sizes ~= block.bsize; // get some stats
// shows that average size is ~20k
block_input_sizes ~= block.input_size;
// average input size is ~65k (almost max for ushort)
// i.e compression is about 3.2x
}
file.close();
writefln("total blocks: %d", blocks);
auto out_file = new BufferedFile("block_size_distribution.dat", FileMode.OutNew);
foreach(sz; block_sizes) { out_file.writefln("%d", sz); }
out_file.close();
out_file = new BufferedFile("block_input_size_distribution.dat", FileMode.OutNew);
foreach(sz; block_input_sizes) { out_file.writefln("%d", sz); }
out_file.close();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment