Skip to content

Instantly share code, notes, and snippets.

@brentp
Last active September 4, 2016 03:07
Show Gist options
  • Save brentp/7d013bd9532b34690960ffcd9dad3164 to your computer and use it in GitHub Desktop.
Save brentp/7d013bd9532b34690960ffcd9dad3164 to your computer and use it in GitHub Desktop.
"""
usage is:
$ bgz-extract.py some.vcf.gz 14 20
and in another thread:
$ bgz-extract.py some.vcf.gz 15 20
...
The first number is the requested *chunk* and the 2nd is the total number of chunks which can be anything specified by the user.
"""
import sys
import mmap
import gzip
fh = open(sys.argv[1])
chunk = int(sys.argv[2])
chunks = int(sys.argv[3])
# use 0-based internally
chunk -= 1
magic = b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00"
gz = gzip.GzipFile(fileobj=fh)
read_size = 65536
# write the header except for the first chunk where it will be written anyway.
if chunk > 0:
for line in gz:
if line[0] != "#": break
sys.stdout.write(line)
fh.seek(0)
m = mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ)
assert m.find(magic) == 0
# chunk_size is determine by the size of the file.
chunk_size = m.size() / chunks
# start is the first place we find the magic
start = m.find(magic, chunk * chunk_size)
# end is then next bgzip block after chunk_size bytes.
end = m.find(magic, start + chunk_size)
if end == -1:
end = m.size()
fh.seek(start)
with gzip.GzipFile(fileobj=fh) as gz:
if chunk > 0:
# always start block with a newline.
gz.readline()
while fh.tell() < end:
sys.stdout.write(gz.read(read_size))
start = min(end, start + read_size)
# always end block with a newline
sys.stdout.write(gz.readline())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment