Last active
September 4, 2016 03:07
-
-
Save brentp/7d013bd9532b34690960ffcd9dad3164 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
usage is: | |
$ bgz-extract.py some.vcf.gz 14 20 | |
and in another thread: | |
$ bgz-extract.py some.vcf.gz 15 20 | |
... | |
The first number is the requested *chunk* and the 2nd is the total number of chunks which can be anything specified by the user. | |
""" | |
import sys | |
import mmap | |
import gzip | |
fh = open(sys.argv[1]) | |
chunk = int(sys.argv[2]) | |
chunks = int(sys.argv[3]) | |
# use 0-based internally | |
chunk -= 1 | |
magic = b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00" | |
gz = gzip.GzipFile(fileobj=fh) | |
read_size = 65536 | |
# write the header except for the first chunk where it will be written anyway. | |
if chunk > 0: | |
for line in gz: | |
if line[0] != "#": break | |
sys.stdout.write(line) | |
fh.seek(0) | |
m = mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ) | |
assert m.find(magic) == 0 | |
# chunk_size is determine by the size of the file. | |
chunk_size = m.size() / chunks | |
# start is the first place we find the magic | |
start = m.find(magic, chunk * chunk_size) | |
# end is then next bgzip block after chunk_size bytes. | |
end = m.find(magic, start + chunk_size) | |
if end == -1: | |
end = m.size() | |
fh.seek(start) | |
with gzip.GzipFile(fileobj=fh) as gz: | |
if chunk > 0: | |
# always start block with a newline. | |
gz.readline() | |
while fh.tell() < end: | |
sys.stdout.write(gz.read(read_size)) | |
start = min(end, start + read_size) | |
# always end block with a newline | |
sys.stdout.write(gz.readline()) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment