Last active
March 4, 2025 09:15
-
-
Save brantfaircloth/7ffbc6a8501c85d05f03 to your computer and use it in GitHub Desktop.
split (gzipped) fastq files to temp files quickly in python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
(c) 2014 Brant Faircloth || http://faircloth-lab.org/ | |
All rights reserved. | |
This code is distributed under a 3-clause BSD license. Please see | |
LICENSE.txt for more information. | |
Created on 08 October 2014 22:07 CDT (-0500) | |
""" | |
import os | |
import gzip | |
import itertools | |
import tempfile | |
import multiprocessing | |
#import pdb | |
def is_gzip(filename): | |
magic_bytes = "\x1f\x8b\x08" | |
with open(filename) as infile: | |
file_start = infile.read(len(magic_bytes)) | |
if file_start.startswith(magic_bytes): | |
return True | |
return False | |
def split(filename): | |
name = os.path.splitext(os.path.basename(filename))[0] + "-" | |
if is_gzip(filename): | |
with gzip.open(filename, 'rb') as infile: | |
reads = itertools.izip_longest(*[infile]*4) | |
tempfiles = write(reads, name, gzip=True) | |
else: | |
with open(filename, 'rU') as infile: | |
reads = itertools.izip_longest(*[infile]*4) | |
tempfiles = write(reads, name, gzip=False) | |
return tempfiles | |
def batches(reads): | |
while True: | |
batchiter = itertools.islice(reads, 10000) | |
yield itertools.chain([batchiter.next()], batchiter) | |
def write(reads, name, gzip=True): | |
tempfiles = [] | |
for cnt, batch in enumerate(batches(reads)): | |
if gzip: | |
with tempfile.NamedTemporaryFile(mode="w+b", delete=False, prefix=name, suffix=".gz") as outfile: | |
gz = gzip.GzipFile(mode="wb", fileobj=outfile) | |
[gz.write("{}{}{}{}".format(*seq)) for seq in batch] | |
tempfiles.append(outfile.name) | |
else: | |
with tempfile.NamedTemporaryFile(mode="w", delete=False, prefix=name, suffix=".fastq") as outfile: | |
[outfile.write("{}{}{}{}".format(*seq)) for seq in batch] | |
tempfiles.append(outfile.name) | |
return tempfiles | |
# fastq => fastq files: 100k parsed on SSD in 0.364 sec total | |
r1 = "R1-100k.fastq" | |
r2 = "R2-100k.fastq" | |
# fastq.gz => fastq.gz files: 100k parsed on SSD in 31.442 sec total | |
#r1 = "R1-100k.fastq.gz" | |
#r2 = "R2-100k.fastq.gz" | |
# IO-bound, but still => speedup | |
p = multiprocessing.Pool(2) | |
tempfiles = p.map(split, [r1,r2]) | |
# re-pair split R1 with R2 mates | |
combos = zip(tempfiles[0], tempfiles[1]) | |
# cleanup | |
[os.remove(f) for f in tempfiles[0]] | |
[os.remove(f) for f in tempfiles[1]] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment