rchikhi · October 25, 2022 06:16 · rchikhi · Feb 17, 2014 · rchikhi · Mar 31, 2014
diff --git a/estimate-insert-sizes b/estimate-insert-sizes
 #!/usr/bin/env python
 doc = """
 Quickly estimates insert sizes of read datasets, given some sequence(s) they can be mapped to.
 Author: Rayan Chikhi         

 short usage: <reference> <*.fastq> 

 example: 
         estimate-insert-sizes contigs.fa readsA_1.fq readsA_2.fq readsB_1.fq readsB_2.fq

 or, with shell globbing:
         
         estimate-insert-sizes contigs.fa *.fq

 special case, a single argument is interpreted as interleaved pairs:
         
         estimate-insert-sizes contigs.fa interleaved.fq
 """

 """ technical note:
 by default, bwa will be executed with "-t X" to read X*100kbp sequences, instead of just 100kbp.
 100kbp is not enough in my experience to detect insert sizes.
 incidentally, bwa will use X threads, even if the CPU has fewer cores than that.
 X can be changed by modifying this variable:
 """

 # Value X given to `bwa mem` through its "-t" option (see the technical note above).
 nb_threads = 5

 # --------

 from glob import glob
 import subprocess
 import sys, os

 # A reference and at least one reads file are required; otherwise show the usage text.
 if len(sys.argv) < 3:
    sys.exit(doc)

 reference = sys.argv[1]
 # The read files are consumed two by two, in sorted order, by the main loop below.
 reads = sorted(sys.argv[2:])

 # Probe for the bwa executable. Its output is discarded through os.devnull:
 # an unread PIPE can fill up and block the child process.
 try:
    with open(os.devnull, 'wb') as devnull:
        subprocess.call(["bwa"], stdout=devnull, stderr=devnull)
 except OSError:
    # Raised when the executable cannot be found or started.
    sys.exit("Please make sure that the `bwa` binary is in your $PATH")

 for read in reads:
    if not os.path.isfile(read):
        sys.exit("Error: %s does not exist" % read)

 if len(reads) == 1:
    print("Assuming that %s is interleaved" % reads[0])
    # The empty string marks "no second file" for the main loop below.
    reads += [""]
 elif len(reads) % 2 != 0:
    # Files are paired off two by two, so the last one would be dropped silently.
    print("Warning: odd number of read files, %s will be ignored" % reads[-1])

 # bwa index writes <reference>.sa; its presence is taken as "index already built".
 if not os.path.isfile(reference + ".sa"):
    print("Creating index file..")
    with open(os.devnull, 'wb') as devnull:
        index_status = subprocess.call(["bwa", "index", reference], stdout=devnull, stderr=devnull)
    if index_status != 0:
        sys.exit("Error: `bwa index %s` failed with exit code %d" % (reference, index_status))

 def parse_list(line, nb_elts):
    """Return the trailing parenthesised numbers of a BWA-MEM stderr line.

    The last `nb_elts` whitespace-separated fields of `line` are expected to
    look like "(a, b, ...)". Commas and the surrounding parentheses are
    removed and every value is truncated to an integer.

    Args:
        line: one line of BWA-MEM stderr output.
        nb_elts: number of values inside the trailing parentheses.

    Returns:
        A list of `nb_elts` integers. A real list is returned (not a lazy
        `map` object) so the result can be indexed on Python 2 and 3 alike.

    Raises:
        ValueError: if a field is not a number.
    """
    # specific to BWA-MEM stderr format
    fields = line.strip().replace(',', '').split()[-nb_elts:]
    # Drop the opening and closing parenthesis around the joined fields.
    inner = ' '.join(fields)[1:-1]
    # Values may be printed as floats (e.g. "200.12"); keep the integer part.
    return [int(float(value)) for value in inner.split()]

 # For each pair of read files, run `bwa mem`, read the insert-size statistics
 # it prints on stderr, and report the orientation supported by the most pairs.
 stats = dict()
 for read1, read2 in zip(reads[::2], reads[1::2]):
    print("Processing: \n %s \n %s " % (read1, read2))

    # Start from a clean state so numbers of a previous pair are never reported
    # for the current one.
    stats = dict()
    orientation = None
    seen_candidate_line = False

    # An empty read2 marks an interleaved file: add -p and pass no second file.
    cmd = ["bwa", "mem"]
    if read2 == "":
        cmd.append("-p")
    cmd += ["-t", str(nb_threads), reference, read1]
    if read2 != "":
        cmd.append(read2)

    # The alignments on stdout are not needed, only the statistics on stderr.
    # The child keeps its own handle on devnull, so ours can be closed at once.
    with open(os.devnull, 'wb') as devnull:
        process = subprocess.Popen(cmd, stdout=devnull, stderr=subprocess.PIPE,
                                   universal_newlines=True)
    try:
        while True:
            line = process.stderr.readline()
            if line == '':
                # End of stream: bwa closed stderr, nothing more to parse.
                break
            if "worker" in line:
                break
            if "pestat" not in line:
                continue
            if "candidate unique pairs for" in line:
                if seen_candidate_line:
                    # A second statistics report begins; keep the first one.
                    break
                seen_candidate_line = True
                nb_pairs = parse_list(line, 4)
                for name, count in zip(['FF', 'FR', 'RF', 'RR'], nb_pairs):
                    stats[name] = {'nb_pairs': count}
            if "orientation" in line:
                # Last word of the line, with the trailing dots removed.
                orientation = line.strip().split()[-1].replace('.', '')
            if "mem_pestat] mean and std.dev:" in line and orientation in stats:
                mean, stdev = parse_list(line, 2)
                stats[orientation]['mean'] = mean
                stats[orientation]['stdev'] = stdev
                if orientation == 'RR':
                    # stats are complete
                    break
            sys.stdout.write(line)
            sys.stdout.flush()
    finally:
        # Stop bwa as soon as the statistics are known, then reap the child
        # and release the pipe.
        if process.poll() is None:
            process.terminate()
        process.stderr.close()
        process.wait()

    if not stats:
        print("Could not estimate insert sizes: bwa reported no statistics for %s" % read1)
        continue

    # The orientation backed by the largest number of pairs wins.
    most_likely = max(stats.items(), key=lambda item: item[1]['nb_pairs'])
    if 'mean' not in most_likely[1]:
        print("Could not estimate insert sizes: no mean/stdev reported for orientation %s" % most_likely[0])
        continue
    mean = most_likely[1]['mean']
    stdev = most_likely[1]['stdev']
    print("Orientation %s mean %s stdev %s" % (most_likely[0], mean, stdev))