Skip to content

Instantly share code, notes, and snippets.

@mr-c
Forked from ctb/compare.py
Created September 24, 2014 00:50
Show Gist options
  • Save mr-c/9139e281881c62127f1c to your computer and use it in GitHub Desktop.
Save mr-c/9139e281881c62127f1c to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import sys
import screed
if len(sys.argv) != 4:
print >>sys.stderr, "USAGE: compare.py <mutations.txt> <orig.fasta> <corrected.fasta>"
sys.exit(1)
fp_out = open('fp.out', 'w')
fn_out = open('fn.out', 'w')
mutations = {}
orig_reads = {}
for line in open(sys.argv[1]):
lexemes = line.strip().split("\t")
if len(lexemes) != 4:
continue
read_name = lexemes[0]
pos = int(lexemes[1])
orig = lexemes[2].upper()
mut = lexemes[3].upper()
if read_name not in mutations:
mutations[read_name] = {}
if pos not in mutations[read_name]: # only take first mutation for a site
mutations[read_name][pos] = {"correct" : orig, "mutation" : mut}
for n, record in enumerate(screed.open(sys.argv[2])):
orig_reads[record["name"]] = record["sequence"].upper()
tps, fps, tns, fns, tot_errors = 0, 0, 0, 0, 0
print >>sys.stderr, "#read\ttp\tfp\ttn\tfn\ttotal_errors"
for n, record in enumerate(screed.open(sys.argv[3])):
name = record["name"]
seq = record["sequence"][:100].upper()
orig = orig_reads[name]
read_mut = mutations.get(name, {})
tp, fp, tn, fn = 0, 0, 0, 0
for pos in range(len(seq)):
if pos >= len(orig):
print >>sys.stderr, "{0}\n{1}\n{2}".format(name, seq, orig)
if pos in read_mut:
if seq[pos] == read_mut[pos]["correct"]:
tp += 1
else:
fn += 1
print >>fn_out, seq, orig, pos, name
else:
if seq[pos] == orig[pos]:
tn += 1
else:
fp += 1
print >>fp_out, seq, orig, pos, name
tps += tp
fps += fp
tns += tn
fns += fn
tot_errors += len(read_mut)
print "{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(name, tp, fp, tn, fn, len(read_mut))
print >>sys.stderr, "Totals\t\t{0}\t{1}\t{2}\t{3}\t{4}".format(tps, fps, tns, fns, tot_errors)
551 export PYTHONPATH=~/dev/khmer:~/dev/screed
532 python ~/dev/nullgraph/make-reads.py --mutation-details details.out genome.fa > reads.fa
536 ../scripts/normalize-by-median.py -k 20 -C 20 reads.fa -s reads.ht
539 python ../sandbox/read_aligner.py reads.ht reads.fa > out 2> errout
552 python ../sandbox/read_aligner.py reads.ht reads.fa > out 2> errout
554 ../../2013-read-aligner/compare.py details.out reads.fa out
#! /usr/bin/env python
import sys
import khmer
kh = khmer.load_counting_hash(sys.argv[1])
print kh.ksize()
for line in open(sys.argv[2]):
line = line.strip()
seq_a, seq_b, mutpos, name = line.split()
mutpos = int(mutpos)
print name
for pos in range(len(seq_a) - kh.ksize() + 1):
signal = ''
if pos == mutpos:
signal = '*'
print '%s%s:%s,%s' % (signal, pos, kh.get(seq_a[pos:pos+kh.ksize()]),
kh.get(seq_b[pos:pos+kh.ksize()]),),
print ''
print '---'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment