Created
September 23, 2012 19:17
-
-
Save lomereiter/3772725 to your computer and use it in GitHub Desktop.
simple SNP caller
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// How to build: | |
// rdmd -O -release -inline --build-only -Ipath/to/latest/sambamba/ snpcaller.d | |
// | |
// Usage: ./snpcaller input.bam > snps.dat | |
// (input.bam must contain MD tags) | |
import bamfile; | |
import reconstruct; | |
import splitter; | |
import std.array; | |
import std.parallelism; | |
import std.ascii; | |
import std.stdio; | |
import std.typecons; | |
import std.conv; | |
import std.algorithm; | |
import std.range; | |
immutable char[4] bases = ['A', 'C', 'G', 'T']; | |
struct Snp { | |
uint position; | |
char reference_base; | |
char sample_base; | |
bool is_transition() @property const { | |
return (reference_base == 'A' && sample_base == 'G') || | |
(reference_base == 'G' && sample_base == 'A') || | |
(reference_base == 'T' && sample_base == 'C') || | |
(reference_base == 'C' && sample_base == 'T'); | |
} | |
bool is_transversion() @property const { | |
return !is_transition; | |
} | |
} | |
/// Processes all reads given and searched for SNPs on the interval | |
/// [start_ref_pos, end_ref_pos) where both start and end are 0-based. | |
Snp[] callSNPs(Alignment[] reads, uint start_ref_pos, uint end_ref_pos, | |
uint reference_length, | |
uint minimum_witnesses=2, uint minimum_coverage=2) { | |
auto snps = appender!(Snp[])(); | |
auto start_pos = reads.front.position; | |
auto dna = to!string(dna(reads)); | |
auto countA = new int[dna.length]; | |
auto countC = new int[dna.length]; | |
auto countG = new int[dna.length]; | |
auto countT = new int[dna.length]; | |
foreach (read; reads) { | |
auto seq = read.sequence; | |
auto ref_pos = read.position; | |
foreach (cigar_op; read.cigar) { | |
if (cigar_op.is_query_consuming && cigar_op.is_reference_consuming) { | |
for (auto i = 0; i < cigar_op.length; i++) { | |
if (ref_pos + i >= reference_length) { | |
break; | |
} | |
auto index = ref_pos + i - start_pos; | |
if (index >= dna.length) { | |
stderr.writeln("read name: ", read.read_name); | |
stderr.writeln("index: ", index); | |
stderr.writeln("start_pos: ", start_pos); | |
stderr.writeln("CIGAR: ", read.cigarString()); | |
stderr.writeln("CIGAR operation type: ", cigar_op.operation); | |
stderr.writeln("CIGAR operation length: ", cigar_op.length); | |
if (!read["MD"].is_nothing) { | |
stderr.writeln("MD: ", to!string(read["MD"])); | |
} | |
break; | |
} | |
switch (seq[i]) { | |
case 'a', 'A': | |
++countA[index]; | |
break; | |
case 'c', 'C': | |
++countC[index]; | |
break; | |
case 'g', 'G': | |
++countG[index]; | |
break; | |
case 't', 'T': | |
++countT[index]; | |
break; | |
default: | |
break; | |
} | |
} | |
} | |
if (cigar_op.is_query_consuming) { | |
seq = seq[cigar_op.length .. seq.length]; | |
} | |
if (cigar_op.is_reference_consuming) { | |
ref_pos += cigar_op.length; | |
} | |
} | |
} | |
int[4] counts; | |
int transitions; | |
int transversions; | |
int reference_bases_count; | |
auto beg = max(start_ref_pos, start_pos); | |
auto end = min(end_ref_pos, reads[$-1].position + reads[$-1].basesCovered()); | |
for (auto k = beg; k < end; ++k) { | |
auto i = k - start_pos; | |
auto b = dna[i]; | |
auto base = toUpper(b); | |
if (base == 'N') | |
continue; | |
reference_bases_count++; | |
counts[0] = countA[i]; | |
counts[1] = countC[i]; | |
counts[2] = countG[i]; | |
counts[3] = countT[i]; | |
size_t max_index; | |
int total_count; | |
for (auto j = 0; j < 4; j++) { | |
if (counts[j] > counts[max_index]) { | |
max_index = j; | |
} | |
total_count += counts[j]; | |
} | |
if (counts[max_index] < minimum_witnesses || total_count < minimum_coverage) | |
continue; | |
if (bases[max_index] != base) { | |
auto snp_base = bases[max_index]; | |
snps.put(Snp(cast(uint)(start_pos + i), cast(char)base, cast(char)snp_base)); | |
} | |
} | |
return snps.data(); | |
} | |
int main(string[] args) { | |
if (args.length != 2) { | |
stderr.writeln("Usage: ", args[0], " <input.bam>"); | |
stderr.writeln(" BAM must have MD tags"); | |
stderr.writeln(); | |
stderr.writeln(" Output format: <0-based position on reference> <ref. base> <sample base>"); | |
return 1; | |
} | |
auto task_pool = new TaskPool(totalCPUs); | |
scope(exit) task_pool.finish(); | |
auto bam = BamFile(args[1], task_pool); | |
auto reads = filter!"a.is_unmapped == false && a.mapping_quality > 10"(bam.alignments); | |
int transitions; | |
int transversions; | |
static auto findSNPs(Tuple!(Alignment[], int) reads_and_reference_length) { | |
auto reads = reads_and_reference_length[0]; | |
auto reference_length = reads_and_reference_length[1]; | |
return callSNPs(reads, reads[0].position, reads[$-1].position, | |
reference_length, | |
5, // minimum number of reads with the same base different from reference | |
5);// minimum coverage | |
} | |
writeln("#filename ", args[1]); | |
while (!reads.empty) { | |
auto ref_id = reads.front.ref_id; | |
if (ref_id == -1) { | |
break; | |
} | |
stderr.writeln("[info] processing reference sequence ", bam.reference(ref_id).name); | |
writeln("#chromosome ", bam.reference(ref_id).name); | |
auto current_reference_reads = until!"a.ref_id != b"(reads, ref_id); | |
auto chunks = current_reference_reads.chunksConsumingLessThan(5_000_000); | |
auto chunks_with_reference_lengths = map!((Alignment[] reads) { | |
return tuple(reads, bam.reference(reads[0].ref_id).length); | |
})(chunks); | |
foreach (snps; task_pool.map!findSNPs(chunks_with_reference_lengths, 16, 2)) { | |
foreach (snp; snps) { | |
writeln(snp.position, "\t", snp.reference_base, "\t", snp.sample_base); | |
} | |
} | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment