Skip to content

Instantly share code, notes, and snippets.

@lomereiter
Created September 23, 2012 19:17
Show Gist options
  • Save lomereiter/3772725 to your computer and use it in GitHub Desktop.
Save lomereiter/3772725 to your computer and use it in GitHub Desktop.
simple SNP caller
// How to build:
// rdmd -O -release -inline --build-only -Ipath/to/latest/sambamba/ snpcaller.d
//
// Usage: ./snpcaller input.bam > snps.dat
// (input.bam must contain MD tags)
import bamfile;
import reconstruct;
import splitter;
import std.array;
import std.parallelism;
import std.ascii;
import std.stdio;
import std.typecons;
import std.conv;
import std.algorithm;
import std.range;
immutable char[4] bases = ['A', 'C', 'G', 'T'];
struct Snp {
uint position;
char reference_base;
char sample_base;
bool is_transition() @property const {
return (reference_base == 'A' && sample_base == 'G') ||
(reference_base == 'G' && sample_base == 'A') ||
(reference_base == 'T' && sample_base == 'C') ||
(reference_base == 'C' && sample_base == 'T');
}
bool is_transversion() @property const {
return !is_transition;
}
}
/// Processes all reads given and searched for SNPs on the interval
/// [start_ref_pos, end_ref_pos) where both start and end are 0-based.
Snp[] callSNPs(Alignment[] reads, uint start_ref_pos, uint end_ref_pos,
uint reference_length,
uint minimum_witnesses=2, uint minimum_coverage=2) {
auto snps = appender!(Snp[])();
auto start_pos = reads.front.position;
auto dna = to!string(dna(reads));
auto countA = new int[dna.length];
auto countC = new int[dna.length];
auto countG = new int[dna.length];
auto countT = new int[dna.length];
foreach (read; reads) {
auto seq = read.sequence;
auto ref_pos = read.position;
foreach (cigar_op; read.cigar) {
if (cigar_op.is_query_consuming && cigar_op.is_reference_consuming) {
for (auto i = 0; i < cigar_op.length; i++) {
if (ref_pos + i >= reference_length) {
break;
}
auto index = ref_pos + i - start_pos;
if (index >= dna.length) {
stderr.writeln("read name: ", read.read_name);
stderr.writeln("index: ", index);
stderr.writeln("start_pos: ", start_pos);
stderr.writeln("CIGAR: ", read.cigarString());
stderr.writeln("CIGAR operation type: ", cigar_op.operation);
stderr.writeln("CIGAR operation length: ", cigar_op.length);
if (!read["MD"].is_nothing) {
stderr.writeln("MD: ", to!string(read["MD"]));
}
break;
}
switch (seq[i]) {
case 'a', 'A':
++countA[index];
break;
case 'c', 'C':
++countC[index];
break;
case 'g', 'G':
++countG[index];
break;
case 't', 'T':
++countT[index];
break;
default:
break;
}
}
}
if (cigar_op.is_query_consuming) {
seq = seq[cigar_op.length .. seq.length];
}
if (cigar_op.is_reference_consuming) {
ref_pos += cigar_op.length;
}
}
}
int[4] counts;
int transitions;
int transversions;
int reference_bases_count;
auto beg = max(start_ref_pos, start_pos);
auto end = min(end_ref_pos, reads[$-1].position + reads[$-1].basesCovered());
for (auto k = beg; k < end; ++k) {
auto i = k - start_pos;
auto b = dna[i];
auto base = toUpper(b);
if (base == 'N')
continue;
reference_bases_count++;
counts[0] = countA[i];
counts[1] = countC[i];
counts[2] = countG[i];
counts[3] = countT[i];
size_t max_index;
int total_count;
for (auto j = 0; j < 4; j++) {
if (counts[j] > counts[max_index]) {
max_index = j;
}
total_count += counts[j];
}
if (counts[max_index] < minimum_witnesses || total_count < minimum_coverage)
continue;
if (bases[max_index] != base) {
auto snp_base = bases[max_index];
snps.put(Snp(cast(uint)(start_pos + i), cast(char)base, cast(char)snp_base));
}
}
return snps.data();
}
int main(string[] args) {
if (args.length != 2) {
stderr.writeln("Usage: ", args[0], " <input.bam>");
stderr.writeln(" BAM must have MD tags");
stderr.writeln();
stderr.writeln(" Output format: <0-based position on reference> <ref. base> <sample base>");
return 1;
}
auto task_pool = new TaskPool(totalCPUs);
scope(exit) task_pool.finish();
auto bam = BamFile(args[1], task_pool);
auto reads = filter!"a.is_unmapped == false && a.mapping_quality > 10"(bam.alignments);
int transitions;
int transversions;
static auto findSNPs(Tuple!(Alignment[], int) reads_and_reference_length) {
auto reads = reads_and_reference_length[0];
auto reference_length = reads_and_reference_length[1];
return callSNPs(reads, reads[0].position, reads[$-1].position,
reference_length,
5, // minimum number of reads with the same base different from reference
5);// minimum coverage
}
writeln("#filename ", args[1]);
while (!reads.empty) {
auto ref_id = reads.front.ref_id;
if (ref_id == -1) {
break;
}
stderr.writeln("[info] processing reference sequence ", bam.reference(ref_id).name);
writeln("#chromosome ", bam.reference(ref_id).name);
auto current_reference_reads = until!"a.ref_id != b"(reads, ref_id);
auto chunks = current_reference_reads.chunksConsumingLessThan(5_000_000);
auto chunks_with_reference_lengths = map!((Alignment[] reads) {
return tuple(reads, bam.reference(reads[0].ref_id).length);
})(chunks);
foreach (snps; task_pool.map!findSNPs(chunks_with_reference_lengths, 16, 2)) {
foreach (snp; snps) {
writeln(snp.position, "\t", snp.reference_base, "\t", snp.sample_base);
}
}
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment