lomereiter · September 23, 2012 19:17
diff --git a/snpcaller.d b/snpcaller.d
 // How to build: 
 // rdmd -O -release -inline --build-only -Ipath/to/latest/sambamba/ snpcaller.d
 //
 // Usage: ./snpcaller input.bam > snps.dat
 // (input.bam must contain MD tags)

 import bamfile;
 import reconstruct;
 import splitter;

 import std.array;
 import std.parallelism;
 import std.ascii;
 import std.stdio;
 import std.typecons;
 import std.conv;
 import std.algorithm;
 import std.range;

 immutable char[4] bases = ['A', 'C', 'G', 'T'];

 struct Snp {
    uint position;
    char reference_base;
    char sample_base;

    bool is_transition() @property const {
        return (reference_base == 'A' && sample_base == 'G') ||
               (reference_base == 'G' && sample_base == 'A') ||
               (reference_base == 'T' && sample_base == 'C') ||
               (reference_base == 'C' && sample_base == 'T');
    }

    bool is_transversion() @property const {
        return !is_transition;
    }
 }

 /// Processes all reads given and searched for SNPs on the interval 
 /// [start_ref_pos, end_ref_pos) where both start and end are 0-based.
 Snp[] callSNPs(Alignment[] reads, uint start_ref_pos, uint end_ref_pos,
               uint reference_length,
               uint minimum_witnesses=2, uint minimum_coverage=2) {

    auto snps = appender!(Snp[])();

    auto start_pos = reads.front.position;
    auto dna = to!string(dna(reads));

    auto countA = new int[dna.length];
    auto countC = new int[dna.length];
    auto countG = new int[dna.length];
    auto countT = new int[dna.length];

    foreach (read; reads) {
        auto seq = read.sequence;
        auto ref_pos = read.position;
        foreach (cigar_op; read.cigar) {
            if (cigar_op.is_query_consuming && cigar_op.is_reference_consuming) {
                for (auto i = 0; i < cigar_op.length; i++) {
                    if (ref_pos + i >= reference_length) {
                        break;
                    }

                    auto index = ref_pos + i - start_pos;

                    if (index >= dna.length) {
                        stderr.writeln("read name: ", read.read_name);
                        stderr.writeln("index: ", index);
                        stderr.writeln("start_pos: ", start_pos);
                        stderr.writeln("CIGAR: ", read.cigarString());
                        stderr.writeln("CIGAR operation type: ", cigar_op.operation);
                        stderr.writeln("CIGAR operation length: ", cigar_op.length);
                        if (!read["MD"].is_nothing) {
                            stderr.writeln("MD: ", to!string(read["MD"]));
                        }
                        break;
                    }

                    switch (seq[i]) {
                        case 'a', 'A':
                            ++countA[index];
                            break;
                        case 'c', 'C':
                            ++countC[index];
                            break;
                        case 'g', 'G':
                            ++countG[index];
                            break;
                        case 't', 'T':
                            ++countT[index];
                            break;
                        default:
                            break;
                    }
                }
            }

            if (cigar_op.is_query_consuming) {
                seq = seq[cigar_op.length .. seq.length];
            }

            if (cigar_op.is_reference_consuming) {
                ref_pos += cigar_op.length;
            }
        }
    }

    int[4] counts;
    int transitions;
    int transversions;
    int reference_bases_count;

    auto beg = max(start_ref_pos, start_pos);
    auto end = min(end_ref_pos, reads[$-1].position + reads[$-1].basesCovered());

    for (auto k = beg; k < end; ++k) {
        auto i = k - start_pos;
        auto b = dna[i];
        auto base = toUpper(b);
        if (base == 'N')
            continue;

        reference_bases_count++;

        counts[0] = countA[i];
        counts[1] = countC[i];
        counts[2] = countG[i];
        counts[3] = countT[i];
        size_t max_index;
        int total_count;
        for (auto j = 0; j < 4; j++) {
            if (counts[j] > counts[max_index]) {
                max_index = j;
            }
            total_count += counts[j];
        }

        if (counts[max_index] < minimum_witnesses || total_count < minimum_coverage) 
            continue;

        if (bases[max_index] != base) {
            auto snp_base = bases[max_index];

            snps.put(Snp(cast(uint)(start_pos + i), cast(char)base, cast(char)snp_base));
        }
    }

    return snps.data();
 }

 int main(string[] args) {

    if (args.length != 2) {
        stderr.writeln("Usage: ", args[0], " <input.bam>");
        stderr.writeln("           BAM must have MD tags");
        stderr.writeln();
        stderr.writeln("           Output format: <0-based position on reference> <ref. base> <sample base>");
        return 1;
    }

    auto task_pool = new TaskPool(totalCPUs);
    scope(exit) task_pool.finish();
    auto bam = BamFile(args[1], task_pool);

    auto reads = filter!"a.is_unmapped == false && a.mapping_quality > 10"(bam.alignments);

    int transitions;
    int transversions;

    static auto findSNPs(Tuple!(Alignment[], int) reads_and_reference_length) {
        auto reads = reads_and_reference_length[0];
        auto reference_length = reads_and_reference_length[1];
        return callSNPs(reads, reads[0].position, reads[$-1].position, 
                        reference_length,
                        5, // minimum number of reads with the same base different from reference
                        5);// minimum coverage
    }

    writeln("#filename ", args[1]);

    while (!reads.empty) {

        auto ref_id = reads.front.ref_id;
        if (ref_id == -1) {
            break;
        }

        stderr.writeln("[info] processing reference sequence ", bam.reference(ref_id).name);
        writeln("#chromosome ", bam.reference(ref_id).name);

        auto current_reference_reads = until!"a.ref_id != b"(reads, ref_id);

        auto chunks = current_reference_reads.chunksConsumingLessThan(5_000_000);

        auto chunks_with_reference_lengths = map!((Alignment[] reads) { 
                                                    return tuple(reads, bam.reference(reads[0].ref_id).length);
                                                    })(chunks);

        foreach (snps; task_pool.map!findSNPs(chunks_with_reference_lengths, 16, 2)) {
            foreach (snp; snps) {
                writeln(snp.position, "\t", snp.reference_base, "\t", snp.sample_base);
            }
        }

    }

    return 0;
 }
	// How to build:
	// rdmd -O -release -inline --build-only -Ipath/to/latest/sambamba/ snpcaller.d
	//
	// Usage: ./snpcaller input.bam > snps.dat
	// (input.bam must contain MD tags)

	import bamfile;
	import reconstruct;
	import splitter;

	import std.array;
	import std.parallelism;
	import std.ascii;
	import std.stdio;
	import std.typecons;
	import std.conv;
	import std.algorithm;
	import std.range;

	immutable char[4] bases = ['A', 'C', 'G', 'T'];

	struct Snp {
	uint position;
	char reference_base;
	char sample_base;

	bool is_transition() @property const {
	return (reference_base == 'A' && sample_base == 'G') \|\|
	(reference_base == 'G' && sample_base == 'A') \|\|
	(reference_base == 'T' && sample_base == 'C') \|\|
	(reference_base == 'C' && sample_base == 'T');
	}

	bool is_transversion() @property const {
	return !is_transition;
	}
	}

	/// Processes all reads given and searched for SNPs on the interval
	/// [start_ref_pos, end_ref_pos) where both start and end are 0-based.
	Snp[] callSNPs(Alignment[] reads, uint start_ref_pos, uint end_ref_pos,
	uint reference_length,
	uint minimum_witnesses=2, uint minimum_coverage=2) {

	auto snps = appender!(Snp[])();

	auto start_pos = reads.front.position;
	auto dna = to!string(dna(reads));

	auto countA = new int[dna.length];
	auto countC = new int[dna.length];
	auto countG = new int[dna.length];
	auto countT = new int[dna.length];

	foreach (read; reads) {
	auto seq = read.sequence;
	auto ref_pos = read.position;
	foreach (cigar_op; read.cigar) {
	if (cigar_op.is_query_consuming && cigar_op.is_reference_consuming) {
	for (auto i = 0; i < cigar_op.length; i++) {
	if (ref_pos + i >= reference_length) {
	break;
	}

	auto index = ref_pos + i - start_pos;

	if (index >= dna.length) {
	stderr.writeln("read name: ", read.read_name);
	stderr.writeln("index: ", index);
	stderr.writeln("start_pos: ", start_pos);
	stderr.writeln("CIGAR: ", read.cigarString());
	stderr.writeln("CIGAR operation type: ", cigar_op.operation);
	stderr.writeln("CIGAR operation length: ", cigar_op.length);
	if (!read["MD"].is_nothing) {
	stderr.writeln("MD: ", to!string(read["MD"]));
	}
	break;
	}

	switch (seq[i]) {
	case 'a', 'A':
	++countA[index];
	break;
	case 'c', 'C':
	++countC[index];
	break;
	case 'g', 'G':
	++countG[index];
	break;
	case 't', 'T':
	++countT[index];
	break;
	default:
	break;
	}
	}
	}

	if (cigar_op.is_query_consuming) {
	seq = seq[cigar_op.length .. seq.length];
	}

	if (cigar_op.is_reference_consuming) {
	ref_pos += cigar_op.length;
	}
	}
	}

	int[4] counts;
	int transitions;
	int transversions;
	int reference_bases_count;

	auto beg = max(start_ref_pos, start_pos);
	auto end = min(end_ref_pos, reads[$-1].position + reads[$-1].basesCovered());

	for (auto k = beg; k < end; ++k) {
	auto i = k - start_pos;
	auto b = dna[i];
	auto base = toUpper(b);
	if (base == 'N')
	continue;

	reference_bases_count++;

	counts[0] = countA[i];
	counts[1] = countC[i];
	counts[2] = countG[i];
	counts[3] = countT[i];
	size_t max_index;
	int total_count;
	for (auto j = 0; j < 4; j++) {
	if (counts[j] > counts[max_index]) {
	max_index = j;
	}
	total_count += counts[j];
	}

	if (counts[max_index] < minimum_witnesses \|\| total_count < minimum_coverage)
	continue;

	if (bases[max_index] != base) {
	auto snp_base = bases[max_index];

	snps.put(Snp(cast(uint)(start_pos + i), cast(char)base, cast(char)snp_base));
	}
	}

	return snps.data();
	}

	int main(string[] args) {

	if (args.length != 2) {
	stderr.writeln("Usage: ", args[0], " <input.bam>");
	stderr.writeln(" BAM must have MD tags");
	stderr.writeln();
	stderr.writeln(" Output format: <0-based position on reference> <ref. base> <sample base>");
	return 1;
	}

	auto task_pool = new TaskPool(totalCPUs);
	scope(exit) task_pool.finish();
	auto bam = BamFile(args[1], task_pool);

	auto reads = filter!"a.is_unmapped == false && a.mapping_quality > 10"(bam.alignments);

	int transitions;
	int transversions;

	static auto findSNPs(Tuple!(Alignment[], int) reads_and_reference_length) {
	auto reads = reads_and_reference_length[0];
	auto reference_length = reads_and_reference_length[1];
	return callSNPs(reads, reads[0].position, reads[$-1].position,
	reference_length,
	5, // minimum number of reads with the same base different from reference
	5);// minimum coverage
	}

	writeln("#filename ", args[1]);

	while (!reads.empty) {

	auto ref_id = reads.front.ref_id;
	if (ref_id == -1) {
	break;
	}

	stderr.writeln("[info] processing reference sequence ", bam.reference(ref_id).name);
	writeln("#chromosome ", bam.reference(ref_id).name);

	auto current_reference_reads = until!"a.ref_id != b"(reads, ref_id);

	auto chunks = current_reference_reads.chunksConsumingLessThan(5_000_000);

	auto chunks_with_reference_lengths = map!((Alignment[] reads) {
	return tuple(reads, bam.reference(reads[0].ref_id).length);
	})(chunks);

	foreach (snps; task_pool.map!findSNPs(chunks_with_reference_lengths, 16, 2)) {
	foreach (snp; snps) {
	writeln(snp.position, "\t", snp.reference_base, "\t", snp.sample_base);
	}
	}

	}

	return 0;
	}