adamkewley · February 24, 2020 11:26
diff --git a/revcomp.cpp b/revcomp.cpp
 #define UNLIKELY(x)    __builtin_expect(!!(x), 0)

 #include <iostream>
 #include <iterator>
 #include <vector>
 #include <string>
 #include <immintrin.h>

 using std::istream;
 using std::ostream;
 using std::runtime_error;
 using std::vector;
 using std::string;

 constexpr size_t basepairs_in_line = 60;
 constexpr size_t line_len = basepairs_in_line + sizeof('\n');

 // Returns the complement of a a single input character
 char complement(char character) {
    
    constexpr char complement_lut[] = {
        '\0', 'T', 'V', 'G', 'H', '\0', '\0',
        'C', 'D', '\0', '\n', 'M', '\0', 'K',
        'N', '\0', '\0', '\0', 'Y', 'S', 'A',
        'A', 'B', 'W', '\0', 'R'
    };
    
    return complement_lut[character & 0x1f];
 }

 __m128i packed(char c) {
    return _mm_set_epi8(c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c);
 }

 __m128i reverse_complement_simd(__m128i v) {
    return v;  // todo
    /*
    __m128i index = packed(0x0);
    index = _mm_sub_epi8(index, _mm_cmpgt_epi8(v,));

    __m128i reversed_indices = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    return _mm_shuffle_epi8(v, reversed_indices);

    
    
    //__m128i and_mask = packed(0x1f);
    //    __m128i reverse = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    //    _mm_shuffle_epi8(_mm_and_si128(vs, and_mask), reverse))
    return v;
    */
 }

 // Complement then swap `a` and `b`
 void complement_swap(char* a, char* b) {
    char tmp = complement(*a);
    *a = complement(*b);
    *b = tmp;
 }

 // Reverse-complement a contiguous range, [begin, end), of bps.
 //
 // precondition: [begin, end) can be reverse-complemented without
 // accounting for newlines etc. (the caller should handle this
 // externally).
 void reverse_complement_bps(char* start, char* end, size_t num_bps) {
 #ifdef SIMD
    while (num_bps >= 16) {
        end -= 16;
        
        __m128i tmp = _mm_lddqu_si128((__m128i*)start);
        _mm_storeu_si128((__m128i*)start, reverse_complement_simd(_mm_lddqu_si128((__m128i*)end)));
        _mm_storeu_si128((__m128i*)end, reverse_complement_simd(tmp));
        
        num_bps -= 16;
        start += 16;
    }
 #else
    // even when not using platform-dependent SIMD, it's still
    // advantageous to manually unroll this loop because most
    // compilers won't (because the compiler can't know that the
    // inputs are usually >16 in size at runtime). This gives a ~10 %
    // speedup on my laptop.
    while (num_bps >= 16) {
        complement_swap(start++, --end);
        complement_swap(start++, --end);
        complement_swap(start++, --end);
        complement_swap(start++, --end);

        complement_swap(start++, --end);
        complement_swap(start++, --end);
        complement_swap(start++, --end);
        complement_swap(start++, --end);

        complement_swap(start++, --end);
        complement_swap(start++, --end);
        complement_swap(start++, --end);
        complement_swap(start++, --end);

        complement_swap(start++, --end);
        complement_swap(start++, --end);
        complement_swap(start++, --end);
        complement_swap(start++, --end);
        
        num_bps -= 16;
    }
 #endif

    for (size_t i = 0; i < num_bps; ++i) {
        complement_swap(start++, --end);
    }
 }

 // Reverse-complements a FASTA sequence. Unformatted basepair (no
 // header) input. All lines apart from the last line contain *exactly*
 // 60 basepairs. The last line can contain <= 60 basepairs, and must
 // have a trailing newline.
 //
 // The reason this alg. is more complicated than necessary for several
 // reasons:
 //
 // - If newlines were stripped from the input while reading the input,
 //   then memory usage would be ~1/60th lower and this step would be
 //   mostly branchless (good). However, writing the output would
 //   require re-adding the newlines into some intermediate output
 //   buffer before the application emits the output (very bad).
 //
 // - If newlines are not stripped from the input, then they need to be
 //   handled by this step. The easiest (<10 LOC) way to handle the
 //   newlines is to have an `if (next_char == '\n') skip;` type check
 //   on the front and back of the input. However, this introduces two
 //   compare + (sometimes) jump operations per basepair swap, plus the
 //   main loop invariant, so the resulting loop can end up with 3/4
 //   branches. It also prevents doing multi-basepair swaps (SIMD, loop
 //   unrolling, etc.). Even without vectorization, that ends up being
 //   a 20-35 % perf hit overall.
 //
 // - So we want to optimize this alg. for branchless, preferably
 //   multi-basepair, swaps + complements. However, the presence of
 //   trailing newlines means that the input might be non-symmetric
 //   (i.e. the data cannot be blindly swapped because the newlines
 //   will end up in an incorrect location). "Symmetric", in this case,
 //   means that swapping the newlines can be done safely because they
 //   are at symmetric offsets relative to the beginning and end of the
 //   input.
 void reverse_complement_basepairs(char* begin, char* end) {
    if (UNLIKELY(begin == end)) {
        return;
    }

    size_t len = end - begin;
    size_t trailer_len = len % line_len;

    // skip end-of-data, so that `end` points to the last newline in
    // the input (i.e. "just past the end of the last basepair")
    end--;

    // optimal case: all lines in the input are exactly `line_len` in
    // length, with no trailing bps. The relative offsets (from
    // begin/end) of newlines in the data are symmetrical. Therefore,
    // The algorithm can just reverse + complement the entire input,
    // apart from the last newline.
    if (trailer_len == 0) {

        size_t num_pairs = len/2;
        reverse_complement_bps(begin, end, num_pairs);

        bool has_middle_bp = (len % 2) > 0;
        if (has_middle_bp) {
            begin[num_pairs] = complement(begin[num_pairs]);
        }

        return;
    }

    // suboptimal case: the last line in the sequence is < `line_len`
    // (it is a "trailing" line). This means that newlines in the
    // input appear at non-symmetrical offsets relative to `begin` and
    // `end`. Because of this, the algorithm has to carefully step
    // over the newlines so that they aren't reversed into an
    // incorrect location in the output.
    size_t trailer_bps = trailer_len > 0 ? trailer_len - 1 : 0;

    size_t rem_bps = basepairs_in_line - trailer_bps;
    size_t rem_bytes = rem_bps + 1;

    size_t num_whole_lines = len / line_len;
    size_t num_steps = num_whole_lines / 2;

    // there are at least two whole lines (+ trailer) per iteration of
    // this loop. This means that we can revcomp the trailer, skip the
    // trailer (+ newline, on the trailer's side), then revcomp the
    // remainder, skip the remainder (+newline, on the starting side)
    // to maintain the loop invariant.
    for (size_t i = 0; i < num_steps; ++i) {
        reverse_complement_bps(begin, end, trailer_bps);
        begin += trailer_bps;
        end -= trailer_len;

        reverse_complement_bps(begin, end, rem_bps);
        begin += rem_bytes;
        end -= rem_bps;
    }

    // there may be one whole line (+ trailer) remaining. In this
    // case, we do the first step of the above (revcomp the trailer)
    // but *not* the second (revcomp the remainder) because the
    // remainder will overlap in both directions.
    bool has_unpaired_line = (num_whole_lines % 2) > 0;
    if (has_unpaired_line) {
        reverse_complement_bps(begin, end, trailer_bps);
        begin += trailer_bps;
        end -= trailer_len;
    }

    // no *whole* lines remaining, but there may be not-multiline
    // remaining. revcomp these bytes.    
    size_t bps_in_last_line = end - begin;
    size_t swaps_in_last_line = bps_in_last_line/2;
    reverse_complement_bps(begin, end, swaps_in_last_line);

    // edge case: there is exactly one byte in the middle of the input
    // that needs to be complemented but not swapped with anything.
    bool has_unpaired_byte = (bps_in_last_line % 2) > 0;
    if (has_unpaired_byte) {
        begin[swaps_in_last_line] = complement(begin[swaps_in_last_line]);
    }
 }

 void read_up_to(istream& in, vector<char>& out, char delim) {
    constexpr size_t read_size = 1<<16;

    size_t bytes_read = 0;
    out.resize(read_size);
    while (in) {
        in.getline(out.data() + bytes_read, read_size, delim);
        bytes_read += in.gcount();

        if (in.fail()) {
            // failed because it ran out of buffer space. Expand the
            // buffer and perform another read
            out.resize(bytes_read + read_size);
            in.clear(in.rdstate() & ~std::ios::failbit);
        } else if (in.eof()) {
            // hit EOF, rather than delmiter, but an EOF can be
            // treated almost identially to a delmiter, except that we
            // don't remove the delimiter from the read buffer.
            break;
        } else {
            // succeeded in reading *up to and including* the sequence
            // delimiter. Remove the delmiter.
            --bytes_read;
            break;
        }
    }
    out.resize(bytes_read);
 }

 struct Sequence {
    string header;  // not incl. starting delim (>)
    vector<char> seq;  // basepair lines. all lines terminated by newline
 };

 // Read a sequence, starting *after* the first delimiter (>)
 void read_sequence(istream& in, Sequence& out) {
    out.header.resize(0);
    std::getline(in, out.header);
    read_up_to(in, out.seq, '>');
 }

 void reverse_complement(Sequence& s) {
    reverse_complement_basepairs(s.seq.data(), s.seq.data() + s.seq.size());
 }

 void write_sequence(ostream& out, Sequence& s) {
    out << '>';
    out << s.header;
    out << '\n';
    out.write(s.seq.data(), s.seq.size());
 }

 int main() {
    // required for *large* (e.g. 1 GiB) inputs
    std::cin.sync_with_stdio(false);
    std::cout.sync_with_stdio(false);

    // the read function assumes that '>' has already been read
    // (because istream::getline will read it per loop iteration:
    // prevents needing to 'peek' a bunch).
    if (std::cin.get() != '>') {
        throw runtime_error{"unexpected input: next char should be the start of a seqence header"};
    }

    Sequence s;
    while (not std::cin.eof()) {
        read_sequence(std::cin, s);
        reverse_complement(s);
        write_sequence(std::cout, s);
    }
 }
	#define UNLIKELY(x) __builtin_expect(!!(x), 0)

	#include <iostream>
	#include <iterator>
	#include <vector>
	#include <string>
	#include <immintrin.h>

	using std::istream;
	using std::ostream;
	using std::runtime_error;
	using std::vector;
	using std::string;

	constexpr size_t basepairs_in_line = 60;
	constexpr size_t line_len = basepairs_in_line + sizeof('\n');

	// Returns the complement of a a single input character
	char complement(char character) {

	constexpr char complement_lut[] = {
	'\0', 'T', 'V', 'G', 'H', '\0', '\0',
	'C', 'D', '\0', '\n', 'M', '\0', 'K',
	'N', '\0', '\0', '\0', 'Y', 'S', 'A',
	'A', 'B', 'W', '\0', 'R'
	};

	return complement_lut[character & 0x1f];
	}

	__m128i packed(char c) {
	return _mm_set_epi8(c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c);
	}

	__m128i reverse_complement_simd(__m128i v) {
	return v; // todo
	/*
	__m128i index = packed(0x0);
	index = _mm_sub_epi8(index, _mm_cmpgt_epi8(v,));

	__m128i reversed_indices = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
	return _mm_shuffle_epi8(v, reversed_indices);



	//__m128i and_mask = packed(0x1f);
	// __m128i reverse = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
	// _mm_shuffle_epi8(_mm_and_si128(vs, and_mask), reverse))
	return v;
	*/
	}

	// Complement then swap `a` and `b`
	void complement_swap(char* a, char* b) {
	char tmp = complement(*a);
	a = complement(b);
	*b = tmp;
	}

	// Reverse-complement a contiguous range, [begin, end), of bps.
	//
	// precondition: [begin, end) can be reverse-complemented without
	// accounting for newlines etc. (the caller should handle this
	// externally).
	void reverse_complement_bps(char* start, char* end, size_t num_bps) {
	#ifdef SIMD
	while (num_bps >= 16) {
	end -= 16;

	__m128i tmp = _mm_lddqu_si128((__m128i*)start);
	_mm_storeu_si128((__m128i)start, reverse_complement_simd(_mm_lddqu_si128((__m128i)end)));
	_mm_storeu_si128((__m128i*)end, reverse_complement_simd(tmp));

	num_bps -= 16;
	start += 16;
	}
	#else
	// even when not using platform-dependent SIMD, it's still
	// advantageous to manually unroll this loop because most
	// compilers won't (because the compiler can't know that the
	// inputs are usually >16 in size at runtime). This gives a ~10 %
	// speedup on my laptop.
	while (num_bps >= 16) {
	complement_swap(start++, --end);
	complement_swap(start++, --end);
	complement_swap(start++, --end);
	complement_swap(start++, --end);

	complement_swap(start++, --end);
	complement_swap(start++, --end);
	complement_swap(start++, --end);
	complement_swap(start++, --end);

	complement_swap(start++, --end);
	complement_swap(start++, --end);
	complement_swap(start++, --end);
	complement_swap(start++, --end);

	complement_swap(start++, --end);
	complement_swap(start++, --end);
	complement_swap(start++, --end);
	complement_swap(start++, --end);

	num_bps -= 16;
	}
	#endif

	for (size_t i = 0; i < num_bps; ++i) {
	complement_swap(start++, --end);
	}
	}

	// Reverse-complements a FASTA sequence. Unformatted basepair (no
	// header) input. All lines apart from the last line contain exactly
	// 60 basepairs. The last line can contain <= 60 basepairs, and must
	// have a trailing newline.
	//
	// The reason this alg. is more complicated than necessary for several
	// reasons:
	//
	// - If newlines were stripped from the input while reading the input,
	// then memory usage would be ~1/60th lower and this step would be
	// mostly branchless (good). However, writing the output would
	// require re-adding the newlines into some intermediate output
	// buffer before the application emits the output (very bad).
	//
	// - If newlines are not stripped from the input, then they need to be
	// handled by this step. The easiest (<10 LOC) way to handle the
	// newlines is to have an `if (next_char == '\n') skip;` type check
	// on the front and back of the input. However, this introduces two
	// compare + (sometimes) jump operations per basepair swap, plus the
	// main loop invariant, so the resulting loop can end up with 3/4
	// branches. It also prevents doing multi-basepair swaps (SIMD, loop
	// unrolling, etc.). Even without vectorization, that ends up being
	// a 20-35 % perf hit overall.
	//
	// - So we want to optimize this alg. for branchless, preferably
	// multi-basepair, swaps + complements. However, the presence of
	// trailing newlines means that the input might be non-symmetric
	// (i.e. the data cannot be blindly swapped because the newlines
	// will end up in an incorrect location). "Symmetric", in this case,
	// means that swapping the newlines can be done safely because they
	// are at symmetric offsets relative to the beginning and end of the
	// input.
	void reverse_complement_basepairs(char* begin, char* end) {
	if (UNLIKELY(begin == end)) {
	return;
	}

	size_t len = end - begin;
	size_t trailer_len = len % line_len;

	// skip end-of-data, so that `end` points to the last newline in
	// the input (i.e. "just past the end of the last basepair")
	end--;

	// optimal case: all lines in the input are exactly `line_len` in
	// length, with no trailing bps. The relative offsets (from
	// begin/end) of newlines in the data are symmetrical. Therefore,
	// The algorithm can just reverse + complement the entire input,
	// apart from the last newline.
	if (trailer_len == 0) {

	size_t num_pairs = len/2;
	reverse_complement_bps(begin, end, num_pairs);

	bool has_middle_bp = (len % 2) > 0;
	if (has_middle_bp) {
	begin[num_pairs] = complement(begin[num_pairs]);
	}

	return;
	}

	// suboptimal case: the last line in the sequence is < `line_len`
	// (it is a "trailing" line). This means that newlines in the
	// input appear at non-symmetrical offsets relative to `begin` and
	// `end`. Because of this, the algorithm has to carefully step
	// over the newlines so that they aren't reversed into an
	// incorrect location in the output.
	size_t trailer_bps = trailer_len > 0 ? trailer_len - 1 : 0;

	size_t rem_bps = basepairs_in_line - trailer_bps;
	size_t rem_bytes = rem_bps + 1;

	size_t num_whole_lines = len / line_len;
	size_t num_steps = num_whole_lines / 2;

	// there are at least two whole lines (+ trailer) per iteration of
	// this loop. This means that we can revcomp the trailer, skip the
	// trailer (+ newline, on the trailer's side), then revcomp the
	// remainder, skip the remainder (+newline, on the starting side)
	// to maintain the loop invariant.
	for (size_t i = 0; i < num_steps; ++i) {
	reverse_complement_bps(begin, end, trailer_bps);
	begin += trailer_bps;
	end -= trailer_len;

	reverse_complement_bps(begin, end, rem_bps);
	begin += rem_bytes;
	end -= rem_bps;
	}

	// there may be one whole line (+ trailer) remaining. In this
	// case, we do the first step of the above (revcomp the trailer)
	// but not the second (revcomp the remainder) because the
	// remainder will overlap in both directions.
	bool has_unpaired_line = (num_whole_lines % 2) > 0;
	if (has_unpaired_line) {
	reverse_complement_bps(begin, end, trailer_bps);
	begin += trailer_bps;
	end -= trailer_len;
	}

	// no whole lines remaining, but there may be not-multiline
	// remaining. revcomp these bytes.
	size_t bps_in_last_line = end - begin;
	size_t swaps_in_last_line = bps_in_last_line/2;
	reverse_complement_bps(begin, end, swaps_in_last_line);

	// edge case: there is exactly one byte in the middle of the input
	// that needs to be complemented but not swapped with anything.
	bool has_unpaired_byte = (bps_in_last_line % 2) > 0;
	if (has_unpaired_byte) {
	begin[swaps_in_last_line] = complement(begin[swaps_in_last_line]);
	}
	}

	void read_up_to(istream& in, vector<char>& out, char delim) {
	constexpr size_t read_size = 1<<16;

	size_t bytes_read = 0;
	out.resize(read_size);
	while (in) {
	in.getline(out.data() + bytes_read, read_size, delim);
	bytes_read += in.gcount();

	if (in.fail()) {
	// failed because it ran out of buffer space. Expand the
	// buffer and perform another read
	out.resize(bytes_read + read_size);
	in.clear(in.rdstate() & ~std::ios::failbit);
	} else if (in.eof()) {
	// hit EOF, rather than delmiter, but an EOF can be
	// treated almost identially to a delmiter, except that we
	// don't remove the delimiter from the read buffer.
	break;
	} else {
	// succeeded in reading up to and including the sequence
	// delimiter. Remove the delmiter.
	--bytes_read;
	break;
	}
	}
	out.resize(bytes_read);
	}

	struct Sequence {
	string header; // not incl. starting delim (>)
	vector<char> seq; // basepair lines. all lines terminated by newline
	};

	// Read a sequence, starting after the first delimiter (>)
	void read_sequence(istream& in, Sequence& out) {
	out.header.resize(0);
	std::getline(in, out.header);
	read_up_to(in, out.seq, '>');
	}

	void reverse_complement(Sequence& s) {
	reverse_complement_basepairs(s.seq.data(), s.seq.data() + s.seq.size());
	}

	void write_sequence(ostream& out, Sequence& s) {
	out << '>';
	out << s.header;
	out << '\n';
	out.write(s.seq.data(), s.seq.size());
	}

	int main() {
	// required for large (e.g. 1 GiB) inputs
	std::cin.sync_with_stdio(false);
	std::cout.sync_with_stdio(false);

	// the read function assumes that '>' has already been read
	// (because istream::getline will read it per loop iteration:
	// prevents needing to 'peek' a bunch).
	if (std::cin.get() != '>') {
	throw runtime_error{"unexpected input: next char should be the start of a seqence header"};
	}

	Sequence s;
	while (not std::cin.eof()) {
	read_sequence(std::cin, s);
	reverse_complement(s);
	write_sequence(std::cout, s);
	}
	}