Created
February 24, 2020 11:26
-
-
Save adamkewley/cac87d62049b2db40d432a4c23e89f41 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define UNLIKELY(x) __builtin_expect(!!(x), 0) | |
#include <iostream> | |
#include <iterator> | |
#include <vector> | |
#include <string> | |
#include <immintrin.h> | |
using std::istream; | |
using std::ostream; | |
using std::runtime_error; | |
using std::vector; | |
using std::string; | |
constexpr size_t basepairs_in_line = 60; | |
constexpr size_t line_len = basepairs_in_line + sizeof('\n'); | |
// Returns the complement of a a single input character | |
char complement(char character) { | |
constexpr char complement_lut[] = { | |
'\0', 'T', 'V', 'G', 'H', '\0', '\0', | |
'C', 'D', '\0', '\n', 'M', '\0', 'K', | |
'N', '\0', '\0', '\0', 'Y', 'S', 'A', | |
'A', 'B', 'W', '\0', 'R' | |
}; | |
return complement_lut[character & 0x1f]; | |
} | |
__m128i packed(char c) { | |
return _mm_set_epi8(c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c); | |
} | |
__m128i reverse_complement_simd(__m128i v) { | |
return v; // todo | |
/* | |
__m128i index = packed(0x0); | |
index = _mm_sub_epi8(index, _mm_cmpgt_epi8(v,)); | |
__m128i reversed_indices = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); | |
return _mm_shuffle_epi8(v, reversed_indices); | |
//__m128i and_mask = packed(0x1f); | |
// __m128i reverse = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); | |
// _mm_shuffle_epi8(_mm_and_si128(vs, and_mask), reverse)) | |
return v; | |
*/ | |
} | |
// Complement then swap `a` and `b` | |
void complement_swap(char* a, char* b) { | |
char tmp = complement(*a); | |
*a = complement(*b); | |
*b = tmp; | |
} | |
// Reverse-complement a contiguous range, [begin, end), of bps. | |
// | |
// precondition: [begin, end) can be reverse-complemented without | |
// accounting for newlines etc. (the caller should handle this | |
// externally). | |
void reverse_complement_bps(char* start, char* end, size_t num_bps) { | |
#ifdef SIMD | |
while (num_bps >= 16) { | |
end -= 16; | |
__m128i tmp = _mm_lddqu_si128((__m128i*)start); | |
_mm_storeu_si128((__m128i*)start, reverse_complement_simd(_mm_lddqu_si128((__m128i*)end))); | |
_mm_storeu_si128((__m128i*)end, reverse_complement_simd(tmp)); | |
num_bps -= 16; | |
start += 16; | |
} | |
#else | |
// even when not using platform-dependent SIMD, it's still | |
// advantageous to manually unroll this loop because most | |
// compilers won't (because the compiler can't know that the | |
// inputs are usually >16 in size at runtime). This gives a ~10 % | |
// speedup on my laptop. | |
while (num_bps >= 16) { | |
complement_swap(start++, --end); | |
complement_swap(start++, --end); | |
complement_swap(start++, --end); | |
complement_swap(start++, --end); | |
complement_swap(start++, --end); | |
complement_swap(start++, --end); | |
complement_swap(start++, --end); | |
complement_swap(start++, --end); | |
complement_swap(start++, --end); | |
complement_swap(start++, --end); | |
complement_swap(start++, --end); | |
complement_swap(start++, --end); | |
complement_swap(start++, --end); | |
complement_swap(start++, --end); | |
complement_swap(start++, --end); | |
complement_swap(start++, --end); | |
num_bps -= 16; | |
} | |
#endif | |
for (size_t i = 0; i < num_bps; ++i) { | |
complement_swap(start++, --end); | |
} | |
} | |
// Reverse-complements a FASTA sequence. Unformatted basepair (no | |
// header) input. All lines apart from the last line contain *exactly* | |
// 60 basepairs. The last line can contain <= 60 basepairs, and must | |
// have a trailing newline. | |
// | |
// The reason this alg. is more complicated than necessary for several | |
// reasons: | |
// | |
// - If newlines were stripped from the input while reading the input, | |
// then memory usage would be ~1/60th lower and this step would be | |
// mostly branchless (good). However, writing the output would | |
// require re-adding the newlines into some intermediate output | |
// buffer before the application emits the output (very bad). | |
// | |
// - If newlines are not stripped from the input, then they need to be | |
// handled by this step. The easiest (<10 LOC) way to handle the | |
// newlines is to have an `if (next_char == '\n') skip;` type check | |
// on the front and back of the input. However, this introduces two | |
// compare + (sometimes) jump operations per basepair swap, plus the | |
// main loop invariant, so the resulting loop can end up with 3/4 | |
// branches. It also prevents doing multi-basepair swaps (SIMD, loop | |
// unrolling, etc.). Even without vectorization, that ends up being | |
// a 20-35 % perf hit overall. | |
// | |
// - So we want to optimize this alg. for branchless, preferably | |
// multi-basepair, swaps + complements. However, the presence of | |
// trailing newlines means that the input might be non-symmetric | |
// (i.e. the data cannot be blindly swapped because the newlines | |
// will end up in an incorrect location). "Symmetric", in this case, | |
// means that swapping the newlines can be done safely because they | |
// are at symmetric offsets relative to the beginning and end of the | |
// input. | |
void reverse_complement_basepairs(char* begin, char* end) { | |
if (UNLIKELY(begin == end)) { | |
return; | |
} | |
size_t len = end - begin; | |
size_t trailer_len = len % line_len; | |
// skip end-of-data, so that `end` points to the last newline in | |
// the input (i.e. "just past the end of the last basepair") | |
end--; | |
// optimal case: all lines in the input are exactly `line_len` in | |
// length, with no trailing bps. The relative offsets (from | |
// begin/end) of newlines in the data are symmetrical. Therefore, | |
// The algorithm can just reverse + complement the entire input, | |
// apart from the last newline. | |
if (trailer_len == 0) { | |
size_t num_pairs = len/2; | |
reverse_complement_bps(begin, end, num_pairs); | |
bool has_middle_bp = (len % 2) > 0; | |
if (has_middle_bp) { | |
begin[num_pairs] = complement(begin[num_pairs]); | |
} | |
return; | |
} | |
// suboptimal case: the last line in the sequence is < `line_len` | |
// (it is a "trailing" line). This means that newlines in the | |
// input appear at non-symmetrical offsets relative to `begin` and | |
// `end`. Because of this, the algorithm has to carefully step | |
// over the newlines so that they aren't reversed into an | |
// incorrect location in the output. | |
size_t trailer_bps = trailer_len > 0 ? trailer_len - 1 : 0; | |
size_t rem_bps = basepairs_in_line - trailer_bps; | |
size_t rem_bytes = rem_bps + 1; | |
size_t num_whole_lines = len / line_len; | |
size_t num_steps = num_whole_lines / 2; | |
// there are at least two whole lines (+ trailer) per iteration of | |
// this loop. This means that we can revcomp the trailer, skip the | |
// trailer (+ newline, on the trailer's side), then revcomp the | |
// remainder, skip the remainder (+newline, on the starting side) | |
// to maintain the loop invariant. | |
for (size_t i = 0; i < num_steps; ++i) { | |
reverse_complement_bps(begin, end, trailer_bps); | |
begin += trailer_bps; | |
end -= trailer_len; | |
reverse_complement_bps(begin, end, rem_bps); | |
begin += rem_bytes; | |
end -= rem_bps; | |
} | |
// there may be one whole line (+ trailer) remaining. In this | |
// case, we do the first step of the above (revcomp the trailer) | |
// but *not* the second (revcomp the remainder) because the | |
// remainder will overlap in both directions. | |
bool has_unpaired_line = (num_whole_lines % 2) > 0; | |
if (has_unpaired_line) { | |
reverse_complement_bps(begin, end, trailer_bps); | |
begin += trailer_bps; | |
end -= trailer_len; | |
} | |
// no *whole* lines remaining, but there may be not-multiline | |
// remaining. revcomp these bytes. | |
size_t bps_in_last_line = end - begin; | |
size_t swaps_in_last_line = bps_in_last_line/2; | |
reverse_complement_bps(begin, end, swaps_in_last_line); | |
// edge case: there is exactly one byte in the middle of the input | |
// that needs to be complemented but not swapped with anything. | |
bool has_unpaired_byte = (bps_in_last_line % 2) > 0; | |
if (has_unpaired_byte) { | |
begin[swaps_in_last_line] = complement(begin[swaps_in_last_line]); | |
} | |
} | |
void read_up_to(istream& in, vector<char>& out, char delim) { | |
constexpr size_t read_size = 1<<16; | |
size_t bytes_read = 0; | |
out.resize(read_size); | |
while (in) { | |
in.getline(out.data() + bytes_read, read_size, delim); | |
bytes_read += in.gcount(); | |
if (in.fail()) { | |
// failed because it ran out of buffer space. Expand the | |
// buffer and perform another read | |
out.resize(bytes_read + read_size); | |
in.clear(in.rdstate() & ~std::ios::failbit); | |
} else if (in.eof()) { | |
// hit EOF, rather than delmiter, but an EOF can be | |
// treated almost identially to a delmiter, except that we | |
// don't remove the delimiter from the read buffer. | |
break; | |
} else { | |
// succeeded in reading *up to and including* the sequence | |
// delimiter. Remove the delmiter. | |
--bytes_read; | |
break; | |
} | |
} | |
out.resize(bytes_read); | |
} | |
struct Sequence { | |
string header; // not incl. starting delim (>) | |
vector<char> seq; // basepair lines. all lines terminated by newline | |
}; | |
// Read a sequence, starting *after* the first delimiter (>) | |
void read_sequence(istream& in, Sequence& out) { | |
out.header.resize(0); | |
std::getline(in, out.header); | |
read_up_to(in, out.seq, '>'); | |
} | |
void reverse_complement(Sequence& s) { | |
reverse_complement_basepairs(s.seq.data(), s.seq.data() + s.seq.size()); | |
} | |
void write_sequence(ostream& out, Sequence& s) { | |
out << '>'; | |
out << s.header; | |
out << '\n'; | |
out.write(s.seq.data(), s.seq.size()); | |
} | |
int main() { | |
// required for *large* (e.g. 1 GiB) inputs | |
std::cin.sync_with_stdio(false); | |
std::cout.sync_with_stdio(false); | |
// the read function assumes that '>' has already been read | |
// (because istream::getline will read it per loop iteration: | |
// prevents needing to 'peek' a bunch). | |
if (std::cin.get() != '>') { | |
throw runtime_error{"unexpected input: next char should be the start of a seqence header"}; | |
} | |
Sequence s; | |
while (not std::cin.eof()) { | |
read_sequence(std::cin, s); | |
reverse_complement(s); | |
write_sequence(std::cout, s); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment