Skip to content

Instantly share code, notes, and snippets.

@adamkewley
Created February 24, 2020 11:26
Show Gist options
  • Save adamkewley/cac87d62049b2db40d432a4c23e89f41 to your computer and use it in GitHub Desktop.
Save adamkewley/cac87d62049b2db40d432a4c23e89f41 to your computer and use it in GitHub Desktop.
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
#include <iostream>
#include <iterator>
#include <vector>
#include <string>
#include <immintrin.h>
using std::istream;
using std::ostream;
using std::runtime_error;
using std::vector;
using std::string;
constexpr size_t basepairs_in_line = 60;
constexpr size_t line_len = basepairs_in_line + sizeof('\n');
// Returns the complement of a a single input character
char complement(char character) {
constexpr char complement_lut[] = {
'\0', 'T', 'V', 'G', 'H', '\0', '\0',
'C', 'D', '\0', '\n', 'M', '\0', 'K',
'N', '\0', '\0', '\0', 'Y', 'S', 'A',
'A', 'B', 'W', '\0', 'R'
};
return complement_lut[character & 0x1f];
}
__m128i packed(char c) {
return _mm_set_epi8(c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c);
}
__m128i reverse_complement_simd(__m128i v) {
return v; // todo
/*
__m128i index = packed(0x0);
index = _mm_sub_epi8(index, _mm_cmpgt_epi8(v,));
__m128i reversed_indices = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
return _mm_shuffle_epi8(v, reversed_indices);
//__m128i and_mask = packed(0x1f);
// __m128i reverse = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
// _mm_shuffle_epi8(_mm_and_si128(vs, and_mask), reverse))
return v;
*/
}
// Complement then swap `a` and `b`
void complement_swap(char* a, char* b) {
char tmp = complement(*a);
*a = complement(*b);
*b = tmp;
}
// Reverse-complement a contiguous range, [begin, end), of bps.
//
// precondition: [begin, end) can be reverse-complemented without
// accounting for newlines etc. (the caller should handle this
// externally).
void reverse_complement_bps(char* start, char* end, size_t num_bps) {
#ifdef SIMD
while (num_bps >= 16) {
end -= 16;
__m128i tmp = _mm_lddqu_si128((__m128i*)start);
_mm_storeu_si128((__m128i*)start, reverse_complement_simd(_mm_lddqu_si128((__m128i*)end)));
_mm_storeu_si128((__m128i*)end, reverse_complement_simd(tmp));
num_bps -= 16;
start += 16;
}
#else
// even when not using platform-dependent SIMD, it's still
// advantageous to manually unroll this loop because most
// compilers won't (because the compiler can't know that the
// inputs are usually >16 in size at runtime). This gives a ~10 %
// speedup on my laptop.
while (num_bps >= 16) {
complement_swap(start++, --end);
complement_swap(start++, --end);
complement_swap(start++, --end);
complement_swap(start++, --end);
complement_swap(start++, --end);
complement_swap(start++, --end);
complement_swap(start++, --end);
complement_swap(start++, --end);
complement_swap(start++, --end);
complement_swap(start++, --end);
complement_swap(start++, --end);
complement_swap(start++, --end);
complement_swap(start++, --end);
complement_swap(start++, --end);
complement_swap(start++, --end);
complement_swap(start++, --end);
num_bps -= 16;
}
#endif
for (size_t i = 0; i < num_bps; ++i) {
complement_swap(start++, --end);
}
}
// Reverse-complements a FASTA sequence. Unformatted basepair (no
// header) input. All lines apart from the last line contain *exactly*
// 60 basepairs. The last line can contain <= 60 basepairs, and must
// have a trailing newline.
//
// The reason this alg. is more complicated than necessary for several
// reasons:
//
// - If newlines were stripped from the input while reading the input,
// then memory usage would be ~1/60th lower and this step would be
// mostly branchless (good). However, writing the output would
// require re-adding the newlines into some intermediate output
// buffer before the application emits the output (very bad).
//
// - If newlines are not stripped from the input, then they need to be
// handled by this step. The easiest (<10 LOC) way to handle the
// newlines is to have an `if (next_char == '\n') skip;` type check
// on the front and back of the input. However, this introduces two
// compare + (sometimes) jump operations per basepair swap, plus the
// main loop invariant, so the resulting loop can end up with 3/4
// branches. It also prevents doing multi-basepair swaps (SIMD, loop
// unrolling, etc.). Even without vectorization, that ends up being
// a 20-35 % perf hit overall.
//
// - So we want to optimize this alg. for branchless, preferably
// multi-basepair, swaps + complements. However, the presence of
// trailing newlines means that the input might be non-symmetric
// (i.e. the data cannot be blindly swapped because the newlines
// will end up in an incorrect location). "Symmetric", in this case,
// means that swapping the newlines can be done safely because they
// are at symmetric offsets relative to the beginning and end of the
// input.
void reverse_complement_basepairs(char* begin, char* end) {
if (UNLIKELY(begin == end)) {
return;
}
size_t len = end - begin;
size_t trailer_len = len % line_len;
// skip end-of-data, so that `end` points to the last newline in
// the input (i.e. "just past the end of the last basepair")
end--;
// optimal case: all lines in the input are exactly `line_len` in
// length, with no trailing bps. The relative offsets (from
// begin/end) of newlines in the data are symmetrical. Therefore,
// The algorithm can just reverse + complement the entire input,
// apart from the last newline.
if (trailer_len == 0) {
size_t num_pairs = len/2;
reverse_complement_bps(begin, end, num_pairs);
bool has_middle_bp = (len % 2) > 0;
if (has_middle_bp) {
begin[num_pairs] = complement(begin[num_pairs]);
}
return;
}
// suboptimal case: the last line in the sequence is < `line_len`
// (it is a "trailing" line). This means that newlines in the
// input appear at non-symmetrical offsets relative to `begin` and
// `end`. Because of this, the algorithm has to carefully step
// over the newlines so that they aren't reversed into an
// incorrect location in the output.
size_t trailer_bps = trailer_len > 0 ? trailer_len - 1 : 0;
size_t rem_bps = basepairs_in_line - trailer_bps;
size_t rem_bytes = rem_bps + 1;
size_t num_whole_lines = len / line_len;
size_t num_steps = num_whole_lines / 2;
// there are at least two whole lines (+ trailer) per iteration of
// this loop. This means that we can revcomp the trailer, skip the
// trailer (+ newline, on the trailer's side), then revcomp the
// remainder, skip the remainder (+newline, on the starting side)
// to maintain the loop invariant.
for (size_t i = 0; i < num_steps; ++i) {
reverse_complement_bps(begin, end, trailer_bps);
begin += trailer_bps;
end -= trailer_len;
reverse_complement_bps(begin, end, rem_bps);
begin += rem_bytes;
end -= rem_bps;
}
// there may be one whole line (+ trailer) remaining. In this
// case, we do the first step of the above (revcomp the trailer)
// but *not* the second (revcomp the remainder) because the
// remainder will overlap in both directions.
bool has_unpaired_line = (num_whole_lines % 2) > 0;
if (has_unpaired_line) {
reverse_complement_bps(begin, end, trailer_bps);
begin += trailer_bps;
end -= trailer_len;
}
// no *whole* lines remaining, but there may be not-multiline
// remaining. revcomp these bytes.
size_t bps_in_last_line = end - begin;
size_t swaps_in_last_line = bps_in_last_line/2;
reverse_complement_bps(begin, end, swaps_in_last_line);
// edge case: there is exactly one byte in the middle of the input
// that needs to be complemented but not swapped with anything.
bool has_unpaired_byte = (bps_in_last_line % 2) > 0;
if (has_unpaired_byte) {
begin[swaps_in_last_line] = complement(begin[swaps_in_last_line]);
}
}
void read_up_to(istream& in, vector<char>& out, char delim) {
constexpr size_t read_size = 1<<16;
size_t bytes_read = 0;
out.resize(read_size);
while (in) {
in.getline(out.data() + bytes_read, read_size, delim);
bytes_read += in.gcount();
if (in.fail()) {
// failed because it ran out of buffer space. Expand the
// buffer and perform another read
out.resize(bytes_read + read_size);
in.clear(in.rdstate() & ~std::ios::failbit);
} else if (in.eof()) {
// hit EOF, rather than delmiter, but an EOF can be
// treated almost identially to a delmiter, except that we
// don't remove the delimiter from the read buffer.
break;
} else {
// succeeded in reading *up to and including* the sequence
// delimiter. Remove the delmiter.
--bytes_read;
break;
}
}
out.resize(bytes_read);
}
struct Sequence {
string header; // not incl. starting delim (>)
vector<char> seq; // basepair lines. all lines terminated by newline
};
// Read a sequence, starting *after* the first delimiter (>)
void read_sequence(istream& in, Sequence& out) {
out.header.resize(0);
std::getline(in, out.header);
read_up_to(in, out.seq, '>');
}
void reverse_complement(Sequence& s) {
reverse_complement_basepairs(s.seq.data(), s.seq.data() + s.seq.size());
}
void write_sequence(ostream& out, Sequence& s) {
out << '>';
out << s.header;
out << '\n';
out.write(s.seq.data(), s.seq.size());
}
int main() {
// required for *large* (e.g. 1 GiB) inputs
std::cin.sync_with_stdio(false);
std::cout.sync_with_stdio(false);
// the read function assumes that '>' has already been read
// (because istream::getline will read it per loop iteration:
// prevents needing to 'peek' a bunch).
if (std::cin.get() != '>') {
throw runtime_error{"unexpected input: next char should be the start of a seqence header"};
}
Sequence s;
while (not std::cin.eof()) {
read_sequence(std::cin, s);
reverse_complement(s);
write_sequence(std::cout, s);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment