Skip to content

Instantly share code, notes, and snippets.

@dnbaker
Last active January 12, 2018 21:49
Show Gist options
  • Save dnbaker/ce3391a9dad1c377ceb04b18104a31fa to your computer and use it in GitHub Desktop.
Save dnbaker/ce3391a9dad1c377ceb04b18104a31fa to your computer and use it in GitHub Desktop.
Convert fasta file to 2-bit representation
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <string>
/**
 * A 64-bit word paired with a counter.
 *
 * Both fields are public and are manipulated directly by the caller;
 * this type only knows how to zero itself and how to dump the word's raw
 * bytes to a stream.
 */
struct cdata {
    uint64_t data;   // the word being built up by the caller
    unsigned count;  // caller-maintained counter; zeroed together with `data`

    /// Starts out with both fields equal to zero.
    cdata() : data(0), count(0) {}

    /// Zeroes the word and the counter.
    void reset() {
        count = 0;
        data = 0;
    }

    /**
     * Emits the first `nbytes` bytes of the in-memory representation of
     * `data` through `stream.write(ptr, nbytes)`, then calls reset().
     *
     * @param stream  any object exposing a `write(char *, size)` member.
     * @param nbytes  number of bytes to emit; defaults to the whole word.
     */
    template<typename Stream>
    void write(Stream &stream, size_t nbytes = sizeof(data)) {
        char *raw = reinterpret_cast<char *>(&data);
        stream.write(raw, nbytes);
        reset();
    }
};
/**
 * Packs the nucleotides of a text sequence file into 2 bits per base and
 * writes the packed bytes to the output file.
 *
 * Usage: <program> <infile> [outfile]   (outfile defaults to /dev/stdout)
 *
 * Encoding: A=0, C=1, G=2, T=3 (case-insensitive). Lines whose first
 * character is '@', '#' or '>' are skipped entirely. Any other character
 * that is not a/c/g/t is ignored and consumes no bits in the output.
 * Within each 64-bit word the first base occupies the two least-significant
 * bits. Words are written using the host's in-memory byte layout, so the
 * byte order of the output matches the description above only on
 * little-endian hosts.
 *
 * @return EXIT_SUCCESS on success; EXIT_FAILURE on missing arguments, if a
 *         file cannot be opened, or if reading/writing fails.
 */
int main(int argc, char *argv[]) {
    if(argc < 2) {
        // The output path is optional, so the usage line says so.
        std::fprintf(stderr, "Usage: %s <infile> [outfile]\n", argv[0]);
        return EXIT_FAILURE;
    }
    std::ios_base::sync_with_stdio(false);

    std::ifstream ins(argv[1]);
    if(!ins) {
        std::fprintf(stderr, "Could not open input file '%s'\n", argv[1]);
        return EXIT_FAILURE;
    }

    const char *outpath = argc > 2 ? argv[2] : "/dev/stdout";
    std::ofstream outs(outpath, std::ios::binary | std::ios::out);
    if(!outs) {
        std::fprintf(stderr, "Could not open output file '%s'\n", outpath);
        return EXIT_FAILURE;
    }

    std::string line;
    cdata data;
    while(std::getline(ins, line)) {
        // Blank lines carry no bases.
        if(line.empty()) continue;
        // Header / comment lines are skipped wholesale.
        const char first = line[0];
        if(first == '@' || first == '#' || first == '>') continue;

        for(const char c : line) {
            uint64_t code;
            switch(c) {
                case 'a': case 'A': code = 0; break;
                case 'c': case 'C': code = 1; break;
                case 'g': case 'G': code = 2; break;
                case 't': case 'T': code = 3; break;
                default: continue;  // not a base: skip without consuming bits
            }
            // Each new base enters at the top two bits; the shift below
            // moves earlier bases toward bit 0. After 32 bases the first
            // one sits in bits 0-1 and the word is complete.
            data.data |= code << 62;
            if(++data.count == 32) data.write(outs);
            data.data >>= 2;
        }
    }
    if(ins.bad()) {
        std::fprintf(stderr, "Error while reading '%s'\n", argv[1]);
        return EXIT_FAILURE;
    }

    // A partial word has been shifted only `count` times, so its first base
    // is at bit 62 - 2*count. Shift it down to bit 0.
    data.data >>= (32 - data.count - 1) << 1;
    // Emit just enough bytes to hold the remaining bases (4 bases per byte).
    const std::streamsize tail_bytes = (data.count >> 2) + ((data.count & 0x3u) != 0);
    outs.write(reinterpret_cast<const char *>(&data.data), tail_bytes);
    outs.flush();
    if(!outs) {
        std::fprintf(stderr, "Error while writing '%s'\n", outpath);
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment