Last active
January 12, 2018 21:49
-
-
Save dnbaker/ce3391a9dad1c377ceb04b18104a31fa to your computer and use it in GitHub Desktop.
Convert a FASTA file to a packed 2-bit-per-base binary representation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <string>
/**
 * Accumulator for a single 64-bit word of packed output.
 *
 * `data` holds the bits gathered so far and `count` is a caller-maintained
 * tally of how many items have been packed into it; the struct itself never
 * interprets either field beyond zeroing them.
 */
struct cdata {
    uint64_t data = 0;
    unsigned count = 0;

    cdata() = default;

    /// Clears both the packed bits and the item counter.
    void reset() {
        data = 0;
        count = 0;
    }

    /**
     * Emits the first `nbytes` bytes of `data`'s in-memory representation to
     * `stream`, then resets the accumulator.
     *
     * @tparam T      Any type exposing `write(char *, size_t)`-compatible
     *                member (e.g. std::ostream).
     * @param stream  Destination to write to.
     * @param nbytes  Number of bytes to emit; must not exceed sizeof(data).
     *                Defaults to the whole word.
     */
    template<typename T>
    void write(T &stream, size_t nbytes = sizeof(data)) {
        // Guard against reading past the end of the 8-byte member.
        assert(nbytes <= sizeof(data));
        stream.write(reinterpret_cast<char *>(&data), nbytes);
        reset();
    }
};
/**
 * Packs the nucleotides of a FASTA-style text file into 2 bits per base.
 *
 * Usage: prog <infile> [outfile]   (output defaults to /dev/stdout)
 *
 * Lines whose first character is '@', '#' or '>' are skipped. In every other
 * line, A/C/G/T (either case) are encoded as 0/1/2/3 and all remaining
 * characters are ignored. Bases fill each 64-bit word starting from the
 * least-significant bit pair; a word is written as soon as it holds 32 bases,
 * and a trailing partial word is truncated to ceil(count / 4) bytes.
 *
 * @return EXIT_SUCCESS on success; EXIT_FAILURE on bad usage, if either file
 *         cannot be opened, or if writing the output fails.
 */
int main(int argc, char *argv[]) {
    if (argc < 2) {
        std::fprintf(stderr, "Usage: %s <infile> [outfile]\n", argv[0]);
        return EXIT_FAILURE;
    }
    std::ios_base::sync_with_stdio(false);

    std::ifstream ins(argv[1]);
    if (!ins) {
        std::fprintf(stderr, "Error: could not open input file '%s'\n", argv[1]);
        return EXIT_FAILURE;
    }

    const char *const outpath = argc > 2 ? argv[2] : "/dev/stdout";
    std::ofstream outs(outpath, std::ios::binary | std::ios::out);
    if (!outs) {
        std::fprintf(stderr, "Error: could not open output file '%s'\n", outpath);
        return EXIT_FAILURE;
    }

    // A 64-bit word holds 32 two-bit codes.
    constexpr unsigned bases_per_word = 32;

    std::string line;
    cdata data;
    while (std::getline(ins, line)) {
        if (line.empty()) continue;
        const char first = line[0];
        if (first == '@' || first == '#' || first == '>') continue;

        for (const char c : line) {
            uint64_t code;
            switch (c) {
                case 'a': case 'A': code = 0; break;
                case 'c': case 'C': code = 1; break;
                case 'g': case 'G': code = 2; break;
                case 't': case 'T': code = 3; break;
                default: continue;  // skip anything that is not A/C/G/T
            }
            // Place the i-th base of the word at bits [2i, 2i+1].
            data.data |= code << (2 * data.count);
            if (++data.count == bases_per_word) data.write(outs);
        }
    }

    // Flush the partial word, keeping only the bytes that carry bases.
    // NOTE: writing the leading bytes of the word yields the low-order bits
    // only on a little-endian host.
    const size_t tail_bytes = (data.count + 3) / 4;
    if (tail_bytes != 0) data.write(outs, tail_bytes);

    outs.flush();
    if (!outs) {
        std::fprintf(stderr, "Error: failed writing output '%s'\n", outpath);
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment