Skip to content

Instantly share code, notes, and snippets.

@moonwatcher
Last active August 29, 2015 14:08
Show Gist options
  • Save moonwatcher/2f4c5e0f66da3edbaaba to your computer and use it in GitHub Desktop.
Save moonwatcher/2f4c5e0f66da3edbaaba to your computer and use it in GitHub Desktop.
C++ fastq demultiplexer
/*
FASTQ Demultiplexer
Author: Lior Galanti [email protected]
Build:
On ubuntu:
dependencies: sudo apt-get install lib32z1-dev libboost-iostreams-dev libboost-program-options-dev
compile: g++ -static phenaqs.cpp -lboost_iostreams -lboost_program_options -lz -std=c++11 -o phenaqs
On OS X:
dependencies: brew install boost --c++11 --with-icu
compile: g++ phenaqs.cpp -lboost_iostreams -lboost_program_options -lz -std=c++11 -o phenaqs
*/
#include <stdio.h>
#include <string.h>
#include <string>
#include <unordered_map>
#include <vector>
#include <exception>
#include <fstream>
#include <iostream>
#include <boost/property_tree/exceptions.hpp>
#include <boost/iostreams/filtering_streambuf.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/property_tree/ptree.hpp>
#include <boost/property_tree/json_parser.hpp>
#include <boost/foreach.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/program_options.hpp>
#define MAX_READ_SIZE 2048
#define MAX_READ_ID_SIZE 2048
#define READ_BUFFER_SIZE 2048
#define WRITE_BUFFER_SIZE 2048
using namespace std;
class FastqInputFeed {
private:
string path;
ifstream file;
istream* stream;
boost::iostreams::filtering_streambuf<boost::iostreams::input> buffer;
public:
int key;
FastqInputFeed(){}
~FastqInputFeed(){}
FastqInputFeed( const FastqInputFeed &obj){};
bool done(){
return stream->eof();
}
istream* getStream() {
return stream;
}
void open(string path) {
this->path = path;
file.open(path, ios_base::in | ios_base::binary);
buffer.push(boost::iostreams::gzip_decompressor());
buffer.push(file);
stream = new istream(&buffer);
}
void close() {
buffer.pop();
delete stream;
file.close();
}
};
class FastqOutputFeed {
private:
string path;
ofstream file;
ostream* stream;
boost::iostreams::filtering_streambuf<boost::iostreams::output> buffer;
public:
int key;
FastqOutputFeed(){}
~FastqOutputFeed(){}
FastqOutputFeed( const FastqOutputFeed &obj){};
ostream* getStream() {
return stream;
}
void open(string path) {
this->path = path;
file.open(path, ios_base::out | ios_base::binary);
buffer.push(boost::iostreams::gzip_compressor());
buffer.push(file);
stream = new ostream(&buffer);
}
void close() {
buffer.pop();
delete stream;
file.close();
}
};
class Sequence {
private:
int length;
public:
char* code;
char* quality;
Sequence() {
code = new char[MAX_READ_SIZE]();
quality = new char[MAX_READ_SIZE]();
clear();
}
~Sequence() {
delete code;
delete quality;
}
int getLength(){
return length;
}
void terminate() {
this->code[this->length] = '\0';
this->quality[this->length] = '\0';
}
void clear() {
length = 0;
terminate();
}
void assign(const string& code, const string& quality) {
strcpy(this->code, code.c_str());
strcpy(this->quality, quality.c_str());
this->length = strlen(this->code);
}
void append(const Sequence& other) {
strncpy(this->code + this->length, other.code, other.length);
strncpy(this->quality + this->length, other.quality, other.length);
this->length += other.length;
terminate();
}
void append(const Sequence& other, const int start, const int length) {
// if length is -1, copy up to the end of the read
int actual = length < 0 ? other.length - start : length;
strncpy(this->code + this->length, other.code, actual);
strncpy(this->quality + this->length, other.quality, actual);
this->length += actual;
terminate();
}
int read(FastqInputFeed& feed) {
int result = -1;
if (!feed.done()) {
feed.getStream()->getline(code, MAX_READ_SIZE);
length = strlen(code);
if (!feed.done()) {
feed.getStream()->ignore(2, '\n');
if (!feed.done()) {
feed.getStream()->getline(quality, MAX_READ_SIZE);
result = 1;
}
}
}
return result;
}
void write(FastqOutputFeed& feed) const {
// probably want to be safer here...
*(feed.getStream()) << code << "\n+\n" << quality << endl;
}
};
class Barcode {
private:
public:
Sequence hash;
vector<Sequence> sequences;
Barcode() {
clear();
}
void resize(int n) {
sequences.resize(n);
}
void clear() {
hash.clear();
for(vector<Sequence>::iterator sequence = sequences.begin(); sequence != sequences.end(); sequence++) {
sequence->clear();
}
}
void makeHash(){
hash.clear();
for(vector<Sequence>::iterator sequence = sequences.begin(); sequence != sequences.end(); sequence++) {
hash.append(*sequence);
}
}
void append(const Sequence& sequence, const int key, const int start, const int length) {
this->sequences[key].append(sequence, start, length);
}
void assign(const int key, const string& code, const string& quality) {
sequences[key].assign(code, quality);
}
bool match(const Barcode& other, const vector<int>& missmatches) {
bool result = true;
for(vector<Sequence>::size_type i = 0; i<sequences.size(); i++) {
int missmatch = 0;
for(int j=0; j<sequences[i].getLength(); j++) {
if(sequences[i].code[j] != other.sequences[i].code[j]) {
missmatch++;
if(missmatch > missmatches[i]) {
result = false;
break;
}
}
}
if(!result) { break; }
}
return result;
}
};
class ReadID {
private:
int length;
char* value;
char* prefix;
int nibble;
char filtered;
char* flags;
char* barcode;
char* buffer;
public:
ReadID() {
value = new char[MAX_READ_ID_SIZE]();
prefix = new char[MAX_READ_ID_SIZE]();
flags = new char[MAX_READ_ID_SIZE]();
barcode = new char[MAX_READ_ID_SIZE]();
buffer = new char[MAX_READ_ID_SIZE]();
clear();
}
void copy(const ReadID& other) {
clear();
strncpy(this->value, other.value, other.length);
length = other.length;
}
void parse() {
/*
This will only work with the latest Illumina ID
@HWI-ST911:232:HABDFADXX:1:1101:1224:1932 1:N:0:
0 Instrument ID @HWI-ST911
1 Run count 232
2 Flowcell ID HABDFADXX
3 Lane number 1
4 Tile 1101
5 x coordinate 1224
6 y coordinate 1932
7 Nibble number 2
8 Filtered N
9 Flags 0
10 Barcode CGATGT
Prefix $0:$1:$2:$3:$4:$5:$6
nibble $7
filtered $8
flags $9
barcode $10
@HWI-ST911:232:HABDFADXX:1:1101:1224:1932 1:N:0:
*/
int offset = 0;
int position = 0;
for(int i = 0; i<length; i++) {
if(position == 0 && value[i] == ' '){
position++;
this->prefix[offset] = '\0';
offset = 0;
} else if(position > 0 && value[i] == ':') {
position++;
}
switch (position) {
case 0:
this->prefix[offset] = value[i];
offset++;
break;
case 2:
this->nibble = boost::lexical_cast<int>(value[i-1]);
position++;
break;
case 3:
this->filtered = value[i];
break;
case 4:
position++;
break;
case 5:
this->flags[offset] = value[i];
offset++;
break;
case 6:
position++;
this->flags[offset] = '\0';
offset = 0;
break;
case 7:
this->barcode[offset] = value[i];
offset++;
break;
}
}
this->barcode[offset] = '\0';
}
~ReadID() {
delete value;
delete prefix;
delete flags;
delete barcode;
}
void terminate() {
this->value[this->length] = '\0';
}
void clear() {
length = 0;
this->prefix[0] = '\0';
this->flags[0] = '\0';
this->barcode[0] = '\0';
this->buffer[0] = '\0';
terminate();
}
int read(FastqInputFeed& feed) {
int result = -1;
if (!feed.done()) {
feed.getStream()->getline(value, MAX_READ_ID_SIZE);
length = strlen(value);
result = 1;
}
return result;
}
void write(FastqOutputFeed& feed, const int nibble, const Barcode& barcode) const {
sprintf( buffer, "%s %d:%c:%s:%s",
prefix,
nibble + 1, // internally nibbles are zero based but on the read id are 1 based
filtered,
flags,
barcode.hash.code
);
*(feed.getStream()) << buffer << endl;
}
};
class Nibble {
public:
int key;
ReadID readId;
Sequence sequence;
void clear() {
readId.clear();
sequence.clear();
}
int read(FastqInputFeed& feed) {
int result = readId.read(feed);
if(result > 0) { result = sequence.read(feed); }
return result;
}
void write(FastqOutputFeed& feed, const int key, const Barcode& barcode) const {
readId.write(feed, key, barcode);
sequence.write(feed);
}
};
class Read {
public:
Barcode barcode;
vector<Nibble> input;
vector<Nibble> output;
Read() {
input.resize(2);
output.resize(2);
int i = 0;
for(vector<Nibble>::iterator nibble = input.begin(); nibble != input.end(); nibble++) {
nibble->key = i;
i++;
}
i = 0;
for(vector<Nibble>::iterator nibble = output.begin(); nibble != output.end(); nibble++) {
nibble->key = i;
i++;
}
}
void append(const int source, const int target, const int start, const int length){
this->output[target].sequence.append(this->input[source].sequence, start, length);
}
void clear() {
barcode.clear();
for(vector<Nibble>::iterator nibble = input.begin(); nibble != input.end(); nibble++) {
nibble->clear();
}
for(vector<Nibble>::iterator nibble = output.begin(); nibble != output.end(); nibble++) {
nibble->clear();
}
}
};
class Fragment {
public:
int nibble;
int start;
int length;
};
class Library {
private:
vector<FastqOutputFeed> streams;
public:
int count;
Barcode barcode;
Library() {
count = 0;
streams.resize(1);
}
void openStream(const int key, const string path) {
this->streams[key].key = key;
this->streams[key].open(path);
}
void write(const Read& read) {
for(vector<FastqOutputFeed>::iterator iterator = streams.begin(); iterator != streams.end(); iterator++) {
// Write with the library's barcode, not with the read's
// this will make sure read with missmatched barcodes within range
// get written with the imputed barcode
read.output[iterator->key].write(*iterator, iterator->key, this->barcode);
}
count ++;
}
void close() {
for(vector<FastqOutputFeed>::iterator iterator = streams.begin(); iterator != streams.end(); iterator++) {
iterator->close();
}
}
};
class Feed {
private:
int count;
vector<FastqInputFeed> streams;
public:
Feed() {
count = 0;
}
void resize(int n) {
streams.resize(n);
}
int getCount() {
return count;
}
void openStream(const int key, const string path) {
this->streams[key].key = key;
this->streams[key].open(path);
}
int read(Read& read) {
int result = -1;
for(vector<FastqInputFeed>::iterator iterator = streams.begin(); iterator != streams.end(); iterator++) {
result = read.input[iterator->key].read(*iterator);
if (result < 0) {
read.clear();
break;
}
}
if (result > 0) { count++; }
return result;
}
void close() {
for(vector<FastqInputFeed>::iterator iterator = streams.begin(); iterator != streams.end(); iterator++) {
iterator->close();
}
}
};
class Configuration {
private:
void loadFragmentPattern(const vector<string> pattern) {
fragmentPattern.resize(pattern.size());
for(vector<string>::size_type i = 0; i < pattern.size(); i++) {
fragmentPattern[i].resize(3);
vector<string> parsed(3);
boost::split(parsed, pattern[i], boost::is_any_of(":"));
try {
fragmentPattern[i][0] = boost::lexical_cast<int>(parsed[0]);
} catch(boost::bad_lexical_cast) {
cout << "Bad ouput reference" << endl;
}
fragmentPattern[i][1] = parsed[1].empty() ? 0 : boost::lexical_cast<int>(parsed[1]);
fragmentPattern[i][2] = parsed[2].empty() ? -1 : boost::lexical_cast<int>(parsed[2]);
}
}
void loadLibraryPattern(const vector<string> pattern) {
libraryPattern.resize(pattern.size());
for(vector<string>::size_type i = 0; i < pattern.size(); i++) {
vector<string> parsed;
boost::split(parsed, pattern[i], boost::is_any_of(":"));
try {
for(vector<string>::iterator j = parsed.begin(); j<parsed.end(); j++){
libraryPattern[i].push_back(boost::lexical_cast<int>(*j));
}
} catch(boost::bad_lexical_cast) {
cout << "Bad library reference" << endl;
}
}
}
void loadBarcodePattern(const vector<string> pattern) {
barcodePattern.resize(pattern.size());
for(vector<string>::size_type i = 0; i < pattern.size(); i++) {
vector<string> parsed;
boost::split(parsed, pattern[i], boost::is_any_of(":"));
try {
for(vector<string>::iterator j = parsed.begin(); j<parsed.end(); j++){
barcodePattern[i].push_back(boost::lexical_cast<int>(*j));
}
} catch(boost::bad_lexical_cast) {
cout << "Bad barcode reference" << endl;
}
}
}
void loadConfigurationFile() {
using boost::property_tree::ptree;
ptree tree;
read_json(path, tree);
BOOST_FOREACH(ptree::value_type &i, tree.get_child("--input")) {
this->input.push_back(i.second.get_value<string>());
}
vector<string> pattern;
BOOST_FOREACH(ptree::value_type &i, tree.get_child("--fragment-pattern")) {
pattern.push_back(i.second.get_value<string>());
}
loadFragmentPattern(pattern);
BOOST_FOREACH(ptree::value_type &i, tree.get_child("--output")) {
this->output.push_back(i.second.get_value<string>());
}
pattern.clear();
BOOST_FOREACH(ptree::value_type &i, tree.get_child("--library-pattern")) {
pattern.push_back(i.second.get_value<string>());
}
loadLibraryPattern(pattern);
pattern.clear();
BOOST_FOREACH(ptree::value_type &i, tree.get_child("--barcode-pattern")) {
pattern.push_back(i.second.get_value<string>());
}
loadBarcodePattern(pattern);
BOOST_FOREACH(ptree::value_type &i, tree.get_child("--barcode-missmatch")) {
this->barcodeMissmatch.push_back(i.second.get_value<int>());
}
BOOST_FOREACH(ptree::value_type &i, tree.get_child("--barcode-sequence")) {
this->barcodeSequence.push_back(i.second.get_value<string>());
}
undetermined = tree.get<int>("--undetermined", -1);
}
public:
string path;
vector<string> input;
vector<string> output;
vector<vector<int>> fragmentPattern;
vector<vector<int>> libraryPattern;
vector<vector<int>> barcodePattern;
vector<string> barcodeSequence;
vector<int> barcodeMissmatch;
int undetermined;
int id;
int librarySize;
int libraryCount;
int barcodeSize;
bool parse(int argc, char** argv) {
vector<string> fragmentPattern;
vector<string> libraryPattern;
vector<string> barcodePattern;
namespace po = boost::program_options;
po::options_description description("Allowed options");
description.add_options()
("help,h", "show this help message")
("conf,c", po::value<string>(&path), "configuration file")
("input,i", po::value< vector<string> >(&input), "input path")
("output,o", po::value< vector<string> >(&output), "output path")
("undetermined,u", po::value<int>(&undetermined), "undetermined output")
("id,d", po::value<int>(&id), "input to provide read id")
("fragment,f", po::value< vector<string> >(&fragmentPattern), "fragment pattern")
("library,l", po::value< vector<string> >(&libraryPattern), "library pattern")
("barcode,b", po::value< vector<string> >(&barcodePattern), "barcode pattern")
("missmatch,m", po::value< vector<int> >(&barcodeMissmatch), "allowed missmatch")
("sequence,s", po::value< vector<string> >(&barcodeSequence), "barcode sequence")
;
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, description), vm);
po::notify(vm);
if (vm.count("help") || argc < 2) {
cout << description << "\n";
return false;
} else {
if (vm.count("conf")){
loadConfigurationFile();
} else {
loadFragmentPattern(fragmentPattern);
loadLibraryPattern(libraryPattern);
loadBarcodePattern(barcodePattern);
}
barcodeSize = this->barcodePattern.size();
librarySize = this->libraryPattern.size();
libraryCount = (librarySize) ? (int)(this->output.size() / librarySize) : 0;
}
return true;
}
void print() {
cout << "Input" << endl;
for(vector<string>::size_type i=0; i<input.size(); i++){
cout << i << " : " << input[i] << endl;
}
cout << "Output" << endl;
for(vector<string>::size_type i=0; i<output.size(); i++){
cout << i << " : " << output[i] << endl;
}
cout << "Fragment Pattern" << endl;
for(vector<vector<int> >::size_type i=0; i<fragmentPattern.size(); i++){
cout << fragmentPattern[i][0] << " : " << fragmentPattern[i][1] << " : " << fragmentPattern[i][2] << endl;
}
cout << "Library Pattern" << endl;
for(vector<vector<int>>::size_type i=0; i<libraryPattern.size(); i++) {
cout << i;
for(vector<int>::size_type j=0; j<libraryPattern[i].size(); j++) {
cout << " : " << libraryPattern[i][j];
}
cout << endl;
}
cout << "Barcode Pattern" << endl;
for(vector<string>::size_type i=0; i<barcodePattern.size(); i++){
cout << i;
for(vector<int>::size_type j=0; j<libraryPattern[i].size(); j++) {
cout << " : " << barcodePattern[i][j];
}
cout << endl;
}
cout << "Barcode Sequence" << endl;
for(vector<string>::size_type i=0; i<barcodeSequence.size(); i++){
cout << i << " : " << barcodeSequence[i] << endl;
}
cout << "Barcode Missmatch" << endl;
for(vector<int>::size_type i=0; i<barcodeMissmatch.size(); i++){
cout << i << " : " << barcodeMissmatch[i] << endl;
}
cout << "Undetermined " << " : " << undetermined << endl;
}
void validate() {
/*
if (vm.count("compression")) {
cout << "Compression level was set to "
<< vm["compression"].as<int>() << ".\n";
} else {
cout << "Compression level was not set.\n";
}
*/
}
};
class Demultiplexer {
public:
Configuration& configuration;
Feed feed;
vector<Fragment> fragments;
vector<Library> libraries;
unordered_map<string, Library*> barcodeLookup;
Library* undetermined = NULL;
Demultiplexer(Configuration& configuration) : configuration(configuration) {}
void open() {
// Populate the feed with the input streams
feed.resize(configuration.input.size());
for(vector<string>::size_type i=0; i<configuration.input.size(); i++) {
feed.openStream(i, configuration.input[i]);
}
// Populate the libraries with the output streams
int k = 0;
libraries.resize(configuration.libraryCount);
for(vector<Library>::iterator i = libraries.begin(); i != libraries.end(); i++) {
for(int j=0; j<configuration.librarySize; j++) {
i->openStream(j, configuration.output[k]);
k++;
}
}
// Populate the fragments
fragments.resize(configuration.fragmentPattern.size());
for(vector<Fragment>::size_type i=0; i<fragments.size(); i++) {
fragments[i].nibble = configuration.fragmentPattern[i][0];
fragments[i].start = configuration.fragmentPattern[i][1];
fragments[i].length = configuration.fragmentPattern[i][2];
}
// Populate the library's barcodes the identify it
k = 0;
for(vector<Library>::iterator i = libraries.begin(); i != libraries.end(); i++) {
i->barcode.resize(configuration.barcodeSize);
for(int j=0; j<configuration.barcodeSize; j++) {
string quality(configuration.barcodeSequence[k].length(), 'N');
i->barcode.assign(j, configuration.barcodeSequence[k], quality);
k++;
}
i->barcode.makeHash();
}
// For perfect matches we can hash each Library by the concatination of all its barcodes
for(vector< Library >::iterator i = libraries.begin(); i != libraries.end(); i++) {
barcodeLookup[string(i->barcode.hash.code)] = &(*i);
}
// Asign the undetermined pointer if one was specified
if (configuration.undetermined > 0) {
undetermined = &libraries[configuration.undetermined];
}
}
void close() {
feed.close();
for(vector< Library >::iterator library = libraries.begin(); library != libraries.end(); library++) {
library->close();
}
}
void choose(const Read& read) {
bool matched = false;
char* hash = read.barcode.hash.code;
if(this->barcodeLookup.count(hash) > 0){
Library* library = this->barcodeLookup[hash];
library->write(read);
matched = true;
} else {
matched = chooseWithMissmatch(read);
if(!matched && undetermined != NULL) {
undetermined->write(read);
}
}
}
bool chooseWithMissmatch(const Read& read) {
bool matched = false;
for(vector<Library>::iterator iterator = libraries.begin(); iterator != libraries.end(); iterator++) {
if(iterator->barcode.match(read.barcode, configuration.barcodeMissmatch)) {
iterator->write(read);
matched = true;
break;
}
}
return matched;
}
void transform(Read& read) {
// Populate the output nibbles with the library fragments
for(vector< vector<int> >::size_type i = 0; i < configuration.libraryPattern.size(); i++) {
for(vector<int>::iterator iterator = configuration.libraryPattern[i].begin(); iterator != configuration.libraryPattern[i].end(); iterator++) {
read.append(fragments[*iterator].nibble, i, fragments[*iterator].start, fragments[*iterator].length);
}
}
// Copy the read id from the first input nibble to the output nibbles
for (vector<Nibble>::iterator iterator = read.output.begin(); iterator != read.output.end(); iterator++) {
iterator->readId.copy(read.input[0].readId);
// don't forget to parse the read id
iterator->readId.parse();
}
// Go over the barcode fragment specifications and populate the barcode with the fragments
for(vector< vector<int> >::size_type i = 0; i < configuration.barcodePattern.size(); i++) {
for(vector<int>::iterator iterator = configuration.barcodePattern[i].begin(); iterator != configuration.barcodePattern[i].end(); iterator++) {
read.barcode.append(read.input[fragments[*iterator].nibble].sequence, i, fragments[*iterator].start, fragments[*iterator].length);
}
}
// create the hash for the library
read.barcode.makeHash();
}
int next(Read& read){
read.clear();
int result = feed.read(read);
if(result > 0) {
this->transform(read);
this->choose(read);
// cout << feed.getCount() << endl;
}
return result;
}
void run(){
Read read;
read.input.resize(configuration.input.size());
read.output.resize(configuration.librarySize);
read.barcode.sequences.resize(configuration.barcodeSize);
int result = 0;
while(this->next(read) > 0) {
result ++;
}
}
void statistics() {
int total = 0;
for(vector< Library >::size_type i = 0; i < libraries.size(); i++) {
total += libraries[i].count;
cout << i << " : " << libraries[i].count << endl;
}
cout << "Total : " << total << endl;
}
};
int main(int argc, char** argv) {
Configuration configuration;
if (configuration.parse(argc,argv)) {
configuration.print();
Demultiplexer* demultiplexer = new Demultiplexer(configuration);
demultiplexer->open();
demultiplexer->run();
demultiplexer->statistics();
demultiplexer->close();
delete demultiplexer;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment