Created
December 10, 2019 20:57
-
-
Save vellamike/84ec96f6bd9b8ef19d9a00b8f69212c1 to your computer and use it in GitHub Desktop.
kseq vs htslib
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include "seqio.h" | |
#include <zlib.h> | |
extern "C" { | |
#include <htslib/faidx.h> | |
} | |
using namespace klibpp; | |
struct free_deleter | |
{ | |
template <typename T> | |
void operator()(T* x) | |
{ | |
std::free(x); | |
} | |
}; | |
typedef struct | |
{ | |
/// Name of sequence. | |
std::string name; | |
/// Base pair representation of sequence. | |
std::string seq; | |
} FastaSequence; | |
class FastaParserHTS | |
{ | |
public: | |
FastaParserHTS(const std::string& fasta_file); | |
~FastaParserHTS(); | |
int32_t get_num_seqences(); | |
FastaSequence get_sequence_by_id(int32_t i); | |
FastaSequence get_sequence_by_name(const std::string&); | |
private: | |
faidx_t* fasta_index_; | |
mutable std::mutex index_mutex_; | |
int32_t num_seqequences_; | |
}; | |
FastaParserHTS::FastaParserHTS(const std::string& fasta_file) | |
{ | |
fasta_index_ = fai_load3(fasta_file.c_str(), NULL, NULL, FAI_CREATE); | |
if (fasta_index_ == NULL) | |
{ | |
throw std::runtime_error("Could not load fasta index!"); | |
} | |
num_seqequences_ = faidx_nseq(fasta_index_); | |
if (num_seqequences_ == 0) | |
{ | |
fai_destroy(fasta_index_); | |
throw std::runtime_error("FASTA file has 0 sequences"); | |
} | |
} | |
FastaParserHTS::~FastaParserHTS() | |
{ | |
fai_destroy(fasta_index_); | |
} | |
int32_t FastaParserHTS::get_num_seqences() | |
{ | |
return num_seqequences_; | |
} | |
FastaSequence FastaParserHTS::get_sequence_by_name(const std::string& name) | |
{ | |
FastaSequence s{}; | |
{ | |
std::lock_guard<std::mutex> lock(index_mutex_); | |
int32_t length; | |
std::unique_ptr<char, free_deleter> seq(fai_fetch(fasta_index_, name.c_str(), &length)); | |
if (length < 0) | |
{ | |
throw std::runtime_error("Error in reading sequence information for seq ID " + name); | |
} | |
s.name = std::string(name); | |
s.seq = std::string(seq.get()); | |
} | |
return s; | |
} | |
FastaSequence FastaParserHTS::get_sequence_by_id(int32_t i) | |
{ | |
std::string str_name = ""; | |
{ | |
std::lock_guard<std::mutex> lock(index_mutex_); | |
const char* name = faidx_iseq(fasta_index_, i); | |
if (name == NULL) | |
{ | |
throw std::runtime_error("No sequence found for ID " + std::to_string(i)); | |
} | |
str_name = std::string(name); | |
} | |
return get_sequence_by_name(str_name); | |
} | |
class FastaParser | |
{ | |
public: | |
/// \brief FastaParser implementations can have custom destructors, so delcare the abstract dtor as default. | |
virtual ~FastaParser() = default; | |
/// \brief Return number of sequences in FASTA file | |
/// | |
/// \return Sequence count in file | |
virtual int32_t get_num_seqences() const = 0; | |
/// \brief Fetch an entry from the FASTA file by index position in file. | |
/// \param id Position of sequence in file. If id is greater than file size, | |
/// an error is thrown. | |
/// | |
/// \return A FastaSequence object describing the entry. | |
virtual FastaSequence get_sequence_by_id(int32_t id) const = 0; | |
/// \brief Fetch an entry from the FASTA file by name. | |
/// \param name Name of the sequence in FASTA file. If there is no entry | |
/// by that name, an error is thrown. | |
/// | |
/// \return A FastaSequence object describing the entry. | |
virtual FastaSequence get_sequence_by_name(const std::string& name) const = 0; | |
}; | |
char* fasta_filepath = "/home/mike/science/cumap_validation/simulated_data/10M_ref_100x_reads.fasta"; | |
int main(int argc, char* argv[]) | |
{ | |
// First test kseq | |
KSeq record; | |
SeqStreamIn iss(fasta_filepath); | |
std::vector<std::string> seqs1; | |
auto start_time = std::chrono::high_resolution_clock::now(); | |
while (iss >> record) { | |
//std::cout << record.name << std::endl; | |
if (!record.comment.empty()) std::cout << record.comment << std::endl; | |
//std::cout << record.seq << std::endl; | |
seqs1.push_back(record.seq); | |
if (!record.qual.empty()) std::cout << record.qual << std::endl; | |
} | |
auto end_time = std::chrono::high_resolution_clock::now(); | |
std::cerr << "KSEQ++ duration (ms)" << std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count() << std::endl; | |
// Now test htslib | |
FastaParserHTS f = FastaParserHTS(std::string(fasta_filepath)); | |
int num_sequences = f.get_num_seqences(); | |
std::vector<std::string> seqs2; | |
start_time = std::chrono::high_resolution_clock::now(); | |
for(int i=0; i< f.get_num_seqences(); i++){ | |
seqs2.push_back((f.get_sequence_by_id(i).seq)); | |
} | |
end_time = std::chrono::high_resolution_clock::now(); | |
std::cerr << "HTSLib duration (ms)" << std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count() << std::endl; | |
for (auto s: seqs1) { | |
if (s == "blah") { | |
exit(1); | |
} | |
} | |
for (auto s: seqs2) { | |
if (s == "blah") { | |
exit(1); | |
} | |
} | |
assert(seqs1.size() == seqs2.size()); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment