Created
January 12, 2025 17:23
-
-
Save rob-p/6c84313809161e85a8db8fe83d7bfc5c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef STREAMING_QUERY_HPP | |
#define STREAMING_QUERY_HPP | |
#include "../external/pthash/external/essentials/include/essentials.hpp" | |
#include "../include/dictionary.hpp" | |
#include "../include/query/streaming_query_canonical_parsing.hpp" | |
#include "../include/util.hpp" | |
#include <sstream> | |
namespace piscem { | |
constexpr int32_t invalid_query_offset = std::numeric_limits<int32_t>::lowest(); | |
constexpr uint64_t invalid_contig_id = std::numeric_limits<uint64_t>::max(); | |
class simplified_streaming_query { | |
public: | |
inline simplified_streaming_query(sshash::dictionary const* d, | |
std::unique_ptr<sshash::bit_vector_iterator>&& contig_it) | |
: m_d(d) | |
, m_q(d) | |
, m_prev_query_offset(invalid_query_offset) | |
, m_prev_contig_id(invalid_contig_id) | |
, m_prev_kmer_id(invalid_query_offset) | |
, m_ref_contig_it(std::move(contig_it)) | |
, m_k(d->k()) {} | |
simplified_streaming_query(const simplified_streaming_query& other) = delete; | |
simplified_streaming_query(simplified_streaming_query&& other) = default; | |
simplified_streaming_query& operator=(const simplified_streaming_query& other) = delete; | |
simplified_streaming_query& operator=(simplified_streaming_query&& other) = default; | |
~simplified_streaming_query() { | |
// print out statistics about the number of | |
// extension lookups compared to the number | |
// of stateless lookups. | |
auto ns = num_searches(); | |
auto ne = num_extensions(); | |
if (ns > 0) { | |
std::stringstream ss; | |
ss << "\nmethod : " << method() << "\n"; | |
ss << "\nnumber of searches = " << ns << ", number of extensions = " << ne | |
<< ", neighbors = " << m_neighbors | |
<< ", extension ratio = " << static_cast<double>(ne) / static_cast<double>(ns) | |
<< "\n"; | |
std::cerr << ss.str(); | |
} | |
} | |
inline void reset_state() { | |
m_prev_query_offset = invalid_query_offset; | |
m_is_present = false; | |
m_start = true; | |
m_remaining_contig_bases = 0; | |
m_q.reset_state(); | |
} | |
inline void start() { | |
m_prev_query_offset = invalid_query_offset; | |
m_prev_kmer_id = invalid_query_offset; | |
m_is_present = false; | |
m_start = true; | |
m_remaining_contig_bases = 0; | |
m_q.start(); | |
} | |
inline void do_stateless_lookup(const char* kmer_s) { | |
// lookup directly in the index without assuming any relation | |
// to the previously queried k-mer | |
m_prev_res = m_d->lookup_advanced(kmer_s); | |
m_direction = m_prev_res.kmer_orientation ? -1 : 1; | |
m_prev_kmer_id = m_prev_res.kmer_id; | |
m_is_present = (m_prev_res.kmer_id != sshash::constants::invalid_uint64); | |
m_start = !m_is_present; | |
m_n_search += m_is_present ? 1 : 0; | |
if (m_is_present) { | |
uint64_t kmer_offset = 2 * (m_prev_res.kmer_id + (m_prev_res.contig_id * (m_k - 1))); | |
kmer_offset += (m_direction > 0) ? 0 : (2 * m_k); | |
m_ref_contig_it->at(kmer_offset); | |
set_remaining_contig_bases(); | |
} | |
} | |
inline void set_remaining_contig_bases() { | |
// if moving forward, we have (contig-length - (pos + k)) positions left | |
// if moving backward, we have (pos) positions left. | |
m_remaining_contig_bases = | |
(m_direction == 1) ? (m_prev_res.contig_size - (m_prev_res.kmer_id_in_contig + m_k)) | |
: (m_prev_res.kmer_id_in_contig); | |
} | |
/* | |
inline sshash::lookup_result query_lookup(const char* read_str, int32_t read_pos) { | |
auto query_offset = read_pos; | |
// if the current query offset position is | |
// the next position after the stored query | |
// offset position, then we can apply the | |
// relevant optimizations. Otherwise, we | |
// should consider this as basically a "new" | |
// query | |
if (m_prev_query_offset != invalid_query_offset) { | |
if ((m_prev_query_offset + 1) != query_offset) { reset_state(); } | |
} | |
m_prev_query_offset = query_offset; | |
const char* kmer = read_str + read_pos; | |
sshash::lookup_result res = m_q.lookup_advanced(kmer); | |
m_is_present = (res.kmer_id != sshash::constants::invalid_uint64); | |
// uint64_t neighbor_inc = | |
// m_is_present && ((res.kmer_id == m_prev_kmer_id + 1) || | |
// (res.kmer_id == m_prev_kmer_id - 1)); | |
// m_neighbors += neighbor_inc; | |
// m_prev_kmer_id = res.kmer_id; | |
// if (m_is_present && (res.contig_id != m_prev_contig_id)) { | |
// auto start_pos = m_ctg_offsets.access(res.contig_id); | |
// auto end_pos = m_ctg_offsets.access(res.contig_id + 1); | |
// size_t len = end_pos - start_pos; | |
// m_ctg_span = {m_ctg_entries.at(start_pos), | |
// m_ctg_entries.at(start_pos + len), len}; | |
// m_prev_contig_id = res.contig_id; | |
// } | |
return res; | |
} | |
size_t num_searches() { return m_q.num_searches(); } | |
size_t num_extensions() { return m_q.num_extensions(); } | |
std::string method() { return "builtin"; } | |
*/ | |
inline sshash::lookup_result query_lookup(const char* read_str, int32_t read_pos) { | |
auto query_offset = read_pos; | |
// if the current query offset position is | |
// the next position after the stored query | |
// offset position, then we can apply the | |
// relevant optimizations. Otherwise, we | |
// should consider this as basically a "new" | |
// query | |
// | |
// if this isn't the first search after a reset | |
if (m_prev_query_offset != invalid_query_offset) { | |
// if there is no contig left to walk, or if | |
// the current k-mer is not the successor of the | |
// previous k-mer on the query, then reset the state. | |
if (((m_prev_query_offset + 1) != query_offset) || (m_remaining_contig_bases <= 0)) { | |
reset_state(); | |
} | |
} | |
// get the k-mer string | |
const char* kmer_s = read_str + read_pos; | |
// if we need to do a stateless lookup | |
if (m_start) { | |
if (!sshash::util::is_valid(kmer_s, m_k)) { return sshash::lookup_result(); } | |
do_stateless_lookup(kmer_s); | |
} else { | |
// try to get the next k-mer | |
auto query_kmer = sshash::util::string_to_uint_kmer(kmer_s, m_k); | |
if (!sshash::util::is_valid(kmer_s[m_k - 1])) { | |
m_prev_res.kmer_id = sshash::constants::invalid_uint64; | |
reset_state(); | |
return m_prev_res; | |
} | |
uint64_t next_kmer_id = m_prev_kmer_id + m_direction; | |
uint64_t ref_kmer = | |
(m_direction > 0) | |
? (m_ref_contig_it->eat(2), m_ref_contig_it->read(2 * m_k)) | |
: (m_ref_contig_it->eat_reverse(2), m_ref_contig_it->read_reverse(2 * m_k)); | |
m_is_present = (query_kmer == ref_kmer) || | |
(query_kmer == sshash::util::compute_reverse_complement(ref_kmer, m_k)); | |
if (!m_is_present) { | |
// the next k-mer was not what was expected | |
reset_state(); | |
// we're doing a fresh lookup | |
do_stateless_lookup(kmer_s); | |
if (!m_is_present) { reset_state(); } | |
} else { | |
// the next k-mer was what was expected | |
m_n_extend++; | |
m_remaining_contig_bases--; | |
m_start = false; | |
m_prev_kmer_id = next_kmer_id; | |
m_prev_res.kmer_id += m_direction; | |
m_prev_res.kmer_id_in_contig += m_direction; | |
} | |
} | |
// the query offset is unconditionally the | |
// offset we just searched | |
m_prev_query_offset = query_offset; | |
return m_prev_res; | |
} | |
size_t num_searches() { return m_n_search; } | |
size_t num_extensions() { return m_n_extend; } | |
std::string method() { return "custom"; } | |
inline bool is_present() { return m_is_present; } | |
private: | |
sshash::dictionary const* m_d; | |
sshash::streaming_query_canonical_parsing m_q; | |
int32_t m_prev_query_offset; | |
uint64_t m_prev_contig_id; | |
uint64_t m_prev_kmer_id; | |
uint64_t m_neighbors{0}; | |
bool m_start = true; | |
bool m_is_present = false; | |
int32_t m_direction{0}; | |
uint64_t m_n_search{0}; | |
uint64_t m_n_extend{0}; | |
sshash::lookup_result m_prev_res; | |
std::unique_ptr<sshash::bit_vector_iterator> m_ref_contig_it{nullptr}; | |
int32_t m_remaining_contig_bases{0}; | |
uint64_t m_k; | |
}; | |
} // namespace piscem | |
#endif // STREAMING_QUERY_HPP |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment