Skip to content

Instantly share code, notes, and snippets.

@rob-p
Created January 12, 2025 17:23
Show Gist options
  • Save rob-p/6c84313809161e85a8db8fe83d7bfc5c to your computer and use it in GitHub Desktop.
Save rob-p/6c84313809161e85a8db8fe83d7bfc5c to your computer and use it in GitHub Desktop.
#ifndef STREAMING_QUERY_HPP
#define STREAMING_QUERY_HPP
#include "../external/pthash/external/essentials/include/essentials.hpp"
#include "../include/dictionary.hpp"
#include "../include/query/streaming_query_canonical_parsing.hpp"
#include "../include/util.hpp"
#include <sstream>
namespace piscem {
constexpr int32_t invalid_query_offset = std::numeric_limits<int32_t>::lowest();
constexpr uint64_t invalid_contig_id = std::numeric_limits<uint64_t>::max();
class simplified_streaming_query {
public:
inline simplified_streaming_query(sshash::dictionary const* d,
std::unique_ptr<sshash::bit_vector_iterator>&& contig_it)
: m_d(d)
, m_q(d)
, m_prev_query_offset(invalid_query_offset)
, m_prev_contig_id(invalid_contig_id)
, m_prev_kmer_id(invalid_query_offset)
, m_ref_contig_it(std::move(contig_it))
, m_k(d->k()) {}
simplified_streaming_query(const simplified_streaming_query& other) = delete;
simplified_streaming_query(simplified_streaming_query&& other) = default;
simplified_streaming_query& operator=(const simplified_streaming_query& other) = delete;
simplified_streaming_query& operator=(simplified_streaming_query&& other) = default;
~simplified_streaming_query() {
// print out statistics about the number of
// extension lookups compared to the number
// of stateless lookups.
auto ns = num_searches();
auto ne = num_extensions();
if (ns > 0) {
std::stringstream ss;
ss << "\nmethod : " << method() << "\n";
ss << "\nnumber of searches = " << ns << ", number of extensions = " << ne
<< ", neighbors = " << m_neighbors
<< ", extension ratio = " << static_cast<double>(ne) / static_cast<double>(ns)
<< "\n";
std::cerr << ss.str();
}
}
inline void reset_state() {
m_prev_query_offset = invalid_query_offset;
m_is_present = false;
m_start = true;
m_remaining_contig_bases = 0;
m_q.reset_state();
}
inline void start() {
m_prev_query_offset = invalid_query_offset;
m_prev_kmer_id = invalid_query_offset;
m_is_present = false;
m_start = true;
m_remaining_contig_bases = 0;
m_q.start();
}
inline void do_stateless_lookup(const char* kmer_s) {
// lookup directly in the index without assuming any relation
// to the previously queried k-mer
m_prev_res = m_d->lookup_advanced(kmer_s);
m_direction = m_prev_res.kmer_orientation ? -1 : 1;
m_prev_kmer_id = m_prev_res.kmer_id;
m_is_present = (m_prev_res.kmer_id != sshash::constants::invalid_uint64);
m_start = !m_is_present;
m_n_search += m_is_present ? 1 : 0;
if (m_is_present) {
uint64_t kmer_offset = 2 * (m_prev_res.kmer_id + (m_prev_res.contig_id * (m_k - 1)));
kmer_offset += (m_direction > 0) ? 0 : (2 * m_k);
m_ref_contig_it->at(kmer_offset);
set_remaining_contig_bases();
}
}
inline void set_remaining_contig_bases() {
// if moving forward, we have (contig-length - (pos + k)) positions left
// if moving backward, we have (pos) positions left.
m_remaining_contig_bases =
(m_direction == 1) ? (m_prev_res.contig_size - (m_prev_res.kmer_id_in_contig + m_k))
: (m_prev_res.kmer_id_in_contig);
}
/*
inline sshash::lookup_result query_lookup(const char* read_str, int32_t read_pos) {
auto query_offset = read_pos;
// if the current query offset position is
// the next position after the stored query
// offset position, then we can apply the
// relevant optimizations. Otherwise, we
// should consider this as basically a "new"
// query
if (m_prev_query_offset != invalid_query_offset) {
if ((m_prev_query_offset + 1) != query_offset) { reset_state(); }
}
m_prev_query_offset = query_offset;
const char* kmer = read_str + read_pos;
sshash::lookup_result res = m_q.lookup_advanced(kmer);
m_is_present = (res.kmer_id != sshash::constants::invalid_uint64);
// uint64_t neighbor_inc =
// m_is_present && ((res.kmer_id == m_prev_kmer_id + 1) ||
// (res.kmer_id == m_prev_kmer_id - 1));
// m_neighbors += neighbor_inc;
// m_prev_kmer_id = res.kmer_id;
// if (m_is_present && (res.contig_id != m_prev_contig_id)) {
// auto start_pos = m_ctg_offsets.access(res.contig_id);
// auto end_pos = m_ctg_offsets.access(res.contig_id + 1);
// size_t len = end_pos - start_pos;
// m_ctg_span = {m_ctg_entries.at(start_pos),
// m_ctg_entries.at(start_pos + len), len};
// m_prev_contig_id = res.contig_id;
// }
return res;
}
size_t num_searches() { return m_q.num_searches(); }
size_t num_extensions() { return m_q.num_extensions(); }
std::string method() { return "builtin"; }
*/
inline sshash::lookup_result query_lookup(const char* read_str, int32_t read_pos) {
auto query_offset = read_pos;
// if the current query offset position is
// the next position after the stored query
// offset position, then we can apply the
// relevant optimizations. Otherwise, we
// should consider this as basically a "new"
// query
//
// if this isn't the first search after a reset
if (m_prev_query_offset != invalid_query_offset) {
// if there is no contig left to walk, or if
// the current k-mer is not the successor of the
// previous k-mer on the query, then reset the state.
if (((m_prev_query_offset + 1) != query_offset) || (m_remaining_contig_bases <= 0)) {
reset_state();
}
}
// get the k-mer string
const char* kmer_s = read_str + read_pos;
// if we need to do a stateless lookup
if (m_start) {
if (!sshash::util::is_valid(kmer_s, m_k)) { return sshash::lookup_result(); }
do_stateless_lookup(kmer_s);
} else {
// try to get the next k-mer
auto query_kmer = sshash::util::string_to_uint_kmer(kmer_s, m_k);
if (!sshash::util::is_valid(kmer_s[m_k - 1])) {
m_prev_res.kmer_id = sshash::constants::invalid_uint64;
reset_state();
return m_prev_res;
}
uint64_t next_kmer_id = m_prev_kmer_id + m_direction;
uint64_t ref_kmer =
(m_direction > 0)
? (m_ref_contig_it->eat(2), m_ref_contig_it->read(2 * m_k))
: (m_ref_contig_it->eat_reverse(2), m_ref_contig_it->read_reverse(2 * m_k));
m_is_present = (query_kmer == ref_kmer) ||
(query_kmer == sshash::util::compute_reverse_complement(ref_kmer, m_k));
if (!m_is_present) {
// the next k-mer was not what was expected
reset_state();
// we're doing a fresh lookup
do_stateless_lookup(kmer_s);
if (!m_is_present) { reset_state(); }
} else {
// the next k-mer was what was expected
m_n_extend++;
m_remaining_contig_bases--;
m_start = false;
m_prev_kmer_id = next_kmer_id;
m_prev_res.kmer_id += m_direction;
m_prev_res.kmer_id_in_contig += m_direction;
}
}
// the query offset is unconditionally the
// offset we just searched
m_prev_query_offset = query_offset;
return m_prev_res;
}
size_t num_searches() { return m_n_search; }
size_t num_extensions() { return m_n_extend; }
std::string method() { return "custom"; }
inline bool is_present() { return m_is_present; }
private:
sshash::dictionary const* m_d;
sshash::streaming_query_canonical_parsing m_q;
int32_t m_prev_query_offset;
uint64_t m_prev_contig_id;
uint64_t m_prev_kmer_id;
uint64_t m_neighbors{0};
bool m_start = true;
bool m_is_present = false;
int32_t m_direction{0};
uint64_t m_n_search{0};
uint64_t m_n_extend{0};
sshash::lookup_result m_prev_res;
std::unique_ptr<sshash::bit_vector_iterator> m_ref_contig_it{nullptr};
int32_t m_remaining_contig_bases{0};
uint64_t m_k;
};
} // namespace piscem
#endif // STREAMING_QUERY_HPP
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment