Skip to content

Instantly share code, notes, and snippets.

@jjfiv
Last active December 31, 2019 16:27
Show Gist options
  • Select an option

  • Save jjfiv/577ff68ebc320d60680d5d6480801e0d to your computer and use it in GitHub Desktop.

Select an option

Save jjfiv/577ff68ebc320d60680d5d6480801e0d to your computer and use it in GitHub Desktop.
Collecting statistics from Tantivy's index structures.
use std::convert::TryInto;
use tantivy::schema::IndexRecordOption;
use tantivy::{DocSet, Postings, Searcher, Term};
/// Term-level and collection-level counts, as filled in by `get_count_stats`
/// by summing over every segment reader of a searcher.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CountStats {
/// Total number of occurrences (positions) of the term, summed over segments.
pub collection_frequency: u64,
/// Number of documents containing the term, summed over segments.
pub document_frequency: u64,
/// Total number of tokens in the term's field, summed over segments.
pub collection_length: u64,
/// Number of documents, summed over segments.
pub document_count: u64,
}
impl CountStats {
    /// Returns the mean number of tokens per document for the field these
    /// statistics were collected from.
    ///
    /// This is `collection_length / document_count`. An empty collection
    /// (`document_count == 0`) yields `0.0` rather than dividing by zero.
    pub fn average_doc_length(&self) -> f32 {
        if self.document_count == 0 {
            return 0.0;
        }
        // The numerator must be the total token count of the collection, not
        // the frequency of the single term being looked up.
        let total_tokens = self.collection_length as f64;
        let doc_count = self.document_count as f64;
        // Divide in f64 so large counts keep precision, then narrow once.
        (total_tokens / doc_count) as f32
    }
}
pub fn get_count_stats(searcher: &Searcher, term: &Term) -> CountStats {
let mut total_docs: u64 = 0;
let mut term_df: u64 = 0;
let mut total_cf: u64 = 0;
let mut term_cf: u64 = 0;
// Sum up statistics across all open segment readers:
for sr in searcher.segment_readers() {
total_docs += u64::from(sr.num_docs());
let inv_index = sr.inverted_index(term.field());
total_cf += inv_index.total_num_tokens();
let term_dict = inv_index.terms();
// If the term exists in this segment.
if let Some(term_num) = term_dict.term_ord(term.text()) {
let ti = term_dict.term_info_from_ord(term_num);
term_df += u64::from(ti.doc_freq);
if term_num + 1 < term_dict.num_terms().try_into().unwrap() {
// The number of positions (total term tf) can be calculated using the next term stored ordinally.
let ti2 = term_dict.term_info_from_ord(term_num + 1);
term_cf += ti2.positions_idx - ti.positions_idx;
} else {
todo!("Calculate it for the final posting list since we can't do the difference trick!")
}
}
}
CountStats {
collection_frequency: term_cf,
document_frequency: term_df,
collection_length: total_cf,
document_count: total_docs,
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment