Last active
December 31, 2019 16:27
-
-
Save jjfiv/577ff68ebc320d60680d5d6480801e0d to your computer and use it in GitHub Desktop.
Collecting statistics from Tantivy's index structures.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::convert::TryInto;

use tantivy::schema::IndexRecordOption;
use tantivy::{DocSet, Postings, Searcher, Term};
| #[derive(Debug, Clone, Serialize, Deserialize)] | |
| pub struct CountStats { | |
| pub collection_frequency: u64, | |
| pub document_frequency: u64, | |
| pub collection_length: u64, | |
| pub document_count: u64, | |
| } | |
| impl CountStats { | |
| pub fn average_doc_length(&self) -> f32 { | |
| if self.document_count == 0 { | |
| 0.0 | |
| } else { | |
| let cf = self.collection_frequency as f64; | |
| let dc = self.document_count as f64; | |
| (cf / dc) as f32 | |
| } | |
| } | |
| } | |
| pub fn get_count_stats(searcher: &Searcher, term: &Term) -> CountStats { | |
| let mut total_docs: u64 = 0; | |
| let mut term_df: u64 = 0; | |
| let mut total_cf: u64 = 0; | |
| let mut term_cf: u64 = 0; | |
| // Sum up statistics across all open segment readers: | |
| for sr in searcher.segment_readers() { | |
| total_docs += u64::from(sr.num_docs()); | |
| let inv_index = sr.inverted_index(term.field()); | |
| total_cf += inv_index.total_num_tokens(); | |
| let term_dict = inv_index.terms(); | |
| // If the term exists in this segment. | |
| if let Some(term_num) = term_dict.term_ord(term.text()) { | |
| let ti = term_dict.term_info_from_ord(term_num); | |
| term_df += u64::from(ti.doc_freq); | |
| if term_num + 1 < term_dict.num_terms().try_into().unwrap() { | |
| // The number of positions (total term tf) can be calculated using the next term stored ordinally. | |
| let ti2 = term_dict.term_info_from_ord(term_num + 1); | |
| term_cf += ti2.positions_idx - ti.positions_idx; | |
| } else { | |
| todo!("Calculate it for the final posting list since we can't do the difference trick!") | |
| } | |
| } | |
| } | |
| CountStats { | |
| collection_frequency: term_cf, | |
| document_frequency: term_df, | |
| collection_length: total_cf, | |
| document_count: total_docs, | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment