Skip to content

Instantly share code, notes, and snippets.

@evanxg852000
Last active April 9, 2020 13:14
Show Gist options
  • Save evanxg852000/0ee2972315dde53d14c4f74af8644f9f to your computer and use it in GitHub Desktop.
Save evanxg852000/0ee2972315dde53d14c4f74af8644f9f to your computer and use it in GitHub Desktop.
impl InvertedIndex {
// ... methods
pub fn add_document(&mut self, name: &str, content: &str) {
self.count += 1; // increment doc count and use it as next doc id
self.docs.insert(self.count, name.to_string(), ); // insert in docs map
for word in content.split_whitespace() { // split to word (term)
let word = word.to_lowercase();
match self.index.get_mut(&word) { // find term
// update term frequency, to support search hightlight
// you might need term position in the doc
Some(entry) => {
match entry.get_mut(&self.count) {
Some(n) => { *n = *n + 1 },
None => { entry.insert(self.count, 1); }
};
},
None => { // if term does not exist create it entry
let mut v = HashMap::new();
v.insert(self.count, 1);
self.index.insert(word, v);
}
}
}
}
pub fn search(&self, key: &str) -> Vec<(String, u32)> {
let key = key.to_lowercase();
match self.index.get(&key) { // find term
Some(freq_list) => {
// collect doc ids from frequency list
let mut doc_ids: Vec<u32> = freq_list.keys()
.map(|doc_id| *doc_id).collect();
doc_ids.sort_by(|doc_a_id, doc_b_id| { // sort doc ids by frequency
let a = freq_list.get(doc_a_id).unwrap();
let b = freq_list.get(doc_b_id).unwrap();
a.cmp(b)
});
// collect ordered doc names with frequency
let doc_names: Vec<(String, u32)> = doc_ids.iter().map(|doc_id| {
(self.docs.get(&doc_id).unwrap().clone(), *freq_list.get(&doc_id).unwrap() )
}).collect();
doc_names
},
None => vec![],
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment