Last active
April 9, 2020 13:14
-
-
Save evanxg852000/0ee2972315dde53d14c4f74af8644f9f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
impl InvertedIndex { | |
// ... methods | |
pub fn add_document(&mut self, name: &str, content: &str) { | |
self.count += 1; // increment doc count and use it as next doc id | |
self.docs.insert(self.count, name.to_string(), ); // insert in docs map | |
for word in content.split_whitespace() { // split to word (term) | |
let word = word.to_lowercase(); | |
match self.index.get_mut(&word) { // find term | |
// update term frequency, to support search hightlight | |
// you might need term position in the doc | |
Some(entry) => { | |
match entry.get_mut(&self.count) { | |
Some(n) => { *n = *n + 1 }, | |
None => { entry.insert(self.count, 1); } | |
}; | |
}, | |
None => { // if term does not exist create it entry | |
let mut v = HashMap::new(); | |
v.insert(self.count, 1); | |
self.index.insert(word, v); | |
} | |
} | |
} | |
} | |
pub fn search(&self, key: &str) -> Vec<(String, u32)> { | |
let key = key.to_lowercase(); | |
match self.index.get(&key) { // find term | |
Some(freq_list) => { | |
// collect doc ids from frequency list | |
let mut doc_ids: Vec<u32> = freq_list.keys() | |
.map(|doc_id| *doc_id).collect(); | |
doc_ids.sort_by(|doc_a_id, doc_b_id| { // sort doc ids by frequency | |
let a = freq_list.get(doc_a_id).unwrap(); | |
let b = freq_list.get(doc_b_id).unwrap(); | |
a.cmp(b) | |
}); | |
// collect ordered doc names with frequency | |
let doc_names: Vec<(String, u32)> = doc_ids.iter().map(|doc_id| { | |
(self.docs.get(&doc_id).unwrap().clone(), *freq_list.get(&doc_id).unwrap() ) | |
}).collect(); | |
doc_names | |
}, | |
None => vec![], | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment