Created
February 17, 2022 19:11
-
-
Save worldOneo/2819268cdcc908265bbc64fa23efdbbf to your computer and use it in GitHub Desktop.
Simple, fast ngram index written in rust
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::collections::BinaryHeap; | |
use std::fmt::Debug; | |
use std::vec; | |
use std::{collections::HashMap, io, time::Instant}; | |
#[derive(Debug)] | |
struct NGramMeta<T: Clone + PartialEq> { | |
bind: T, | |
document: Vec<String>, | |
freq: f64, | |
} | |
impl<T: Clone + PartialEq> PartialOrd for NGramMeta<T> { | |
fn partial_cmp(&self, other: &Self) -> Option<Ordering> { | |
self.freq.partial_cmp(&other.freq) | |
} | |
fn lt(&self, other: &Self) -> bool { | |
match self.partial_cmp(other) { | |
None | Some(Ordering::Greater) | Some(Ordering::Equal) => false, | |
Some(Ordering::Less) => true, | |
} | |
} | |
fn le(&self, other: &Self) -> bool { | |
match self.partial_cmp(other) { | |
None | Some(Ordering::Greater) => false, | |
Some(Ordering::Less) | Some(Ordering::Equal) => true, | |
} | |
} | |
fn gt(&self, other: &Self) -> bool { | |
match self.partial_cmp(other) { | |
None | Some(Ordering::Equal) | Some(Ordering::Less) => false, | |
Some(Ordering::Greater) => true, | |
} | |
} | |
fn ge(&self, other: &Self) -> bool { | |
match self.partial_cmp(other) { | |
None | Some(Ordering::Less) => false, | |
Some(Ordering::Greater) | Some(Ordering::Equal) => true, | |
} | |
} | |
} | |
impl<T: Clone + PartialEq> PartialEq for NGramMeta<T> { | |
fn eq(&self, other: &Self) -> bool { | |
self.document == other.document && self.freq == other.freq && self.bind == other.bind | |
} | |
fn ne(&self, other: &Self) -> bool { | |
self.document == other.document && self.freq == other.freq && self.bind == other.bind | |
} | |
} | |
impl<T: Clone + PartialEq> Eq for NGramMeta<T> {} | |
impl<T: Clone + PartialEq> Ord for NGramMeta<T> { | |
fn cmp(&self, other: &Self) -> Ordering { | |
if self.freq > other.freq { | |
return Ordering::Greater; | |
} else if self.freq < other.freq { | |
return Ordering::Less; | |
} | |
Ordering::Equal | |
} | |
fn max(self, other: Self) -> Self { | |
std::cmp::max_by(self, other, Ord::cmp) | |
} | |
fn min(self, other: Self) -> Self { | |
std::cmp::min_by(self, other, Ord::cmp) | |
} | |
fn clamp(self, min: Self, max: Self) -> Self { | |
if self.freq < min.freq { | |
min | |
} else if self.freq > max.freq { | |
max | |
} else { | |
self | |
} | |
} | |
} | |
struct NGramIndex<T: Clone + PartialEq> { | |
index: HashMap<String, BinaryHeap<NGramMeta<T>>>, | |
size: usize, | |
} | |
impl<T: Clone + PartialEq> NGramIndex<T> { | |
pub fn new(size: usize) -> NGramIndex<T> { | |
NGramIndex { | |
index: HashMap::new(), | |
size, | |
} | |
} | |
pub fn insert(&mut self, item: T, docs: Vec<(f64, String)>) { | |
for (weight, doc) in docs.iter() { | |
let mut weights = HashMap::new(); | |
let tokens = self.analyze(doc); | |
let len = tokens.len(); | |
for (i, ngram) in tokens.iter().enumerate() { | |
weights.entry(ngram.to_string()).or_insert_with(|| 0.0); | |
weights | |
.entry(ngram.to_string()) | |
.and_modify(|a| *a += *weight * ngram.len() as f64); | |
} | |
for (ngram, freq) in weights.into_iter() { | |
self | |
.index | |
.entry(ngram.to_string()) | |
.or_insert_with(|| BinaryHeap::new()); | |
let document_frequency = freq as f64 / len as f64; | |
self.index.entry(ngram.to_string()).and_modify(|b| { | |
b.push(NGramMeta { | |
bind: item.clone(), | |
document: docs.iter().map(|(_, d)| d.to_string()).collect(), | |
freq: document_frequency, | |
}); | |
}); | |
} | |
} | |
} | |
fn analyze(&self, doc: &String) -> Vec<String> { | |
let mut ret = vec![]; | |
let tokens = doc.split(" ").collect::<Vec<&str>>(); | |
for token in tokens.iter() { | |
let token = token.to_string(); | |
for ngram in self.windows(&token.chars().collect::<Vec<char>>()) { | |
ret.push(ngram.into_iter().collect()); | |
} | |
} | |
ret | |
} | |
fn windows(&self, inp: &Vec<char>) -> Vec<Vec<char>> { | |
let size = inp.len().min(self.size); | |
let mut i = 1; | |
let mut j = 0; | |
let mut ret = vec![]; | |
let len = inp.len(); | |
if inp.len() == 0 { | |
return ret; | |
} | |
loop { | |
let a = Vec::from_iter(inp[j..i].iter().map(|c| *c).collect::<Vec<char>>()); | |
ret.push(a); | |
if i < len { | |
i += 1; | |
} | |
if i > size || i == len { | |
j += 1; | |
} | |
if i - j < size && j != 0 { | |
break; | |
} | |
} | |
ret | |
} | |
pub fn search(&self, query: String, results: usize) -> Vec<T> { | |
let mut ret = Vec::new(); | |
let mut heaps = Vec::new(); | |
println!("{:#?}", self.analyze(&query)); | |
for ngram in self.analyze(&query) { | |
let heap = self.index.get(&ngram); | |
match heap { | |
Some(v) => heaps.push(v.iter().peekable()), | |
None => {} | |
}; | |
} | |
let mut done = false; | |
let len = heaps.len(); | |
while ret.len() < results && !done { | |
done = true; | |
let mut best: Option<(usize, &NGramMeta<T>)> = None; | |
for heap in 1..len { | |
let suggestion = heaps[heap].peek(); | |
match suggestion { | |
Some(v) => match &best { | |
Some(b) => { | |
if v.freq > b.1.freq { | |
best = Some((heap, v)); | |
} | |
} | |
None => best = Some((heap, v)), | |
}, | |
None => {} | |
} | |
} | |
match best { | |
None => {} | |
Some((heap, b)) => { | |
ret.push(b.bind.clone()); | |
heaps[heap].next(); | |
done = false; | |
} | |
} | |
} | |
ret | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment