Skip to content

Instantly share code, notes, and snippets.

@worldOneo
Created February 17, 2022 19:11
Show Gist options
  • Save worldOneo/2819268cdcc908265bbc64fa23efdbbf to your computer and use it in GitHub Desktop.
Save worldOneo/2819268cdcc908265bbc64fa23efdbbf to your computer and use it in GitHub Desktop.
Simple, fast ngram index written in rust
use std::collections::BinaryHeap;
use std::fmt::Debug;
use std::vec;
use std::{collections::HashMap, io, time::Instant};
#[derive(Debug)]
struct NGramMeta<T: Clone + PartialEq> {
bind: T,
document: Vec<String>,
freq: f64,
}
impl<T: Clone + PartialEq> PartialOrd for NGramMeta<T> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
self.freq.partial_cmp(&other.freq)
}
fn lt(&self, other: &Self) -> bool {
match self.partial_cmp(other) {
None | Some(Ordering::Greater) | Some(Ordering::Equal) => false,
Some(Ordering::Less) => true,
}
}
fn le(&self, other: &Self) -> bool {
match self.partial_cmp(other) {
None | Some(Ordering::Greater) => false,
Some(Ordering::Less) | Some(Ordering::Equal) => true,
}
}
fn gt(&self, other: &Self) -> bool {
match self.partial_cmp(other) {
None | Some(Ordering::Equal) | Some(Ordering::Less) => false,
Some(Ordering::Greater) => true,
}
}
fn ge(&self, other: &Self) -> bool {
match self.partial_cmp(other) {
None | Some(Ordering::Less) => false,
Some(Ordering::Greater) | Some(Ordering::Equal) => true,
}
}
}
impl<T: Clone + PartialEq> PartialEq for NGramMeta<T> {
fn eq(&self, other: &Self) -> bool {
self.document == other.document && self.freq == other.freq && self.bind == other.bind
}
fn ne(&self, other: &Self) -> bool {
self.document == other.document && self.freq == other.freq && self.bind == other.bind
}
}
impl<T: Clone + PartialEq> Eq for NGramMeta<T> {}
impl<T: Clone + PartialEq> Ord for NGramMeta<T> {
fn cmp(&self, other: &Self) -> Ordering {
if self.freq > other.freq {
return Ordering::Greater;
} else if self.freq < other.freq {
return Ordering::Less;
}
Ordering::Equal
}
fn max(self, other: Self) -> Self {
std::cmp::max_by(self, other, Ord::cmp)
}
fn min(self, other: Self) -> Self {
std::cmp::min_by(self, other, Ord::cmp)
}
fn clamp(self, min: Self, max: Self) -> Self {
if self.freq < min.freq {
min
} else if self.freq > max.freq {
max
} else {
self
}
}
}
struct NGramIndex<T: Clone + PartialEq> {
index: HashMap<String, BinaryHeap<NGramMeta<T>>>,
size: usize,
}
impl<T: Clone + PartialEq> NGramIndex<T> {
pub fn new(size: usize) -> NGramIndex<T> {
NGramIndex {
index: HashMap::new(),
size,
}
}
pub fn insert(&mut self, item: T, docs: Vec<(f64, String)>) {
for (weight, doc) in docs.iter() {
let mut weights = HashMap::new();
let tokens = self.analyze(doc);
let len = tokens.len();
for (i, ngram) in tokens.iter().enumerate() {
weights.entry(ngram.to_string()).or_insert_with(|| 0.0);
weights
.entry(ngram.to_string())
.and_modify(|a| *a += *weight * ngram.len() as f64);
}
for (ngram, freq) in weights.into_iter() {
self
.index
.entry(ngram.to_string())
.or_insert_with(|| BinaryHeap::new());
let document_frequency = freq as f64 / len as f64;
self.index.entry(ngram.to_string()).and_modify(|b| {
b.push(NGramMeta {
bind: item.clone(),
document: docs.iter().map(|(_, d)| d.to_string()).collect(),
freq: document_frequency,
});
});
}
}
}
fn analyze(&self, doc: &String) -> Vec<String> {
let mut ret = vec![];
let tokens = doc.split(" ").collect::<Vec<&str>>();
for token in tokens.iter() {
let token = token.to_string();
for ngram in self.windows(&token.chars().collect::<Vec<char>>()) {
ret.push(ngram.into_iter().collect());
}
}
ret
}
fn windows(&self, inp: &Vec<char>) -> Vec<Vec<char>> {
let size = inp.len().min(self.size);
let mut i = 1;
let mut j = 0;
let mut ret = vec![];
let len = inp.len();
if inp.len() == 0 {
return ret;
}
loop {
let a = Vec::from_iter(inp[j..i].iter().map(|c| *c).collect::<Vec<char>>());
ret.push(a);
if i < len {
i += 1;
}
if i > size || i == len {
j += 1;
}
if i - j < size && j != 0 {
break;
}
}
ret
}
pub fn search(&self, query: String, results: usize) -> Vec<T> {
let mut ret = Vec::new();
let mut heaps = Vec::new();
println!("{:#?}", self.analyze(&query));
for ngram in self.analyze(&query) {
let heap = self.index.get(&ngram);
match heap {
Some(v) => heaps.push(v.iter().peekable()),
None => {}
};
}
let mut done = false;
let len = heaps.len();
while ret.len() < results && !done {
done = true;
let mut best: Option<(usize, &NGramMeta<T>)> = None;
for heap in 1..len {
let suggestion = heaps[heap].peek();
match suggestion {
Some(v) => match &best {
Some(b) => {
if v.freq > b.1.freq {
best = Some((heap, v));
}
}
None => best = Some((heap, v)),
},
None => {}
}
}
match best {
None => {}
Some((heap, b)) => {
ret.push(b.bind.clone());
heaps[heap].next();
done = false;
}
}
}
ret
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment