worldOneo · February 17, 2022 19:11
diff --git a/ngram.rs b/ngram.rs
 use std::collections::BinaryHeap;
 use std::fmt::Debug;
 use std::vec;
 use std::{collections::HashMap, io, time::Instant};

 #[derive(Debug)]
 struct NGramMeta<T: Clone + PartialEq> {
  bind: T,
  document: Vec<String>,
  freq: f64,
 }

 impl<T: Clone + PartialEq> PartialOrd for NGramMeta<T> {
  fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
    self.freq.partial_cmp(&other.freq)
  }

  fn lt(&self, other: &Self) -> bool {
    match self.partial_cmp(other) {
      None | Some(Ordering::Greater) | Some(Ordering::Equal) => false,
      Some(Ordering::Less) => true,
    }
  }

  fn le(&self, other: &Self) -> bool {
    match self.partial_cmp(other) {
      None | Some(Ordering::Greater) => false,
      Some(Ordering::Less) | Some(Ordering::Equal) => true,
    }
  }

  fn gt(&self, other: &Self) -> bool {
    match self.partial_cmp(other) {
      None | Some(Ordering::Equal) | Some(Ordering::Less) => false,
      Some(Ordering::Greater) => true,
    }
  }

  fn ge(&self, other: &Self) -> bool {
    match self.partial_cmp(other) {
      None | Some(Ordering::Less) => false,
      Some(Ordering::Greater) | Some(Ordering::Equal) => true,
    }
  }
 }

 impl<T: Clone + PartialEq> PartialEq for NGramMeta<T> {
  fn eq(&self, other: &Self) -> bool {
    self.document == other.document && self.freq == other.freq && self.bind == other.bind
  }

  fn ne(&self, other: &Self) -> bool {
    self.document == other.document && self.freq == other.freq && self.bind == other.bind
  }
 }

 impl<T: Clone + PartialEq> Eq for NGramMeta<T> {}

 impl<T: Clone + PartialEq> Ord for NGramMeta<T> {
  fn cmp(&self, other: &Self) -> Ordering {
    if self.freq > other.freq {
      return Ordering::Greater;
    } else if self.freq < other.freq {
      return Ordering::Less;
    }
    Ordering::Equal
  }

  fn max(self, other: Self) -> Self {
    std::cmp::max_by(self, other, Ord::cmp)
  }

  fn min(self, other: Self) -> Self {
    std::cmp::min_by(self, other, Ord::cmp)
  }

  fn clamp(self, min: Self, max: Self) -> Self {
    if self.freq < min.freq {
      min
    } else if self.freq > max.freq {
      max
    } else {
      self
    }
  }
 }
 struct NGramIndex<T: Clone + PartialEq> {
  index: HashMap<String, BinaryHeap<NGramMeta<T>>>,
  size: usize,
 }

 impl<T: Clone + PartialEq> NGramIndex<T> {
  pub fn new(size: usize) -> NGramIndex<T> {
    NGramIndex {
      index: HashMap::new(),
      size,
    }
  }

  pub fn insert(&mut self, item: T, docs: Vec<(f64, String)>) {
    for (weight, doc) in docs.iter() {
      let mut weights = HashMap::new();
      let tokens = self.analyze(doc);
      let len = tokens.len();
      for (i, ngram) in tokens.iter().enumerate() {
        weights.entry(ngram.to_string()).or_insert_with(|| 0.0);
        weights
          .entry(ngram.to_string())
          .and_modify(|a| *a += *weight * ngram.len() as f64);
      }
      for (ngram, freq) in weights.into_iter() {
        self
          .index
          .entry(ngram.to_string())
          .or_insert_with(|| BinaryHeap::new());

        let document_frequency = freq as f64 / len as f64;

        self.index.entry(ngram.to_string()).and_modify(|b| {
          b.push(NGramMeta {
            bind: item.clone(),
            document: docs.iter().map(|(_, d)| d.to_string()).collect(),
            freq: document_frequency,
          });
        });
      }
    }
  }

  fn analyze(&self, doc: &String) -> Vec<String> {
    let mut ret = vec![];
    let tokens = doc.split(" ").collect::<Vec<&str>>();
    for token in tokens.iter() {
      let token = token.to_string();
      for ngram in self.windows(&token.chars().collect::<Vec<char>>()) {
        ret.push(ngram.into_iter().collect());
      }
    }
    ret
  }

  fn windows(&self, inp: &Vec<char>) -> Vec<Vec<char>> {
    let size = inp.len().min(self.size);
    let mut i = 1;
    let mut j = 0;
    let mut ret = vec![];
    let len = inp.len();
    if inp.len() == 0 {
      return ret;
    }
    loop {
      let a = Vec::from_iter(inp[j..i].iter().map(|c| *c).collect::<Vec<char>>());
      ret.push(a);
      if i < len {
        i += 1;
      }
      if i > size || i == len {
        j += 1;
      }
      if i - j < size && j != 0 {
        break;
      }
    }
    ret
  }

  pub fn search(&self, query: String, results: usize) -> Vec<T> {
    let mut ret = Vec::new();
    let mut heaps = Vec::new();
    println!("{:#?}", self.analyze(&query));
    for ngram in self.analyze(&query) {
      let heap = self.index.get(&ngram);
      match heap {
        Some(v) => heaps.push(v.iter().peekable()),
        None => {}
      };
    }
    let mut done = false;
    let len = heaps.len();
    while ret.len() < results && !done {
      done = true;
      let mut best: Option<(usize, &NGramMeta<T>)> = None;
      for heap in 1..len {
        let suggestion = heaps[heap].peek();
        match suggestion {
          Some(v) => match &best {
            Some(b) => {
              if v.freq > b.1.freq {
                best = Some((heap, v));
              }
            }
            None => best = Some((heap, v)),
          },
          None => {}
        }
      }
      match best {
        None => {}
        Some((heap, b)) => {
          ret.push(b.bind.clone());
          heaps[heap].next();
          done = false;
        }
      }
    }
    ret
  }
 }
	use std::collections::BinaryHeap;
	use std::fmt::Debug;
	use std::vec;
	use std::{collections::HashMap, io, time::Instant};

	#[derive(Debug)]
	struct NGramMeta<T: Clone + PartialEq> {
	bind: T,
	document: Vec<String>,
	freq: f64,
	}

	impl<T: Clone + PartialEq> PartialOrd for NGramMeta<T> {
	fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
	self.freq.partial_cmp(&other.freq)
	}

	fn lt(&self, other: &Self) -> bool {
	match self.partial_cmp(other) {
	None \| Some(Ordering::Greater) \| Some(Ordering::Equal) => false,
	Some(Ordering::Less) => true,
	}
	}

	fn le(&self, other: &Self) -> bool {
	match self.partial_cmp(other) {
	None \| Some(Ordering::Greater) => false,
	Some(Ordering::Less) \| Some(Ordering::Equal) => true,
	}
	}

	fn gt(&self, other: &Self) -> bool {
	match self.partial_cmp(other) {
	None \| Some(Ordering::Equal) \| Some(Ordering::Less) => false,
	Some(Ordering::Greater) => true,
	}
	}

	fn ge(&self, other: &Self) -> bool {
	match self.partial_cmp(other) {
	None \| Some(Ordering::Less) => false,
	Some(Ordering::Greater) \| Some(Ordering::Equal) => true,
	}
	}
	}

	impl<T: Clone + PartialEq> PartialEq for NGramMeta<T> {
	fn eq(&self, other: &Self) -> bool {
	self.document == other.document && self.freq == other.freq && self.bind == other.bind
	}

	fn ne(&self, other: &Self) -> bool {
	self.document == other.document && self.freq == other.freq && self.bind == other.bind
	}
	}

	impl<T: Clone + PartialEq> Eq for NGramMeta<T> {}

	impl<T: Clone + PartialEq> Ord for NGramMeta<T> {
	fn cmp(&self, other: &Self) -> Ordering {
	if self.freq > other.freq {
	return Ordering::Greater;
	} else if self.freq < other.freq {
	return Ordering::Less;
	}
	Ordering::Equal
	}

	fn max(self, other: Self) -> Self {
	std::cmp::max_by(self, other, Ord::cmp)
	}

	fn min(self, other: Self) -> Self {
	std::cmp::min_by(self, other, Ord::cmp)
	}

	fn clamp(self, min: Self, max: Self) -> Self {
	if self.freq < min.freq {
	min
	} else if self.freq > max.freq {
	max
	} else {
	self
	}
	}
	}
	struct NGramIndex<T: Clone + PartialEq> {
	index: HashMap<String, BinaryHeap<NGramMeta<T>>>,
	size: usize,
	}

	impl<T: Clone + PartialEq> NGramIndex<T> {
	pub fn new(size: usize) -> NGramIndex<T> {
	NGramIndex {
	index: HashMap::new(),
	size,
	}
	}

	pub fn insert(&mut self, item: T, docs: Vec<(f64, String)>) {
	for (weight, doc) in docs.iter() {
	let mut weights = HashMap::new();
	let tokens = self.analyze(doc);
	let len = tokens.len();
	for (i, ngram) in tokens.iter().enumerate() {
	weights.entry(ngram.to_string()).or_insert_with(\|\| 0.0);
	weights
	.entry(ngram.to_string())
	.and_modify(\|a\| a += weight * ngram.len() as f64);
	}
	for (ngram, freq) in weights.into_iter() {
	self
	.index
	.entry(ngram.to_string())
	.or_insert_with(\|\| BinaryHeap::new());

	let document_frequency = freq as f64 / len as f64;

	self.index.entry(ngram.to_string()).and_modify(\|b\| {
	b.push(NGramMeta {
	bind: item.clone(),
	document: docs.iter().map(\|(_, d)\| d.to_string()).collect(),
	freq: document_frequency,
	});
	});
	}
	}
	}

	fn analyze(&self, doc: &String) -> Vec<String> {
	let mut ret = vec![];
	let tokens = doc.split(" ").collect::<Vec<&str>>();
	for token in tokens.iter() {
	let token = token.to_string();
	for ngram in self.windows(&token.chars().collect::<Vec<char>>()) {
	ret.push(ngram.into_iter().collect());
	}
	}
	ret
	}

	fn windows(&self, inp: &Vec<char>) -> Vec<Vec<char>> {
	let size = inp.len().min(self.size);
	let mut i = 1;
	let mut j = 0;
	let mut ret = vec![];
	let len = inp.len();
	if inp.len() == 0 {
	return ret;
	}
	loop {
	let a = Vec::from_iter(inp[j..i].iter().map(\|c\| *c).collect::<Vec<char>>());
	ret.push(a);
	if i < len {
	i += 1;
	}
	if i > size \|\| i == len {
	j += 1;
	}
	if i - j < size && j != 0 {
	break;
	}
	}
	ret
	}

	pub fn search(&self, query: String, results: usize) -> Vec<T> {
	let mut ret = Vec::new();
	let mut heaps = Vec::new();
	println!("{:#?}", self.analyze(&query));
	for ngram in self.analyze(&query) {
	let heap = self.index.get(&ngram);
	match heap {
	Some(v) => heaps.push(v.iter().peekable()),
	None => {}
	};
	}
	let mut done = false;
	let len = heaps.len();
	while ret.len() < results && !done {
	done = true;
	let mut best: Option<(usize, &NGramMeta<T>)> = None;
	for heap in 1..len {
	let suggestion = heaps[heap].peek();
	match suggestion {
	Some(v) => match &best {
	Some(b) => {
	if v.freq > b.1.freq {
	best = Some((heap, v));
	}
	}
	None => best = Some((heap, v)),
	},
	None => {}
	}
	}
	match best {
	None => {}
	Some((heap, b)) => {
	ret.push(b.bind.clone());
	heaps[heap].next();
	done = false;
	}
	}
	}
	ret
	}
	}