Skip to content

Instantly share code, notes, and snippets.

@hammett
Created July 30, 2017 19:20
Show Gist options
  • Save hammett/1b1aa1348e70e816d45bca48146ff724 to your computer and use it in GitHub Desktop.
rust experiment: dumb reverse index
fn main() {
    use reverse_index::*;

    // Two throwaway sample texts to index.
    let s1 = "Often, a simple if/else isn’t enough, because you have more than two possible options. Also, conditions can get quite complex. Rust has has a keyword, match, that allows you to replace complicated if/else groupings with something more powerful. It is match. Check it out:";
    let s2 = "Strings are ordered lexicographically by their byte values. This orders Unicode code points based more on their positions in the code charts. This is not necessarily the same as alphabetical order, which varies by language and locale. Sorting strings according to culturally-accepted standards requires locale-specific data that is outside the scope of the str type";

    // `s1`/`s2` are already `&str`; no extra borrow needed.
    let doc1 = parse(s1);
    let doc2 = parse(s2);

    let query = "simple more";
    match search(query, &[&doc1, &doc2]) {
        // `_doc` is intentionally unused here: `Document` exposes no
        // public accessors, so we only report the similarity score.
        Some((similarity, _doc)) => {
            println!("Resulting similarity is {}", similarity);
        }
        None => {
            println!("Nothing found");
        }
    }
}
mod reverse_index {
    #![allow(dead_code)]

    use std::collections::HashMap;

    // Vector-space retrieval model:
    //   - each document is a bag of terms with weights (raw term frequency);
    //   - a query is scored against each document with cosine similarity:
    //         cos(theta) = dot(query, doc) / (norm(query) * norm(doc))
    //   - highest similarity = closest match; 0 = orthogonal = no similarity.

    /// Weight carried by every query term in the dot product.
    const QUERY_TERM_WEIGHT: f32 = 2.0;

    /// A parsed document: each distinct term mapped to its raw frequency,
    /// borrowing the term slices from the source text.
    pub struct Document<'a> {
        // term -> frequency (the frequency doubles as the term's weight)
        terms: HashMap<&'a str, f32>,
    }

    /// A tokenized query. (Currently unused; `search` takes a raw `&str`.)
    pub struct Query<'a> {
        terms: Vec<&'a str>,
    }

    /// Tokenizes `text` and counts term frequencies.
    ///
    /// Tokens are split on whitespace, then further split on common
    /// punctuation (`. : ; , ! ? /`); empty fragments are discarded.
    // TODO: stop words, bi-grams, normalization (case folding, stemming)
    pub fn parse(text: &str) -> Document {
        let mut term_freq = HashMap::<&str, f32>::new();
        for line in text.lines() {
            for token in line.split_whitespace() {
                let fragments = token.split(|c| {
                    c == '.' || c == ':' || c == ';' || c == ',' || c == '!' || c == '?' || c == '/'
                });
                for word in fragments {
                    // Splitting "word." on '.' yields a trailing "" — skip it.
                    if !word.is_empty() {
                        // entry API: one hash lookup instead of get + insert
                        *term_freq.entry(word).or_insert(0.0) += 1.0;
                    }
                }
            }
        }
        Document { terms: term_freq }
    }

    /// Best match: the cosine similarity and the winning document.
    pub type SearchRes<'a> = Option<(f32, &'a Document<'a>)>;

    /// Finds the document most similar to `query` by cosine similarity.
    ///
    /// Returns `None` for an empty query or when no document has a
    /// strictly positive similarity.
    pub fn search<'a>(query: &str, docs: &[&'a Document]) -> SearchRes<'a> {
        // Every query term carries the same weight, so the query-vector
        // norm is weight * sqrt(number of terms). (The original hard-coded
        // 2.0, which is only correct for a single-term query.)
        let term_count = query.split_whitespace().count();
        if term_count == 0 {
            return None;
        }
        let norm_query = QUERY_TERM_WEIGHT * (term_count as f32).sqrt();

        let mut closest = 0.0;
        let mut best_doc: Option<&Document> = None;
        for doc in docs {
            let norm_doc = compute_norm(doc);
            if norm_doc == 0.0 {
                // Empty document: zero vector has no direction; also avoids
                // a 0/0 NaN in the division below.
                continue;
            }
            let similarity = dot_product(query, doc) / (norm_query * norm_doc);
            if similarity > closest {
                closest = similarity;
                best_doc = Some(doc);
            }
        }
        best_doc.map(|d| (closest, d))
    }

    /// Dot product of the (uniformly weighted) query vector with the
    /// document's term-frequency vector: only terms present in both
    /// contribute.
    fn dot_product(query: &str, doc: &Document) -> f32 {
        query
            .split_whitespace()
            .filter_map(|term| doc.terms.get(term))
            .map(|freq| freq * QUERY_TERM_WEIGHT)
            .sum()
    }

    /// Euclidean (L2) norm of the document's term-frequency vector:
    /// sqrt(sum of squared frequencies).
    fn compute_norm(doc: &Document) -> f32 {
        doc.terms.values().map(|f| f * f).sum::<f32>().sqrt()
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment