Created
September 3, 2018 01:37
-
-
Save Freaky/b0970d06f72ed0ba31c2f553f5a92ae4 to your computer and use it in GitHub Desktop.
Quick and dirty email search index test thing that sucks completely
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Email Sucks Completely / Email Search Command | |
extern crate mailparse; | |
extern crate tantivy; | |
extern crate walkdir; | |
use mailparse::*; | |
use tantivy::collector::TopCollector; | |
use tantivy::query::QueryParser; | |
use tantivy::schema::*; | |
use tantivy::Index; | |
use walkdir::WalkDir; | |
use std::time::Instant; | |
use std::fs; | |
fn open_search_index() -> Index { | |
let index_dir = "/tmp/email_sucks_completely/"; | |
if let Ok(index) = Index::open_in_dir(index_dir) { | |
return index; | |
} else { | |
let mut schema_builder = SchemaBuilder::default(); | |
schema_builder.add_text_field("id", STRING | STORED); | |
schema_builder.add_text_field("path", STRING | STORED); | |
// schema_builder.add_i64_field("date", INT_INDEXED); | |
schema_builder.add_text_field("subject", TEXT | STORED); | |
schema_builder.add_text_field("body", TEXT); | |
let schema = schema_builder.build(); | |
return Index::create_in_dir("/tmp/email_sucks_completely/", schema).expect("create index"); | |
} | |
} | |
fn index_emails(dirs: &[&str]) { | |
let index = open_search_index(); | |
let schema = index.schema(); | |
let mut index_writer = index.writer(250_000_000).expect("index writer"); | |
let id = schema.get_field("id").expect("id"); | |
let path = schema.get_field("path").expect("path"); | |
// let date = schema.get_field("date").unwrap(); | |
let subject = schema.get_field("subject").expect("subject"); | |
let body = schema.get_field("body").expect("body"); | |
let mut indexed = 0; | |
let start = Instant::now(); | |
for dir in dirs.iter() { | |
let walker = WalkDir::new(dir).into_iter(); | |
for entry in walker { | |
if let Ok(entry) = entry { | |
if indexed % 10000 == 0 { | |
println!( | |
"[{}] {:?} {}", | |
indexed, | |
start.elapsed(), | |
entry.path().display() | |
); | |
} | |
if let Ok(message) = fs::read(&entry.path()) { | |
if let Ok(email) = parse_mail(&message) { | |
let m_id = email.headers.get_first_value("Message-Id"); | |
let m_sub = email.headers.get_first_value("Subject"); | |
let m_body = email.get_body(); | |
if let (Ok(Some(m_id)), Ok(Some(m_sub)), Ok(m_body)) = (m_id, m_sub, m_body) | |
{ | |
let mut doc = Document::default(); | |
doc.add_text(path, &entry.path().to_string_lossy()); | |
doc.add_text(id, &m_id); | |
doc.add_text(subject, &m_sub); | |
doc.add_text(body, &m_body); | |
index_writer.add_document(doc); | |
indexed += 1; | |
} | |
} | |
} | |
} | |
} | |
} | |
index_writer.commit().expect("commit"); | |
println!("Indexed {} messages in {:?}", indexed, start.elapsed()); | |
index_writer.wait_merging_threads().unwrap(); | |
println!("Final merge finished after {:?}", start.elapsed()); | |
} | |
fn search(query: &str) { | |
let start = Instant::now(); | |
let index = open_search_index(); | |
let schema = index.schema(); | |
let path = schema.get_field("path").expect("path"); | |
let subject = schema.get_field("subject").expect("subject"); | |
let body = schema.get_field("body").expect("body"); | |
index.load_searchers().expect("load_searchers"); | |
let searcher = index.searcher(); | |
let query_parser = QueryParser::for_index(&index, vec![subject, body]); | |
let query = query_parser.parse_query(query).expect("parse query"); | |
let mut top_collector = TopCollector::with_limit(10); | |
searcher.search(&*query, &mut top_collector).unwrap(); | |
let doc_addresses = top_collector.docs(); | |
for doc_address in doc_addresses { | |
let retrieved_doc = searcher.doc(&doc_address).unwrap(); | |
println!( | |
"{}: {}", | |
retrieved_doc.get_first(path).unwrap().text(), | |
retrieved_doc.get_first(subject).unwrap().text() | |
); | |
} | |
println!("searched in {:?}", start.elapsed()); | |
} | |
fn main() { | |
index_emails(&["/home/freaky/Maildir/"]); | |
search("freshbsd v4 exception"); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment