Last active
May 6, 2017 22:32
-
-
Save vbfox/bc4111289c5416631a0caf9c3fe8e75d to your computer and use it in GitHub Desktop.
LINQPad script to index & search a folder full of .eml files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<Query Kind="Program"> | |
<NuGetReference>HtmlAgilityPack</NuGetReference> | |
<NuGetReference>Lucene.Net</NuGetReference> | |
<NuGetReference>MimeKit</NuGetReference> | |
<Namespace>HtmlAgilityPack</Namespace> | |
<Namespace>Lucene.Net</Namespace> | |
<Namespace>Lucene.Net.Analysis</Namespace> | |
<Namespace>Lucene.Net.Analysis.Standard</Namespace> | |
<Namespace>Lucene.Net.Analysis.Tokenattributes</Namespace> | |
<Namespace>Lucene.Net.Documents</Namespace> | |
<Namespace>Lucene.Net.Index</Namespace> | |
<Namespace>Lucene.Net.Messages</Namespace> | |
<Namespace>Lucene.Net.QueryParsers</Namespace> | |
<Namespace>Lucene.Net.Search</Namespace> | |
<Namespace>Lucene.Net.Search.Function</Namespace> | |
<Namespace>Lucene.Net.Search.Payloads</Namespace> | |
<Namespace>Lucene.Net.Search.Spans</Namespace> | |
<Namespace>Lucene.Net.Store</Namespace> | |
<Namespace>Lucene.Net.Support</Namespace> | |
<Namespace>Lucene.Net.Support.Compatibility</Namespace> | |
<Namespace>Lucene.Net.Util</Namespace> | |
<Namespace>Lucene.Net.Util.Cache</Namespace> | |
<Namespace>MimeKit</Namespace> | |
<Namespace>MimeKit.Cryptography</Namespace> | |
<Namespace>MimeKit.Encodings</Namespace> | |
<Namespace>MimeKit.IO</Namespace> | |
<Namespace>MimeKit.IO.Filters</Namespace> | |
<Namespace>MimeKit.Text</Namespace> | |
<Namespace>MimeKit.Tnef</Namespace> | |
<Namespace>MimeKit.Utils</Namespace> | |
<Namespace>System.Net</Namespace> | |
<Namespace>System.Threading.Tasks</Namespace> | |
</Query> | |
// Root folder that Build() scans (including sub-folders) for *.eml files.
static string workDir = @"C:\Mails";

// Folder holding the Lucene index: a ".lucene" sub-folder of workDir.
static string luceneDir = System.IO.Path.Combine(workDir, ".lucene");
/// <summary>
/// Script entry point: builds the index when no index folder exists yet, then
/// prompts for a search term and runs the search.
/// </summary>
void Main()
{
    // The presence of the index folder is the only "already built" marker.
    var indexExists = System.IO.Directory.Exists(luceneDir);
    if (!indexExists)
    {
        Build();
    }

    var term = Util.ReadLine("Search ?");
    Search(term);
}
// Sender addresses whose mails Search() excludes from the results; each entry is
// matched as an exact term against the "from.address" field. Empty by default.
string[] excludeFrom = new string[0];
/// <summary>
/// Parses <paramref name="term"/> as a Lucene query over every indexed field and
/// dumps up to 1000 hits as on-demand LINQPad links. Mails whose "from.address"
/// equals an entry of <c>excludeFrom</c> are filtered out.
/// </summary>
/// <param name="term">
/// Query text handed to the Lucene query parser. A null or blank term is reported
/// and ignored instead of being passed to the parser.
/// </param>
void Search(string term)
{
    // A null/blank term cannot be turned into a query; report it and stop.
    if (string.IsNullOrWhiteSpace(term))
    {
        "Nothing to search for.".Dump();
        return;
    }

    using (var dir = FSDirectory.Open(luceneDir))
    using (var indexReader = IndexReader.Open(dir, true))
    // Reuse the reader that is already open rather than opening the index a second time.
    using (var searcher = new Lucene.Net.Search.IndexSearcher(indexReader))
    {
        var fieldNames = indexReader.GetFieldNames(IndexReader.FieldOption.ALL);

        // Every field is searched with weight 1; the fields below count for more.
        var boosts = fieldNames.ToDictionary(x => x, x => 1.0f);
        boosts["subject"] = 2.0f;
        boosts["to.name"] = 1.5f;
        boosts["from.name"] = 1.5f;
        boosts["content"] = 1.1f;

        var queryParser = new MultiFieldQueryParser(
            Lucene.Net.Util.Version.LUCENE_30,
            fieldNames.ToArray(),
            new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30),
            boosts);
        var searchQuery = queryParser.Parse(term);

        // User query MUST match; excluded senders MUST NOT.
        var fullQuery = new BooleanQuery();
        fullQuery.Add(searchQuery, Occur.MUST);
        foreach (var excludedFrom in excludeFrom)
        {
            fullQuery.Add(new TermQuery(new Term("from.address", excludedFrom)), Occur.MUST_NOT);
        }

        var hits = searcher.Search(fullQuery, 1000);
        $"{hits.TotalHits} hits, displaying {hits.ScoreDocs.Length}".Dump();

        foreach (var scoreDoc in hits.ScoreDocs)
        {
            var doc = searcher.Doc(scoreDoc.Doc);
            var fileName = doc.Get("file_name");

            var subject = doc.Get("subject");
            if (string.IsNullOrWhiteSpace(subject))
            {
                subject = "<no subject>";
            }

            var from = Best(doc.Get("from.name"), doc.Get("from.address"), "??");
            var to = Best(doc.Get("to.name"), doc.Get("to.address"), "??");
            var date = DateTools.StringToDate(doc.Get("date")).ToString();
            var linkText = $"{subject} ({from} -> {to} on {date})";

            // Details (link to open the file + all stored fields) are only rendered when expanded.
            Util.OnDemand(linkText, () =>
            {
                return Util.VerticalRun(
                    new Hyperlinq(() => Process.Start(fileName), fileName),
                    doc.GetFields().Select(f => new { f.Name, f.StringValue }).ToList());
            }).Dump();
        }
    }
}
/// <summary>
/// Returns the first entry of <paramref name="choices"/> that is neither null,
/// empty nor whitespace-only, or null when there is no such entry.
/// </summary>
string Best(params string[] choices)
{
    // Skip the leading blank candidates; whatever comes next (if anything) is the answer.
    var usable = choices.SkipWhile(string.IsNullOrWhiteSpace);
    return usable.FirstOrDefault();
}
/// <summary>
/// Rebuilds the Lucene index from scratch: deletes any existing index folder, then
/// parses every *.eml file under <c>workDir</c> (sub-folders included) in parallel
/// and adds one document per message.
/// </summary>
/// <param name="limit">
/// Optional maximum number of files to index; null indexes everything found.
/// </param>
/// <remarks>
/// A file that cannot be opened or parsed is skipped and listed at the end instead
/// of aborting the whole run, which would leave a partial index behind.
/// </remarks>
void Build(int? limit = null)
{
    if (System.IO.Directory.Exists(luceneDir))
    {
        System.IO.Directory.Delete(luceneDir, true);
    }
    System.IO.Directory.CreateDirectory(luceneDir);

    using (var dir = FSDirectory.Open(luceneDir))
    {
        var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
        using (var indexWriter = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED))
        {
            var files = System.IO.Directory.GetFiles(workDir, "*.eml", SearchOption.AllDirectories);
            if (limit != null)
            {
                files = files.Take(limit.Value).ToArray();
            }

            files.Length.Dump("Loading files:");
            Util.Progress = 0;

            int processed = 0;
            // Fully qualified so that no extra namespace import is needed in the query header.
            var failures = new System.Collections.Concurrent.ConcurrentQueue<string>();

            Parallel.ForEach(files, file =>
            {
                try
                {
                    using (var stream = File.OpenRead(file))
                    {
                        var parser = new MimeParser(stream);
                        var msg = parser.ParseMessage();
                        var text = GetText(msg);
                        AddDoc(file, msg, text, indexWriter);
                    }
                }
                catch (Exception ex) when (ex is System.FormatException
                                           || ex is System.IO.IOException
                                           || ex is System.UnauthorizedAccessException)
                {
                    // Unreadable or malformed message: remember it and keep indexing the rest.
                    failures.Enqueue($"{file}: {ex.Message}");
                }

                // Use the value returned by Increment; re-reading the shared counter would race.
                var done = Interlocked.Increment(ref processed);
                Util.Progress = (int)(done * 100.0 / files.Length);
            });

            Util.Progress = 100;
            files.Length.Dump("Loading DONE!");
            if (!failures.IsEmpty)
            {
                failures.ToList().Dump("Files skipped because they could not be read or parsed:");
            }

            indexWriter.Optimize();
        }
    }
}
/// <summary>
/// Extracts the plain text to index for a message.
/// </summary>
/// <param name="msg">The parsed message.</param>
/// <returns>
/// The text body when the message has no HTML body or the HTML contains no
/// non-blank text; otherwise the HTML's text nodes, one per line, entity-decoded.
/// Never null.
/// </returns>
string GetText(MimeMessage msg)
{
    if (msg.HtmlBody == null)
    {
        return msg.TextBody ?? "";
    }

    var htmlDoc = new HtmlAgilityPack.HtmlDocument();
    htmlDoc.LoadHtml(msg.HtmlBody);

    var textNodes = htmlDoc.DocumentNode.SelectNodes("//text()[normalize-space(.) != '']");
    if (textNodes == null)
    {
        return msg.TextBody ?? "";
    }

    var sb = new StringBuilder();
    foreach (HtmlNode node in textNodes)
    {
        // The XPath selects text nodes only, so the parent element tells us whether
        // the text is CSS or JavaScript rather than readable content.
        var parentName = node.ParentNode?.Name;
        if (parentName == "style" || parentName == "script")
        {
            continue;
        }
        sb.AppendLine(node.InnerText.Trim());
    }

    // Decode entities first, then turn non-breaking spaces (literal or from &nbsp;/&#160;)
    // into ordinary spaces.
    return WebUtility.HtmlDecode(sb.ToString()).Replace('\u00A0', ' ');
}
/// <summary>
/// Adds a stored field that is indexed as a single un-tokenized term (with term vector).
/// </summary>
/// <param name="name">Field name.</param>
/// <param name="value">Field value; null is stored as an empty string.</param>
/// <param name="doc">Document receiving the field.</param>
void AddNotAnalyzed(string name, string value, Document doc)
{
    var text = value ?? "";
    doc.Add(new Field(name, text, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.YES));
}
/// <summary>
/// Adds a stored field whose value is run through the analyzer (with term vector).
/// </summary>
/// <param name="name">Field name.</param>
/// <param name="value">Field value; null is stored as an empty string.</param>
/// <param name="doc">Document receiving the field.</param>
void AddAnalyzed(string name, string value, Document doc)
{
    var text = value ?? "";
    doc.Add(new Field(name, text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
}
/// <summary>
/// Indexes every address of <paramref name="addresses"/> under three fields:
/// <c>name</c> (full string form, not analyzed), <c>name.name</c> (display name,
/// analyzed) and, for mailbox addresses only, <c>name.address</c> (not analyzed).
/// </summary>
/// <param name="name">Field name prefix, e.g. "from" or "to".</param>
/// <param name="addresses">Addresses to index.</param>
/// <param name="doc">Document receiving the fields.</param>
void AddAddresses(string name, InternetAddressList addresses, Document doc)
{
    var nameField = name + ".name";
    var addressField = name + ".address";

    foreach (var entry in addresses)
    {
        AddNotAnalyzed(name, entry.ToString(), doc);
        AddAnalyzed(nameField, entry.Name, doc);

        // Only mailbox entries carry a concrete e-mail address.
        var mailbox = entry as MailboxAddress;
        if (mailbox == null)
        {
            continue;
        }
        AddNotAnalyzed(addressField, mailbox.Address, doc);
    }
}
/// <summary>
/// Builds the Lucene document for one message and adds it to the index.
/// </summary>
/// <param name="fileName">Path of the .eml file; stored in the "file_name" field.</param>
/// <param name="msg">The parsed message.</param>
/// <param name="text">Body text to index in the "content" field.</param>
/// <param name="writer">Index writer receiving the document.</param>
void AddDoc(string fileName, MimeMessage msg, string text, IndexWriter writer)
{
    var doc = new Document();

    AddAnalyzed("content", text, doc);
    AddAnalyzed("subject", msg.Subject, doc);

    AddAddresses("from", msg.From, doc);
    AddAddresses("to", msg.To, doc);
    AddAddresses("cc", msg.Cc, doc);
    AddAddresses("bcc", msg.Bcc, doc);

    AddNotAnalyzed("file_name", fileName, doc);
    // Stored in UTC at second resolution.
    AddNotAnalyzed("date", DateTools.DateToString(msg.Date.UtcDateTime, DateTools.Resolution.SECOND), doc);

    foreach (var attachment in msg.Attachments)
    {
        // MimePart.FileName also falls back to the Content-Type "name" parameter when
        // Content-Disposition has no file name; other entity kinds use the disposition only.
        var attachmentName = (attachment as MimePart)?.FileName
                             ?? attachment.ContentDisposition?.FileName;
        AddAnalyzed("attachment", attachmentName, doc);
    }

    writer.AddDocument(doc);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment