Skip to content

Instantly share code, notes, and snippets.

@vbfox
Last active May 6, 2017 22:32
Show Gist options
  • Save vbfox/bc4111289c5416631a0caf9c3fe8e75d to your computer and use it in GitHub Desktop.
Save vbfox/bc4111289c5416631a0caf9c3fe8e75d to your computer and use it in GitHub Desktop.
LINQPad script to index & search a folder full of .eml files
<Query Kind="Program">
<NuGetReference>HtmlAgilityPack</NuGetReference>
<NuGetReference>Lucene.Net</NuGetReference>
<NuGetReference>MimeKit</NuGetReference>
<Namespace>HtmlAgilityPack</Namespace>
<Namespace>Lucene.Net</Namespace>
<Namespace>Lucene.Net.Analysis</Namespace>
<Namespace>Lucene.Net.Analysis.Standard</Namespace>
<Namespace>Lucene.Net.Analysis.Tokenattributes</Namespace>
<Namespace>Lucene.Net.Documents</Namespace>
<Namespace>Lucene.Net.Index</Namespace>
<Namespace>Lucene.Net.Messages</Namespace>
<Namespace>Lucene.Net.QueryParsers</Namespace>
<Namespace>Lucene.Net.Search</Namespace>
<Namespace>Lucene.Net.Search.Function</Namespace>
<Namespace>Lucene.Net.Search.Payloads</Namespace>
<Namespace>Lucene.Net.Search.Spans</Namespace>
<Namespace>Lucene.Net.Store</Namespace>
<Namespace>Lucene.Net.Support</Namespace>
<Namespace>Lucene.Net.Support.Compatibility</Namespace>
<Namespace>Lucene.Net.Util</Namespace>
<Namespace>Lucene.Net.Util.Cache</Namespace>
<Namespace>MimeKit</Namespace>
<Namespace>MimeKit.Cryptography</Namespace>
<Namespace>MimeKit.Encodings</Namespace>
<Namespace>MimeKit.IO</Namespace>
<Namespace>MimeKit.IO.Filters</Namespace>
<Namespace>MimeKit.Text</Namespace>
<Namespace>MimeKit.Tnef</Namespace>
<Namespace>MimeKit.Utils</Namespace>
<Namespace>System.Net</Namespace>
<Namespace>System.Threading.Tasks</Namespace>
</Query>
// Root folder that Build scans (including sub-folders) for *.eml files.
static string workDir = @"C:\Mails";
// Folder holding the Lucene index: a ".lucene" sub-folder of workDir.
static string luceneDir = Path.Combine(workDir, ".lucene");
// Script entry point: builds the index on first use (when the index folder
// does not exist yet), then prompts for a query and runs it.
void Main()
{
    var indexFolderExists = System.IO.Directory.Exists(luceneDir);
    if (!indexFolderExists)
    {
        Build();
    }

    var term = Util.ReadLine("Search ?");
    Search(term);
}
// Sender addresses to hide from search results: Search adds each entry as a
// MUST_NOT term on the "from.address" field. Empty as written here.
string[] excludeFrom = new string[] {
};
/// <summary>
/// Parses <paramref name="term"/> as a Lucene query over every field present in the
/// index stored in <c>luceneDir</c>, excludes senders listed in <c>excludeFrom</c>,
/// and dumps up to 1000 hits as on-demand expandable result lines.
/// </summary>
/// <param name="term">
/// Query text in Lucene query-parser syntax. Null or blank input is reported and ignored.
/// </param>
void Search(string term)
{
    if (string.IsNullOrWhiteSpace(term))
    {
        // The query parser rejects empty input, so report it instead of throwing.
        "Nothing to search for.".Dump();
        return;
    }

    using (var dir = FSDirectory.Open(luceneDir))
    using (var indexReader = IndexReader.Open(dir, true))
    // Share the already-open read-only reader instead of letting the searcher open a second one.
    using (var searcher = new Lucene.Net.Search.IndexSearcher(indexReader))
    {
        // Search every indexed field; a few fields get a higher weight than the default 1.0.
        var fieldNames = indexReader.GetFieldNames(IndexReader.FieldOption.ALL);
        var boosts = fieldNames.ToDictionary(x => x, x => 1.0f);
        boosts["subject"] = 2.0f;
        boosts["to.name"] = 1.5f;
        boosts["from.name"] = 1.5f;
        boosts["content"] = 1.1f;

        var queryParser = new MultiFieldQueryParser(
            Lucene.Net.Util.Version.LUCENE_30,
            fieldNames.ToArray(),
            new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30),
            boosts);
        var searchQuery = queryParser.Parse(term);

        // The user query must match; messages from any excluded sender must not.
        var fullQuery = new BooleanQuery();
        fullQuery.Add(searchQuery, Occur.MUST);
        foreach (var excludedFrom in excludeFrom)
        {
            fullQuery.Add(new TermQuery(new Term("from.address", excludedFrom)), Occur.MUST_NOT);
        }

        var hits = searcher.Search(fullQuery, 1000);
        $"{hits.TotalHits} hits, displaying {hits.ScoreDocs.Length}".Dump();

        foreach (var scoreDoc in hits.ScoreDocs)
        {
            var doc = searcher.Doc(scoreDoc.Doc);
            var fileName = doc.Get("file_name");
            var subject = doc.Get("subject");
            if (string.IsNullOrWhiteSpace(subject))
            {
                subject = "<no subject>";
            }

            // Prefer the display name, then the raw address, then a placeholder.
            var from = Best(doc.Get("from.name"), doc.Get("from.address"), "??");
            var to = Best(doc.Get("to.name"), doc.Get("to.address"), "??");
            var date = DateTools.StringToDate(doc.Get("date")).ToString();
            var linkText = $"{subject} ({from} -> {to} on {date})";

            // Expanding a result shows a link that opens the .eml file plus all stored fields.
            Util.OnDemand(linkText, () =>
            {
                return Util.VerticalRun(
                    new Hyperlinq(() => Process.Start(fileName), fileName),
                    doc.GetFields().Select(f => new { f.Name, f.StringValue }).ToList());
            }).Dump();
        }
    }
}
// Returns the first candidate that is neither null, empty nor whitespace-only,
// or null when no candidate qualifies.
string Best(params string[] choices)
{
    Func<string, bool> hasContent = candidate => !string.IsNullOrWhiteSpace(candidate);
    return choices.Where(hasContent).FirstOrDefault();
}
/// <summary>
/// Recreates the Lucene index in <c>luceneDir</c> from every *.eml file found under
/// <c>workDir</c> (sub-folders included). Files are parsed and indexed in parallel.
/// </summary>
/// <param name="limit">
/// Optional cap on the number of files to index; <c>null</c> indexes all of them.
/// </param>
void Build(int? limit = null)
{
    // Always start from an empty index folder.
    if (System.IO.Directory.Exists(luceneDir))
    {
        System.IO.Directory.Delete(luceneDir, true);
    }
    System.IO.Directory.CreateDirectory(luceneDir);

    using (var dir = FSDirectory.Open(luceneDir))
    {
        var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
        using (var indexWriter = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED))
        {
            var files = System.IO.Directory.GetFiles(workDir, "*.eml", SearchOption.AllDirectories);
            if (limit != null)
            {
                files = files.Take(limit.Value).ToArray();
            }
            files.Length.Dump("Loading files:");
            Util.Progress = 0;

            int processed = 0;
            // Fully qualified because System.Collections.Concurrent is not among the script's namespaces.
            var skipped = new System.Collections.Concurrent.ConcurrentQueue<string>();

            Parallel.ForEach(files, file =>
            {
                try
                {
                    using (var stream = File.OpenRead(file))
                    {
                        var parser = new MimeParser(stream);
                        var msg = parser.ParseMessage();
                        var text = GetText(msg);
                        AddDoc(file, msg, text, indexWriter);
                    }
                }
                catch (Exception ex) when (ex is FormatException || ex is IOException)
                {
                    // One unreadable or malformed message must not abort the whole build
                    // and leave a partial index behind; remember it and carry on.
                    skipped.Enqueue($"{file}: {ex.Message}");
                }

                // Use the value returned by the atomic increment; re-reading the shared
                // counter here could observe another thread's update.
                var done = Interlocked.Increment(ref processed);
                Util.Progress = (int)(done * 100.0 / files.Length);
            });

            Util.Progress = 100;
            files.Length.Dump("Loading DONE!");
            if (!skipped.IsEmpty)
            {
                skipped.ToArray().Dump("Skipped files (could not be read or parsed):");
            }
            indexWriter.Optimize();
        }
    }
}
/// <summary>
/// Extracts the plain text used for indexing a message. When the message has an HTML
/// body, the non-blank text nodes of that HTML are joined one per line (skipping CSS and
/// script content) and HTML entities are decoded; otherwise the plain-text body is used.
/// </summary>
/// <param name="msg">The parsed message.</param>
/// <returns>The extracted text; an empty string when the message has no usable body.</returns>
string GetText(MimeMessage msg)
{
    if (msg.HtmlBody == null)
    {
        return msg.TextBody ?? "";
    }

    var htmlDoc = new HtmlAgilityPack.HtmlDocument();
    htmlDoc.LoadHtml(msg.HtmlBody);
    var textNodes = htmlDoc.DocumentNode.SelectNodes("//text()[normalize-space(.) != '']");
    if (textNodes == null)
    {
        // No text nodes matched in the HTML: fall back to the plain-text body.
        return msg.TextBody ?? "";
    }

    var sb = new StringBuilder();
    foreach (HtmlNode node in textNodes)
    {
        // The text inside <style> and <script> elements is code, not message content.
        var parentName = node.ParentNode?.Name;
        if (node.Name == "style" || node.Name == "script"
            || parentName == "style" || parentName == "script")
        {
            continue;
        }
        sb.AppendLine(node.InnerText.Trim());
    }

    // Replace non-breaking-space entities before decoding so they end up as ordinary spaces.
    sb.Replace("&nbsp;", " ");
    return WebUtility.HtmlDecode(sb.ToString());
}
// Adds a stored field with term vectors that is indexed as a single, un-tokenized
// term (NOT_ANALYZED). A null value is stored as an empty string.
void AddNotAnalyzed(string name, string value, Document doc)
{
    doc.Add(new Field(
        name,
        value ?? "",
        Field.Store.YES,
        Field.Index.NOT_ANALYZED,
        Field.TermVector.YES));
}
// Adds a stored field with term vectors whose value is run through the analyzer
// (ANALYZED). A null value is stored as an empty string.
void AddAnalyzed(string name, string value, Document doc)
{
    doc.Add(new Field(
        name,
        value ?? "",
        Field.Store.YES,
        Field.Index.ANALYZED,
        Field.TermVector.YES));
}
// Indexes every address of a header under three fields:
//   <name>          the full address string, un-tokenized
//   <name>.name     the display name, analyzed
//   <name>.address  the bare e-mail address, un-tokenized (mailbox addresses only)
void AddAddresses(string name, InternetAddressList addresses, Document doc)
{
    var displayNameField = name + ".name";
    var mailboxField = name + ".address";

    foreach (var entry in addresses)
    {
        AddNotAnalyzed(name, entry.ToString(), doc);
        AddAnalyzed(displayNameField, entry.Name, doc);

        // Only mailbox addresses expose an Address; other kinds of entry are skipped here.
        var mailbox = entry as MailboxAddress;
        if (mailbox == null)
        {
            continue;
        }
        AddNotAnalyzed(mailboxField, mailbox.Address, doc);
    }
}
/// <summary>
/// Builds one Lucene document for a message and adds it to the index.
/// </summary>
/// <param name="fileName">Path of the .eml file, stored in the "file_name" field.</param>
/// <param name="msg">The parsed message supplying subject, addresses, date and attachments.</param>
/// <param name="text">Body text to index in the "content" field.</param>
/// <param name="writer">Index writer that receives the document.</param>
void AddDoc(string fileName, MimeMessage msg, string text, IndexWriter writer)
{
    var doc = new Document();
    AddAnalyzed("content", text, doc);
    AddAnalyzed("subject", msg.Subject, doc);
    AddAddresses("from", msg.From, doc);
    AddAddresses("to", msg.To, doc);
    AddAddresses("cc", msg.Cc, doc);
    AddAddresses("bcc", msg.Bcc, doc);
    AddNotAnalyzed("file_name", fileName, doc);
    // The date is stored as a UTC string at second resolution; Search converts it back.
    AddNotAnalyzed("date", DateTools.DateToString(msg.Date.UtcDateTime, DateTools.Resolution.SECOND), doc);

    foreach (var attachment in msg.Attachments)
    {
        // Some mailers put the file name only in the Content-Type "name" parameter,
        // so fall back to it when Content-Disposition carries no file name.
        var attachmentName = attachment.ContentDisposition?.FileName ?? attachment.ContentType?.Name;
        AddAnalyzed("attachment", attachmentName, doc);
    }

    writer.AddDocument(doc);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment