Skip to content

Instantly share code, notes, and snippets.

@sebastienros
Created August 15, 2019 02:41
Show Gist options
  • Save sebastienros/a2480183611861e75896e88cb0adc316 to your computer and use it in GitHub Desktop.
Sorting Unicode text with Lucene
using ICU4N.Text;
using Lucene.Net.Analysis.Fr;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Collation;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Lucene.Net.Util;
using System;
using System.Globalization;
namespace LuceneSort
{
class Program
{
    /// <summary>
    /// Demonstrates sorting Unicode text with Lucene.Net by indexing with a
    /// collation-key analyzer (French collation) and sorting search results
    /// on the collated field, so accents/case sort linguistically
    /// (a/A, b, C, e/É/e, F) instead of by raw code point.
    /// </summary>
    static void Main(string[] args)
    {
        // Pin the index format version for backwards compatibility.
        var appLuceneVersion = LuceneVersion.LUCENE_48;

        // Start from a clean index directory on every run of this demo.
        var indexLocation = @"C:\temp\luceneindex";
        if (System.IO.Directory.Exists(indexLocation))
        {
            System.IO.Directory.Delete(indexLocation, true);
        }

        // FSDirectory, analyzers, the writer and reader all hold native/file
        // resources; dispose them deterministically (the original leaked all of them).
        using (var dir = FSDirectory.Open(indexLocation))
        // Collation-aware analyzer: tokens are replaced by ICU collation keys,
        // so a STRING sort over the indexed terms follows French collation order.
        using (var analyzer = new ICUCollationKeyAnalyzer(
            appLuceneVersion,
            Collator.GetInstance(new CultureInfo("fr"))))
        {
            #region Testing the tokenizer
            Console.WriteLine("/// Testing the tokenizer");
            using (var tokenStream = analyzer.GetTokenStream("random", "a la maison de mon pere"))
            {
                var charTermAttribute = tokenStream.AddAttribute<ICharTermAttribute>();
                tokenStream.Reset();
                while (tokenStream.IncrementToken())
                {
                    // NOTE: the "token" printed here is the binary collation
                    // key, not human-readable text.
                    Console.WriteLine("token: {0}", charTermAttribute.ToString());
                }
                tokenStream.End();
            }
            Console.WriteLine("///");
            #endregion

            // Index a handful of single-character documents to observe sort order.
            var indexConfig = new IndexWriterConfig(appLuceneVersion, analyzer);
            using (var writer = new IndexWriter(dir, indexConfig))
            {
                var sentences = new[] { "A", "C", "a", "F", "É", "e", "b" };
                foreach (var sentence in sentences)
                {
                    var doc = new Document();
                    // TextField is analyzed (stores collation keys);
                    // StringField indexes but doesn't tokenise, so it keeps the raw value for display.
                    doc.Add(new TextField("textfield", sentence, Field.Store.YES));
                    doc.Add(new StringField("stringfield", sentence, Field.Store.YES));
                    writer.AddDocument(doc);
                }
                writer.Flush(triggerMerge: false, applyAllDeletes: false);

                // Match every document and sort on the collated field.
                var phrase = new MatchAllDocsQuery();
                // Re-use the writer's near-real-time reader so just-flushed
                // documents are visible without a full commit.
                using (var reader = writer.GetReader(applyAllDeletes: true))
                {
                    var searcher = new IndexSearcher(reader);
                    var searchField = "textfield";
                    var hits = searcher.Search(
                        phrase,
                        20 /* top 20 */,
                        new Sort(new SortField(searchField, SortFieldType.STRING))).ScoreDocs;
                    foreach (var hit in hits)
                    {
                        var foundDoc = searcher.Doc(hit.Doc);
                        // Print the stored raw value, not the collation key.
                        Console.WriteLine(foundDoc.Get("stringfield"));
                    }
                }
            }
        }
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment