Created
August 15, 2019 02:41
-
-
Save sebastienros/a2480183611861e75896e88cb0adc316 to your computer and use it in GitHub Desktop.
Sorting Unicode text with Lucene
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| using ICU4N.Text; | |
| using Lucene.Net.Analysis.Fr; | |
| using Lucene.Net.Analysis.Standard; | |
| using Lucene.Net.Analysis.TokenAttributes; | |
| using Lucene.Net.Collation; | |
| using Lucene.Net.Documents; | |
| using Lucene.Net.Index; | |
| using Lucene.Net.Search; | |
| using Lucene.Net.Store; | |
| using Lucene.Net.Util; | |
| using System; | |
| using System.Globalization; | |
| namespace LuceneSort | |
| { | |
/// <summary>
/// Demonstrates locale-aware (French) sorting of indexed text in Lucene.NET
/// by analyzing a field with <see cref="ICUCollationKeyAnalyzer"/>, so that
/// sorting on that field follows collation order rather than code-point order.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        // Pin the index format version for backwards compatibility.
        var appLuceneVersion = LuceneVersion.LUCENE_48;

        // Start from a clean index directory on every run.
        var indexLocation = @"C:\temp\luceneindex";
        if (System.IO.Directory.Exists(indexLocation))
        {
            System.IO.Directory.Delete(indexLocation, true);
        }

        using var dir = FSDirectory.Open(indexLocation);

        // Collation-keyed analyzer: each term is replaced by its French
        // collation key, so index-time sort order matches French rules
        // (e.g. "É" groups with "e" instead of sorting after "z").
        var collator = Collator.GetInstance(new CultureInfo("fr"));
        using var analyzer = new ICUCollationKeyAnalyzer(appLuceneVersion, collator);

        #region Testing the tokenizer
        Console.WriteLine("/// Testing the tokenizer");
        // Consume the stream inside a using block so End()/Dispose() run even
        // if IncrementToken throws; the canonical consume pattern is
        // Reset -> IncrementToken* -> End -> Dispose.
        using (var tokenStream = analyzer.GetTokenStream("random", "a la maison de mon pere"))
        {
            var charTermAttribute = tokenStream.AddAttribute<ICharTermAttribute>();
            tokenStream.Reset();
            while (tokenStream.IncrementToken())
            {
                Console.WriteLine("token: {0}", charTermAttribute.ToString());
            }
            tokenStream.End();
        }
        Console.WriteLine("///");
        #endregion

        // Create an index writer over the collation analyzer.
        var indexConfig = new IndexWriterConfig(appLuceneVersion, analyzer);
        using var writer = new IndexWriter(dir, indexConfig);

        var sentences = new[] { "A", "C", "a", "F", "É", "e", "b" };
        foreach (var sentence in sentences)
        {
            var doc = new Document();
            // TextField is analyzed (collation keys); StringField indexes
            // the raw value verbatim and is used only for display here.
            doc.Add(new TextField("textfield", sentence, Field.Store.YES));
            doc.Add(new StringField("stringfield", sentence, Field.Store.YES));
            writer.AddDocument(doc);
        }
        writer.Flush(triggerMerge: false, applyAllDeletes: false);

        // Match every document and sort on the collation-keyed field.
        var phrase = new MatchAllDocsQuery();

        // Near-real-time reader from the live writer; must be disposed,
        // and the searcher is just a view over it.
        using var reader = writer.GetReader(applyAllDeletes: true);
        var searcher = new IndexSearcher(reader);
        var searchField = "textfield";
        var hits = searcher.Search(phrase, 20 /* top 20 */, new Sort(new SortField(searchField, SortFieldType.STRING))).ScoreDocs;
        foreach (var hit in hits)
        {
            var foundDoc = searcher.Doc(hit.Doc);
            Console.WriteLine(foundDoc.Get("stringfield"));
        }
    }
}
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment