Created
March 5, 2022 19:03
-
-
Save jesuslpm/1d61e903b5efc379cbb09461e55d5e65 to your computer and use it in GitHub Desktop.
An accent- and case-insensitive analyzer for Lucene.NET 4.8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using Lucene.Net.Analysis; | |
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Text; | |
using System.Threading.Tasks; | |
using System.IO; | |
using Lucene.Net.Analysis.Util; | |
using Lucene.Net.Util; | |
using Lucene.Net.Analysis.Standard; | |
using Lucene.Net.Analysis.Core; | |
namespace Map.Lucene | |
{ | |
/// <summary>
/// Analyzer whose token chain is: <c>StandardTokenizer</c> → <c>StandardFilter</c> →
/// <c>LowerCaseFilter</c> → <c>AccentFoldingFilter</c>.
/// </summary>
public class AccentFoldingAnalyzer : Analyzer
{
    // Version handed to every version-aware component of the chain.
    private readonly LuceneVersion matchVersion;

    /// <summary>
    /// Creates the analyzer using <c>LuceneVersion.LUCENE_48</c>.
    /// </summary>
    public AccentFoldingAnalyzer()
        : this(LuceneVersion.LUCENE_48)
    {
    }

    /// <summary>
    /// Creates the analyzer for the given Lucene version.
    /// </summary>
    /// <param name="luceneVersion">Version passed to the tokenizer and the version-aware filters.</param>
    public AccentFoldingAnalyzer(LuceneVersion luceneVersion)
    {
        matchVersion = luceneVersion;
    }

    /// <summary>
    /// Builds the tokenizer and filter chain for one field.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed; not used by this implementation.</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>The tokenizer paired with the outermost filter of the chain.</returns>
    protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
    {
        Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);

        // Innermost filter runs first: standard filtering, then lower-casing, then accent folding.
        TokenStream chain = new AccentFoldingFilter(
            new LowerCaseFilter(
                matchVersion,
                new StandardFilter(matchVersion, tokenizer)));

        return new TokenStreamComponents(tokenizer, chain);
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using Lucene.Net.Analysis.TokenAttributes; | |
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Text; | |
using System.Threading.Tasks; | |
using Lucene.Net.Search; | |
using Lucene.Net.Analysis; | |
namespace Map.Lucene | |
{ | |
/// <summary>
/// Token filter that rewrites each token's term text with the result of
/// <c>RemoveDiacritics()</c> applied to it.
/// </summary>
public sealed class AccentFoldingFilter : TokenFilter
{
    // Term attribute shared with the wrapped stream; assigned once in the constructor.
    private readonly ICharTermAttribute termAttribute;

    /// <summary>
    /// Wraps <paramref name="input"/> so that its terms are accent-folded.
    /// </summary>
    /// <param name="input">The upstream token stream.</param>
    public AccentFoldingFilter(TokenStream input) : base(input)
    {
        // AddAttribute returns the already-registered instance or registers a new one,
        // whereas GetAttribute throws when the upstream stream has not registered it.
        termAttribute = this.AddAttribute<ICharTermAttribute>();
    }

    /// <summary>
    /// Advances the wrapped stream by one token and replaces the term text with
    /// its diacritic-free form.
    /// </summary>
    /// <returns><c>true</c> if a token was produced; <c>false</c> when the wrapped stream is exhausted.</returns>
    public override bool IncrementToken()
    {
        if (!this.m_input.IncrementToken())
        {
            return false;
        }

        string folded = termAttribute.ToString().RemoveDiacritics();
        termAttribute.SetEmpty().Append(folded);
        return true;
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Returns a copy of <paramref name="text"/> with all non-spacing marks removed.
/// </summary>
/// <remarks>
/// The text is decomposed (Form D) so that accents become separate code points,
/// every code point whose Unicode category is <c>NonSpacingMark</c> is dropped,
/// and the remainder is recomposed (Form C).
/// </remarks>
/// <param name="text">The text to process; must not be null.</param>
/// <returns>The text without non-spacing marks, in normalization form C.</returns>
public static string RemoveDiacritics(this string text)
{
    var decomposed = text.Normalize(NormalizationForm.FormD);
    var stringBuilder = new StringBuilder(decomposed.Length);

    var index = 0;
    while (index < decomposed.Length)
    {
        // Classify by code point rather than by UTF-16 unit: the (string, int) overload
        // reports the category of a whole surrogate pair, so non-spacing marks outside
        // the Basic Multilingual Plane are detected as well.
        var width = char.IsSurrogatePair(decomposed, index) ? 2 : 1;
        var unicodeCategory = CharUnicodeInfo.GetUnicodeCategory(decomposed, index);

        if (unicodeCategory != UnicodeCategory.NonSpacingMark)
        {
            stringBuilder.Append(decomposed, index, width);
        }

        index += width;
    }

    return stringBuilder.ToString().Normalize(NormalizationForm.FormC);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment