-
-
Save jalchr/5326545 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Diagnostics; | |
using System.IO; | |
using System.Linq; | |
using Lucene.Net.Analysis; | |
using Lucene.Net.Analysis.Standard; | |
using Lucene.Net.Analysis.Tokenattributes; | |
using Raven.Abstractions.Indexing; | |
using Raven.Client; | |
using Raven.Client.Indexes; | |
using Raven.Tests.Helpers; | |
using Xunit; | |
namespace RavenTests | |
{ | |
public class NGramTest : RavenTestBase
{
    public class User
    {
        public string Name { get; set; }
    }

    public class UsersIndex : AbstractIndexCreationTask<User>
    {
        public UsersIndex()
        {
            // Map only the Name field; analyze it with the custom n-gram
            // analyzer so substring searches can match indexed terms.
            Map = users => from user in users
                           select new
                           {
                               user.Name
                           };
            Index(x => x.Name, FieldIndexing.Analyzed);
            Analyze(x => x.Name, typeof(NGramAnalyzer).AssemblyQualifiedName);
        }
    }

    [Fact]
    public void Test()
    {
        using (var documentStore = NewDocumentStore())
        {
            documentStore.ExecuteIndex(new UsersIndex());

            // Store a single document whose name should be reachable
            // through every one of the n-gram fragments below.
            using (var session = documentStore.OpenSession())
            {
                session.Store(new User { Name = "Matt Johnson" });
                session.SaveChanges();
            }

            WaitForIndexing(documentStore);

            using (var session = documentStore.OpenSession())
            {
                var searchValues = new[] { "ma", "mat", "att", "jo", "joh", "son" };
                var allPassed = true;

                foreach (var term in searchValues)
                {
                    var hits = session.Query<User, UsersIndex>().Search(x => x.Name, term).ToList();
                    var matched = hits.Count == 1;
                    Debug.WriteLine("\"{0}\" : {1}", term, matched ? "Pass" : "Fail");
                    allPassed &= matched;
                }

                Assert.True(allPassed);
            }
        }
    }
}
[NotForQuerying]
public class NGramAnalyzer : Analyzer
{
    /// <summary>
    /// Builds the analysis pipeline: standard tokenizer, standard filter,
    /// lower-casing, English stop-word removal, then n-grams of 2 to 6 chars.
    /// </summary>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        var source = new StandardTokenizer(Lucene.Net.Util.Version.LUCENE_30, reader)
        {
            MaxTokenLength = 255
        };

        TokenStream stream = new StandardFilter(source);
        stream = new LowerCaseFilter(stream);
        // false: do not preserve position increments for removed stop words.
        stream = new StopFilter(false, stream, StandardAnalyzer.STOP_WORDS_SET);
        return new NGramTokenFilter(stream, 2, 6);
    }
}
/// <summary>
/// A <see cref="TokenFilter"/> that emits every character n-gram of each input
/// token, from <c>minGram</c> up to <c>maxGram</c> characters long. Grams are
/// produced smallest size first, left to right within each size (Lucene 3.x
/// ordering). Tokens shorter than <c>minGram</c> produce no output.
/// </summary>
public sealed class NGramTokenFilter : TokenFilter
{
    // CA2211: these were mutable "public static int" fields, which any caller
    // could reassign at runtime; const keeps them source-compatible for all
    // readers while preventing accidental reassignment.
    public const int DefaultMinNgramSize = 1;
    public const int DefaultMaxNgramSize = 2;

    private readonly int _maxGram;
    private readonly int _minGram;
    private readonly IOffsetAttribute _offsetAtt;
    private readonly ITermAttribute _termAtt;

    // Iteration state over the token currently being expanded into grams.
    private int _curGramSize;       // size of the grams currently being emitted
    private int _curPos;            // start offset of the next gram within the term
    private char[] _curTermBuffer;  // null => fetch the next token from the input
    private int _curTermLength;
    private int _tokStart;          // start offset of the current token in the source text

    /// <summary>
    /// Creates an NGramTokenFilter with the given min and max n-gram sizes.
    /// </summary>
    /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
    /// <param name="minGram">the smallest n-gram to generate; must be at least 1</param>
    /// <param name="maxGram">the largest n-gram to generate; must be &gt;= <paramref name="minGram"/></param>
    /// <exception cref="ArgumentException">when the size constraints are violated</exception>
    public NGramTokenFilter(TokenStream input, int minGram, int maxGram)
        : base(input)
    {
        if (minGram < 1)
        {
            throw new ArgumentException("minGram must be greater than zero", "minGram");
        }
        if (minGram > maxGram)
        {
            throw new ArgumentException("minGram must not be greater than maxGram", "minGram");
        }

        _minGram = minGram;
        _maxGram = maxGram;
        _termAtt = AddAttribute<ITermAttribute>();
        _offsetAtt = AddAttribute<IOffsetAttribute>();
    }

    /// <summary>
    /// Creates an NGramTokenFilter with the default min and max n-gram sizes
    /// (<see cref="DefaultMinNgramSize"/>, <see cref="DefaultMaxNgramSize"/>).
    /// </summary>
    /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
    public NGramTokenFilter(TokenStream input)
        : this(input, DefaultMinNgramSize, DefaultMaxNgramSize)
    {
    }

    /// <summary>
    /// Advances to the next n-gram, pulling a new token from the wrapped
    /// stream when the current one is exhausted.
    /// </summary>
    /// <returns>true if a gram was emitted; false at end of stream</returns>
    public override bool IncrementToken()
    {
        while (true)
        {
            // Fetch and snapshot the next source token when needed.
            if (_curTermBuffer == null)
            {
                if (!input.IncrementToken())
                {
                    return false; // end of the underlying stream
                }

                // Clone: the attribute's buffer is reused by upstream filters.
                _curTermBuffer = (char[])_termAtt.TermBuffer().Clone();
                _curTermLength = _termAtt.TermLength();
                _curGramSize = _minGram;
                _curPos = 0;
                _tokStart = _offsetAtt.StartOffset;
            }

            // Emit grams of the current size left-to-right, then grow the size.
            while (_curGramSize <= _maxGram)
            {
                if (_curPos + _curGramSize <= _curTermLength)
                {
                    ClearAttributes();
                    _termAtt.SetTermBuffer(_curTermBuffer, _curPos, _curGramSize);
                    _offsetAtt.SetOffset(_tokStart + _curPos, _tokStart + _curPos + _curGramSize);
                    _curPos++;
                    return true;
                }

                _curGramSize++; // no more grams of this size fit; try the next size
                _curPos = 0;
            }

            // Current token fully consumed; loop around to fetch the next one.
            _curTermBuffer = null;
        }
    }

    /// <summary>Resets the filter so the stream can be consumed again.</summary>
    public override void Reset()
    {
        base.Reset();
        _curTermBuffer = null; // forces IncrementToken to pull a fresh token
    }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment