Created
July 17, 2011 01:54
-
-
Save jenwilson/1087018 to your computer and use it in GitHub Desktop.
ShingleFilter class in Lucene.Net - This version is not complete.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * ShingleFilter class for Lucene.Net (C#), ported from Java version 2.9.2 of org.apache.lucene.analysis.shingle.ShingleFilter.java
 * [Last updated: 07-16-2011]
 *
 * This version of the file is not complete. When run, it throws a System.SystemException wrapping a System.NullReferenceException:
 *
 * System.NullReferenceException: Object reference not set to an instance of an object.
 * at LuceneIndexer.ShingleFilter.FillShingleBuffer() in C:\Users\..\ShingleFilter.cs:line 381
 */
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
using System; | |
using System.Collections; | |
using System.Text; | |
using Lucene.Net.Analysis; | |
using Lucene.Net.Analysis.Standard; | |
using TermAttribute = Lucene.Net.Analysis.Tokenattributes.TermAttribute; | |
using OffsetAttribute = Lucene.Net.Analysis.Tokenattributes.OffsetAttribute; | |
using PositionIncrementAttribute = Lucene.Net.Analysis.Tokenattributes.PositionIncrementAttribute; | |
using TypeAttribute = Lucene.Net.Analysis.Tokenattributes.TypeAttribute; | |
using AttributeSource = Lucene.Net.Util.AttributeSource; | |
namespace LuceneIndexer
{
    /// <summary>
    /// A ShingleFilter constructs shingles (token n-grams) from a token stream.
    /// In other words, it creates combinations of tokens as a single token.
    /// <para>For example, the sentence "please divide this sentence into shingles"
    /// might be tokenized into shingles "please divide", "divide this",
    /// "this sentence", "sentence into", and "into shingles".</para>
    /// <para>This filter handles position increments &gt; 1 by inserting filler tokens
    /// (tokens with termtext "_"). It does not handle a position increment of 0.</para>
    /// </summary>
    public class ShingleFilter : TokenFilter
    {
        /// <summary>Filler token text used when a position increment is more than 1.</summary>
        public static readonly char[] FILLER_TOKEN = { '_' };

        /// <summary>Default maximum shingle size is 2.</summary>
        // NOTE(review): this is a mutable public static; it is left non-const so that
        // existing callers that assign it keep compiling.
        public static int DEFAULT_MAX_SHINGLE_SIZE = 2;

        /// <summary>The string used when joining adjacent tokens to form a shingle.</summary>
        public const string TOKEN_SEPARATOR = " ";

        /// <summary>Maximum shingle size (number of tokens).</summary>
        internal int maxShingleSize;

        private TermAttribute termAtt;
        private OffsetAttribute offsetAtt;
        private PositionIncrementAttribute posIncrAtt;
        private TypeAttribute typeAtt;

        /// <summary>Captured states of the most recent input tokens, oldest first.</summary>
        private ArrayList shingleBuf = new ArrayList();

        /// <summary>
        /// One builder per shingle length; <c>shingles[k]</c> accumulates the text of the
        /// first <c>k + 1</c> buffered tokens. Allocated by <see cref="SetMaxShingleSize"/>.
        /// </summary>
        private StringBuilder[] shingles;

        /// <summary>Token type assigned to emitted shingle tokens.</summary>
        private string tokenType = "shingle";

        /// <summary>By default, unigrams (individual tokens) are output as well as shingles.</summary>
        private bool outputUnigrams = true;

        /// <summary>State of the first buffered token while its shingles are being emitted; null when the buffer must be refilled.</summary>
        private AttributeSource.State nextToken;

        /// <summary>Index into <see cref="shingles"/> / <see cref="endOffsets"/> of the next token to emit.</summary>
        private int shingleBufferPosition;

        /// <summary><c>endOffsets[k]</c> is the end offset of the k-th buffered token.</summary>
        private int[] endOffsets;

        /// <summary>Number of filler tokens still to be produced before the pending input token.</summary>
        private int numFillerTokensToInsert;

        /// <summary>Saved state of the pending input token while filler tokens are produced in its place.</summary>
        private AttributeSource.State currentToken;

        /// <summary>True when an input token has been read but not yet handed out by <see cref="GetNextToken"/>.</summary>
        private bool hasCurrentToken;

        /// <summary>
        /// Constructs a ShingleFilter with the specified shingle size from the
        /// token stream <paramref name="input"/>.
        /// </summary>
        /// <param name="input">input stream</param>
        /// <param name="maxShingleSize">maximum shingle size produced by the filter</param>
        /// <exception cref="System.ArgumentException">if <paramref name="maxShingleSize"/> is less than 2</exception>
        public ShingleFilter(TokenStream input, int maxShingleSize) : base(input)
        {
            // Must go through the setter: it allocates the per-length StringBuilders.
            // Assigning the field directly leaves 'shingles' null, which makes
            // FillShingleBuffer fail with a NullReferenceException on first use.
            SetMaxShingleSize(maxShingleSize);
            termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
            offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
            posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
            typeAtt = (TypeAttribute)AddAttribute(typeof(TypeAttribute));
        }

        /// <summary>Constructs a ShingleFilter with the default shingle size.</summary>
        /// <param name="input">input stream</param>
        public ShingleFilter(TokenStream input) : this(input, DEFAULT_MAX_SHINGLE_SIZE)
        {
        }

        /// <summary>
        /// Constructs a ShingleFilter with the default shingle size and the specified
        /// token type for shingle tokens.
        /// </summary>
        /// <param name="input">input stream</param>
        /// <param name="tokenType">token type for shingle tokens</param>
        public ShingleFilter(TokenStream input, string tokenType) : this(input, DEFAULT_MAX_SHINGLE_SIZE)
        {
            this.tokenType = tokenType;
        }

        /// <summary>
        /// Sets the type of the shingle tokens produced by this filter
        /// (default: "shingle").
        /// </summary>
        /// <param name="tokenType">token type for shingle tokens</param>
        public void SetTokenType(String tokenType)
        {
            this.tokenType = tokenType;
        }

        /// <summary>
        /// Controls whether the output stream contains the input tokens (unigrams)
        /// as well as shingles (default: true).
        /// </summary>
        /// <param name="outputUnigrams">whether the output stream shall contain the input tokens</param>
        public void SetOutputUnigrams(bool outputUnigrams)
        {
            this.outputUnigrams = outputUnigrams;
        }

        /// <summary>
        /// Sets the maximum shingle size (default: 2) and (re)allocates the
        /// builders used to assemble shingle text.
        /// </summary>
        /// <param name="maxShingleSize">maximum size of output shingles</param>
        /// <exception cref="System.ArgumentException">if <paramref name="maxShingleSize"/> is less than 2</exception>
        public void SetMaxShingleSize(int maxShingleSize)
        {
            if (maxShingleSize < 2)
            {
                throw new System.ArgumentException("Max shingle size must be >= 2");
            }

            StringBuilder[] builders = new StringBuilder[maxShingleSize];
            for (int i = 0; i < builders.Length; i++)
            {
                builders[i] = new StringBuilder();
            }

            this.shingles = builders;
            this.maxShingleSize = maxShingleSize;
        }

        /// <summary>
        /// Empties the StringBuilders that are used for storing the output shingles.
        /// </summary>
        private void ClearShingles()
        {
            if (shingles == null)
            {
                return;
            }

            foreach (StringBuilder shingle in shingles)
            {
                shingle.Length = 0;
            }
        }

        /// <summary>
        /// Advances to the next output token: either the unigram at the head of the
        /// shingle buffer (when unigram output is enabled) or the next longer shingle
        /// starting at that token.
        /// </summary>
        /// <returns>true if a token was produced; false when the input is exhausted</returns>
        public override bool IncrementToken()
        {
            while (true)
            {
                if (nextToken == null && !FillShingleBuffer())
                {
                    return false;
                }

                // The oldest buffered token is the start of every shingle emitted this round.
                nextToken = (AttributeSource.State)shingleBuf[0];

                if (outputUnigrams)
                {
                    if (shingleBufferPosition == 0)
                    {
                        RestoreState(nextToken);
                        posIncrAtt.SetPositionIncrement(1);
                        shingleBufferPosition++;
                        return true;
                    }
                }
                else if (shingleBufferPosition % maxShingleSize == 0)
                {
                    // Unigrams are suppressed: skip slot 0.
                    shingleBufferPosition++;
                }

                if (shingleBufferPosition >= shingleBuf.Count)
                {
                    // Nothing left to emit for this head token; refill on the next pass.
                    nextToken = null;
                    shingleBufferPosition = 0;
                    continue;
                }

                RestoreState(nextToken);
                typeAtt.SetType(tokenType);
                offsetAtt.SetOffset(offsetAtt.StartOffset(), endOffsets[shingleBufferPosition]);

                StringBuilder buf = shingles[shingleBufferPosition];
                int termLength = buf.Length;
                char[] termBuffer = termAtt.TermBuffer();
                if (termBuffer.Length < termLength)
                {
                    termBuffer = termAtt.ResizeTermBuffer(termLength);
                }

                // Copy the shingle text INTO the attribute's own buffer (the Java
                // original uses StringBuffer.getChars). Re-assigning the local
                // 'termBuffer' variable would leave the attribute's text untouched.
                buf.CopyTo(0, termBuffer, 0, termLength);
                termAtt.SetTermLength(termLength);

                if (!outputUnigrams && shingleBufferPosition % maxShingleSize == 1)
                {
                    posIncrAtt.SetPositionIncrement(1);
                }
                else
                {
                    posIncrAtt.SetPositionIncrement(0);
                }

                shingleBufferPosition++;
                if (shingleBufferPosition == shingleBuf.Count)
                {
                    nextToken = null;
                    shingleBufferPosition = 0;
                }
                return true;
            }
        }

        /// <summary>
        /// Loads the next token from the input stream into the attribute state.
        /// If a token with position increment &gt; 1 is encountered, filler tokens
        /// are produced first, one per call, before the token itself.
        /// </summary>
        /// <returns>true if the attributes hold a token (real or filler); false at end of input</returns>
        /// <remarks>
        /// Exceptions are deliberately not wrapped here: the only caller,
        /// <see cref="FillShingleBuffer"/>, already wraps everything it sees.
        /// </remarks>
        private bool GetNextToken()
        {
            while (true)
            {
                if (numFillerTokensToInsert > 0)
                {
                    if (currentToken == null)
                    {
                        // First filler for this gap: remember the real token for later.
                        currentToken = CaptureState();
                    }
                    else
                    {
                        RestoreState(currentToken);
                    }
                    numFillerTokensToInsert--;
                    // A filler token occupies no space
                    offsetAtt.SetOffset(offsetAtt.StartOffset(), offsetAtt.StartOffset());
                    termAtt.SetTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.Length);
                    return true;
                }

                if (hasCurrentToken)
                {
                    if (currentToken != null)
                    {
                        // Fillers overwrote the attributes; bring the real token back.
                        RestoreState(currentToken);
                        currentToken = null;
                    }
                    hasCurrentToken = false;
                    return true;
                }

                if (!input.IncrementToken())
                {
                    return false;
                }
                hasCurrentToken = true;

                int positionIncrement = posIncrAtt.GetPositionIncrement();
                if (positionIncrement > 1)
                {
                    numFillerTokensToInsert = positionIncrement - 1;
                }
            }
        }

        /// <summary>
        /// Fills the shingle buffer with new tokens and rebuilds the shingle texts
        /// and end offsets for the buffered tokens.
        /// </summary>
        /// <returns>true if the buffer holds at least one token afterwards; false otherwise</returns>
        /// <exception cref="System.SystemException">
        /// wraps any exception raised while reading tokens; the cause is available
        /// through <c>InnerException</c>
        /// </exception>
        private bool FillShingleBuffer()
        {
            try
            {
                bool addedToken = false;

                /* Try to fill the shingle buffer. */
                do
                {
                    if (!GetNextToken())
                    {
                        break;
                    }

                    shingleBuf.Add(CaptureState());
                    if (shingleBuf.Count > maxShingleSize)
                    {
                        shingleBuf.RemoveAt(0);
                    }
                    addedToken = true;
                } while (shingleBuf.Count < maxShingleSize);

                if (shingleBuf.Count < 1)
                {
                    return false;
                }

                /*
                 * If no new token could be added to the shingle buffer, we have reached
                 * the end of the input stream and have to discard the least recent token.
                 */
                if (!addedToken)
                {
                    shingleBuf.RemoveAt(0);
                    if (shingleBuf.Count < 1)
                    {
                        return false;
                    }
                }

                ClearShingles();

                // A freshly allocated int[] is already zero-filled.
                endOffsets = new int[shingleBuf.Count];

                int tokenIndex = 0;
                foreach (AttributeSource.State state in shingleBuf)
                {
                    RestoreState(state);

                    // The token at tokenIndex contributes to every shingle of length > tokenIndex.
                    for (int j = tokenIndex; j < shingles.Length; j++)
                    {
                        if (shingles[j].Length != 0)
                        {
                            shingles[j].Append(TOKEN_SEPARATOR);
                        }
                        shingles[j].Append(termAtt.TermBuffer(), 0, termAtt.TermLength());
                    }

                    endOffsets[tokenIndex] = offsetAtt.EndOffset();
                    tokenIndex++;
                }

                return true;
            }
            catch (System.Exception e)
            {
                // Keep the wrapper type callers may already catch, but preserve the cause.
                throw new System.SystemException(e.Message, e);
            }
        }

        /// <summary>
        /// Deprecated: will be removed in Lucene 3.0. Delegates to the base-class
        /// implementation (the backwards compatibility layer).
        /// </summary>
        /// <param name="reusableToken">token instance passed through to the base implementation</param>
        /// <returns>whatever the base implementation returns</returns>
        /// <exception cref="System.SystemException">wraps any exception raised by the base implementation</exception>
        public Token Next(Token reusableToken)
        {
            try
            {
                return base.Next(reusableToken);
            }
            catch (System.Exception e)
            {
                throw new System.SystemException(e.Message, e);
            }
        }

        /// <summary>
        /// Deprecated: will be removed in Lucene 3.0. Delegates to the base-class
        /// implementation (the backwards compatibility layer).
        /// </summary>
        /// <returns>whatever the base implementation returns</returns>
        /// <exception cref="System.SystemException">wraps any exception raised by the base implementation</exception>
        public Token Next()
        {
            try
            {
                return base.Next();
            }
            catch (System.Exception e)
            {
                throw new System.SystemException(e.Message, e);
            }
        }

        /// <summary>
        /// Resets the underlying stream and discards all buffered tokens and
        /// pending filler state held by this filter.
        /// </summary>
        /// <exception cref="System.SystemException">wraps any exception raised while resetting</exception>
        public override void Reset()
        {
            try
            {
                base.Reset();
                nextToken = null;
                shingleBufferPosition = 0;
                shingleBuf.Clear();
                numFillerTokensToInsert = 0;
                currentToken = null;
                hasCurrentToken = false;
            }
            catch (System.Exception e)
            {
                throw new System.SystemException(e.Message, e);
            }
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment