Skip to content

Instantly share code, notes, and snippets.

@jenwilson
Created July 17, 2011 01:54
Show Gist options
  • Save jenwilson/1087018 to your computer and use it in GitHub Desktop.
ShingleFilter class in Lucene.Net - This version is not complete.
/**
* ShingleFilter class for Lucene.Net C# ported from Java version 2.9.2 org.apache.lucene.analysis.shingle.ShingleFilter.java
* [Last updated: 07-16-2011]
*
* This version of the file is not complete. It produces System.SystemException: System.NullReferenceException when run.
*
* System.NullReferenceException: Object reference not set to an instance of an object.
* at LuceneIndexer.ShingleFilter.FillShingleBuffer() in C:\Users\..\ShingleFilter.cs:line 381 *
*
*
*
*/
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
using System;
using System.Collections;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using TermAttribute = Lucene.Net.Analysis.Tokenattributes.TermAttribute;
using OffsetAttribute = Lucene.Net.Analysis.Tokenattributes.OffsetAttribute;
using PositionIncrementAttribute = Lucene.Net.Analysis.Tokenattributes.PositionIncrementAttribute;
using TypeAttribute = Lucene.Net.Analysis.Tokenattributes.TypeAttribute;
using AttributeSource = Lucene.Net.Util.AttributeSource;
/// <summary>A ShingleFilter constructs shingles (token n-grams) from a token stream.
/// In other words, it creates combinations of tokens as a single token.
///
/// <p>For example, the sentence "please divide this sentence into shingles"
/// might be tokenized into shingles "please divide", "divide this",
/// "this sentence", "sentence into", and "into shingles".
///
/// <p>This filter handles position increments > 1 by inserting filler tokens
/// (tokens with termtext "_"). It does not handle a position increment of 0.
/// </summary>
///
namespace LuceneIndexer
{
public class ShingleFilter : TokenFilter
{
    /// <summary>Filler token text used when a position increment greater than 1 is seen.</summary>
    public static readonly char[] FILLER_TOKEN = { '_' };

    /// <summary>Default maximum shingle size is 2.</summary>
    public static int DEFAULT_MAX_SHINGLE_SIZE = 2;

    /// <summary>The string to use when joining adjacent tokens to form a shingle.</summary>
    public const string TOKEN_SEPARATOR = " ";

    /// <summary>Maximum shingle size (number of tokens per shingle).</summary>
    internal int maxShingleSize;

    private TermAttribute termAtt;
    private OffsetAttribute offsetAtt;
    private PositionIncrementAttribute posIncrAtt;
    private TypeAttribute typeAtt;

    // FIFO buffer of captured attribute states for the tokens currently being shingled
    // (equivalent of the Java version's LinkedList<AttributeSource.State>).
    private ArrayList shingleBuf = new ArrayList();

    // shingles[i] accumulates the text of the shingle that occupies buffer slot i.
    // Allocated by SetMaxShingleSize().
    private StringBuilder[] shingles;

    // Token type assigned to emitted shingle tokens (default: "shingle").
    private string tokenType = "shingle";

    // By default, we output unigrams (individual tokens) as well as shingles (token n-grams).
    private bool outputUnigrams = true;

    // State of the next token to emit, or null when the shingle buffer must be refilled.
    private AttributeSource.State nextToken;

    // Index into shingleBuf/shingles of the next shingle to emit.
    private int shingleBufferPosition;

    // End offset of each shingle currently in the buffer, parallel to shingleBuf.
    private int[] endOffsets;

    // Number of "_" filler tokens still owed (position increment > 1 on the last input token).
    private int numFillerTokensToInsert;

    // Captured state of the real token being held back while fillers are emitted.
    private AttributeSource.State currentToken;

    // True when a real input token has been read but not yet surrendered by GetNextToken().
    private bool hasCurrentToken;

    /// <summary>
    /// Constructs a ShingleFilter with the specified shingle size from the
    /// <see cref="TokenStream"/> <paramref name="input"/>.
    /// </summary>
    /// <param name="input">input stream</param>
    /// <param name="maxShingleSize">maximum shingle size produced by the filter (must be >= 2)</param>
    /// <exception cref="System.ArgumentException">if <paramref name="maxShingleSize"/> is less than 2</exception>
    public ShingleFilter(TokenStream input, int maxShingleSize) : base(input)
    {
        // BUGFIX: the original port assigned this.maxShingleSize directly, which left the
        // "shingles" StringBuilder array null and caused the NullReferenceException in
        // FillShingleBuffer() noted in the file header. SetMaxShingleSize() both validates
        // the size and allocates the array, exactly as the Java original's constructor
        // (which calls setMaxShingleSize()) does.
        SetMaxShingleSize(maxShingleSize);
        termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
        offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
        posIncrAtt = (PositionIncrementAttribute)AddAttribute(typeof(PositionIncrementAttribute));
        typeAtt = (TypeAttribute)AddAttribute(typeof(TypeAttribute));
    }

    /// <summary>
    /// Construct a ShingleFilter with default shingle size.
    /// </summary>
    /// <param name="input">input stream</param>
    public ShingleFilter(TokenStream input) : this(input, DEFAULT_MAX_SHINGLE_SIZE)
    {
    }

    /// <summary>
    /// Construct a ShingleFilter with the specified token type for shingle tokens.
    /// </summary>
    /// <param name="input">input stream</param>
    /// <param name="tokenType">token type for shingle tokens</param>
    public ShingleFilter(TokenStream input, string tokenType) : this(input, DEFAULT_MAX_SHINGLE_SIZE)
    {
        this.tokenType = tokenType;
    }

    /// <summary>
    /// Set the type of the shingle tokens produced by this filter.
    /// (default: "shingle")
    /// </summary>
    /// <param name="tokenType">token type for shingle tokens</param>
    public void SetTokenType(String tokenType)
    {
        this.tokenType = tokenType;
    }

    /// <summary>
    /// Shall the output stream contain the input tokens (unigrams) as well as
    /// shingles? (default: true.)
    /// </summary>
    /// <param name="outputUnigrams">Whether or not the output stream shall contain
    /// the input tokens (unigrams)</param>
    public void SetOutputUnigrams(bool outputUnigrams)
    {
        this.outputUnigrams = outputUnigrams;
    }

    /// <summary>
    /// Set the max shingle size (default: 2), (re)allocating the per-slot
    /// StringBuilder accumulators.
    /// </summary>
    /// <param name="maxShingleSize">max size of output shingles</param>
    /// <exception cref="System.ArgumentException">if <paramref name="maxShingleSize"/> is less than 2</exception>
    public void SetMaxShingleSize(int maxShingleSize)
    {
        if (maxShingleSize < 2)
        {
            throw new System.ArgumentException("Max shingle size must be >= 2");
        }
        this.shingles = new StringBuilder[maxShingleSize];
        for (int i = 0; i < shingles.Length; i++)
        {
            shingles[i] = new StringBuilder();
        }
        this.maxShingleSize = maxShingleSize;
    }

    /// <summary>
    /// Clear the StringBuilder shingles that are used for storing the output shingles.
    /// </summary>
    private void ClearShingles()
    {
        if (shingles != null)
        {
            for (int i = 0; i < shingles.Length; i++)
            {
                shingles[i].Length = 0;
            }
        }
    }

    /// <summary>
    /// Produce the next token: either a unigram restored from the buffer head or a
    /// shingle assembled in <c>shingles[shingleBufferPosition]</c>.
    /// </summary>
    /// <returns>true if a token was produced; false at end of input</returns>
    public override bool IncrementToken()
    {
        while (true)
        {
            if (nextToken == null)
            {
                // Buffer exhausted; pull more input. False means end of stream.
                if (!FillShingleBuffer())
                {
                    return false;
                }
            }

            // Peek the oldest buffered state (equivalent of Java's LinkedList.getFirst()).
            IEnumerator it = shingleBuf.GetEnumerator();
            it.MoveNext();
            nextToken = (AttributeSource.State)it.Current;

            if (outputUnigrams)
            {
                if (shingleBufferPosition == 0)
                {
                    // Slot 0 is the unigram: emit the original token unchanged.
                    base.RestoreState(nextToken);
                    posIncrAtt.SetPositionIncrement(1);
                    shingleBufferPosition++;
                    return true;
                }
            }
            else if (shingleBufferPosition % this.maxShingleSize == 0)
            {
                // Unigrams suppressed: skip over the unigram slot.
                shingleBufferPosition++;
            }

            if (shingleBufferPosition < shingleBuf.Count)
            {
                base.RestoreState(nextToken);
                typeAtt.SetType(tokenType);
                offsetAtt.SetOffset(offsetAtt.StartOffset(), endOffsets[shingleBufferPosition]);
                StringBuilder buf = shingles[shingleBufferPosition];
                int termLength = buf.Length;
                char[] termBuffer = termAtt.TermBuffer();
                if (termBuffer.Length < termLength)
                {
                    termBuffer = termAtt.ResizeTermBuffer(termLength);
                }
                // BUGFIX: the original reassigned the *local* termBuffer variable
                // (termBuffer = buf.ToString().ToCharArray(0, termLength)), so the shingle
                // text was never written into the attribute's buffer. Copy in place
                // instead, mirroring Java's buf.getChars(0, termLength, termBuffer, 0).
                buf.ToString().CopyTo(0, termBuffer, 0, termLength);
                termAtt.SetTermLength(termLength);
                if ((!outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1)
                {
                    // First shingle of a group advances the position when unigrams are off.
                    posIncrAtt.SetPositionIncrement(1);
                }
                else
                {
                    // Shingles stack at the same position as their leading token.
                    posIncrAtt.SetPositionIncrement(0);
                }
                shingleBufferPosition++;
                if (shingleBufferPosition == shingleBuf.Count)
                {
                    // All shingles for this buffer emitted; force a refill next call.
                    nextToken = null;
                    shingleBufferPosition = 0;
                }
                return true;
            }
            else
            {
                nextToken = null;
                shingleBufferPosition = 0;
            }
        }
    }

    /// <summary>
    /// Get the next token from the input stream and make it current.
    /// If we encounter a token with position increment &gt; 1, we emit
    /// filler tokens ("_") for the intervening positions first.
    /// </summary>
    /// <returns>true if a token is available; false at end of the input stream</returns>
    /// <exception cref="System.SystemException">if the input stream has a problem</exception>
    private bool GetNextToken()
    {
        try
        {
            while (true)
            {
                if (numFillerTokensToInsert > 0)
                {
                    if (currentToken == null)
                    {
                        // Park the real token until all fillers have been emitted.
                        currentToken = CaptureState();
                    }
                    else
                    {
                        RestoreState(currentToken);
                    }
                    numFillerTokensToInsert--;
                    // A filler token occupies no space.
                    offsetAtt.SetOffset(offsetAtt.StartOffset(), offsetAtt.StartOffset());
                    termAtt.SetTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.Length);
                    return true;
                }
                if (hasCurrentToken)
                {
                    if (currentToken != null)
                    {
                        // Hand back the real token that was parked behind the fillers.
                        RestoreState(currentToken);
                        currentToken = null;
                    }
                    hasCurrentToken = false;
                    return true;
                }
                if (!input.IncrementToken())
                {
                    return false;
                }
                hasCurrentToken = true;
                if (posIncrAtt.GetPositionIncrement() > 1)
                {
                    // Gap in positions: owe (increment - 1) filler tokens before this one.
                    numFillerTokensToInsert = posIncrAtt.GetPositionIncrement() - 1;
                }
            }
        }
        catch (System.Exception e)
        {
            // Preserve the original cause as the inner exception (the port previously
            // discarded it, keeping only the message text).
            throw new System.SystemException(e.ToString(), e);
        }
    }

    /// <summary>
    /// Fill the output buffer with new shingles: top up <c>shingleBuf</c> to
    /// <c>maxShingleSize</c> states, then rebuild the per-slot shingle strings
    /// and end offsets.
    /// </summary>
    /// <returns>true if the buffer holds at least one token; false at end of input</returns>
    /// <exception cref="System.SystemException">if there's a problem getting the next token</exception>
    private bool FillShingleBuffer()
    {
        try
        {
            bool addedToken = false;
            /* Try to fill the shingle buffer. */
            do
            {
                if (GetNextToken())
                {
                    shingleBuf.Add(CaptureState());
                    if (shingleBuf.Count > maxShingleSize)
                    {
                        // Slide the window: drop the least recent token.
                        shingleBuf.RemoveAt(0);
                    }
                    addedToken = true;
                }
                else
                {
                    break;
                }
            } while (shingleBuf.Count < maxShingleSize);

            if (shingleBuf.Count < 1)
            {
                return false;
            }

            /*
             * If no new token could be added to the shingle buffer, we have reached
             * the end of the input stream and have to discard the least recent token.
             */
            if (!addedToken)
            {
                shingleBuf.RemoveAt(0);
            }
            if (shingleBuf.Count < 1)
            {
                return false;
            }

            ClearShingles();

            int i;
            endOffsets = new int[shingleBuf.Count];
            for (i = 0; i < endOffsets.Length; i++)
            {
                endOffsets[i] = 0;
            }

            // Build every shingle: token i is appended to all shingles starting at
            // slots 0..i, so shingles[j] ends up holding the n-gram that begins at slot j.
            i = 0;
            for (IEnumerator it = shingleBuf.GetEnumerator(); it.MoveNext(); )
            {
                RestoreState((AttributeSource.State)it.Current);
                for (int j = i; j < shingles.Length; j++)
                {
                    if (shingles[j].Length != 0)
                    {
                        shingles[j].Append(TOKEN_SEPARATOR);
                    }
                    shingles[j].Append(termAtt.TermBuffer(), 0, termAtt.TermLength());
                }
                endOffsets[i] = offsetAtt.EndOffset();
                i++;
            }
            return true;
        }
        catch (System.Exception e)
        {
            // Preserve the original cause as the inner exception.
            throw new System.SystemException(e.ToString(), e);
        }
    }

    /// <summary>[Obsolete] Will be removed in Lucene 3.0. This method is final, as it should
    /// not be overridden. Delegates to the backwards compatibility layer.</summary>
    public Token Next(Token reusableToken)
    {
        try
        {
            return base.Next(reusableToken);
        }
        catch (System.Exception e)
        {
            throw new System.SystemException(e.ToString(), e);
        }
    }

    /// <summary>[Obsolete] Will be removed in Lucene 3.0. This method is final, as it should
    /// not be overridden. Delegates to the backwards compatibility layer.</summary>
    public Token Next()
    {
        try
        {
            return base.Next();
        }
        catch (System.Exception e)
        {
            throw new System.SystemException(e.ToString(), e);
        }
    }

    /// <summary>Resets the filter and all buffered shingle state so the
    /// underlying stream can be consumed again.</summary>
    public override void Reset()
    {
        try
        {
            base.Reset();
            nextToken = null;
            shingleBufferPosition = 0;
            shingleBuf.Clear();
            numFillerTokensToInsert = 0;
            currentToken = null;
            hasCurrentToken = false;
        }
        catch (System.Exception e)
        {
            throw new System.SystemException(e.ToString(), e);
        }
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment