Skip to content

Instantly share code, notes, and snippets.

@Xodarap
Created September 30, 2011 14:38
Show Gist options
  • Save Xodarap/1253934 to your computer and use it in GitHub Desktop.
Save Xodarap/1253934 to your computer and use it in GitHub Desktop.
Multiple words which should be tokenized as one word
/// <summary>
/// Normally, we tokenize based on white space. But there are times we want to keep words across whitespace, e.g.
/// "in basket" should not become "in" "basket". This filter joins such two-word phrases into a single term.
/// </summary>
/// <remarks>
/// Phrases are stored as first-word -> second-word pairs, so each first word can start exactly one phrase.
/// NOTE(review): only the term text is rewritten here; any other attributes on the stream are left as the
/// wrapped stream last set them - confirm that is acceptable for consumers that read them.
/// </remarks>
class SpecialWordsTokenFilter : TokenFilter
{
    readonly TermAttribute termAttribute;

    // Maps the first word of a phrase to the second word that completes it.
    readonly Dictionary<string, string> TwoWords;

    // Terms already pulled from the wrapped stream but not yet handed to the caller.
    private readonly Queue<string> bufferBuffer = new Queue<string>();

    /// <summary>
    /// Constructor
    /// </summary>
    /// <param name="in_stream">Input stream</param>
    /// <param name="TwoWordPhrases">Key-value pairs of words which should be tokenized as one phrase
    /// (key = first word, value = second word). A null value is treated as "no phrases".</param>
    public SpecialWordsTokenFilter(TokenStream in_stream, Dictionary<string,string> TwoWordPhrases)
        : base(in_stream)
    {
        // Fall back to an empty table so a null argument cannot cause a
        // NullReferenceException later, on the first token.
        TwoWords = TwoWordPhrases ?? new Dictionary<string, string>();
        termAttribute = (TermAttribute) AddAttribute(typeof(TermAttribute));
    }

    /// <summary>
    /// Resets the filter and discards any terms still waiting in the private buffer,
    /// so that leftovers from a partially consumed stream are not emitted after a reset.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        bufferBuffer.Clear();
    }

    /// <summary>
    /// Advances to the next term. Buffered terms are emitted first; otherwise terms are pulled
    /// from the wrapped stream and a matching two-word phrase is merged into one term
    /// ("first second").
    /// </summary>
    /// <returns>true if a term was produced; false once the buffer and the wrapped stream are both exhausted.</returns>
    public override bool IncrementToken()
    {
        //First, check to see if we have anything in our private buffer
        if (bufferBuffer.Count > 0)
        {
            termAttribute.SetTermBuffer(bufferBuffer.Dequeue());
            return true;
        }

        //If not, then see if we can grab anything more from our parent
        if (!input.IncrementToken())
        {
            return false;
        }

        //While the current word could start a phrase we have to look ahead by one
        //word. Everything consumed from the parent during that look-ahead goes into
        //the (currently empty) buffer in the order it was read; a completed phrase
        //is entered as a single term.
        string currentTerm = termAttribute.Term();
        string expectedNext;
        while (TwoWords.TryGetValue(currentTerm, out expectedNext))
        {
            //we ran out of words - so this isn't part of a special phrase
            if (!input.IncrementToken())
            {
                bufferBuffer.Enqueue(currentTerm);
                break;
            }

            string nextTerm = termAttribute.Term();
            if (expectedNext == nextTerm)
            {
                //complete phrase: emit both words as one term
                bufferBuffer.Enqueue(currentTerm + " " + nextTerm);
                break;
            }

            //not a phrase after all, so the first word stands on its own
            bufferBuffer.Enqueue(currentTerm);

            //if the next word isn't special either, the look-ahead ends here and
            //the word we consumed has to be buffered as well
            if (!TwoWords.ContainsKey(nextTerm))
            {
                bufferBuffer.Enqueue(nextTerm);
                break;
            }

            //the next word may itself start a phrase - keep looking ahead
            currentTerm = nextTerm;
        }

        //if we've got stuff in our buffer, use that instead of the
        //term currently held by the shared attribute
        if (bufferBuffer.Count > 0)
        {
            termAttribute.SetTermBuffer(bufferBuffer.Dequeue());
        }
        return true;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment