Created
January 14, 2011 16:21
-
-
Save Xodarap/779815 to your computer and use it in GitHub Desktop.
Tokenizes some two-word phrases as a single word
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Token filter that merges configured two-word phrases into a single term.
/// Normally, we tokenize based on white space. But there are times we want to keep
/// words across whitespace, e.g. "in basket" should not become "in" "basket".
/// This class allows you to do that.
/// </summary>
/// <remarks>
/// Each dictionary entry maps the first word of a phrase to its second word, so a
/// given first word can start at most one phrase. The second word is compared with
/// ordinal, case-sensitive string equality; key lookup follows the comparer of the
/// dictionary that was passed in.
/// </remarks>
class SpecialWordsTokenFilter : TokenFilter
{
    // TODO: Set position and offset attributes as well as term
    // NOTE(review): pendingTerms is never cleared; if this stream is reset and reused
    // while terms are still queued, they would be emitted into the next pass - confirm
    // whether stream reuse is expected and clear the queue on reset if so.
    readonly TermAttribute termAttribute;

    // First word of a phrase -> the second word that completes it.
    readonly Dictionary<string, string> phraseCompletions;

    // Terms already pulled from the input during look-ahead but not yet emitted.
    private readonly Queue<string> pendingTerms = new Queue<string>();

    /// <summary>
    /// Constructor
    /// </summary>
    /// <param name="in_stream">Input stream</param>
    /// <param name="TwoWordPhrases">Key-value pairs of words which should be tokenized as one phrase</param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="TwoWordPhrases"/> is null.
    /// </exception>
    public SpecialWordsTokenFilter(TokenStream in_stream, Dictionary<string,string> TwoWordPhrases)
        : base(in_stream)
    {
        // Fail fast here rather than with a NullReferenceException on the first token.
        if (TwoWordPhrases == null)
        {
            throw new System.ArgumentNullException("TwoWordPhrases");
        }

        phraseCompletions = TwoWordPhrases;
        termAttribute = (TermAttribute) AddAttribute(typeof(TermAttribute));
    }

    /// <summary>
    /// Advances to the next term, joining a phrase's first and second word with a
    /// single space when they appear consecutively in the input.
    /// </summary>
    /// <returns>
    /// true if a term was placed in the term attribute; false once both the pending
    /// queue and the input stream are exhausted.
    /// </returns>
    public override bool IncrementToken()
    {
        // First, emit anything left over from an earlier look-ahead.
        if (pendingTerms.Count > 0)
        {
            termAttribute.SetTermBuffer(pendingTerms.Dequeue());
            return true;
        }

        // If not, then see if we can grab anything more from our parent.
        if (!input.IncrementToken())
        {
            return false;
        }

        string current = termAttribute.Term();
        string expectedNext;
        if (!phraseCompletions.TryGetValue(current, out expectedNext))
        {
            // Ordinary word: the term attribute already holds it, nothing to rewrite.
            return true;
        }

        // 'current' starts a phrase. Look ahead one word at a time, queueing every
        // term consumed from the input in its original order, until we either
        // complete a phrase, hit a word that cannot start one, or run out of input.
        while (true)
        {
            if (!input.IncrementToken())
            {
                // We ran out of words - so this isn't part of a special phrase.
                pendingTerms.Enqueue(current);
                break;
            }

            string next = termAttribute.Term();
            if (expectedNext == next)
            {
                // Complete phrase: emit it as one term instead of its two words.
                pendingTerms.Enqueue(current + " " + next);
                break;
            }

            // No match: 'current' stands alone.
            pendingTerms.Enqueue(current);

            if (!phraseCompletions.TryGetValue(next, out expectedNext))
            {
                // 'next' cannot start a phrase either, so it is emitted as-is.
                pendingTerms.Enqueue(next);
                break;
            }

            // 'next' may itself start a phrase; keep looking ahead from it.
            current = next;
        }

        // The look-ahead overwrote the term attribute, so restore the oldest
        // pending term; the rest are emitted by subsequent calls.
        termAttribute.SetTermBuffer(pendingTerms.Dequeue());
        return true;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment