Xodarap · January 14, 2011 16:21
diff --git a/SpecialWordsTokenFilter.cs b/SpecialWordsTokenFilter.cs
 /// <summary>
    /// Normally, we tokenize based on white space. But there are times we want to keep words across whitespace, e.g.
    /// "in basket" should not become "in" "basket". This class allows you to do that.
    /// </summary>
    class SpecialWordsTokenFilter : TokenFilter
    {
        // TODO: Set position and offset attributes as well as term. Only the term
        // text is rewritten by this filter; every other attribute is left exactly
        // as the wrapped stream last set it.
        readonly TermAttribute termAttribute;

        // Maps the first word of a phrase to the word that must follow it.
        readonly Dictionary<string, string> TwoWords;

        // Terms already pulled from the input but not yet handed to the caller.
        private readonly Queue<string> bufferBuffer = new Queue<string>();

        /// <summary>
        /// Constructor
        /// </summary>
        /// <param name="in_stream">Input stream</param>
        /// <param name="TwoWordPhrases">Key-value pairs of words which should be tokenized as one phrase.
        /// The key is the first word and the value is the second word. A null
        /// dictionary is treated as "no phrases", so terms pass through unchanged.</param>
        public SpecialWordsTokenFilter(TokenStream in_stream, Dictionary<string,string> TwoWordPhrases)
            : base(in_stream)
        {
            // Fall back to an empty map so a null argument does not surface later
            // as a NullReferenceException inside IncrementToken.
            TwoWords = TwoWordPhrases ?? new Dictionary<string, string>();
            termAttribute = (TermAttribute) AddAttribute(typeof(TermAttribute));
        }

        /// <summary>
        /// Discards any terms still waiting in the private buffer and then
        /// delegates to the base class, so that a reused filter does not emit
        /// leftovers from a previous, partially consumed pass.
        /// </summary>
        public override void Reset()
        {
            bufferBuffer.Clear();
            base.Reset();
        }

        /// <summary>
        /// Advances to the next term. Two consecutive input terms that form a
        /// configured phrase are emitted as a single term joined by one space;
        /// all other terms are emitted unchanged and in their original order.
        /// </summary>
        /// <returns>true if a term was produced; false if the buffer is empty
        /// and the input has no more tokens.</returns>
        public override bool IncrementToken()
        {
            //First, hand out anything a previous call left in our private buffer
            if (bufferBuffer.Count > 0)
            {
                termAttribute.SetTermBuffer(bufferBuffer.Dequeue());
                return true;
            }

            //If not, then see if we can grab anything more from our parent
            if (!input.IncrementToken())
            {
                return false;
            }

            string firstTerm = termAttribute.Term();
            string expectedNext;

            //An ordinary word: the term attribute already holds it, pass it through.
            if (!TwoWords.TryGetValue(firstTerm, out expectedNext))
            {
                return true;
            }

            //firstTerm may start a phrase, so we have to look ahead. Everything
            //we consume from the parent while doing so is queued in emission order.
            while (true)
            {
                //we ran out of words - so this isn't part of a special phrase
                if (!input.IncrementToken())
                {
                    bufferBuffer.Enqueue(firstTerm);
                    break;
                }

                string nextTerm = termAttribute.Term();
                if (expectedNext == nextTerm)
                {
                    //complete phrase: emit both words as one term
                    bufferBuffer.Enqueue(firstTerm + " " + nextTerm);
                    break;
                }

                //no phrase here, so the first word stands on its own
                bufferBuffer.Enqueue(firstTerm);

                if (!TwoWords.TryGetValue(nextTerm, out expectedNext))
                {
                    //the word we peeked at can't start a phrase either;
                    //queue it and stop looking ahead
                    bufferBuffer.Enqueue(nextTerm);
                    break;
                }

                //the word we peeked at may itself start a phrase - keep going
                firstTerm = nextTerm;
            }

            //every path out of the loop queued at least one term
            termAttribute.SetTermBuffer(bufferBuffer.Dequeue());
            return true;
        }
    }
	/// <summary>
	/// Normally, we tokenize based on white space. But there are times we want to keep words across whitespace, e.g.
	/// "in basket" should not become "in" "basket". This class allows you to do that.
	/// </summary>
	class SpecialWordsTokenFilter : TokenFilter
	{
		// TODO: Set position and offset attributes as well as term. Only the term
		// text is rewritten by this filter; every other attribute is left exactly
		// as the wrapped stream last set it.
		readonly TermAttribute termAttribute;

		// Maps the first word of a phrase to the word that must follow it.
		readonly Dictionary<string, string> TwoWords;

		// Terms already pulled from the input but not yet handed to the caller.
		private readonly Queue<string> bufferBuffer = new Queue<string>();

		/// <summary>
		/// Constructor
		/// </summary>
		/// <param name="in_stream">Input stream</param>
		/// <param name="TwoWordPhrases">Key-value pairs of words which should be tokenized as one phrase.
		/// The key is the first word and the value is the second word. A null
		/// dictionary is treated as "no phrases", so terms pass through unchanged.</param>
		public SpecialWordsTokenFilter(TokenStream in_stream, Dictionary<string,string> TwoWordPhrases)
			: base(in_stream)
		{
			// Fall back to an empty map so a null argument does not surface later
			// as a NullReferenceException inside IncrementToken.
			TwoWords = TwoWordPhrases ?? new Dictionary<string, string>();
			termAttribute = (TermAttribute) AddAttribute(typeof(TermAttribute));
		}

		/// <summary>
		/// Discards any terms still waiting in the private buffer and then
		/// delegates to the base class, so that a reused filter does not emit
		/// leftovers from a previous, partially consumed pass.
		/// </summary>
		public override void Reset()
		{
			bufferBuffer.Clear();
			base.Reset();
		}

		/// <summary>
		/// Advances to the next term. Two consecutive input terms that form a
		/// configured phrase are emitted as a single term joined by one space;
		/// all other terms are emitted unchanged and in their original order.
		/// </summary>
		/// <returns>true if a term was produced; false if the buffer is empty
		/// and the input has no more tokens.</returns>
		public override bool IncrementToken()
		{
			//First, hand out anything a previous call left in our private buffer
			if (bufferBuffer.Count > 0)
			{
				termAttribute.SetTermBuffer(bufferBuffer.Dequeue());
				return true;
			}

			//If not, then see if we can grab anything more from our parent
			if (!input.IncrementToken())
			{
				return false;
			}

			string firstTerm = termAttribute.Term();
			string expectedNext;

			//An ordinary word: the term attribute already holds it, pass it through.
			if (!TwoWords.TryGetValue(firstTerm, out expectedNext))
			{
				return true;
			}

			//firstTerm may start a phrase, so we have to look ahead. Everything
			//we consume from the parent while doing so is queued in emission order.
			while (true)
			{
				//we ran out of words - so this isn't part of a special phrase
				if (!input.IncrementToken())
				{
					bufferBuffer.Enqueue(firstTerm);
					break;
				}

				string nextTerm = termAttribute.Term();
				if (expectedNext == nextTerm)
				{
					//complete phrase: emit both words as one term
					bufferBuffer.Enqueue(firstTerm + " " + nextTerm);
					break;
				}

				//no phrase here, so the first word stands on its own
				bufferBuffer.Enqueue(firstTerm);

				if (!TwoWords.TryGetValue(nextTerm, out expectedNext))
				{
					//the word we peeked at can't start a phrase either;
					//queue it and stop looking ahead
					bufferBuffer.Enqueue(nextTerm);
					break;
				}

				//the word we peeked at may itself start a phrase - keep going
				firstTerm = nextTerm;
			}

			//every path out of the loop queued at least one term
			termAttribute.SetTermBuffer(bufferBuffer.Dequeue());
			return true;
		}
	}