Last active
June 7, 2018 10:40
-
-
Save ncruces/fed041b5cac611f413267cf674ce2e4f to your computer and use it in GitHub Desktop.
Plain Java/Android StringSearch (ported from ICU; unfortunately the original had a few bugs at this point)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
******************************************************************************* | |
* Copyright (C) 1996-2000, International Business Machines Corporation and * | |
* others. All Rights Reserved. * | |
******************************************************************************* | |
* | |
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/SearchIterator.java,v $ | |
* $Date: 2002/04/03 19:13:56 $ | |
* $Revision: 1.6 $ | |
* | |
***************************************************************************************** | |
*/ | |
package com.ibm.icu.text; | |
import java.text.BreakIterator; | |
import java.text.CharacterIterator; | |
/** | |
* <code>SearchIterator</code> is an abstract base class that provides methods | |
* to search for a pattern within a text string. Instances of | |
* <code>SearchIterator</code> maintain a current position and scan over | |
* the target text, returning the indices the pattern is matched | |
* and the length of each match. | |
* <p> | |
* <code>SearchIterator</code> is an abstract base class that defines a | |
* protocol for text searching. Subclasses provide concrete implementations of | |
* various search algorithms. For example, {@link StringSearch} | |
* implements language-sensitive pattern matching based on the comparison rules | |
* defined in a {@link java.text.RuleBasedCollator RuleBasedCollator} object. | |
* <p> | |
* Internally, <code>SearchIterator</code> scans text using a | |
* {@link CharacterIterator}, and is thus able to scan text held | |
* by any object implementing that protocol. A <code>StringCharacterIterator</code> | |
* is used to scan <code>String</code> objects passed to <code>setText</code>. | |
* <p> | |
* <code>SearchIterator</code> provides an API that is similar to that of | |
* other text iteration classes such as <code>BreakIterator</code>. Using this | |
* class, it is easy to scan through text looking for all occurances of a | |
* given pattern. The following example uses a <code>StringSearch</code> object to | |
* find all instances of "fox" in the target string. Any other subclass of | |
* <code>SearchIterator</code> can be used in an identical manner. | |
* <pre><code> | |
* String target = "The quick brown fox jumped over the lazy fox"; | |
* String pattern = "fox"; | |
* | |
* SearchIterator iter = new StringSearch(pattern, target); | |
* | |
* for (int pos = iter.first(); pos != SearchIterator.DONE; pos = iter.next()) { | |
* System.out.println("Found match at " + pos + | |
* ", length is " + iter.getMatchLength()); | |
* } | |
* </code></pre> | |
* | |
* @see StringSearch | |
*/ | |
public abstract class SearchIterator { | |
/** | |
* DONE is returned by previous() and next() after all valid | |
* matches have been returned, and by first() and last() if | |
* there are no matches at all. | |
*/ | |
public static final int DONE = -1; | |
/** | |
* Private value indicating that the iterator is pointing | |
* before the beginning of the target text. | |
*/ | |
private static final int BEFORE = -2; | |
/** | |
* Return the first index at which the target text matches the search | |
* pattern. The iterator is adjusted so that its current index | |
* (as returned by {@link #getIndex}) is the match posisition if one was found | |
* and <code>DONE</code> if one was not. | |
* | |
* @return The character index of the first match, or <code>DONE</code> if there | |
* are no matches. | |
*/ | |
final public int first() { | |
setIndex(BEFORE); | |
return next(); | |
} | |
/** | |
* Return the first index greater than <tt>pos</tt> at which the target | |
* text matches the search pattern. The iterator is adjusted so that its current index | |
* (as returned by {@link #getIndex}) is the match posisition if one was found | |
* and <code>DONE</code> if one was not. | |
* | |
* @return The character index of the first match following <code>pos</code>, | |
* or <tt>DONE</tt> if there are no matches. | |
*/ | |
final public int following(int pos) { | |
setIndex(pos); | |
return next(); | |
} | |
/** | |
* Return the last index in the target text at which it matches | |
* the search pattern and adjusts the iteration to point to that position. | |
* | |
* @return The index of the first match, or <tt>DONE</tt> if there | |
* are no matches. | |
*/ | |
final public int last() { | |
setIndex(DONE); | |
return previous(); | |
} | |
/** | |
* Return the first index less than <code>pos</code> at which the target | |
* text matches the search pattern. The iterator is adjusted so that its current index | |
* (as returned by {@link #getIndex}) is the match posisition if one was found | |
* and <tt>DONE</tt> if one was not. | |
* | |
* @return The character index of the first match preceding <code>pos</code>, | |
* or <code>DONE</code> if there are no matches. | |
*/ | |
final public int preceding(int pos) { | |
setIndex(pos); | |
return previous(); | |
} | |
/** | |
* Return the index of the next point at which the text matches the | |
* search pattern, starting from the current position. | |
* @return The index of the next match after the current position, | |
* or <code>DONE</code> if there are no more matches. | |
* | |
* @see #first | |
*/ | |
public int next() { | |
if (index == BEFORE){ | |
// Starting at the beginning of the text | |
index = target.getBeginIndex(); | |
} else if (length > 0) { | |
// Finding the next match after a previous one | |
index += overlap ? 1 : length; | |
} | |
index -= 1; | |
do { | |
length = 0; | |
index = handleNext(index + 1); | |
} while (index != DONE && !isBreakUnit(index, index+length)); | |
return index; | |
} | |
/** | |
* Return the index of the previous point at which the text matches | |
* the search pattern, starting at the current position | |
* | |
* @return The index of the previous match before the current position, | |
* or <code>DONE</code> if there are no more matches. | |
*/ | |
public int previous() { | |
if (index == DONE) { | |
index = target.getEndIndex(); | |
} else if (length > 0) { | |
// Finding the previous match before a following one | |
index = overlap ? index + length - 1 : index; | |
} | |
index += 1; | |
do { | |
length = 0; | |
index = handlePrev(index - 1); | |
} while (index != DONE && !isBreakUnit(index, index+length)); | |
if (index == DONE) { | |
index = BEFORE; | |
} | |
return getIndex(); | |
} | |
/** | |
* Return the current index in the text being searched. | |
* If the iteration has gone past the end of the text | |
* (or past the beginning for a backwards search), | |
* {@link #DONE} is returned. | |
*/ | |
public int getIndex() { | |
return index == BEFORE ? DONE : index; | |
} | |
/** | |
* Determines whether overlapping matches are returned. If this | |
* property is <code>true</code>, matches that begin within the | |
* boundry of the previous match are considered valid and will | |
* be returned. For example, when searching for "abab" in the | |
* target text "ababab", both offsets 0 and 2 will be returned | |
* as valid matches if this property is <code>true</code>. | |
* <p> | |
* The default setting of this property is <tt>true</tt> | |
*/ | |
public void setOverlapping(boolean allowOverlap) { | |
overlap = allowOverlap; | |
} | |
/** | |
* Determines whether overlapping matches are returned. | |
* | |
* @see #setOverlapping | |
*/ | |
public boolean isOverlapping() { | |
return overlap; | |
} | |
/** | |
* Returns the length of text in the target which matches the search | |
* pattern. This call returns a valid result only after a successful | |
* call to {@link #first}, {@link #next}, {@link #previous}, or {@link #last}. | |
* Just after construction, or after a searching method returns | |
* <tt>DONE</tt>, this method will return 0. | |
* | |
* @return The length of the match in the target text, or 0 if there | |
* is no match currently. | |
*/ | |
public int getMatchLength() { | |
return length; | |
} | |
/** | |
* Set the BreakIterator that will be used to restrict the points | |
* at which matches are detected. | |
* | |
* @param iterator A {@link java.text.BreakIterator BreakIterator} | |
* that will be used to restrict the points | |
* at which matches are detected. If a match is found, but the match's start | |
* or end index is not a boundary as determined by | |
* the <tt>BreakIterator</tt>, the match will be rejected and | |
* another will be searched for. | |
* | |
* If this parameter is <tt>null</tt>, no break | |
* detection is attempted. | |
* | |
* @see #getBreakIterator | |
*/ | |
public void setBreakIterator(BreakIterator iterator) { | |
breaker = iterator; | |
if (breaker != null) { | |
breaker.setText(target); | |
} | |
} | |
/** | |
* Returns the BreakIterator that is used to restrict the points | |
* at which matches are detected. This will be the same object | |
* that was passed to the constructor or to <code>setBreakIterator</code>. | |
* Note that <tt>null</tt> is a legal value; it means that break | |
* detection should not be attempted. | |
* | |
* @see #setBreakIterator | |
*/ | |
public BreakIterator getBreakIterator() { | |
return breaker; | |
} | |
/** | |
* Set the target text which should be searched and resets the | |
* iterator's position to point before the start of the target text. | |
* This method is useful if you want to re-use an iterator to | |
* search for the same pattern within a different body of text. | |
* | |
* @see #getTarget | |
*/ | |
public void setTarget(CharacterIterator iterator) { | |
target = iterator; | |
if (breaker != null) { | |
breaker.setText(target); | |
} | |
setIndex(BEFORE); | |
} | |
/** | |
* Return the target text which is being searched | |
* | |
* @see #setTarget | |
*/ | |
public CharacterIterator getTarget() { | |
return target; | |
} | |
/** | |
* Returns the text that was matched by the most recent call to | |
* {@link #first}, {@link #next}, {@link #previous}, or {@link #last}. | |
* If the iterator is not pointing at a valid match (e.g. just after | |
* construction or after <tt>DONE</tt> has been returned, returns | |
* an empty string. | |
*/ | |
public String getMatchedText() { | |
StringBuffer buffer = new StringBuffer(); | |
if (length > 0) { | |
int i = 0; | |
for (char c = target.setIndex(index); i < length; c = target.next(), i++) | |
{ | |
buffer.append(c); | |
} | |
} | |
return buffer.toString(); | |
} | |
//------------------------------------------------------------------- | |
// Protected interface for subclasses | |
//------------------------------------------------------------------- | |
/** | |
* Constructor for use by subclasses. | |
* <p> | |
* @param target The target text to be searched. This is for internal | |
* use by this class. Subclasses need to maintain their | |
* own reference to or iterator over the target text | |
* for use by their {@link #handleNext handleNext} and | |
* {@link #handlePrev handlePrev} methods. | |
* | |
* @param breaker A {@link BreakIterator} that is used to restrict the points | |
* at which matches are detected. If <tt>handleNext</tt> or | |
* <tt>handlePrev</tt> finds a match, but the match's start | |
* or end index is not a boundary as determined by | |
* the <tt>BreakIterator</tt>, the match is rejected and | |
* <tt>handleNext</tt> or <tt>handlePrev</tt> is called again. | |
* If this parameter is <tt>null</tt>, no break | |
* detection is attempted. | |
* | |
*/ | |
protected SearchIterator(CharacterIterator target, BreakIterator breaker) | |
{ | |
this.target = target; | |
if (breaker != null) { | |
this.breaker = (BreakIterator)breaker.clone(); | |
this.breaker.setText(target); | |
} | |
index = target.getBeginIndex(); | |
length = 0; | |
} | |
/** | |
* Abstract method which subclasses override to provide the mechanism | |
* for finding the next match in the target text. This allows different | |
* subclasses to provide different search algorithms. | |
* <p> | |
* If a match is found, the implementation should return the index at | |
* which the match starts and should call {@link #setMatchLength setMatchLength} | |
* with the number of characters in the target | |
* text that make up the match. If no match is found, the method | |
* should return DONE and should not call <tt>setMatchLength</tt>. | |
* <p> | |
* @param startAt The index in the target text at which the search | |
* should start. | |
* | |
* @see #setMatchLength | |
*/ | |
protected abstract int handleNext(int startAt); | |
/** | |
* Abstract method which subclasses override to provide the mechanism | |
* for finding the previous match in the target text. This allows different | |
* subclasses to provide different search algorithms. | |
* <p> | |
* If a match is found, the implementation should return the index at | |
* which the match starts and should call {@link #setMatchLength setMatchLength} | |
* with the number of characters in the target | |
* text that make up the match. If no match is found, the method | |
* should return DONE and should not call <tt>setMatchLength</tt>. | |
* <p> | |
* @param startAt The index in the target text at which the search | |
* should start. | |
* | |
* @see #setMatchLength | |
*/ | |
protected abstract int handlePrev(int startAt); | |
/** | |
* Sets the length of the currently matched string in the target text. | |
* Subclasses' <code>handleNext</code> and <code>handlePrev</code> | |
* methods should call this when they find a match in the target text. | |
*/ | |
protected void setMatchLength(int length) { | |
this.length = length; | |
} | |
//------------------------------------------------------------------- | |
// Privates | |
// | |
/** | |
* Internal method used by preceding and following. Sets the index | |
* to point to the given position, and clears any state that's | |
* affected. | |
*/ | |
private void setIndex(int pos) { | |
index = pos; | |
length = 0; | |
} | |
/** | |
* Determine whether the target text bounded by <code>start</code> and | |
* <code>end</code> is one or more whole units of text as determined by | |
* the current <code>BreakIterator</code>. | |
*/ | |
private boolean isBreakUnit(int start, int end) | |
{ | |
if (breaker == null) { | |
return true; | |
} | |
boolean startBound = breaker.isBoundary(start); | |
boolean endBound = (end == target.getEndIndex()) || breaker.isBoundary(end); | |
return startBound && endBound; | |
} | |
//------------------------------------------------------------------------- | |
// Private data... | |
//------------------------------------------------------------------------- | |
private int index; // Current position in the target text | |
private int length; // Length of matched text, or 0 | |
private boolean overlap = true; // Return overlapping matches? | |
private CharacterIterator target; // Target text to be searched | |
private BreakIterator breaker; // Break iterator to constrain matches | |
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
******************************************************************************* | |
* Copyright (C) 1996-2000, International Business Machines Corporation and * | |
* others. All Rights Reserved. * | |
******************************************************************************* | |
* | |
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/StringSearch.java,v $ | |
* $Date: 2002/03/20 05:11:16 $ | |
* $Revision: 1.6 $ | |
* | |
***************************************************************************************** | |
*/ | |
package com.ibm.icu.text; | |
import java.text.BreakIterator; | |
import java.text.CharacterIterator; | |
import java.text.CollationElementIterator; | |
import java.text.Collator; | |
import java.text.RuleBasedCollator; | |
import java.text.StringCharacterIterator; | |
import java.util.Locale; | |
/** | |
* <code>StringSearch</code> is a <code>SearchIterator</code> that provides | |
* language-sensitive text searching based on the comparison rules defined | |
* in a {@link RuleBasedCollator} object. | |
* Instances of <code>StringSearch</code> function as iterators | |
* maintain a current position and scan over text returning the index of | |
* characters where the pattern occurs and the length of each match. | |
* <p> | |
* <code>StringSearch</code> uses a version of the fast Boyer-Moore search | |
* algorithm that has been adapted to work with the large character set of | |
* Unicode. See "Efficient Text Searching in Java", to be published in | |
* <i>Java Report</i> in February, 1999, for further information on the algorithm. | |
* <p> | |
* Consult the <code>SearchIterator</code> documentation for information on | |
* and examples of how to use instances of this class to implement text | |
* searching. <code>SearchIterator</code> provides all of the necessary | |
* API; this class only provides constructors and internal implementation | |
* methods. | |
* | |
* @see SearchIterator | |
* @see java.text.RuleBasedCollator | |
* | |
* @author Laura Werner | |
* @version 1.0 | |
*/ | |
public final class StringSearch extends SearchIterator | |
{ | |
/** | |
* Construct a <code>StringSearch</code> object using a specific collator and set | |
* of boundary-detection rules. | |
* <p> | |
* @param pat The text for which this object will search. | |
* | |
* @param target The text in which to search for the pattern. | |
* | |
* @param coll A <code>RuleBasedCollator</code> object which defines the | |
* language-sensitive comparison rules used to determine | |
* whether text in the pattern and target matches. | |
* | |
* @param breaker A <code>BreakIterator</code> object used to constrain the matches | |
* that are found. Matches whose start and end indices | |
* in the target text are not boundaries as determined | |
* by the <code>BreakIterator</code> are ignored. If this behavior | |
* is not desired, <code>null</code> can be passed in instead. | |
*/ | |
public StringSearch(String pat, CharacterIterator target, | |
RuleBasedCollator coll, BreakIterator breaker) { | |
super(target, breaker); | |
pattern = pat; | |
collator = coll; | |
strength = coll.getStrength(); | |
target.setIndex(target.getBeginIndex()); | |
iter = collator.getCollationElementIterator(target); | |
initialize(); // Initialize the Boyer-Moore tables | |
} | |
/** | |
* Construct a <code>StringSearch</code> object using a specific collator. | |
* <p> | |
* @param pattern The text for which this object will search. | |
* | |
* @param target The text in which to search for the pattern. | |
* | |
* @param collator A <code>RuleBasedCollator</code> object which defines the | |
* language-sensitive comparison rules used to determine | |
* whether text in the pattern and target matches. | |
*/ | |
public StringSearch(String pattern, | |
CharacterIterator target, | |
RuleBasedCollator collator) { | |
this(pattern, target, collator, BreakIterator.getCharacterInstance()); | |
} | |
/** | |
* Construct a <code>StringSearch</code> object using the collator and | |
* character boundary detection rules for a given locale. | |
* @param pattern The text for which this object will search. | |
* | |
* @param target The text in which to search for the pattern. | |
* | |
* @param loc The locale whose collation and break-detection rules | |
* should be used. | |
* | |
* @exception ClassCastException thrown if the collator for the specified | |
* locale is not a RuleBasedCollator. | |
*/ | |
public StringSearch(String pattern, CharacterIterator target, Locale loc) { | |
this(pattern, target, | |
(RuleBasedCollator) Collator.getInstance(loc), | |
BreakIterator.getCharacterInstance(loc)); | |
} | |
/** | |
* Construct a <code>StringSearch</code> object using the collator for the default | |
* locale. | |
* @param pattern The text for which this object will search. | |
* | |
* @param target The text in which to search for the pattern. | |
*/ | |
public StringSearch(String pattern, String target) { | |
this(pattern, | |
new StringCharacterIterator(target), | |
(RuleBasedCollator)Collator.getInstance(), | |
BreakIterator.getCharacterInstance()); | |
} | |
//------------------------------------------------------------------- | |
// Getters and Setters | |
//------------------------------------------------------------------- | |
/** | |
* Sets this object's strength property. The strength determines the | |
* minimum level of difference considered significant during a | |
* search. Generally, {@link Collator#TERTIARY} and | |
* {@link Collator#IDENTICAL} indicate that all differences are | |
* considered significant, {@link Collator#SECONDARY} indicates | |
* that upper/lower case distinctions should be ignored, and | |
* {@link Collator#PRIMARY} indicates that both case and accents | |
* should be ignored. However, the exact meanings of these constants | |
* are determined by individual Collator objects. | |
* <p> | |
* @see java.text.Collator#PRIMARY | |
* @see java.text.Collator#SECONDARY | |
* @see java.text.Collator#TERTIARY | |
* @see java.text.Collator#IDENTICAL | |
*/ | |
public void setStrength(int newStrength) { | |
strength = newStrength; | |
// Due to a bug (?) in CollationElementIterator, we must set the | |
// collator's strength as well, since the iterator is going to | |
// mask out the portions of the collation element that are not | |
// relevant for the collator's current strength setting | |
// Note that this makes it impossible to share a Collator among | |
// multiple StringSearch objects if you adjust Strength settings. | |
collator.setStrength(strength); | |
initialize(); | |
} | |
/** | |
* Returns this object's strength property, which indicates what level | |
* of differences are considered significant during a search. | |
* <p> | |
* @see #setStrength | |
*/ | |
public int getStrength() { | |
return strength; | |
} | |
/** | |
* Set the collator to be used for this string search. Also changes | |
* the search strength to match that of the new collator. | |
* <p> | |
* This method causes internal data such as Boyer-Moore shift tables | |
* to be recalculated, but the iterator's position is unchanged. | |
* <p> | |
* @see #getCollator | |
*/ | |
public void setCollator(RuleBasedCollator coll) { | |
collator = coll; | |
strength = collator.getStrength(); | |
// Also need to recompute the pattern and get a new target iterator | |
CharacterIterator target = getTarget(); | |
target.setIndex(target.getBeginIndex()); | |
iter = collator.getCollationElementIterator(target); | |
initialize(); | |
} | |
/** | |
* Return the RuleBasedCollator being used for this string search. | |
*/ | |
public RuleBasedCollator getCollator() { | |
return collator; | |
} | |
/** | |
* Set the pattern for which to search. | |
* This method causes internal data such as Boyer-Moore shift tables | |
* to be recalculated, but the iterator's position is unchanged. | |
*/ | |
public void setPattern(String pat) { | |
pattern = pat; | |
initialize(); | |
} | |
/** | |
* Returns the pattern for which this object is searching. | |
*/ | |
public String getPattern() { | |
return pattern; | |
} | |
/** | |
* Set the target text which should be searched and resets the | |
* iterator's position to point before the start of the new text. | |
* This method is useful if you want to re-use an iterator to | |
* search for the same pattern within a different body of text. | |
*/ | |
public void setTarget(CharacterIterator target) { | |
super.setTarget(target); | |
// Since we're caching a CollationElementIterator, recreate it | |
target.setIndex(target.getBeginIndex()); | |
iter = collator.getCollationElementIterator(target); | |
} | |
//------------------------------------------------------------------- | |
// Privates | |
//------------------------------------------------------------------- | |
/** | |
* Search forward for matching text, starting at a given location. | |
* Clients should not call this method directly; instead they should call | |
* {@link SearchIterator#next}. | |
* <p> | |
* If a match is found, this method returns the index at which the match | |
* starts and calls {@link SearchIterator#setMatchLength} | |
* with the number of characters in the target | |
* text that make up the match. If no match is found, the method returns | |
* <code>DONE</code> and does not call <tt>setMatchLength</tt>. | |
* <p> | |
* @param start The index in the target text at which the search starts. | |
* | |
* @return The index at which the matched text in the target starts, or DONE | |
* if no match was found. | |
* <p> | |
* @see SearchIterator#next | |
* @see SearchIterator#DONE | |
*/ | |
protected int handleNext(int start) | |
{ | |
CharacterIterator target = getTarget(); | |
int mask = getMask(strength); | |
int done = CollationElementIterator.NULLORDER & mask; | |
if (DEBUG) { | |
debug("-------------------------handleNext-----------------------------------"); | |
debug(""); | |
debug("strength=" + strength + ", mask=" + Integer.toString(mask,16) | |
+ ", done=" + Integer.toString(done,16)); | |
debug("decomp=" + collator.getDecomposition()); | |
debug("target.begin=" + getTarget().getBeginIndex()); | |
debug("target.end=" + getTarget().getEndIndex()); | |
debug("start = " + start); | |
} | |
int index = start + minLen; | |
int matchEnd = 0; | |
while (index <= target.getEndIndex()) | |
{ | |
int patIndex = normLen; | |
int tval = 0, pval = 0; | |
boolean getP = true; | |
iter.setOffset(index); | |
matchEnd = index; | |
if (DEBUG) debug(" outer loop: patIndex=" + patIndex + ", index=" + index + ", iter offset= " + iter.getOffset()); | |
while ((patIndex > 0 || getP == false) && iter.getOffset() > start) | |
{ | |
if (DEBUG) { | |
debug(" inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset()); | |
debug(" getP=" + getP); | |
} | |
// Get the previous character in both the pattern and the target | |
tval = iter.previous() & mask; | |
if (getP) pval = valueList[--patIndex]; | |
getP = true; | |
if (DEBUG) debug(" pval=" + Integer.toString(pval,16) + ", tval=" + Integer.toString(tval,16)); | |
if (tval == 0) { // skip tval, use same pval | |
if (DEBUG) debug(" tval is ignorable"); | |
getP = false; | |
} | |
else if (pval != tval) { // Mismatch, skip ahead | |
if (DEBUG) debug(" mismatch: skippping " + getShift(tval, patIndex)); | |
index += getShift(tval, patIndex); | |
break; | |
} | |
else if (patIndex == 0) { | |
// The values matched, and we're at the beginning of the pattern, | |
// which means we matched the whole thing. | |
start = iter.getOffset(); | |
setMatchLength(matchEnd - start); | |
if (DEBUG) debug("Found match at index "+ start ); | |
return start; | |
} | |
} | |
if (DEBUG) { | |
debug(" end of inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset()); | |
debug(" getP=" + getP); | |
} | |
if (index == matchEnd) { | |
// We hit the beginning of the text being searched, which is | |
// possible if it contains lots of ignorable characters. | |
// Advance one character and try again. | |
if (DEBUG) debug("hit beginning of target; advance by one"); | |
index++; | |
} | |
} | |
if (DEBUG) debug("Fell off end of outer loop; returning DONE"); | |
return DONE; | |
} | |
/** | |
* Search backward for matching text ,starting at a given location. | |
* Clients should not call this method directly; instead they should call | |
* <code>SearchIterator.previous()</code>, which this method overrides. | |
* <p> | |
* If a match is found, this method returns the index at which the match | |
* starts and calls {@link SearchIterator#setMatchLength} | |
* with the number of characters in the target | |
* text that make up the match. If no match is found, the method returns | |
* <code>DONE</code> and does not call <tt>setMatchLength</tt>. | |
* <p> | |
* @param start The index in the target text at which the search starts. | |
* | |
* @return The index at which the matched text in the target starts, or DONE | |
* if no match was found. | |
* <p> | |
* @see SearchIterator#previous | |
* @see SearchIterator#DONE | |
*/ | |
protected int handlePrev(int start) | |
{ | |
int patLen = normLen; | |
int index = start - minLen; | |
int mask = getMask(strength); | |
int done = CollationElementIterator.NULLORDER & mask; | |
if (DEBUG) { | |
debug("-------------------------handlePrev-----------------------------------"); | |
debug(""); | |
debug("strength=" + strength + ", mask=" + Integer.toString(mask,16) | |
+ ", done=" + Integer.toString(done,16)); | |
debug("decomp=" + collator.getDecomposition()); | |
debug("target.begin=" + getTarget().getBeginIndex()); | |
debug("target.end=" + getTarget().getEndIndex()); | |
debug("start = " + start); | |
} | |
while (index >= 0) { | |
int patIndex = 0; | |
int tval = 0, pval = 0; | |
boolean getP = true; | |
iter.setOffset(index); | |
if (DEBUG) debug(" outer loop: patIndex=" + patIndex + ", index=" + index); | |
while ((patIndex < patLen || !getP) && iter.getOffset() < start) | |
{ | |
if (DEBUG) { | |
debug(" inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset()); | |
debug(" getP=" + getP); | |
} | |
tval = iter.next() & mask; | |
if (getP) pval = valueList[patIndex++]; | |
getP = true; | |
if (DEBUG) debug(" pval=" + Integer.toString(pval,16) + ", tval=" + Integer.toString(tval,16)); | |
if (tval == done) { | |
if (DEBUG) debug(" end of target; no match"); | |
return DONE; | |
} | |
else if (tval == 0) { | |
if (DEBUG) debug(" tval is ignorable"); | |
getP = false; | |
} | |
else if (pval != tval) { | |
// We didn't match this pattern. Skip ahead | |
if (DEBUG) debug(" mismatch: skippping " + getBackShift(tval, patIndex)); | |
int shift = getBackShift(tval, patIndex); | |
index -= shift; | |
break; | |
} | |
else if (patIndex == patLen) { | |
// The elements matched and we're at the end of the pattern, | |
// which means we matched the whole thing. | |
setMatchLength(iter.getOffset() - index); | |
if (DEBUG) debug("Found match at index "+ start ); | |
return index; | |
} | |
} | |
if (DEBUG) { | |
debug(" end of inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset()); | |
debug(" getP=" + getP); | |
} | |
if (iter.getOffset() >= start) { | |
// We hit the end of the text being searched, which is | |
// possible if it contains lots of ignorable characters. | |
// Back up one character and try again. | |
if (DEBUG) debug("hit end of target; back by one"); | |
index--; | |
} | |
} | |
if (DEBUG) debug("Fell off end of outer loop; returning DONE"); | |
return DONE; | |
} | |
/** | |
* Return a bitmask that will select only the portions of a collation | |
* element that are significant at the given strength level. | |
*/ | |
private static final int getMask(int strength) { | |
switch (strength) { | |
case Collator.PRIMARY: | |
return 0xFFFF0000; | |
case Collator.SECONDARY: | |
return 0xFFFFFF00; | |
default: | |
return 0xFFFFFFFF; | |
} | |
} | |
//------------------------------------------------------------------------ | |
// Private Data | |
// | |
private CollationElementIterator iter; | |
private RuleBasedCollator collator; | |
private int strength; | |
//------------------------------------------------------------------------ | |
// Everything from here on down is the data used to represent the | |
// Boyer-Moore shift tables and the code that generates and manipulates | |
// them. | |
// | |
private static final int MAX_TABLE = 256; // Size of the shift tables | |
private int valueList[] = null; | |
private int shiftTable[] = new int[MAX_TABLE]; | |
private int backShiftTable[] = new int[MAX_TABLE]; | |
private String pattern; // The pattern string | |
private int normLen = 0; // num. of collation elements in pattern. | |
private int minLen = 0; // Min of composed, decomposed versions | |
private int maxLen = 0; // Max | |
private void initialize() { | |
if (DEBUG) { | |
debug("-------------------------initialize-----------------------------------"); | |
debug("pattern=" + pattern); | |
} | |
CollationElementIterator iter = collator.getCollationElementIterator(pattern); | |
int mask = getMask(strength); | |
// See how many non-ignorable collation keys are in the text | |
normLen = 0; | |
int elem; | |
while ((elem = iter.next()) != CollationElementIterator.NULLORDER) | |
{ | |
if ((elem & mask) != 0) { | |
normLen++; | |
} | |
} | |
// Save them all | |
valueList = new int[normLen]; | |
int expandLen = 0; | |
iter.reset(); | |
for (int i = 0; i < normLen; i++) | |
{ | |
elem = iter.next(); | |
if ((elem & mask) != 0) { | |
valueList[i] = elem & mask; | |
} | |
// Keep track of whether there are any expanding-character | |
// sequences that can result in one of the characters that's in | |
// the pattern. If there are, we have to reduce the shift | |
// distances calculated below to account for it. | |
expandLen += iter.getMaxExpansion(elem) - 1; | |
} | |
// | |
// We need to remember the size of the composed and decomposed | |
// versions of the string. Standard Boyer-Moore shift calculations | |
// can be wrong by an amount up to that difference, since a small | |
// small number of characters in the pattern can map to a larger | |
// number in the text being searched, or vice-versa. | |
// | |
int uniLen = pattern.length(); | |
maxLen = Math.max(normLen, uniLen); | |
minLen = Math.min(normLen, uniLen) - expandLen; | |
if (DEBUG) debug("normLen=" + normLen + ", expandLen=" + expandLen | |
+ ", maxLen=" + maxLen + ", minLen=" + minLen); | |
// Now initialize the shift tables | |
// | |
// NOTE: This is the most conservative way to build them. If we had a way | |
// of knowing that there were no expanding/contracting chars in the rules, | |
// we could get rid of the "- 1" in the shiftTable calculations. | |
// But all of the default collators have at least one expansion or | |
// contraction, so it probably doesn't matter anyway. | |
// | |
for (int i = 0; i < MAX_TABLE; i++) { | |
shiftTable[i] = backShiftTable[i] = minLen; | |
} | |
for (int i = 0; i < normLen-1; i++) { | |
shiftTable[hash(valueList[i])] = Math.max(minLen - i - 1, 1); | |
} | |
shiftTable[hash(valueList[normLen-1])] = 1; | |
for (int i = normLen - 1; i > 0; i--) { | |
backShiftTable[hash(valueList[i])] = i; | |
} | |
backShiftTable[hash(valueList[0])] = 1; | |
if (DEBUG) dumpTables(); | |
} | |
/** | |
* Method used by StringSearch to determine how far to the right to | |
* shift the pattern during a Boyer-Moore search. | |
* | |
* @param curValue The current value in the target text | |
* @param curIndex The index in the pattern at which we failed to match | |
* curValue in the target text. | |
*/ | |
private int getShift( int curValue, int curIndex ) { | |
int shiftAmt = shiftTable[hash(curValue)]; | |
// if (minLen != maxLen) { | |
int adjust = normLen - curIndex; | |
// if (shiftAmt > adjust + 1) { | |
if (adjust > 1 && shiftAmt >= adjust) { | |
if (DEBUG) debug("getShift: adjusting by " + adjust); | |
// shiftAmt -= adjust; | |
shiftAmt -= adjust - 1; | |
} | |
// } | |
return shiftAmt; | |
} | |
/** | |
* Method used by StringSearch to determine how far to the left to | |
* shift the pattern during a reverse Boyer-Moore search. | |
* | |
* @param curValue The current value in the target text | |
* @param curIndex The index in the pattern at which we failed to match | |
* curValue in the target text. | |
*/ | |
private int getBackShift( int curValue, int curIndex ) { | |
int shiftAmt = backShiftTable[hash(curValue)]; | |
// if (minLen != maxLen) { | |
int adjust = curIndex; | |
// int adjust = normLen - (minLen - curIndex); | |
if (adjust > 1 && shiftAmt > adjust) { | |
shiftAmt -= adjust - 1; | |
} | |
/* | |
if (shiftAmt > adjust + 1) { | |
if (DEBUG) debug("getBackShift: adjusting by " + adjust); | |
shiftAmt -= adjust; | |
} | |
*/ | |
// } | |
return shiftAmt; | |
} | |
/** | |
* Hash a collation element from its full size (32 bits) down into a | |
* value that can be used as an index into the shift tables. Right | |
* now we do a modulus by the size of the hash table. | |
* | |
* TODO: At some point I should experiment to see whether a slightly | |
* more complicated hash function gives us a better distribution | |
* on multilingual text. I doubt it will have much effect on | |
* performance, though. | |
*/ | |
private static final int hash(int order) { | |
return CollationElementIterator.primaryOrder(order) % MAX_TABLE; | |
} | |
//------------------------------------------------------------------------- | |
// Debugging support... | |
//------------------------------------------------------------------------- | |
static private final boolean DEBUG = false; | |
static void debug(String str) { | |
System.out.println(str); | |
} | |
void dumpTables() { | |
for (int i = 0; i < MAX_TABLE; i++) { | |
if (shiftTable[i] != minLen) { | |
debug("shift[" + Integer.toString(i,16) + "] = " + shiftTable[i]); | |
} | |
} | |
for (int i = 0; i < MAX_TABLE; i++) { | |
if (backShiftTable[i] != minLen) { | |
debug("backShift[" + Integer.toString(i,16) + "] = " + backShiftTable[i]); | |
} | |
} | |
} | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment