Skip to content

Instantly share code, notes, and snippets.

@ncruces
Last active June 7, 2018 10:40
Show Gist options
  • Save ncruces/fed041b5cac611f413267cf674ce2e4f to your computer and use it in GitHub Desktop.
Save ncruces/fed041b5cac611f413267cf674ce2e4f to your computer and use it in GitHub Desktop.
Plain Java/Android StringSearch (ported from ICU; unfortunately the original had a few bugs at this point)
/*
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/SearchIterator.java,v $
* $Date: 2002/04/03 19:13:56 $
* $Revision: 1.6 $
*
*****************************************************************************************
*/
package com.ibm.icu.text;
import java.text.BreakIterator;
import java.text.CharacterIterator;
/**
* <code>SearchIterator</code> is an abstract base class that provides methods
* to search for a pattern within a text string. Instances of
* <code>SearchIterator</code> maintain a current position and scan over
* the target text, returning the indices the pattern is matched
* and the length of each match.
* <p>
* <code>SearchIterator</code> is an abstract base class that defines a
* protocol for text searching. Subclasses provide concrete implementations of
* various search algorithms. For example, {@link StringSearch}
* implements language-sensitive pattern matching based on the comparison rules
* defined in a {@link java.text.RuleBasedCollator RuleBasedCollator} object.
* <p>
* Internally, <code>SearchIterator</code> scans text using a
* {@link CharacterIterator}, and is thus able to scan text held
* by any object implementing that protocol. A <code>StringCharacterIterator</code>
* is used to scan <code>String</code> objects passed to <code>setText</code>.
* <p>
* <code>SearchIterator</code> provides an API that is similar to that of
* other text iteration classes such as <code>BreakIterator</code>. Using this
* class, it is easy to scan through text looking for all occurances of a
* given pattern. The following example uses a <code>StringSearch</code> object to
* find all instances of "fox" in the target string. Any other subclass of
* <code>SearchIterator</code> can be used in an identical manner.
* <pre><code>
* String target = "The quick brown fox jumped over the lazy fox";
* String pattern = "fox";
*
* SearchIterator iter = new StringSearch(pattern, target);
*
* for (int pos = iter.first(); pos != SearchIterator.DONE; pos = iter.next()) {
* System.out.println("Found match at " + pos +
* ", length is " + iter.getMatchLength());
* }
* </code></pre>
*
* @see StringSearch
*/
public abstract class SearchIterator {
/**
* DONE is returned by previous() and next() after all valid
* matches have been returned, and by first() and last() if
* there are no matches at all.
*/
public static final int DONE = -1;
/**
* Private value indicating that the iterator is pointing
* before the beginning of the target text.
*/
private static final int BEFORE = -2;
/**
* Return the first index at which the target text matches the search
* pattern. The iterator is adjusted so that its current index
* (as returned by {@link #getIndex}) is the match posisition if one was found
* and <code>DONE</code> if one was not.
*
* @return The character index of the first match, or <code>DONE</code> if there
* are no matches.
*/
final public int first() {
setIndex(BEFORE);
return next();
}
/**
* Return the first index greater than <tt>pos</tt> at which the target
* text matches the search pattern. The iterator is adjusted so that its current index
* (as returned by {@link #getIndex}) is the match posisition if one was found
* and <code>DONE</code> if one was not.
*
* @return The character index of the first match following <code>pos</code>,
* or <tt>DONE</tt> if there are no matches.
*/
final public int following(int pos) {
setIndex(pos);
return next();
}
/**
* Return the last index in the target text at which it matches
* the search pattern and adjusts the iteration to point to that position.
*
* @return The index of the first match, or <tt>DONE</tt> if there
* are no matches.
*/
final public int last() {
setIndex(DONE);
return previous();
}
/**
* Return the first index less than <code>pos</code> at which the target
* text matches the search pattern. The iterator is adjusted so that its current index
* (as returned by {@link #getIndex}) is the match posisition if one was found
* and <tt>DONE</tt> if one was not.
*
* @return The character index of the first match preceding <code>pos</code>,
* or <code>DONE</code> if there are no matches.
*/
final public int preceding(int pos) {
setIndex(pos);
return previous();
}
/**
* Return the index of the next point at which the text matches the
* search pattern, starting from the current position.
* @return The index of the next match after the current position,
* or <code>DONE</code> if there are no more matches.
*
* @see #first
*/
public int next() {
if (index == BEFORE){
// Starting at the beginning of the text
index = target.getBeginIndex();
} else if (length > 0) {
// Finding the next match after a previous one
index += overlap ? 1 : length;
}
index -= 1;
do {
length = 0;
index = handleNext(index + 1);
} while (index != DONE && !isBreakUnit(index, index+length));
return index;
}
/**
* Return the index of the previous point at which the text matches
* the search pattern, starting at the current position
*
* @return The index of the previous match before the current position,
* or <code>DONE</code> if there are no more matches.
*/
public int previous() {
if (index == DONE) {
index = target.getEndIndex();
} else if (length > 0) {
// Finding the previous match before a following one
index = overlap ? index + length - 1 : index;
}
index += 1;
do {
length = 0;
index = handlePrev(index - 1);
} while (index != DONE && !isBreakUnit(index, index+length));
if (index == DONE) {
index = BEFORE;
}
return getIndex();
}
/**
* Return the current index in the text being searched.
* If the iteration has gone past the end of the text
* (or past the beginning for a backwards search),
* {@link #DONE} is returned.
*/
public int getIndex() {
return index == BEFORE ? DONE : index;
}
/**
* Determines whether overlapping matches are returned. If this
* property is <code>true</code>, matches that begin within the
* boundry of the previous match are considered valid and will
* be returned. For example, when searching for "abab" in the
* target text "ababab", both offsets 0 and 2 will be returned
* as valid matches if this property is <code>true</code>.
* <p>
* The default setting of this property is <tt>true</tt>
*/
public void setOverlapping(boolean allowOverlap) {
overlap = allowOverlap;
}
/**
* Determines whether overlapping matches are returned.
*
* @see #setOverlapping
*/
public boolean isOverlapping() {
return overlap;
}
/**
* Returns the length of text in the target which matches the search
* pattern. This call returns a valid result only after a successful
* call to {@link #first}, {@link #next}, {@link #previous}, or {@link #last}.
* Just after construction, or after a searching method returns
* <tt>DONE</tt>, this method will return 0.
*
* @return The length of the match in the target text, or 0 if there
* is no match currently.
*/
public int getMatchLength() {
return length;
}
/**
* Set the BreakIterator that will be used to restrict the points
* at which matches are detected.
*
* @param iterator A {@link java.text.BreakIterator BreakIterator}
* that will be used to restrict the points
* at which matches are detected. If a match is found, but the match's start
* or end index is not a boundary as determined by
* the <tt>BreakIterator</tt>, the match will be rejected and
* another will be searched for.
*
* If this parameter is <tt>null</tt>, no break
* detection is attempted.
*
* @see #getBreakIterator
*/
public void setBreakIterator(BreakIterator iterator) {
breaker = iterator;
if (breaker != null) {
breaker.setText(target);
}
}
/**
* Returns the BreakIterator that is used to restrict the points
* at which matches are detected. This will be the same object
* that was passed to the constructor or to <code>setBreakIterator</code>.
* Note that <tt>null</tt> is a legal value; it means that break
* detection should not be attempted.
*
* @see #setBreakIterator
*/
public BreakIterator getBreakIterator() {
return breaker;
}
/**
* Set the target text which should be searched and resets the
* iterator's position to point before the start of the target text.
* This method is useful if you want to re-use an iterator to
* search for the same pattern within a different body of text.
*
* @see #getTarget
*/
public void setTarget(CharacterIterator iterator) {
target = iterator;
if (breaker != null) {
breaker.setText(target);
}
setIndex(BEFORE);
}
/**
* Return the target text which is being searched
*
* @see #setTarget
*/
public CharacterIterator getTarget() {
return target;
}
/**
* Returns the text that was matched by the most recent call to
* {@link #first}, {@link #next}, {@link #previous}, or {@link #last}.
* If the iterator is not pointing at a valid match (e.g. just after
* construction or after <tt>DONE</tt> has been returned, returns
* an empty string.
*/
public String getMatchedText() {
StringBuffer buffer = new StringBuffer();
if (length > 0) {
int i = 0;
for (char c = target.setIndex(index); i < length; c = target.next(), i++)
{
buffer.append(c);
}
}
return buffer.toString();
}
//-------------------------------------------------------------------
// Protected interface for subclasses
//-------------------------------------------------------------------
/**
* Constructor for use by subclasses.
* <p>
* @param target The target text to be searched. This is for internal
* use by this class. Subclasses need to maintain their
* own reference to or iterator over the target text
* for use by their {@link #handleNext handleNext} and
* {@link #handlePrev handlePrev} methods.
*
* @param breaker A {@link BreakIterator} that is used to restrict the points
* at which matches are detected. If <tt>handleNext</tt> or
* <tt>handlePrev</tt> finds a match, but the match's start
* or end index is not a boundary as determined by
* the <tt>BreakIterator</tt>, the match is rejected and
* <tt>handleNext</tt> or <tt>handlePrev</tt> is called again.
* If this parameter is <tt>null</tt>, no break
* detection is attempted.
*
*/
protected SearchIterator(CharacterIterator target, BreakIterator breaker)
{
this.target = target;
if (breaker != null) {
this.breaker = (BreakIterator)breaker.clone();
this.breaker.setText(target);
}
index = target.getBeginIndex();
length = 0;
}
/**
* Abstract method which subclasses override to provide the mechanism
* for finding the next match in the target text. This allows different
* subclasses to provide different search algorithms.
* <p>
* If a match is found, the implementation should return the index at
* which the match starts and should call {@link #setMatchLength setMatchLength}
* with the number of characters in the target
* text that make up the match. If no match is found, the method
* should return DONE and should not call <tt>setMatchLength</tt>.
* <p>
* @param startAt The index in the target text at which the search
* should start.
*
* @see #setMatchLength
*/
protected abstract int handleNext(int startAt);
/**
* Abstract method which subclasses override to provide the mechanism
* for finding the previous match in the target text. This allows different
* subclasses to provide different search algorithms.
* <p>
* If a match is found, the implementation should return the index at
* which the match starts and should call {@link #setMatchLength setMatchLength}
* with the number of characters in the target
* text that make up the match. If no match is found, the method
* should return DONE and should not call <tt>setMatchLength</tt>.
* <p>
* @param startAt The index in the target text at which the search
* should start.
*
* @see #setMatchLength
*/
protected abstract int handlePrev(int startAt);
/**
* Sets the length of the currently matched string in the target text.
* Subclasses' <code>handleNext</code> and <code>handlePrev</code>
* methods should call this when they find a match in the target text.
*/
protected void setMatchLength(int length) {
this.length = length;
}
//-------------------------------------------------------------------
// Privates
//
/**
* Internal method used by preceding and following. Sets the index
* to point to the given position, and clears any state that's
* affected.
*/
private void setIndex(int pos) {
index = pos;
length = 0;
}
/**
* Determine whether the target text bounded by <code>start</code> and
* <code>end</code> is one or more whole units of text as determined by
* the current <code>BreakIterator</code>.
*/
private boolean isBreakUnit(int start, int end)
{
if (breaker == null) {
return true;
}
boolean startBound = breaker.isBoundary(start);
boolean endBound = (end == target.getEndIndex()) || breaker.isBoundary(end);
return startBound && endBound;
}
//-------------------------------------------------------------------------
// Private data...
//-------------------------------------------------------------------------
private int index; // Current position in the target text
private int length; // Length of matched text, or 0
private boolean overlap = true; // Return overlapping matches?
private CharacterIterator target; // Target text to be searched
private BreakIterator breaker; // Break iterator to constrain matches
};
/*
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/StringSearch.java,v $
* $Date: 2002/03/20 05:11:16 $
* $Revision: 1.6 $
*
*****************************************************************************************
*/
package com.ibm.icu.text;
import java.text.BreakIterator;
import java.text.CharacterIterator;
import java.text.CollationElementIterator;
import java.text.Collator;
import java.text.RuleBasedCollator;
import java.text.StringCharacterIterator;
import java.util.Locale;
/**
* <code>StringSearch</code> is a <code>SearchIterator</code> that provides
* language-sensitive text searching based on the comparison rules defined
* in a {@link RuleBasedCollator} object.
* Instances of <code>StringSearch</code> function as iterators
* maintain a current position and scan over text returning the index of
* characters where the pattern occurs and the length of each match.
* <p>
* <code>StringSearch</code> uses a version of the fast Boyer-Moore search
* algorithm that has been adapted to work with the large character set of
* Unicode. See "Efficient Text Searching in Java", to be published in
* <i>Java Report</i> in February, 1999, for further information on the algorithm.
* <p>
* Consult the <code>SearchIterator</code> documentation for information on
* and examples of how to use instances of this class to implement text
* searching. <code>SearchIterator</code> provides all of the necessary
* API; this class only provides constructors and internal implementation
* methods.
*
* @see SearchIterator
* @see java.text.RuleBasedCollator
*
* @author Laura Werner
* @version 1.0
*/
public final class StringSearch extends SearchIterator
{
/**
* Construct a <code>StringSearch</code> object using a specific collator and set
* of boundary-detection rules.
* <p>
* @param pat The text for which this object will search.
*
* @param target The text in which to search for the pattern.
*
* @param coll A <code>RuleBasedCollator</code> object which defines the
* language-sensitive comparison rules used to determine
* whether text in the pattern and target matches.
*
* @param breaker A <code>BreakIterator</code> object used to constrain the matches
* that are found. Matches whose start and end indices
* in the target text are not boundaries as determined
* by the <code>BreakIterator</code> are ignored. If this behavior
* is not desired, <code>null</code> can be passed in instead.
*/
public StringSearch(String pat, CharacterIterator target,
RuleBasedCollator coll, BreakIterator breaker) {
super(target, breaker);
pattern = pat;
collator = coll;
strength = coll.getStrength();
target.setIndex(target.getBeginIndex());
iter = collator.getCollationElementIterator(target);
initialize(); // Initialize the Boyer-Moore tables
}
/**
* Construct a <code>StringSearch</code> object using a specific collator.
* <p>
* @param pattern The text for which this object will search.
*
* @param target The text in which to search for the pattern.
*
* @param collator A <code>RuleBasedCollator</code> object which defines the
* language-sensitive comparison rules used to determine
* whether text in the pattern and target matches.
*/
public StringSearch(String pattern,
CharacterIterator target,
RuleBasedCollator collator) {
this(pattern, target, collator, BreakIterator.getCharacterInstance());
}
/**
* Construct a <code>StringSearch</code> object using the collator and
* character boundary detection rules for a given locale.
* @param pattern The text for which this object will search.
*
* @param target The text in which to search for the pattern.
*
* @param loc The locale whose collation and break-detection rules
* should be used.
*
* @exception ClassCastException thrown if the collator for the specified
* locale is not a RuleBasedCollator.
*/
public StringSearch(String pattern, CharacterIterator target, Locale loc) {
this(pattern, target,
(RuleBasedCollator) Collator.getInstance(loc),
BreakIterator.getCharacterInstance(loc));
}
/**
* Construct a <code>StringSearch</code> object using the collator for the default
* locale.
* @param pattern The text for which this object will search.
*
* @param target The text in which to search for the pattern.
*/
public StringSearch(String pattern, String target) {
this(pattern,
new StringCharacterIterator(target),
(RuleBasedCollator)Collator.getInstance(),
BreakIterator.getCharacterInstance());
}
//-------------------------------------------------------------------
// Getters and Setters
//-------------------------------------------------------------------
/**
* Sets this object's strength property. The strength determines the
* minimum level of difference considered significant during a
* search. Generally, {@link Collator#TERTIARY} and
* {@link Collator#IDENTICAL} indicate that all differences are
* considered significant, {@link Collator#SECONDARY} indicates
* that upper/lower case distinctions should be ignored, and
* {@link Collator#PRIMARY} indicates that both case and accents
* should be ignored. However, the exact meanings of these constants
* are determined by individual Collator objects.
* <p>
* @see java.text.Collator#PRIMARY
* @see java.text.Collator#SECONDARY
* @see java.text.Collator#TERTIARY
* @see java.text.Collator#IDENTICAL
*/
public void setStrength(int newStrength) {
strength = newStrength;
// Due to a bug (?) in CollationElementIterator, we must set the
// collator's strength as well, since the iterator is going to
// mask out the portions of the collation element that are not
// relevant for the collator's current strength setting
// Note that this makes it impossible to share a Collator among
// multiple StringSearch objects if you adjust Strength settings.
collator.setStrength(strength);
initialize();
}
/**
* Returns this object's strength property, which indicates what level
* of differences are considered significant during a search.
* <p>
* @see #setStrength
*/
public int getStrength() {
return strength;
}
/**
* Set the collator to be used for this string search. Also changes
* the search strength to match that of the new collator.
* <p>
* This method causes internal data such as Boyer-Moore shift tables
* to be recalculated, but the iterator's position is unchanged.
* <p>
* @see #getCollator
*/
public void setCollator(RuleBasedCollator coll) {
collator = coll;
strength = collator.getStrength();
// Also need to recompute the pattern and get a new target iterator
CharacterIterator target = getTarget();
target.setIndex(target.getBeginIndex());
iter = collator.getCollationElementIterator(target);
initialize();
}
/**
* Return the RuleBasedCollator being used for this string search.
*/
public RuleBasedCollator getCollator() {
return collator;
}
/**
* Set the pattern for which to search.
* This method causes internal data such as Boyer-Moore shift tables
* to be recalculated, but the iterator's position is unchanged.
*/
public void setPattern(String pat) {
pattern = pat;
initialize();
}
/**
* Returns the pattern for which this object is searching.
*/
public String getPattern() {
return pattern;
}
/**
* Set the target text which should be searched and resets the
* iterator's position to point before the start of the new text.
* This method is useful if you want to re-use an iterator to
* search for the same pattern within a different body of text.
*/
public void setTarget(CharacterIterator target) {
super.setTarget(target);
// Since we're caching a CollationElementIterator, recreate it
target.setIndex(target.getBeginIndex());
iter = collator.getCollationElementIterator(target);
}
//-------------------------------------------------------------------
// Privates
//-------------------------------------------------------------------
/**
* Search forward for matching text, starting at a given location.
* Clients should not call this method directly; instead they should call
* {@link SearchIterator#next}.
* <p>
* If a match is found, this method returns the index at which the match
* starts and calls {@link SearchIterator#setMatchLength}
* with the number of characters in the target
* text that make up the match. If no match is found, the method returns
* <code>DONE</code> and does not call <tt>setMatchLength</tt>.
* <p>
* @param start The index in the target text at which the search starts.
*
* @return The index at which the matched text in the target starts, or DONE
* if no match was found.
* <p>
* @see SearchIterator#next
* @see SearchIterator#DONE
*/
protected int handleNext(int start)
{
CharacterIterator target = getTarget();
int mask = getMask(strength);
int done = CollationElementIterator.NULLORDER & mask;
if (DEBUG) {
debug("-------------------------handleNext-----------------------------------");
debug("");
debug("strength=" + strength + ", mask=" + Integer.toString(mask,16)
+ ", done=" + Integer.toString(done,16));
debug("decomp=" + collator.getDecomposition());
debug("target.begin=" + getTarget().getBeginIndex());
debug("target.end=" + getTarget().getEndIndex());
debug("start = " + start);
}
int index = start + minLen;
int matchEnd = 0;
while (index <= target.getEndIndex())
{
int patIndex = normLen;
int tval = 0, pval = 0;
boolean getP = true;
iter.setOffset(index);
matchEnd = index;
if (DEBUG) debug(" outer loop: patIndex=" + patIndex + ", index=" + index + ", iter offset= " + iter.getOffset());
while ((patIndex > 0 || getP == false) && iter.getOffset() > start)
{
if (DEBUG) {
debug(" inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset());
debug(" getP=" + getP);
}
// Get the previous character in both the pattern and the target
tval = iter.previous() & mask;
if (getP) pval = valueList[--patIndex];
getP = true;
if (DEBUG) debug(" pval=" + Integer.toString(pval,16) + ", tval=" + Integer.toString(tval,16));
if (tval == 0) { // skip tval, use same pval
if (DEBUG) debug(" tval is ignorable");
getP = false;
}
else if (pval != tval) { // Mismatch, skip ahead
if (DEBUG) debug(" mismatch: skippping " + getShift(tval, patIndex));
index += getShift(tval, patIndex);
break;
}
else if (patIndex == 0) {
// The values matched, and we're at the beginning of the pattern,
// which means we matched the whole thing.
start = iter.getOffset();
setMatchLength(matchEnd - start);
if (DEBUG) debug("Found match at index "+ start );
return start;
}
}
if (DEBUG) {
debug(" end of inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset());
debug(" getP=" + getP);
}
if (index == matchEnd) {
// We hit the beginning of the text being searched, which is
// possible if it contains lots of ignorable characters.
// Advance one character and try again.
if (DEBUG) debug("hit beginning of target; advance by one");
index++;
}
}
if (DEBUG) debug("Fell off end of outer loop; returning DONE");
return DONE;
}
/**
* Search backward for matching text ,starting at a given location.
* Clients should not call this method directly; instead they should call
* <code>SearchIterator.previous()</code>, which this method overrides.
* <p>
* If a match is found, this method returns the index at which the match
* starts and calls {@link SearchIterator#setMatchLength}
* with the number of characters in the target
* text that make up the match. If no match is found, the method returns
* <code>DONE</code> and does not call <tt>setMatchLength</tt>.
* <p>
* @param start The index in the target text at which the search starts.
*
* @return The index at which the matched text in the target starts, or DONE
* if no match was found.
* <p>
* @see SearchIterator#previous
* @see SearchIterator#DONE
*/
protected int handlePrev(int start)
{
int patLen = normLen;
int index = start - minLen;
int mask = getMask(strength);
int done = CollationElementIterator.NULLORDER & mask;
if (DEBUG) {
debug("-------------------------handlePrev-----------------------------------");
debug("");
debug("strength=" + strength + ", mask=" + Integer.toString(mask,16)
+ ", done=" + Integer.toString(done,16));
debug("decomp=" + collator.getDecomposition());
debug("target.begin=" + getTarget().getBeginIndex());
debug("target.end=" + getTarget().getEndIndex());
debug("start = " + start);
}
while (index >= 0) {
int patIndex = 0;
int tval = 0, pval = 0;
boolean getP = true;
iter.setOffset(index);
if (DEBUG) debug(" outer loop: patIndex=" + patIndex + ", index=" + index);
while ((patIndex < patLen || !getP) && iter.getOffset() < start)
{
if (DEBUG) {
debug(" inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset());
debug(" getP=" + getP);
}
tval = iter.next() & mask;
if (getP) pval = valueList[patIndex++];
getP = true;
if (DEBUG) debug(" pval=" + Integer.toString(pval,16) + ", tval=" + Integer.toString(tval,16));
if (tval == done) {
if (DEBUG) debug(" end of target; no match");
return DONE;
}
else if (tval == 0) {
if (DEBUG) debug(" tval is ignorable");
getP = false;
}
else if (pval != tval) {
// We didn't match this pattern. Skip ahead
if (DEBUG) debug(" mismatch: skippping " + getBackShift(tval, patIndex));
int shift = getBackShift(tval, patIndex);
index -= shift;
break;
}
else if (patIndex == patLen) {
// The elements matched and we're at the end of the pattern,
// which means we matched the whole thing.
setMatchLength(iter.getOffset() - index);
if (DEBUG) debug("Found match at index "+ start );
return index;
}
}
if (DEBUG) {
debug(" end of inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset());
debug(" getP=" + getP);
}
if (iter.getOffset() >= start) {
// We hit the end of the text being searched, which is
// possible if it contains lots of ignorable characters.
// Back up one character and try again.
if (DEBUG) debug("hit end of target; back by one");
index--;
}
}
if (DEBUG) debug("Fell off end of outer loop; returning DONE");
return DONE;
}
/**
* Return a bitmask that will select only the portions of a collation
* element that are significant at the given strength level.
*/
private static final int getMask(int strength) {
switch (strength) {
case Collator.PRIMARY:
return 0xFFFF0000;
case Collator.SECONDARY:
return 0xFFFFFF00;
default:
return 0xFFFFFFFF;
}
}
//------------------------------------------------------------------------
// Private Data
//
private CollationElementIterator iter;
private RuleBasedCollator collator;
private int strength;
//------------------------------------------------------------------------
// Everything from here on down is the data used to represent the
// Boyer-Moore shift tables and the code that generates and manipulates
// them.
//
private static final int MAX_TABLE = 256; // Size of the shift tables
private int valueList[] = null;
private int shiftTable[] = new int[MAX_TABLE];
private int backShiftTable[] = new int[MAX_TABLE];
private String pattern; // The pattern string
private int normLen = 0; // num. of collation elements in pattern.
private int minLen = 0; // Min of composed, decomposed versions
private int maxLen = 0; // Max
private void initialize() {
if (DEBUG) {
debug("-------------------------initialize-----------------------------------");
debug("pattern=" + pattern);
}
CollationElementIterator iter = collator.getCollationElementIterator(pattern);
int mask = getMask(strength);
// See how many non-ignorable collation keys are in the text
normLen = 0;
int elem;
while ((elem = iter.next()) != CollationElementIterator.NULLORDER)
{
if ((elem & mask) != 0) {
normLen++;
}
}
// Save them all
valueList = new int[normLen];
int expandLen = 0;
iter.reset();
for (int i = 0; i < normLen; i++)
{
elem = iter.next();
if ((elem & mask) != 0) {
valueList[i] = elem & mask;
}
// Keep track of whether there are any expanding-character
// sequences that can result in one of the characters that's in
// the pattern. If there are, we have to reduce the shift
// distances calculated below to account for it.
expandLen += iter.getMaxExpansion(elem) - 1;
}
//
// We need to remember the size of the composed and decomposed
// versions of the string. Standard Boyer-Moore shift calculations
// can be wrong by an amount up to that difference, since a small
// small number of characters in the pattern can map to a larger
// number in the text being searched, or vice-versa.
//
int uniLen = pattern.length();
maxLen = Math.max(normLen, uniLen);
minLen = Math.min(normLen, uniLen) - expandLen;
if (DEBUG) debug("normLen=" + normLen + ", expandLen=" + expandLen
+ ", maxLen=" + maxLen + ", minLen=" + minLen);
// Now initialize the shift tables
//
// NOTE: This is the most conservative way to build them. If we had a way
// of knowing that there were no expanding/contracting chars in the rules,
// we could get rid of the "- 1" in the shiftTable calculations.
// But all of the default collators have at least one expansion or
// contraction, so it probably doesn't matter anyway.
//
for (int i = 0; i < MAX_TABLE; i++) {
shiftTable[i] = backShiftTable[i] = minLen;
}
for (int i = 0; i < normLen-1; i++) {
shiftTable[hash(valueList[i])] = Math.max(minLen - i - 1, 1);
}
shiftTable[hash(valueList[normLen-1])] = 1;
for (int i = normLen - 1; i > 0; i--) {
backShiftTable[hash(valueList[i])] = i;
}
backShiftTable[hash(valueList[0])] = 1;
if (DEBUG) dumpTables();
}
/**
* Method used by StringSearch to determine how far to the right to
* shift the pattern during a Boyer-Moore search.
*
* @param curValue The current value in the target text
* @param curIndex The index in the pattern at which we failed to match
* curValue in the target text.
*/
private int getShift( int curValue, int curIndex ) {
int shiftAmt = shiftTable[hash(curValue)];
// if (minLen != maxLen) {
int adjust = normLen - curIndex;
// if (shiftAmt > adjust + 1) {
if (adjust > 1 && shiftAmt >= adjust) {
if (DEBUG) debug("getShift: adjusting by " + adjust);
// shiftAmt -= adjust;
shiftAmt -= adjust - 1;
}
// }
return shiftAmt;
}
/**
* Method used by StringSearch to determine how far to the left to
* shift the pattern during a reverse Boyer-Moore search.
*
* @param curValue The current value in the target text
* @param curIndex The index in the pattern at which we failed to match
* curValue in the target text.
*/
private int getBackShift( int curValue, int curIndex ) {
int shiftAmt = backShiftTable[hash(curValue)];
// if (minLen != maxLen) {
int adjust = curIndex;
// int adjust = normLen - (minLen - curIndex);
if (adjust > 1 && shiftAmt > adjust) {
shiftAmt -= adjust - 1;
}
/*
if (shiftAmt > adjust + 1) {
if (DEBUG) debug("getBackShift: adjusting by " + adjust);
shiftAmt -= adjust;
}
*/
// }
return shiftAmt;
}
/**
* Hash a collation element from its full size (32 bits) down into a
* value that can be used as an index into the shift tables. Right
* now we do a modulus by the size of the hash table.
*
* TODO: At some point I should experiment to see whether a slightly
* more complicated hash function gives us a better distribution
* on multilingual text. I doubt it will have much effect on
* performance, though.
*/
private static final int hash(int order) {
return CollationElementIterator.primaryOrder(order) % MAX_TABLE;
}
//-------------------------------------------------------------------------
// Debugging support...
//-------------------------------------------------------------------------
static private final boolean DEBUG = false;
static void debug(String str) {
System.out.println(str);
}
void dumpTables() {
for (int i = 0; i < MAX_TABLE; i++) {
if (shiftTable[i] != minLen) {
debug("shift[" + Integer.toString(i,16) + "] = " + shiftTable[i]);
}
}
for (int i = 0; i < MAX_TABLE; i++) {
if (backShiftTable[i] != minLen) {
debug("backShift[" + Integer.toString(i,16) + "] = " + backShiftTable[i]);
}
}
}
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment