Created
February 13, 2011 23:15
-
-
Save jexp/825280 to your computer and use it in GitHub Desktop.
Some implementation spikes of the Volnitsky substring search algorithm in java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sun.misc.Unsafe; | |
import java.util.Arrays; | |
import java.util.Random; | |
/** | |
* @author mh | |
* @since 12.02.11 | |
*/ | |
public class StringSearcher { | |
private static final int WORD = 4; | |
private static final int SIZE = 10 * 1024; | |
private static final int SS_SIZE = 10; | |
private static int testOffset; | |
private static final int COUNT = 100000; | |
// word size = 4 | |
private static void testHash() { | |
int[] hashes = new int[256]; | |
String[][] buckets = new String[256][]; | |
for (int i=0;i<256;i++) { | |
buckets[i]=new String[50]; | |
} | |
char start=32; | |
char end = 128; | |
for (char a=start;a<end;a++) { | |
for (char b=start;b<end;b++) { | |
for (char c=start;c<end;c++) { | |
for (char d=start;d<end;d++) { | |
int hash= (a << 1 ^ b << 1 ^ c ^ d << 1 ) & 0xFF; | |
// String[] bucket = buckets[hash]; | |
// if (bucket.length==hashes[hash]) { | |
// buckets[hash]=new String[bucket.length*2]; | |
// System.arraycopy(bucket,0,buckets[hash],0,bucket.length); | |
// } | |
// if (buckets[hash].length > hashes[hash]) { | |
// buckets[hash][hashes[hash]]=String.valueOf(chars); | |
// } | |
hashes[hash]++; | |
} | |
} | |
} | |
} | |
for (int i=0;i<256;i++) { | |
System.out.printf("%3d => %6d %s%n",i,hashes[i],rep(hashes[i]/50000,'#')); | |
} | |
System.out.flush(); | |
} | |
private static String rep(int count, char c) { | |
char[] chars = new char[count]; | |
Arrays.fill(chars,c); | |
return String.valueOf(chars); | |
} | |
// 1297482766284 | |
public static int indexOf(char[] textChars, char[] patternChars) { | |
// System.out.printf("pattern = #%s# %d%n", pattern, pattern.length()); | |
int[] lookup = new int[256]; | |
int patternSize = patternChars.length; | |
int stepSize = patternSize - WORD + 1; | |
for (int i = 0; i < stepSize; i++) { | |
int hash = (patternChars[i] << 1 ^ patternChars[i + 1] ^ patternChars[i + 2] << 1 ^ patternChars[i + 3] << 1) & 0xFF; | |
if (lookup[hash] != 0) | |
throw new IllegalStateException(String.format("hash already claimed %d old %d new %d", hash, lookup[hash], i)); | |
lookup[hash] = i + 1; | |
} | |
int textSize = textChars.length - stepSize; | |
for (int i = 0; i < textSize; i += stepSize) { | |
int hash = (textChars[i] << 1^ textChars[i + 1] ^ textChars[i + 2] << 1 ^ textChars[i + 3] << 1) & 0xFF; | |
if (lookup[hash] != 0) { | |
int patternOffset = lookup[hash] - 1; | |
if (textChars[i] == patternChars[patternOffset] && | |
textChars[i + 1] == patternChars[patternOffset + 1] && | |
textChars[i + 2] == patternChars[patternOffset + 2] && | |
textChars[i + 3] == patternChars[patternOffset + 3]) { | |
int offset = i - patternOffset; | |
if (Arrays.equals(Arrays.copyOfRange(textChars, offset, offset + patternSize), patternChars)) { | |
return offset; | |
} | |
} | |
} | |
} | |
return -1; | |
} | |
public static int indexOfTrace(CharSequence text, CharSequence pattern, int count) { | |
char[] patternChars = pattern.toString().toCharArray(); | |
char[] textChars = text.toString().toCharArray(); | |
System.out.printf("pattern = #%s# %d%n", pattern, pattern.length()); | |
long time = System.currentTimeMillis(); | |
for (int x = 0; x < count; x++) { | |
int[] lookup = new int[256]; | |
int patternSize = patternChars.length; | |
int patternMinusWordSize = patternSize - WORD; | |
for (int i = 0; i < patternMinusWordSize; i++) { | |
int hash = (patternChars[i] << 1 ^ patternChars[i + 1] ^ patternChars[i + 2] << 1 ^ patternChars[i + 3] <<1) & 0xFF; | |
if (lookup[hash] != 0) | |
throw new IllegalStateException(String.format("hash already claimed %d old %s new %s", hash, pattern.subSequence(lookup[hash] - 1, lookup[hash] - 1 + WORD), pattern.subSequence(i, i + WORD))); | |
lookup[hash] = i + 1; | |
} | |
} | |
time = System.currentTimeMillis() - time; | |
System.out.println("time pattern = " + time); | |
int[] lookup = new int[256]; | |
int patternSize = pattern.length(); | |
int patternMinusWordSize = patternSize - WORD; | |
for (int i = 0; i < patternMinusWordSize; i++) { | |
int hash = (pattern.charAt(i) << 1 ^ pattern.charAt(i + 1) ^ pattern.charAt(i + 2) << 1 ^ pattern.charAt(i + 3) << 1) & 0xFF; | |
if (lookup[hash] != 0) | |
throw new IllegalStateException(String.format("hash already claimed %d old %s new %s", hash, pattern.subSequence(lookup[hash] - 1, lookup[hash] - 1 + WORD), pattern.subSequence(i, i + WORD))); | |
lookup[hash] = i + 1; | |
} | |
time = System.currentTimeMillis(); | |
int matches = 0; | |
int result = -1; | |
LABEL: | |
for (int x = 0; x < count; x++) { | |
{ matches = 0; | |
int stepSize = patternMinusWordSize + 1; | |
int textSize = textChars.length - stepSize; | |
for (int i = 0; i < textSize; i += stepSize) { | |
int hash = (textChars[i] <<1 ^ textChars[i + 1] ^ textChars[i + 2] <<1^ textChars[i + 3]<<1) & 0xFF; | |
if (lookup[hash] != 0) { | |
matches++; | |
int patternOffset = lookup[hash] - 1; | |
if (textChars[i] == patternChars[patternOffset] && | |
textChars[i + 1] == patternChars[patternOffset + 1] && | |
textChars[i + 2] == patternChars[patternOffset + 2] && | |
textChars[i + 3] == patternChars[patternOffset + 3]) { | |
int offset = i - patternOffset; | |
if (text.subSequence(offset, offset + patternSize).equals(pattern)) { | |
result = offset; | |
continue LABEL; | |
} | |
} | |
} | |
} | |
} | |
} | |
time = System.currentTimeMillis() - time; | |
System.out.println("time search = " + time+" "+matches+" "+result); | |
return -1; | |
} | |
public static void main(String[] args) { | |
//testHash(); | |
//Unsafe.getUnsafe().throwException(new RuntimeException()); | |
long seed = args.length > 0 ? Long.parseLong(args[0]) : System.currentTimeMillis(); | |
Random r = new Random(seed); | |
char[] data = makeRandomChars(r, SIZE + SS_SIZE); | |
StringBuilder text = new StringBuilder(data.length); | |
text.append(data); | |
testOffset = r.nextInt(SIZE); | |
System.out.println("seed = " + seed+" offset "+testOffset); | |
String pattern = String.valueOf(makeRandomChars(r, SS_SIZE)).toUpperCase(); | |
char[] patternChars = pattern.toCharArray(); | |
text.replace(testOffset, testOffset + SS_SIZE, pattern); | |
data = text.toString().toCharArray(); | |
indexOfTrace(text,pattern,COUNT); | |
int offset2 = indexOf(data, patternChars); | |
if (offset2 != testOffset) { | |
System.out.printf("Index error %d %d%n", testOffset, offset2); | |
indexOf(data, patternChars); | |
} | |
long time = System.currentTimeMillis(); | |
int old = -1; | |
for (int i = 0; i < COUNT; i++) { | |
old = text.indexOf(pattern); | |
} | |
long oldTime = System.currentTimeMillis() - time; | |
time = System.currentTimeMillis(); | |
int index = -1; | |
for (int i = 0; i < COUNT; i++) { | |
index = indexOf(data, patternChars); | |
} | |
long newTime = System.currentTimeMillis() - time; | |
System.out.printf("offset %d old %d (%d ms) new %d (%dms) seed = %d%n", testOffset, old, oldTime, index, newTime, seed); | |
if (old != index) { | |
index = indexOf(data, patternChars); | |
} | |
} | |
private static char[] makeRandomChars(Random r, final int size) { | |
char[] data = new char[size]; | |
for (int i = 0; i < data.length; i++) { | |
data[i] = (char) (r.nextInt(255 - 32) + 32); | |
} | |
return data; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment