Skip to content

Instantly share code, notes, and snippets.

@jexp
Created February 13, 2011 23:15
Show Gist options
  • Save jexp/825280 to your computer and use it in GitHub Desktop.
Save jexp/825280 to your computer and use it in GitHub Desktop.
Some implementation spikes of the Volnitsky substring search algorithm in java
import sun.misc.Unsafe;
import java.util.Arrays;
import java.util.Random;
/**
* @author mh
* @since 12.02.11
*/
public class StringSearcher {
private static final int WORD = 4;
private static final int SIZE = 10 * 1024;
private static final int SS_SIZE = 10;
private static int testOffset;
private static final int COUNT = 100000;
// word size = 4
private static void testHash() {
int[] hashes = new int[256];
String[][] buckets = new String[256][];
for (int i=0;i<256;i++) {
buckets[i]=new String[50];
}
char start=32;
char end = 128;
for (char a=start;a<end;a++) {
for (char b=start;b<end;b++) {
for (char c=start;c<end;c++) {
for (char d=start;d<end;d++) {
int hash= (a << 1 ^ b << 1 ^ c ^ d << 1 ) & 0xFF;
// String[] bucket = buckets[hash];
// if (bucket.length==hashes[hash]) {
// buckets[hash]=new String[bucket.length*2];
// System.arraycopy(bucket,0,buckets[hash],0,bucket.length);
// }
// if (buckets[hash].length > hashes[hash]) {
// buckets[hash][hashes[hash]]=String.valueOf(chars);
// }
hashes[hash]++;
}
}
}
}
for (int i=0;i<256;i++) {
System.out.printf("%3d => %6d %s%n",i,hashes[i],rep(hashes[i]/50000,'#'));
}
System.out.flush();
}
private static String rep(int count, char c) {
char[] chars = new char[count];
Arrays.fill(chars,c);
return String.valueOf(chars);
}
// 1297482766284
public static int indexOf(char[] textChars, char[] patternChars) {
// System.out.printf("pattern = #%s# %d%n", pattern, pattern.length());
int[] lookup = new int[256];
int patternSize = patternChars.length;
int stepSize = patternSize - WORD + 1;
for (int i = 0; i < stepSize; i++) {
int hash = (patternChars[i] << 1 ^ patternChars[i + 1] ^ patternChars[i + 2] << 1 ^ patternChars[i + 3] << 1) & 0xFF;
if (lookup[hash] != 0)
throw new IllegalStateException(String.format("hash already claimed %d old %d new %d", hash, lookup[hash], i));
lookup[hash] = i + 1;
}
int textSize = textChars.length - stepSize;
for (int i = 0; i < textSize; i += stepSize) {
int hash = (textChars[i] << 1^ textChars[i + 1] ^ textChars[i + 2] << 1 ^ textChars[i + 3] << 1) & 0xFF;
if (lookup[hash] != 0) {
int patternOffset = lookup[hash] - 1;
if (textChars[i] == patternChars[patternOffset] &&
textChars[i + 1] == patternChars[patternOffset + 1] &&
textChars[i + 2] == patternChars[patternOffset + 2] &&
textChars[i + 3] == patternChars[patternOffset + 3]) {
int offset = i - patternOffset;
if (Arrays.equals(Arrays.copyOfRange(textChars, offset, offset + patternSize), patternChars)) {
return offset;
}
}
}
}
return -1;
}
public static int indexOfTrace(CharSequence text, CharSequence pattern, int count) {
char[] patternChars = pattern.toString().toCharArray();
char[] textChars = text.toString().toCharArray();
System.out.printf("pattern = #%s# %d%n", pattern, pattern.length());
long time = System.currentTimeMillis();
for (int x = 0; x < count; x++) {
int[] lookup = new int[256];
int patternSize = patternChars.length;
int patternMinusWordSize = patternSize - WORD;
for (int i = 0; i < patternMinusWordSize; i++) {
int hash = (patternChars[i] << 1 ^ patternChars[i + 1] ^ patternChars[i + 2] << 1 ^ patternChars[i + 3] <<1) & 0xFF;
if (lookup[hash] != 0)
throw new IllegalStateException(String.format("hash already claimed %d old %s new %s", hash, pattern.subSequence(lookup[hash] - 1, lookup[hash] - 1 + WORD), pattern.subSequence(i, i + WORD)));
lookup[hash] = i + 1;
}
}
time = System.currentTimeMillis() - time;
System.out.println("time pattern = " + time);
int[] lookup = new int[256];
int patternSize = pattern.length();
int patternMinusWordSize = patternSize - WORD;
for (int i = 0; i < patternMinusWordSize; i++) {
int hash = (pattern.charAt(i) << 1 ^ pattern.charAt(i + 1) ^ pattern.charAt(i + 2) << 1 ^ pattern.charAt(i + 3) << 1) & 0xFF;
if (lookup[hash] != 0)
throw new IllegalStateException(String.format("hash already claimed %d old %s new %s", hash, pattern.subSequence(lookup[hash] - 1, lookup[hash] - 1 + WORD), pattern.subSequence(i, i + WORD)));
lookup[hash] = i + 1;
}
time = System.currentTimeMillis();
int matches = 0;
int result = -1;
LABEL:
for (int x = 0; x < count; x++) {
{ matches = 0;
int stepSize = patternMinusWordSize + 1;
int textSize = textChars.length - stepSize;
for (int i = 0; i < textSize; i += stepSize) {
int hash = (textChars[i] <<1 ^ textChars[i + 1] ^ textChars[i + 2] <<1^ textChars[i + 3]<<1) & 0xFF;
if (lookup[hash] != 0) {
matches++;
int patternOffset = lookup[hash] - 1;
if (textChars[i] == patternChars[patternOffset] &&
textChars[i + 1] == patternChars[patternOffset + 1] &&
textChars[i + 2] == patternChars[patternOffset + 2] &&
textChars[i + 3] == patternChars[patternOffset + 3]) {
int offset = i - patternOffset;
if (text.subSequence(offset, offset + patternSize).equals(pattern)) {
result = offset;
continue LABEL;
}
}
}
}
}
}
time = System.currentTimeMillis() - time;
System.out.println("time search = " + time+" "+matches+" "+result);
return -1;
}
public static void main(String[] args) {
//testHash();
//Unsafe.getUnsafe().throwException(new RuntimeException());
long seed = args.length > 0 ? Long.parseLong(args[0]) : System.currentTimeMillis();
Random r = new Random(seed);
char[] data = makeRandomChars(r, SIZE + SS_SIZE);
StringBuilder text = new StringBuilder(data.length);
text.append(data);
testOffset = r.nextInt(SIZE);
System.out.println("seed = " + seed+" offset "+testOffset);
String pattern = String.valueOf(makeRandomChars(r, SS_SIZE)).toUpperCase();
char[] patternChars = pattern.toCharArray();
text.replace(testOffset, testOffset + SS_SIZE, pattern);
data = text.toString().toCharArray();
indexOfTrace(text,pattern,COUNT);
int offset2 = indexOf(data, patternChars);
if (offset2 != testOffset) {
System.out.printf("Index error %d %d%n", testOffset, offset2);
indexOf(data, patternChars);
}
long time = System.currentTimeMillis();
int old = -1;
for (int i = 0; i < COUNT; i++) {
old = text.indexOf(pattern);
}
long oldTime = System.currentTimeMillis() - time;
time = System.currentTimeMillis();
int index = -1;
for (int i = 0; i < COUNT; i++) {
index = indexOf(data, patternChars);
}
long newTime = System.currentTimeMillis() - time;
System.out.printf("offset %d old %d (%d ms) new %d (%dms) seed = %d%n", testOffset, old, oldTime, index, newTime, seed);
if (old != index) {
index = indexOf(data, patternChars);
}
}
private static char[] makeRandomChars(Random r, final int size) {
char[] data = new char[size];
for (int i = 0; i < data.length; i++) {
data[i] = (char) (r.nextInt(255 - 32) + 32);
}
return data;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment