jexp · February 13, 2011 23:15
diff --git a/Volnitsky.java b/Volnitsky.java
 import sun.misc.Unsafe;

 import java.util.Arrays;
 import java.util.Random;

 /**
 * @author mh
 * @since 12.02.11
 */
 public class StringSearcher {
    private static final int WORD = 4;
    private static final int SIZE = 10 * 1024;
    private static final int SS_SIZE = 10;
    private static int testOffset;
    private static final int COUNT = 100000;
    // word size = 4


    private static void testHash() {
        int[] hashes = new int[256];
        String[][] buckets = new String[256][];
        for (int i=0;i<256;i++) {
            buckets[i]=new String[50];
        }
        char start=32;
        char end = 128;
        for (char a=start;a<end;a++) {
            for (char b=start;b<end;b++) {
                for (char c=start;c<end;c++) {
                    for (char d=start;d<end;d++) {
                        int hash= (a << 1 ^ b  << 1 ^ c  ^ d << 1 ) & 0xFF;
 //                        String[] bucket = buckets[hash];
 //                        if (bucket.length==hashes[hash]) {
 //                            buckets[hash]=new String[bucket.length*2];
 //                            System.arraycopy(bucket,0,buckets[hash],0,bucket.length);
 //                        }
 //                        if (buckets[hash].length > hashes[hash]) {
 //                            buckets[hash][hashes[hash]]=String.valueOf(chars);
 //                        }
                        hashes[hash]++;
                    }
                }
            }
        }
        for (int i=0;i<256;i++) {
            System.out.printf("%3d => %6d %s%n",i,hashes[i],rep(hashes[i]/50000,'#'));
        }
        System.out.flush();
    }

    private static String rep(int count, char c) {
        char[] chars = new char[count];
        Arrays.fill(chars,c);
        return String.valueOf(chars);
    }

    // 1297482766284
    public static int indexOf(char[] textChars, char[] patternChars) {
 //        System.out.printf("pattern = #%s# %d%n", pattern, pattern.length());
        int[] lookup = new int[256];
        int patternSize = patternChars.length;
        int stepSize = patternSize - WORD + 1;
        for (int i = 0; i < stepSize; i++) {
            int hash = (patternChars[i] << 1  ^ patternChars[i + 1] ^ patternChars[i + 2] << 1 ^ patternChars[i + 3] << 1) & 0xFF;
            if (lookup[hash] != 0)
                throw new IllegalStateException(String.format("hash already claimed %d old %d new %d", hash, lookup[hash], i));
            lookup[hash] = i + 1;

        }

        int textSize = textChars.length - stepSize;
        for (int i = 0; i < textSize; i += stepSize) {
            int hash = (textChars[i] << 1^ textChars[i + 1] ^ textChars[i + 2] << 1 ^ textChars[i + 3] << 1) & 0xFF;
            if (lookup[hash] != 0) {
                int patternOffset = lookup[hash] - 1;
                if (textChars[i] == patternChars[patternOffset] &&
                        textChars[i + 1] == patternChars[patternOffset + 1] &&
                        textChars[i + 2] == patternChars[patternOffset + 2] &&
                        textChars[i + 3] == patternChars[patternOffset + 3]) {
                    int offset = i - patternOffset;
                    if (Arrays.equals(Arrays.copyOfRange(textChars, offset, offset + patternSize), patternChars)) {
                        return offset;
                    }
                }
            }
        }
        return -1;
    }

    public static int indexOfTrace(CharSequence text, CharSequence pattern, int count) {
        char[] patternChars = pattern.toString().toCharArray();
        char[] textChars = text.toString().toCharArray();
        System.out.printf("pattern = #%s# %d%n", pattern, pattern.length());
        long time = System.currentTimeMillis();
        for (int x = 0; x < count; x++) {
            int[] lookup = new int[256];
            int patternSize = patternChars.length;
            int patternMinusWordSize = patternSize - WORD;
            for (int i = 0; i < patternMinusWordSize; i++) {
                int hash = (patternChars[i] << 1 ^ patternChars[i + 1] ^ patternChars[i + 2] << 1 ^ patternChars[i + 3] <<1) & 0xFF;
                if (lookup[hash] != 0)
                    throw new IllegalStateException(String.format("hash already claimed %d old %s new %s", hash, pattern.subSequence(lookup[hash] - 1, lookup[hash] - 1 + WORD), pattern.subSequence(i, i + WORD)));
                lookup[hash] = i + 1;
            }
        }
        time = System.currentTimeMillis() - time;
        System.out.println("time pattern = " + time);
        int[] lookup = new int[256];
        int patternSize = pattern.length();
        int patternMinusWordSize = patternSize - WORD;
        for (int i = 0; i < patternMinusWordSize; i++) {
            int hash = (pattern.charAt(i) << 1 ^ pattern.charAt(i + 1)  ^ pattern.charAt(i + 2) << 1 ^ pattern.charAt(i + 3) << 1) & 0xFF;
            if (lookup[hash] != 0)
                throw new IllegalStateException(String.format("hash already claimed %d old %s new %s", hash, pattern.subSequence(lookup[hash] - 1, lookup[hash] - 1 + WORD), pattern.subSequence(i, i + WORD)));
            lookup[hash] = i + 1;
        }

        time = System.currentTimeMillis();
        int matches = 0;
        int result = -1;
        LABEL:
        for (int x = 0; x < count; x++) {
            {   matches = 0;
                int stepSize = patternMinusWordSize + 1;
                int textSize = textChars.length - stepSize;
                for (int i = 0; i < textSize; i += stepSize) {
                    int hash = (textChars[i] <<1 ^ textChars[i + 1] ^ textChars[i + 2] <<1^ textChars[i + 3]<<1) & 0xFF;
                    if (lookup[hash] != 0) {
                        matches++;
                        int patternOffset = lookup[hash] - 1;
                        if (textChars[i] == patternChars[patternOffset] &&
                                textChars[i + 1] == patternChars[patternOffset + 1] &&
                                textChars[i + 2] == patternChars[patternOffset + 2] &&
                                textChars[i + 3] == patternChars[patternOffset + 3]) {
                            int offset = i - patternOffset;
                            if (text.subSequence(offset, offset + patternSize).equals(pattern)) {
                                result = offset;
                                continue LABEL;
                            }
                        }
                    }
                }
            }
        }
        time = System.currentTimeMillis() - time;
        System.out.println("time search = " + time+" "+matches+" "+result);
        return -1;
    }

    public static void main(String[] args) {
        //testHash();
        //Unsafe.getUnsafe().throwException(new RuntimeException());
        long seed = args.length > 0 ? Long.parseLong(args[0]) : System.currentTimeMillis();
        Random r = new Random(seed);
        char[] data = makeRandomChars(r, SIZE + SS_SIZE);
        StringBuilder text = new StringBuilder(data.length);

        text.append(data);
        testOffset = r.nextInt(SIZE);
        System.out.println("seed = " + seed+" offset "+testOffset);
        String pattern = String.valueOf(makeRandomChars(r, SS_SIZE)).toUpperCase();
        char[] patternChars = pattern.toCharArray();
        text.replace(testOffset, testOffset + SS_SIZE, pattern);
        data = text.toString().toCharArray();
        indexOfTrace(text,pattern,COUNT);

        int offset2 = indexOf(data, patternChars);
        if (offset2 != testOffset) {
            System.out.printf("Index error %d %d%n", testOffset, offset2);
            indexOf(data, patternChars);
        }

        long time = System.currentTimeMillis();
        int old = -1;
        for (int i = 0; i < COUNT; i++) {
            old = text.indexOf(pattern);
        }
        long oldTime = System.currentTimeMillis() - time;
        time = System.currentTimeMillis();
        int index = -1;
        for (int i = 0; i < COUNT; i++) {
            index = indexOf(data, patternChars);
        }
        long newTime = System.currentTimeMillis() - time;

        System.out.printf("offset %d old %d (%d ms) new %d (%dms) seed = %d%n", testOffset, old, oldTime, index, newTime, seed);
        if (old != index) {
            index = indexOf(data, patternChars);
        }
    }

    private static char[] makeRandomChars(Random r, final int size) {
        char[] data = new char[size];
        for (int i = 0; i < data.length; i++) {
            data[i] = (char) (r.nextInt(255 - 32) + 32);
        }
        return data;
    }
 }
	import sun.misc.Unsafe;

	import java.util.Arrays;
	import java.util.Random;

	/**
	* @author mh
	* @since 12.02.11
	*/
	public class StringSearcher {
	private static final int WORD = 4;
	private static final int SIZE = 10 * 1024;
	private static final int SS_SIZE = 10;
	private static int testOffset;
	private static final int COUNT = 100000;
	// word size = 4


	private static void testHash() {
	int[] hashes = new int[256];
	String[][] buckets = new String[256][];
	for (int i=0;i<256;i++) {
	buckets[i]=new String[50];
	}
	char start=32;
	char end = 128;
	for (char a=start;a<end;a++) {
	for (char b=start;b<end;b++) {
	for (char c=start;c<end;c++) {
	for (char d=start;d<end;d++) {
	int hash= (a << 1 ^ b << 1 ^ c ^ d << 1 ) & 0xFF;
	// String[] bucket = buckets[hash];
	// if (bucket.length==hashes[hash]) {
	// buckets[hash]=new String[bucket.length*2];
	// System.arraycopy(bucket,0,buckets[hash],0,bucket.length);
	// }
	// if (buckets[hash].length > hashes[hash]) {
	// buckets[hash][hashes[hash]]=String.valueOf(chars);
	// }
	hashes[hash]++;
	}
	}
	}
	}
	for (int i=0;i<256;i++) {
	System.out.printf("%3d => %6d %s%n",i,hashes[i],rep(hashes[i]/50000,'#'));
	}
	System.out.flush();
	}

	private static String rep(int count, char c) {
	char[] chars = new char[count];
	Arrays.fill(chars,c);
	return String.valueOf(chars);
	}

	// 1297482766284
	public static int indexOf(char[] textChars, char[] patternChars) {
	// System.out.printf("pattern = #%s# %d%n", pattern, pattern.length());
	int[] lookup = new int[256];
	int patternSize = patternChars.length;
	int stepSize = patternSize - WORD + 1;
	for (int i = 0; i < stepSize; i++) {
	int hash = (patternChars[i] << 1 ^ patternChars[i + 1] ^ patternChars[i + 2] << 1 ^ patternChars[i + 3] << 1) & 0xFF;
	if (lookup[hash] != 0)
	throw new IllegalStateException(String.format("hash already claimed %d old %d new %d", hash, lookup[hash], i));
	lookup[hash] = i + 1;

	}

	int textSize = textChars.length - stepSize;
	for (int i = 0; i < textSize; i += stepSize) {
	int hash = (textChars[i] << 1^ textChars[i + 1] ^ textChars[i + 2] << 1 ^ textChars[i + 3] << 1) & 0xFF;
	if (lookup[hash] != 0) {
	int patternOffset = lookup[hash] - 1;
	if (textChars[i] == patternChars[patternOffset] &&
	textChars[i + 1] == patternChars[patternOffset + 1] &&
	textChars[i + 2] == patternChars[patternOffset + 2] &&
	textChars[i + 3] == patternChars[patternOffset + 3]) {
	int offset = i - patternOffset;
	if (Arrays.equals(Arrays.copyOfRange(textChars, offset, offset + patternSize), patternChars)) {
	return offset;
	}
	}
	}
	}
	return -1;
	}

	public static int indexOfTrace(CharSequence text, CharSequence pattern, int count) {
	char[] patternChars = pattern.toString().toCharArray();
	char[] textChars = text.toString().toCharArray();
	System.out.printf("pattern = #%s# %d%n", pattern, pattern.length());
	long time = System.currentTimeMillis();
	for (int x = 0; x < count; x++) {
	int[] lookup = new int[256];
	int patternSize = patternChars.length;
	int patternMinusWordSize = patternSize - WORD;
	for (int i = 0; i < patternMinusWordSize; i++) {
	int hash = (patternChars[i] << 1 ^ patternChars[i + 1] ^ patternChars[i + 2] << 1 ^ patternChars[i + 3] <<1) & 0xFF;
	if (lookup[hash] != 0)
	throw new IllegalStateException(String.format("hash already claimed %d old %s new %s", hash, pattern.subSequence(lookup[hash] - 1, lookup[hash] - 1 + WORD), pattern.subSequence(i, i + WORD)));
	lookup[hash] = i + 1;
	}
	}
	time = System.currentTimeMillis() - time;
	System.out.println("time pattern = " + time);
	int[] lookup = new int[256];
	int patternSize = pattern.length();
	int patternMinusWordSize = patternSize - WORD;
	for (int i = 0; i < patternMinusWordSize; i++) {
	int hash = (pattern.charAt(i) << 1 ^ pattern.charAt(i + 1) ^ pattern.charAt(i + 2) << 1 ^ pattern.charAt(i + 3) << 1) & 0xFF;
	if (lookup[hash] != 0)
	throw new IllegalStateException(String.format("hash already claimed %d old %s new %s", hash, pattern.subSequence(lookup[hash] - 1, lookup[hash] - 1 + WORD), pattern.subSequence(i, i + WORD)));
	lookup[hash] = i + 1;
	}

	time = System.currentTimeMillis();
	int matches = 0;
	int result = -1;
	LABEL:
	for (int x = 0; x < count; x++) {
	{ matches = 0;
	int stepSize = patternMinusWordSize + 1;
	int textSize = textChars.length - stepSize;
	for (int i = 0; i < textSize; i += stepSize) {
	int hash = (textChars[i] <<1 ^ textChars[i + 1] ^ textChars[i + 2] <<1^ textChars[i + 3]<<1) & 0xFF;
	if (lookup[hash] != 0) {
	matches++;
	int patternOffset = lookup[hash] - 1;
	if (textChars[i] == patternChars[patternOffset] &&
	textChars[i + 1] == patternChars[patternOffset + 1] &&
	textChars[i + 2] == patternChars[patternOffset + 2] &&
	textChars[i + 3] == patternChars[patternOffset + 3]) {
	int offset = i - patternOffset;
	if (text.subSequence(offset, offset + patternSize).equals(pattern)) {
	result = offset;
	continue LABEL;
	}
	}
	}
	}
	}
	}
	time = System.currentTimeMillis() - time;
	System.out.println("time search = " + time+" "+matches+" "+result);
	return -1;
	}

	public static void main(String[] args) {
	//testHash();
	//Unsafe.getUnsafe().throwException(new RuntimeException());
	long seed = args.length > 0 ? Long.parseLong(args[0]) : System.currentTimeMillis();
	Random r = new Random(seed);
	char[] data = makeRandomChars(r, SIZE + SS_SIZE);
	StringBuilder text = new StringBuilder(data.length);

	text.append(data);
	testOffset = r.nextInt(SIZE);
	System.out.println("seed = " + seed+" offset "+testOffset);
	String pattern = String.valueOf(makeRandomChars(r, SS_SIZE)).toUpperCase();
	char[] patternChars = pattern.toCharArray();
	text.replace(testOffset, testOffset + SS_SIZE, pattern);
	data = text.toString().toCharArray();
	indexOfTrace(text,pattern,COUNT);

	int offset2 = indexOf(data, patternChars);
	if (offset2 != testOffset) {
	System.out.printf("Index error %d %d%n", testOffset, offset2);
	indexOf(data, patternChars);
	}

	long time = System.currentTimeMillis();
	int old = -1;
	for (int i = 0; i < COUNT; i++) {
	old = text.indexOf(pattern);
	}
	long oldTime = System.currentTimeMillis() - time;
	time = System.currentTimeMillis();
	int index = -1;
	for (int i = 0; i < COUNT; i++) {
	index = indexOf(data, patternChars);
	}
	long newTime = System.currentTimeMillis() - time;

	System.out.printf("offset %d old %d (%d ms) new %d (%dms) seed = %d%n", testOffset, old, oldTime, index, newTime, seed);
	if (old != index) {
	index = indexOf(data, patternChars);
	}
	}

	private static char[] makeRandomChars(Random r, final int size) {
	char[] data = new char[size];
	for (int i = 0; i < data.length; i++) {
	data[i] = (char) (r.nextInt(255 - 32) + 32);
	}
	return data;
	}
	}