Created
October 24, 2021 03:50
-
-
Save pawitp/9881e6951eac40babd07a7d05ac74f56 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.File; | |
import java.io.FileInputStream; | |
import java.io.IOException; | |
import java.security.MessageDigest; | |
import java.security.NoSuchAlgorithmException; | |
import java.util.Random; | |
public class Buzhash32Chunking { | |
// If we use a multiple of 32, we get a lot of cancelling out | |
private static final int BLOCK_SIZE = 63; | |
private static final int BH_ROTATE = BLOCK_SIZE % 32; | |
private static final int BH_ROTATE_COMP = 32 - BH_ROTATE; | |
private static final int[] HASHES = new int[256]; | |
private static int currentHash; | |
public static void main(String[] args) throws Exception { | |
init(); | |
// Read the entire file to memory for simplicity | |
byte[] input = readInput("ubuntu-20.04.3-live-server-amd64.iso"); | |
long startTime = System.nanoTime(); | |
// Start at the position where we have enough bytes for BLOCK_SIZE | |
int currentPos = BLOCK_SIZE; | |
int currentStartPos = 0; | |
resetHash(input, currentPos); | |
while (currentPos < input.length) { | |
roll(input[currentPos - BLOCK_SIZE], input[currentPos]); | |
if (shouldSplit()) { | |
System.out.println(currentPos + "," + (currentPos - currentStartPos + 1)); | |
currentStartPos = currentPos + 1; | |
currentPos += BLOCK_SIZE + 1; | |
resetHash(input, currentPos); | |
} else { | |
currentPos++; | |
} | |
} | |
// Last chunk | |
currentPos = input.length - 1; | |
System.out.println(currentPos + "," + (currentPos - currentStartPos + 1)); | |
long endTime = System.nanoTime(); | |
System.out.println("Finished in " + (endTime - startTime) / 1000000 + " ms"); | |
} | |
private static byte[] readInput(String filename) throws IOException { | |
File file = new File(filename); | |
try (FileInputStream fis = new FileInputStream(file)) { | |
long len = file.length(); | |
byte[] content = new byte[(int) len]; | |
if (len != fis.read(content)) { | |
throw new IOException("Unexpected bytes read"); | |
} | |
return content; | |
} | |
} | |
private static void init() { | |
Random r = new Random(42); | |
for (int i = 0; i < 256; i++) { | |
HASHES[i] = r.nextInt(); | |
} | |
} | |
private static void resetHash(byte[] b, int pos) { | |
currentHash = 0; | |
for (int i = BLOCK_SIZE; i > 0; i--) { | |
currentHash = (currentHash << 1 | currentHash >>> 31) ^ HASHES[b[pos - i] + 128]; | |
} | |
} | |
private static void roll(byte oldByte, byte newByte) { | |
int oldHash = HASHES[oldByte + 128]; | |
currentHash = (currentHash << 1 | currentHash >>> 31) ^ | |
(oldHash << BH_ROTATE | oldHash >>> BH_ROTATE_COMP) ^ | |
HASHES[newByte + 128]; | |
} | |
private static boolean shouldSplit() { | |
// We want 23 bits to be 0 | |
return (currentHash & 0b11111111111111111111111) == 0; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment