Created April 22, 2014 14:08
Simple, poorly factored proof of concept for a de-duping data store algorithm.
package com.expantra;

import com.expantra.buzhash.BuzHash;

import javax.xml.bind.DatatypeConverter;
import java.io.*;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

import static java.lang.System.out;

/**
 * Created by josh on 16/04/2014.
 * <p/>
 * Crap, poorly factored code. PoC === PoS. Seriously. Throws IOExceptions it's so weak. Will RTE if you look at
 * it funny. Also slow. And did I mention crap? Do not taunt Happy Fun Ball.
 */
public class SaveFile {

    public static void main(String[] args) throws IOException, NoSuchAlgorithmException {
        if (args.length != 2) {
            out.println("Usage: SaveFile inputFile outputDir");
            return;
        }

        final int hashWindow = 67;       // rolling-hash window, in bytes
        final int maxChunkSize = 0x8000; // 32k hard cap on chunk size
        final int readBufferLen = 2048;
        byte[] readBuffer = new byte[readBufferLen];
        byte[] chunkBuffer = new byte[maxChunkSize];

        // Input
        File inputFile = new File(args[0]);
        BufferedInputStream inputStream = new BufferedInputStream(new FileInputStream(inputFile));

        // Output
        File outputDir = new File(args[1]);
        if (!outputDir.isDirectory())
            throw new RuntimeException("Output dir \"" + outputDir.getCanonicalPath() + "\" not a directory");
        File indexOutFile = new File(outputDir.getCanonicalPath() + File.separator + inputFile.getName());
        if (indexOutFile.exists())
            throw new RuntimeException("Temp index filename \"" + indexOutFile.getCanonicalPath() + "\" exists :(");
        FileOutputStream indexOut = new FileOutputStream(indexOutFile);
        File chunkDir = new File(outputDir.getCanonicalPath() + File.separator + "chunks");
        if (!chunkDir.exists())
            chunkDir.mkdir();
        if (!chunkDir.isDirectory())
            throw new RuntimeException("Chunk dir \"" + chunkDir.getCanonicalPath() + "\" not a directory");

        // Setup
        BuzHash inputHasher = new BuzHash(hashWindow);
        MessageDigest fileDigest = MessageDigest.getInstance("SHA-1");
        MessageDigest chunkDigest = MessageDigest.getInstance("SHA-1");
        int lastHash = -1;
        int chunkLength = 0;
        int chunkCount = 0;
        long chunkBytesWritten = 0;
        long totalBytes = 0;
        // Read until EOF (read() returning -1), rather than polling available(),
        // which may legally return 0 before the end of the stream.
        int len;
        while ((len = inputStream.read(readBuffer)) != -1) {
            totalBytes += len;
            fileDigest.update(readBuffer, 0, len);
            for (int i = 0; i < len; i++) {
                byte b = readBuffer[i];
                int rollingHash = inputHasher.addByte(b);
                chunkDigest.update(b);
                chunkBuffer[chunkLength] = b;
                chunkLength++;
                // Cut a chunk at a content-defined boundary (low 16 bits of the
                // rolling hash are zero) or at the hard size cap. The lastHash
                // check avoids back-to-back cuts while the hash value repeats.
                if (chunkLength == maxChunkSize || (lastHash != rollingHash && (rollingHash & 0xffff) == 0)) {
                    out.print(".");
                    // Write chunk
                    byte[] chunkHash = chunkDigest.digest();
                    chunkBytesWritten += writeChunk(chunkBuffer, chunkLength, chunkHash, chunkDir);
                    // Write chunk id to index
                    indexOut.write(chunkHash);
                    // Reset chunk, update count
                    chunkDigest.reset();
                    chunkLength = 0;
                    chunkCount++;
                }
                lastHash = rollingHash;
            }
        }
        inputStream.close();
        // Final chunk; its bytes were already fed to chunkDigest in the loop,
        // so digest it directly (updating again would hash them twice).
        if (chunkLength > 0) {
            out.print(".");
            // Write chunk
            byte[] chunkHash = chunkDigest.digest();
            chunkBytesWritten += writeChunk(chunkBuffer, chunkLength, chunkHash, chunkDir);
            // Write chunk id to index
            indexOut.write(chunkHash);
            chunkCount++;
        }
        indexOut.close();
        byte[] fileHash = fileDigest.digest();

        // Rename indexOutFile based on whole-file hash
        String finalIndexFileName = indexOutFile.getCanonicalPath() + "-" + DatatypeConverter.printHexBinary(fileHash);
        indexOutFile.renameTo(new File(finalIndexFileName));

        out.println();
        out.println();
        out.println("original file : " + inputFile.getCanonicalPath());
        out.println("   index file : " + finalIndexFileName);
        out.println();

        long avgSize = chunkCount > 0 ? totalBytes / chunkCount : 0; // guard empty input
        long chunkIndexSize = chunkCount * 20L; // SHA-1 digests are 160 bits = 20 bytes each
        long totalStored = chunkBytesWritten + chunkIndexSize;
        double storedPercent = ((double) totalStored / (double) totalBytes) * 100;
        out.println();
        out.println(" number of chunks : " + String.format("%,d", chunkCount));
        out.println("        file size : " + String.format("%,d", totalBytes) + " bytes");
        out.println("unique chunk data : " + String.format("%,d", chunkBytesWritten) + " bytes");
        out.println("   avg chunk size : " + String.format("%,d", avgSize) + " bytes");
        out.println("   index overhead : " + String.format("%,d", chunkIndexSize) + " bytes");
        out.println("   unique + index : " + String.format("%,d", totalStored) + " bytes, " + String.format("%,.3g", storedPercent) + "%");
    }
    private static int writeChunk(byte[] chunkBuffer, int chunkLength, byte[] chunkHash, File chunkDir) throws IOException {
        String hashString = DatatypeConverter.printHexBinary(chunkHash);
        File chunkFile = new File(chunkDir.getCanonicalPath() + File.separator + hashString);
        // De-dup: a chunk file named for this hash already holds identical content
        if (chunkFile.exists())
            return 0;
        FileOutputStream chunkOut = new FileOutputStream(chunkFile);
        chunkOut.write(chunkBuffer, 0, chunkLength);
        chunkOut.close();
        return chunkLength;
    }
}
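For completeness, here is a minimal restore sketch, not part of the original gist: it treats an index file produced above as a flat sequence of 20-byte SHA-1 chunk ids and concatenates the matching files from the chunks directory. The class name RestoreFile and its command-line arguments are hypothetical; only the on-disk layout (index file of raw digests, chunk files named by the hex form of their hash) comes from SaveFile above.

package com.expantra;

import javax.xml.bind.DatatypeConverter;
import java.io.*;

/**
 * Hypothetical companion to SaveFile: rebuilds a file from an index
 * (a flat sequence of 20-byte SHA-1 chunk ids) and the chunks/ directory
 * that SaveFile populated.
 */
public class RestoreFile {
    public static void main(String[] args) throws IOException {
        if (args.length != 3) {
            System.out.println("Usage: RestoreFile indexFile chunkDir outputFile");
            return;
        }
        File chunkDir = new File(args[1]);
        DataInputStream index = new DataInputStream(new BufferedInputStream(new FileInputStream(args[0])));
        FileOutputStream outFile = new FileOutputStream(args[2]);
        byte[] chunkHash = new byte[20]; // SHA-1 digest length
        byte[] copyBuffer = new byte[0x8000];
        try {
            while (true) {
                try {
                    index.readFully(chunkHash); // one chunk id per index entry
                } catch (EOFException eof) {
                    break; // end of index
                }
                // Chunk files are named by the hex form of their SHA-1 hash
                File chunkFile = new File(chunkDir, DatatypeConverter.printHexBinary(chunkHash));
                FileInputStream chunkIn = new FileInputStream(chunkFile);
                int n;
                while ((n = chunkIn.read(copyBuffer)) != -1)
                    outFile.write(copyBuffer, 0, n);
                chunkIn.close();
            }
        } finally {
            index.close();
            outFile.close();
        }
    }
}

Assuming a run like "java com.expantra.SaveFile bigfile.bin store" produced store/bigfile.bin-&lt;SHA1&gt; and store/chunks, the restore would be "java com.expantra.RestoreFile store/bigfile.bin-&lt;SHA1&gt; store/chunks bigfile.restored".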