-
-
Save klynch/1108981 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.*;
import java.util.Arrays;
/**
 * Calculates the amount of byte entropy across a sliding window of size N.
 * "Entropy" here is the number of distinct byte values present in the window,
 * so the maximum for any window is N, unless N &gt; 256, in which case it is
 * capped at 256 (the number of distinct byte values).
 *
 * This program is based on Charlie Daly's Entropy.java program which does
 * more or less the same thing, but this program has lower memory requirements
 * allowing much larger files to be analysed. It also offers a number of
 * additional options which may, or may not, be useful.
 *
 * Have fun! ;-P, See http://bit.ly/entropy
 *
 * @author Tariq Adel Ebrahim
 * @date 20081215
 */
class ByteEntropy {
    private final static double MB = 1048576.0;
    /* Buffer size for input */
    private final static int BUFIN = 5242880;
    /* Buffer size for output */
    private final static int BUFOUT = 1048576;

    /* Occurrence count of every byte value inside the current window. */
    private int[] counters = null;
    /* Ring buffer holding the bytes of the current window. */
    private int[] window = null;
    private InputStream src = null;
    private OutputStream dest = null;
    /* Bytes consumed since the last block reset (block mode bookkeeping). */
    private long blockBytes = 0;

    boolean complete = false;
    boolean efficient = false;
    /* Total number of bytes read from the input stream. */
    long byteCount;
    int windowSize = 0;
    /* Number of distinct byte values in the current window. */
    int entropy = 0;
    int blockSize = 0;
    long resolution = 0;
    long resolutionOutputCounter = 0;
    long resolutionEntropySum = 0;
    String sep = null;

    /**
     * Creates an analyser.
     *
     * @param in     stream to read the bytes to analyse from
     * @param out    stream the entropy values are written to
     * @param ws     window size in bytes, must be at least 1
     * @param ef     if true, values are written as single raw bytes instead
     *               of decimal strings followed by the separator
     * @param strsep separator appended after every value in string mode
     * @throws IllegalArgumentException if {@code ws} is smaller than 1
     */
    public ByteEntropy(InputStream in, OutputStream out, int ws, boolean ef, String strsep) {
        if (ws < 1) {
            // A zero window would later divide by zero, a negative one cannot be allocated.
            throw new IllegalArgumentException("Window size must be at least 1 but was " + ws);
        }
        src = in;
        dest = out;
        byteCount = 0;
        // Default behaviour: output every single value.
        resolution = 1;
        resolutionOutputCounter = resolution;
        counters = new int[256];
        windowSize = ws;
        window = new int[ws];
        efficient = ef;
        sep = strsep;
    }

    /**
     * Fills the window from index 0 with up to {@code windowSize} bytes and
     * updates the counters, the entropy and the byte counts accordingly.
     * Stops early at end of stream, leaving the window partially filled.
     *
     * @throws IOException if reading from the input stream fails
     */
    protected void readInitWindow() throws IOException {
        for (int i = 0; i < windowSize; i++) {
            int b = src.read();
            if (b == -1) {
                // End of stream: no point in asking again for the remaining slots.
                break;
            }
            window[i] = b;
            if (counters[b] == 0) {
                entropy++;
            }
            counters[b]++;
            byteCount++;
            blockBytes++;
        }
    }

    /**
     * Reads from the input stream, calculates the entropy of each window
     * position and writes the results to the output stream.
     *
     * If the input is shorter than one window, the entropy of the bytes that
     * were available is written once. In block mode the counters are reset
     * every {@code blockSize} bytes and analysis stops when the remaining
     * input cannot fill another window.
     *
     * @throws IOException if reading the input or writing the output fails
     */
    public void analyse() throws IOException {
        // Index of the oldest byte in the ring buffer.
        int i = 0;

        readInitWindow();
        if (blockBytes < windowSize) {
            // Input shorter than the window: report what we saw and stop.
            writeEntropy(entropy);
            complete = true;
            return;
        }

        while (true) {
            // The window is full here, so its entropy is always worth reporting.
            writeEntropy(entropy);

            if (blockSize > 0 && blockBytes >= blockSize) {
                // Block finished: start from scratch with the next block.
                resetCounters();
                readInitWindow();
                if (blockBytes < windowSize) {
                    // Not enough input left to analyse another block.
                    break;
                }
                // The refill wrote the window from index 0, so that is the oldest byte.
                i = 0;
                continue;
            }

            int b = src.read();
            if (b == -1) {
                break;
            }
            byteCount++;
            blockBytes++;

            // Drop the oldest byte; if it was the last of its kind entropy falls.
            if (--counters[window[i]] == 0) {
                entropy--;
            }
            // Put the new byte in its place; a previously unseen value raises entropy.
            window[i] = b;
            if (counters[b]++ == 0) {
                entropy++;
            }
            // Advance the ring buffer index.
            i = (i + 1) % windowSize;
        }

        // Now we are done and no errors in sight!
        complete = true;
    }

    /**
     * Writes one value to the output stream, either as a decimal string
     * followed by the separator or, in efficient mode, as one raw byte.
     *
     * NOTE: efficient mode writes only the low 8 bits, so an entropy of 256
     * (all byte values present) is emitted as 0.
     *
     * @param val value to write
     * @throws IOException if writing fails
     */
    protected void write(int val) throws IOException {
        if (!efficient) {
            dest.write((val + sep).getBytes());
        } else {
            dest.write((byte) val);
        }
    }

    /**
     * Reports one entropy value. With a resolution of 1 the value is written
     * directly; otherwise values are summed and their average (rounded down)
     * is written once every {@code resolution} calls.
     *
     * @param entropy value to report
     * @throws IOException if writing fails
     */
    protected void writeEntropy(int entropy) throws IOException {
        // Let's not be silly: nothing to average, write it exactly once.
        if (resolution == 1) {
            write(entropy);
            return;
        }
        resolutionEntropySum += entropy;
        resolutionOutputCounter--;
        // If we hit 0 lets output the average entropy value.
        if (resolutionOutputCounter <= 0) {
            int resolutionAverage = (int) (resolutionEntropySum / resolution);
            write(resolutionAverage);
            resetResolutionCounters();
        }
    }

    /** Restarts the averaging cycle used by {@link #writeEntropy(int)}. */
    protected void resetResolutionCounters() {
        resolutionOutputCounter = resolution;
        resolutionEntropySum = 0;
    }

    /**
     * Clears the per-block state (block byte count, entropy, counters and
     * window). The total byte count is kept.
     */
    protected void resetCounters() {
        blockBytes = 0;
        entropy = 0;
        Arrays.fill(counters, 0);
        Arrays.fill(window, 0);
    }

    /**
     * Returns a boolean flag indicating analysis status.
     * @return complete Current status.
     */
    public boolean isComplete() { return complete; }

    /**
     * Number of bytes read from input stream.
     * @return byteCount Number of bytes read from input
     * stream.
     */
    public long getByteCount() { return byteCount; }

    /**
     * Enables block mode.
     * @param bs block size in bytes; values of 0 or less disable block mode
     */
    public void setBlockSize(int bs) { blockSize = bs; }

    /**
     * Sets how many consecutive entropy values are averaged into one output.
     * @param res number of values per output, must be at least 1
     * @throws IllegalArgumentException if {@code res} is smaller than 1
     */
    public void setResolution(long res) {
        if (res < 1) {
            throw new IllegalArgumentException("Resolution must be at least 1 but was " + res);
        }
        resolution = res;
        resolutionOutputCounter = res;
    }

    // Argument hashes: String.hashCode() values of the accepted command line
    // options. The small values are the short options ("-b" is 1493 and so
    // on); the large ones are presumably the matching long options.
    public final static int BLOCKMODE = 1493;
    public final static int BLOCKMODE2 = 1361591824;
    public final static int CREATEGRAPH = 1494;
    public final static int CREATEGRAPH2 = 709268530;
    public final static int DELETEENTROPY = -676014097;
    public final static int DELETEENTROPY2 = 1495;
    public final static int EFFICIENT = 1496;
    public final static int EFFICIENT2 = -866071935;
    public final static int FILE = 1497;
    public final static int FILE2 = 1333013276;
    public final static int GNUPLOTPATH = 1498;
    public final static int GNUPLOTPATH2 = 1908478292;
    public final static int HELP = 1499;
    public final static int HELP2 = 1333069025;
    public final static int OUTPUT = 1506;
    public final static int OUTPUT2 = 1394501281;
    public final static int RESOLUTION = 1509;
    public final static int RESOLUTION2 = 655291980;
    public final static int SEPERATOR = 1510;
    public final static int SEPERATOR2 = -471659002;
    public final static int SUMMARY = 1478;
    public final static int SUMMARY2 = -1959226431;
    public final static int WINDOW = 1514;
    public final static int WINDOW2 = 1612261776;

    /**
     * Prints help to STDERR and exits with status 1.
     */
    private static void printHelp() {
        printHelp(1, System.err);
    }

    /**
     * Prints help to STDOUT and exits with status 2.
     */
    private static void printHelpOption() {
        printHelp(2, System.out);
    }

    /**
     * Prints help and exits if required.
     * @param exit If greater than 0 this function will
     *             call System.exit(exit).
     * @param err  Stream the help text is printed to.
     */
    private static void printHelp(int exit, PrintStream err) {
        err.println("Usage: ByteEntropy <OPTIONS>\n");
        err.println("Where possible options are:");
        err.println("\t-b <bs>| --blockmode <bs>");
        err.println("\t\tEntropy values are reset to zero every <bs> number of bytes.");
        err.println("\t\tThis allows to measure the entropy in sections of much ");
        err.println("\t\tlarger files; e.g. measuring entropy in each cluster of FAT.");
        err.println();
        err.println("\t-c | --creategraph ");
        err.println("\t\tCreate a graph using the entropy file. You must specify an ");
        err.println("\t\toutput file to use this option as graphs will be saved at");
        err.println("\t\t<outputfile>.png. You cannot create graphs in efficient mode.");
        err.println();
        err.println("\t-d | --delete-entropy");
        err.println("\t\tOnly useful when used with graph mode. Nice to use this to");
        err.println("\t\texpress large entropy data in a much smaller file.");
        err.println();
        err.println("\t-e | --efficient");
        err.println("\t\tEntropy values are output as bytes or ints instead of strings.");
        err.println("\t\tThis saves time and space. Note that seperator options are ");
        err.println("\t\tignored.");
        err.println();
        err.println("\t-f if| --file if");
        err.println("\t\tFile <if> to analyse.");
        err.println();
        err.println("\t-g <gp>| --gnuplotpath <gp>");
        err.println("\t\tPath to gnuplot on your system. Default is \"gnuplot\" so ");
        err.println("\t\tneeds to be in your current path. ");
        err.println();
        err.println("\t-h | --help");
        err.println("\t\tThe help you're looking at now ;-D.");
        err.println();
        err.println("\t-o of| --output of");
        err.println("\t\tFile <of> to send output to.");
        err.println();
        err.println("\t-r <re> | --resolution <re>");
        err.println("\t\tIf the -f or --file option is used then you can set the ");
        err.println("\t\tresultion <re> of the results. E.g. file to be analysed is ");
        err.println("\t\t1000 bytes long; if we use -r 10 then every 10 entropy");
        err.println("\t\tvalues are averaged (nearest smaller int) and output. If used in");
        err.println("\t\tconjunction with -b or --blockmode then this resolution applies");
        err.println("\t\tacross the block size -- it follows that the resolution should ");
        err.println("\t\tbe smaller than the block size in this case. ");
        err.println("\t\tSetting reasonable values can help speed things up a lot and");
        err.println("\t\tmake your entropy files a reasonable size; that's win-win!");
        err.println();
        err.println("\t-s | --seperator");
        err.println("\t\tString to appear between entropy values.");
        err.println();
        err.println("\t-S | --summary");
        err.println("\t\tOutput summary to STDERR.");
        err.println();
        err.println("\t-w ws | --window ws");
        err.println("\t\tWindow to consider entropy over. Should be followed by a ");
        err.println("\t\tpositive integer <ws>. WARNING: can't be higher than number");
        err.println("\t\tof input bytes.");
        err.println();
        if (exit > 0) {
            System.exit(exit);
        }
    }

    /**
     * Returns the value following an option, printing help and exiting when
     * the option is the last argument.
     */
    private static String optionValue(String[] args, int index) {
        if (index >= args.length) {
            System.err.println("Missing value for option " + args[index - 1]);
            printHelp(); // Help auto exits
        }
        return args[index];
    }

    /**
     * Returns the integer value following an option, printing help and
     * exiting when it is missing or not a number.
     */
    private static int intOptionValue(String[] args, int index) {
        String raw = optionValue(args, index);
        try {
            return Integer.parseInt(raw);
        } catch (NumberFormatException e) {
            System.err.println("Expected an integer after " + args[index - 1]
                + " but got '" + raw + "'");
            printHelp(); // Help auto exits
            throw e;
        }
    }

    /** Closes a stream if it was opened, reporting (not propagating) failures. */
    private static void closeQuietly(Closeable stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            System.err.println("WARNING: could not close stream: " + e.getMessage());
        }
    }

    /**
     * Writes a temporary gnuplot script next to the entropy file and runs
     * gnuplot on it to produce OUTPUT.png. Failures are reported on STDERR.
     */
    private static void plotGraph(File outFile, int windowSize, int resol, String gnuplotPath) {
        String resolutionInfo = "";
        if (resol > 1) {
            resolutionInfo = "Entropy is averaged over "+resol+"bytes";
        }
        String plotString = "set terminal png;\n"
            +"set output '"+outFile.getPath()+".png';\n"
            +"set title 'Entropy of "+outFile.getName()+"; "+resolutionInfo+"';\n"
            +"set grid;\n"
            +"set ylabel 'Entropy in "+windowSize+" previous bytes';\n"
            +"plot('"+outFile.getPath()+"') with lines;";
        File plotFile = new File(outFile.getParentFile(), outFile.getName()+".plot");
        try {
            OutputStream plotOut = null;
            try {
                plotOut = new BufferedOutputStream(new FileOutputStream(plotFile));
                plotOut.write(plotString.getBytes());
                plotOut.flush();
            } finally {
                closeQuietly(plotOut);
            }
            // Argument array instead of one command string so that paths
            // containing spaces are passed through intact.
            Process p = Runtime.getRuntime().exec(
                new String[] { gnuplotPath, plotFile.getPath() });
            p.waitFor();
        } catch (InterruptedException e) {
            // Preserve the interrupt for whoever is running us.
            Thread.currentThread().interrupt();
            System.err.println("Interrupted while waiting for gnuplot");
        } catch (IOException e) {
            System.err.println("Could not create graph: " + e.getMessage());
        } finally {
            // Don't need it anymore!
            plotFile.delete();
        }
    }

    /** Prints the bytes read, the time taken and the throughput as CSV lines. */
    private static void printSummary(PrintStream sum, long read, long took) {
        // Handle short input.
        if (took == 0) took++;
        if (read == 0) read++;
        double rate = (read / MB) / (took / 1000.0);
        // Precision to 3 decimal places (floating point division on purpose).
        rate = Math.round(rate * 1000) / 1000.0;
        sum.println("Read,"+read+",bytes");
        sum.println("Took,"+took+",ms");
        sum.println("Rate,"+rate+",MB/s");
    }

    /**
     * The main program, run at your own risk! Parses the command line, runs
     * the analysis and optionally creates a graph and prints a summary.
     */
    public static void main(String[] args) {
        InputStream in = null;
        OutputStream out = null;
        ByteEntropy be = null;
        PrintStream sum = null;
        File inFile = null, outFile = null;
        int windowSize = 200;
        int blockSize = 0;
        int resol = 0;
        String seperator = ",\n";
        boolean efficient = false;
        boolean createGraph = false;
        boolean deleteEntropy = false;
        String gnuplotPath = "gnuplot";

        // Handle arguments; with none given everything stays at its default.
        for (int i = 0, n = args.length; i < n; i++) {
            switch (args[i].hashCode()) {
                case BLOCKMODE:
                case BLOCKMODE2:
                    blockSize = intOptionValue(args, ++i); break;
                case CREATEGRAPH:
                case CREATEGRAPH2:
                    createGraph = true; break;
                case DELETEENTROPY:
                case DELETEENTROPY2:
                    deleteEntropy = true; break;
                case GNUPLOTPATH:
                case GNUPLOTPATH2:
                    gnuplotPath = optionValue(args, ++i); break;
                case HELP:
                case HELP2:
                    printHelpOption(); break;
                case FILE:
                case FILE2:
                    inFile = new File(optionValue(args, ++i)); break;
                case OUTPUT:
                case OUTPUT2:
                    outFile = new File(optionValue(args, ++i)); break;
                case RESOLUTION:
                case RESOLUTION2:
                    resol = intOptionValue(args, ++i); break;
                case SEPERATOR:
                case SEPERATOR2:
                    // Allow line returns
                    seperator = optionValue(args, ++i).replace("\\n", "\n");
                    break;
                case WINDOW:
                case WINDOW2:
                    windowSize = intOptionValue(args, ++i); break;
                case SUMMARY:
                case SUMMARY2:
                    sum = System.err; break;
                case EFFICIENT:
                case EFFICIENT2:
                    efficient = true; break;
                default:
                    // Hmmm, incorrect input!
                    printHelp(); break; // Help auto exits
            }
        }

        if (windowSize < 1) {
            System.err.println("Window size must be a positive integer.");
            printHelp();
        }
        // Create graph needs an output file set as graph is saved as OUTPUTFILE.png
        if (createGraph && (outFile == null || efficient)) {
            System.err.println("Create graph needs an output file set as graph is saved as OUTPUT.png."
                +" Creategraph may not be used with efficient modes.");
            printHelp();
        }

        long timer = System.currentTimeMillis();
        try {
            // No input set, so default to STDIN
            if (inFile == null) {
                in = new BufferedInputStream(System.in, BUFIN);
            } else {
                in = new BufferedInputStream(new FileInputStream(inFile), BUFIN);
            }
            // Was an output file set?
            if (outFile == null) {
                out = new BufferedOutputStream(System.out, BUFOUT);
            } else {
                out = new BufferedOutputStream(new FileOutputStream(outFile), BUFOUT);
            }
            be = new ByteEntropy(in, out, windowSize, efficient, seperator);
            if (blockSize > 0) {
                be.setBlockSize(blockSize);
            }
            if (resol > 0) {
                be.setResolution(resol);
            }
            be.analyse();
            out.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the streams on every path, including failures.
            closeQuietly(in);
            closeQuietly(out);
        }

        // be stays null when the streams could not even be opened.
        if (be == null || !be.isComplete()) {
            System.err.println("WARNING: Analysis was not complete!");
        } else if (createGraph) {
            plotGraph(outFile, windowSize, resol, gnuplotPath);
        }
        if (deleteEntropy && outFile != null) {
            outFile.delete();
        }
        // Check if summary stream init'd
        if (sum != null) {
            long read = (be == null) ? 0 : be.getByteCount();
            printSummary(sum, read, System.currentTimeMillis() - timer);
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment