Skip to content

Instantly share code, notes, and snippets.

@klynch
Forked from tariqadel/gist:104935
Created July 27, 2011 09:11
Show Gist options
  • Save klynch/1108981 to your computer and use it in GitHub Desktop.
Save klynch/1108981 to your computer and use it in GitHub Desktop.
import java.io.*;
/**
* Calculates the amount of byte entropy across a window
* of size N. The maximum entropy across any window
* is N, unless N > 256, in which case the maximum is 256:
* the number of distinct byte values that can exist.
*
* This program is based on Charlie Daly's Entropy.java
* program which does more or less the same thing, but
* this program has lower memory requirements allowing
* much larger files to be analysed. It also offers a
* number of additional options which may, or may not, be
* useful.
*
* Have fun! ;-P, See http://bit.ly/entropy
*
* @author Tariq Adel Ebrahim
* @date 20081215
*/
class ByteEntropy {
    private final static double MB = 1048576.0;
    /* Buffer size for input */
    private final static int BUFIN = 5242880;
    /* Buffer size for output */
    private final static int BUFOUT = 1048576;

    /* Occurrence count of each byte value inside the current window. */
    private int [] counters = null;
    /* Circular buffer holding the bytes of the current window. */
    private int [] window = null;
    private InputStream src = null;
    private OutputStream dest = null;
    /* Slot of the circular buffer holding the oldest byte of the window. */
    private int windowPos = 0;
    /* Bytes consumed since the last block reset (block mode only). */
    private long blockByteCount = 0;

    boolean complete = false;
    boolean efficient = false;
    /* Total number of bytes read from the input stream. */
    long byteCount;
    int windowSize = 0;
    /* Number of distinct byte values currently inside the window. */
    int entropy = 0;
    int blockSize = 0;
    long resolution = 0;
    long resolutionOutputCounter = 0;
    long resolutionEntropySum = 0;
    String sep = null;

    /**
     * Creates an analyser that reads bytes from {@code in} and writes one
     * entropy value per window position to {@code out}.
     *
     * @param in     stream to analyse
     * @param out    stream receiving the entropy values
     * @param ws     window size in bytes; must be at least 1
     * @param ef     when true each value is written as a single raw byte,
     *               otherwise as decimal text followed by {@code strsep}
     * @param strsep separator appended after each textual value
     * @throws IllegalArgumentException if {@code ws} is smaller than 1
     */
    public ByteEntropy(InputStream in, OutputStream out, int ws, boolean ef, String strsep) {
        if (ws < 1) {
            // A zero-length window would divide by zero when the window slides.
            throw new IllegalArgumentException("Window size must be at least 1 but was " + ws);
        }
        src = in;
        dest = out;
        byteCount = 0;
        // Default behaviour: output every value.
        resolution = 1;
        resolutionOutputCounter = resolution;
        counters = new int[256];
        windowSize = ws;
        window = new int[ws];
        efficient = ef;
        sep = strsep;
    }

    /**
     * Fills the window from slot 0 with up to {@code windowSize} bytes and
     * updates the counters and the entropy accordingly. Stops early, without
     * raising an error, when the stream ends first.
     *
     * @throws IOException if reading the input stream fails
     */
    protected void readInitWindow() throws IOException {
        for (int slot = 0; slot < windowSize; slot++) {
            int b = src.read();
            if (b == -1) {
                // End of stream: no point in polling it again.
                break;
            }
            window[slot] = b;
            if (counters[b] == 0) {
                entropy++;
            }
            counters[b]++;
            byteCount++;
            blockByteCount++;
        }
    }

    /**
     * Reads from input stream. Calculates entropy in each
     * window and write the result to the output stream.
     *
     * @throws IOException if reading the input or writing the output fails
     */
    public void analyse() throws IOException {
        int b;
        windowPos = 0;
        // Read as many bytes as we can (at most windowSize) into the window.
        // A short or empty stream simply leaves the window partially filled.
        readInitWindow();
        // Read until end of stream
        while ((b = src.read()) != -1) {
            byteCount++;
            blockByteCount++;
            // Output the entropy of the window as it was before this byte.
            writeEntropy(entropy);
            // Evict the oldest byte; if it was the last of its kind entropy drops.
            int evicted = window[windowPos];
            counters[evicted]--;
            if (counters[evicted] == 0) {
                entropy--;
            }
            // Store the new byte in the freed slot; a first sighting raises entropy.
            window[windowPos] = b;
            if (counters[b] == 0) {
                entropy++;
            }
            counters[b]++;
            // Circular buffer keeps memory use at one window.
            windowPos = (windowPos + 1) % windowSize;
            // Start over when a block boundary has been reached.
            if (blockSize > 0 && blockByteCount >= blockSize) {
                // NOTE(review): the entropy of the block's final window is not
                // emitted before the reset -- confirm whether that is intended.
                resetCounters();
                // A short refill is not an error: the loop condition ends the
                // analysis on the next read. Real I/O errors now propagate
                // instead of being reported as a completed analysis.
                readInitWindow();
            }
        }
        // Squeeze out last entropy drops...
        // NOTE(review): with a resolution above 1 a partially accumulated
        // average is never flushed -- confirm whether it should be.
        writeEntropy(entropy);
        // Now we are done and no errors in sight!
        complete = true;
    }

    /**
     * Writes a single value to the output stream in the configured format.
     *
     * @param val value to write
     * @throws IOException if the output stream rejects the write
     */
    protected void write(int val) throws IOException {
        if (efficient) {
            // NOTE: only the low 8 bits are written, so an entropy of 256
            // (every byte value present in the window) is emitted as 0.
            dest.write((byte) val);
        } else {
            dest.write((val + sep).getBytes());
        }
    }

    /**
     * Records one entropy value. With a resolution of 1 the value is written
     * immediately; otherwise values are summed and their integer average is
     * written once {@code resolution} values have been collected.
     *
     * @param entropy entropy of the current window
     * @throws IOException if the output stream rejects the write
     */
    protected void writeEntropy(int entropy) throws IOException {
        // Let's not be silly: nothing to average, so write once and stop here.
        // Falling through used to write the very same value a second time.
        if (resolution == 1) {
            write(entropy);
            return;
        }
        resolutionEntropySum += entropy;
        resolutionOutputCounter--;
        // Batch complete: output the average (truncated towards zero).
        if (resolutionOutputCounter <= 0) {
            int resolutionAverage = (int) (resolutionEntropySum / resolution);
            write(resolutionAverage);
            resetResolutionCounters();
        }
    }

    /** Starts a new averaging batch. */
    protected void resetResolutionCounters() {
        resolutionOutputCounter = resolution;
        resolutionEntropySum = 0;
    }

    /**
     * Clears the window state so that a new block can be analysed. The total
     * byte count reported by {@link #getByteCount()} is left untouched.
     */
    protected void resetCounters() {
        blockByteCount = 0;
        entropy = 0;
        // The refilled window starts at slot 0, so the oldest byte is there too.
        windowPos = 0;
        java.util.Arrays.fill(counters, 0);
        java.util.Arrays.fill(window, 0);
    }

    /**
     * Returns a boolean flag indicating analysis status.
     * @return complete Current status.
     */
    public boolean isComplete() { return complete; }

    /**
     * Number of bytes read from input stream.
     * @return byteCount Number of bytes read from input
     * stream.
     */
    public long getByteCount() { return byteCount; }

    /**
     * Enables block mode.
     * @param bs block size in bytes; values of 0 or less disable block mode.
     */
    public void setBlockSize(int bs) { blockSize = bs; }

    /**
     * Sets how many entropy values are averaged into one output value.
     *
     * @param res number of values per average; must be at least 1
     * @throws IllegalArgumentException if {@code res} is smaller than 1
     */
    public void setResolution(long res) {
        if (res < 1) {
            // Zero would divide by zero when averaging; negatives make no sense.
            throw new IllegalArgumentException("Resolution must be at least 1 but was " + res);
        }
        resolution = res;
        resolutionOutputCounter = res;
    }

    // Argument hashes: String.hashCode() values of the command line options.
    // The four-digit values are the short options (e.g. "-b".hashCode() == 1493);
    // the large values are presumably the long options listed in the help text.
    // NOTE: for DELETEENTROPY the short/long naming is the other way around.
    public final static int BLOCKMODE = 1493;
    public final static int BLOCKMODE2 = 1361591824;
    public final static int CREATEGRAPH = 1494;
    public final static int CREATEGRAPH2 = 709268530;
    public final static int DELETEENTROPY = -676014097;
    public final static int DELETEENTROPY2 = 1495;
    public final static int EFFICIENT = 1496;
    public final static int EFFICIENT2 = -866071935;
    public final static int FILE = 1497;
    public final static int FILE2 = 1333013276;
    public final static int GNUPLOTPATH = 1498;
    public final static int GNUPLOTPATH2 = 1908478292;
    public final static int HELP = 1499;
    public final static int HELP2 = 1333069025;
    public final static int OUTPUT = 1506;
    public final static int OUTPUT2 = 1394501281;
    public final static int RESOLUTION = 1509;
    public final static int RESOLUTION2 = 655291980;
    public final static int SEPERATOR = 1510;
    public final static int SEPERATOR2 = -471659002;
    public final static int SUMMARY = 1478;
    public final static int SUMMARY2 = -1959226431;
    public final static int WINDOW = 1514;
    public final static int WINDOW2 = 1612261776;

    /**
     * Prints help to STDERR and exits with status 1.
     */
    private static void printHelp() {
        printHelp(1, System.err);
    }

    /**
     * Prints help to STDOUT and exits with status 2.
     */
    private static void printHelpOption() {
        printHelp(2, System.out);
    }

    /**
     * Prints help and exits if required.
     * @param exit If greater than 0 this function will
     *             call System.exit(exit).
     * @param err  Stream the help text is printed to.
     */
    private static void printHelp(int exit, PrintStream err) {
        err.println("Usage: ByteEntropy <OPTIONS>\n");
        err.println("Where possible options are:");
        err.println("\t-b <bs>| --blockmode <bs>");
        err.println("\t\tEntropy values are reset to zero every <bs> number of bytes.");
        err.println("\t\tThis allows to measure the entropy in sections of much ");
        err.println("\t\tlarger files; e.g. measuring entropy in each cluster of FAT.");
        err.println();
        err.println("\t-c | --creategraph ");
        err.println("\t\tCreate a graph using the entropy file. You must specify an ");
        err.println("\t\toutput file to use this option as graphs will be saved at");
        err.println("\t\t<outputfile>.png. You cannot create graphs in efficient mode.");
        err.println();
        err.println("\t-d | --delete-entropy");
        err.println("\t\tOnly useful when used with graph mode. Nice to use this to");
        err.println("\t\texpress large entropy data in a much smaller file.");
        err.println();
        err.println("\t-e | --efficient");
        err.println("\t\tEntropy values are output as bytes or ints instead of strings.");
        err.println("\t\tThis saves time and space. Note that seperator options are ");
        err.println("\t\tignored.");
        err.println();
        err.println("\t-f if| --file if");
        err.println("\t\tFile <if> to analyse.");
        err.println();
        err.println("\t-g <gp>| --gnuplotpath <gp>");
        err.println("\t\tPath to gnuplot on your system. Default is \"gnuplot\" so ");
        err.println("\t\tneeds to be in your current path. ");
        err.println();
        err.println("\t-h | --help");
        err.println("\t\tThe help you're looking at now ;-D.");
        err.println();
        err.println("\t-o of| --output of");
        err.println("\t\tFile <of> to send output to.");
        err.println();
        err.println("\t-r <re> | --resolution <re>");
        err.println("\t\tIf the -f or --file option is used then you can set the ");
        err.println("\t\tresultion <re> of the results. E.g. file to be analysed is ");
        err.println("\t\t1000 bytes long; if we use -r 10 then every 10 entropy");
        err.println("\t\tvalues are averaged (nearest smaller int) and output. If used in");
        err.println("\t\tconjunction with -b or --blockmode then this resolution applies");
        err.println("\t\tacross the block size -- it follows that the resolution should ");
        err.println("\t\tbe smaller than the block size in this case. ");
        err.println("\t\tSetting reasonable values can help speed things up a lot and");
        err.println("\t\tmake your entropy files a reasonable size; that's win-win!");
        err.println();
        err.println("\t-s | --seperator");
        err.println("\t\tString to appear between entropy values.");
        err.println();
        err.println("\t-S | --summary");
        err.println("\t\tOutput summary to STDERR.");
        err.println();
        err.println("\t-w ws | --window ws");
        err.println("\t\tWindow to consider entropy over. Should be followed by a ");
        err.println("\t\tpositive integer <ws>. WARNING: can't be lower than number");
        err.println("\t\tof input bytes.");
        err.println();
        if (exit > 0) {
            System.exit(exit);
        }
    }

    /**
     * Returns the value following an option, or prints help and exits when
     * the command line ends before the value.
     */
    private static String optionValue(String[] args, int index) {
        if (index >= args.length) {
            System.err.println("Missing value for option " + args[index - 1]);
            printHelp(); // exits
        }
        return args[index];
    }

    /**
     * Returns the integer value following an option, or prints help and
     * exits when the value is missing or not an integer.
     */
    private static int intOptionValue(String[] args, int index) {
        String raw = optionValue(args, index);
        try {
            return Integer.parseInt(raw);
        } catch (NumberFormatException e) {
            System.err.println("Option " + args[index - 1] + " expects an integer but got: " + raw);
            printHelp(); // exits
            return 0;
        }
    }

    /** Closes a stream, reporting (not propagating) a failure to do so. */
    private static void closeStream(Closeable stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Renders {@code <outFile>.png} from the entropy file by running gnuplot
     * on a temporary {@code <outFile>.plot} script, which is always removed.
     */
    private static void createGraph(File outFile, int windowSize, int resol, String gnuplotPath) {
        File plotFile = new File(outFile.getParentFile(), outFile.getName() + ".plot");
        try {
            String resolutionInfo = "";
            if (resol > 1) {
                resolutionInfo = "Entropy is averaged over " + resol + " bytes";
            }
            String plotString = "set terminal png;\n"
                + "set output '" + outFile.getPath() + ".png';\n"
                + "set title 'Entropy of " + outFile.getName() + "; " + resolutionInfo + "';\n"
                + "set grid;\n"
                + "set ylabel 'Entropy in " + windowSize + " previous bytes';\n"
                + "plot('" + outFile.getPath() + "') with lines;";
            OutputStream plotOut = new BufferedOutputStream(new FileOutputStream(plotFile));
            try {
                plotOut.write(plotString.getBytes());
                plotOut.flush();
            } finally {
                plotOut.close();
            }
            // Argument array instead of one command string, so that paths
            // containing spaces are not split into several arguments.
            Process p = Runtime.getRuntime().exec(
                new String[] { gnuplotPath, plotFile.getPath() });
            p.waitFor();
        } catch (IOException e) {
            System.err.println("Could not create graph: " + e.getMessage());
        } catch (InterruptedException e) {
            // Keep the interrupt visible to whoever runs this thread.
            Thread.currentThread().interrupt();
            System.err.println("Interrupted while waiting for gnuplot.");
        } finally {
            // Don't need it anymore, whether or not gnuplot succeeded.
            plotFile.delete();
        }
    }

    /** Writes the read/took/rate summary lines to the given stream. */
    private static void printSummary(PrintStream sum, long read, long took) {
        // Handle short input: avoid dividing by a zero duration.
        if (took == 0) {
            took++;
        }
        double rate = (read / MB) / (took / 1000.0);
        // Precision to 3 decimal places (floating point division on purpose).
        rate = Math.round(rate * 1000) / 1000.0;
        sum.println("Read," + read + ",bytes");
        sum.println("Took," + took + ",ms");
        sum.println("Rate," + rate + ",MB/s");
    }

    /**
     * The main program, run at your own risk!
     *
     * Parses the command line, runs the analysis from the input file (or
     * STDIN) to the output file (or STDOUT), then optionally creates a graph,
     * deletes the entropy file and prints a summary.
     */
    public static void main(String [] args) {
        InputStream in = null;
        OutputStream out = null;
        ByteEntropy be = null;
        PrintStream sum = null;
        File inFile = null, outFile = null;
        int windowSize = 200;
        int blockSize = 0;
        int resol = 0;
        String seperator = ",\n";
        boolean efficient = false;
        boolean createGraph = false;
        boolean deleteEntropy = false;
        String gnuplotPath = "gnuplot";

        // Handle arguments; options are matched by the hash of their text.
        for (int i = 0, n = args.length; i < n; i++) {
            switch (args[i].hashCode()) {
                case BLOCKMODE:
                case BLOCKMODE2:
                    blockSize = intOptionValue(args, ++i); break;
                case CREATEGRAPH:
                case CREATEGRAPH2:
                    createGraph = true; break;
                case DELETEENTROPY:
                case DELETEENTROPY2:
                    deleteEntropy = true; break;
                case GNUPLOTPATH:
                case GNUPLOTPATH2:
                    gnuplotPath = optionValue(args, ++i); break;
                case HELP:
                case HELP2:
                    printHelpOption(); break;
                case FILE:
                case FILE2:
                    inFile = new File(optionValue(args, ++i)); break;
                case OUTPUT:
                case OUTPUT2:
                    outFile = new File(optionValue(args, ++i)); break;
                case RESOLUTION:
                case RESOLUTION2:
                    resol = intOptionValue(args, ++i); break;
                case SEPERATOR:
                case SEPERATOR2:
                    // Allow line returns
                    seperator = optionValue(args, ++i).replace("\\n", "\n");
                    break;
                case WINDOW:
                case WINDOW2:
                    windowSize = intOptionValue(args, ++i); break;
                case SUMMARY:
                case SUMMARY2:
                    sum = System.err; break;
                case EFFICIENT:
                case EFFICIENT2:
                    efficient = true; break;
                default:
                    // Hmmm, incorrect input!
                    System.err.println("Unknown option: " + args[i]);
                    printHelp(); break; // Help auto exits
            }
        }

        if (windowSize < 1) {
            System.err.println("Window size must be a positive integer.");
            printHelp();
        }
        // Create graph needs an output file set as graph is saved as OUTPUTFILE.png
        if (createGraph && (outFile == null || efficient)) {
            System.err.println("Create graph needs an output file set as graph is saved as OUTPUT.png."
                + " Creategraph may not be used with efficient modes.");
            printHelp();
        }

        long timer = System.currentTimeMillis();
        try {
            // No input set, so default to STDIN
            if (inFile == null) {
                in = new BufferedInputStream(System.in, BUFIN);
            } else {
                in = new BufferedInputStream(new FileInputStream(inFile), BUFIN);
            }
            // Was an output file set?
            if (outFile == null) {
                out = new BufferedOutputStream(System.out, BUFOUT);
            } else {
                out = new BufferedOutputStream(new FileOutputStream(outFile), BUFOUT);
            }
            be = new ByteEntropy(in, out, windowSize, efficient, seperator);
            if (blockSize > 0) {
                be.setBlockSize(blockSize);
            }
            if (resol > 0) {
                be.setResolution(resol);
            }
            be.analyse();
            out.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the streams whether or not the analysis succeeded.
            closeStream(in);
            closeStream(out);
        }

        // be is still null when the streams could not even be opened.
        if (be == null || !be.isComplete()) {
            System.err.println("WARNING: Analysis was not complete!");
        } else if (createGraph) {
            createGraph(outFile, windowSize, resol, gnuplotPath);
        }
        // Without an output file there is nothing to delete.
        if (deleteEntropy && outFile != null) {
            outFile.delete();
        }
        // Check if summary stream init'd
        if (sum != null) {
            long took = System.currentTimeMillis() - timer;
            long read = (be == null) ? 0 : be.getByteCount();
            printSummary(sum, read, took);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment