-
-
Save TheRockStarDBA/dc0423fbf49322927ed05bf6b32e2d37 to your computer and use it in GitHub Desktop.
Frequency analysis tool
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
import sys | |
import mmap | |
import logging | |
from collections import defaultdict | |
# Root logger setup: only errors are shown by default, timestamps use an
# ISO-8601-like layout, and records go to stderr so they stay separate
# from the frequency table that is printed on stdout.
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%dT%H:%M:%S',
    handlers={logging.StreamHandler(sys.stderr)},
)
def convertToNumber(values, endianness):
    """Interpret a sequence of byte values as one unsigned integer.

    Args:
        values: Bytes-like object (or iterable of ints in 0..255) in the
            order the bytes appear in the input.
        endianness: 'big' when the first byte is the most significant
            one; any other value (normally 'little') treats the first
            byte as the least significant one.

    Returns:
        The non-negative integer encoded by ``values``; 0 for empty input.
    """
    # A manual shift-and-add fold over the bytes is inherently big-endian,
    # so reversing the input for 'big' produced swapped results.
    # int.from_bytes implements the standard byte-order semantics directly.
    byteOrder = 'big' if endianness == 'big' else 'little'
    return int.from_bytes(values, byteOrder)
def produceFrequencies(filename, args):
    """Count fixed-size n-grams in a file and print one line per distinct value.

    The file is split into consecutive, non-overlapping chunks of
    ``args.size`` bytes; each chunk is converted to an integer with
    convertToNumber and tallied. Every output line is
    ``<zero-padded count> <zero-padded value>``.

    Args:
        filename: Path of the file to analyse; opened in binary mode.
        args: Parsed options. Reads ``size`` (bytes per n-gram),
            ``endianness`` (forwarded to convertToNumber), ``sort_values``
            (sort output by descending frequency) and ``display_type``
            ('hex' or 'decimal'; anything else falls back to hex).

    Raises:
        ValueError: If ``args.size`` is smaller than 1.
    """
    ngramSize = args.size
    if ngramSize < 1:
        raise ValueError(
            'n-gram size must be at least 1, got {!r}'.format(ngramSize))

    frequencies = defaultdict(int)
    with open(filename, 'rb') as rawFile:
        # seek(0, 2) jumps to the end of the file and returns its size.
        fileSize = rawFile.seek(0, 2)
        if fileSize == 0:
            # mmap refuses to map an empty file, and there is nothing to count.
            logging.warning('Skipping empty file %s', filename)
            return

        # Map the file instead of reading it so that giant inputs are paged
        # in on demand; both handles are released when the blocks exit.
        with mmap.mmap(rawFile.fileno(), 0, access=mmap.ACCESS_READ) as mapped:
            # Integer arithmetic only: float division loses precision once
            # the file size no longer fits in a double's mantissa.
            numValues, remainder = divmod(fileSize, ngramSize)
            if remainder != 0:
                logging.warning('Ignoring %d bytes at the end', remainder)
            for offset in range(0, numValues * ngramSize, ngramSize):
                chunk = mapped[offset:offset + ngramSize]
                frequencies[convertToNumber(chunk, args.endianness)] += 1

    values = frequencies.items()
    if args.sort_values:
        values = sorted(values, key=lambda item: item[1], reverse=True)

    if args.display_type == 'hex':
        outputType = 'x'
    elif args.display_type == 'decimal':
        outputType = 'd'
    else:
        logging.error("Invalid output type, defaulting to hex")
        outputType = 'x'  # actually apply the fallback the message promises

    # Counts are padded to the digit count of the file size (no count can
    # exceed it); values are padded to two digits per n-gram byte.
    countWidth = len(str(fileSize))
    valueWidth = ngramSize * 2
    outputFormatter = '{{:0{:d}d}} {{:0{:d}{}}}'.format(
        countWidth, valueWidth, outputType)
    for value, freq in values:
        print(outputFormatter.format(freq, value))
def _positiveInt(text):
    """argparse ``type`` callable: parse ``text`` as an int and reject values below 1."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(
            'invalid int value: {!r}'.format(text)) from None
    if value < 1:
        raise argparse.ArgumentTypeError(
            'must be at least 1, got {:d}'.format(value))
    return value


def parseArguments():
    """Parse the command line and apply the requested log level.

    Options: one or more input files, ``-S`` to sort by frequency, ``-d``
    for the display type, ``-s`` for the n-gram size in bytes, ``-e`` for
    the byte order and ``-v LEVEL`` where LEVEL is the name of a
    ``logging`` level (case-insensitive, e.g. ``debug`` or ``warning``).

    Returns:
        The ``argparse.Namespace`` with attributes ``files``,
        ``sort_values``, ``display_type``, ``size``, ``endianness`` and
        ``verbose``.

    Raises:
        SystemExit: Raised by argparse on invalid arguments, including a
            ``--size`` below 1.
    """
    parser = argparse.ArgumentParser(description="Arguments for script")
    parser.add_argument('files', nargs='+')
    parser.add_argument('-S', '--sort_values', action='store_true', default=False, help='If specified, sort the output by frequency')
    parser.add_argument('-d', '--display_type', action='store', default='hex', choices=['hex', 'decimal'], help='Specify the output format (hex or decimal, NO OCTAL FOR YOU')
    # A size of 0 would divide by zero later and a negative size silently
    # yields no output, so both are rejected while parsing.
    parser.add_argument('-s', '--size', action='store', type=_positiveInt, default=1, help='Specifies the number of bytes to compute frequencies for')
    parser.add_argument('-e', '--endianness', action='store', default='little', choices=['little', 'big'], help='Specify the endianness to compute multi-byte values (default is little endian)')
    parser.add_argument('-v', '--verbose', action='store', default=None, help='If specified, output verbose input')
    args = parser.parse_args()

    if args.verbose is not None:
        # Resolve names such as "debug" to logging.DEBUG; the isinstance
        # check filters out non-level attributes of the logging module.
        newLevel = getattr(logging, args.verbose.upper(), None)
        if isinstance(newLevel, int):
            logging.getLogger().setLevel(newLevel)
        else:
            logging.error('Unknown log level %r, keeping the current level', args.verbose)
    return args
def main():
    """Parse the command line, then print the frequency table of each file in turn."""
    options = parseArguments()
    for path in options.files:
        produceFrequencies(path, options)
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment