Skip to content

Instantly share code, notes, and snippets.

@IlluminatiFish
Last active June 4, 2021 00:48
Show Gist options
  • Save IlluminatiFish/29c30883d03c42b52b54d9e006ed8439 to your computer and use it in GitHub Desktop.
Save IlluminatiFish/29c30883d03c42b52b54d9e006ed8439 to your computer and use it in GitHub Desktop.
A short Python script that uses Shannon entropy to analyse a file, find any readable text and/or code, and extract it.
#
# This program is a utility used by myself that I have released
# to the public under the GPLv3 license
#
# Copyright (c) 2021 IlluminatiFish.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see http://www.gnu.org/licenses/.
#
import math, sys
from collections import Counter
# Utility functions aggregated into this one gist for readability
def get_entropy(data, unit='natural'):
    """Return the entropy of the symbol distribution found in ``data``.

    Args:
        data: A sized iterable of hashable symbols, e.g. ``bytes`` (symbols
            are byte values) or ``str`` (symbols are characters).
        unit: Name of the logarithm base to use: ``'shannon'`` (base 2),
            ``'natural'`` (base e, the default) or ``'hartley'`` (base 10).

    Returns:
        The entropy as a float. Inputs holding at most one symbol have no
        uncertainty and yield ``0.0``.

    Raises:
        KeyError: If ``unit`` is not a supported name and ``data`` holds
            more than one symbol.
    """
    base = {
        'shannon': 2.,
        'natural': math.exp(1),
        'hartley': 10,
    }

    total = len(data)
    if total <= 1:
        return 0.0

    log_base = base[unit]

    entropy = 0.0
    # Counter only stores symbols that occurred, so every count is >= 1 and
    # each probability is strictly positive: the logarithm is always defined.
    for count in Counter(data).values():
        probability = count / total
        entropy -= probability * math.log(probability, log_base)
    return entropy
def get_data_size(data):
    """Return the size of ``data`` in bytes.

    For bytes-like inputs this is the payload length. ``sys.getsizeof`` is
    not used for them because it reports the interpreter's memory footprint
    of the object (payload plus object header), which over-counts the data
    and skews any block arithmetic based on the result.

    Args:
        data: The object to measure; normally ``bytes`` read from a file.

    Returns:
        The number of payload bytes for ``bytes``, ``bytearray`` and
        ``memoryview`` inputs. Any other object falls back to
        ``sys.getsizeof(data)``.
    """
    if isinstance(data, (bytes, bytearray)):
        return len(data)
    if isinstance(data, memoryview):
        # len() of a memoryview counts items, not bytes; nbytes is the payload.
        return data.nbytes
    return sys.getsizeof(data)
def get_lowest_entropy_blocks(entropy_data_table):
    """Return the table's keys ordered from lowest to highest entropy.

    Args:
        entropy_data_table: Mapping of block identifier -> entropy value.

    Returns:
        A new list containing every key of the mapping, sorted ascending by
        its associated value. The sort is stable, so keys with equal values
        keep the mapping's iteration order.
    """
    return sorted(entropy_data_table, key=entropy_data_table.get)
# Script body: split the input file into fixed-size blocks, score every block
# by its Shannon entropy and print the low-entropy (likely readable) blocks.
entropy_data_table = {}  # 1-based block number -> entropy of that block
split_data = []          # raw bytes of each block, indexed 0-based

# Read the whole sample up front; the context manager closes the handle
# instead of leaving it to the garbage collector.
with open('susfile.txt', 'rb') as sample_file:
    data = sample_file.read()

print()

# Entropy is measured on the raw bytes (the original author notes that
# calling data.decode() first gives a lower entropy).
print('Total Entropy:', get_entropy(data, 'shannon'))

data_size = get_data_size(data)
print('Size:', data_size)

block_size = 183 # Size of each block in bytes

# Count whole blocks from the payload length. A trailing partial block is
# skipped: a shorter block has a lower maximum possible entropy, so comparing
# it against the same threshold would not be meaningful.
blocks = len(data) // block_size
print('Blocks:', blocks)

for block_index in range(blocks):
    start = block_index * block_size
    end = start + block_size
    block_data = data[start:end]

    entropy = get_entropy(block_data, 'shannon')
    entropy_data_table[block_index + 1] = entropy
    split_data.append(block_data)

lowest_entropy_blocks = get_lowest_entropy_blocks(entropy_data_table)

BLOCK_ENTROPY_THRESHOLD = 5.3

readable_blocks = []
for lowest_entropy_block in lowest_entropy_blocks:
    block_entropy = entropy_data_table[lowest_entropy_block]
    if block_entropy < BLOCK_ENTROPY_THRESHOLD: # Catch any blocks that are below our tested threshold
        # Table keys are 1-based; convert back to the 0-based split_data index.
        block_index = lowest_entropy_block - 1
        print(block_entropy, split_data[block_index])
        readable_blocks.append(block_index)

# Reassemble the selected blocks in file order. The bytes are joined before
# decoding so a multi-byte character split across two adjacent blocks survives,
# and errors='replace' keeps a low-entropy but non-UTF-8 block from aborting
# the run with UnicodeDecodeError.
extracted_content = b"".join(
    split_data[readable_block] for readable_block in sorted(readable_blocks)
).decode('utf-8', errors='replace')

print()
print("Extracted Readable Content:\n")
print(extracted_content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment