Last active
June 4, 2021 00:48
-
-
Save IlluminatiFish/29c30883d03c42b52b54d9e006ed8439 to your computer and use it in GitHub Desktop.
A short Python script that uses Shannon entropy to analyse a file, find any readable text and/or code in it, and extract that content.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# This program is a utility used by myself that I have released
# to the public under the GPLv3 license
#
# Copyright (c) 2021 IlluminatiFish.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see http://www.gnu.org/licenses/.
#
import math
import sys
from collections import Counter
# Utility functions, aggregated into this one gist for readability
def get_entropy(data, unit='natural'):
    """Return the entropy of the symbols contained in *data*.

    The entropy is ``sum(-p * log(p))`` over the relative frequency ``p`` of
    every distinct symbol in *data*, with the logarithm base chosen by *unit*.

    Args:
        data: A sized iterable of hashable symbols, e.g. ``bytes`` or ``str``.
        unit: Name of the logarithm base: ``'shannon'`` (base 2),
            ``'natural'`` (base e) or ``'hartley'`` (base 10).

    Returns:
        The entropy as a float. Input with at most one symbol yields ``0.0``.

    Raises:
        KeyError: If *unit* is not one of the names above and *data* holds
            more than one symbol.
    """
    # Dedicated log functions are more accurate than math.log(x, base),
    # which divides two rounded natural logarithms.
    log_functions = {
        'shannon': math.log2,
        'natural': math.log,
        'hartley': math.log10,
    }

    total = len(data)
    if total <= 1:
        return 0.0

    log = log_functions[unit]
    counts = Counter(data)

    # Every count is at least 1, so each probability is strictly positive and
    # the logarithm is always defined. fsum avoids accumulating rounding
    # error across the (up to 256 for bytes) terms.
    return math.fsum(
        -(count / total) * log(count / total) for count in counts.values()
    )
def get_data_size(data):
    """Return the number of items in *data* (the byte count for ``bytes``).

    ``len`` is used rather than ``sys.getsizeof`` because the latter reports
    the memory footprint of the Python object, including its header, which is
    larger than the payload and would skew any block count derived from it.
    """
    return len(data)
def get_lowest_entropy_blocks(entropy_data_table):
    """Return the keys of *entropy_data_table* ordered by ascending value.

    Args:
        entropy_data_table: Mapping of block identifier to entropy value.

    Returns:
        A new list of the mapping's keys, lowest entropy first. Keys with
        equal entropy keep their original insertion order (the sort is stable).
    """
    return sorted(entropy_data_table, key=entropy_data_table.get)
# ---------------------------------------------------------------------------
# Script: split the input file into fixed-size blocks, measure the Shannon
# entropy of each block, and print the blocks whose entropy falls below the
# threshold, followed by their concatenated text.
# ---------------------------------------------------------------------------

entropy_data_table = {}  # 1-based block number -> entropy of that block
split_data = []          # raw bytes of each block; block N is at index N - 1

# Read the whole file through a context manager so the handle is closed
# as soon as the contents are in memory.
with open('susfile.txt', 'rb') as source_file:
    data = source_file.read()

print()

# Author's note: data.decode() goes to a lower entropy
print('Total Entropy:', get_entropy(data, 'shannon'))

data_size = get_data_size(data)
print('Size:', data_size)

block_size = 183  # Size of each block in bytes

# Count blocks from the byte length itself so no block can start past the end
# of the data. Floor division means only whole blocks are analysed; trailing
# bytes that do not fill a complete block are not scored.
blocks = len(data) // block_size
print('Blocks:', blocks)

for block_index in range(blocks):
    start = block_index * block_size
    block_data = data[start:start + block_size]
    entropy_data_table[block_index + 1] = get_entropy(block_data, 'shannon')
    split_data.append(block_data)

lowest_entropy_blocks = get_lowest_entropy_blocks(entropy_data_table)

BLOCK_ENTROPY_THRESHOLD = 5.3

readable_blocks = []  # 0-based indexes into split_data of the selected blocks

# Report candidate blocks from lowest to highest entropy.
for lowest_entropy_block in lowest_entropy_blocks:
    block_entropy = entropy_data_table[lowest_entropy_block]
    if block_entropy < BLOCK_ENTROPY_THRESHOLD:  # Catch any blocks that are below our tested threshold
        block_index = lowest_entropy_block - 1
        print(block_entropy, split_data[block_index])
        readable_blocks.append(block_index)

# Reassemble the selected blocks in file order. The bytes are joined before
# decoding so a multi-byte UTF-8 character split across two adjacent selected
# blocks still decodes, and errors='replace' keeps a low-entropy block that is
# not valid UTF-8 from aborting the run with UnicodeDecodeError.
extracted_content = b"".join(
    split_data[readable_block] for readable_block in sorted(readable_blocks)
).decode(errors="replace")

print()
print("Extracted Readable Content:\n")
print(extracted_content)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment