Created
November 16, 2023 00:02
-
-
Save chrisdlangton/55c6792a1173b912bcb01d0c0ce2d9c2 to your computer and use it in GitHub Desktop.
Use NLP to filter the results of brute-forcing the key (by key length) for XOR-encoded words, alphanumeric strings, or phrases
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools
import re
import string
import time
from collections.abc import Iterator
from typing import Optional

import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
# Known plaintext that the self-test under ``__main__`` encrypts.
flag = "The message is too secret"
# Key the self-test XORs ``flag`` with to produce its sample ciphertext.
secret_key = "CTFKey"
# Cache for the NLTK vocabulary; filled on first use by _english_vocabulary().
_ENGLISH_VOCABULARY = None


def _english_vocabulary():
    """Return the NLTK ``words`` corpus as a set, building it only once."""
    global _ENGLISH_VOCABULARY
    if _ENGLISH_VOCABULARY is None:
        # Building this set walks the whole corpus, so it must not be
        # repeated for every candidate during a brute-force search.
        _ENGLISH_VOCABULARY = frozenset(words.words())
    return _ENGLISH_VOCABULARY


def is_mostly_english(text, threshold: float = 0.5):
    """Return True when more than ``threshold`` of the words in ``text`` are English.

    The text is split with NLTK's ``word_tokenize``. Tokens without any
    letter or digit (pure punctuation) are ignored. Every remaining token
    counts towards the total, and a token counts as English when it is
    purely alphabetic and its lowercase form is in the NLTK ``words`` corpus.

    Args:
        text: Candidate plaintext to classify.
        threshold: Fraction of counted tokens that must be English; the
            comparison is strict (``>``).

    Returns:
        False when no token is counted, otherwise whether the English
        fraction exceeds ``threshold``.

    Note:
        Needs the NLTK ``words`` and ``punkt`` resources, which this
        script's entry point downloads before use.
    """
    vocabulary = _english_vocabulary()
    english_word_count = 0
    total_word_count = 0
    for token in word_tokenize(text):
        # Skip pure punctuation so it does not dilute the ratio.
        if not any(char.isalnum() for char in token):
            continue
        total_word_count += 1
        if token.isalpha() and token.lower() in vocabulary:
            english_word_count += 1
    if not total_word_count:
        return False
    return english_word_count / total_word_count > threshold
def _ascii_permutations(key_length: int): | |
# Generate all possible keys of a given length using printable ASCII characters | |
return itertools.product(string.printable, repeat=key_length) | |
def is_meaningful(text: str) -> bool:
    """Heuristically decide whether ``text`` looks like a real plaintext.

    Text containing any character outside ``string.printable`` is rejected.
    Otherwise it is accepted when it is purely alphanumeric or when
    ``is_mostly_english`` accepts it. The check that accepted the text is
    printed to stdout.
    """
    # Reject as soon as a non-printable character shows up.
    for char in text:
        if char not in string.printable:
            return False
    # Cheap check first: a single run of letters and/or digits.
    if text.isalnum():
        print('isalnum')
        return True
    # Fall back to the third-party NLP check.
    if not is_mostly_english(text):
        return False
    print('NLP')
    return True
# Generator that brute-forces every key of one guessed length (default 6)
# and yields the decryptions that look like real text.
def xor_decrypt_guess_key(ciphertext, key_length_guess=6) -> Iterator[tuple[str, str]]:
    """Yield ``(plaintext, key)`` for each candidate key that decrypts plausibly.

    Every key of ``key_length_guess`` characters from ``string.printable``
    is tried, so the search space is ``len(string.printable) ** key_length_guess``.
    Candidates are produced lazily, one at a time.

    Args:
        ciphertext: XOR-encoded text to attack.
        key_length_guess: Length of the keys to enumerate.

    Yields:
        Tuples of the decrypted text and the key string that produced it,
        for each decryption accepted by ``is_meaningful``.
    """
    for candidate in _ascii_permutations(key_length_guess):
        key_str = ''.join(candidate)
        decrypted_text = xor(ciphertext, key_str)
        # Keep only decryptions that pass the readability heuristic.
        if is_meaningful(decrypted_text):
            yield decrypted_text, key_str
# Repeating-key XOR; applying it twice with the same key restores the input.
def xor(data, key):
    """XOR each character of ``data`` with ``key``, repeating the key as needed.

    Args:
        data: Text to encode or decode.
        key: Key string; it is cycled when shorter than ``data``.

    Returns:
        A string of the same length as ``data`` whose characters are the
        XOR of the corresponding code points.

    Raises:
        ValueError: If ``key`` is empty while ``data`` is not.
    """
    if not data:
        return ''
    if not key:
        # cycle('') yields nothing, which would silently return '' instead
        # of failing, so reject the empty key explicitly.
        raise ValueError("key must not be empty")
    return ''.join(
        chr(ord(char) ^ ord(key_char))
        for char, key_char in zip(data, itertools.cycle(key))
    )
def derive_key(ciphertext, with_length: Optional[int] = None, max_length: int = 6):
    """Brute-force the XOR key of ``ciphertext`` and print every plausible hit.

    Args:
        ciphertext: XOR-encoded text to attack.
        with_length: Key length to try. When omitted (or 0), every length
            from 1 to ``max_length`` is tried in turn.
        max_length: Largest key length tried when ``with_length`` is not
            given.

    Returns:
        None. For each candidate the elapsed time and the decryption are
        printed to stdout.
    """
    start_time = time.time()
    # One explicit length, or the whole 1..max_length sweep.
    lengths = [with_length] if with_length else range(1, max_length + 1)
    for length in lengths:
        print(f"Trying length {length}")
        for guess, used_key in xor_decrypt_guess_key(ciphertext, length):
            print(f"completed in {time.time() - start_time} seconds.")
            if guess:
                print(f"decrypted '{guess}' with key '{used_key}'")
# ---------------------------
# Start
# ---------------------------
if __name__ == "__main__":
    from pprint import pprint

    def _compare(check, matches) -> bool:
        """Pretty-print ``check`` and report whether it equals ``matches``."""
        matched = check == matches
        pprint(check)
        pprint(f'Matched {matched}')
        return matched

    # Read the ciphertext from the provided file
    # file_path = 'encrypted_string.txt'
    # from pathlib import Path
    # ciphertext = Path(file_path).read_text()
    # Or define for self-contained testing
    # test_ciphertext: str = ''
    # _compare(test_ciphertext, generated_ciphertext)

    # Download the NLTK data used by is_mostly_english
    nltk.download('words')
    nltk.download('punkt')

    # Generate a ciphertext from the module-level test values
    generated_ciphertext: str = xor(flag, secret_key)
    pprint(("generated ciphertext", generated_ciphertext, secret_key, flag))
    derive_key(generated_ciphertext, 4)
    # derive_key(test_ciphertext)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment