"""Use NLP to filter the results of brute-forcing the key (for a guessed key
length) of XOR-encoded words, alphanumeric strings, or phrases."""
import string, itertools, time, re
from collections.abc import Iterator
from typing import Optional

import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
# Known plaintext used by the self-test at the bottom of the file.
flag: str = 'The message is too secret'
# Key the self-test XORs the plaintext with to produce a sample ciphertext.
secret_key: str = "CTFKey"
# Lazily-built cache of the NLTK vocabulary; stays None until the first lookup.
_ENGLISH_WORDS = None


def _english_vocabulary():
    """Return the NLTK ``words`` corpus as a set, building it on first use only."""
    global _ENGLISH_WORDS
    if _ENGLISH_WORDS is None:
        _ENGLISH_WORDS = set(words.words())
    return _ENGLISH_WORDS


def is_mostly_english(text, threshold: float = 0.5):
    """Return whether most alphabetic tokens of ``text`` are English words.

    The text is split with ``word_tokenize``. Tokens that are not purely
    alphabetic (punctuation, numbers, mixed tokens) are ignored entirely.
    A remaining token counts as English when its lower-cased form is in the
    NLTK ``words`` corpus.

    Args:
        text: The candidate string to classify.
        threshold: Fraction of alphabetic tokens that must be exceeded
            (strictly) by the English ones. Defaults to 0.5.

    Returns:
        True if the English fraction is strictly greater than ``threshold``;
        False otherwise, including when ``text`` has no alphabetic tokens.
    """
    words_in_text = word_tokenize(text)
    # The vocabulary set is reused across calls: this function is invoked
    # once per candidate key, so rebuilding it each time would dominate.
    english_words = _english_vocabulary()

    english_word_count = 0
    total_word_count = 0
    for word in words_in_text:
        if not word.isalpha():
            continue
        # Count every alphabetic token; without this the denominator stays
        # zero and the function could never return True.
        total_word_count += 1
        if word.lower() in english_words:
            english_word_count += 1

    if not total_word_count:
        return False
    return english_word_count / total_word_count > threshold
def _ascii_permutations(key_length: int):
    """Return a lazy iterator over all candidate keys of ``key_length`` characters.

    Each item is a tuple of ``key_length`` single characters drawn, with
    repetition, from ``string.printable``.
    """
    alphabet = string.printable
    return itertools.product(alphabet, repeat=key_length)
def is_meaningful(text: str) -> bool:
    """Heuristically decide whether ``text`` looks like a real decryption.

    The text is rejected if any character falls outside ``string.printable``.
    Otherwise it is accepted when it is purely alphanumeric, or when
    ``is_mostly_english`` classifies it as English.
    """
    allowed = string.printable
    # Any character outside the printable set rules the candidate out.
    if any(ch not in allowed for ch in text):
        return False
    # Cheap alphanumeric test first; the NLP check only runs when it fails.
    return text.isalnum() or bool(is_mostly_english(text))
# Python generator to decrypt and derive the original text from the ciphertext
# It includes a step to guess the key length, defaulting to 6
def xor_decrypt_guess_key(
    ciphertext, key_length_guess=6
) -> Iterator[tuple[str, str]]:
    """Yield ``(plaintext, key)`` candidates for an XOR-encoded ciphertext.

    Every key of ``key_length_guess`` characters produced by
    ``_ascii_permutations`` is tried in turn. A candidate is yielded only
    when ``is_meaningful`` accepts the decrypted text.

    Args:
        ciphertext: The XOR-encoded string to attack.
        key_length_guess: Number of characters in each trial key.

    Yields:
        Tuples of ``(decrypted_text, key_str)`` for each accepted candidate.
    """
    # The key space is consumed lazily; materialising it (e.g. to count the
    # keys) would exhaust the iterator before the search starts.
    for key_chars in _ascii_permutations(key_length_guess):
        key_str = ''.join(key_chars)
        decrypted_text = xor(ciphertext, key_str)
        if is_meaningful(decrypted_text):
            yield decrypted_text, key_str
# MB XOR encoding
def xor(data, key):
    """XOR each character of ``data`` with the repeating ``key``.

    The key is cycled over the data by index, so applying the function twice
    with the same key returns the original string.
    """
    period = len(key)
    pieces = []
    for position, symbol in enumerate(data):
        pieces.append(chr(ord(symbol) ^ ord(key[position % period])))
    return ''.join(pieces)
def _report_candidates(ciphertext, key_length, start_time):
    """Print every candidate decryption found for a single key length."""
    print(f"Trying length {key_length}")
    for guess, used_key in xor_decrypt_guess_key(ciphertext, key_length):
        print(f"completed in {time.time() - start_time} seconds.")
        if guess:
            print(f"decrypted '{guess}' with key '{used_key}'")


def derive_key(ciphertext, with_length: Optional[int] = None):
    """Brute-force ``ciphertext`` and print each plausible plaintext and key.

    When ``with_length`` is given (and non-zero) that key length is tried
    first. Key lengths 1 through 6 are then swept, skipping the length that
    was already tried so the same candidates are not reported twice.

    Args:
        ciphertext: The XOR-encoded string to attack.
        with_length: Optional key length to try before the 1-6 sweep.

    Returns:
        None. Results and elapsed time are written to standard output.
    """
    start_time = time.time()
    if with_length:
        _report_candidates(ciphertext, with_length, start_time)
    # NOTE(review): the sweep still runs after an explicit length, as in the
    # original flow; confirm whether an explicit length should be exclusive.
    for try_length in range(1, 7):
        if try_length == with_length:
            continue
        _report_candidates(ciphertext, try_length, start_time)
# ---------------------------
# Start
# ---------------------------
if __name__ == "__main__":
    from pprint import pprint

    def _compare(check, matches) -> bool:
        """Print and return whether ``check`` equals ``matches``."""
        matched = check == matches
        pprint(f'Matched {matched}')
        return matched

    # Read the ciphertext from the provided file
    # file_path = 'encrypted_string.txt'
    # from pathlib import Path
    # ciphertext = Path(file_path).read_text()
    # Or define for self-contained testing
    # test_ciphertext: str = ''
    # _compare(test_ciphertext, generated_ciphertext)

    # Download the NLTK data used by the English check: the word list and
    # the tokenizer models.
    # NOTE(review): recent NLTK releases may need 'punkt_tab' instead of
    # 'punkt' - confirm against the installed version.
    nltk.download('words')
    nltk.download('punkt')

    # Generate from vars for testing
    generated_ciphertext: str = xor(flag, secret_key)
    pprint(("generated ciphertext", generated_ciphertext, secret_key, flag))
    derive_key(generated_ciphertext, 4)
    # derive_key(test_ciphertext)
