Skip to content

Instantly share code, notes, and snippets.

@chrisdlangton
Created November 16, 2023 00:02
Show Gist options
  • Save chrisdlangton/55c6792a1173b912bcb01d0c0ce2d9c2 to your computer and use it in GitHub Desktop.
Save chrisdlangton/55c6792a1173b912bcb01d0c0ce2d9c2 to your computer and use it in GitHub Desktop.
Use NLP to filter the results of brute-forcing the key (for a guessed key length) of XOR-encoded words, alphanumeric strings, or phrases
import itertools
import re
import string
import time
from collections.abc import Iterator

import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
# Sample plaintext that the self-test under `__main__` XOR-encodes with `secret_key`.
flag = "The message is too secret"
# Key used by the self-test under `__main__` to encode `flag`.
secret_key = 'CTFKey'
# Lazily-built cache of the NLTK English word list (filled by _english_vocabulary).
_ENGLISH_WORDS = None


def _english_vocabulary():
    """Return the NLTK ``words`` corpus as a set, building it only on first use."""
    global _ENGLISH_WORDS
    if _ENGLISH_WORDS is None:
        # Converting the corpus to a set is expensive; do it once, not once
        # per candidate text.
        _ENGLISH_WORDS = set(words.words())
    return _ENGLISH_WORDS


def is_mostly_english(text, threshold: float = 0.5):
    """Return True when more than ``threshold`` of the words in ``text`` are English.

    The text is split with NLTK's ``word_tokenize``. Only purely alphabetic
    tokens are counted as words, which filters out punctuation and special
    characters. A word is English when its lower-cased form is in the NLTK
    ``words`` corpus.

    Args:
        text: The candidate text to classify.
        threshold: Fraction of alphabetic tokens that must be exceeded
            (strictly greater than) for the text to count as English.

    Returns:
        False when the text contains no alphabetic tokens at all; otherwise
        whether the English fraction is strictly above ``threshold``.
    """
    english_words = _english_vocabulary()
    english_word_count = 0
    total_word_count = 0
    for word in word_tokenize(text):
        # Skip punctuation, numbers and other non-alphabetic tokens entirely.
        if not word.isalpha():
            continue
        # Every alphabetic token counts towards the denominator.
        total_word_count += 1
        if word.lower() in english_words:
            english_word_count += 1
    # Avoid dividing by zero when nothing word-like was found.
    if not total_word_count:
        return False
    return english_word_count / total_word_count > threshold
def _ascii_permutations(key_length: int):
# Generate all possible keys of a given length using printable ASCII characters
return itertools.product(string.printable, repeat=key_length)
def is_meaningful(text: str) -> bool:
    """Decide whether ``text`` looks like a plausible decryption result.

    The checks run in order:
      1. Any character outside ``string.printable`` rejects the text.
      2. A purely alphanumeric text is accepted (prints ``isalnum``).
      3. Otherwise the text is accepted only if ``is_mostly_english`` says
         so (prints ``NLP``).
    """
    # Reject as soon as a single non-printable character is present.
    if any(symbol not in string.printable for symbol in text):
        return False
    # Letters and/or digits only: good enough without consulting the NLP check.
    if text.isalnum():
        print('isalnum')
        return True
    # Fall back to the dictionary-based English check.
    if not is_mostly_english(text):
        return False
    print('NLP')
    return True
# Python generator to decrypt and derive the original text from the ciphertext
# It includes a step to guess the key length, defaulting to 6
def xor_decrypt_guess_key(ciphertext, key_length_guess=6) -> Iterator[tuple[str, str]]:
    """Brute-force every printable key of a given length against ``ciphertext``.

    Each candidate key from ``_ascii_permutations`` is used to XOR-decode the
    ciphertext, and only results accepted by ``is_meaningful`` are yielded.
    The search is lazy: keys are generated and tested one at a time.

    Args:
        ciphertext: The XOR-encoded text to attack.
        key_length_guess: Length of the keys to try. The number of candidates
            is ``len(string.printable) ** key_length_guess``, so it grows very
            quickly with this value.

    Yields:
        ``(decrypted_text, key)`` pairs for every key whose decryption looks
        meaningful.
    """
    for key in _ascii_permutations(key_length_guess):
        key_str = ''.join(key)
        decrypted_text = xor(ciphertext, key_str)
        # Heuristic filter: keep only decryptions that look readable.
        if is_meaningful(decrypted_text):
            yield (decrypted_text, key_str)
# Repeating-key XOR encoding/decoding
def xor(data, key):
    """XOR each character of ``data`` with the key, repeating the key as needed.

    Applying the function twice with the same key returns the original text.
    An empty ``key`` with non-empty ``data`` raises ``ZeroDivisionError``.
    """
    key_length = len(key)
    pieces = []
    for index, symbol in enumerate(data):
        # Wrap around the key so it covers data of any length.
        key_symbol = key[index % key_length]
        pieces.append(chr(ord(symbol) ^ ord(key_symbol)))
    return ''.join(pieces)
def derive_key(ciphertext, with_length: int = None):
    """Brute-force the XOR key for ``ciphertext`` and print every plausible result.

    Args:
        ciphertext: The XOR-encoded text to attack.
        with_length: Key length to try. When falsy (``None`` or ``0``), every
            length from 1 to 6 is tried in turn.

    Returns:
        None. Progress, the elapsed time at each hit, and each candidate
        decryption with its key are printed to stdout.
    """
    start_time = time.time()
    # One fixed length when requested, otherwise sweep lengths 1..6.
    candidate_lengths = (with_length,) if with_length else range(1, 7)
    for try_length in candidate_lengths:
        print(f"Trying length {try_length}")
        for guess, used_key in xor_decrypt_guess_key(ciphertext, try_length):
            # Elapsed time since the search started, reported at every hit.
            print(f"completed in {time.time() - start_time} seconds.")
            if guess:
                print(f"decrypted '{guess}' with key '{used_key}'")
# ---------------------------
# Start
# ---------------------------
if __name__ == "__main__":
    from pprint import pprint

    def _compare(check, matches) -> bool:
        """Pretty-print ``check`` and whether it equals ``matches``; return that result."""
        pprint(check)
        matched = check == matches
        pprint(f'Matched {matched}')
        return matched

    # Alternative inputs, kept for reference.
    # Read the ciphertext from the provided file:
    #   file_path = 'encrypted_string.txt'
    #   from pathlib import Path
    #   ciphertext = Path(file_path).read_text()
    # Or define it inline for self-contained testing:
    #   test_ciphertext: str = ''
    #   _compare(test_ciphertext, generated_ciphertext)

    # Download the words dataset (and tokenizer data) from NLTK
    nltk.download('words')
    nltk.download('punkt')

    # Generate from vars for testing
    generated_ciphertext: str = xor(flag, secret_key)
    pprint(("generated ciphertext", generated_ciphertext, secret_key, flag))
    # NOTE(review): `secret_key` has 6 characters but a key length of 4 is
    # searched here — confirm this is intentional.
    derive_key(generated_ciphertext, 4)
    # derive_key(test_ciphertext)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment