chrisdlangton · November 16, 2023 00:02
diff --git a/xor.py b/xor.py
 import itertools
 import re
 import string
 import time
 from collections.abc import Iterator

 import nltk
 from nltk.corpus import words
 from nltk.tokenize import word_tokenize

 # Repeating key that the sample plaintext is XOR-ed with in the script section
 secret_key = 'CTFKey'
 # Sample plaintext used to generate a ciphertext for self-contained testing
 flag = "The message is too secret"

 def is_mostly_english(text, threshold: float = 0.5, *, vocabulary=None, tokenizer=None):
    """Return True when the share of English words in ``text`` exceeds ``threshold``.

    Only purely alphabetic tokens are counted, so punctuation and numbers
    neither help nor hurt the ratio. Text with no alphabetic tokens is
    reported as not English.

    Args:
        text: The string to classify.
        threshold: Fraction of alphabetic tokens that must be known English
            words; the comparison is strictly greater-than.
        vocabulary: Optional container of lowercase words to match against.
            Defaults to the NLTK ``words`` corpus.
        tokenizer: Optional callable mapping a string to a list of tokens.
            Defaults to NLTK's ``word_tokenize``.

    Returns:
        bool: True if the English ratio is above ``threshold``.
    """
    if tokenizer is None:
        tokenizer = word_tokenize

    if vocabulary is None:
        # Building the set from the corpus is expensive and this function is
        # called once per candidate key, so build it once and reuse it.
        vocabulary = getattr(is_mostly_english, "_vocabulary", None)
        if vocabulary is None:
            vocabulary = frozenset(words.words())
            is_mostly_english._vocabulary = vocabulary

    # Keep only alphabetic tokens; these form the denominator of the ratio.
    alphabetic_tokens = [token for token in tokenizer(text) if token.isalpha()]
    if not alphabetic_tokens:
        return False

    english_word_count = sum(
        1 for token in alphabetic_tokens if token.lower() in vocabulary
    )
    return english_word_count / len(alphabetic_tokens) > threshold

 def _ascii_permutations(key_length: int):
    # Generate all possible keys of a given length using printable ASCII characters
    return itertools.product(string.printable, repeat=key_length)

 def is_meaningful(text: str) -> bool:
    """Heuristically decide whether ``text`` looks like a real plaintext.

    The checks run in order: any non-printable character rejects the text;
    a purely alphanumeric text is accepted; otherwise the decision is left
    to ``is_mostly_english``. Accepting branches print which check matched.
    """
    # Reject as soon as a single character falls outside printable ASCII
    if any(symbol not in string.printable for symbol in text):
        return False

    # Letters and/or digits only
    if text.isalnum():
        print('isalnum')
        return True

    # Optional: Third party NLP tool
    if not is_mostly_english(text):
        return False
    print('NLP')
    return True

 # Python generator to decrypt and derive the original text from the ciphertext
 # It includes a step to guess the key length, defaulting to 6
 def xor_decrypt_guess_key(ciphertext, key_length_guess=6) -> Iterator[tuple[str, str]]:
    """Brute-force every printable-ASCII key of one length against ``ciphertext``.

    This is a generator: candidates are tried lazily and a
    ``(decrypted_text, key)`` pair is yielded for each key whose decryption
    is accepted by ``is_meaningful``.

    Args:
        ciphertext: XOR-encoded text to attack.
        key_length_guess: Length of the keys to enumerate. The number of
            candidates is ``len(string.printable) ** key_length_guess``.

    Yields:
        tuple[str, str]: The decrypted text and the key that produced it.
    """
    for key_chars in _ascii_permutations(key_length_guess):
        key_str = ''.join(key_chars)
        decrypted_text = xor(ciphertext, key_str)
        # Heuristic filter: only surface decryptions that look readable
        if is_meaningful(decrypted_text):
            yield decrypted_text, key_str

 # MB XOR encoding
 def xor(data, key):
    """XOR each character of ``data`` with ``key``, repeating the key as needed.

    The operation is its own inverse: applying it twice with the same key
    returns the original text.

    Args:
        data: Text to encode or decode.
        key: Key string; it is cycled so that it covers all of ``data``.

    Returns:
        str: A string with one output character per character of ``data``.

    Raises:
        ValueError: If ``key`` is empty while ``data`` is not.
    """
    if data and not key:
        # Without this guard an empty key would silently produce an empty
        # result; the index-based version failed with ZeroDivisionError.
        raise ValueError("key must not be empty")
    return ''.join(
        chr(ord(char) ^ ord(key_char))
        for char, key_char in zip(data, itertools.cycle(key))
    )

 def derive_key(ciphertext, with_length: int = None, max_length: int = 6):
    """Print every plausible decryption of ``ciphertext`` found by brute force.

    Results are reported on stdout only; nothing is returned.

    Args:
        ciphertext: XOR-encoded text to attack.
        with_length: When given (and non-zero), only keys of exactly this
            length are tried.
        max_length: Longest key length tried when ``with_length`` is not
            given; lengths 1 through ``max_length`` are attempted in order.
    """
    start_time = time.time()
    lengths = [with_length] if with_length else range(1, max_length + 1)
    for length in lengths:
        print(f"Trying length {length}")
        for guess, used_key in xor_decrypt_guess_key(ciphertext, length):
            # Elapsed time is measured from the start of the whole search
            print(f"completed in {time.time() - start_time} seconds.")
            if guess:
                print(f"decrypted '{guess}' with key '{used_key}'")

 # ---------------------------
 # Start
 # ---------------------------
 if __name__ == "__main__":
    from pprint import pprint

    def _compare(check, matches) -> bool:
        """Pretty-print ``check`` and whether it equals ``matches``; return that result."""
        matched = check == matches
        pprint(check)
        pprint(f'Matched {matched}')
        return matched

    # Read the ciphertext from the provided file
    # file_path = 'encrypted_string.txt'
    # from pathlib import Path
    # ciphertext = Path(file_path).read_text()

    # Or define for self-contained testing
    # test_ciphertext: str = ''
    # _compare(test_ciphertext, generated_ciphertext)

    # Download the NLTK datasets needed for the English-word heuristic
    nltk.download('words')
    nltk.download('punkt')

    # Generate a ciphertext from the module-level sample plaintext and key
    generated_ciphertext: str = xor(flag, secret_key)
    pprint(("generated ciphertext", generated_ciphertext, secret_key, flag))

    # NOTE(review): only 4-character keys are searched here while secret_key
    # has 6 characters, so this run cannot reproduce the exact key - confirm
    # this is an intentional shortcut to keep the search small.
    derive_key(generated_ciphertext, 4)
    # derive_key(test_ciphertext)
	import string, itertools, time, re

	from nltk.corpus import words
	from nltk.tokenize import word_tokenize
	import nltk

	# Repeating key that the sample plaintext is XOR-ed with in the script section
	secret_key = 'CTFKey'
	# Sample plaintext used to generate a ciphertext for self-contained testing
	flag = "The message is too secret"

	def is_mostly_english(text, threshold: float = 0.5, *, vocabulary=None, tokenizer=None):
	    """Return True when the share of English words in ``text`` exceeds ``threshold``.

	    Only purely alphabetic tokens are counted, so punctuation and numbers
	    neither help nor hurt the ratio. Text with no alphabetic tokens is
	    reported as not English.

	    Args:
	        text: The string to classify.
	        threshold: Fraction of alphabetic tokens that must be known English
	            words; the comparison is strictly greater-than.
	        vocabulary: Optional container of lowercase words to match against.
	            Defaults to the NLTK ``words`` corpus.
	        tokenizer: Optional callable mapping a string to a list of tokens.
	            Defaults to NLTK's ``word_tokenize``.

	    Returns:
	        bool: True if the English ratio is above ``threshold``.
	    """
	    if tokenizer is None:
	        tokenizer = word_tokenize

	    if vocabulary is None:
	        # Building the set from the corpus is expensive and this function is
	        # called once per candidate key, so build it once and reuse it.
	        vocabulary = getattr(is_mostly_english, "_vocabulary", None)
	        if vocabulary is None:
	            vocabulary = frozenset(words.words())
	            is_mostly_english._vocabulary = vocabulary

	    # Keep only alphabetic tokens; these form the denominator of the ratio.
	    alphabetic_tokens = [token for token in tokenizer(text) if token.isalpha()]
	    if not alphabetic_tokens:
	        return False

	    english_word_count = sum(
	        1 for token in alphabetic_tokens if token.lower() in vocabulary
	    )
	    return english_word_count / len(alphabetic_tokens) > threshold

	def _ascii_permutations(key_length: int):
	# Generate all possible keys of a given length using printable ASCII characters
	return itertools.product(string.printable, repeat=key_length)

	def is_meaningful(text: str) -> bool:
	    """Heuristically decide whether ``text`` looks like a real plaintext.

	    The checks run in order: any non-printable character rejects the text;
	    a purely alphanumeric text is accepted; otherwise the decision is left
	    to ``is_mostly_english``. Accepting branches print which check matched.
	    """
	    # Reject as soon as a single character falls outside printable ASCII
	    if any(symbol not in string.printable for symbol in text):
	        return False

	    # Letters and/or digits only
	    if text.isalnum():
	        print('isalnum')
	        return True

	    # Optional: Third party NLP tool
	    if not is_mostly_english(text):
	        return False
	    print('NLP')
	    return True

	# Python generator to decrypt and derive the original text from the ciphertext
	# It includes a step to guess the key length, defaulting to 6
	def xor_decrypt_guess_key(ciphertext, key_length_guess=6) -> Iterator[tuple[str, str]]:
	    """Brute-force every printable-ASCII key of one length against ``ciphertext``.

	    This is a generator: candidates are tried lazily and a
	    ``(decrypted_text, key)`` pair is yielded for each key whose decryption
	    is accepted by ``is_meaningful``.

	    Args:
	        ciphertext: XOR-encoded text to attack.
	        key_length_guess: Length of the keys to enumerate. The number of
	            candidates is ``len(string.printable) ** key_length_guess``.

	    Yields:
	        tuple[str, str]: The decrypted text and the key that produced it.
	    """
	    for key_chars in _ascii_permutations(key_length_guess):
	        key_str = ''.join(key_chars)
	        decrypted_text = xor(ciphertext, key_str)
	        # Heuristic filter: only surface decryptions that look readable
	        if is_meaningful(decrypted_text):
	            yield decrypted_text, key_str

	# MB XOR encoding
	def xor(data, key):
	    """XOR each character of ``data`` with ``key``, repeating the key as needed.

	    The operation is its own inverse: applying it twice with the same key
	    returns the original text.

	    Args:
	        data: Text to encode or decode.
	        key: Key string; it is cycled so that it covers all of ``data``.

	    Returns:
	        str: A string with one output character per character of ``data``.

	    Raises:
	        ValueError: If ``key`` is empty while ``data`` is not.
	    """
	    if data and not key:
	        # Without this guard an empty key would silently produce an empty
	        # result; the index-based version failed with ZeroDivisionError.
	        raise ValueError("key must not be empty")
	    return ''.join(
	        chr(ord(char) ^ ord(key_char))
	        for char, key_char in zip(data, itertools.cycle(key))
	    )

	def derive_key(ciphertext, with_length: int = None, max_length: int = 6):
	    """Print every plausible decryption of ``ciphertext`` found by brute force.

	    Results are reported on stdout only; nothing is returned.

	    Args:
	        ciphertext: XOR-encoded text to attack.
	        with_length: When given (and non-zero), only keys of exactly this
	            length are tried.
	        max_length: Longest key length tried when ``with_length`` is not
	            given; lengths 1 through ``max_length`` are attempted in order.
	    """
	    start_time = time.time()
	    lengths = [with_length] if with_length else range(1, max_length + 1)
	    for length in lengths:
	        print(f"Trying length {length}")
	        for guess, used_key in xor_decrypt_guess_key(ciphertext, length):
	            # Elapsed time is measured from the start of the whole search
	            print(f"completed in {time.time() - start_time} seconds.")
	            if guess:
	                print(f"decrypted '{guess}' with key '{used_key}'")

	# ---------------------------
	# Start
	# ---------------------------
	if __name__ == "__main__":
	    from pprint import pprint

	    def _compare(check, matches) -> bool:
	        """Pretty-print ``check`` and whether it equals ``matches``; return that result."""
	        matched = check == matches
	        pprint(check)
	        pprint(f'Matched {matched}')
	        return matched

	    # Read the ciphertext from the provided file
	    # file_path = 'encrypted_string.txt'
	    # from pathlib import Path
	    # ciphertext = Path(file_path).read_text()

	    # Or define for self-contained testing
	    # test_ciphertext: str = ''
	    # _compare(test_ciphertext, generated_ciphertext)

	    # Download the NLTK datasets needed for the English-word heuristic
	    nltk.download('words')
	    nltk.download('punkt')

	    # Generate a ciphertext from the module-level sample plaintext and key
	    generated_ciphertext: str = xor(flag, secret_key)
	    pprint(("generated ciphertext", generated_ciphertext, secret_key, flag))

	    # NOTE(review): only 4-character keys are searched here while secret_key
	    # has 6 characters, so this run cannot reproduce the exact key - confirm
	    # this is an intentional shortcut to keep the search small.
	    derive_key(generated_ciphertext, 4)
	    # derive_key(test_ciphertext)