CameronLonsdale · July 11, 2017 05:36 · DayDun · Feb 18, 2021 · sumukus · Feb 19, 2021
diff --git a/MD2.py b/MD2.py
 #!/usr/bin/python3

 """
 MD2 Message Digest Algorithm.
 Implemented by Cameron Lonsdale to the spec of RFC 1319.
 """

 import binascii

 # Permutation of 0..255 constructed from the digits of pi.
 # It gives a "random" nonlinear byte substitution operation.
 #
 # Cameron's Notes:
 # ~~~~~~~~~~~~~~~~
 # S-box (substitution-box). Attempting to obscure the relationship between the input and the output.
 # This aids in providing the diffusion property. Also known as the Avalance effect.
 #
 # The table was generated using the digits from the fractional part of Pi.
 # Numbers used as cryptographic constants are called nothing up my sleeve numbers.
 # If the numbers can be generated from a well known method or mathematical property,
 # its likely that the designer has nothing up their sleeve and the constants were not nefariously chosen.
 # If the constants seem arbitrary and with no explanation of how they came about, its possible that they were
 # chosen to create a backdoor to the algorithm.
 Sbox = [
    41, 46, 67, 201, 162, 216, 124, 1, 61, 54, 84, 161, 236, 240, 6,
    19, 98, 167, 5, 243, 192, 199, 115, 140, 152, 147, 43, 217, 188,
    76, 130, 202, 30, 155, 87, 60, 253, 212, 224, 22, 103, 66, 111, 24,
    138, 23, 229, 18, 190, 78, 196, 214, 218, 158, 222, 73, 160, 251,
    245, 142, 187, 47, 238, 122, 169, 104, 121, 145, 21, 178, 7, 63,
    148, 194, 16, 137, 11, 34, 95, 33, 128, 127, 93, 154, 90, 144, 50,
    39, 53, 62, 204, 231, 191, 247, 151, 3, 255, 25, 48, 179, 72, 165,
    181, 209, 215, 94, 146, 42, 172, 86, 170, 198, 79, 184, 56, 210,
    150, 164, 125, 182, 118, 252, 107, 226, 156, 116, 4, 241, 69, 157,
    112, 89, 100, 113, 135, 32, 134, 91, 207, 101, 230, 45, 168, 2, 27,
    96, 37, 173, 174, 176, 185, 246, 28, 70, 97, 105, 52, 64, 126, 15,
    85, 71, 163, 35, 221, 81, 175, 58, 195, 92, 249, 206, 186, 197,
    234, 38, 44, 83, 13, 110, 133, 40, 132, 9, 211, 223, 205, 244, 65,
    129, 77, 82, 106, 220, 55, 200, 108, 193, 171, 250, 36, 225, 123,
    8, 12, 189, 177, 74, 120, 136, 149, 139, 227, 99, 232, 109, 233,
    203, 213, 254, 59, 0, 29, 57, 242, 239, 183, 14, 102, 88, 208, 228,
    166, 119, 114, 248, 235, 117, 75, 10, 49, 68, 80, 180, 143, 237,
    31, 26, 219, 153, 141, 51, 159, 17, 131, 20
 ]

 block_size = 16  # 16 bytes or 128 bits

 message = input()
 message_bytes = bytearray(message, 'utf-8')

 # Step 1: Append Padding Bytes
 # ============================
 # Even if the length is a multiple of 16, 16 bytes are still appended.
 # i bytes of value i are appended.
 padding = block_size - (len(message_bytes) % block_size)
 message_bytes += bytearray(padding for _ in range(padding))

 # Cameron's Note:
 # ~~~~~~~~~~~~~~
 # Padding is understandable as the further functions rely on a certain block size,
 # so its essential that the sizes of blocks match.
 # I'm unsure why the padding values are the same value as the padding length.
 # If the padding was a fixed value would one of the hash properties be weakened?
 #
 # A partial answer to this question is that there are multiple different types of padding (cause why not)
 # The specific padding used in MD2 is called PKCS7. see: https://en.wikipedia.org/wiki/Padding_(cryptography)
 # From the expanation given, it seems necessary for certain encryption schemes, but for hashing its unclear.
 # My intuition is telling me that another padding scheme could be used without damaging any cryptographic properties.
 #
 # Actually it can be. For MD to work effectively, MD-compliant padding MUST be used. Which must have the following properties
 # - M is a prefix of Pad(M)
 # - If |M_1| = |M_2| (length) then |Pad(M_1)| = |Pad(M_2)|
 # - If |M_1| != |M_2| then the last block of Pad(M_1) is different to the last block of Pad(M_2)
 #
 # With these conditions, Zero padding still holds, so unless something else is coming into play, choosing a MD-compliant padding
 # is all that is required.


 # Step 2: Append Checksum
 # =======================
 # 16 byte checksum is appended to message_bytes
 previous_checkbyte = 0  # Used as an Initialisation Vector (IV)
 checksum = bytearray(0 for _ in range(block_size))

 # Process each 16-word block (16 bytes per block)
 for i in range(len(message_bytes) // block_size):
    # Calculate checksum of block using each byte of the block
    for j in range(block_size):
        byte = message_bytes[i * block_size + j]
        previous_checkbyte = checksum[j] = Sbox[byte ^ previous_checkbyte]

 message_bytes += checksum

 # Cameron's Note:
 # ~~~~~~~~~~~~~~
 # I'm thinking about why the Sbox is used here. Does the checksum also need to add to diffusion?
 # If you remove it, how does it affect the overall hash?

 # Interestingly, the checksum does not take into account the entire message other than through the previous_checkbyte.
 # You can have a checksum match between messages if the last block is the same AND the previous_checkbyte of the previous block
 # is the same. Because padding is included, we can get a checksum collision with just a single block.
 # The blocks
 # b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00"
 # and
 # b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xaa"
 # Both have checksum values of cc3b183d1701a7099359c295711936e0
 # With complete hash values of 9aac6a2bb4bbd9f70d192e01dbdfb0be and 366683424c1157b3e4df675dbc10103d respectively.
 #
 # Here is the snippet to generate collisions:
 # block_size = 16
 # matches = {}

 # for block in range(0, 2**(block_size * 8)):
 #     message = bytearray.fromhex('{:032x}'.format(block))
 #     checksum = md2_checksum(message)

 #     if checksum in matches:
 #         print("Found a collision")
 #         print(message)
 #         print(checksum)
 #         print(matches[checksum])
 #         exit()
 #     else:
 #         matches[checksum] = message


 # Step 3: Initialise message digest buffer
 # ========================================
 buffer_size = 48
 digest = bytearray([0 for _ in range(buffer_size)])


 # Step 4: Process message in 16-byte blocks
 # =========================================
 n_rounds = 18

 for i in range(len(message_bytes) // block_size):
    # Copy block i into the middle section of the array
    # The last section in the array is then filled with front section XOR middle section
    for j in range(block_size):
        digest[block_size + j] = message_bytes[i * block_size + j]
        digest[2 * block_size + j] = digest[block_size + j] ^ digest[j]

    # Rounds of encryption over the entire array. Current byte XOR'd with the previous (substituted) byte.
    previous_hashbyte = 0
    for j in range(n_rounds):
        for k in range(buffer_size):
            digest[k] = previous_hashbyte = digest[k] ^ Sbox[previous_hashbyte]

        previous_hashbyte = (previous_hashbyte + j) % len(Sbox)


 # Cameron's Note
 # ~~~~~~~~~~~~~~
 # This is also known as the compresison function. This type of compression function is known as
 # Merkle-Damgard. The nice property about this is that if the compression function f is collision resistant, then so is
 # the entire hash function!
 #
 # The 48 bytes are split up into the following sections
 # [digest | message_block_i | digest XOR message_block_i]
 # With each section being 16 bytes long.
 #
 # At the beginning the array will look like this, because the initial values in the digest is 0.
 # [0 | message_block_i | message_block_i]
 #
 # Then the entire array is "encrypted" 18 times. The value of the previous byte is (first substituted using Sbox then)
 # XOR'd with the current block. This chaining flows over the entire array.


 # Step 5: Output
 # ==============
 print(binascii.hexlify(digest[:16]).decode('utf-8'))


 # Fun Facts!
 # MD doesnt (just?) stand for Message Digest, it refers to Merkle-Damgard Construction. A technique
 # for building collision resistant one way compression functions.


 # Further Reading
 # ===============
 # https://en.wikipedia.org/wiki/Padding_(cryptography)
 # https://en.wikipedia.org/wiki/Merkle%E2%80%93Damg%C3%A5rd_construction
 # https://en.wikipedia.org/wiki/MD2_(cryptography)
 # https://tools.ietf.org/html/rfc1319
 # https://en.wikipedia.org/wiki/One-way_compression_function
 # https://www.ssi.gouv.fr/archive/fr/sciences/fichiers/lcr/mu04c.pdf
 # https://crypto.stackexchange.com/questions/11935/how-is-the-md2-hash-function-s-table-constructed-from-pi
 # http://networkdls.com/Articles/bulletn4.pdf
	#!/usr/bin/python3

	"""
	MD2 Message Digest Algorithm.
	Implemented by Cameron Lonsdale to the spec of RFC 1319.
	"""

	import binascii

	# Permutation of 0..255 constructed from the digits of pi.
	# It gives a "random" nonlinear byte substitution operation.
	#
	# Cameron's Notes:
	# ~~~~~~~~~~~~~~~~
	# S-box (substitution-box). Attempting to obscure the relationship between the input and the output.
	# This aids in providing the diffusion property. Also known as the Avalance effect.
	#
	# The table was generated using the digits from the fractional part of Pi.
	# Numbers used as cryptographic constants are called nothing up my sleeve numbers.
	# If the numbers can be generated from a well known method or mathematical property,
	# its likely that the designer has nothing up their sleeve and the constants were not nefariously chosen.
	# If the constants seem arbitrary and with no explanation of how they came about, its possible that they were
	# chosen to create a backdoor to the algorithm.
	Sbox = [
	41, 46, 67, 201, 162, 216, 124, 1, 61, 54, 84, 161, 236, 240, 6,
	19, 98, 167, 5, 243, 192, 199, 115, 140, 152, 147, 43, 217, 188,
	76, 130, 202, 30, 155, 87, 60, 253, 212, 224, 22, 103, 66, 111, 24,
	138, 23, 229, 18, 190, 78, 196, 214, 218, 158, 222, 73, 160, 251,
	245, 142, 187, 47, 238, 122, 169, 104, 121, 145, 21, 178, 7, 63,
	148, 194, 16, 137, 11, 34, 95, 33, 128, 127, 93, 154, 90, 144, 50,
	39, 53, 62, 204, 231, 191, 247, 151, 3, 255, 25, 48, 179, 72, 165,
	181, 209, 215, 94, 146, 42, 172, 86, 170, 198, 79, 184, 56, 210,
	150, 164, 125, 182, 118, 252, 107, 226, 156, 116, 4, 241, 69, 157,
	112, 89, 100, 113, 135, 32, 134, 91, 207, 101, 230, 45, 168, 2, 27,
	96, 37, 173, 174, 176, 185, 246, 28, 70, 97, 105, 52, 64, 126, 15,
	85, 71, 163, 35, 221, 81, 175, 58, 195, 92, 249, 206, 186, 197,
	234, 38, 44, 83, 13, 110, 133, 40, 132, 9, 211, 223, 205, 244, 65,
	129, 77, 82, 106, 220, 55, 200, 108, 193, 171, 250, 36, 225, 123,
	8, 12, 189, 177, 74, 120, 136, 149, 139, 227, 99, 232, 109, 233,
	203, 213, 254, 59, 0, 29, 57, 242, 239, 183, 14, 102, 88, 208, 228,
	166, 119, 114, 248, 235, 117, 75, 10, 49, 68, 80, 180, 143, 237,
	31, 26, 219, 153, 141, 51, 159, 17, 131, 20
	]

	block_size = 16 # 16 bytes or 128 bits

	message = input()
	message_bytes = bytearray(message, 'utf-8')

	# Step 1: Append Padding Bytes
	# ============================
	# Even if the length is a multiple of 16, 16 bytes are still appended.
	# i bytes of value i are appended.
	padding = block_size - (len(message_bytes) % block_size)
	message_bytes += bytearray(padding for _ in range(padding))

	# Cameron's Note:
	# ~~~~~~~~~~~~~~
	# Padding is understandable as the further functions rely on a certain block size,
	# so its essential that the sizes of blocks match.
	# I'm unsure why the padding values are the same value as the padding length.
	# If the padding was a fixed value would one of the hash properties be weakened?
	#
	# A partial answer to this question is that there are multiple different types of padding (cause why not)
	# The specific padding used in MD2 is called PKCS7. see: https://en.wikipedia.org/wiki/Padding_(cryptography)
	# From the expanation given, it seems necessary for certain encryption schemes, but for hashing its unclear.
	# My intuition is telling me that another padding scheme could be used without damaging any cryptographic properties.
	#
	# Actually it can be. For MD to work effectively, MD-compliant padding MUST be used. Which must have the following properties
	# - M is a prefix of Pad(M)
	# - If \|M_1\| = \|M_2\| (length) then \|Pad(M_1)\| = \|Pad(M_2)\|
	# - If \|M_1\| != \|M_2\| then the last block of Pad(M_1) is different to the last block of Pad(M_2)
	#
	# With these conditions, Zero padding still holds, so unless something else is coming into play, choosing a MD-compliant padding
	# is all that is required.


	# Step 2: Append Checksum
	# =======================
	# 16 byte checksum is appended to message_bytes
	previous_checkbyte = 0 # Used as an Initialisation Vector (IV)
	checksum = bytearray(0 for _ in range(block_size))

	# Process each 16-word block (16 bytes per block)
	for i in range(len(message_bytes) // block_size):
	# Calculate checksum of block using each byte of the block
	for j in range(block_size):
	byte = message_bytes[i * block_size + j]
	previous_checkbyte = checksum[j] = Sbox[byte ^ previous_checkbyte]

	message_bytes += checksum

	# Cameron's Note:
	# ~~~~~~~~~~~~~~
	# I'm thinking about why the Sbox is used here. Does the checksum also need to add to diffusion?
	# If you remove it, how does it affect the overall hash?

	# Interestingly, the checksum does not take into account the entire message other than through the previous_checkbyte.
	# You can have a checksum match between messages if the last block is the same AND the previous_checkbyte of the previous block
	# is the same. Because padding is included, we can get a checksum collision with just a single block.
	# The blocks
	# b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00"
	# and
	# b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xaa"
	# Both have checksum values of cc3b183d1701a7099359c295711936e0
	# With complete hash values of 9aac6a2bb4bbd9f70d192e01dbdfb0be and 366683424c1157b3e4df675dbc10103d respectively.
	#
	# Here is the snippet to generate collisions:
	# block_size = 16
	# matches = {}

	# for block in range(0, 2*(block_size 8)):
	# message = bytearray.fromhex('{:032x}'.format(block))
	# checksum = md2_checksum(message)

	# if checksum in matches:
	# print("Found a collision")
	# print(message)
	# print(checksum)
	# print(matches[checksum])
	# exit()
	# else:
	# matches[checksum] = message


	# Step 3: Initialise message digest buffer
	# ========================================
	buffer_size = 48
	digest = bytearray([0 for _ in range(buffer_size)])


	# Step 4: Process message in 16-byte blocks
	# =========================================
	n_rounds = 18

	for i in range(len(message_bytes) // block_size):
	# Copy block i into the middle section of the array
	# The last section in the array is then filled with front section XOR middle section
	for j in range(block_size):
	digest[block_size + j] = message_bytes[i * block_size + j]
	digest[2 * block_size + j] = digest[block_size + j] ^ digest[j]

	# Rounds of encryption over the entire array. Current byte XOR'd with the previous (substituted) byte.
	previous_hashbyte = 0
	for j in range(n_rounds):
	for k in range(buffer_size):
	digest[k] = previous_hashbyte = digest[k] ^ Sbox[previous_hashbyte]

	previous_hashbyte = (previous_hashbyte + j) % len(Sbox)


	# Cameron's Note
	# ~~~~~~~~~~~~~~
	# This is also known as the compresison function. This type of compression function is known as
	# Merkle-Damgard. The nice property about this is that if the compression function f is collision resistant, then so is
	# the entire hash function!
	#
	# The 48 bytes are split up into the following sections
	# [digest \| message_block_i \| digest XOR message_block_i]
	# With each section being 16 bytes long.
	#
	# At the beginning the array will look like this, because the initial values in the digest is 0.
	# [0 \| message_block_i \| message_block_i]
	#
	# Then the entire array is "encrypted" 18 times. The value of the previous byte is (first substituted using Sbox then)
	# XOR'd with the current block. This chaining flows over the entire array.


	# Step 5: Output
	# ==============
	print(binascii.hexlify(digest[:16]).decode('utf-8'))


	# Fun Facts!
	# MD doesnt (just?) stand for Message Digest, it refers to Merkle-Damgard Construction. A technique
	# for building collision resistant one way compression functions.


	# Further Reading
	# ===============
	# https://en.wikipedia.org/wiki/Padding_(cryptography)
	# https://en.wikipedia.org/wiki/Merkle%E2%80%93Damg%C3%A5rd_construction
	# https://en.wikipedia.org/wiki/MD2_(cryptography)
	# https://tools.ietf.org/html/rfc1319
	# https://en.wikipedia.org/wiki/One-way_compression_function
	# https://www.ssi.gouv.fr/archive/fr/sciences/fichiers/lcr/mu04c.pdf
	# https://crypto.stackexchange.com/questions/11935/how-is-the-md2-hash-function-s-table-constructed-from-pi
	# http://networkdls.com/Articles/bulletn4.pdf