NeilMadden · February 4, 2025 14:06
diff --git a/crypto.py b/crypto.py
 #!/usr/bin/env python3
 # Copyright 2024 Neil Madden.
 # License: https://creativecommons.org/licenses/by-sa/4.0/deed.en.

 # Like this? I do training courses & consultancy:
 # https://illuminated-security.com/

 import hashlib
 import math
 import os
 from hmac import compare_digest
 from typing import Callable

 from Crypto.Cipher import (AES, ChaCha20,  # pip install pycryptodome
                           ChaCha20_Poly1305)


 print("""
 ################################################################################
 #                                                                              #
 #               A LAZY DEVELOPER'S GUIDE TO MODERN CRYPTOGRAPHY                #
 #                                                                              #
 ################################################################################
 """)

 # DISCLAIMER: the code in this tutorial is intended to illustrate the basics of
 # modern cryptography to developers. It is far from production ready: it is
 # slow, has limited/no tests, has lots of side-channel vulnerabilities etc. Use
 # Tink/libsodium/whatever in real code. (Also, I hardly ever write Python...)


 # A cipher uses a key to encrypt (encipher) and decrypt (decipher) messages/data.
 # The unencrypted data is called plaintext, the encrypted data is ciphertext.
 # According to Kerchoffs' Principle, a cipher should be secure even if every
 # aspect of its design (apart from the key) is known to the adversary.
 class Cipher:
    def encrypt(self, key: bytes, plaintext:  bytes) -> bytes: ...
    def decrypt(self, key: bytes, ciphertext: bytes) -> bytes: ...

    # Keys should be generated from a cryptographically-secure pseudorandom
    # (number) generator (CSPRNG) like os.urandom, /dev/urandom, BCryptGenRandom
    # (Windows), SecureRandom (Java), etc. Keys should typically be at least
    # 128 bits (16 bytes) long to prevent brute-force key guessing attacks.
    def key_gen(self) -> bytes:
        return os.urandom(16)


 print("""
 ################################################################################
 #                                                                              #
 #                                ONE-TIME PADS                                 #
 #                                                                              #
 ################################################################################
 """)

 # XOR is a very useful building block
 def xor(xs: bytes, ys: bytes) -> bytes:
    return bytes(x ^ y for x, y in zip(xs, ys))


 # One-Time Pad (OTP) achieves "perfect secrecy", but the key ("pad") must be as
 # long as the message and only used once.
 def otp(msg: bytes) -> tuple[bytes, bytes]:
    key = os.urandom(len(msg))
    return key, xor(key, msg)


 msg = b'Hello, cruel world'
 key, ct = otp(msg)
 print('OTP Encrypted:', ct.hex())
 print('OTP Decrypted:', xor(key, ct).decode())


 print("""
 ################################################################################
 #                                                                              #
 #                        STREAM CIPHERS & BLOCK CIPHERS                        #
 #                                                                              #
 ################################################################################
 """)

 # We can fix the key size issue by using a pseudorandom generator (PRG) to
 # deterministically expand a small secret seed into an almost-unlimited amount
 # of *key-stream*:
 def prg(seed: bytes, num_bytes: int) -> bytes:
    """Generates $amount bytes of pseudorandom key-stream from $seed."""


 # As a type alias:
 PRG = Callable[[bytes, int], bytes]


 # We can then use the PRG key-stream instead of the random key in OTP to create
 # *stream cipher*:
 class StreamCipher(Cipher):
    def __init__(self, prg: PRG):
        self.prg = prg

    def encrypt(self, key: bytes, plaintext: bytes) -> bytes:
        key_stream = self.prg(key, len(plaintext))
        return xor(key_stream, plaintext)

    def decrypt(self, key: bytes, ciphertext: bytes) -> bytes:
        return self.encrypt(key, ciphertext)  # decrypt == encrypt


 # So, how do we write that secure PRG? There are many ways, but the way we'll
 # look at first uses a different primitive called a *block cipher*. Whereas a
 # stream cipher can encrypt any amount of data, a bit at a time, a block cipher
 # can only encrypt a fixed-length "block" of data:

 class BlockCipher(Cipher):
    def block_size(self) -> int:
        """Returns the block size in bytes."""


 # The most widespread and famous block cipher is AES, the Advanced Encryption
 # Standard. AES was designed by two Belgian cryptographers and selected by
 # the US National Institute of Standards and Technology (NIST) after an
 # international open competition, as all good crypto should be. AES operates
 # on 16 byte (128 bit) blocks and uses keys of either 128, 192, or 256 bits
 # (but nobody uses 192-bit keys).

 class AesCipher(BlockCipher):
    def block_size(self) -> int: return 16

    def encrypt(self, key: bytes, plaintext: bytes) -> bytes:
        assert len(plaintext) == self.block_size()
        aes = AES.new(key, AES.MODE_ECB)  # "ECB" is raw AES
        return aes.encrypt(plaintext)

    def decrypt(self, key: bytes, ciphertext: bytes) -> bytes:
        assert len(ciphertext) == self.block_size()
        aes = AES.new(key, AES.MODE_ECB)
        # Unlike a stream cipher, encryption and decryption are very different
        return aes.decrypt(ciphertext)


 cipher = AesCipher()
 key = cipher.key_gen()
 ct = cipher.encrypt(key, b'YELLOW SUBMARINE')  # Exactly 16 bytes
 print('AES Encrypted:', ct.hex())
 pt = cipher.decrypt(key, ct)
 print('AES Decrypted:', pt.decode())


 # Most messages are more (or less) than exactly 16 bytes in size, though. To
 # encrypt arbitrary messages with a block cipher you need to use a *mode of
 # operation*. The absolute worst thing you can do is to just encrypt each
 # block of 16 bytes one at a time. This is ECB mode and is just awful. Google
 # "ECB Penguin" if you're unconvinced.
 #
 # We can construct a PRG out of a block cipher to turn it into a stream cipher,
 # by simply encrypting an incrementing counter. This is known as Counter Mode,
 # or CTR for short:
 def ctr_prg(seed: bytes, num_bytes: int) -> bytes:
    aes = AesCipher()
    bs = aes.block_size()
    ks = bytearray()
    for i in range(math.ceil(num_bytes / bs)):
        ctr = i.to_bytes(bs)
        ks += aes.encrypt(key=seed, plaintext=ctr)
    return ks


 cipher = StreamCipher(ctr_prg)
 ct = cipher.encrypt(key, b'Now we can encrypt messages of arbitrary length!')
 print('CTR Encrypted:', ct.hex())
 pt = cipher.decrypt(key, ct)
 print('CTR Decrypted:', pt.decode())


 # Now, what I just showed you illustrates the idea, but is really inefficient.
 # Your crypto library probably has built-in support for CTR mode:
 cipher = AES.new(key, AES.MODE_CTR)
 ct = cipher.encrypt(b'Much faster this way')
 print('Faster:', ct.hex())


 print("""
 ################################################################################
 #                                                                              #
 #                       TWO-TIME PADS & IND-CPA SECURITY                       #
 #                                                                              #
 ################################################################################
 """)

 # Look what happens if we use the key for two messages:
 cipher = StreamCipher(ctr_prg)
 ct1 = cipher.encrypt(key, b'a b c d ')
 ct2 = cipher.encrypt(key, b' A B C D')

 diff = xor(ct1, ct2)
 print('XOR of two ciphertexts:', diff.decode())  # => AaBbCcDd


 # Same key => same key stream.
 # If you XOR two ciphertexts together the key-stream cancels out. You're left
 # with the XOR of the two plaintexts, which typically leaks a lot of
 # information about the original messages. This is known as a two-time pad.
 # The more you reuse the key, the more information is leaked: typically the
 # entire messages can be recovered pretty quickly.
 #
 # To prevent this attack, we have to use an encryption scheme that is
 # *randomized*: encrypting the same message twice with the same key results in
 # different ciphertexts each time. A cipher that is secure against two-time
 # pads is said to have IND-CPA security (aka semantic security):
 #
 #   - INDistinguishability (the security goal): an attacker cannot tell which
 #     of two equal-length messages (of their choice) has been encrypted.
 #   - Under a Chosen Plaintext Attack (the attacker model): the attacker is
 #     allowed to encrypt any messages they like under the same key and see the
 #     resulting ciphertext.
 #
 # It's a good idea to use types/interfaces to indicate what security properties
 # different constructions have. See Google's Tink library for a good example.
 class IndCpaSecureCipher(Cipher):
    """A cipher that achieves IND-CPA security."""


 # One way to achieve IND-CPA security is to introduce a *nonce* (number used
 # once) into a stream cipher. You can think of the nonce as like an offset into
 # the key-stream: the cipher "seeks" to that point in the key-stream and starts
 # generating key material from that point:
 def ctr_prg_with_nonce(seed: bytes, nonce: int, amount: int) -> bytes:
    aes = AesCipher()
    ks = bytearray()
    for i in range(math.ceil(amount / 16)):
        ctr = (nonce + i).to_bytes(16)
        ks += aes.encrypt(seed, ctr)
    return ks


 PRGWithNonce = Callable[[bytes, int, int], bytes]


 # We can then encrypt multiple messages by generating a random nonce for each
 # message. We can simply prefix the nonce to the message (it doesn't need to be
 # kept secret).
 class RandomizedStreamCipher(IndCpaSecureCipher):
    def __init__(self, prg: PRGWithNonce, nonce_size_bytes: int = 16):
        self.prg = prg
        self.nonce_size = nonce_size_bytes

    def encrypt(self, key: bytes, plaintext: bytes) -> bytes:
        nonce = os.urandom(self.nonce_size)  # non-blocking!
        ks = self.prg(key, int.from_bytes(nonce), len(plaintext))
        return nonce + xor(ks, plaintext)

    def decrypt(self, key: bytes, ciphertext: bytes) -> bytes:
        assert len(ciphertext) >= self.nonce_size
        nonce, ciphertext = (ciphertext[:self.nonce_size],
                             ciphertext[self.nonce_size:])
        ks = self.prg(key, int.from_bytes(nonce), len(ciphertext))
        return xor(ks, ciphertext)


 class RandomizedCtrMode(RandomizedStreamCipher):
    def __init__(self):
        super().__init__(ctr_prg_with_nonce, 16)


 cipher = RandomizedCtrMode()
 ct1 = cipher.encrypt(key, b'a b c d ')
 ct2 = cipher.encrypt(key, b' A B C D')

 diff = xor(ct1[16:], ct2[16:])  # skip the nonce prefix
 print('XOR of two ciphertexts with nonce:', diff)  # => random garbage


 # There is a small chance that you will generate the same random nonce twice.
 # The more messages you encrypt, the more likely this is to occur (google
 # "birthday paradox": it happens sooner than you might guess). NIST recommend
 # to never exceed a 2^-32 (1 in 4 billion) chance of a nonce "collision". For
 # an n-bit nonce, that occurs after 2^((n-32)/2) messages.

 def msg_limit(nonce_size_bytes: int) -> int:
    return 2 ** ((nonce_size_bytes * 8 - 32) // 2)


 # We can print out the maximum number of messages that you can encrypt with a
 # nonce of a given size for typical nonce sizes:
 print()
 print('Nonce Size (bytes) |                   Max. # Messages')
 print('===================+==================================')
 for i in [8, 12, 16, 24]:
    print(f'{i:18d} | {msg_limit(i):33,}')
 print()

 # Nonce Size (bytes) |                   Max. # Messages
 # ===================+==================================
 #                  8 |                            65,536
 #                 12 |                     4,294,967,296
 #                 16 |               281,474,976,710,656
 #                 24 | 1,208,925,819,614,629,174,706,176
 #
 # For AES-CTR with 16-byte nonces, this is 281 trillion messages, which is good
 # for most things. *However*, because of the way we constructed the nonce, this
 # is actually 281 trillion *blocks*, not messages. That's still a lot. Some
 # ciphers segment the nonce to have a random component and then an incremental
 # block counter (reset to 0 for each message). But this is generally only a good
 # idea if your messages are very long. The state of the art for random nonces
 # is a cipher called XSalsa20 and its replacement XChaCha. Both use 24-byte
 # nonces *and* have a separate 64-bit block counter, allowing an almost
 # unlimited number of messages to be encrypted with each being of almost
 # unlimited size. The downside of course is that all your messages get 24 bytes
 # larger to store the random nonce.


 class XChaCha(IndCpaSecureCipher):
    def encrypt(self, key: bytes, plaintext:  bytes) -> bytes:
        nonce = os.urandom(24)
        return nonce + ChaCha20.new(key=key, nonce=nonce).encrypt(plaintext)

    def decrypt(self, key: bytes, ciphertext: bytes) -> bytes:
        assert len(ciphertext) >= 24
        nonce = ciphertext[:24]
        return ChaCha20.new(key=key, nonce=nonce).decrypt(ciphertext[24:])

    def key_gen(self) -> bytes:
        return os.urandom(32)  # 256-bit keys


 cipher = XChaCha()
 key = cipher.key_gen()
 ct = cipher.encrypt(key, b'Cha-Cha-Cha!')
 print('XChaCha Encrypted:', ct.hex())
 pt = cipher.decrypt(key, ct)
 print('XChaCha Decrypted:', pt.decode())

 #
 # Does the nonce have to be random? Couldn't we use a simple counter or
 # something? Yes, technically you could just use a counter, but in most cases
 # that is harder than it looks: consider sharing a key between multiple servers,
 # dealing with server restarts, VM resumption, etc. Using a counter makes your
 # crypto stateful, with all the usual downsides of that, plus the fact that if
 # your counter ever goes backwards then you end up exposing messages. A random
 # nonce is much safer, at the cost of having to transmit the nonce as part of
 # the message (known as *ciphertext expansion*). Some protocols like TLS use a
 # stateful nonce *per session* calculated by both sides (and so not
 # transmitted). TLS negotiates a fresh key for each session, so the nonce
 # doesn't need to be synchronised between servers. Even then, there is a history
 # of nonce-reuse attacks on those kinds of protocols (see e.g. the KRACK attacks
 # against Wifi encryption - https://krackattacks.com).


 print("""
 ################################################################################
 #                                                                              #
 #                                   PADDING                                    #
 #                                                                              #
 ################################################################################
 """)

 # A nonce-based stream cipher is pretty good as far as secrecy is concerned, but
 # there is one significant leak of information still: the length of the message
 # is not hidden at all. This can sometimes be a big leak, e.g. one study found
 # you could uniquely identify what someone is watching on Netflix based on the
 # lengths of the (encrypted) chunks of streaming video data. This leak is even
 # more exploitable when encryption is combined with compression: compression
 # changes the length of a message based on the content. This was exploited in
 # the CRIME and BREACH attacks to extract secrets from HTTP responses. (BREACH
 # is still largely unmitigated, by the way).
 #
 # The solution is to add padding to a message before encrypting it (after
 # compression, if you must). There are many padding schemes in use. A common
 # (but bad) one is PKCS#7 padding (PKCS = Public Key Cryptography Standard, a
 # series of "standards" published by RSA Corp), where e.g. to add 7 bytes of
 # padding you'd add the byte 0x07 repeated 7 times. This limits you to a max of
 # 255 bytes of padding. A simpler padding scheme is given in ISO/IEC 7816-4:
 # simply append a single 1 bit and then as many 0 bits as you need:
 def pad_iso7816(msg: bytes, pad_bytes: int) -> bytes:
    return msg + b'\x80' + b'\x00' * (pad_bytes - 1)

 def unpad_iso7816(padded: bytes) -> bytes:
    unpadded = padded.rstrip(b'\x00')
    if unpadded[-1] != 0x80:
        raise ValueError
    return unpadded[:-1]

 # How much padding to add? This is always a trade-off between bloating your
 # message size vs hiding the true length of the message. A common strategy is to
 # pad to the next multiple of some block size (which may or may not correspond
 # to the cipher block size), but this only very slightly reduces the information
 # leakage:
 def pad_to_block_size(msg_len: int, block_size: int = 16) -> int:
    """Padding strategy that pads to the next multiple of the block size."""
    return msg_len + (block_size - (msg_len % block_size))


 # A good general-purpose strategy is [Padmé](https://lbarman.ch/blog/padme/),
 # which adds at most 12% overhead. IMO though, Padmé is not very good for very
 # small messages,
 def padme(msg_len: int, min_size: int = 100) -> int:
    """Returns the length to pad a message to."""
    if msg_len <= min_size:
        return min_size
    e = math.floor(math.log2(msg_len))
    s = math.floor(math.log2(e)) + 1
    mask = (1 << (e - s)) - 1
    return (msg_len + mask) & ~mask


 PaddingStrategy = Callable[[int], int]
 def pad(msg: bytes, strategy: PaddingStrategy = padme) -> bytes:
    return pad_iso7816(msg, strategy(len(msg)) - len(msg))

 def unpad(padded: bytes) -> bytes:
    return unpad_iso7816(padded)

 padded = pad(b'x' * 12345)
 unpadded = unpad(padded)
 print(f'Length: padded={len(padded)}, unpadded={len(unpadded)}')

 # Note though that Padmé adds almost no padding to very short strings, so is not
 # suitable for obscuring the length of passwords or other short values. For
 # such cases you may want to include a minimum length, e.g. 100 bytes.

 print("""
 ################################################################################
 #                                                                              #
 #                 CIPHERTEXT MALLEABILITY AND IND-CCA SECURITY                 #
 #                                                                              #
 ################################################################################
 """)

 # At this point we have pretty good *secrecy*, but our ciphers provide no
 # *integrity*: ciphertexts are malleable. For example, let's say Alice is
 # sending a message to her bank to pay Bob a lot of money:
 msg = b'Pay Bob $100,000'
 cipher = RandomizedStreamCipher(ctr_prg_with_nonce)
 ct = cipher.encrypt(key, msg)

 # Eve intercepts the message, guesses (some of) the contents, and so can tamper
 # with the ciphertext to change the payee:
 # First she XORs the appropriate part of the message (taking into account the
 # 16-byte nonce prefix) with "Bob" to cancel out his name:
 without_bob = ct[:20] + xor(ct[20:23], b'Bob') + ct[23:]
 # Then she XORs the same part with "Eve" to add her name instead:
 with_eve = (without_bob[:20] +
            xor(without_bob[20:23], b'Eve') +
            without_bob[23:])

 # Now, when the bank decrypts the message they get "Pay Eve $100,000":
 print('Bank received:', cipher.decrypt(key, with_eve).decode())

 # Sometimes this tampering can not only lead to loss of integrity, but also a
 # loss of secrecy. For example, the tampering may cause parsing of the message
 # to subsequently fail with an error message that reveals details of the
 # surrounding original message. A common version of this is known as a Padding
 # Oracle attack, because it exploits errors when checking any padding after
 # decryption. (Some encryption modes require padding). The attacks can be
 # devastating, allowing entire messages to be decrypted one byte at a time.
 # A cipher that is immune to such attacks is said to have IND-CCA security
 # (or IND-CCA2 security more specifically): indistinguishability under a chosen
 # ciphertext attack. IND-CCA security implies IND-CPA security, but also that
 # ciphertexts are non-malleable.


 class IndCcaSecureCipher(IndCpaSecureCipher):
    """A cipher that provides IND-CCA2 security."""


 print("""
 ################################################################################
 #                                                                              #
 #                            MESSAGE AUTHENTICATION                            #
 #                                                                              #
 ################################################################################
 """)

 # Encryption does not protect integrity. For that we need a different construct:
 # a Message Authentication Code (MAC). A MAC takes a secret key and a message
 # and computes a short fixed-length authentication tag. The tag is attached to
 # the message and the receiver computes the same tag and checks that they match.
 # The usual security goal for MACs is EUF-CMA: Existential UnForgeability under
 # a Chosen Message Attack. That means it's impossible for an attacker to find
 # a valid tag for a message that they haven't already seen a tag for (even if
 # they get to ask for tags for arbitrary messages).
 # NB: CMA == CPA. Different terms, same attacker model.
 class MAC:

    def key_gen(self) -> bytes:
        """Generates a random key for use with this MAC."""
        return os.urandom(16)

    def tag_len_bytes(self) -> int:
        """Returns the size of the MAC tag, in bytes."""

    def compute_tag(self, key: bytes, msg: bytes) -> bytes:
        """Computes a MAC tag and returns it."""

    def verify(self, key: bytes, msg: bytes, tag: bytes) -> bool:
        """Verifies that the given MAC tag matches this message."""
        expected_tag = self.compute_tag(key, msg)
        return compare_digest(expected_tag, tag)


    # Although the above two methods are the usual API of a MAC, the following
    # is IMO a more fail-safe and useful interface:
    def seal(self, key: bytes, msg: bytes) -> bytes:
        """Computes a MAC tag and appends it to the message."""
        return msg + self.compute_tag(key, msg)

    def unseal(self, key: bytes, tagged_msg: bytes) -> bytes:
        """Verifies the MAC tag on the message and, if valid, returns the
        message. Otherwise, raises AuthFailure."""
        tlen = self.tag_len_bytes()
        assert len(tagged_msg) >= tlen
        msg, tag = tagged_msg[:-tlen], tagged_msg[-tlen:]
        if not self.verify(key, msg, tag):
            raise AuthFailure
        return msg

 class AuthFailure(Exception):
    """Indicates a MAC authentication failure."""

 print("""
 ################################################################################
 #                                                                              #
 #                                TIMING ATTACKS                                #
 #                                                                              #
 ################################################################################
 """)

 # The MAC.verify() method above uses the function `compare_digest` from the
 # `hmac` module to compare the tag we've just computed to the one that was
 # supplied. A normal equality function (such as Python's `==`) works like the
 # following:
 def unsafe_equals(xs: bytes, ys: bytes) -> bool:
    if len(xs) != len(ys):
        return False
    for x, y in zip(xs, ys):
        if x != y:
            return False
    return True

 # This method returns `False` as soon as it finds any difference between the
 # two values. This means there is a slight timing difference between if the
 # first byte is different compared to if the second byte is different, and so
 # on. An attacker can then try varying the first byte of a forged MAC tag until
 # the verification takes very slightly longer, which tells them the correct
 # value for the first byte, and then move onto the second.


 from timeit import timeit
 print('Timing unsafe equals:')
 print('Index of first diff | Time (in microseconds)')
 print('====================+=======================')
 xs = bytes([42] * 100)
 for i in range(0, 100, 10):
    # Change byte at index i:
    ys = xs[:i] + b'\43' + xs[i+1:]
    print(f'{i:19d} | {timeit(lambda: unsafe_equals(xs, ys), number=100_000):22.5f}')

 # See https://codahale.com/a-lesson-in-timing-attacks/ for a more detailed
 # explanation. (Just bear in mind that Java's MessageDigest.isEqual is fixed
 # in all recent versions of Java, so you _should_ use it).
 #
 # These differences are tiny, but if there is one thing modern science (and
 # modern statistics) is good at, it is measuring tiny differences. This 2009
 # paper claims to be able to measure timing differences of 15–100 microseconds
 # over the internet, and as little as 100ns from the local network:
 # https://www.cs.rice.edu/~dwallach/pub/crosby-timing2009.pdf
 #
 # You may question how practical such attacks are, especially with modern CPU
 # and memory architectures, with cache lines and other details coming into play.
 # The attacks also generally require many thousands of measurements.
 # But regardless of whether you think such attacks are feasible, the fix is
 # very simple: use an equality method that takes the same amount of time
 # regardless of the specific inputs passed to it:

 def safe_equals(xs: bytes, ys: bytes) -> bool:
    if len(xs) != len(ys):  # Assume length check is constant-time
        return False
    accum = 0
    for x, y in zip(xs, ys):
        accum |= x ^ y  # x^y will be 0 if and only if x==y
    return accum == 0

 print('Timing safe equals:')
 print('Index of first diff | Time (in microseconds)')
 print('====================+=======================')
 for i in range(0, 100, 10):
    ys = xs[:i] + b'\43' + xs[i+1:]
    print(f'{i:19d} | {timeit(lambda: safe_equals(xs, ys), number=100_000):22.5f}')

 # This is essentially what hmac.compare_digest does.
 #
 # Timing attacks are an example of a *side-channel attack*. Such attacks try to
 # extract information by observing some aspect of how cryptographic computations
 # run on a real physical machine: timing, power consumption, variable CPU clock
 # speed, changes to caches, etc. If a computation makes some observable
 # difference then somebody has probably used it to extract a key from a
 # cryptographic implementation. Writing cryptographic code that is immune to
 # such attacks is an active area of research and requires great skill and
 # knowledge.

 print("""
 ################################################################################
 #                                                                              #
 #                            PSEUDORANDOM FUNCTIONS                            #
 #                                                                              #
 ################################################################################
 """)

 # A block cipher is an example of a *pseudorandom function* (PRF). If you don't
 # know the key, a PRF looks like a completely random mapping from inputs to
 # outputs. Any secure PRF is a secure MAC, so a block cipher is a secure MAC
 # too. But only for messages that are exactly one block in size.
 class PRF(MAC):
    """A secure pseudorandom function (PRF)."""


 class BlockCipherPRF(PRF):
    def __init__(self, cipher: BlockCipher):
        self.cipher = cipher

    def key_gen(self) -> bytes:
        return cipher.key_gen()

    def tag_len_bytes(self) -> int:
        return self.cipher.block_size()

    def compute_tag(self, key: bytes, msg: bytes) -> bytes:
        assert len(msg) == self.cipher.block_size(), '1 block exactly, please!'
        return self.cipher.encrypt(key, msg)

 # So, AES is itself a secure MAC, but only for messages of exactly 16 bytes in
 # size. We can extend this to arbitrarily-sized messages by using a mode of
 # operation known as cipher block chaining (CBC), in which we XOR the ciphertext
 # of the last block into plaintext of the next block. The last ciphertext block
 # becomes the tag:
 #
 #       PT Block 1          PT Block 2             PT Block 3
 #           |                   |                      |
 #           |         +-------> + (XOR)      +-------> + (XOR)
 #           v         |         v            |         v
 #      K-> AES        |    K-> AES           |    K-> AES
 #           |         |         |            |         |
 #           +---------+         +------------+         +-----> Tag
 #
 # This is known as CBC-MAC. (Note: in CBC *encryption* you XOR a random block,
 # known as the IV, into the first plaintext block. In CBC-MAC you absolutely
 # shouldn't do that. See https://cryptopals.com/sets/7/challenges/49).
 # CBC-MAC is a variable-length-input PRF (vPRF) and a secure MAC.


 class CbcMac(PRF):
    ZERO_IV = b'\0' * 16

    def tag_len_bytes(self) -> int:
        return 16

    def compute_tag(self, key: bytes, msg: bytes) -> bytes:
        cbc = AES.new(key, AES.MODE_CBC, iv=self.ZERO_IV)
        last_block = cbc.encrypt(pad(msg, strategy=pad_to_block_size))[-16:]
        return last_block


 mac = CbcMac()
 mac_key = mac.key_gen()
 msg = b'Pay Bob $100,000'
 tag = mac.compute_tag(mac_key, msg)
 # Attempt to tamper with the message
 tampered = b'Pay Eve $1 BILLION and ' + msg
 if not mac.verify(mac_key, tampered, tag):
    print('Tampering detected!')


 print("""
 ################################################################################
 #                                                                              #
 #                           LENGTH EXTENSION ATTACKS                           #
 #                                                                              #
 ################################################################################
 """)

 # CBC-MAC is vulnerable to length extension attacks: given the tag on one
 # message, along with an "oracle" that will compute tags on arbitrary messages,
 # you can compute a new tag for a message that is an extension of the first:
 #
 #   Pay Bob $10 --> Pay Bob $100,000
 #
 # This is because the tag on the first message is equivalent to an intermediate
 # state for the second message. (It is complicated by block boundaries and
 # padding).
 #
 # I'm not going to go into the details of how you do this, as it's a little bit
 # complicated (and TBH, a little bit contrived for CBC-MAC). See
 # https://cryptopals.com/sets/7/challenges/49 again for some hints on what an
 # attack looks like in practice.
 #
 # Length extension attacks are easy to prevent. We'll show two approaches here.
 # In the first case, we use some unambiguous encoding such that one message is
 # never a prefix of another ("prefix-free"). The simplest approach is to just
 # prefix the message with its length:

 class PrefixFreeCbcMac(CbcMac):
    def compute_tag(self, key: bytes, msg: bytes) -> bytes:
        prefix = len(msg).to_bytes(8)  # Use fixed-size encoding of length itself
        return super().compute_tag(key, prefix + msg)

 # The second approach is to (re-)encrypt the final tag using AES with a separate
 # key. This is known as ECBC-MAC (Encrypted CBC-MAC):

 class EcbcMac(CbcMac):
    def key_gen(self) -> bytes:
        # Generate two independent keys and concatenate
        return super().key_gen() + super().key_gen()

    def compute_tag(self, key: bytes, msg: bytes) -> bytes:
        k1, k2 = split_in_half(key)
        tag = super().compute_tag(k1, msg)
        return AesCipher().encrypt(k2, tag)

 # It is essential to the security that k1 != k2. Principle: never use a key for
 # more than one purpose, it almost always leads to doom.

 def split_in_half(x: bytes) -> tuple[bytes, bytes]:
    """Utility function that splits some bytes in half."""
    mid = len(x) // 2
    return x[:mid], x[mid:]

 # A more advanced version of ECBC-MAC, known as CMAC, uses 3 keys to avoid
 # having to add any padding if the message is already a multiple of the block
 # size.

 print("""
 ################################################################################
 #                                                                              #
 #                            HASH FUNCTIONS & HMAC                             #
 #                                                                              #
 ################################################################################
 """)

 # In practice, CBC-MAC is rarely used when a standalone MAC is required. The
 # most widely used MAC is probably HMAC, which uses a cryptographic hash
 # function. You can think of a hash function as a bit like a PRF with a fixed
 # key: it looks a lot like a random function. However, unlike a PRF, a hash
 # function doesn't use a key at all (at least, not by default).

 HashFunction = Callable[[bytes], bytes]

 # Secure hash functions include the SHA-2 family: SHA-256, SHA-512 and friends,
 # and the newer SHA-3 (which has variants named SHA3-256, SHA3-512, etc.).
 # SHA-3 has a lovely design, but is slow in software and with only limited
 # hardware support currently, so stick with SHA-2 for now.
 # Avoid older algorithms like MD5 or SHA-1, which are broken.


 # SHA-256 is widely used and secure:
 def sha256(data: bytes) -> bytes:
    return hashlib.sha256(data).digest()


 print('SHA-256:', sha256(b'Some example string').hex())

 # Cryptographic hash functions are designed to have three security properties:
 #   - Collision Resistance: it must be insanely hard to find two distinct inputs
 #       that have the same hash, i.e., H(x) = H(y).
 #   - Preimage Resistance: given a hash output y=H(x) it should be insanely hard
 #       to find x. (Assuming x is not easy to guess in the first place).
 #   - Second Preimage Resistance: given a pair x, y=H(x), it should be
 #       impossible to find another x' (!= x) such that H(x') = y.
 # The fact that secure hash functions exist at all and do all this without a
 # key is kind of amazing.
 #
 # Due to the birthday problem that we talked about earlier, collision resistance
 # requires a hash function output to be twice the size you'd expect. For
 # example, SHA-256 has a 256-bit output to provide 128-bit security against
 # collision attacks (i.e., it should take around 2^128 guesses to find a
 # collision). Preimage and second preimage resistance are easier, so usually
 # only need n-bit output to ensure n-bit security.

 # Many hash functions, like SHA-2, are also vulnerable to length-extension
 # attacks: given H(x) you can compute H(x + some_suffix), without knowing x.
 # In particular, H(secret + data) is *not* a secure MAC in general. (It is for
 # newer hash functions like SHA-3). Likewise, H(data + secret) is also not
 # generally secure, for more subtle reasons.


 # You can build secure a PRF/MAC out of a hash function using HMAC. HMAC looks
 # like this:
 #   HMAC(k, m) = H(k1 + H(k2 + m))
 # where k1 and k2 are derived from k in a particular way:
 class HMAC(PRF):
    """A toy implementation of HMAC, to demonstrate how it works. Use your
    crypto library's real implementation, not this."""
    def __init__(self, hash: HashFunction, block_size: int = 64):
        self.hash = hash
        self.block_size = block_size
        self.tag_len = len(hash(b''))

        # Fixed constants used to calculate two distinct sub-keys. They are
        # chosen carefully to ensure a certain separation between the keys.
        # https://crypto.stackexchange.com/questions/20695/hmac-ipad-and-opad-choice
        self.IPAD = bytes([0x36] * block_size)
        self.OPAD = bytes([0x5C] * block_size)

    def tag_len_bytes(self) -> int:
        return self.tag_len

    def compute_tag(self, key: bytes, msg: bytes) -> bytes:
        if len(key) > self.block_size:
            key = self.hash(key)
        elif len(key) < self.block_size:
            key += bytes([0] * (self.block_size - len(key)))
        inner_key = xor(self.IPAD, key)
        outer_key = xor(self.OPAD, key)
        return self.hash(outer_key + self.hash(inner_key + msg))


 hmac = HMAC(sha256, 64)
 key = os.urandom(64)
 print('HMAC:', hmac.compute_tag(key, b'HMAC, World!').hex())

 # For MAC/PRF usage, we only need preimage resistance (to keep the key
 # secret) and 2nd preimage resistance (to stop an attacker using a tag for
 # another message). It is therefore common to *truncate* the HMAC output to
 # half size to save space. For example, taking the first 16 bytes of the output
 # of HMAC-SHA-256 is still a perfectly secure MAC/PRF. Any PRF is still secure
 # when its output is truncated, so long as you do not make it so small that
 # brute-force attacks become possible. For general usage, 16 bytes is a good
 # tag size. Some constrained environments use smaller tags of 8 bytes, but
 # ensure you have rate limiting to slow down forgery attempts in this case.

 print("""
 ################################################################################
 #                                                                              #
 #                           AUTHENTICATED ENCRYPTION                           #
 #                                                                              #
 ################################################################################
 """)

 # Combining a cipher with a MAC gives us *Authenticated Encryption* (AE). AE is
 # IND-CCA secure:


 class AuthenticatedEncryption(IndCcaSecureCipher):
    """A cipher that provides authenticated encryption."""


 # Strictly speaking, AE is stronger than IND-CCA as it also provides data origin
 # authentication. IND-CCA just says that if someone tampers with the message
 # they are guaranteed not to learn anything about the plaintext. AE (and MAC)
 # says that only holders of the MAC key can produce a valid message in the first
 # place. So if the message authenticates you know it came from a key-bearer
 # (known as *data origin authentication*). Of course, if the key is not kept
 # secure then all bets are off...


 # There are lots of bad ways to combine a MAC and a cipher. Moxie's
 # Cryptographic Doom Principle (google it) says that if you have to do anything
 # with a message before you validate the MAC tag, it will inevitably lead to
 # doom. Doom! So the approach that you should use is called Encrypt-then-MAC
 # (EtM): first encrypt the message, and then MAC it. This ensures that the first
 # thing we do during decryption is to verify the MAC.
 class EncryptThenMac(AuthenticatedEncryption):
    # NB: we only need the cipher to be IND-CPA secure
    def __init__(self, cipher: IndCpaSecureCipher, mac: MAC):
        self.cipher = cipher
        self.mac = mac

    def key_gen(self) -> bytes:
        mac_key = mac.key_gen()
        enc_key = cipher.key_gen()
        return mac_key + enc_key

    def encrypt(self, key: bytes, plaintext:  bytes) -> bytes:
        mac_key, enc_key = split_in_half(key)  # Assumes keys are same length
        ct = cipher.encrypt(enc_key, plaintext)
        return mac.seal(mac_key, ct)

    def decrypt(self, key: bytes, ciphertext: bytes) -> bytes:
        mac_key, enc_key = split_in_half(key)
        ct = mac.unseal(mac_key, ciphertext)
        return cipher.decrypt(enc_key, ct)


 # Note that the MAC tag includes the nonce. This is not always required, but
 # some modes (like CBC encryption) are insecure if this is not included in the
 # MAC calculation. This happens automatically due to the way we designed our
 # ciphers. Secondly, you can see why I like the seal()/unseal() API for MACs.
 cipher = RandomizedStreamCipher(ctr_prg_with_nonce)
 mac = PrefixFreeCbcMac()
 ccm = EncryptThenMac(cipher, mac)
 key = ccm.key_gen()
 ct = ccm.encrypt(key, b'Hello, cruel world')
 print('AE Encrypted:', ct.hex())
 pt = ccm.decrypt(key, ct)
 print('AE Decrypted:', pt.decode())


 print("""
 ################################################################################
 #                                                                              #
 #                            AEAD & DEDICATED MODES                            #
 #                                                                              #
 ################################################################################
 """)

 # Sometimes our messages have some associated data that we want to protect but
 # not encrypt. This could be a header with routing information in it (just don't
 # look at the headers before you validate the MAC tag: Doom!). To support this
 # case, most AE ciphers implement AEAD: Authenticated Encryption with Associated
 # Data. This extends the AE interface to also allow you to specify the AD to be
 # authenticated but not encrypted:

 class AEAD(AuthenticatedEncryption):
    def encrypt_with_ad(self, key: bytes, plaintext: bytes, ad: bytes) -> bytes:
        """Encrypts and MACs the plaintext and MACs the ad."""
    def decrypt_with_ad(self, key: bytes, ciphertext: bytes, ad: bytes) -> bytes:
        """Decrypts and verifies the ciphertext and verifies the ad."""

    def encrypt(self, key: bytes, plaintext: bytes) -> bytes:
        return self.encrypt_with_ad(key, plaintext, b'')

    def decrypt(self, key: bytes, ciphertext: bytes) -> bytes:
        return self.decrypt_with_ad(key, ciphertext, b'')

 # AEAD has become _the_ standard API for symmetric ciphers. There are now
 # various dedicated AEAD modes that provide this API directly. Some are far more
 # efficient than the simple schemes I've shown so far.
 #
 # - CCM mode (CTR with CMAC) uses AES in CTR mode and AES-CMAC.
 # - GCM mode (Galois/Counter Mode) uses AES in CTR mode and a very fast MAC
 # based on Galois Field arithmetic. Usually the fastest mode on most processors
 # due to hardware acceleration.
 # - ChaCha20-Poly1305 ("ChaPoly") uses the ChaCha20 stream cipher and the
 # Poly1305 MAC, which is also a fast MAC based on finite field arithmetic. It is
 # fast and easy to implement securely in pure software, so is good on low-end
 # mobile devices that lack AES hardware acceleration.

 # To give you an idea of how these fast MACs work, here is a toy implementation
 # of Poly1305. It takes a 32-byte key that consists of two 128-bit integers: r
 # and s. Then it splits the message up into 128-bit blocks and treats the whole
 # thing as a big polynomial: t = s + r*b0 + r^2*b1 + r^3*b3 + ... mod p, where
 # p is the prime number 2^130 - 5. It is only secure if the key is used for
 # one message only. (Similar to a one-time pad, polynomial MACs are
 # unconditionally secure under that restriction).
 def toy_poly1305(key: bytes, msg: bytes) -> bytes:
    """A toy implementation of Poly1305 for learning purposes. Vulnerable to
    timing side-channel attacks."""
    r_bytes, s_bytes = split_in_half(key)
    r = clamp(int.from_bytes(r_bytes, 'little'))
    s =       int.from_bytes(s_bytes, 'little')

    a = 0
    p = 2**130 - 5

    for i in range(0, len(msg), 16):
        block = msg[i:i+16] + b'\x01'
        n = int.from_bytes(block, 'little')
        a = (a + n) * r % p

    a += s
    return a.to_bytes(17, 'little')[:16]

 def clamp(r: int) -> int:
    """Clears some bits in the representation of r to limit the need for carries
    when doing arithmetic over 32-bit limbs."""
    return r & 0x0ffffffc0ffffffc0ffffffc0fffffff

 # https://www.rfc-editor.org/rfc/rfc8439#section-2.5.2
 key = bytes.fromhex(
    "85d6be7857556d337f4452fe42d506a80103808afb0db2fd4abff6af4149f51b")
 msg = b'Cryptographic Forum Research Group'
 expected_tag = bytes.fromhex("a8061dc1305136c6c22b8baf0c0127a9")
 tag = toy_poly1305(key, msg)
 assert tag == expected_tag

 #
 # The fast MACs used in these dedicated modes are based on Universal Hash
 # Functions (UHFs), which are very fast but *much more fragile* than PRF-based
 # MACs or HMAC. They are not suitable for use as standalone MACs outside an
 # AEAD mode.


 class GCM(AEAD):
    """AEAD cipher using AES in Galois/Counter Mode (GCM)."""
    def encrypt_with_ad(self, key: bytes, plaintext: bytes, ad: bytes) -> bytes:
        nonce = os.urandom(12)
        gcm = AES.new(key, AES.MODE_GCM, nonce=nonce)
        gcm.update(ad)
        return nonce + gcm.encrypt(plaintext) + gcm.digest()  # "digest" = MAC

    def decrypt_with_ad(self, key: bytes, ciphertext: bytes, ad: bytes) -> bytes:
        assert len(ciphertext) >= 16 + 12
        nonce, ct, tag = ciphertext[:12], ciphertext[12:-16], ciphertext[-16:]
        gcm = AES.new(key, AES.MODE_GCM, nonce=nonce)
        gcm.update(ad)
        return gcm.decrypt_and_verify(ct, tag)


 # Although associated data is sometimes used to authenticate a header, which is
 # sent along with the encrypted message, a better use is to "bind" a message to
 # some context that both Alice and Bob already know. For example, there are a
 # range of attacks that can occur in a messaging protocol between two parties:
 #
 #   - An attacker can re-arrange messages so that they arrive out of order.
 #   - They can take a message sent from Alice to Bob and *reflect* it back to
 #     Alice, so she thinks it came from Bob.
 #   - They can duplicate messages and *replay* them (perhaps at a later time).
 #   - They can drop some messages so that Bob never receives them, or truncate
 #     the flow of messages altogether (i.e. drop all messages after some point)
 #
 # To prevent these attacks, Alice and Bob can keep track of some context about
 # the state of communication and include it as associated data (without actually
 # transmitting it over the wire):

 # This is message 1 from Alice to Bob, and it is not the last message
 context = b'Alice->Bob msg:1 last:False'

 gcm = GCM()
 ct = gcm.encrypt_with_ad(key, b'Hello Bob', context)
 pt = gcm.decrypt_with_ad(key, ct, context)

 print('GCM Encrypted:', ct.hex())
 print('GCM Decrypted:', pt.decode())

 # Now, if an attacker attempts to interfere with messages, Alice and Bob can
 # (eventually) detect this. You can stop a *lot* of attacks by including
 # appropriate context like this.
 #
 # Another good use-case for AD is encrypting fields in a database. By including
 # the primary key (or rowid) and column name you can prevent "copy & paste"
 # attacks in which a valid encrypted value from one field is copied into
 # another. (Database encryption is a complex topic though and easy to screw up).

 print("""
 ################################################################################
 #                                                                              #
 #                  MISUSE RESISTANT AUTHENTICATED ENCRYPTION                   #
 #                                                                              #
 ################################################################################
 """)

 # Although dedicated modes like GCM are very fast, the fail *catastrophically*
 # if you reuse a nonce. Like all CTR-based modes, reusing a nonce results in a
 # two-time pad and so exposes the plaintext. However, it also allows an attacker
 # to recover the MAC key, so they can then tamper with any subsequent messages.
 # This then further allows the attacker to perform chosen ciphertext attacks and
 # potentially compromise future messages, even after the nonce reuse is fixed.
 # ChaPoly is a bit better, but still fails badly in nonce reuse, as do all
 # the modes we've seen so far. The KRACK attacks against Wifi encryption
 # exploited a weakness in the protocol to _force_ nonce reuse (resetting a
 # counter to 0: why you should use random nonces reason #4792...)
 #
 # There are some AEAD modes that protect against nonce reuse to the maximum
 # amount possible. These are known as Misuse Resistant Authenticated Encryption
 # modes (MRAE):


 class MRAE(AEAD):
    """An AEAD mode that provides maximum possible protection against nonce
    reuse."""


 # When given a unique nonce for every message, an MRAE mode behaves exactly like
 # a normal AEAD mode. However, when a nonce is reused, MRAE modes provide a
 # weaker security standard known as Deterministic Authenticated Encryption
 # (DAE). DAE guarantees that there is no loss of authentication security, and
 # the only loss of secrecy is that an attacker can now tell if the same message
 # has been encrypted twice with the same key, nonce, and associated data. (This
 # can still be a significant loss of security: DAE is not IND-CPA secure. You
 # should not think it is "ok" to reuse nonces with an MRAE mode).
 #
 # The original MRAE mode is known as Synthetic IV (SIV) mode. It is a
 # combination of a PRF (as the MAC) and a stream cipher, but in a different
 # construction than the normal Encrypt-then-MAC. First, the nonce, plaintext,
 # and associated data is run through the PRF to produce a tag, then that tag is
 # used as the nonce (IV) for the stream cipher.
 #
 # The standard SIV mode uses AES-CMAC as the PRF, and AES-CTR as the cipher. It
 # uses a neat construction to allow any number of inputs to the PRF, avoiding
 # the need for the encode_prf_input function we used.

 class AesSiv(MRAE):
    def key_gen(self) -> bytes:
        return os.urandom(64)  # Can be 32, 48, or 64 bytes

    def encrypt_with_ad(self, key: bytes, plaintext: bytes, ad: bytes) -> bytes:
        nonce = os.urandom(16)  # Could be any size
        cipher = AES.new(key, AES.MODE_SIV, nonce=nonce)
        cipher.update(ad)
        ct, tag = cipher.encrypt_and_digest(plaintext)
        return nonce + ct + tag

    def decrypt_with_ad(self, key: bytes, ciphertext: bytes, ad: bytes) -> bytes:
        nonce, ct, tag = ciphertext[:16], ciphertext[16:-16], ciphertext[-16:]
        cipher = AES.new(key, AES.MODE_SIV, nonce=nonce)
        cipher.update(ad)
        return cipher.decrypt_and_verify(ct, tag)


 cipher = AesSiv()
 key = cipher.key_gen()
 ct = cipher.encrypt_with_ad(key, b'Hello Bob', context)
 pt = cipher.decrypt_with_ad(key, ct, context)

 print('SIV Encrypted:', ct.hex())
 print('SIV Decrypted:', pt.decode())

 # The downside of SIV, and all MRAE modes, is that it requires two passes over
 # the plaintext: you cannot produce even a single bit of ciphertext until you
 # have processed the entire input with the PRF. It is therefore not suitable
 # for streaming encryption (which we'll say more about soon). A variant known as
 # AES-GCM-SIV replaces CMAC with GCM's fast MAC, and is much faster, but still
 # requires two passes over the input during encryption.
 #
 # A second drawback of SIV is that it violates Moxie's Cryptographic Doom
 # principle: you must decrypt the message before you can verify the MAC tag. If
 # the MAC verification fails, you should be careful to delete the decrypted
 # plaintext from memory and not process it in any way. The PyCryptodome library
 # used above does not release the plaintext in this case.

 print("""
 ################################################################################
 #                                                                              #
 #                                STREAMING AEAD                                #
 #                                                                              #
 ################################################################################
 """)

 # If you need to encrypt large volumes of data (more than a couple of MB), then
 # you don't want to encrypt it all at once with an AEAD cipher. Although this
 # can be made to work, and most crypto libraries offer a streaming API to
 # accomplish this, it has one huge downside: during decryption you do not know
 # whether the data has been tampered with until you get to the end and verify
 # the authentication tag. For example, if you are streaming gigabytes of data,
 # you either have to buffer it all up in memory until you've verified its
 # authenticity, or else you have to process it on-the-fly not knowing if it is
 # valid or not (and potentially have a big problem to clean up if it turns out
 # to have been manipulated). This latter problem is known as releasing
 # unverified plaintext.
 #
 # The solution is to break the message into chunks and encrypt each with the
 # AEAD. But if you do this naïvely, an attacker can still perform various
 # attacks: re-arranging chunks, deleting some chunks, duplicating others,
 # and maybe truncating the stream after some chunk. This is somewhat similar to
 # the situation with ECB mode for block ciphers: each individual chunk may be
 # securely encrypted, but the whole message is not. (In general, security
 # properties are not compositional: just because a secure construction is used
 # for part of a larger construction doesn't mean that security property will
 # carry over to the whole).
 #
 # We can fix this by linking the chunks together in a way that makes any
 # re-arrangement of them detectable. A standard construction for doing this is
 # called STREAM, and is implemented in Tink. The idea is to reserve parts of the
 # nonce used for encrypting each chunk to include a "chunk counter" that
 # indicates which chunk this is, and a "last chunk" bit that indicates if this
 # is the last chunk in the stream or not (to prevent truncation attacks).
 # Reserving part of the nonce in this way obviously reduces what is left to
 # hold a per-message random nonce, so it is better to use this construction with
 # an extended-nonce construction like XChaCha, as shown below. An alternative
 # would be to encode the same details into the associated data, as we discussed
 # earlier.

 class StreamingAEAD:
    def encrypt_chunk(self, key: bytes, chunk: bytes,
                      last: bool, assoc_data: bytes = b'') -> bytes:
        ...
    def decrypt_chunk(self, key: bytes, chunk: bytes,
                      last: bool, assoc_data: bytes = b'') -> bytes:
        ...


 class XChaChaPoly1305StreamingAEAD(StreamingAEAD):
    def __init__(self):
        self.chunk_counter = 0
        self.nonce_prefix = os.urandom(20)

    def encrypt_chunk(self, key: bytes, chunk: bytes,
                      last: bool, assoc_data: bytes = b'') -> bytes:
        cipher = self.__get_cipher(key, last)
        cipher.update(assoc_data)
        ct, tag = cipher.encrypt_and_digest(chunk)
        if self.chunk_counter == 1:
            ct = self.nonce_prefix + ct  # Prepend the random nonce prefix to
                                         # the first chunk
        return ct + tag

    def decrypt_chunk(self, key: bytes, chunk: bytes,
                      last: bool, assoc_data: bytes = b'') -> bytes:
        if self.chunk_counter == 0:
            self.nonce_prefix = chunk[:20]
            chunk = chunk[20:]
        cipher = self.__get_cipher(key, last)
        cipher.update(assoc_data)
        return cipher.decrypt_and_verify(chunk[:-16], chunk[-16:])

    def __get_cipher(self, key: bytes, last: bool) -> ChaCha20_Poly1305:
        last_chunk = bytes([last])
        nonce = self.nonce_prefix + self.chunk_counter.to_bytes(3) + last_chunk
        self.chunk_counter += 1
        if self.chunk_counter >= 2**24:
            raise OverflowError  # Won't fit in the 3 bytes we've reserved
        if last:
            self.chunk_counter = 2**24  # Prevent further encryption/decryption
        return ChaCha20_Poly1305.new(key=key, nonce=nonce)


 key = os.urandom(32)
 cipher = XChaChaPoly1305StreamingAEAD()

 ct1 = cipher.encrypt_chunk(key, b'This is the first chunk', last=False)
 ct2 = cipher.encrypt_chunk(key, b'This is the second chunk', last=False)
 ct3 = cipher.encrypt_chunk(key, b'This is the third and last chunk', last=True)

 print(ct1.hex(), ct2.hex(), ct3.hex())

 cipher = XChaChaPoly1305StreamingAEAD()
 pt1 = cipher.decrypt_chunk(key, ct1, last=False)
 pt2 = cipher.decrypt_chunk(key, ct2, last=False)
 pt3 = cipher.decrypt_chunk(key, ct3, last=True)

 print(pt1.decode())
 print(pt2.decode())
 print(pt3.decode())

 # The STREAM construction requires that a unique nonce is used for every
 # message. If a nonce repeats then all security is lost. Even if you use a MRAE
 # mode like SIV to encrypt each chunk, the resulting streaming AEAD is not
 # misuse-resistant (no streaming mode can be). An alternative mode, known as
 # CHAIN, provides stronger guarantees called OAE2 security: Online
 # Authenticated Encryption. However, unlike with MRAE, OAE2 schemes still lose
 # quite a lot of security under nonce reuse, and the CHAIN construction is much
 # less efficient than STREAM (in particular, chunks cannot be encrypted or
 # decrypted in parallel). *All* online encryption schemes are vulnerable to
 # certain chosen ciphertext attacks, similar to the BEAST attack against TLS,
 # when a nonce is reused. If you need misuse resistance you cannot have
 # streaming, so pick your poison.

 print("""
 ################################################################################
 #                                                                              #
 #                        RATCHETING AND FORWARD SECRECY                        #
 #                                                                              #
 ################################################################################
 """)

 # So far, we have assumed that a single, static, key is used for all messages
 # sent between Alice and Bob. This has the downside that if that key is ever
 # compromised then all previous messages can be immediately decrypted. Some
 # adversaries may be capable of storing lots of messages in the hope of later
 # obtaining access to the decryption key (known as a "store now, decrypt later"
 # attack). One solution to this is to regularly change the key, providing a
 # property known as *forward secrecy*. After the key is changed, the old key can
 # be deleted, protecting messages encrypted under the old key. (I don't know why
 # it's "forward" secrecy either: NIST call a similar property "backtracking
 # resistance", which is a bit more accurate).
 #
 # An even better idea is to automatically change the key at the start of every
 # communication session, or even after every single message exchanged (which is
 # what the Signal protocol does). I'm going to call this Perfect Forward Secrecy
 # (PFS), although in reality that term is used interchangeably with plain
 # forward secrecy. In symmetric cryptography, the only way to achieve PFS is
 # via a stateful mechanism known as *ratcheting*: after each message you
 # overwrite the key using a deterministic method that Alice and Bob can both
 # compute. For example, the [Noise Protocol Framework](
 # https://noiseprotocol.org/noise.html#cipher-functions) reserves a particular
 # nonce value (the largest possible) to be used for "re-keying" in this way:


 class RatchetingCipher:
    RATCHET_NONCE = bytes([0xFF] * 24)  # Largest possible XChaCha nonce

    def __init__(self, initial_key: bytearray):
        self.key = initial_key.copy()
        self.cipher = XChaChaPoly1305StreamingAEAD()

    def encrypt(self, plaintext: bytes, ad: bytes = b'') -> bytes:
        # Obviously, never print/log keys in real life...
        print('Encrypting with key:', self.key.hex())
        ct = self.cipher.encrypt_chunk(self.key, plaintext, False, ad)
        self.__ratchet()
        return ct

    def decrypt(self, ciphertext: bytes, ad: bytes = b'') -> bytes:
        print('Decrypting with key:', self.key.hex())
        pt = self.cipher.decrypt_chunk(self.key, ciphertext, False, ad)
        self.__ratchet()
        return pt

    def __ratchet(self):
        cipher = ChaCha20.new(key=self.key, nonce=self.RATCHET_NONCE)
        cipher.encrypt(bytes([0x00] * 32), output=self.key)  # Overwrites key

 key = bytearray(os.urandom(32))
 alice = RatchetingCipher(key)
 bob = RatchetingCipher(key)

 key[:] = b'\0' * len(key)  # Wipe original key once ciphers are initialized

 ct1 = alice.encrypt(b'The first message')
 ct2 = alice.encrypt(b'The second message')

 pt1 = bob.decrypt(ct1)
 pt2 = bob.decrypt(ct2)
 print(pt1.decode())
 print(pt2.decode())

 # Note that the chances of accidentally generating our reserved nonce by random
 # are so astronomically small that it is not worth testing for it.

 print("""
 ################################################################################
 #                                                                              #
 #                  DIFFIE-HELLMAN AND PUBLIC KEY CRYPTOGRAPHY                  #
 #                                                                              #
 ################################################################################
 """)

 # We are now finally at a point to move on from symmetric cryptography and start
 # looking at public key crypto (aka asymmetric crypto, because Alice and Bob now
 # have different keys). But don't think we've left symmetric crypto behind!
 # Almost all real-world cryptography is *hybrid*: a mixture of public key and
 # symmetric cryptography. We typically just use public key crypto to derive a
 # symmetric key and then use symmetric crypto from then on. The reasons for this
 # are that (a) public key crypto is sssslllloooowwww, and (b) there are usually
 # severe limits on how much data you can process with public key crypto
 # primitives.

 # Public key crypto starts with the classic 1976 paper by Whitfield Diffie and
 # Martin Hellman. Actually it starts before then with Ralph Merkle, and
 # apparently even earlier at GCHQ (1970). But the Diffie-Hellman paper is where
 # shit gets real. They describe a way for Alice and Bob to agree upon a key
 # without needing to share or exchange any secret key material.
 #
 # The original Diffie-Hellman protocol used arithmetic modulo some large
 # prime number, p. We now call this Finite Field Diffie-Hellman, FFDH. The
 # choice of p is critical for security, you can't just pick any old prime.
 # Typically, you'd use a *safe prime*, such that q=(p-1)/2 is also prime:
 # https://en.wikipedia.org/wiki/Safe_and_Sophie_Germain_primes
 # There will be exactly q valid keys in this system.
 #
 # Because of various attacks on FFDH, we have to use quite a large modulus. It
 # is believed that currently 2048 bits is the minimum, providing somewhere
 # around the same security as 112 bits for a symmetric key. For 128-bit
 # equivalent, you need to use a 3072-bit modulus, and for long-term security
 # (50+ years) you're looking at 7680+ bits! We'll look at elliptic curves later
 # that reduce these key sizes considerably. See https://www.keylength.com/ for
 # a nice summary of key size recommendations.

 def int_from_hex(hex: str) -> int:
    no_ws = ''.join(hex.split())
    return int(no_ws, 16)

 # Example using https://www.rfc-editor.org/rfc/rfc7919#appendix-A.1, which is
 # a 2048-bit modulus using a safe prime.
 P = int_from_hex('''
    FFFFFFFF FFFFFFFF ADF85458 A2BB4A9A AFDC5620 273D3CF1
    D8B9C583 CE2D3695 A9E13641 146433FB CC939DCE 249B3EF9
    7D2FE363 630C75D8 F681B202 AEC4617A D3DF1ED5 D5FD6561
    2433F51F 5F066ED0 85636555 3DED1AF3 B557135E 7F57C935
    984F0C70 E0E68B77 E2A689DA F3EFE872 1DF158A1 36ADE735
    30ACCA4F 483A797A BC0AB182 B324FB61 D108A94B B2C8E3FB
    B96ADAB7 60D7F468 1D4F42A3 DE394DF4 AE56EDE7 6372BB19
    0B07A7C8 EE0A6D70 9E02FCE1 CDF7E2EC C03404CD 28342F61
    9172FE9C E98583FF 8E4F1232 EEF28183 C3FE3B1B 4C6FAD73
    3BB5FCBC 2EC22005 C58EF183 7D1683B2 C6F34A26 C1B2EFFA
    886B4238 61285C97 FFFFFFFF FFFFFFFF
 ''')

 Q = (P - 1) // 2  # Also prime, 2047-bits


 # Alice picks a random number a between 1 and q-1 as her private key, and then
 # calculates her public key as A = g^a mod p, where g is a special number known
 # as a *generator* (typically g=2).

 G = 2

 # Generally, P, Q, and G should be agreed upon and fixed ahead of time. (People
 # used to think that you should generate unique parameters for each server using
 # FFDH, because you can reuse the computations for cracking one key to crack
 # other keys using the same parameters. But given that it is quite easy to
 # generate insecure parameters, it is better to pick standardised values that
 # are known good and make sure they are sufficiently strong in the first place).

 # The set { g^0 mod P, g^1 mod P, ..., g^(Q-1) mod P } form a multiplicative
 # *cyclic group* with (prime) order Q. It is a subgroup of the set of integers
 # from 1 to P-1, which also form a group. Most crypto textbooks will go through
 # this if you want to know the mathematical details.


 def ffdh_secret_key() -> int:
    # NB: never use mod (%) directly to create a random integer in some range,
    # as it introduces significant bias. Do one of the following:
    #  - Keep generating random values until one of them is in the right range
    #    (known as rejection sampling). This is what we do here.
    #  - Generate a random byte value that is much larger than you need (at
    #    least 64 bits more) and then use mod on that, to reduce the bias.
    while True:
        x_bytes = os.urandom(math.ceil(Q.bit_length() / 8))
        x = int.from_bytes(x_bytes)
        if 0 < x < Q:
            return x

 def ffdh_pk(sk: int) -> int:
    return pow(G, sk, P)

 alice_sk = ffdh_secret_key()
 alice_pk = ffdh_pk(alice_sk)

 # Given A (Alice's public key), it is computationally infeasible to recover a
 # (her secret key). That is to compute a = log_g(A) mod p. This is known as the
 # *discrete logarithm* problem and is thought to be hard. (It's not as hard as
 # finding an AES key, which is why FFDH keys tend to be >= 2048 bits rather than
 # 128 or 256 bits).
 #
 # Bob does the same as Alice to get secret key b and public key B = g^b mod p.

 bob_sk = ffdh_secret_key()
 bob_pk = ffdh_pk(bob_sk)

 # They then exchange public keys (over a non-secret *but still authenticated*
 # channel). To compute their shared secret, Alice does s = B^a mod p, and Bob
 # does s = A^b mod p. If you expand out the terms you get:
 #
 #   s = B^a mod p             <-- Alice's computation
 #     = (g^b)^a mod p
 #     = g^ba mod p
 #     = g^ab mod p
 #     = (g^a)^b mod p
 #     = A^b mod p             <-- Bob's computation
 #
 # So Alice and Bob end up with the same shared secret, s, using calculations
 # that only involve their own secret key and the other party's public key.

 def ffdh(sk: int, pk: int) -> int:
    if not 1 < pk < P-1:
        raise ValueError
    return pow(pk, sk, P)

 alice_shared_secret = ffdh(alice_sk, bob_pk)
 bob_shared_secret = ffdh(bob_sk, alice_pk)

 assert alice_shared_secret == bob_shared_secret, 'These should be equal!'

 # The shared secret is *indistinguishable* from a completely random value
 # g^z mod p (if you don't know Alice's or Bob's secret key). This is
 # known as the Decisional Diffie-Hellman assumption (DDH). That is, the
 # following tuples are indistinguishable (for any x, y, and z):
 #   - g^x, g^y, g^z     (mod p)
 #   - g^x, g^y, g^xy    (mod p)
 # With this assumption we can treat the shared secret as a secure pseudorandom
 # value. Effectively, FFDH is a PRF where the key is an integer between 1 and
 # Q-1, and the input and output are elements of g^n mod p.

 print("""
 ################################################################################
 #                                                                              #
 #                           KEY DERIVATION FUNCTIONS                           #
 #                                                                              #
 ################################################################################
 """)

 # The shared secret output by FFDH is indistinguishable from a random value
 # from the set G = { g^0, g^1, ..., g^(q-1) } (all modulo p). But this is very
 # different from being a uniform random byte string that most symmetric crypto
 # algorithms require as a key. In fact, when viewed as a string of bytes, the
 # FFDH shared secret might be quite non-uniform. To turn a pseudorandom element
 # of G into something that looks like a uniformly random byte string, we have to
 # use a *Key Derivation Function* (KDF).
 #
 # Any secure cryptographic hash function can be used as a KDF. For example, we
 # can use SHA-256:

 key = sha256(alice_shared_secret.to_bytes(256))
 print(key.hex())

 # However, there are a few weaknesses in this that can be improved. The most
 # well-studied modern KDF is called HKDF (RFC 5869), which describes a two-phase
 # process:
 # First, an "extract" phase converts the random-but-not-uniform input key
 # material into a uniformly-random-looking byte string. It takes an optional
 # "salt" parameter, which we'll discuss in a minute. HKDF-Extract is just HMAC,
 # where the salt is used as the key and the input key material (i.e., the
 # shared secret from FFDH) is used as the message input:

 HMAC_TAG_LEN = 32  # For HMAC-SHA256
 DEFAULT_SALT = bytes([0] * HMAC_TAG_LEN)
 def hkdf_extract(ikm: bytes, salt=DEFAULT_SALT) -> bytes:
    return hmac.compute_tag(key=salt, msg=ikm)

 # prk = pseudorandom key
 prk = hkdf_extract(alice_shared_secret.to_bytes(256))
 print(prk.hex())

 # After the extract phase, there is an "expand" phase that expands the output of
 # HKDF-Extract into a longer sequence of key material. This again uses HMAC, in
 # a variation of CTR mode. It also has an optional "info" argument, which can be
 # used to "bind" the key to various contextual information in a similar way to
 # the way we used associated data earlier when discussing AEAD schemes.

 def hkdf_expand(prk: bytes, info: bytes, output_bytes: int) -> bytes:
    assert output_bytes < 256 * HMAC_TAG_LEN, 'Too big'
    last = b''
    output = bytearray()
    num_blocks = math.ceil(output_bytes / HMAC_TAG_LEN)
    counter = 1
    for i in range(num_blocks):
        last = hmac.compute_tag(prk, last + info + counter.to_bytes(1))
        output += last
    return output

 key_material = hkdf_expand(prk, b'Alice & Bob', 64)
 print(key_material.hex())

 # Combining them into one function, we get:
 def hkdf(ikm: bytes, info: bytes, output_bytes: int, salt: bytes=DEFAULT_SALT):
    prk = hkdf_extract(ikm, salt)
    return hkdf_expand(prk, info, output_bytes)

 # So, what about those salt and info arguments? How should you use them?
 # HKDF-Extract is intended as a randomness extractor. This is a classic use of
 # a hash function as a theoretical object known as a *random oracle*. Imagine
 # you had unlimited memory, you could implement a random oracle like the
 # following:

 previously_seen = {}
 def random_oracle(input: bytes) -> bytes:
    # Have we previously seen this value?
    value = previously_seen.get(input)
    if not value:
        # If not then generate a completely random 32-byte value and save it
        value = os.urandom(32)
        previously_seen[input] = value
    return value

 # This implementation is impractical because (a) it needs unlimited memory and
 # (b) Alice and Bob would need to have access to the same copy of it. So instead
 # we use hash functions, which approximate a random oracle in a constant amount
 # of memory (usually not very much at all). But all hash functions are bad
 # random oracles for some (pathological) inputs. We already saw an example of
 # this with length extension attacks: sha256(x) and sha256(x + y) are related in
 # a way that is not very random. By adding a random *salt* argument to HKDF, you
 # make it behave much more like a true random oracle. In fact, almost anything
 # you can chuck into the salt argument will make it a better random oracle: even
 # a simple fixed constant value. The Noise Protocol Framework uses a fixed
 # label that names the variant of the Noise protocol being used. Just make sure
 # that the value you use is trusted: either a constant known to Alice and Bob
 # ahead of time or a negotiated value that *has already been authenticated*.

 SALT = b'A Lazy Developer\'s Salt'

 alice_key = hkdf(alice_shared_secret.to_bytes(256), b'Alice & Bob', 64, SALT)
 bob_key = hkdf(bob_shared_secret.to_bytes(256), b'Alice & Bob', 64, SALT)
 print(alice_key.hex())
 print(bob_key.hex())
 assert alice_key == bob_key


 def ffdh_hkdf(sk: int, pk: int, info: bytes = b'', key_len: int = 32) -> bytes:
    shared_secret = ffdh(sk, pk)
    return hkdf(shared_secret.to_bytes(256), info, key_len)

 # HKDF is very flexible and very secure, but it is also somewhat slow, requiring
 # at least 4 calls to the underlying hash function. Modern hash functions like
 # SHA-3 directly support KDF use-cases in a more efficient form via the KMAC
 # function. The XChaCha cipher we discussed earlier internally uses a very
 # fast special-purpose KDF called HChaCha to derive a message-specific key from
 # the first 128-bits of the nonce.

 print("""
 ################################################################################
 #                                                                              #
 #                           SMALL SUB-GROUP ATTACKS                            #
 #                                                                              #
 ################################################################################
 """)

 # Every time you do a Diffie-Hellman key exchange with someone you leak some
 # information about your private key. How much information you leak depends on
 # the *order* of the other party's public key. For public keys that have been
 # honestly generated as g^x for some x, then the order of that element is Q.
 # That is, if you multiply it with itself Q times, the answer will be 1:

 x = ffdh_secret_key()  # Pick any random value
 y = ffdh_pk(x)
 n = pow(y, Q, P)  # n = y^q mod p
 assert n == 1

 # If you multiply it one more time, you'll get back to y again.

 # Performing a DH computation with a given public key leaks the remainder of
 # your secret key modulo the order of that public key, but only if an attacker
 # can enumerate all the possibilities. Q is far too large to enumerate: there
 # are on the order of 2^2047 values for our example parameters. So
 # performing DH with an honest public key reveals nothing about the secret key.
 # But there are values that could be passed to our DH function that were not
 # honestly generated (shocking, I know). In general, any value between 1 and
 # P-1 could be passed to our FFDH function, and we can't easily tell which were
 # honestly generated and which were not. Some of these values will be of *small
 # order*: small enough to be enumerated. Then the attacker can get us to do DH
 # with their small order value and then check which value we computed by simply
 # trying them all (until the MAC tag on the message you sent validates, for
 # example). When they find the matching one, then they learn some bits of your
 # private key. They can keep doing this with other small-order public keys to
 # leak more and more bits.
 #
 # So how do we stop this?
 #
 # Firstly, we can stop it by carefully picking the values of P and Q. The
 # possible orders of public keys are all the divisors of P-1. We picked our P
 # such that P - 1 = 2Q, so the possible orders of our public keys are 2Q, Q
 # (our very large prime-order subgroup), 2, or 1. Learning the value of our
 # private key modulo 2 only leaks a single bit of the 2047-bit private key, so
 # can safely be ignored. This is why we prefer to use safe primes for the
 # modulus.
 #
 # More generally, we will have the situation that P-1 = hQ for some *cofactor*
 # h. (h=2 in our example). Then the small-order public keys will have orders
 # that are divisors of h. If h is large-ish and has many divisors then an
 # attacker can potentially leak a lot of information about our private key by
 # getting us to run FFDH with public keys of their choosing. To validate that
 # a given public key is in the correct prime-order subgroup, we need to check
 # the equation we gave above, that is that y^q mod p == 1:

 def ffdh_validate_pk(pk: int) -> None:
    """Full FFDH public key validation routine as per NIST SP800-56A revision 3
    section 5.6.2.3.1."""
    if not 1 < pk < P-1:
        raise ValueError
    if pow(pk, Q, P) != 1:
        raise ValueError

 try:
    ffdh_validate_pk(9999)  # 9999 has order P-1 in this group
 except ValueError:
    print('Detected invalid public key!')

 # The first check is that pk is in the range [2, p-2]. 0 is not a valid public
 # key, and neither are any negative numbers. 1 is not valid because it has
 # order 1. p-1 is -1 mod p, which has order 2.
 #
 # The second check, which confirms that the public key is in the prime-order
 # subgroup, is much more expensive, involving a modular exponentiation. Using a
 # safe prime group allows us to skip this validation step, but the downside is
 # that private keys are large (and so exponentiation is expensive). Using a
 # smaller prime-order group (of say 256 bits) speeds up FFDH, but at the risk of
 # enabling many more small-order subgroups.

 print("""
 ################################################################################
 #                                                                              #
 #                    ACTIVE ATTACKS AGAINST DIFFIE-HELLMAN                     #
 #                                                                              #
 ################################################################################
 """)

 # Although DH allows a public key to be transmitted over a non-secret channel,
 # you still need that channel to be authenticated. Otherwise, an attacker can
 # perform a Manipulator-in-the-Middle (MitM) attack. (Often referred to as a
 # Man-in-the-Middle attack, but non-men can be threat actors too!) The way this
 # works is that Mallory (the attacker), sits somewhere on the network between
 # Alice and Bob. When Alice sends her public key to Bob, Mallory intercepts it
 # and replaces it with her own public key. Likewise, she does the same with
 # Bob's public key. She then computes two FFDH shared secrets: one between
 # herself and Alice, and another between herself and Bob. She can now decrypt
 # and alter any messages Alice and Bob send to each other.
 #
 # Alice and Bob think they are communicating with each other, but each is
 # actually communicating with Mallory-in-the-Middle. (I think there was a TV
 # show about this).

 mallory_sk = ffdh_secret_key()
 mallory_pk = ffdh_pk(mallory_sk)

 # Alice tries to send her pk to Bob, but Mallory intercepts and sends her own
 # pk instead. Bob now computes:
 bob_shared_secret = ffdh_hkdf(bob_sk, mallory_pk)

 # Bob then sends back his pk to alice, but again Mallory intercepts. Now Mallory
 # can compute:
 mallory_bob_secret = ffdh_hkdf(mallory_sk, bob_pk)

 # She then also sends her pk back to Alice (in place of Bob's) and computes:
 mallory_alice_secret = ffdh_hkdf(mallory_sk, alice_pk)

 # Finally, Alice uses Mallory's pk thinking it is Bob's:
 alice_shared_secret = ffdh_hkdf(alice_sk, mallory_pk)

 # Now Mallory knows shared secrets for both directions and can act as a proxy
 # for all messages between Alice and Bob:
 assert alice_shared_secret == mallory_alice_secret
 print('Mallory <-> Alice:')
 print(alice_shared_secret.hex(), '==', mallory_alice_secret.hex())

 assert bob_shared_secret == mallory_bob_secret
 print('Mallory <-> Bob:')
 print(bob_shared_secret.hex(), '==', mallory_bob_secret.hex())

 # There are various ways to authenticate the public keys. For example, you can
 # exchange keys in person, or over a channel authenticated by some other means.
 # A trusted third party can vouch for the public keys, either interactively
 # (eg you connect to a central service to retrieve the public key for someone),
 # or asynchronously via a certificate (a document that associates a public key
 # with an identity and is digitally signed by the trusted third party).


 print("""
 ################################################################################
 #                                                                              #
 #                               ELLIPTIC CURVES                                #
 #                                                                              #
 ################################################################################
 """)

 # Although FFDH is still a perfectly good public key cryptosystem, it's not what
 # most *modern* crypto is using. Instead, we use a variant of it based on
 # operations over *elliptic curves*, which we call ECDH. There are plenty of
 # good tutorials on elliptic curve crypto out there, so I won't cover them much
 # here. Elliptic curves are defined by equations in a certain form. In
 # cryptography, we interested in variants of these curves where the x and y axes
 # are computed in some finite field (typically modulo some large prime again).
 #
 # We can then define a way to "add" two points together to get a third point on
 # the curve. We do this by drawing a line between the two points, finding a
 # third point where that line intersects the curve again, and finally flipping
 # the sign on the y-coordinate of that point. This addition operation satisfies
 # all the requirements to be a cyclic group, just like the set of exponents of g
 # in FFDH, only now we have an *additive* cyclic group (i.e., the operation is
 # addition not multiplication).
 #
 # Whereas before we used exponentiation (repeated multiplication of a value with
 # itself), now we use *scalar multiplication*: repeated addition of a point with
 # itself. That is, xP = P + P + P ... (x times).
 #
 # We still have a special generator, except that now it is a particular point
 # on the curve, G = (x_1, y_1). This point G has some order, q, which we
 # generally want to be a large prime order. Private keys are simple integer
 # values again, between 0 and q-1, and we typically use the variable d to
 # indicate this private key value. The corresponding public key, P = dG, which
 # is the point you get by "adding" G to itself d times. Reversing this process
 # (which bizarrely is still called "discrete logarithm") is infeasible.
 #
 # The identity element for an elliptic curve (corresponding to the value 1 for
 # FFDH) is a special "point at infinity". This crops up whenever you add a point
 # P to its inverse, -P. Those points are the same but reflected about the x-axis
 # (i.e., if P = (x, y), -P = (x, -y)), so the line between them is vertical and
 # won't intersect the curve again. To visualise this point at infinity, imagine
 # drawing a graph on a piece of paper and then holding it up so the bottom of
 # the y-axis is near your nose and the top is held away from you. Now all
 # vertical lines will appear to converge on a "vanishing point". This is the
 # point at infinity. We often use (x, y, z) projective coordinates in elliptic
 # curves, where this point has a natural expression.
 #
 # That doesn't tell you much about what elliptic curves are or how the
 # mathematics plays out, because you don't really need to know. All that matters
 # is that elliptic curves are another cyclic group that we can use for Diffie-
 # Hellman key agreement. There are multiple advantages to using elliptic curves:
 #
 #   - EC keys can be much smaller: typically 256-512 bits, compared to
 #     2048-4096+ bits for FFDH.
 #   - ECDH is also generally faster than FFDH. (On my machine, `openssl speed`
 #     reports ECDH as 18–26x faster than FFDH for a roughly equivalent 128-bit
 #     security level).
 #   - There has been essentially no improvement in attacks on ECDH since it was
 #     first described, whereas there have been multiple improvements in attacks
 #     on FFDH. (That said, progress on attacking FFDH has slowed in recent years
 #     and there have been attacks on specific classes of elliptic curves). Such
 #     attacks generally result in a push to even larger key sizes.
 #
 # Some of the most widely used curves at the moment are the NIST prime-order
 # curves, which originate in an older standard called SEC-2 (see secg.org), or
 # (in the case of P-256) in the ANSI X9.62 standard:
 #
 #   - P-256 (aka secp256r1): A 256-bit curve providing roughly equivalent
 #     security to a 128-bit symmetric cipher like AES.
 #   - P-384 (aka secp384r1): Provides roughly "192-bit" security.
 #   - P-521 (aka secp521r1): Provides roughly "256-bit" security. NOTE: the 521
 #     here is *not* a typo!
 #
 # Although P-521 sounds like the most secure option it is *insanely* slow, so
 # almost never used. Even the NSA (before they lost their minds over quantum
 # computing) only ever recommended P-384 for TOP SECRET data. Nobody seriously
 # uses P-521.
 #
 # Another common curve is secp256k1, which is used in Bitcoin.
 #
 # All of these curves have cofactor h=1. That is, the number of points generated
 # by the generator point G is equal to the number of points on the curve itself.
 # So *all* valid points are in the prime-order subgroup, and we don't need to
 # worry about small-order points (but we do have to worry about other attacks!).

 # We're definitely not going to roll our own for ECDH, so we'll use PyCryptodome
 # again:
 from Crypto.PublicKey import ECC
 from Crypto.Protocol.DH import key_agreement

 def kdf(ikm: bytes) -> bytes:
    return hkdf(ikm=ikm, info=b'Alice & Bob', output_bytes=64,
                salt=b'A Lazy Developer\'s ECDH Example')

 def ecdh(private: ECC.EccKey, public: ECC.EccKey):
    # The output of ECDH is the x-coordinate of the point resulting from d*P,
    # where d is the private key and P is the public key (point).
    return key_agreement(static_priv=private,
                         static_pub=public,
                         kdf=kdf)

 alice_key = ECC.generate(curve='p256')
 bob_key = ECC.generate(curve='p256')

 alice_shared_secret = ecdh(alice_key, bob_key.public_key())
 bob_shared_secret = ecdh(bob_key, alice_key.public_key())

 assert alice_shared_secret == bob_shared_secret

 print("""
 ################################################################################
 #                                                                              #
 #                            INVALID CURVE ATTACKS                             #
 #                                                                              #
 ################################################################################
 """)

 # Although the NIST P-curves are immune to small-order points, due to having a
 # cofactor of 1 (i.e., all points are on the main prime-order subgroup, apart
 # from the point at infinity, which has order 1), they are potentially
 # vulnerable to a much more serious type of attack: an invalid curve attack.
 # Elliptic curve public keys are (x, y) points that satisfy a particular curve
 # equation. For example, for P-256, the equation is:
 #
 #    y^2 = x^3 - 3x + B  (mod P)
 #
 # Where B = <a big constant>, and P = 2^256 - 2^224 + 2^192 + 2^96 - 1. Unlike
 # for FFDH, where public keys are just integers in a given range, for elliptic
 # curves an attacker can also supply (x, y) coordinates that don't match that
 # curve equation, but are on a completely different curve, with a different
 # constant for B. The attacker is then free to choose small-order points on
 # these other curves, and can engineer points with essentially any order. After
 # getting you to perform ECDH computations with points of their choosing, the
 # attacker can use a bit of maths called the Chinese remainder theorem to
 # recover you entire secret key!
 #
 # Invalid curve attacks are absolutely devastating, so it is essential that you
 # prevent them. Sadly, many crypto libraries don't do a great job of this, so
 # invalid curve attacks crop up regularly. You can do the validation yourself
 # as follows:

 # P is the prime that all arithmetic is done modulo.
 P = 2**256 - 2**224 + 2**192 + 2**96 - 1

 # A and B are the coefficients of the curve equation
 A = -3
 B = 41058363725152142129326129780047268409114441015993725554835256314039467401291

 # N is the order of the base point used by P-256 (ie the number of valid points
 # on the curve subgroup).
 N = 115792089210356248762697446949407573529996955224135760342422259061068512044369

 def p256_validate_pk(pk: ECC.EccKey):
    """Validates a NIST P-256 elliptic curve public key."""
    q = pk.pointQ
    # 1. Check pk != point at infinity
    if q.is_point_at_infinity():
        raise ValueError('PK is point at infinity')
    # 2. Check x, y in range [0, p-1]
    if not (0 <= q.x < P) and (0 <= q.y < P):
        raise ValueError('PK x or y is outside allowed range')
    # 3. Check point matches curve equation
    if pow(q.y, 2, P) != (pow(q.x, 3, P) + A*int(q.x) + B) % P:
        raise ValueError('PK doesn\'t match curve equation')
    # 4. Check nQ = O where n is the order of the curve, Q is the public key
    #    point, and O is the point at infinity. You can safely skip this
    #    (expensive) check if the cofactor=1, as it does for all NIST P-curves.
    #    We keep it in for illustration purposes, and in case anyone copy+pastes
    #    this into code using different curves.
    result = q * N  # PyCryptodome handily overloads * for point scalar mult
    if not result.is_point_at_infinity():
        raise ValueError('PK is not in prime-order subgroup')

 p256_validate_pk(alice_key)
 p256_validate_pk(bob_key)
 print('Bob and Alice\'s keys are valid!')

 bad_key = ECC.generate(curve='secp224r1')  # Different curve
 try:
    p256_validate_pk(bad_key)
    print('Err... a bad key somehow validated')
 except ValueError as e:
    print('Rejected bad key:', e)

 # NB: PyCryptodome validates keys on import when using the ECC.import_key
 # method. If you create keys in some other manner then you do need to perform
 # the above checks:
 def ecdh(private: ECC.EccKey, public: ECC.EccKey):
    p256_validate_pk(public)
    return key_agreement(static_priv=private,
                         static_pub=public,
                         kdf=kdf)

 print("""
 ################################################################################
 #                                                                              #
 #                       COMPRESSED POINT REPRESENTATION                        #
 #                                                                              #
 ################################################################################
 """)

 # Point validation will save you from invalid curve attacks, but it's easy to
 # skip these checks, and then you have no security. A general direction in
 # modern cryptography API design is to make it hard to mess things up. We can do
 # this by changing the representation we use for public keys. An EC public key
 # is a point on the curve, represented as (x, y) coordinates. But for each x
 # coordinate, there is at most two valid y-coordinates for that curve, and they
 # are symmetrical about the x-axis. So we can use a *compressed* format for
 # these points consisting of just the x-coordinate and a single bit to indicate
 # which of the two y-coordinates. We can then reconstruct the full coordinates
 # from the curve equation: y = +/- sqrt(x^3 + ax + b) mod p.

 # The SEC1 format is widely used in other standards. It consists of a single
 # byte indicating the type of point and then the coordinate data:
 # 0x00 - indicates the "point at infinity". No further data follows.
 # 0x02 / 0x03 - indicates a compressed point. The x-coordinate follows. The type
 #   byte indicates which of the two y-coordinates.
 # 0x04 - indicates an uncompressed point, followed by the full x- and
 #   y-coordinates.
 compressed = alice_key.public_key().export_key(format='SEC1', compress=True)
 print('Compressed PK:', compressed.hex(), 'length:', len(compressed))
 recovered = ECC.import_key(compressed, curve_name='p256')

 assert recovered == alice_key.public_key()

 #
 # Not only are compressed points smaller (e.g., 33 bytes for P-256), but they
 # also *eliminate invalid curve attacks*. An attacker can no longer supply a
 # point that is not on the curve. The worst they can do is provide an x value
 # that doesn't correspond to a real point, but in that case the curve equation
 # will have no solutions and naturally be rejected.
 #
 # If compressed public keys are so much better, why do any crypto libraries
 # allow uncompressed points at all? The answer sadly, as with many things, is
 # patents. Compressed point representations were covered by patents for many
 # years and so were not widely used. To this day, most standards still specify
 # uncompressed point representations, despite the clear advantages of compressed
 # points. On a positive note, the patents have now expired, so hopefully future
 # standards will only specify compressed points.

 print("""
 ################################################################################
 #                                                                              #
 #                     MONTGOMERY CURVES AND X-ONLY LADDERS                     #
 #                                                                              #
 ################################################################################
 """)

 # The NIST P-curves that we have looked at so far are a class of curves known as
 # "short Weierstrass" curves, which have the form of curve equation we
 # specified. But these are not the only type of elliptic curve used in
 # cryptography. (By the way, in case you are wondering, there is a "full
 # Weierstrass" format, which encompasses *all* elliptic curves, but the formula
 # is much more complex). One particularly good class of elliptic curves for
 # Diffie-Hellman are the Montgomery curves, which have a formula of the form:
 #
 #   By^2 = x^3 + Ax^2 + x
 #
 # Montgomery curves allow a very efficient way to compute Diffie-Hellman that
 # doesn't use the y-coordinate of points at all (recall that the output of ECDH
 # is the x-coordinate of a computed point). This allows an even more
 # compressed point representation that consists of *only* the x-coordinate. This
 # method is known as the "Montgomery ladder". As well as avoiding invalid curve
 # attacks, the Montgomery ladder is also easier to harden against some timing
 # side-channel attacks.
 #
 # (Technically, an x-coordinate for a Montgomery curve may not represent a point
 # on the main curve, but on an alternative curve known as the "twist". Rather
 # than explicitly rejecting such points, cryptographers instead design
 # Montgomery curves such that this twist curve is also secure).
 #
 # The most well-known Montgomery curve used in cryptography is Curve25519,
 # designed by Daniel Bernstein (who also did ChaCha & Poly1305). It is named for
 # the prime, p = 2^255 - 19, that is used as the modulus for all its finite
 # field arithmetic. It has roughly the same security strength as NIST's P-256,
 # but is often considerably faster.

 print("""
 ################################################################################
 #                                                                              #
 #                                  COFACTORS                                   #
 #                                                                              #
 ################################################################################
 """)

 # Montgomery curves have one big downside compared to prime-order curves like
 # P-256: they always have a cofactor > 1. For example, Curve25519 has a cofactor
 # of 8, so there are small subgroups of 1, 2, 4, and 8. There are several ways
 # to prevent small subgroup attacks against curves with a cofactor. NIST's
 # approach is to use a variant of Diffie-Hellman known as Cofactor
 # Diffie-Hellman, where the private key is multiplied by the cofactor before
 # the normal DH process. In FFDH terms, this becomes: s = y^(x*h) mod p, where
 # y is the other party's public key, x is your private key and h is the
 # cofactor. This is the same as s = (y^h)^x mod p. If y is a small order value,
 # then y^h will always be 1 (mod p), so the output of DH will be 1, regardless
 # of the private key. For elliptic curves, Cofactor Diffie-Hellman ensures that
 # the output is always the point at infinity if a small-order point is provided.
 # Because the representation of the point at infinity is typically (0, 0) in
 # most libraries, some standards recommend to check explicitly for an all-zero
 # output from the DH function and abort if this occurs. But this is not
 # necessary for security (the attacker has learned nothing, but you may not want
 # to use any secure channel based on this predictable shared secret).
 #
 # Curve25519 takes a different approach. It "clamps" the private key prior to
 # use, by zero-ing the least-significant 3 bits. This ensures that the resulting
 # value is a multiple of 8 (the cofactor), without explicitly performing that
 # multiplication. It also sets the most-significant bit (modulo p), which avoids
 # another possible (minor) timing side-channel attack, and also rules out some
 # pathological (but extremely unlikely) small private key values.

 def curve25519_clamp(x: bytes) -> bytes:
    """This is done for you by any implementation, just for illustration."""
    assert len(x) == 32
    result = bytearray(x)
    # NB: Curve25519 always uses little-endian order
    result[0]  &= ~0x07  # Clear bottom 3 bits
    result[31] &= ~0x80  # Clear msb so < p
    result[31] |= 0x40   # Set msb (considering as 255-bit number)
    return bytes(result)

 clamped = curve25519_clamp(os.urandom(32))
 assert int.from_bytes(clamped, 'little') % 8 == 0, \
    'Should be multiple of cofactor (8)'

 # The DH function using this clamping and the Montgomery ladder is known as
 # X25519 to distinguish it from plain ECDH. It is now very widely used and
 # represents the state of the art for DH in production. It is now also approved
 # for US Federal use via NIST SP800-186, so can be used in most applications.
 # If you intend to use Curve25519 for something *other* than X25519, just be
 # aware of that cofactor! Many security issues have occurred from people
 # mistakenly assuming that Curve25519 is a faster drop-in replacement for P-256.
 # It is not: X25519 is a faster replacement for ECDH with P-256, but beyond that
 # you need to think carefully!

 print("""
 ################################################################################
 #                                                                              #
 #                          UNKNOWN KEY SHARE ATTACKS                           #
 #                                                                              #
 ################################################################################
 """)

 # A more subtle attack occurs when Alice thinks she is communicating with
 # Mallory, but is actually communicating with Bob. For example, suppose that
 # Alice, Mallory, and Bob all use a shared service to lookup public keys for
 # other users. Mallory observes Bob registering for his account and records his
 # public key. She then registers for her own account, but registers Bob's
 # public key again under her own name. Now, when Alice tries to send a message
 # to Mallory, telling her that she is fired, it instead is decrypted by Bob
 #
 # There are also changes you can make to the protocol to help detect MitM
 # attacks. Modern protocols include a *transcript hash* in the KDF info. A
 # transcript hash is a cryptographic hash of every message exchanged during key
 # negotiation. On its own, this doesn't prevent MitM attacks, but it makes the
 # protocol more robust against them, so long as at least one participant can be
 # authenticated. For example, in TLS only the server authenticates usually, by
 # presenting a certificate signed by a trusted certificate authority. But
 # because TLS includes a transcript hash in its KDF inputs, this one-way
 # authentication is enough to stop active attacks in *either* direction.