resilar · September 22, 2025 08:56
diff --git a/ed25519.c b/ed25519.c
 #include "sha512.h"

 #include <stdint.h>
 #include <sys/random.h>

 /*
 * RFC7748 X25519 Elliptic Curve Diffie-Hellman (ECDH) with Curve25519.
 * Montgomery curve y² = x³ + 486662x² + x over GF(p) = GF(2^255 - 19).
 *
 * 256-bit arithmetic modulo 2^255 - 19 implemented with 8 x 32-bit limbs :3
 */

 static uint32_t *import(const uint8_t in[32], uint32_t out[8])
 {
    out[0] = in[ 0] + (in[ 1] << 8) + (in[ 2] << 16) + (in[ 3] << 24);
    out[1] = in[ 4] + (in[ 5] << 8) + (in[ 6] << 16) + (in[ 7] << 24);
    out[2] = in[ 8] + (in[ 9] << 8) + (in[10] << 16) + (in[11] << 24);
    out[3] = in[12] + (in[13] << 8) + (in[14] << 16) + (in[15] << 24);
    out[4] = in[16] + (in[17] << 8) + (in[18] << 16) + (in[19] << 24);
    out[5] = in[20] + (in[21] << 8) + (in[22] << 16) + (in[23] << 24);
    out[6] = in[24] + (in[25] << 8) + (in[26] << 16) + (in[27] << 24);
    out[7] = in[28] + (in[29] << 8) + (in[30] << 16) + (in[31] << 24);
    return out;
 }

 static uint8_t *export(const uint32_t i[8], uint8_t o[32])
 {
    o[ 0] = i[0]; o[ 1] = i[0] >> 8; o[ 2] = i[0] >> 16; o[ 3] = i[0] >> 24;
    o[ 4] = i[1]; o[ 5] = i[1] >> 8; o[ 6] = i[1] >> 16; o[ 7] = i[1] >> 24;
    o[ 8] = i[2]; o[ 9] = i[2] >> 8; o[10] = i[2] >> 16; o[11] = i[2] >> 24;
    o[12] = i[3]; o[13] = i[3] >> 8; o[14] = i[3] >> 16; o[15] = i[3] >> 24;
    o[16] = i[4]; o[17] = i[4] >> 8; o[18] = i[4] >> 16; o[19] = i[4] >> 24;
    o[20] = i[5]; o[21] = i[5] >> 8; o[22] = i[5] >> 16; o[23] = i[5] >> 24;
    o[24] = i[6]; o[25] = i[6] >> 8; o[26] = i[6] >> 16; o[27] = i[6] >> 24;
    o[28] = i[7]; o[29] = i[7] >> 8; o[30] = i[7] >> 16; o[31] = i[7] >> 24;
    return o;
 }

 static uint32_t *clamp_scalar(uint32_t a[8])
 {
    a[0] &= 0xfffffff8;
    a[7] &= 0x7fffffff;
    a[7] |= 0x40000000;
    return a;
 }

 static uint32_t *clamp_coordinate(uint32_t u[8])
 {
    u[7] &= 0x7fffffff;
    return u;
 }

 /* x = a (mod p) */
 static uint32_t *modp(const uint32_t a[8], uint32_t x[8])
 {
    int i;
    uint64_t c = (uint64_t)a[0] + 19;
    for (i = 1; i < 8; i++) c = (c >> 32) + a[i];
    c = (c >> 31) * 19;
    for (i = 0; i < 7; i++) {
        c += a[i];
        x[i] = c;
        c >>= 32;
    }
    x[i] = ((uint32_t)c + a[i]) & 0x7fffffff;
    return x;
 }

 /* x = a + b (mod p) */
 static uint32_t *add(const uint32_t a[8], const uint32_t b[8], uint32_t x[8])
 {
    int i;
    uint64_t c = 0;
    for (i = 0; i < 8; i++) {
        c += a[i];
        c += b[i];
        x[i] = c;
        c >>= 32;
    }
    return x;
    /*
     * Note that we can skip modp() here because we do not chain more than one
     * addition or subtraction between multiplications. mul() handles unreduced
     * integers just fine and returns (mod p)-reduced product as the result.
     */
 }

 /* x = a - b (mod p) */
 static uint32_t *sub(const uint32_t a[8], const uint32_t b[8], uint32_t x[8])
 {
    int i;
    int64_t c = -19;
    for (i = 0; i < 8; i++) {
        c += a[i];
        c -= b[i];
        x[i] = c;
        c >>= 32;
    }
    x[7] += 0x80000000;
    return x; /* Again, skip modp() */
 }

 /*
 * The mul() function is the main performance bottleneck of the code.
 * https://locklessinc.com/articles/256bit_arithmetic/
 */
 /* x = a * b (mod p) */
 static uint32_t *mul(const uint32_t a[8], const uint32_t b[8], uint32_t x[8])
 {
    int i, j;
    uint32_t ab[8];
    uint64_t c, d, l, r;
    for (i = c = 0; i < 8; i++) {
        l = c & 0xffffffff;
        r = 0;
        c >>= 32;
        for (j = 0; j <= i; j++) {
            l += (uint64_t)a[j] * b[i - j];
            c += l >> 32;
            l &= 0xffffffff;
        }
        for (d = 0; j < 8; j++) {
            r += (uint64_t)a[j] * b[8 - (j - i)];
            d += r >> 32;
            r &= 0xffffffff;
        }
        l += 38 * r;
        c += 38 * d + (l >> 32);
        ab[i] = l;
    }
    c = 19 * (2*c + (ab[7] >> 31));
    ab[7] &= 0x7fffffff;
    for (i = 0; i < 8; i++) {
        c += ab[i];
        ab[i] = c;
        c >>= 32;
    }
    return modp(ab, x);
 }

 /* x = a² (mod p) */
 static uint32_t *square(const uint32_t a[8], uint32_t x[8])
 {
    return mul(a, a, x);
 }

 /* x = a^(p-2) (mod p) */
 static uint32_t *inverse_modp(const uint32_t a[8], uint32_t x[8])
 {
    int i;
    uint32_t b[8];
    mul(a, square(a, b), x);
    mul(x, square(square(b, b), b), x);
    square(b, b);
    for (i = 5; i < 255; i++) mul(x, square(b, b), x);
    return x;
 }

 /* Swap a and b if c != 0 */
 static void cswap(uint32_t a[8], uint32_t b[8], int c)
 {
    int i;
    uint32_t mask = -(uint32_t)!!c;
    for (i = 0; i < 8; i++) {
        const uint32_t xor = mask & (a[i] ^ b[i]);
        a[i] ^= xor;
        b[i] ^= xor;
    }
 }

 /*
 * Constant-time Montgomery ladder for elliptic curve point multiplication with
 * a scalar k and point coordinate u. We expect k and u to be properly clamped.
 */
 static uint32_t *monty(const uint32_t k[8], const uint32_t u[8], uint32_t x[8])
 {
    int i, swap = 0;
    uint32_t x0[8] = { 1, 0, 0, 0, 0, 0, 0, 0 };
    uint32_t z0[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
    uint32_t x1[8];
    uint32_t z1[8] = { 1, 0, 0, 0, 0, 0, 0, 0 };
    const uint32_t a24[8] = { (486662 - 2) / 4, 0, 0, 0, 0, 0, 0, 0 };
    for (i = 0; i < 8; i++) x1[i] = u[i];
    for (i = 1; i <= 255; i++) {
        uint32_t A[8], AA[8], B[8], BB[8], C[8], CB[8], D[8], DA[8],E[8];
        const int bit = (k[(255-i) / 32] >> ((255-i) % 32)) & 1;
        swap ^= bit;
        cswap(x0, x1, swap);
        cswap(z0, z1, swap);
        swap = bit;

        /*
         * A = x_0 + z_0, AA = A^2
         * B = x_0 - z_0, BB = B^2
         * C = x_1 + z_1, CB = C * B
         * D = x_1 - z_1, DA = D * A
         * E = AA - BB
         */
        square(add(x0, z0, A), AA);
        square(sub(x0, z0, B), BB);
        mul(add(x1, z1, C), B, CB);
        mul(sub(x1, z1, D), A, DA);
        sub(AA, BB, E);

        /*
         * x_0 = AA * BB
         * x_1 = (DA + CB)^2
         */
        mul(AA, BB, x0);
        square(add(DA, CB, x1), x1);

        /*
         * z_0 = E * (AA + a24 * E) where a24 = (486662 - 2) / 4 = 121665
         * z_1 = (DA - CB)^2 * u
         */
        add(AA, mul(a24, E, z0), z0);
        mul(E, z0, z0);
        square(sub(DA, CB, z1), z1);
        mul(u, z1, z1);
    }

    cswap(x0, x1, swap);
    cswap(z0, z1, swap);
    return mul(x0, inverse_modp(z0, x), x);
 }

 /*
 * Generate a public/private key pair for x25519 key exchange.
 *
 * The secret key is generated with getrandom(2) and clamped/masked according
 * to RFC7748. The public key is derived from the secret key by elliptic curve
 * point multiplication with the generator point G = { 9 } i.e. the Montgomery
 * curve point u=9.
 */
 int x25519_keygen(uint8_t secret[32], uint8_t public[32])
 {
    uint32_t k[8];
    const uint32_t G[8] = { 9, 0, 0, 0, 0, 0, 0, 0 };
    if (getrandom(k, sizeof(k), 0) != sizeof(k))
        return 0;
    clamp_scalar(k);
    export(k, secret);
    monty(k, G, k);
    export(k, public);
    return 1;
 }

 /*
 * Compute a public key from secret.
 */
 void x25519_secret_to_public(const uint8_t secret[32], uint8_t public[32])
 {
    uint32_t k[8];
    const uint32_t G[8] = { 9, 0, 0, 0, 0, 0, 0, 0 };
    import(secret, k);
    clamp_scalar(k);
    monty(k, G, k);
    export(k, public);
 }

 /*
 * X25519 Elliptic-curve Diffie-Hellman (ECDH).
 *
 * x25519(k, u) <=> U(kP) = x25519(k, U(P)) where P = (u, v) and U(P) = u.
 *
 *  K_a = clamp_scalar(urandom(32))             Alice's private key
 *  P_a = x25519(K_a, G) = K_a x G = K_a x 9    Alice's public key
 *  K_b = clamp_scalar(urandom(32))             Bob's private key
 *  P_b = x25519(K_b, G) = K_b x G = K_b x 9    Bob's public key
 *
 * Elliptic-curve Diffie-Hellman produces a shared key for Alice & Bob:
 *
 *  K = x25519(K_a, P_b) = x25519(K_b, P_a)     Alice's & Bob's shared key
 *
 * Proof of correctness:
 * (A) x25519(K_a, P_b) = x25519(K_a, x25519(K_b, 9)) = K_a x K_b x 9 = K
 * (B) x25519(K_b, P_a) = x25519(K_b, x25519(K_a, 9)) = K_b x K_a x 9 = K
 *  => Alice and Bob derive identical shared key K.
 *
 * The return value is non-zero on success. An elliptic curve point with small
 * order causes the function return to zero and out buffer receives only zeros.
 */
 int x25519(const uint8_t secret[32], const uint8_t public[32], uint8_t out[32])
 {
    uint32_t k[8], u[8];
    import(secret, k);
    import(public, u);
    clamp_scalar(k);
    clamp_coordinate(u);
    monty(k, u, k);
    export(k, out);
    return (k[0] | k[1] | k[2] | k[3] | k[4] | k[5] | k[6] | k[7]) != 0;
 }

 /*
 * RFC8032 Edwards-Curve Digital Signature Algorithm (EdDSA).
 * Twisted Edwards curve -x² + y² = (121665/121666)x²y² over GF(p).
 */

 /* Base point */
 static const uint32_t ed25519_B[32] = {
    /* g_x = recover_x(g_y, 0) */
    0x8f25d51a, 0xc9562d60, 0x9525a7b2, 0x692cc760,
    0xfdd6dc5c, 0xc0a4e231, 0xcd6e53fe, 0x216936d3,
    /* g_y = 4 * inverse_modp(5) % p */
    0x66666658, 0x66666666, 0x66666666, 0x66666666,
    0x66666666, 0x66666666, 0x66666666, 0x66666666,
    /* g_z = 1 */
    0x00000001, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    /* g_t = g_x * g_y % p */
    0xa5b7dda3, 0x6dde8ab3, 0x775152f5, 0x20f09f80,
    0x64abe37d, 0x66ea4e8e, 0xd78b7665, 0x67875f0f
 };


 /* Group order q = 2^252 + 27742317777372353535851937790883648493 */
 static const uint32_t ed25519_q[8] = {
    0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
    0x00000000, 0x00000000, 0x00000000, 0x10000000,
 };

 /* Reduce 512-bit x modulo q */
 static uint32_t *modq(const uint32_t x[16], uint32_t reduced[8])
 {
    /*
     * Barrett reduction.
     *
     * First, choose k = 256 such that 2^k > q. Second, set r = floor(4^k / q).
     * Now x - q * floor(x*r / 4^k) < 2*q and we obtain x (mod q) after a
     * conditional subtraction.
     */
    static const uint32_t r[9] = {
        0x0a2c131b, 0xed9ce5a3, 0x086329a7, 0x2106215d,
        0xffffffeb, 0xffffffff, 0xffffffff, 0xffffffff,
        0xf
    };
    int i, j;
    uint64_t c;
    uint32_t mask, xr[25] = {0};

    /* xr = x * r */
    for (i = 0; i < 9; i++) {
        c = 0;
        for (j = 0; j < 16; j++) {
            c += (uint64_t)r[i] * x[j] + xr[i+j];
            xr[i+j] = c;
            c >>= 32;
        }
        xr[i+j] = c;
    }

    /* xr = q * floor(xr / 4^k)  = q * floor(xr / 2^512) */
    for (i = 0; i < 8; i++) xr[i] = 0;
    for (i = 0; i < 8; i++) {
        c = 0;
        for (j = 0; i+j < 8; j++) {
            c = (c >> 32) + xr[i+j] + (uint64_t)xr[i+16] * ed25519_q[j];
            xr[i+j] = c;
        }
    }

    /* xr = x - xr */
    c = 1;
    for (i = 0; i < 8; i++) {
        xr[i] = (c = c + x[i] + ~xr[i]);
        c >>= 32;
    }

    /*
     * reduced = xr     if 0 <= xr < q
     *           xr - q if q <= xr < 2*q
     */
    c = 1;
    for (i = 0; i < 8; i++) c = (c + xr[i] + ~ed25519_q[i]) >> 32;
    mask = -c;
    for (i = 0; i < 8; i++) {
        c += (uint64_t)xr[i] + (~ed25519_q[i] & mask);
        reduced[i] = c;
        c >>= 32;
    }

    for (i = 0; i < 25; i++) xr[i] = 0;
    return reduced;
 }

 /* Variable-time */
 static int cmp_vartime(const uint32_t a[8], const uint32_t b[8])
 {
    int i;
    for (i = 0; i < 8; i++) {
        const int j = 8 - (i+1);
        if (a[j] < b[j]) return -1;
        if (a[j] > b[j]) return 1;
    }
    return 0;
 }

 /* Variable-time */
 static uint32_t *recover_x(const uint32_t y[8], int sign, uint32_t x[8])
 {
    int i;
    uint32_t yy[8], u[8], v[8], uv3[8], uv7[8], b[8], bb[8];
    const uint32_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
    const uint32_t one[8] = { 1, 0, 0, 0, 0, 0, 0, 0 };
    const uint32_t p[8] = {
        0xffffffed, 0xffffffff, 0xffffffff, 0xffffffff,
        0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff
    };
    const uint32_t d[8] = {
        0x135978a3, 0x75eb4dca, 0x4141d8ab, 0x00700a4d,
        0x7779e898, 0x8cc74079, 0x2b6ffe73, 0x52036cee
    };
    const uint32_t sqrti[8] = { /* sqrt of -1, i.e., pow(2, (p-1) / 4) % p */
        0x4a0ea0b0, 0xc4ee1b27, 0xad2fe478, 0x2f431806,
        0x3dfbd7a7, 0x2b4d0099, 0x4fc1df0b, 0x2b832480
    };

    /* if (y >= p) return 0; */
    if (cmp_vartime(y, p) >= 0)
        return (void *)0;

    /*
     * For an encoded point R = (x,y), recover x = ±sqrt((y² - 1) / (dy² + 1)).
     * Prime p = 2^255 - 19 (modulo 8) ≡ 5 so α² = β⁴ (i.e., ±α = β²) for any
     * square α ∈ F(p) such that β = α^((p+3)/8).
     *
     *  (1) x = ±sqrt(u/v) = ±sqrt((y² - 1) / (dy² + 1))
     *  (2) β = (u/v)^((p+3)/8) = ··· = uv³(uv⁷)^((p-5)/8)
     */
    sub(square(y, yy), one, u);
    add(mul(d, yy, v), one, v);
    
    /* If u is zero return a zero coordinate (or error if a non-zero sign) */
    if (cmp_vartime(modp(u, x), zero) == 0)
        return sign ? (void *)0 : x;

    /* β = (u/v)^(p+3)/8 = ··· = uv³(uv⁷)^((p-5)/8) */
    mul(u, mul(v, square(v, b), b), uv3); /* b = v³, uv3 = uv³ */
    mul(u, mul(v, square(b, b), b), uv7); /* b = v⁷, uv7 = uv⁷ */
    mul(uv3, uv7, b);
    square(uv7, bb);
    for (i = 0; i < 250; i++) mul(b, square(bb, bb), b);

    /*
     * If β² = -α (that is, vβ² = -u) then multiplication with sqrt(-1) yields
     * the recovered x.
     */
    sub(mul(v, square(b, bb), yy), u, yy);
    if (cmp_vartime(modp(yy, yy), zero) != 0) {
        mul(b, sqrti, b);
        sub(mul(v, square(b, bb), yy), u, yy);
        if (cmp_vartime(modp(yy, yy), zero) != 0)
            return (void *)0; /* Input was not a square */
    }

    /* return (p - b) if (b & 1) != sign else b */
    if (((int)b[0] & 1) != sign)
        sub(p, b, b);
    return modp(b, x);
 }

 /* Derive a scalar and a message prefix from a 32-byte secret */
 static void secret_expand(const uint8_t secret[32],
                          uint32_t scalar[8], uint8_t prefix[32])
 {
    int i;
    uint8_t hash[64];
    struct sha512 ctx;
    sha512_init(&ctx);
    sha512_update(&ctx, secret, 32);
    sha512_final(&ctx, hash);
    import(hash, scalar);
    clamp_scalar(scalar);
    for (i = 0; i < 32; i++) prefix[i] = hash[32+i];
 }

 /* Finalize SHA-512 computation and return the hash as an integer modulo q */
 static void sha512_final_modq(struct sha512 *ctx, uint32_t x[8])
 {
    union {
        uint8_t u8[64];
        uint32_t u32[16];
    } hash;
    sha512_final(ctx, hash.u8);
    import(hash.u8, hash.u32);
    import(&hash.u8[32], &hash.u32[8]);
    modq(hash.u32, x);
 }

 /* r = ((x * y) + z) % q */
 static uint32_t *mul_add_modq(const uint32_t x[8], const uint32_t y[8],
                              const uint32_t z[8], uint32_t r[8])
 {
    int i, j;
    uint64_t c;
    uint32_t xy[16] = {0};
    for (i = 0; i < 8; i++) xy[i] = z[i];
    for (i = 0; i < 8; i++) {
        c = 0;
        for (j = 0; j < 8; j++) {
            xy[i+j] = (c += (uint64_t)x[i] * y[j] + xy[i+j]);
            c >>= 32;
        }
        xy[i+j] = c;
    }
    return modq(xy, r);
 }

 /* Encode point (X:Y:Z:T) as a 255-bit y-coordinate with a sign bit */
 static void point_encode(const uint32_t P[32], uint8_t buf[32])
 {
    uint32_t x[8], y[8], zinv[8];
    inverse_modp(P + 16, zinv);
    mul(zinv, P, x);
    mul(zinv, P + 8, y);
    y[7] |= (x[0] & 1) << 31;
    export(y, buf);
 }

 /* Decode point (X:Y:Z:T) from 256 bits encoding the x coordinate and sign */
 static uint32_t *point_decode(const uint8_t buf[32], uint32_t P[32])
 {
    int sign;
    uint32_t *Px = P;
    uint32_t *Py = P + 8;
    uint32_t *Pz = P + 16;
    uint32_t *Pt = P + 24;

    import(buf, Py);
    sign = Py[7] >> 31;
    Py[7] &= 0x7fffffff;

    if (!recover_x(Py, sign, Px))
        return (void *)0;

    Pz[0] = 1;
    Pz[1] = Pz[2] = Pz[3] = Pz[4] = Pz[5] = Pz[6] = Pz[7] = 0;

    mul(Px, Py, Pt);
    return P;
 }

 /* P == Q */
 static int point_equal(const uint32_t P[32], const uint32_t Q[32])
 {
    int i;
    uint32_t A[8], B[8], sum = 0;
    const uint32_t *Px = P;
    const uint32_t *Py = P + 8;
    const uint32_t *Pz = P + 16;
    const uint32_t *Qx = Q;
    const uint32_t *Qy = Q + 8;
    const uint32_t *Qz = Q + 16;

    /* P.x / P.z == Q.x / Q.z  <=>  P.x * Q.z == Q.x * P.z */
    mul(Px, Qz, A);
    mul(Qx, Pz, B);
    for (i = 0; i < 8; i++) sum |= A[i] ^ B[i];

    /* P.y / P.z == Q.y / Q.z  <=>  P.y * Q.z == Q.y * P.z */
    mul(Py, Qz, A);
    mul(Qy, Pz, B);
    for (i = 0; i < 8; i++) sum |= A[i] ^ B[i];

    return sum == 0;
 }

 /* R = P + Q */
 static uint32_t *point_add(const uint32_t P[32], const uint32_t Q[32],
                           uint32_t R[32])
 {
    uint32_t A[8], B[8], C[8], D[8], E[8], F[8], G[8], H[8];
    uint32_t *Rx = R;
    uint32_t *Ry = R + 8;
    uint32_t *Rz = R + 16;
    uint32_t *Rt = R + 24;
    const uint32_t *Px = P;
    const uint32_t *Py = P + 8;
    const uint32_t *Pz = P + 16;
    const uint32_t *Pt = P + 24;
    const uint32_t *Qx = Q;
    const uint32_t *Qy = Q + 8;
    const uint32_t *Qz = Q + 16;
    const uint32_t *Qt = Q + 24;
    const uint32_t d2[8] = {
        0x26b2f159, 0xebd69b94, 0x8283b156, 0x00e0149a,
        0xeef3d130, 0x198e80f2, 0x56dffce7, 0x2406d9dc
    };

    /* A = (P[1]-P[0]) * (Q[1]-Q[0]) % p */
    sub(Py, Px, A);
    sub(Qy, Qx, C);
    mul(A, C, A);

    /* B = (P[1]+P[0]) * (Q[1]+Q[0]) % p */
    add(Py, Px, B);
    add(Qy, Qx, C);
    mul(B, C, B);

    /* C = 2 * P[3] * Q[3] * d % p */
    mul(Pt, Qt, C);
    mul(C, d2, C);

    /* D = 2 * P[2] * Q[2] % p */
    mul(Pz, Qz, D);
    add(D, D, D);
    modp(D, D);

    /* E = B - A */
    sub(B, A, E);

    /* F = D - C */
    sub(D, C, F);

    /* G = D + C */
    add(D, C, G);

    /* H = B + A */
    add(B, A, H);

    /* R = (E*F, G*H, F*G, E*H) */
    mul(E, F, Rx);
    mul(G, H, Ry);
    mul(F, G, Rz);
    mul(E, H, Rt);

    return R;
 }

 /* Q = s * P */
 static uint32_t *point_mul(const uint32_t s[8], const uint32_t P[32],
                           uint32_t Q[32])
 {
    int i;
    uint32_t sP[32];
    for (i = 0; i < 32; i++) {
        sP[i] = P[i];
        Q[i] = 0;
    }
    Q[8] = Q[16] = 1;
    for (i = 0; i < 256; i++) {
        uint32_t R[32];
        const int swap = (s[i / 32] >> (i % 32)) & 1;
        point_add(Q, sP, R);
        cswap(Q, R, swap);
        cswap(Q + 8, R + 8, swap);
        cswap(Q + 16, R + 16, swap);
        cswap(Q + 24, R + 24, swap);
        point_add(sP, sP, sP);
    }
    return Q;
 }

 /* Compute a public key from secret */
 void ed25519_secret_to_public(const uint8_t secret[32], uint8_t public[32])
 {
    uint8_t prefix[32];
    uint32_t a[8], aB[32];
    secret_expand(secret, a, prefix);
    point_mul(a, ed25519_B, aB);
    point_encode(aB, public);
 }

 /* Constant-time EdDSA signature generation algorithm */
 void ed25519_sign(const uint8_t secret[32], const void *msg, size_t len,
                  uint8_t signature[64])
 {
    struct sha512 ctx;
    uint32_t a[8], h[8], r[8], P[32];
    secret_expand(secret, a, signature);
    sha512_init(&ctx);
    sha512_update(&ctx, signature, 32);
    sha512_update(&ctx, msg, len);
    sha512_final_modq(&ctx, r);
    point_encode(point_mul(r, ed25519_B, P), signature);
    point_encode(point_mul(a, ed25519_B, P), signature + 32);
    sha512_init(&ctx);
    sha512_update(&ctx, signature, 64);
    sha512_update(&ctx, msg, len);
    sha512_final_modq(&ctx, h);
    export(mul_add_modq(h, a, r, r), signature + 32);
 }

 /* Variable-time EdDSA signature verification algorithm */
 int ed25519_verify(const uint8_t public[32], const void *msg, size_t len,
                   const uint8_t signature[64])
 {
    struct sha512 ctx;
    uint32_t A[32], R[32], s[8], h[8], sB[32], hA[32];
    if (!point_decode(public, A))
        return 0;
    if (!point_decode(signature, R))
        return 0;
    import(signature + 32, s);
    if (cmp_vartime(s, ed25519_q) >= 0)
        return 0;
    sha512_init(&ctx);
    sha512_update(&ctx, signature, 32);
    sha512_update(&ctx, public, 32);
    sha512_update(&ctx, msg, len);
    sha512_final_modq(&ctx, h);
    point_mul(s, ed25519_B, sB);
    point_mul(h, A, hA);
    return point_equal(sB, point_add(R, hA, hA));
 }
diff --git a/sha512.c b/sha512.c
 #include "sha512.h"

 #define ROR64(x, c) (((x) >> (c)) | ((x) << (64 - (c))))

 #define LOAD64_BE(p)                \
    ( ((uint64_t)((p)[7]) <<  0)    \
    | ((uint64_t)((p)[6]) <<  8)    \
    | ((uint64_t)((p)[5]) << 16)    \
    | ((uint64_t)((p)[4]) << 24)    \
    | ((uint64_t)((p)[3]) << 32)    \
    | ((uint64_t)((p)[2]) << 40)    \
    | ((uint64_t)((p)[1]) << 48)    \
    | ((uint64_t)((p)[0]) << 56)    \
    )
 #define STORE64_BE(p, v)            \
    (p)[7] = ((v) >>  0) & 0xFF;    \
    (p)[6] = ((v) >>  8) & 0xFF;    \
    (p)[5] = ((v) >> 16) & 0xFF;    \
    (p)[4] = ((v) >> 24) & 0xFF;    \
    (p)[3] = ((v) >> 32) & 0xFF;    \
    (p)[2] = ((v) >> 40) & 0xFF;    \
    (p)[1] = ((v) >> 48) & 0xFF;    \
    (p)[0] = ((v) >> 56) & 0xFF;

 void sha512_init(struct sha512 *ctx)
 {
    ctx->state[0] = 0x6a09e667f3bcc908; /* sqrt(2) */
    ctx->state[1] = 0xbb67ae8584caa73b; /* sqrt(3) */
    ctx->state[2] = 0x3c6ef372fe94f82b; /* sqrt(5) */
    ctx->state[3] = 0xa54ff53a5f1d36f1; /* sqrt(7) */
    ctx->state[4] = 0x510e527fade682d1; /* sqrt(11) */
    ctx->state[5] = 0x9b05688c2b3e6c1f; /* sqrt(13) */
    ctx->state[6] = 0x1f83d9abfb41bd6b; /* sqrt(17) */
    ctx->state[7] = 0x5be0cd19137e2179; /* sqrt(19) */
    ctx->bits[0] = ctx->bits[1] = 0;
    ctx->n = 0;
 }

 static void sha512_block(uint64_t state[8], const uint8_t p[128])
 {
    uint64_t a, b, c, d, e, f, g, h, s0, s1, S0, S1, t1, t2, w[16];
    static const uint64_t K512[80] = {
        0x428a2f98d728ae22, 0x7137449123ef65cd,
        0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
        0x3956c25bf348b538, 0x59f111f1b605d019,
        0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
        0xd807aa98a3030242, 0x12835b0145706fbe,
        0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
        0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
        0x9bdc06a725c71235, 0xc19bf174cf692694,
        0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
        0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
        0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
        0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
        0x983e5152ee66dfab, 0xa831c66d2db43210,
        0xb00327c898fb213f, 0xbf597fc7beef0ee4,
        0xc6e00bf33da88fc2, 0xd5a79147930aa725,
        0x06ca6351e003826f, 0x142929670a0e6e70,
        0x27b70a8546d22ffc, 0x2e1b21385c26c926,
        0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
        0x650a73548baf63de, 0x766a0abb3c77b2a8,
        0x81c2c92e47edaee6, 0x92722c851482353b,
        0xa2bfe8a14cf10364, 0xa81a664bbc423001,
        0xc24b8b70d0f89791, 0xc76c51a30654be30,
        0xd192e819d6ef5218, 0xd69906245565a910,
        0xf40e35855771202a, 0x106aa07032bbd1b8,
        0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
        0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
        0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
        0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
        0x748f82ee5defb2fc, 0x78a5636f43172f60,
        0x84c87814a1f0ab72, 0x8cc702081a6439ec,
        0x90befffa23631e28, 0xa4506cebde82bde9,
        0xbef9a3f7b2c67915, 0xc67178f2e372532b,
        0xca273eceea26619c, 0xd186b8c721c0c207,
        0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
        0x06f067aa72176fba, 0x0a637dc5a2c898a6,
        0x113f9804bef90dae, 0x1b710b35131c471b,
        0x28db77f523047d84, 0x32caab7b40c72493,
        0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
        0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
        0x5fcb6fab3ad6faec, 0x6c44198c4a475817
    };

    a = state[0]; b = state[1]; c = state[2]; d = state[3];
    e = state[4]; f = state[5]; g = state[6]; h = state[7];

    #define ROUND_CORE(i)                                           \
    S0 = ROR64(a, 28) ^ ROR64(a, 34) ^ ROR64(a, 39);                \
    S1 = ROR64(e, 14) ^ ROR64(e, 18) ^ ROR64(e, 41);                \
    t1 = h + S1 + ((e & f) ^ (~e & g)) + K512[i] + w[i & 0xf];      \
    t2 = S0 + ((a & b) ^ (a & c) ^ (b & c));                        \
    h = g; g = f; f = e; e = d + t1;                                \
    d = c; c = b; b = a; a = t1 + t2;

    #define ROUND_0_15(i) w[i] = LOAD64_BE(p); p += 8; ROUND_CORE(i)
    ROUND_0_15( 0) ROUND_0_15( 1) ROUND_0_15( 2) ROUND_0_15( 3)
    ROUND_0_15( 4) ROUND_0_15( 5) ROUND_0_15( 6) ROUND_0_15( 7)
    ROUND_0_15( 8) ROUND_0_15( 9) ROUND_0_15(10) ROUND_0_15(11)
    ROUND_0_15(12) ROUND_0_15(13) ROUND_0_15(14) ROUND_0_15(15)
    #undef ROUND_0_15

    #define ROUND_16_79(i)                                          \
    s0 = w[(i + 1) & 0xf]; s1 = w[(i + 14) & 0xf];                  \
    s0 = ROR64(s0, 1) ^ ROR64(s0, 8) ^ (s0 >> 7);                   \
    s1 = ROR64(s1, 19) ^ ROR64(s1,  61) ^ (s1 >> 6);                \
    w[i & 0xf] += s0 + s1 + w[(i + 9) & 0xf]; ROUND_CORE(i)
    ROUND_16_79(16) ROUND_16_79(17) ROUND_16_79(18) ROUND_16_79(19)
    ROUND_16_79(20) ROUND_16_79(21) ROUND_16_79(22) ROUND_16_79(23)
    ROUND_16_79(24) ROUND_16_79(25) ROUND_16_79(26) ROUND_16_79(27)
    ROUND_16_79(28) ROUND_16_79(29) ROUND_16_79(30) ROUND_16_79(31)
    ROUND_16_79(32) ROUND_16_79(33) ROUND_16_79(34) ROUND_16_79(35)
    ROUND_16_79(36) ROUND_16_79(37) ROUND_16_79(38) ROUND_16_79(39)
    ROUND_16_79(40) ROUND_16_79(41) ROUND_16_79(42) ROUND_16_79(43)
    ROUND_16_79(44) ROUND_16_79(45) ROUND_16_79(46) ROUND_16_79(47)
    ROUND_16_79(48) ROUND_16_79(49) ROUND_16_79(50) ROUND_16_79(51)
    ROUND_16_79(52) ROUND_16_79(53) ROUND_16_79(54) ROUND_16_79(55)
    ROUND_16_79(56) ROUND_16_79(57) ROUND_16_79(58) ROUND_16_79(59)
    ROUND_16_79(60) ROUND_16_79(61) ROUND_16_79(62) ROUND_16_79(63)
    ROUND_16_79(64) ROUND_16_79(65) ROUND_16_79(66) ROUND_16_79(67)
    ROUND_16_79(68) ROUND_16_79(69) ROUND_16_79(70) ROUND_16_79(71)
    ROUND_16_79(72) ROUND_16_79(73) ROUND_16_79(74) ROUND_16_79(75)
    ROUND_16_79(76) ROUND_16_79(77) ROUND_16_79(78) ROUND_16_79(79)
    #undef ROUND_16_79
    #undef ROUND_CORE

    state[0] += a; state[1] += b; state[2] += c; state[3] += d;
    state[4] += e; state[5] += f; state[6] += g; state[7] += h;
 }

 void sha512_update(struct sha512 *ctx, const void *data, size_t n)
 {
    const uint8_t *input = (const uint8_t *)data;

    if (ctx->n || n < 128) {
        while (n && ctx->n < 128) {
            ctx->buffer[ctx->n++] = *input++;
            n--;
        }
        if (ctx->n < 128)
            return;
        sha512_block(ctx->state, ctx->buffer);
        ctx->bits[1] += !(ctx->bits[0] += 1024);
        ctx->n = 0;
    }

    while (n >= 128) {
        sha512_block(ctx->state, input);
        ctx->bits[1] += !(ctx->bits[0] += 1024);
        input += 128;
        n -= 128;
    }

    while (n) {
        ctx->buffer[ctx->n++] = *input++;
        n--;
    }
 }

 void sha512_final(struct sha512 *ctx, uint8_t hash[64])
 {
    int i;
    uint8_t buf[256];
    ctx->bits[0] += ctx->n * 8;
    for (i = 1, buf[0] = 0x80; (ctx->n + i + 16) % 128; buf[i++] = 0);
    STORE64_BE(buf + i, ctx->bits[1]);
    STORE64_BE(buf + i + 8, ctx->bits[0]);
    sha512_update(ctx, buf, i + 16);
    STORE64_BE(hash +  0, ctx->state[0]);
    STORE64_BE(hash +  8, ctx->state[1]);
    STORE64_BE(hash + 16, ctx->state[2]);
    STORE64_BE(hash + 24, ctx->state[3]);
    STORE64_BE(hash + 32, ctx->state[4]);
    STORE64_BE(hash + 40, ctx->state[5]);
    STORE64_BE(hash + 48, ctx->state[6]);
    STORE64_BE(hash + 56, ctx->state[7]);
 }

 void sha512(const void *data, size_t n, uint8_t hash[64])
 {
    struct sha512 ctx;
    sha512_init(&ctx);
    sha512_update(&ctx, data, n);
    sha512_final(&ctx, hash);
 }
diff --git a/sha512.h b/sha512.h
 #ifndef SHA512_H
 #define SHA512_H

 #include <stddef.h>
 #include <stdint.h>

 struct sha512 {
    uint64_t state[8];
    uint8_t buffer[128];
    uint64_t bits[2];
    int n;
 };

 void sha512_init(struct sha512 *ctx);
 void sha512_update(struct sha512 *ctx, const void *data, size_t n);
 void sha512_final(struct sha512 *ctx, uint8_t hash[64]);

 void sha512(const void *data, size_t n, uint8_t hash[64]);

 #endif
	#include "sha512.h"

	#include <stdint.h>
	#include <sys/random.h>

	/*
	* RFC7748 X25519 Elliptic Curve Diffie-Hellman (ECDH) with Curve25519.
	* Montgomery curve y² = x³ + 486662x² + x over GF(p) = GF(2^255 - 19).
	*
	* 256-bit arithmetic modulo 2^255 - 19 implemented with 8 x 32-bit limbs :3
	*/

	static uint32_t *import(const uint8_t in[32], uint32_t out[8])
	{
	out[0] = in[ 0] + (in[ 1] << 8) + (in[ 2] << 16) + (in[ 3] << 24);
	out[1] = in[ 4] + (in[ 5] << 8) + (in[ 6] << 16) + (in[ 7] << 24);
	out[2] = in[ 8] + (in[ 9] << 8) + (in[10] << 16) + (in[11] << 24);
	out[3] = in[12] + (in[13] << 8) + (in[14] << 16) + (in[15] << 24);
	out[4] = in[16] + (in[17] << 8) + (in[18] << 16) + (in[19] << 24);
	out[5] = in[20] + (in[21] << 8) + (in[22] << 16) + (in[23] << 24);
	out[6] = in[24] + (in[25] << 8) + (in[26] << 16) + (in[27] << 24);
	out[7] = in[28] + (in[29] << 8) + (in[30] << 16) + (in[31] << 24);
	return out;
	}

	static uint8_t *export(const uint32_t i[8], uint8_t o[32])
	{
	o[ 0] = i[0]; o[ 1] = i[0] >> 8; o[ 2] = i[0] >> 16; o[ 3] = i[0] >> 24;
	o[ 4] = i[1]; o[ 5] = i[1] >> 8; o[ 6] = i[1] >> 16; o[ 7] = i[1] >> 24;
	o[ 8] = i[2]; o[ 9] = i[2] >> 8; o[10] = i[2] >> 16; o[11] = i[2] >> 24;
	o[12] = i[3]; o[13] = i[3] >> 8; o[14] = i[3] >> 16; o[15] = i[3] >> 24;
	o[16] = i[4]; o[17] = i[4] >> 8; o[18] = i[4] >> 16; o[19] = i[4] >> 24;
	o[20] = i[5]; o[21] = i[5] >> 8; o[22] = i[5] >> 16; o[23] = i[5] >> 24;
	o[24] = i[6]; o[25] = i[6] >> 8; o[26] = i[6] >> 16; o[27] = i[6] >> 24;
	o[28] = i[7]; o[29] = i[7] >> 8; o[30] = i[7] >> 16; o[31] = i[7] >> 24;
	return o;
	}

	static uint32_t *clamp_scalar(uint32_t a[8])
	{
	a[0] &= 0xfffffff8;
	a[7] &= 0x7fffffff;
	a[7] \|= 0x40000000;
	return a;
	}

	static uint32_t *clamp_coordinate(uint32_t u[8])
	{
	u[7] &= 0x7fffffff;
	return u;
	}

	/* x = a (mod p) */
	static uint32_t *modp(const uint32_t a[8], uint32_t x[8])
	{
	int i;
	uint64_t c = (uint64_t)a[0] + 19;
	for (i = 1; i < 8; i++) c = (c >> 32) + a[i];
	c = (c >> 31) * 19;
	for (i = 0; i < 7; i++) {
	c += a[i];
	x[i] = c;
	c >>= 32;
	}
	x[i] = ((uint32_t)c + a[i]) & 0x7fffffff;
	return x;
	}

	/* x = a + b (mod p) */
	static uint32_t *add(const uint32_t a[8], const uint32_t b[8], uint32_t x[8])
	{
	int i;
	uint64_t c = 0;
	for (i = 0; i < 8; i++) {
	c += a[i];
	c += b[i];
	x[i] = c;
	c >>= 32;
	}
	return x;
	/*
	* Note that we can skip modp() here because we do not chain more than one
	* addition or subtraction between multiplications. mul() handles unreduced
	* integers just fine and returns (mod p)-reduced product as the result.
	*/
	}

	/* x = a - b (mod p) */
	static uint32_t *sub(const uint32_t a[8], const uint32_t b[8], uint32_t x[8])
	{
	int i;
	int64_t c = -19;
	for (i = 0; i < 8; i++) {
	c += a[i];
	c -= b[i];
	x[i] = c;
	c >>= 32;
	}
	x[7] += 0x80000000;
	return x; /* Again, skip modp() */
	}

	/*
	* The mul() function is the main performance bottleneck of the code.
	* https://locklessinc.com/articles/256bit_arithmetic/
	*/
	/* x = a * b (mod p) */
	static uint32_t *mul(const uint32_t a[8], const uint32_t b[8], uint32_t x[8])
	{
	int i, j;
	uint32_t ab[8];
	uint64_t c, d, l, r;
	for (i = c = 0; i < 8; i++) {
	l = c & 0xffffffff;
	r = 0;
	c >>= 32;
	for (j = 0; j <= i; j++) {
	l += (uint64_t)a[j] * b[i - j];
	c += l >> 32;
	l &= 0xffffffff;
	}
	for (d = 0; j < 8; j++) {
	r += (uint64_t)a[j] * b[8 - (j - i)];
	d += r >> 32;
	r &= 0xffffffff;
	}
	l += 38 * r;
	c += 38 * d + (l >> 32);
	ab[i] = l;
	}
	c = 19 * (2*c + (ab[7] >> 31));
	ab[7] &= 0x7fffffff;
	for (i = 0; i < 8; i++) {
	c += ab[i];
	ab[i] = c;
	c >>= 32;
	}
	return modp(ab, x);
	}

	/* x = a² (mod p) */
	static uint32_t *square(const uint32_t a[8], uint32_t x[8])
	{
	return mul(a, a, x);
	}

	/* x = a^(p-2) (mod p) */
	static uint32_t *inverse_modp(const uint32_t a[8], uint32_t x[8])
	{
	int i;
	uint32_t b[8];
	mul(a, square(a, b), x);
	mul(x, square(square(b, b), b), x);
	square(b, b);
	for (i = 5; i < 255; i++) mul(x, square(b, b), x);
	return x;
	}

	/* Swap a and b if c != 0 */
	static void cswap(uint32_t a[8], uint32_t b[8], int c)
	{
	int i;
	uint32_t mask = -(uint32_t)!!c;
	for (i = 0; i < 8; i++) {
	const uint32_t xor = mask & (a[i] ^ b[i]);
	a[i] ^= xor;
	b[i] ^= xor;
	}
	}

	/*
	* Constant-time Montgomery ladder for elliptic curve point multiplication with
	* a scalar k and point coordinate u. We expect k and u to be properly clamped.
	*/
	static uint32_t *monty(const uint32_t k[8], const uint32_t u[8], uint32_t x[8])
	{
	int i, swap = 0;
	uint32_t x0[8] = { 1, 0, 0, 0, 0, 0, 0, 0 };
	uint32_t z0[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
	uint32_t x1[8];
	uint32_t z1[8] = { 1, 0, 0, 0, 0, 0, 0, 0 };
	const uint32_t a24[8] = { (486662 - 2) / 4, 0, 0, 0, 0, 0, 0, 0 };
	for (i = 0; i < 8; i++) x1[i] = u[i];
	for (i = 1; i <= 255; i++) {
	uint32_t A[8], AA[8], B[8], BB[8], C[8], CB[8], D[8], DA[8],E[8];
	const int bit = (k[(255-i) / 32] >> ((255-i) % 32)) & 1;
	swap ^= bit;
	cswap(x0, x1, swap);
	cswap(z0, z1, swap);
	swap = bit;

	/*
	* A = x_0 + z_0, AA = A^2
	* B = x_0 - z_0, BB = B^2
	* C = x_1 + z_1, CB = C * B
	* D = x_1 - z_1, DA = D * A
	* E = AA - BB
	*/
	square(add(x0, z0, A), AA);
	square(sub(x0, z0, B), BB);
	mul(add(x1, z1, C), B, CB);
	mul(sub(x1, z1, D), A, DA);
	sub(AA, BB, E);

	/*
	* x_0 = AA * BB
	* x_1 = (DA + CB)^2
	*/
	mul(AA, BB, x0);
	square(add(DA, CB, x1), x1);

	/*
	* z_0 = E * (AA + a24 * E) where a24 = (486662 - 2) / 4 = 121665
	* z_1 = (DA - CB)^2 * u
	*/
	add(AA, mul(a24, E, z0), z0);
	mul(E, z0, z0);
	square(sub(DA, CB, z1), z1);
	mul(u, z1, z1);
	}

	cswap(x0, x1, swap);
	cswap(z0, z1, swap);
	return mul(x0, inverse_modp(z0, x), x);
	}

	/*
	* Generate a public/private key pair for x25519 key exchange.
	*
	* The secret key is generated with getrandom(2) and clamped/masked according
	* to RFC7748. The public key is derived from the secret key by elliptic curve
	* point multiplication with the generator point G = { 9 } i.e. the Montgomery
	* curve point u=9.
	*/
	int x25519_keygen(uint8_t secret[32], uint8_t public[32])
	{
	uint32_t k[8];
	const uint32_t G[8] = { 9, 0, 0, 0, 0, 0, 0, 0 };
	if (getrandom(k, sizeof(k), 0) != sizeof(k))
	return 0;
	clamp_scalar(k);
	export(k, secret);
	monty(k, G, k);
	export(k, public);
	return 1;
	}

	/*
	* Compute a public key from secret.
	*/
	void x25519_secret_to_public(const uint8_t secret[32], uint8_t public[32])
	{
	uint32_t k[8];
	const uint32_t G[8] = { 9, 0, 0, 0, 0, 0, 0, 0 };
	import(secret, k);
	clamp_scalar(k);
	monty(k, G, k);
	export(k, public);
	}

	/*
	* X25519 Elliptic-curve Diffie-Hellman (ECDH).
	*
	* x25519(k, u) <=> U(kP) = x25519(k, U(P)) where P = (u, v) and U(P) = u.
	*
	* K_a = clamp_scalar(urandom(32)) Alice's private key
	* P_a = x25519(K_a, G) = K_a x G = K_a x 9 Alice's public key
	* K_b = clamp_scalar(urandom(32)) Bob's private key
	* P_b = x25519(K_b, G) = K_b x G = K_b x 9 Bob's public key
	*
	* Elliptic-curve Diffie-Hellman produces a shared key for Alice & Bob:
	*
	* K = x25519(K_a, P_b) = x25519(K_b, P_a) Alice's & Bob's shared key
	*
	* Proof of correctness:
	* (A) x25519(K_a, P_b) = x25519(K_a, x25519(K_b, 9)) = K_a x K_b x 9 = K
	* (B) x25519(K_b, P_a) = x25519(K_b, x25519(K_a, 9)) = K_b x K_a x 9 = K
	* => Alice and Bob derive identical shared key K.
	*
	* The return value is non-zero on success. An elliptic curve point with small
	* order causes the function return to zero and out buffer receives only zeros.
	*/
	int x25519(const uint8_t secret[32], const uint8_t public[32], uint8_t out[32])
	{
	uint32_t k[8], u[8];
	import(secret, k);
	import(public, u);
	clamp_scalar(k);
	clamp_coordinate(u);
	monty(k, u, k);
	export(k, out);
	return (k[0] \| k[1] \| k[2] \| k[3] \| k[4] \| k[5] \| k[6] \| k[7]) != 0;
	}

	/*
	* RFC8032 Edwards-Curve Digital Signature Algorithm (EdDSA).
	* Twisted Edwards curve -x² + y² = (121665/121666)x²y² over GF(p).
	*/

	/* Base point */
	static const uint32_t ed25519_B[32] = {
	/* g_x = recover_x(g_y, 0) */
	0x8f25d51a, 0xc9562d60, 0x9525a7b2, 0x692cc760,
	0xfdd6dc5c, 0xc0a4e231, 0xcd6e53fe, 0x216936d3,
	/* g_y = 4 * inverse_modp(5) % p */
	0x66666658, 0x66666666, 0x66666666, 0x66666666,
	0x66666666, 0x66666666, 0x66666666, 0x66666666,
	/* g_z = 1 */
	0x00000001, 0x00000000, 0x00000000, 0x00000000,
	0x00000000, 0x00000000, 0x00000000, 0x00000000,
	/* g_t = g_x * g_y % p */
	0xa5b7dda3, 0x6dde8ab3, 0x775152f5, 0x20f09f80,
	0x64abe37d, 0x66ea4e8e, 0xd78b7665, 0x67875f0f
	};


	/* Group order q = 2^252 + 27742317777372353535851937790883648493 */
	static const uint32_t ed25519_q[8] = {
	0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
	0x00000000, 0x00000000, 0x00000000, 0x10000000,
	};

	/* Reduce 512-bit x modulo q */
	static uint32_t *modq(const uint32_t x[16], uint32_t reduced[8])
	{
	/*
	* Barrett reduction.
	*
	* First, choose k = 256 such that 2^k > q. Second, set r = floor(4^k / q).
	* Now x - q * floor(xr / 4^k) < 2q and we obtain x (mod q) after a
	* conditional subtraction.
	*/
	static const uint32_t r[9] = {
	0x0a2c131b, 0xed9ce5a3, 0x086329a7, 0x2106215d,
	0xffffffeb, 0xffffffff, 0xffffffff, 0xffffffff,
	0xf
	};
	int i, j;
	uint64_t c;
	uint32_t mask, xr[25] = {0};

	/* xr = x * r */
	for (i = 0; i < 9; i++) {
	c = 0;
	for (j = 0; j < 16; j++) {
	c += (uint64_t)r[i] * x[j] + xr[i+j];
	xr[i+j] = c;
	c >>= 32;
	}
	xr[i+j] = c;
	}

	/* xr = q * floor(xr / 4^k) = q * floor(xr / 2^512) */
	for (i = 0; i < 8; i++) xr[i] = 0;
	for (i = 0; i < 8; i++) {
	c = 0;
	for (j = 0; i+j < 8; j++) {
	c = (c >> 32) + xr[i+j] + (uint64_t)xr[i+16] * ed25519_q[j];
	xr[i+j] = c;
	}
	}

	/* xr = x - xr */
	c = 1;
	for (i = 0; i < 8; i++) {
	xr[i] = (c = c + x[i] + ~xr[i]);
	c >>= 32;
	}

	/*
	* reduced = xr if 0 <= xr < q
	* xr - q if q <= xr < 2*q
	*/
	c = 1;
	for (i = 0; i < 8; i++) c = (c + xr[i] + ~ed25519_q[i]) >> 32;
	mask = -c;
	for (i = 0; i < 8; i++) {
	c += (uint64_t)xr[i] + (~ed25519_q[i] & mask);
	reduced[i] = c;
	c >>= 32;
	}

	for (i = 0; i < 25; i++) xr[i] = 0;
	return reduced;
	}

	/* Variable-time */
	static int cmp_vartime(const uint32_t a[8], const uint32_t b[8])
	{
	int i;
	for (i = 0; i < 8; i++) {
	const int j = 8 - (i+1);
	if (a[j] < b[j]) return -1;
	if (a[j] > b[j]) return 1;
	}
	return 0;
	}

	/* Variable-time */
	static uint32_t *recover_x(const uint32_t y[8], int sign, uint32_t x[8])
	{
	int i;
	uint32_t yy[8], u[8], v[8], uv3[8], uv7[8], b[8], bb[8];
	const uint32_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
	const uint32_t one[8] = { 1, 0, 0, 0, 0, 0, 0, 0 };
	const uint32_t p[8] = {
	0xffffffed, 0xffffffff, 0xffffffff, 0xffffffff,
	0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff
	};
	const uint32_t d[8] = {
	0x135978a3, 0x75eb4dca, 0x4141d8ab, 0x00700a4d,
	0x7779e898, 0x8cc74079, 0x2b6ffe73, 0x52036cee
	};
	const uint32_t sqrti[8] = { /* sqrt of -1, i.e., pow(2, (p-1) / 4) % p */
	0x4a0ea0b0, 0xc4ee1b27, 0xad2fe478, 0x2f431806,
	0x3dfbd7a7, 0x2b4d0099, 0x4fc1df0b, 0x2b832480
	};

	/* if (y >= p) return 0; */
	if (cmp_vartime(y, p) >= 0)
	return (void *)0;

	/*
	* For an encoded point R = (x,y), recover x = ±sqrt((y² - 1) / (dy² + 1)).
	* Prime p = 2^255 - 19 (modulo 8) ≡ 5 so α² = β⁴ (i.e., ±α = β²) for any
	* square α ∈ F(p) such that β = α^((p+3)/8).
	*
	* (1) x = ±sqrt(u/v) = ±sqrt((y² - 1) / (dy² + 1))
	* (2) β = (u/v)^((p+3)/8) = ··· = uv³(uv⁷)^((p-5)/8)
	*/
	sub(square(y, yy), one, u);
	add(mul(d, yy, v), one, v);

	/* If u is zero return a zero coordinate (or error if a non-zero sign) */
	if (cmp_vartime(modp(u, x), zero) == 0)
	return sign ? (void *)0 : x;

	/* β = (u/v)^(p+3)/8 = ··· = uv³(uv⁷)^((p-5)/8) */
	mul(u, mul(v, square(v, b), b), uv3); /* b = v³, uv3 = uv³ */
	mul(u, mul(v, square(b, b), b), uv7); /* b = v⁷, uv7 = uv⁷ */
	mul(uv3, uv7, b);
	square(uv7, bb);
	for (i = 0; i < 250; i++) mul(b, square(bb, bb), b);

	/*
	* If β² = -α (that is, vβ² = -u) then multiplication with sqrt(-1) yields
	* the recovered x.
	*/
	sub(mul(v, square(b, bb), yy), u, yy);
	if (cmp_vartime(modp(yy, yy), zero) != 0) {
	mul(b, sqrti, b);
	sub(mul(v, square(b, bb), yy), u, yy);
	if (cmp_vartime(modp(yy, yy), zero) != 0)
	return (void )0; / Input was not a square */
	}

	/* return (p - b) if (b & 1) != sign else b */
	if (((int)b[0] & 1) != sign)
	sub(p, b, b);
	return modp(b, x);
	}

	/* Derive a scalar and a message prefix from a 32-byte secret */
	static void secret_expand(const uint8_t secret[32],
	uint32_t scalar[8], uint8_t prefix[32])
	{
	int i;
	uint8_t hash[64];
	struct sha512 ctx;
	sha512_init(&ctx);
	sha512_update(&ctx, secret, 32);
	sha512_final(&ctx, hash);
	import(hash, scalar);
	clamp_scalar(scalar);
	for (i = 0; i < 32; i++) prefix[i] = hash[32+i];
	}

	/* Finalize SHA-512 computation and return the hash as an integer modulo q */
	static void sha512_final_modq(struct sha512 *ctx, uint32_t x[8])
	{
	union {
	uint8_t u8[64];
	uint32_t u32[16];
	} hash;
	sha512_final(ctx, hash.u8);
	import(hash.u8, hash.u32);
	import(&hash.u8[32], &hash.u32[8]);
	modq(hash.u32, x);
	}

	/* r = ((x * y) + z) % q */
	static uint32_t *mul_add_modq(const uint32_t x[8], const uint32_t y[8],
	const uint32_t z[8], uint32_t r[8])
	{
	int i, j;
	uint64_t c;
	uint32_t xy[16] = {0};
	for (i = 0; i < 8; i++) xy[i] = z[i];
	for (i = 0; i < 8; i++) {
	c = 0;
	for (j = 0; j < 8; j++) {
	xy[i+j] = (c += (uint64_t)x[i] * y[j] + xy[i+j]);
	c >>= 32;
	}
	xy[i+j] = c;
	}
	return modq(xy, r);
	}

	/* Encode point (X:Y:Z:T) as a 255-bit y-coordinate with a sign bit */
	static void point_encode(const uint32_t P[32], uint8_t buf[32])
	{
	uint32_t x[8], y[8], zinv[8];
	inverse_modp(P + 16, zinv);
	mul(zinv, P, x);
	mul(zinv, P + 8, y);
	y[7] \|= (x[0] & 1) << 31;
	export(y, buf);
	}

	/* Decode point (X:Y:Z:T) from 256 bits encoding the x coordinate and sign */
	static uint32_t *point_decode(const uint8_t buf[32], uint32_t P[32])
	{
	int sign;
	uint32_t *Px = P;
	uint32_t *Py = P + 8;
	uint32_t *Pz = P + 16;
	uint32_t *Pt = P + 24;

	import(buf, Py);
	sign = Py[7] >> 31;
	Py[7] &= 0x7fffffff;

	if (!recover_x(Py, sign, Px))
	return (void *)0;

	Pz[0] = 1;
	Pz[1] = Pz[2] = Pz[3] = Pz[4] = Pz[5] = Pz[6] = Pz[7] = 0;

	mul(Px, Py, Pt);
	return P;
	}

	/* P == Q */
	static int point_equal(const uint32_t P[32], const uint32_t Q[32])
	{
	int i;
	uint32_t A[8], B[8], sum = 0;
	const uint32_t *Px = P;
	const uint32_t *Py = P + 8;
	const uint32_t *Pz = P + 16;
	const uint32_t *Qx = Q;
	const uint32_t *Qy = Q + 8;
	const uint32_t *Qz = Q + 16;

	/* P.x / P.z == Q.x / Q.z <=> P.x * Q.z == Q.x * P.z */
	mul(Px, Qz, A);
	mul(Qx, Pz, B);
	for (i = 0; i < 8; i++) sum \|= A[i] ^ B[i];

	/* P.y / P.z == Q.y / Q.z <=> P.y * Q.z == Q.y * P.z */
	mul(Py, Qz, A);
	mul(Qy, Pz, B);
	for (i = 0; i < 8; i++) sum \|= A[i] ^ B[i];

	return sum == 0;
	}

	/* R = P + Q */
	static uint32_t *point_add(const uint32_t P[32], const uint32_t Q[32],
	uint32_t R[32])
	{
	uint32_t A[8], B[8], C[8], D[8], E[8], F[8], G[8], H[8];
	uint32_t *Rx = R;
	uint32_t *Ry = R + 8;
	uint32_t *Rz = R + 16;
	uint32_t *Rt = R + 24;
	const uint32_t *Px = P;
	const uint32_t *Py = P + 8;
	const uint32_t *Pz = P + 16;
	const uint32_t *Pt = P + 24;
	const uint32_t *Qx = Q;
	const uint32_t *Qy = Q + 8;
	const uint32_t *Qz = Q + 16;
	const uint32_t *Qt = Q + 24;
	const uint32_t d2[8] = {
	0x26b2f159, 0xebd69b94, 0x8283b156, 0x00e0149a,
	0xeef3d130, 0x198e80f2, 0x56dffce7, 0x2406d9dc
	};

	/* A = (P[1]-P[0]) * (Q[1]-Q[0]) % p */
	sub(Py, Px, A);
	sub(Qy, Qx, C);
	mul(A, C, A);

	/* B = (P[1]+P[0]) * (Q[1]+Q[0]) % p */
	add(Py, Px, B);
	add(Qy, Qx, C);
	mul(B, C, B);

	/* C = 2 * P[3] * Q[3] * d % p */
	mul(Pt, Qt, C);
	mul(C, d2, C);

	/* D = 2 * P[2] * Q[2] % p */
	mul(Pz, Qz, D);
	add(D, D, D);
	modp(D, D);

	/* E = B - A */
	sub(B, A, E);

	/* F = D - C */
	sub(D, C, F);

	/* G = D + C */
	add(D, C, G);

	/* H = B + A */
	add(B, A, H);

	/* R = (EF, GH, FG, EH) */
	mul(E, F, Rx);
	mul(G, H, Ry);
	mul(F, G, Rz);
	mul(E, H, Rt);

	return R;
	}

	/* Q = s * P */
	static uint32_t *point_mul(const uint32_t s[8], const uint32_t P[32],
	uint32_t Q[32])
	{
	int i;
	uint32_t sP[32];
	for (i = 0; i < 32; i++) {
	sP[i] = P[i];
	Q[i] = 0;
	}
	Q[8] = Q[16] = 1;
	for (i = 0; i < 256; i++) {
	uint32_t R[32];
	const int swap = (s[i / 32] >> (i % 32)) & 1;
	point_add(Q, sP, R);
	cswap(Q, R, swap);
	cswap(Q + 8, R + 8, swap);
	cswap(Q + 16, R + 16, swap);
	cswap(Q + 24, R + 24, swap);
	point_add(sP, sP, sP);
	}
	return Q;
	}

	/* Compute a public key from secret */
	void ed25519_secret_to_public(const uint8_t secret[32], uint8_t public[32])
	{
	uint8_t prefix[32];
	uint32_t a[8], aB[32];
	secret_expand(secret, a, prefix);
	point_mul(a, ed25519_B, aB);
	point_encode(aB, public);
	}

	/* Constant-time EdDSA signature generation algorithm */
	void ed25519_sign(const uint8_t secret[32], const void *msg, size_t len,
	uint8_t signature[64])
	{
	struct sha512 ctx;
	uint32_t a[8], h[8], r[8], P[32];
	secret_expand(secret, a, signature);
	sha512_init(&ctx);
	sha512_update(&ctx, signature, 32);
	sha512_update(&ctx, msg, len);
	sha512_final_modq(&ctx, r);
	point_encode(point_mul(r, ed25519_B, P), signature);
	point_encode(point_mul(a, ed25519_B, P), signature + 32);
	sha512_init(&ctx);
	sha512_update(&ctx, signature, 64);
	sha512_update(&ctx, msg, len);
	sha512_final_modq(&ctx, h);
	export(mul_add_modq(h, a, r, r), signature + 32);
	}

	/* Variable-time EdDSA signature verification algorithm */
	int ed25519_verify(const uint8_t public[32], const void *msg, size_t len,
	const uint8_t signature[64])
	{
	struct sha512 ctx;
	uint32_t A[32], R[32], s[8], h[8], sB[32], hA[32];
	if (!point_decode(public, A))
	return 0;
	if (!point_decode(signature, R))
	return 0;
	import(signature + 32, s);
	if (cmp_vartime(s, ed25519_q) >= 0)
	return 0;
	sha512_init(&ctx);
	sha512_update(&ctx, signature, 32);
	sha512_update(&ctx, public, 32);
	sha512_update(&ctx, msg, len);
	sha512_final_modq(&ctx, h);
	point_mul(s, ed25519_B, sB);
	point_mul(h, A, hA);
	return point_equal(sB, point_add(R, hA, hA));
	}
	#include "sha512.h"

	#define ROR64(x, c) (((x) >> (c)) \| ((x) << (64 - (c))))

	#define LOAD64_BE(p) \
	( ((uint64_t)((p)[7]) << 0) \
	\| ((uint64_t)((p)[6]) << 8) \
	\| ((uint64_t)((p)[5]) << 16) \
	\| ((uint64_t)((p)[4]) << 24) \
	\| ((uint64_t)((p)[3]) << 32) \
	\| ((uint64_t)((p)[2]) << 40) \
	\| ((uint64_t)((p)[1]) << 48) \
	\| ((uint64_t)((p)[0]) << 56) \
	)
	#define STORE64_BE(p, v) \
	(p)[7] = ((v) >> 0) & 0xFF; \
	(p)[6] = ((v) >> 8) & 0xFF; \
	(p)[5] = ((v) >> 16) & 0xFF; \
	(p)[4] = ((v) >> 24) & 0xFF; \
	(p)[3] = ((v) >> 32) & 0xFF; \
	(p)[2] = ((v) >> 40) & 0xFF; \
	(p)[1] = ((v) >> 48) & 0xFF; \
	(p)[0] = ((v) >> 56) & 0xFF;

	void sha512_init(struct sha512 *ctx)
	{
	ctx->state[0] = 0x6a09e667f3bcc908; /* sqrt(2) */
	ctx->state[1] = 0xbb67ae8584caa73b; /* sqrt(3) */
	ctx->state[2] = 0x3c6ef372fe94f82b; /* sqrt(5) */
	ctx->state[3] = 0xa54ff53a5f1d36f1; /* sqrt(7) */
	ctx->state[4] = 0x510e527fade682d1; /* sqrt(11) */
	ctx->state[5] = 0x9b05688c2b3e6c1f; /* sqrt(13) */
	ctx->state[6] = 0x1f83d9abfb41bd6b; /* sqrt(17) */
	ctx->state[7] = 0x5be0cd19137e2179; /* sqrt(19) */
	ctx->bits[0] = ctx->bits[1] = 0;
	ctx->n = 0;
	}

	static void sha512_block(uint64_t state[8], const uint8_t p[128])
	{
	uint64_t a, b, c, d, e, f, g, h, s0, s1, S0, S1, t1, t2, w[16];
	static const uint64_t K512[80] = {
	0x428a2f98d728ae22, 0x7137449123ef65cd,
	0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
	0x3956c25bf348b538, 0x59f111f1b605d019,
	0x923f82a4af194f9b, 0xab1c5ed5da6d8118,
	0xd807aa98a3030242, 0x12835b0145706fbe,
	0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
	0x72be5d74f27b896f, 0x80deb1fe3b1696b1,
	0x9bdc06a725c71235, 0xc19bf174cf692694,
	0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
	0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,
	0x2de92c6f592b0275, 0x4a7484aa6ea6e483,
	0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
	0x983e5152ee66dfab, 0xa831c66d2db43210,
	0xb00327c898fb213f, 0xbf597fc7beef0ee4,
	0xc6e00bf33da88fc2, 0xd5a79147930aa725,
	0x06ca6351e003826f, 0x142929670a0e6e70,
	0x27b70a8546d22ffc, 0x2e1b21385c26c926,
	0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
	0x650a73548baf63de, 0x766a0abb3c77b2a8,
	0x81c2c92e47edaee6, 0x92722c851482353b,
	0xa2bfe8a14cf10364, 0xa81a664bbc423001,
	0xc24b8b70d0f89791, 0xc76c51a30654be30,
	0xd192e819d6ef5218, 0xd69906245565a910,
	0xf40e35855771202a, 0x106aa07032bbd1b8,
	0x19a4c116b8d2d0c8, 0x1e376c085141ab53,
	0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,
	0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
	0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,
	0x748f82ee5defb2fc, 0x78a5636f43172f60,
	0x84c87814a1f0ab72, 0x8cc702081a6439ec,
	0x90befffa23631e28, 0xa4506cebde82bde9,
	0xbef9a3f7b2c67915, 0xc67178f2e372532b,
	0xca273eceea26619c, 0xd186b8c721c0c207,
	0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,
	0x06f067aa72176fba, 0x0a637dc5a2c898a6,
	0x113f9804bef90dae, 0x1b710b35131c471b,
	0x28db77f523047d84, 0x32caab7b40c72493,
	0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
	0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
	0x5fcb6fab3ad6faec, 0x6c44198c4a475817
	};

	a = state[0]; b = state[1]; c = state[2]; d = state[3];
	e = state[4]; f = state[5]; g = state[6]; h = state[7];

	#define ROUND_CORE(i) \
	S0 = ROR64(a, 28) ^ ROR64(a, 34) ^ ROR64(a, 39); \
	S1 = ROR64(e, 14) ^ ROR64(e, 18) ^ ROR64(e, 41); \
	t1 = h + S1 + ((e & f) ^ (~e & g)) + K512[i] + w[i & 0xf]; \
	t2 = S0 + ((a & b) ^ (a & c) ^ (b & c)); \
	h = g; g = f; f = e; e = d + t1; \
	d = c; c = b; b = a; a = t1 + t2;

	#define ROUND_0_15(i) w[i] = LOAD64_BE(p); p += 8; ROUND_CORE(i)
	ROUND_0_15( 0) ROUND_0_15( 1) ROUND_0_15( 2) ROUND_0_15( 3)
	ROUND_0_15( 4) ROUND_0_15( 5) ROUND_0_15( 6) ROUND_0_15( 7)
	ROUND_0_15( 8) ROUND_0_15( 9) ROUND_0_15(10) ROUND_0_15(11)
	ROUND_0_15(12) ROUND_0_15(13) ROUND_0_15(14) ROUND_0_15(15)
	#undef ROUND_0_15

	#define ROUND_16_79(i) \
	s0 = w[(i + 1) & 0xf]; s1 = w[(i + 14) & 0xf]; \
	s0 = ROR64(s0, 1) ^ ROR64(s0, 8) ^ (s0 >> 7); \
	s1 = ROR64(s1, 19) ^ ROR64(s1, 61) ^ (s1 >> 6); \
	w[i & 0xf] += s0 + s1 + w[(i + 9) & 0xf]; ROUND_CORE(i)
	ROUND_16_79(16) ROUND_16_79(17) ROUND_16_79(18) ROUND_16_79(19)
	ROUND_16_79(20) ROUND_16_79(21) ROUND_16_79(22) ROUND_16_79(23)
	ROUND_16_79(24) ROUND_16_79(25) ROUND_16_79(26) ROUND_16_79(27)
	ROUND_16_79(28) ROUND_16_79(29) ROUND_16_79(30) ROUND_16_79(31)
	ROUND_16_79(32) ROUND_16_79(33) ROUND_16_79(34) ROUND_16_79(35)
	ROUND_16_79(36) ROUND_16_79(37) ROUND_16_79(38) ROUND_16_79(39)
	ROUND_16_79(40) ROUND_16_79(41) ROUND_16_79(42) ROUND_16_79(43)
	ROUND_16_79(44) ROUND_16_79(45) ROUND_16_79(46) ROUND_16_79(47)
	ROUND_16_79(48) ROUND_16_79(49) ROUND_16_79(50) ROUND_16_79(51)
	ROUND_16_79(52) ROUND_16_79(53) ROUND_16_79(54) ROUND_16_79(55)
	ROUND_16_79(56) ROUND_16_79(57) ROUND_16_79(58) ROUND_16_79(59)
	ROUND_16_79(60) ROUND_16_79(61) ROUND_16_79(62) ROUND_16_79(63)
	ROUND_16_79(64) ROUND_16_79(65) ROUND_16_79(66) ROUND_16_79(67)
	ROUND_16_79(68) ROUND_16_79(69) ROUND_16_79(70) ROUND_16_79(71)
	ROUND_16_79(72) ROUND_16_79(73) ROUND_16_79(74) ROUND_16_79(75)
	ROUND_16_79(76) ROUND_16_79(77) ROUND_16_79(78) ROUND_16_79(79)
	#undef ROUND_16_79
	#undef ROUND_CORE

	state[0] += a; state[1] += b; state[2] += c; state[3] += d;
	state[4] += e; state[5] += f; state[6] += g; state[7] += h;
	}

	void sha512_update(struct sha512 ctx, const void data, size_t n)
	{
	const uint8_t input = (const uint8_t )data;

	if (ctx->n \|\| n < 128) {
	while (n && ctx->n < 128) {
	ctx->buffer[ctx->n++] = *input++;
	n--;
	}
	if (ctx->n < 128)
	return;
	sha512_block(ctx->state, ctx->buffer);
	ctx->bits[1] += !(ctx->bits[0] += 1024);
	ctx->n = 0;
	}

	while (n >= 128) {
	sha512_block(ctx->state, input);
	ctx->bits[1] += !(ctx->bits[0] += 1024);
	input += 128;
	n -= 128;
	}

	while (n) {
	ctx->buffer[ctx->n++] = *input++;
	n--;
	}
	}

	void sha512_final(struct sha512 *ctx, uint8_t hash[64])
	{
	int i;
	uint8_t buf[256];
	ctx->bits[0] += ctx->n * 8;
	for (i = 1, buf[0] = 0x80; (ctx->n + i + 16) % 128; buf[i++] = 0);
	STORE64_BE(buf + i, ctx->bits[1]);
	STORE64_BE(buf + i + 8, ctx->bits[0]);
	sha512_update(ctx, buf, i + 16);
	STORE64_BE(hash + 0, ctx->state[0]);
	STORE64_BE(hash + 8, ctx->state[1]);
	STORE64_BE(hash + 16, ctx->state[2]);
	STORE64_BE(hash + 24, ctx->state[3]);
	STORE64_BE(hash + 32, ctx->state[4]);
	STORE64_BE(hash + 40, ctx->state[5]);
	STORE64_BE(hash + 48, ctx->state[6]);
	STORE64_BE(hash + 56, ctx->state[7]);
	}

	void sha512(const void *data, size_t n, uint8_t hash[64])
	{
	struct sha512 ctx;
	sha512_init(&ctx);
	sha512_update(&ctx, data, n);
	sha512_final(&ctx, hash);
	}
	#ifndef SHA512_H
	#define SHA512_H

	#include <stddef.h>
	#include <stdint.h>

	struct sha512 {
	uint64_t state[8];
	uint8_t buffer[128];
	uint64_t bits[2];
	int n;
	};

	void sha512_init(struct sha512 *ctx);
	void sha512_update(struct sha512 ctx, const void data, size_t n);
	void sha512_final(struct sha512 *ctx, uint8_t hash[64]);

	void sha512(const void *data, size_t n, uint8_t hash[64]);

	#endif