Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save userid/b3baba535298fffcc34392df938d8542 to your computer and use it in GitHub Desktop.

Select an option

Save userid/b3baba535298fffcc34392df938d8542 to your computer and use it in GitHub Desktop.
From 5f22ddce5c2f301e9e6b2d38315e44388cefce5d Mon Sep 17 00:00:00 2001
From: Thomas Deutschmann <whissi@whissi.de>
Date: Thu, 22 Sep 2016 19:01:48 +0200
Subject: [PATCH] Implementation of draft and RFC versions of CHACHA20-POLY1305
ciphers
Backport of https://github.com/cloudflare/sslconfig/commit/3afa6467ad3b15db2f678f8b5a1f817e56874602
---
Configure | 56 +-
Makefile.org | 4 +-
apps/speed.c | 30 +-
crypto/chacha20poly1305/Makefile | 97 +++
crypto/chacha20poly1305/asm/chacha20_avx.pl | 408 +++++++++++
crypto/chacha20poly1305/asm/chacha20_avx2.pl | 443 ++++++++++++
crypto/chacha20poly1305/asm/poly1305_avx.pl | 732 ++++++++++++++++++++
crypto/chacha20poly1305/asm/poly1305_avx2.pl | 984 +++++++++++++++++++++++++++
crypto/chacha20poly1305/asm/poly1305_x64.pl | 281 ++++++++
crypto/chacha20poly1305/chacha20.c | 162 +++++
crypto/chacha20poly1305/chacha20poly1305.h | 79 +++
crypto/chacha20poly1305/chapolytest.c | 470 +++++++++++++
crypto/chacha20poly1305/poly1305.c | 287 ++++++++
crypto/cryptlib.c | 10 -
crypto/evp/Makefile | 7 +-
crypto/evp/e_chacha20poly1305.c | 435 ++++++++++++
crypto/evp/evp.h | 4 +
ssl/s3_lib.c | 119 ++++
ssl/ssl.h | 2 +
ssl/ssl_ciph.c | 60 +-
ssl/ssl_locl.h | 2 +
ssl/tls1.h | 28 +
22 files changed, 4646 insertions(+), 54 deletions(-)
create mode 100644 crypto/chacha20poly1305/Makefile
create mode 100644 crypto/chacha20poly1305/asm/chacha20_avx.pl
create mode 100644 crypto/chacha20poly1305/asm/chacha20_avx2.pl
create mode 100644 crypto/chacha20poly1305/asm/poly1305_avx.pl
create mode 100644 crypto/chacha20poly1305/asm/poly1305_avx2.pl
create mode 100644 crypto/chacha20poly1305/asm/poly1305_x64.pl
create mode 100644 crypto/chacha20poly1305/chacha20.c
create mode 100644 crypto/chacha20poly1305/chacha20poly1305.h
create mode 100644 crypto/chacha20poly1305/chapolytest.c
create mode 100644 crypto/chacha20poly1305/poly1305.c
create mode 100644 crypto/evp/e_chacha20poly1305.c
diff --git a/Configure b/Configure
index c39f71a..b139783 100755
--- a/Configure
+++ b/Configure
@@ -150,25 +150,25 @@ my $tlib="-lnsl -lsocket";
my $bits1="THIRTY_TWO_BIT ";
my $bits2="SIXTY_FOUR_BIT ";
-my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o::des-586.o crypt586.o:aes-586.o vpaes-x86.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o:ghash-x86.o:";
+my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o::des-586.o crypt586.o:aes-586.o vpaes-x86.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o:cmll-x86.o:ghash-x86.o::";
my $x86_elf_asm="$x86_asm:elf";
-my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o:ecp_nistz256.o ecp_nistz256-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:";
-my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void";
-my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o::des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o::void";
-my $sparcv8_asm=":sparcv8.o::des_enc-sparc.o fcrypt_b.o:::::::::::::void";
-my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::sha1-alpha.o:::::::ghash-alpha.o::void";
-my $mips64_asm=":bn-mips.o mips-mont.o:::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::";
+my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o rsaz_exp.o rsaz-x86_64.o rsaz-avx2.o:ecp_nistz256.o ecp_nistz256-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o aesni-mb-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o sha1-mb-x86_64.o sha256-mb-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o::chacha20_avx.o chacha20_avx2.o poly1305_x64.o poly1305_avx2.o";
+my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o:::void";
+my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o vis3-mont.o sparct4-mont.o sparcv9-gf2m.o::des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o aest4-sparcv9.o::md5-sparcv9.o:sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o::::::camellia.o cmll_misc.o cmll_cbc.o cmllt4-sparcv9.o:ghash-sparcv9.o:::void";
+my $sparcv8_asm=":sparcv8.o::des_enc-sparc.o fcrypt_b.o::::::::::::::void";
+my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::sha1-alpha.o:::::::ghash-alpha.o:::void";
+my $mips64_asm=":bn-mips.o mips-mont.o:::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o:::::::::";
my $mips32_asm=$mips64_asm; $mips32_asm =~ s/\s*sha512\-mips\.o//;
-my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o:::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:";
-my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o:::aes_cbc.o aes-armv4.o bsaes-armv7.o aesv8-armx.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o ghashv8-armx.o::void";
-my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o::::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o:";
-my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32";
-my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64";
-my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o:::aes_core.o aes_cbc.o aes-ppc.o vpaes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:";
+my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o:::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o::";
+my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o:::aes_cbc.o aes-armv4.o bsaes-armv7.o aesv8-armx.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o ghashv8-armx.o:::void";
+my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o::::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o::";
+my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:::32";
+my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o:::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:::64";
+my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o:::aes_core.o aes_cbc.o aes-ppc.o vpaes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o::";
my $ppc32_asm=$ppc64_asm;
-my $no_asm="::::::::::::::::void";
+my $no_asm=":::::::::::::::::void";
# As for $BSDthreads. Idea is to maintain "collective" set of flags,
# which would cover all BSD flavors. -pthread applies to them all,
@@ -220,7 +220,7 @@ my %table=(
"debug-linux-ppro","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DL_ENDIAN -g -mcpu=pentiumpro -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn",
"debug-linux-elf","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DL_ENDIAN -g -march=i486 -Wall::-D_REENTRANT::-lefence -ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"debug-linux-elf-noefence","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DL_ENDIAN -g -march=i486 -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
-"debug-linux-ia32-aes", "gcc:-DAES_EXPERIMENTAL -DL_ENDIAN -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:x86cpuid.o:bn-586.o co-586.o x86-mont.o::des-586.o crypt586.o:aes_x86core.o aes_cbc.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o::ghash-x86.o::elf:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"debug-linux-ia32-aes", "gcc:-DAES_EXPERIMENTAL -DL_ENDIAN -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:x86cpuid.o:bn-586.o co-586.o x86-mont.o::des-586.o crypt586.o:aes_x86core.o aes_cbc.o aesni-x86.o:bf-586.o:md5-586.o:sha1-586.o sha256-586.o sha512-586.o:cast-586.o:rc4-586.o:rmd-586.o:rc5-586.o:wp_block.o wp-mmx.o::ghash-x86.o:::elf:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"debug-linux-generic32","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -g -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"debug-linux-generic64","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DTERMIO -g -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"debug-linux-x86_64","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -m64 -DL_ENDIAN -g -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
@@ -327,7 +327,7 @@ my %table=(
"hpux-parisc-gcc","gcc:-O3 -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-Wl,+s -ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1:${no_asm}:dl:hpux-shared:-fPIC:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"hpux-parisc1_1-gcc","gcc:-O3 -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-Wl,+s -ldld:BN_LLONG DES_PTR DES_UNROLL DES_RISC1:${parisc11_asm}:dl:hpux-shared:-fPIC:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/pa1.1",
"hpux-parisc2-gcc","gcc:-march=2.0 -O3 -DB_ENDIAN -D_REENTRANT::::-Wl,+s -ldld:SIXTY_FOUR_BIT RC4_CHAR RC4_CHUNK DES_PTR DES_UNROLL DES_RISC1:".eval{my $asm=$parisc20_asm;$asm=~s/2W\./2\./;$asm=~s/:64/:32/;$asm}.":dl:hpux-shared:-fPIC:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/pa20_32",
-"hpux64-parisc2-gcc","gcc:-O3 -DB_ENDIAN -D_REENTRANT::::-ldl:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::pa-risc2W.o:::::::::::::::void:dlfcn:hpux-shared:-fpic:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/pa20_64",
+"hpux64-parisc2-gcc","gcc:-O3 -DB_ENDIAN -D_REENTRANT::::-ldl:SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT::pa-risc2W.o::::::::::::::::void:dlfcn:hpux-shared:-fpic:-shared:.sl.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/pa20_64",
# More attempts at unified 10.X and 11.X targets for HP C compiler.
#
@@ -584,9 +584,9 @@ my %table=(
# Visual C targets
#
# Win64 targets, WIN64I denotes IA-64 and WIN64A - AMD64
-"VC-WIN64I","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o ia64-mont.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ghash-ia64.o::ias:win32",
+"VC-WIN64I","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o ia64-mont.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::::::::ghash-ia64.o::ias:win32",
"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:".eval{my $asm=$x86_64_asm;$asm=~s/x86_64-gcc\.o/bn_asm.o/;$asm}.":auto:win32",
-"debug-VC-WIN64I","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ghash-ia64.o::ias:win32",
+"debug-VC-WIN64I","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o:::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::::::::ghash-ia64.o::ias:win32",
"debug-VC-WIN64A","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:".eval{my $asm=$x86_64_asm;$asm=~s/x86_64-gcc\.o/bn_asm.o/;$asm}.":auto:win32",
# x86 Win32 target defaults to ANSI API, if you want UNICODE, complement
# 'perl Configure VC-WIN32' with '-DUNICODE -D_UNICODE'
@@ -714,6 +714,7 @@ my $idx_wp_obj = $idx++;
my $idx_cmll_obj = $idx++;
my $idx_modes_obj = $idx++;
my $idx_engines_obj = $idx++;
+my $idx_chapoly_obj = $idx++;
my $idx_perlasm_scheme = $idx++;
my $idx_dso_scheme = $idx++;
my $idx_shared_target = $idx++;
@@ -756,6 +757,7 @@ my $bf ="crypto/bf/bf_locl.h";
my $bn_asm ="bn_asm.o";
my $des_enc="des_enc.o fcrypt_b.o";
my $aes_enc="aes_core.o aes_cbc.o";
+my $chapoly_enc="";
my $bf_enc ="bf_enc.o";
my $cast_enc="c_enc.o";
my $rc4_enc="rc4_enc.o rc4_skey.o";
@@ -1211,7 +1213,7 @@ $openssldir=$prefix . "/" . $openssldir if $openssldir !~ /(^\/|^[a-zA-Z]:[\\\/]
print "IsMK1MF=$IsMK1MF\n";
-my @fields = split(/\s*:\s*/,$table{$target} . ":" x 30 , -1);
+my @fields = split(/\s*:\s*/,$table{$target} . ":" x 31 , -1);
my $cc = $fields[$idx_cc];
# Allow environment CC to override compiler...
if($ENV{CC}) {
@@ -1240,6 +1242,7 @@ my $wp_obj = $fields[$idx_wp_obj];
my $cmll_obj = $fields[$idx_cmll_obj];
my $modes_obj = $fields[$idx_modes_obj];
my $engines_obj = $fields[$idx_engines_obj];
+my $chapoly_obj = $fields[$idx_chapoly_obj];
my $perlasm_scheme = $fields[$idx_perlasm_scheme];
my $dso_scheme = $fields[$idx_dso_scheme];
my $shared_target = $fields[$idx_shared_target];
@@ -1407,7 +1410,7 @@ if ($no_asm)
{
$cpuid_obj=$bn_obj=$ec_obj=
$des_obj=$aes_obj=$bf_obj=$cast_obj=$rc4_obj=$rc5_obj=$cmll_obj=
- $modes_obj=$sha1_obj=$md5_obj=$rmd160_obj=$wp_obj=$engines_obj="";
+ $modes_obj=$sha1_obj=$md5_obj=$rmd160_obj=$wp_obj=$engines_obj=$chapoly_obj="";
}
if (!$no_shared)
@@ -1567,6 +1570,14 @@ else
{
$rc4_obj=$rc4_enc;
}
+if ($chapoly_obj =~ /\.o$/)
+ {
+ $cflags.=" -DCHAPOLY_x86_64_ASM";
+ }
+else
+ {
+ $chapoly_obj=$chapoly_enc;
+ }
if ($sha1_obj =~ /\.o$/)
{
# $sha1_obj=$sha1_enc;
@@ -1751,6 +1762,7 @@ while (<IN>)
s/^WP_ASM_OBJ=.*$/WP_ASM_OBJ= $wp_obj/;
s/^CMLL_ENC=.*$/CMLL_ENC= $cmll_obj/;
s/^MODES_ASM_OBJ.=*$/MODES_ASM_OBJ= $modes_obj/;
+ s/^CHAPOLY_ENC=.*$/CHAPOLY_ENC= $chapoly_obj/;
s/^ENGINES_ASM_OBJ.=*$/ENGINES_ASM_OBJ= $engines_obj/;
s/^PERLASM_SCHEME=.*$/PERLASM_SCHEME= $perlasm_scheme/;
s/^PROCESSOR=.*/PROCESSOR= $processor/;
@@ -1813,6 +1825,7 @@ print "RMD160_OBJ_ASM=$rmd160_obj\n";
print "CMLL_ENC =$cmll_obj\n";
print "MODES_OBJ =$modes_obj\n";
print "ENGINES_OBJ =$engines_obj\n";
+print "CHAPOLY_ENC =$chapoly_obj\n";
print "PROCESSOR =$processor\n";
print "RANLIB =$ranlib\n";
print "ARFLAGS =$arflags\n";
@@ -2211,7 +2224,7 @@ sub print_table_entry
my ($cc, $cflags, $unistd, $thread_cflag, $sys_id, $lflags,
$bn_ops, $cpuid_obj, $bn_obj, $ec_obj, $des_obj, $aes_obj, $bf_obj,
$md5_obj, $sha1_obj, $cast_obj, $rc4_obj, $rmd160_obj,
- $rc5_obj, $wp_obj, $cmll_obj, $modes_obj, $engines_obj,
+ $rc5_obj, $wp_obj, $cmll_obj, $modes_obj, $engines_obj, $chapoly_obj,
$perlasm_scheme, $dso_scheme, $shared_target, $shared_cflag,
$shared_ldflag, $shared_extension, $ranlib, $arflags, $multilib)=
split(/\s*:\s*/,$table{$target} . ":" x 30 , -1);
@@ -2242,6 +2255,7 @@ sub print_table_entry
\$cmll_obj = $cmll_obj
\$modes_obj = $modes_obj
\$engines_obj = $engines_obj
+\$chapoly_obj = $chapoly_obj
\$perlasm_scheme = $perlasm_scheme
\$dso_scheme = $dso_scheme
\$shared_target= $shared_target
diff --git a/Makefile.org b/Makefile.org
index 2377f50..33f4cd1 100644
--- a/Makefile.org
+++ b/Makefile.org
@@ -92,6 +92,7 @@ BN_ASM= bn_asm.o
EC_ASM=
DES_ENC= des_enc.o fcrypt_b.o
AES_ENC= aes_core.o aes_cbc.o
+CHAPOLY_ENC=
BF_ENC= bf_enc.o
CAST_ENC= c_enc.o
RC4_ENC= rc4_enc.o
@@ -149,7 +150,7 @@ SDIRS= \
bn ec rsa dsa ecdsa dh ecdh dso engine \
buffer bio stack lhash rand err \
evp asn1 pem x509 x509v3 conf txt_db pkcs7 pkcs12 comp ocsp ui krb5 \
- cms pqueue ts jpake srp store cmac
+ cms pqueue ts jpake srp store cmac chacha20poly1305
# keep in mind that the above list is adjusted by ./Configure
# according to no-xxx arguments...
@@ -236,6 +237,7 @@ BUILDENV= LC_ALL=C PLATFORM='$(PLATFORM)' PROCESSOR='$(PROCESSOR)'\
WP_ASM_OBJ='$(WP_ASM_OBJ)' \
MODES_ASM_OBJ='$(MODES_ASM_OBJ)' \
ENGINES_ASM_OBJ='$(ENGINES_ASM_OBJ)' \
+ CHAPOLY_ENC='$(CHAPOLY_ENC)' \
PERLASM_SCHEME='$(PERLASM_SCHEME)' \
FIPSLIBDIR='${FIPSLIBDIR}' \
FIPSDIR='${FIPSDIR}' \
diff --git a/apps/speed.c b/apps/speed.c
index b862868..28a5589 100644
--- a/apps/speed.c
+++ b/apps/speed.c
@@ -1,4 +1,4 @@
-/* apps/speed.c */
+/* apps/speed.c -*- mode:C; c-file-style: "eay" -*- */
/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
* All rights reserved.
*
@@ -226,7 +226,7 @@
# endif
# undef BUFSIZE
-# define BUFSIZE ((long)1024*8+1)
+# define BUFSIZE ((long)1024*8+16)
static volatile int run = 0;
static int mr = 0;
@@ -241,7 +241,7 @@ static void print_result(int alg, int run_no, int count, double time_used);
static int do_multi(int multi);
# endif
-# define ALGOR_NUM 30
+# define ALGOR_NUM 31
# define SIZE_NUM 5
# define RSA_NUM 4
# define DSA_NUM 3
@@ -256,7 +256,7 @@ static const char *names[ALGOR_NUM] = {
"aes-128 cbc", "aes-192 cbc", "aes-256 cbc",
"camellia-128 cbc", "camellia-192 cbc", "camellia-256 cbc",
"evp", "sha256", "sha512", "whirlpool",
- "aes-128 ige", "aes-192 ige", "aes-256 ige", "ghash"
+ "aes-128 ige", "aes-192 ige", "aes-256 ige", "ghash", "chacha20-poly1305"
};
static double results[ALGOR_NUM][SIZE_NUM];
@@ -516,6 +516,7 @@ int MAIN(int argc, char **argv)
# define D_IGE_192_AES 27
# define D_IGE_256_AES 28
# define D_GHASH 29
+# define D_CHAPOLY 30
double d = 0.0;
long c[ALGOR_NUM][SIZE_NUM];
# define R_DSA_512 0
@@ -972,6 +973,9 @@ int MAIN(int argc, char **argv)
doit[D_CBC_256_CML] = 1;
} else
# endif
+ if (strcmp(*argv, "chacha20-poly1305") == 0) {
+ doit[D_CHAPOLY] = 1;
+ } else
# ifndef OPENSSL_NO_RSA
if (strcmp(*argv, "rsa") == 0) {
rsa_doit[R_RSA_512] = 1;
@@ -1139,6 +1143,7 @@ int MAIN(int argc, char **argv)
BIO_printf(bio_err, "rc4");
# endif
BIO_printf(bio_err, "\n");
+ BIO_printf(bio_err, "chacha20-poly1305\n");
# ifndef OPENSSL_NO_RSA
BIO_printf(bio_err, "rsa512 rsa1024 rsa2048 rsa4096\n");
@@ -1370,6 +1375,7 @@ int MAIN(int argc, char **argv)
c[D_IGE_192_AES][0] = count;
c[D_IGE_256_AES][0] = count;
c[D_GHASH][0] = count;
+ c[D_CHAPOLY][0] = count;
for (i = 1; i < SIZE_NUM; i++) {
c[D_MD2][i] = c[D_MD2][0] * 4 * lengths[0] / lengths[i];
@@ -1821,6 +1827,22 @@ int MAIN(int argc, char **argv)
CRYPTO_gcm128_release(ctx);
}
# endif
+ if (doit[D_CHAPOLY]) {
+ EVP_CIPHER_CTX ctx;
+ EVP_CIPHER_CTX_init(&ctx);
+ EVP_CipherInit_ex(&ctx, EVP_chacha20_poly1305(), NULL, key32, iv, 1);
+
+ for (j = 0; j < SIZE_NUM; j++) {
+ print_message(names[D_CHAPOLY], c[D_CHAPOLY][j], lengths[j]);
+ Time_F(START);
+ for (count = 0, run = 1; COND(c[D_CHAPOLY][j]); count++) {
+ EVP_CIPHER_CTX_ctrl(&ctx, EVP_CTRL_AEAD_TLS1_AAD, 13, buf);
+ EVP_Cipher(&ctx, buf, buf, (unsigned long)lengths[j] + 16);
+ }
+ d = Time_F(STOP);
+ print_result(D_CHAPOLY, j, count, d);
+ }
+ }
# ifndef OPENSSL_NO_CAMELLIA
if (doit[D_CBC_128_CML]) {
for (j = 0; j < SIZE_NUM; j++) {
diff --git a/crypto/chacha20poly1305/Makefile b/crypto/chacha20poly1305/Makefile
new file mode 100644
index 0000000..446eb27
--- /dev/null
+++ b/crypto/chacha20poly1305/Makefile
@@ -0,0 +1,97 @@
+#
+# crypto/chacha20poly1305/Makefile
+#
+
+DIR= chacha20poly1305
+TOP= ../..
+CC= cc
+CPP= $(CC) -E
+INCLUDES=
+CFLAG=-g
+MAKEFILE= Makefile
+AR= ar r
+
+CHAPOLY_ENC=
+
+CFLAGS= $(INCLUDES) $(CFLAG)
+ASFLAGS= $(INCLUDES) $(ASFLAG)
+AFLAGS= $(ASFLAGS)
+
+GENERAL=Makefile
+TEST=chapolytest.c
+APPS=
+
+LIB=$(TOP)/libcrypto.a
+LIBSRC=chacha20.c poly1305.c
+LIBOBJ=chacha20.o poly1305.o $(CHAPOLY_ENC)
+
+SRC= $(LIBSRC)
+
+EXHEADER=chacha20poly1305.h
+HEADER= $(EXHEADER)
+
+ALL= $(GENERAL) $(SRC) $(HEADER)
+
+top:
+ (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all)
+
+all: lib
+
+lib: $(LIBOBJ)
+ $(AR) $(LIB) $(LIBOBJ)
+ $(RANLIB) $(LIB) || echo Never mind.
+ @touch lib
+
+poly1305_x64.s:asm/poly1305_x64.pl
+ $(PERL) asm/poly1305_x64.pl $(PERLASM_SCHEME) > $@
+chacha20_avx.s:asm/chacha20_avx.pl
+ $(PERL) asm/chacha20_avx.pl $(PERLASM_SCHEME) > $@
+poly1305_avx.s:asm/poly1305_avx.pl
+ $(PERL) asm/poly1305_avx.pl $(PERLASM_SCHEME) > $@
+chacha20_avx2.s:asm/chacha20_avx2.pl
+ $(PERL) asm/chacha20_avx2.pl $(PERLASM_SCHEME) > $@
+poly1305_avx2.s:asm/poly1305_avx2.pl
+ $(PERL) asm/poly1305_avx2.pl $(PERLASM_SCHEME) > $@
+
+files:
+ $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
+
+links:
+ @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
+ @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST)
+ @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS)
+
+install:
+ @[ -n "$(INSTALLTOP)" ] # should be set by top Makefile...
+ @headerlist="$(EXHEADER)"; for i in $$headerlist ; \
+ do \
+ (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \
+ chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \
+ done;
+
+tags:
+ ctags $(SRC)
+
+tests:
+
+chapolytest: top chapolytest.c $(LIB)
+ $(CC) $(CFLAGS) -Wall -Werror -g -o chapolytest cahpolytest.c $(LIB)
+
+lint:
+ lint -DLINT $(INCLUDES) $(SRC)>fluff
+
+depend:
+ @[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile...
+ $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
+
+dclean:
+ $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
+ mv -f Makefile.new $(MAKEFILE)
+
+clean:
+ rm -f *.s *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
+
+# DO NOT DELETE THIS LINE -- make depend depends on it.
+
+chacha20.o: ../../include/openssl/chacha20poly1305.h chacha20.c
+poly1305.o: ../../include/openssl/chacha20poly1305.h poly1305.c
diff --git a/crypto/chacha20poly1305/asm/chacha20_avx.pl b/crypto/chacha20poly1305/asm/chacha20_avx.pl
new file mode 100644
index 0000000..bf3e3f0
--- /dev/null
+++ b/crypto/chacha20poly1305/asm/chacha20_avx.pl
@@ -0,0 +1,408 @@
+#!/usr/bin/env perl
+
+##############################################################################
+# #
+# Copyright 2014 Intel Corporation #
+# #
+# Licensed under the Apache License, Version 2.0 (the "License"); #
+# you may not use this file except in compliance with the License. #
+# You may obtain a copy of the License at #
+# #
+# http://www.apache.org/licenses/LICENSE-2.0 #
+# #
+# Unless required by applicable law or agreed to in writing, software #
+# distributed under the License is distributed on an "AS IS" BASIS, #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
+# See the License for the specific language governing permissions and #
+# limitations under the License. #
+# #
+##############################################################################
+# #
+# Developers and authors: #
+# Shay Gueron (1, 2), and Vlad Krasnov (1) #
+# (1) Intel Corporation, Israel Development Center #
+# (2) University of Haifa #
+# #
+# Related work: #
+# M. Goll, S. Gueron, "Vectorization on ChaCha Stream Cipher", IEEE #
+# Proceedings of 11th International Conference on Information #
+# Technology: New Generations (ITNG 2014), 612-615 (2014). #
+# M. Goll, S. Gueron, "Vectorization on Poly1305 Message Authentication Code"#
+# to be published. #
+# A. Langley, chacha20poly1305 for the AEAD head #
+# https://git.openssl.org/gitweb/?p=openssl.git;a=commit;h=9a8646510b3d0a48e950748f7a2aaa12ed40d5e0 #
+##############################################################################
+
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $avx = ($ver>=3.0) + ($ver>=3.01);
+}
+
+if ($avx>=1) {{
+
+my ($rol8, $rol16, $state_cdef, $tmp,
+ $v0, $v1, $v2, $v3, $v4, $v5, $v6, $v7,
+ $v8, $v9, $v10, $v11)=map("%xmm$_",(0..15));
+
+sub chacha_qr {
+
+my ($a,$b,$c,$d)=@_;
+$code.=<<___;
+
+ vpaddd $b, $a, $a # a += b
+ vpxor $a, $d, $d # d ^= a
+ vpshufb $rol16, $d, $d # d <<<= 16
+
+ vpaddd $d, $c, $c # c += d
+ vpxor $c, $b, $b # b ^= c
+ vpslld \$12, $b, $tmp
+ vpsrld \$20, $b, $b
+ vpxor $tmp, $b, $b # b <<<= 12
+
+ vpaddd $b, $a, $a # a += b
+ vpxor $a, $d, $d # d ^= a
+ vpshufb $rol8, $d, $d # d <<<= 8
+
+ vpaddd $d, $c, $c # c += d
+ vpxor $c, $b, $b # b ^= c
+
+ vpslld \$7, $b, $tmp
+ vpsrld \$25, $b, $b
+ vpxor $tmp, $b, $b # b <<<= 7
+___
+
+}
+
+$code.=<<___;
+
+.text
+.align 16
+chacha20_consts:
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.rol8:
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.rol16:
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.avxInc:
+.quad 1,0
+
+___
+
+{
+
+
+my ($out, $in, $in_len, $key_ptr, $nr)
+ =("%rdi", "%rsi", "%rdx", "%rcx", "%r8");
+
+$code.=<<___;
+.globl chacha_20_core_avx
+.type chacha_20_core_avx ,\@function,2
+.align 64
+chacha_20_core_avx:
+ vzeroupper
+
+ # Init state
+ vmovdqa .rol8(%rip), $rol8
+ vmovdqa .rol16(%rip), $rol16
+ vmovdqu 16*2($key_ptr), $state_cdef
+
+2:
+ cmp \$64*3, $in_len
+ jb 2f
+
+ vmovdqa chacha20_consts(%rip), $v0
+ vmovdqu 16*0($key_ptr), $v1
+ vmovdqu 16*1($key_ptr), $v2
+ vmovdqa $state_cdef, $v3
+
+ vmovdqa $v0, $v4
+ vmovdqa $v0, $v8
+
+ vmovdqa $v1, $v5
+ vmovdqa $v1, $v9
+
+ vmovdqa $v2, $v6
+ vmovdqa $v2, $v10
+
+ vpaddq .avxInc(%rip), $v3, $v7
+ vpaddq .avxInc(%rip), $v7, $v11
+
+ mov \$10, $nr
+
+ 1:
+___
+
+ &chacha_qr( $v0, $v1, $v2, $v3);
+ &chacha_qr( $v4, $v5, $v6, $v7);
+ &chacha_qr( $v8, $v9,$v10,$v11);
+
+$code.=<<___;
+ vpalignr \$4, $v1, $v1, $v1
+ vpalignr \$8, $v2, $v2, $v2
+ vpalignr \$12, $v3, $v3, $v3
+ vpalignr \$4, $v5, $v5, $v5
+ vpalignr \$8, $v6, $v6, $v6
+ vpalignr \$12, $v7, $v7, $v7
+ vpalignr \$4, $v9, $v9, $v9
+ vpalignr \$8, $v10, $v10, $v10
+ vpalignr \$12, $v11, $v11, $v11
+___
+
+ &chacha_qr( $v0, $v1, $v2, $v3);
+ &chacha_qr( $v4, $v5, $v6, $v7);
+ &chacha_qr( $v8, $v9,$v10,$v11);
+
+$code.=<<___;
+ vpalignr \$12, $v1, $v1, $v1
+ vpalignr \$8, $v2, $v2, $v2
+ vpalignr \$4, $v3, $v3, $v3
+ vpalignr \$12, $v5, $v5, $v5
+ vpalignr \$8, $v6, $v6, $v6
+ vpalignr \$4, $v7, $v7, $v7
+ vpalignr \$12, $v9, $v9, $v9
+ vpalignr \$8, $v10, $v10, $v10
+ vpalignr \$4, $v11, $v11, $v11
+
+ dec $nr
+
+ jnz 1b
+
+ vpaddd chacha20_consts(%rip), $v0, $v0
+ vpaddd chacha20_consts(%rip), $v4, $v4
+ vpaddd chacha20_consts(%rip), $v8, $v8
+
+ vpaddd 16*0($key_ptr), $v1, $v1
+ vpaddd 16*0($key_ptr), $v5, $v5
+ vpaddd 16*0($key_ptr), $v9, $v9
+
+ vpaddd 16*1($key_ptr), $v2, $v2
+ vpaddd 16*1($key_ptr), $v6, $v6
+ vpaddd 16*1($key_ptr), $v10, $v10
+
+ vpaddd $state_cdef, $v3, $v3
+ vpaddq .avxInc(%rip), $state_cdef, $state_cdef
+ vpaddd $state_cdef, $v7, $v7
+ vpaddq .avxInc(%rip), $state_cdef, $state_cdef
+ vpaddd $state_cdef, $v11, $v11
+ vpaddq .avxInc(%rip), $state_cdef, $state_cdef
+
+ vpxor 16*0($in), $v0, $v0
+ vpxor 16*1($in), $v1, $v1
+ vpxor 16*2($in), $v2, $v2
+ vpxor 16*3($in), $v3, $v3
+
+ vmovdqu $v0, 16*0($out)
+ vmovdqu $v1, 16*1($out)
+ vmovdqu $v2, 16*2($out)
+ vmovdqu $v3, 16*3($out)
+
+ vpxor 16*4($in), $v4, $v4
+ vpxor 16*5($in), $v5, $v5
+ vpxor 16*6($in), $v6, $v6
+ vpxor 16*7($in), $v7, $v7
+
+ vmovdqu $v4, 16*4($out)
+ vmovdqu $v5, 16*5($out)
+ vmovdqu $v6, 16*6($out)
+ vmovdqu $v7, 16*7($out)
+
+ vpxor 16*8($in), $v8, $v8
+ vpxor 16*9($in), $v9, $v9
+ vpxor 16*10($in), $v10, $v10
+ vpxor 16*11($in), $v11, $v11
+
+ vmovdqu $v8, 16*8($out)
+ vmovdqu $v9, 16*9($out)
+ vmovdqu $v10, 16*10($out)
+ vmovdqu $v11, 16*11($out)
+
+ lea 16*12($in), $in
+ lea 16*12($out), $out
+ sub \$16*12, $in_len
+
+ jmp 2b
+
+2:
+ cmp \$64*2, $in_len
+ jb 2f
+
+ vmovdqa chacha20_consts(%rip), $v0
+ vmovdqa chacha20_consts(%rip), $v4
+ vmovdqu 16*0($key_ptr), $v1
+ vmovdqu 16*0($key_ptr), $v5
+ vmovdqu 16*1($key_ptr), $v2
+ vmovdqu 16*1($key_ptr), $v6
+ vmovdqu 16*1($key_ptr), $v10
+ vmovdqa $state_cdef, $v3
+ vpaddq .avxInc(%rip), $v3, $v7
+
+ mov \$10, $nr
+
+ 1:
+___
+
+ &chacha_qr($v0,$v1,$v2,$v3);
+ &chacha_qr($v4,$v5,$v6,$v7);
+
+$code.=<<___;
+ vpalignr \$4, $v1, $v1, $v1
+ vpalignr \$8, $v2, $v2, $v2
+ vpalignr \$12, $v3, $v3, $v3
+ vpalignr \$4, $v5, $v5, $v5
+ vpalignr \$8, $v6, $v6, $v6
+ vpalignr \$12, $v7, $v7, $v7
+___
+
+ &chacha_qr($v0,$v1,$v2,$v3);
+ &chacha_qr($v4,$v5,$v6,$v7);
+
+$code.=<<___;
+ vpalignr \$12, $v1, $v1, $v1
+ vpalignr \$8, $v2, $v2, $v2
+ vpalignr \$4, $v3, $v3, $v3
+ vpalignr \$12, $v5, $v5, $v5
+ vpalignr \$8, $v6, $v6, $v6
+ vpalignr \$4, $v7, $v7, $v7
+
+ dec $nr
+
+ jnz 1b
+
+ vpaddd chacha20_consts(%rip), $v0, $v0
+ vpaddd chacha20_consts(%rip), $v4, $v4
+
+ vpaddd 16*0($key_ptr), $v1, $v1
+ vpaddd 16*0($key_ptr), $v5, $v5
+
+ vpaddd 16*1($key_ptr), $v2, $v2
+ vpaddd 16*1($key_ptr), $v6, $v6
+
+ vpaddd $state_cdef, $v3, $v3
+ vpaddq .avxInc(%rip), $state_cdef, $state_cdef
+ vpaddd $state_cdef, $v7, $v7
+ vpaddq .avxInc(%rip), $state_cdef, $state_cdef
+
+ vpxor 16*0($in), $v0, $v0
+ vpxor 16*1($in), $v1, $v1
+ vpxor 16*2($in), $v2, $v2
+ vpxor 16*3($in), $v3, $v3
+
+ vmovdqu $v0, 16*0($out)
+ vmovdqu $v1, 16*1($out)
+ vmovdqu $v2, 16*2($out)
+ vmovdqu $v3, 16*3($out)
+
+ vpxor 16*4($in), $v4, $v4
+ vpxor 16*5($in), $v5, $v5
+ vpxor 16*6($in), $v6, $v6
+ vpxor 16*7($in), $v7, $v7
+
+ vmovdqu $v4, 16*4($out)
+ vmovdqu $v5, 16*5($out)
+ vmovdqu $v6, 16*6($out)
+ vmovdqu $v7, 16*7($out)
+
+ lea 16*8($in), $in
+ lea 16*8($out), $out
+ sub \$16*8, $in_len
+
+ jmp 2b
+2:
+ cmp \$64, $in_len
+ jb 2f
+
+ vmovdqa chacha20_consts(%rip), $v0
+ vmovdqu 16*0($key_ptr), $v1
+ vmovdqu 16*1($key_ptr), $v2
+ vmovdqa $state_cdef, $v3
+
+ mov \$10, $nr
+
+ 1:
+___
+
+ &chacha_qr($v0,$v1,$v2,$v3);
+
+$code.=<<___;
+ vpalignr \$4, $v1, $v1, $v1
+ vpalignr \$8, $v2, $v2, $v2
+ vpalignr \$12, $v3, $v3, $v3
+___
+
+ &chacha_qr($v0,$v1,$v2,$v3);
+$code.=<<___;
+ vpalignr \$12, $v1, $v1, $v1
+ vpalignr \$8, $v2, $v2, $v2
+ vpalignr \$4, $v3, $v3, $v3
+
+ dec $nr
+ jnz 1b
+
+ vpaddd chacha20_consts(%rip), $v0, $v0
+ vpaddd 16*0($key_ptr), $v1, $v1
+ vpaddd 16*1($key_ptr), $v2, $v2
+ vpaddd $state_cdef, $v3, $v3
+ vpaddq .avxInc(%rip), $state_cdef, $state_cdef
+
+ vpxor 16*0($in), $v0, $v0
+ vpxor 16*1($in), $v1, $v1
+ vpxor 16*2($in), $v2, $v2
+ vpxor 16*3($in), $v3, $v3
+
+ vmovdqu $v0, 16*0($out)
+ vmovdqu $v1, 16*1($out)
+ vmovdqu $v2, 16*2($out)
+ vmovdqu $v3, 16*3($out)
+
+ lea 16*4($in), $in
+ lea 16*4($out), $out
+ sub \$16*4, $in_len
+ jmp 2b
+
+2:
+ vmovdqu $state_cdef, 16*2($key_ptr)
+
+ vzeroupper
+ ret
+.size chacha_20_core_avx,.-chacha_20_core_avx
+___
+}
+}}
+
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
+
diff --git a/crypto/chacha20poly1305/asm/chacha20_avx2.pl b/crypto/chacha20poly1305/asm/chacha20_avx2.pl
new file mode 100644
index 0000000..9f31f86
--- /dev/null
+++ b/crypto/chacha20poly1305/asm/chacha20_avx2.pl
@@ -0,0 +1,443 @@
+#!/usr/bin/env perl
+
+##############################################################################
+# #
+# Copyright 2014 Intel Corporation #
+# #
+# Licensed under the Apache License, Version 2.0 (the "License"); #
+# you may not use this file except in compliance with the License. #
+# You may obtain a copy of the License at #
+# #
+# http://www.apache.org/licenses/LICENSE-2.0 #
+# #
+# Unless required by applicable law or agreed to in writing, software #
+# distributed under the License is distributed on an "AS IS" BASIS, #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
+# See the License for the specific language governing permissions and #
+# limitations under the License. #
+# #
+##############################################################################
+# #
+# Developers and authors: #
+# Shay Gueron (1, 2), and Vlad Krasnov (1) #
+# (1) Intel Corporation, Israel Development Center #
+# (2) University of Haifa #
+# #
+# Related work: #
+# M. Goll, S. Gueron, "Vectorization on ChaCha Stream Cipher", IEEE #
+# Proceedings of 11th International Conference on Information #
+# Technology: New Generations (ITNG 2014), 612-615 (2014). #
+# M. Goll, S. Gueron, "Vectorization on Poly1305 Message Authentication Code"#
+# to be published. #
+# A. Langley, chacha20poly1305 for the AEAD head #
+# https://git.openssl.org/gitweb/?p=openssl.git;a=commit;h=9a8646510b3d0a48e950748f7a2aaa12ed40d5e0 #
+##############################################################################
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $avx = ($ver>=3.0) + ($ver>=3.01);
+}
+
+if ($avx>=2) {{
+
+my ($state_4567, $state_89ab, $state_cdef, $tmp,
+ $v0, $v1, $v2, $v3, $v4, $v5, $v6, $v7,
+ $v8, $v9, $v10, $v11)=map("%ymm$_",(0..15));
+
+sub chacha_qr {
+
+my ($a,$b,$c,$d)=@_;
+
+$code.=<<___;
+
+ vpaddd $b, $a, $a # a += b
+ vpxor $a, $d, $d # d ^= a
+ vpshufb .rol16(%rip), $d, $d # d <<<= 16
+
+ vpaddd $d, $c, $c # c += d
+ vpxor $c, $b, $b # b ^= c
+ vpslld \$12, $b, $tmp
+ vpsrld \$20, $b, $b
+ vpxor $tmp, $b, $b # b <<<= 12
+
+ vpaddd $b, $a, $a # a += b
+ vpxor $a, $d, $d # d ^= a
+ vpshufb .rol8(%rip), $d, $d # d <<<= 8
+
+ vpaddd $d, $c, $c # c += d
+ vpxor $c, $b, $b # b ^= c
+
+ vpslld \$7, $b, $tmp
+ vpsrld \$25, $b, $b
+ vpxor $tmp, $b, $b # b <<<= 7
+___
+}
+
+
+$code.=<<___;
+.text
+.align 32
+chacha20_consts:
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.rol8:
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.rol16:
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.avx2Init:
+.quad 0,0,1,0
+.avx2Inc:
+.quad 2,0,2,0
+___
+
+{
+
+my $state_cdef_xmm=$state_cdef;
+
+substr($state_cdef_xmm, 1, 1, "x");
+
+my ($out, $in, $in_len, $key_ptr, $nr)
+ =("%rdi", "%rsi", "%rdx", "%rcx", "%r8");
+
+$code.=<<___;
+.globl chacha_20_core_avx2
+.type chacha_20_core_avx2 ,\@function,2
+.align 64
+chacha_20_core_avx2:
+
+ vzeroupper
+
+ # Init state
+ vbroadcasti128 16*0($key_ptr), $state_4567
+ vbroadcasti128 16*1($key_ptr), $state_89ab
+ vbroadcasti128 16*2($key_ptr), $state_cdef
+ vpaddq .avx2Init(%rip), $state_cdef, $state_cdef
+
+2:
+ cmp \$6*64, $in_len
+ jb 2f
+
+ vmovdqa chacha20_consts(%rip), $v0
+ vmovdqa chacha20_consts(%rip), $v4
+ vmovdqa chacha20_consts(%rip), $v8
+
+ vmovdqa $state_4567, $v1
+ vmovdqa $state_4567, $v5
+ vmovdqa $state_4567, $v9
+
+ vmovdqa $state_89ab, $v2
+ vmovdqa $state_89ab, $v6
+ vmovdqa $state_89ab, $v10
+
+ vmovdqa $state_cdef, $v3
+ vpaddq .avx2Inc(%rip), $v3, $v7
+ vpaddq .avx2Inc(%rip), $v7, $v11
+
+ mov \$10, $nr
+
+ 1:
+___
+
+ &chacha_qr( $v0, $v1, $v2, $v3);
+ &chacha_qr( $v4, $v5, $v6, $v7);
+ &chacha_qr( $v8, $v9,$v10,$v11);
+
+$code.=<<___;
+ vpalignr \$4, $v1, $v1, $v1
+ vpalignr \$8, $v2, $v2, $v2
+ vpalignr \$12, $v3, $v3, $v3
+ vpalignr \$4, $v5, $v5, $v5
+ vpalignr \$8, $v6, $v6, $v6
+ vpalignr \$12, $v7, $v7, $v7
+ vpalignr \$4, $v9, $v9, $v9
+ vpalignr \$8, $v10, $v10, $v10
+ vpalignr \$12, $v11, $v11, $v11
+___
+
+ &chacha_qr( $v0, $v1, $v2, $v3);
+ &chacha_qr( $v4, $v5, $v6, $v7);
+ &chacha_qr( $v8, $v9,$v10,$v11);
+
+$code.=<<___;
+ vpalignr \$12, $v1, $v1, $v1
+ vpalignr \$8, $v2, $v2, $v2
+ vpalignr \$4, $v3, $v3, $v3
+ vpalignr \$12, $v5, $v5, $v5
+ vpalignr \$8, $v6, $v6, $v6
+ vpalignr \$4, $v7, $v7, $v7
+ vpalignr \$12, $v9, $v9, $v9
+ vpalignr \$8, $v10, $v10, $v10
+ vpalignr \$4, $v11, $v11, $v11
+
+ dec $nr
+
+ jnz 1b
+
+ vpaddd chacha20_consts(%rip), $v0, $v0
+ vpaddd chacha20_consts(%rip), $v4, $v4
+ vpaddd chacha20_consts(%rip), $v8, $v8
+
+ vpaddd $state_4567, $v1, $v1
+ vpaddd $state_4567, $v5, $v5
+ vpaddd $state_4567, $v9, $v9
+
+ vpaddd $state_89ab, $v2, $v2
+ vpaddd $state_89ab, $v6, $v6
+ vpaddd $state_89ab, $v10, $v10
+
+ vpaddd $state_cdef, $v3, $v3
+ vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef
+ vpaddd $state_cdef, $v7, $v7
+ vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef
+ vpaddd $state_cdef, $v11, $v11
+ vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef
+
+ vperm2i128 \$0x02, $v0, $v1, $tmp
+ vpxor 32*0($in), $tmp, $tmp
+ vmovdqu $tmp, 32*0($out)
+ vperm2i128 \$0x02, $v2, $v3, $tmp
+ vpxor 32*1($in), $tmp, $tmp
+ vmovdqu $tmp, 32*1($out)
+ vperm2i128 \$0x13, $v0, $v1, $tmp
+ vpxor 32*2($in), $tmp, $tmp
+ vmovdqu $tmp, 32*2($out)
+ vperm2i128 \$0x13, $v2, $v3, $tmp
+ vpxor 32*3($in), $tmp, $tmp
+ vmovdqu $tmp, 32*3($out)
+
+ vperm2i128 \$0x02, $v4, $v5, $v0
+ vperm2i128 \$0x02, $v6, $v7, $v1
+ vperm2i128 \$0x13, $v4, $v5, $v2
+ vperm2i128 \$0x13, $v6, $v7, $v3
+
+ vpxor 32*4($in), $v0, $v0
+ vpxor 32*5($in), $v1, $v1
+ vpxor 32*6($in), $v2, $v2
+ vpxor 32*7($in), $v3, $v3
+
+ vmovdqu $v0, 32*4($out)
+ vmovdqu $v1, 32*5($out)
+ vmovdqu $v2, 32*6($out)
+ vmovdqu $v3, 32*7($out)
+
+ vperm2i128 \$0x02, $v8, $v9, $v0
+ vperm2i128 \$0x02, $v10, $v11, $v1
+ vperm2i128 \$0x13, $v8, $v9, $v2
+ vperm2i128 \$0x13, $v10, $v11, $v3
+
+ vpxor 32*8($in), $v0, $v0
+ vpxor 32*9($in), $v1, $v1
+ vpxor 32*10($in), $v2, $v2
+ vpxor 32*11($in), $v3, $v3
+
+ vmovdqu $v0, 32*8($out)
+ vmovdqu $v1, 32*9($out)
+ vmovdqu $v2, 32*10($out)
+ vmovdqu $v3, 32*11($out)
+
+ lea 64*6($in), $in
+ lea 64*6($out), $out
+ sub \$64*6, $in_len
+
+ jmp 2b
+
+2:
+ cmp \$4*64, $in_len
+ jb 2f
+
+ vmovdqa chacha20_consts(%rip), $v0
+ vmovdqa chacha20_consts(%rip), $v4
+ vmovdqa $state_4567, $v1
+ vmovdqa $state_4567, $v5
+ vmovdqa $state_89ab, $v2
+ vmovdqa $state_89ab, $v6
+ vmovdqa $state_89ab, $v10
+ vmovdqa $state_cdef, $v3
+ vpaddq .avx2Inc(%rip), $v3, $v7
+
+ mov \$10, $nr
+
+ 1:
+___
+
+ &chacha_qr($v0,$v1,$v2,$v3);
+ &chacha_qr($v4,$v5,$v6,$v7);
+
+$code.=<<___;
+ vpalignr \$4, $v1, $v1, $v1
+ vpalignr \$8, $v2, $v2, $v2
+ vpalignr \$12, $v3, $v3, $v3
+ vpalignr \$4, $v5, $v5, $v5
+ vpalignr \$8, $v6, $v6, $v6
+ vpalignr \$12, $v7, $v7, $v7
+___
+
+ &chacha_qr($v0,$v1,$v2,$v3);
+ &chacha_qr($v4,$v5,$v6,$v7);
+
+$code.=<<___;
+ vpalignr \$12, $v1, $v1, $v1
+ vpalignr \$8, $v2, $v2, $v2
+ vpalignr \$4, $v3, $v3, $v3
+ vpalignr \$12, $v5, $v5, $v5
+ vpalignr \$8, $v6, $v6, $v6
+ vpalignr \$4, $v7, $v7, $v7
+
+ dec $nr
+
+ jnz 1b
+
+ vpaddd chacha20_consts(%rip), $v0, $v0
+ vpaddd chacha20_consts(%rip), $v4, $v4
+
+ vpaddd $state_4567, $v1, $v1
+ vpaddd $state_4567, $v5, $v5
+
+ vpaddd $state_89ab, $v2, $v2
+ vpaddd $state_89ab, $v6, $v6
+
+ vpaddd $state_cdef, $v3, $v3
+ vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef
+ vpaddd $state_cdef, $v7, $v7
+ vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef
+
+ vperm2i128 \$0x02, $v0, $v1, $v8
+ vperm2i128 \$0x02, $v2, $v3, $v9
+ vperm2i128 \$0x13, $v0, $v1, $v10
+ vperm2i128 \$0x13, $v2, $v3, $v11
+
+ vpxor 32*0($in), $v8, $v8
+ vpxor 32*1($in), $v9, $v9
+ vpxor 32*2($in), $v10, $v10
+ vpxor 32*3($in), $v11, $v11
+
+ vmovdqu $v8, 32*0($out)
+ vmovdqu $v9, 32*1($out)
+ vmovdqu $v10, 32*2($out)
+ vmovdqu $v11, 32*3($out)
+
+ vperm2i128 \$0x02, $v4, $v5, $v0
+ vperm2i128 \$0x02, $v6, $v7, $v1
+ vperm2i128 \$0x13, $v4, $v5, $v2
+ vperm2i128 \$0x13, $v6, $v7, $v3
+
+ vpxor 32*4($in), $v0, $v0
+ vpxor 32*5($in), $v1, $v1
+ vpxor 32*6($in), $v2, $v2
+ vpxor 32*7($in), $v3, $v3
+
+ vmovdqu $v0, 32*4($out)
+ vmovdqu $v1, 32*5($out)
+ vmovdqu $v2, 32*6($out)
+ vmovdqu $v3, 32*7($out)
+
+ lea 64*4($in), $in
+ lea 64*4($out), $out
+ sub \$64*4, $in_len
+
+ jmp 2b
+2:
+ cmp \$128, $in_len
+ jb 2f
+
+ vmovdqa chacha20_consts(%rip), $v0
+ vmovdqa $state_4567, $v1
+ vmovdqa $state_89ab, $v2
+ vmovdqa $state_cdef, $v3
+
+ mov \$10, $nr
+
+ 1:
+___
+
+ &chacha_qr($v0,$v1,$v2,$v3);
+
+$code.=<<___;
+ vpalignr \$4, $v1, $v1, $v1
+ vpalignr \$8, $v2, $v2, $v2
+ vpalignr \$12, $v3, $v3, $v3
+___
+
+ &chacha_qr($v0,$v1,$v2,$v3);
+$code.=<<___;
+ vpalignr \$12, $v1, $v1, $v1
+ vpalignr \$8, $v2, $v2, $v2
+ vpalignr \$4, $v3, $v3, $v3
+
+ dec $nr
+ jnz 1b
+
+ vpaddd chacha20_consts(%rip), $v0, $v0
+ vpaddd $state_4567, $v1, $v1
+ vpaddd $state_89ab, $v2, $v2
+ vpaddd $state_cdef, $v3, $v3
+ vpaddq .avx2Inc(%rip), $state_cdef, $state_cdef
+
+ vperm2i128 \$0x02, $v0, $v1, $v8
+ vperm2i128 \$0x02, $v2, $v3, $v9
+ vperm2i128 \$0x13, $v0, $v1, $v10
+ vperm2i128 \$0x13, $v2, $v3, $v11
+
+ vpxor 32*0($in), $v8, $v8
+ vpxor 32*1($in), $v9, $v9
+ vpxor 32*2($in), $v10, $v10
+ vpxor 32*3($in), $v11, $v11
+
+ vmovdqu $v8, 32*0($out)
+ vmovdqu $v9, 32*1($out)
+ vmovdqu $v10, 32*2($out)
+ vmovdqu $v11, 32*3($out)
+
+ lea 64*2($in), $in
+ lea 64*2($out), $out
+ sub \$64*2, $in_len
+ jmp 2b
+
+2:
+ vmovdqu $state_cdef_xmm, 16*2($key_ptr)
+
+ vzeroupper
+ ret
+.size chacha_20_core_avx2,.-chacha_20_core_avx2
+___
+}
+}}
+
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
+
diff --git a/crypto/chacha20poly1305/asm/poly1305_avx.pl b/crypto/chacha20poly1305/asm/poly1305_avx.pl
new file mode 100644
index 0000000..cedeca1
--- /dev/null
+++ b/crypto/chacha20poly1305/asm/poly1305_avx.pl
@@ -0,0 +1,732 @@
+##############################################################################
+# #
+# Copyright 2014 Intel Corporation #
+# #
+# Licensed under the Apache License, Version 2.0 (the "License"); #
+# you may not use this file except in compliance with the License. #
+# You may obtain a copy of the License at #
+# #
+# http://www.apache.org/licenses/LICENSE-2.0 #
+# #
+# Unless required by applicable law or agreed to in writing, software #
+# distributed under the License is distributed on an "AS IS" BASIS, #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
+# See the License for the specific language governing permissions and #
+# limitations under the License. #
+# #
+##############################################################################
+# #
+# Developers and authors: #
+# Shay Gueron (1, 2), and Vlad Krasnov (1) #
+# (1) Intel Corporation, Israel Development Center #
+# (2) University of Haifa #
+# #
+##############################################################################
+# state:
+# 0: r[0] || r^2[0]
+# 16: r[1] || r^2[1]
+# 32: r[2] || r^2[2]
+# 48: r[3] || r^2[3]
+# 64: r[4] || r^2[4]
+# 80: r[1]*5 || r^2[1]*5
+# 96: r[2]*5 || r^2[2]*5
+#112: r[3]*5 || r^2[3]*5
+#128: r[4]*5 || r^2[4]*5
+#144: k
+#160: A0
+#164: A1
+#168: A2
+#172: A3
+#176: A4
+#180: END
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $avx = ($ver>=3.0) + ($ver>=3.01);
+}
+
+if ($avx>=1) {{
+
+my ($_r0_, $_r1_, $_r2_, $_r3_, $_r4_,
+ $_r1_x5, $_r2_x5, $_r3_x5, $_r4_x5,
+ $_k_,
+ $_A0_, $_A1_, $_A2_, $_A3_, $_A4_)
+ = (0, 16, 32, 48, 64,
+ 80, 96, 112, 128,
+ 144,
+ 160, 164, 168, 172, 176);
+
+$code.=<<___;
+.text
+.align 32
+.LandMask:
+.quad 0x3FFFFFF, 0x3FFFFFF
+.LsetBit:
+.quad 0x1000000, 0x1000000
+.LrSet:
+.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFF
+.quad 0x0FFFFFFC0FFFFFFC, 0x0FFFFFFC0FFFFFFC
+.Lone:
+.quad 1,0
+.Lpoly:
+.quad 0xfffffffffffffffb, 0xffffffffffffffff
+___
+
+
+{
+my ($A0, $A1, $A2, $A3, $A4,
+ $r0, $r1, $r2, $r3, $r4,
+ $T0, $T1, $A5, $A6, $A7, $A8)=map("%xmm$_",(0..15));
+my ($state, $key)
+ =("%rdi", "%rsi");
+
+$code.=<<___;
+################################################################################
+# void poly1305_init_avx(void *state, uint8_t key[32])
+
+.globl poly1305_init_avx
+.type poly1305_init_avx, \@function, 2
+.align 64
+poly1305_init_avx:
+ vzeroupper
+ # load and convert r
+ vmovq 8*0($key), $r0
+ vmovq 8*1($key), $T0
+ vpand .LrSet(%rip), $r0, $r0
+ vpand .LrSet+16(%rip), $T0, $T0
+
+ vpsrlq \$26, $r0, $r1
+ vpand .LandMask(%rip), $r0, $r0
+ vpsrlq \$26, $r1, $r2
+ vpand .LandMask(%rip), $r1, $r1
+ vpsllq \$12, $T0, $T1
+ vpxor $T1, $r2, $r2
+ vpsrlq \$26, $r2, $r3
+ vpsrlq \$40, $T0, $r4
+ vpand .LandMask(%rip), $r2, $r2
+ vpand .LandMask(%rip), $r3, $r3
+
+ # SQR R
+ vpmuludq $r0, $r0, $A0
+ vpmuludq $r1, $r0, $A1
+ vpmuludq $r2, $r0, $A2
+ vpmuludq $r3, $r0, $A3
+ vpmuludq $r4, $r0, $A4
+
+ vpsllq \$1, $A1, $A1
+ vpsllq \$1, $A2, $A2
+ vpmuludq $r1, $r1, $T0
+ vpaddq $T0, $A2, $A2
+ vpmuludq $r2, $r1, $T0
+ vpaddq $T0, $A3, $A3
+ vpmuludq $r3, $r1, $T0
+ vpaddq $T0, $A4, $A4
+ vpmuludq $r4, $r1, $A5
+
+ vpsllq \$1, $A3, $A3
+ vpsllq \$1, $A4, $A4
+ vpmuludq $r2, $r2, $T0
+ vpaddq $T0, $A4, $A4
+ vpmuludq $r3, $r2, $T0
+ vpaddq $T0, $A5, $A5
+ vpmuludq $r4, $r2, $A6
+
+ vpsllq \$1, $A5, $A5
+ vpsllq \$1, $A6, $A6
+ vpmuludq $r3, $r3, $T0
+ vpaddq $T0, $A6, $A6
+ vpmuludq $r4, $r3, $A7
+
+ vpsllq \$1, $A7, $A7
+ vpmuludq $r4, $r4, $A8
+
+ # Reduce
+ vpsrlq \$26, $A4, $T0
+ vpand .LandMask(%rip), $A4, $A4
+ vpaddq $T0, $A5, $A5
+
+ vpsllq \$2, $A5, $T0
+ vpaddq $T0, $A5, $A5
+ vpsllq \$2, $A6, $T0
+ vpaddq $T0, $A6, $A6
+ vpsllq \$2, $A7, $T0
+ vpaddq $T0, $A7, $A7
+ vpsllq \$2, $A8, $T0
+ vpaddq $T0, $A8, $A8
+
+ vpaddq $A5, $A0, $A0
+ vpaddq $A6, $A1, $A1
+ vpaddq $A7, $A2, $A2
+ vpaddq $A8, $A3, $A3
+
+ vpsrlq \$26, $A0, $T0
+ vpand .LandMask(%rip), $A0, $A0
+ vpaddq $T0, $A1, $A1
+ vpsrlq \$26, $A1, $T0
+ vpand .LandMask(%rip), $A1, $A1
+ vpaddq $T0, $A2, $A2
+ vpsrlq \$26, $A2, $T0
+ vpand .LandMask(%rip), $A2, $A2
+ vpaddq $T0, $A3, $A3
+ vpsrlq \$26, $A3, $T0
+ vpand .LandMask(%rip), $A3, $A3
+ vpaddq $T0, $A4, $A4
+
+ vpunpcklqdq $r0, $A0, $r0
+ vpunpcklqdq $r1, $A1, $r1
+ vpunpcklqdq $r2, $A2, $r2
+ vpunpcklqdq $r3, $A3, $r3
+ vpunpcklqdq $r4, $A4, $r4
+
+ vmovdqu $r0, $_r0_($state)
+ vmovdqu $r1, $_r1_($state)
+ vmovdqu $r2, $_r2_($state)
+ vmovdqu $r3, $_r3_($state)
+ vmovdqu $r4, $_r4_($state)
+
+ vpsllq \$2, $r1, $A1
+ vpsllq \$2, $r2, $A2
+ vpsllq \$2, $r3, $A3
+ vpsllq \$2, $r4, $A4
+
+ vpaddq $A1, $r1, $A1
+ vpaddq $A2, $r2, $A2
+ vpaddq $A3, $r3, $A3
+ vpaddq $A4, $r4, $A4
+
+ vmovdqu $A1, $_r1_x5($state)
+ vmovdqu $A2, $_r2_x5($state)
+ vmovdqu $A3, $_r3_x5($state)
+ vmovdqu $A4, $_r4_x5($state)
+ # Store k
+ vmovdqu 16*1($key), $T0
+ vmovdqu $T0, $_k_($state)
+ # Init the MAC value
+ vpxor $T0, $T0, $T0
+ vmovdqu $T0, $_A0_($state)
+ vmovd $T0, $_A4_($state)
+ vzeroupper
+ ret
+.size poly1305_init_avx,.-poly1305_init_avx
+___
+}
+
+{
+
+my ($A0, $A1, $A2, $A3, $A4,
+ $T0, $T1, $R0, $R1, $R2,
+ $R3, $R4, $AND_MASK)
+ = map("%xmm$_",(0..12));
+
+my ($state, $in, $in_len)
+ =("%rdi", "%rsi", "%rdx");
+
+$code.=<<___;
+
+###############################################################################
+# void* poly1305_update_avx(void* state, void* in, uint64_t in_len)
+.globl poly1305_update_avx
+.type poly1305_update_avx, \@function, 2
+.align 64
+poly1305_update_avx:
+
+ vzeroupper
+ vmovd $_A0_($state), $A0
+ vmovd $_A1_($state), $A1
+ vmovd $_A2_($state), $A2
+ vmovd $_A3_($state), $A3
+ vmovd $_A4_($state), $A4
+ vmovdqa .LandMask(%rip), $AND_MASK
+ # Skip to single block case
+ cmp \$32, $in_len
+ jb 3f
+1:
+ cmp \$16*4, $in_len
+ jb 1f
+ sub \$16*2, $in_len
+ # load the next two blocks
+ vmovdqu 16*0($in), $R2
+ vmovdqu 16*1($in), $R3
+ add \$16*2, $in
+
+ vpunpcklqdq $R3, $R2, $R0
+ vpunpckhqdq $R3, $R2, $R1
+
+ vpsrlq \$26, $R0, $R2
+ vpand $AND_MASK, $R0, $R0
+ vpaddq $R0, $A0, $A0
+
+ vpsrlq \$26, $R2, $R0
+ vpand $AND_MASK, $R2, $R2
+ vpaddq $R2, $A1, $A1
+
+ vpsllq \$12, $R1, $R2
+ vpxor $R2, $R0, $R0
+ vpand $AND_MASK, $R0, $R0
+ vpaddq $R0, $A2, $A2
+
+ vpsrlq \$26, $R2, $R0
+ vpsrlq \$40, $R1, $R2
+ vpand $AND_MASK, $R0, $R0
+ vpxor .LsetBit(%rip), $R2, $R2
+ vpaddq $R0, $A3, $A3
+ vpaddq $R2, $A4, $A4
+
+ # Multiply input by R[0]
+ vbroadcastss $_r0_($state), $T0
+ vpmuludq $T0, $A0, $R0
+ vpmuludq $T0, $A1, $R1
+ vpmuludq $T0, $A2, $R2
+ vpmuludq $T0, $A3, $R3
+ vpmuludq $T0, $A4, $R4
+ # Multiply input by R[1] (and R[1]*5)
+ vbroadcastss $_r1_x5($state), $T0
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R0, $R0
+ vbroadcastss $_r1_($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R4, $R4
+ # Etc
+ vbroadcastss $_r2_x5($state), $T0
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R1, $R1
+ vbroadcastss $_r2_($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R4, $R4
+
+ vbroadcastss $_r3_x5($state), $T0
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R2, $R2
+ vbroadcastss $_r3_($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R4, $R4
+
+ vbroadcastss $_r4_x5($state), $T0
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R3, $R3
+ vbroadcastss $_r4_($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R4, $R4
+ # Reduce
+ vpsrlq \$26, $R3, $T0
+ vpaddq $T0, $R4, $R4
+ vpand $AND_MASK, $R3, $R3
+
+ vpsrlq \$26, $R4, $T0
+ vpsllq \$2, $T0, $T1
+ vpaddq $T1, $T0, $T0
+ vpaddq $T0, $R0, $R0
+ vpand $AND_MASK, $R4, $R4
+
+ vpsrlq \$26, $R0, $T0
+ vpand $AND_MASK, $R0, $A0
+ vpaddq $T0, $R1, $R1
+ vpsrlq \$26, $R1, $T0
+ vpand $AND_MASK, $R1, $A1
+ vpaddq $T0, $R2, $R2
+ vpsrlq \$26, $R2, $T0
+ vpand $AND_MASK, $R2, $A2
+ vpaddq $T0, $R3, $R3
+ vpsrlq \$26, $R3, $T0
+ vpand $AND_MASK, $R3, $A3
+ vpaddq $T0, $R4, $A4
+ jmp 1b
+1:
+ cmp \$16*2, $in_len
+ jb 1f
+ sub \$16*2, $in_len
+ # load the next two blocks
+ vmovdqu 16*0($in), $R2
+ vmovdqu 16*1($in), $R3
+ add \$16*2, $in
+
+ vpunpcklqdq $R3, $R2, $R0
+ vpunpckhqdq $R3, $R2, $R1
+
+ vpsrlq \$26, $R0, $R2
+ vpand $AND_MASK, $R0, $R0
+ vpaddq $R0, $A0, $A0
+
+ vpsrlq \$26, $R2, $R0
+ vpand $AND_MASK, $R2, $R2
+ vpaddq $R2, $A1, $A1
+
+ vpsllq \$12, $R1, $R2
+ vpxor $R2, $R0, $R0
+ vpand $AND_MASK, $R0, $R0
+ vpaddq $R0, $A2, $A2
+
+ vpsrlq \$26, $R2, $R0
+ vpsrlq \$40, $R1, $R2
+ vpand $AND_MASK, $R0, $R0
+ vpxor .LsetBit(%rip), $R2, $R2
+ vpaddq $R0, $A3, $A3
+ vpaddq $R2, $A4, $A4
+
+ # Multiply input by R[0]
+ vmovdqu $_r0_($state), $T0
+ vpmuludq $T0, $A0, $R0
+ vpmuludq $T0, $A1, $R1
+ vpmuludq $T0, $A2, $R2
+ vpmuludq $T0, $A3, $R3
+ vpmuludq $T0, $A4, $R4
+ # Multiply input by R[1] (and R[1]*5)
+ vmovdqu $_r1_x5($state), $T0
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R0, $R0
+ vmovdqu $_r1_($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R4, $R4
+ # Etc
+ vmovdqu $_r2_x5($state), $T0
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R1, $R1
+ vmovdqu $_r2_($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R4, $R4
+
+ vmovdqu $_r3_x5($state), $T0
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R2, $R2
+ vmovdqu $_r3_($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R4, $R4
+
+ vmovdqu $_r4_x5($state), $T0
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R3, $R3
+ vmovdqu $_r4_($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R4, $R4
+1:
+ vpsrldq \$8, $R0, $A0
+ vpsrldq \$8, $R1, $A1
+ vpsrldq \$8, $R2, $A2
+ vpsrldq \$8, $R3, $A3
+ vpsrldq \$8, $R4, $A4
+
+ vpaddq $R0, $A0, $A0
+ vpaddq $R1, $A1, $A1
+ vpaddq $R2, $A2, $A2
+ vpaddq $R3, $A3, $A3
+ vpaddq $R4, $A4, $A4
+ # Reduce
+ vpsrlq \$26, $A3, $T0
+ vpaddq $T0, $A4, $A4
+ vpand $AND_MASK, $A3, $A3
+ vpsrlq \$26, $A4, $T0
+ vpsllq \$2, $T0, $T1
+ vpaddq $T1, $T0, $T0
+ vpaddq $T0, $A0, $A0
+ vpand $AND_MASK, $A4, $A4
+ vpsrlq \$26, $A0, $T0
+ vpand $AND_MASK, $A0, $A0
+ vpaddq $T0, $A1, $A1
+ vpsrlq \$26, $A1, $T0
+ vpand $AND_MASK, $A1, $A1
+ vpaddq $T0, $A2, $A2
+ vpsrlq \$26, $A2, $T0
+ vpand $AND_MASK, $A2, $A2
+ vpaddq $T0, $A3, $A3
+ vpsrlq \$26, $A3, $T0
+ vpand $AND_MASK, $A3, $A3
+ vpaddq $T0, $A4, $A4
+3:
+ cmp \$16, $in_len
+ jb 1f
+
+ # load the next block
+ vmovq 8*0($in), $R0
+ vmovq 8*1($in), $R1
+ add \$16, $in
+ sub \$16, $in_len
+
+ vpsrlq \$26, $R0, $R2
+ vpand $AND_MASK, $R0, $R0
+ vpaddq $R0, $A0, $A0
+
+ vpsrlq \$26, $R2, $R0
+ vpand $AND_MASK, $R2, $R2
+ vpaddq $R2, $A1, $A1
+
+ vpsllq \$12, $R1, $R2
+ vpxor $R2, $R0, $R0
+ vpand $AND_MASK, $R0, $R0
+ vpaddq $R0, $A2, $A2
+
+ vpsrlq \$26, $R2, $R0
+ vpsrlq \$40, $R1, $R2
+ vpand $AND_MASK, $R0, $R0
+ vpxor .LsetBit(%rip), $R2, $R2
+ vpaddq $R0, $A3, $A3
+ vpaddq $R2, $A4, $A4
+2:
+ # Multiply input by R[0]
+ vmovq $_r0_+8($state), $T0
+ vpmuludq $T0, $A0, $R0
+ vpmuludq $T0, $A1, $R1
+ vpmuludq $T0, $A2, $R2
+ vpmuludq $T0, $A3, $R3
+ vpmuludq $T0, $A4, $R4
+ # Multiply input by R[1] (and R[1]*5)
+ vmovq $_r1_x5+8($state), $T0
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R0, $R0
+ vmovq $_r1_+8($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R4, $R4
+ # Etc
+ vmovq $_r2_x5+8($state), $T0
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R1, $R1
+ vmovq $_r2_+8($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R4, $R4
+
+ vmovq $_r3_x5+8($state), $T0
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R2, $R2
+ vmovq $_r3_+8($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R4, $R4
+
+ vmovq $_r4_x5+8($state), $T0
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R3, $R3
+ vmovq $_r4_+8($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R4, $R4
+ # Reduce
+ vpsrlq \$26, $R3, $T0
+ vpaddq $T0, $R4, $R4
+ vpand $AND_MASK, $R3, $R3
+ vpsrlq \$26, $R4, $T0
+ vpsllq \$2, $T0, $T1
+ vpaddq $T1, $T0, $T0
+ vpaddq $T0, $R0, $R0
+ vpand $AND_MASK, $R4, $R4
+ vpsrlq \$26, $R0, $T0
+ vpand $AND_MASK, $R0, $A0
+ vpaddq $T0, $R1, $R1
+ vpsrlq \$26, $R1, $T0
+ vpand $AND_MASK, $R1, $A1
+ vpaddq $T0, $R2, $R2
+ vpsrlq \$26, $R2, $T0
+ vpand $AND_MASK, $R2, $A2
+ vpaddq $T0, $R3, $R3
+ vpsrlq \$26, $R3, $T0
+ vpand $AND_MASK, $R3, $A3
+ vpaddq $T0, $R4, $A4
+
+1:
+ test $in_len, $in_len
+ jz 1f
+
+ vmovdqa .Lone(%rip), $R0
+3:
+ dec $in_len
+ vpslldq \$1, $R0, $R0
+ vpinsrb \$0, ($in, $in_len), $R0, $R0
+ test $in_len, $in_len
+ jnz 3b
+
+ vpsrldq \$8, $R0, $R1
+ vpsrlq \$26, $R0, $R2
+ vpand $AND_MASK, $R0, $R0
+ vpaddq $R0, $A0, $A0
+
+ vpsrlq \$26, $R2, $R0
+ vpand $AND_MASK, $R2, $R2
+ vpaddq $R2, $A1, $A1
+
+ vpsllq \$12, $R1, $R2
+ vpxor $R2, $R0, $R0
+ vpand $AND_MASK, $R0, $R0
+ vpaddq $R0, $A2, $A2
+
+ vpsrlq \$26, $R2, $R0
+ vpsrlq \$40, $R1, $R2
+ vpand $AND_MASK, $R0, $R0
+ vpaddq $R0, $A3, $A3
+ vpaddq $R2, $A4, $A4
+ xor $in_len, $in_len
+ jmp 2b
+1:
+
+ vmovd $A0, $_A0_($state)
+ vmovd $A1, $_A1_($state)
+ vmovd $A2, $_A2_($state)
+ vmovd $A3, $_A3_($state)
+ vmovd $A4, $_A4_($state)
+
+ mov $in, %rax
+ vzeroupper
+ ret
+.size poly1305_update_avx,.-poly1305_update_avx
+###############################################################################
+# void poly1305_finish_avx(void* $state, uint64_t mac[2]);
+.type poly1305_finish_avx,\@function, 2
+.globl poly1305_finish_avx
+poly1305_finish_avx:
+___
+my $mac="%rsi";
+$code.=<<___;
+ mov $_A0_($state), %r8d
+ mov $_A1_($state), %eax
+ mov $_A2_($state), %r9d
+ mov $_A3_($state), %ecx
+ mov $_A4_($state), %r10d
+
+ shl \$26, %rax
+ add %rax, %r8
+
+ mov %r9, %rax
+ shl \$52, %rax
+ shr \$12, %r9
+ add %rax, %r8
+ adc \$0, %r9
+
+ mov %r10, %rax
+ shl \$14, %rcx
+ shl \$40, %rax
+ shr \$24, %r10
+
+ add %rcx, %r9
+ add %rax, %r9
+ adc \$0, %r10
+
+ mov %r10, %rax
+ shr \$2, %rax
+ and \$3, %r10
+
+ mov %rax, %rcx
+ shl \$2, %rax
+ add %rcx, %rax
+
+ add %rax, %r8
+ adc \$0, %r9
+ adc \$0, %r10
+
+ mov %r8, %rax
+ mov %r9, %rcx
+ sub \$-5, %rax
+ sbb \$-1, %rcx
+ sbb \$3, %r10
+
+ cmovc %r8, %rax
+ cmovc %r9, %rcx
+ add $_k_($state), %rax
+ adc $_k_+8($state), %rcx
+ mov %rax, ($mac)
+ mov %rcx, 8($mac)
+ ret
+.size poly1305_finish_avx,.-poly1305_finish_avx
+___
+}
+}}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
+
diff --git a/crypto/chacha20poly1305/asm/poly1305_avx2.pl b/crypto/chacha20poly1305/asm/poly1305_avx2.pl
new file mode 100644
index 0000000..d2dd51f
--- /dev/null
+++ b/crypto/chacha20poly1305/asm/poly1305_avx2.pl
@@ -0,0 +1,984 @@
+##############################################################################
+# #
+# Copyright 2014 Intel Corporation #
+# #
+# Licensed under the Apache License, Version 2.0 (the "License"); #
+# you may not use this file except in compliance with the License. #
+# You may obtain a copy of the License at #
+# #
+# http://www.apache.org/licenses/LICENSE-2.0 #
+# #
+# Unless required by applicable law or agreed to in writing, software #
+# distributed under the License is distributed on an "AS IS" BASIS, #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
+# See the License for the specific language governing permissions and #
+# limitations under the License. #
+# #
+##############################################################################
+# #
+# Developers and authors: #
+# Shay Gueron (1, 2), and Vlad Krasnov (1) #
+# (1) Intel Corporation, Israel Development Center #
+# (2) University of Haifa #
+# #
+##############################################################################
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $avx = ($ver>=3.0) + ($ver>=3.01);
+}
+
+if ($avx>=1) {{
+
+my ($_r0_, $_r1_, $_r2_, $_r3_, $_r4_,
+ $_r1_x5, $_r2_x5, $_r3_x5, $_r4_x5,
+ $_k_,
+ $_A0_, $_A1_, $_A2_, $_A3_, $_A4_)
+ = (64, 96, 128, 160, 192,
+ 224, 256, 288, 320,
+ 40,
+ 352, 356, 360, 364, 368);
+
+$code.=<<___;
+.text
+.align 32
+.LandMask:
+.quad 0x3FFFFFF, 0x3FFFFFF, 0x3FFFFFF, 0x3FFFFFF
+.LsetBit:
+.quad 0x1000000, 0x1000000, 0x1000000, 0x1000000
+.LrSet:
+.quad 0xFFFFFFC0FFFFFFF, 0xFFFFFFC0FFFFFFF, 0xFFFFFFC0FFFFFFF, 0xFFFFFFC0FFFFFFF
+.quad 0xFFFFFFC0FFFFFFC, 0xFFFFFFC0FFFFFFC, 0xFFFFFFC0FFFFFFC, 0xFFFFFFC0FFFFFFC
+
+.LpermFix:
+.long 6,7,6,7,6,7,6,7
+.long 4,5,6,7,6,7,6,7
+.long 2,3,6,7,4,5,6,7
+.long 0,1,4,5,2,3,6,7
+___
+
+
+{
+my ($A0, $A1, $A2, $A3, $A4,
+ $r0, $r1, $r2, $r3, $r4,
+ $T0, $T1, $A5, $A6, $A7, $A8)=map("%xmm$_",(0..15));
+
+my ($A0_y, $A1_y, $A2_y, $A3_y, $A4_y,
+ $r0_y, $r1_y, $r2_y, $r3_y, $r4_y)=map("%ymm$_",(0..9));
+
+my ($state, $key)
+ =("%rdi", "%rsi");
+
+$code.=<<___;
+################################################################################
+# void poly1305_init_avx2(void *state)
+
+.type poly1305_init_avx2, \@function, 2
+.align 64
+poly1305_init_avx2:
+ vzeroupper
+
+ # Init the MAC value
+ mov 8*0($state), %r8
+ mov 8*1($state), %r9
+ mov 8*2($state), %r10
+
+ mov %r8, %rax
+ mov %r9, %rcx
+
+ sub \$-5, %r8
+ sbb \$-1, %r9
+ sbb \$3, %r10
+
+ cmovc %rax, %r8
+ cmovc %rcx, %r9
+ cmovc 8*2($state), %r10
+
+ mov %r8, %rax
+ and \$0x3ffffff, %rax
+ mov %eax, $_A0_($state)
+
+ shrd \$26, %r9, %r8
+ shrd \$26, %r10, %r9
+
+ mov %r8, %rax
+ and \$0x3ffffff, %rax
+ mov %eax, $_A1_($state)
+
+ shrd \$26, %r9, %r8
+ shr \$26, %r9
+
+ mov %r8, %rax
+ and \$0x3ffffff, %rax
+ mov %eax, $_A2_($state)
+
+ shrd \$26, %r9, %r8
+
+ mov %r8, %rax
+ and \$0x3ffffff, %rax
+ mov %eax, $_A3_($state)
+
+ shr \$26, %r8
+
+ mov %r8, %rax
+ and \$0x3ffffff, %rax
+ mov %eax, $_A4_($state)
+
+ # load and convert r
+ vmovq 8*3($state), $r0
+ vmovq 8*4($state), $T0
+ vpand .LrSet(%rip), $r0, $r0
+ vpand .LrSet+32(%rip), $T0, $T0
+
+ vpsrlq \$26, $r0, $r1
+ vpand .LandMask(%rip), $r0, $r0
+ vpsrlq \$26, $r1, $r2
+ vpand .LandMask(%rip), $r1, $r1
+ vpsllq \$12, $T0, $T1
+ vpxor $T1, $r2, $r2
+ vpsrlq \$26, $r2, $r3
+ vpsrlq \$40, $T0, $r4
+ vpand .LandMask(%rip), $r2, $r2
+ vpand .LandMask(%rip), $r3, $r3
+ # SQR R
+ vpmuludq $r0, $r0, $A0
+ vpmuludq $r1, $r0, $A1
+ vpmuludq $r2, $r0, $A2
+ vpmuludq $r3, $r0, $A3
+ vpmuludq $r4, $r0, $A4
+
+ vpsllq \$1, $A1, $A1
+ vpsllq \$1, $A2, $A2
+ vpmuludq $r1, $r1, $T0
+ vpaddq $T0, $A2, $A2
+ vpmuludq $r2, $r1, $T0
+ vpaddq $T0, $A3, $A3
+ vpmuludq $r3, $r1, $T0
+ vpaddq $T0, $A4, $A4
+ vpmuludq $r4, $r1, $A5
+
+ vpsllq \$1, $A3, $A3
+ vpsllq \$1, $A4, $A4
+ vpmuludq $r2, $r2, $T0
+ vpaddq $T0, $A4, $A4
+ vpmuludq $r3, $r2, $T0
+ vpaddq $T0, $A5, $A5
+ vpmuludq $r4, $r2, $A6
+
+ vpsllq \$1, $A5, $A5
+ vpsllq \$1, $A6, $A6
+ vpmuludq $r3, $r3, $T0
+ vpaddq $T0, $A6, $A6
+ vpmuludq $r4, $r3, $A7
+
+ vpsllq \$1, $A7, $A7
+ vpmuludq $r4, $r4, $A8
+
+ # Reduce
+ vpsrlq \$26, $A4, $T0
+ vpand .LandMask(%rip), $A4, $A4
+ vpaddq $T0, $A5, $A5
+
+ vpsllq \$2, $A5, $T0
+ vpaddq $T0, $A5, $A5
+ vpsllq \$2, $A6, $T0
+ vpaddq $T0, $A6, $A6
+ vpsllq \$2, $A7, $T0
+ vpaddq $T0, $A7, $A7
+ vpsllq \$2, $A8, $T0
+ vpaddq $T0, $A8, $A8
+
+ vpaddq $A5, $A0, $A0
+ vpaddq $A6, $A1, $A1
+ vpaddq $A7, $A2, $A2
+ vpaddq $A8, $A3, $A3
+
+ vpsrlq \$26, $A0, $T0
+ vpand .LandMask(%rip), $A0, $A0
+ vpaddq $T0, $A1, $A1
+ vpsrlq \$26, $A1, $T0
+ vpand .LandMask(%rip), $A1, $A1
+ vpaddq $T0, $A2, $A2
+ vpsrlq \$26, $A2, $T0
+ vpand .LandMask(%rip), $A2, $A2
+ vpaddq $T0, $A3, $A3
+ vpsrlq \$26, $A3, $T0
+ vpand .LandMask(%rip), $A3, $A3
+ vpaddq $T0, $A4, $A4
+
+ vpunpcklqdq $r0, $A0, $r0
+ vpunpcklqdq $r1, $A1, $r1
+ vpunpcklqdq $r2, $A2, $r2
+ vpunpcklqdq $r3, $A3, $r3
+ vpunpcklqdq $r4, $A4, $r4
+
+ vmovdqu $r0, $_r0_+16($state)
+ vmovdqu $r1, $_r1_+16($state)
+ vmovdqu $r2, $_r2_+16($state)
+ vmovdqu $r3, $_r3_+16($state)
+ vmovdqu $r4, $_r4_+16($state)
+
+ vpsllq \$2, $r1, $A1
+ vpsllq \$2, $r2, $A2
+ vpsllq \$2, $r3, $A3
+ vpsllq \$2, $r4, $A4
+
+ vpaddq $A1, $r1, $A1
+ vpaddq $A2, $r2, $A2
+ vpaddq $A3, $r3, $A3
+ vpaddq $A4, $r4, $A4
+
+ vmovdqu $A1, $_r1_x5+16($state)
+ vmovdqu $A2, $_r2_x5+16($state)
+ vmovdqu $A3, $_r3_x5+16($state)
+ vmovdqu $A4, $_r4_x5+16($state)
+
+ # Compute r^3 and r^4
+ vpshufd \$0x44, $r0, $A0
+ vpshufd \$0x44, $r1, $A1
+ vpshufd \$0x44, $r2, $A2
+ vpshufd \$0x44, $r3, $A3
+ vpshufd \$0x44, $r4, $A4
+
+ # Multiply input by R[0]
+ vmovdqu $_r0_+16($state), $T0
+ vpmuludq $T0, $A0, $r0
+ vpmuludq $T0, $A1, $r1
+ vpmuludq $T0, $A2, $r2
+ vpmuludq $T0, $A3, $r3
+ vpmuludq $T0, $A4, $r4
+ # Multiply input by R[1] (and R[1]*5)
+ vmovdqu $_r1_x5+16($state), $T0
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $r0, $r0
+ vmovdqu $_r1_+16($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $r1, $r1
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $r2, $r2
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $r3, $r3
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $r4, $r4
+ # Etc
+ vmovdqu $_r2_x5+16($state), $T0
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $r0, $r0
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $r1, $r1
+ vmovdqu $_r2_+16($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $r2, $r2
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $r3, $r3
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $r4, $r4
+
+ vmovdqu $_r3_x5+16($state), $T0
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $r0, $r0
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $r1, $r1
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $r2, $r2
+ vmovdqu $_r3_+16($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $r3, $r3
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $r4, $r4
+
+ vmovdqu $_r4_x5+16($state), $T0
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $r0, $r0
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $r1, $r1
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $r2, $r2
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $r3, $r3
+ vmovdqu $_r4_+16($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $r4, $r4
+ # Reduce
+ vpsrlq \$26, $r3, $T0
+ vpaddq $T0, $r4, $r4
+ vpand .LandMask(%rip), $r3, $r3
+ vpsrlq \$26, $r4, $T0
+ vpsllq \$2, $T0, $T1
+ vpaddq $T1, $T0, $T0
+ vpaddq $T0, $r0, $r0
+ vpand .LandMask(%rip), $r4, $r4
+ vpsrlq \$26, $r0, $T0
+ vpand .LandMask(%rip), $r0, $r0
+ vpaddq $T0, $r1, $r1
+ vpsrlq \$26, $r1, $T0
+ vpand .LandMask(%rip), $r1, $r1
+ vpaddq $T0, $r2, $r2
+ vpsrlq \$26, $r2, $T0
+ vpand .LandMask(%rip), $r2, $r2
+ vpaddq $T0, $r3, $r3
+ vpsrlq \$26, $r3, $T0
+ vpand .LandMask(%rip), $r3, $r3
+ vpaddq $T0, $r4, $r4
+
+ vmovdqu $r0, $_r0_($state)
+ vmovdqu $r1, $_r1_($state)
+ vmovdqu $r2, $_r2_($state)
+ vmovdqu $r3, $_r3_($state)
+ vmovdqu $r4, $_r4_($state)
+
+ vpsllq \$2, $r1, $A1
+ vpsllq \$2, $r2, $A2
+ vpsllq \$2, $r3, $A3
+ vpsllq \$2, $r4, $A4
+
+ vpaddq $A1, $r1, $A1
+ vpaddq $A2, $r2, $A2
+ vpaddq $A3, $r3, $A3
+ vpaddq $A4, $r4, $A4
+
+ vmovdqu $A1, $_r1_x5($state)
+ vmovdqu $A2, $_r2_x5($state)
+ vmovdqu $A3, $_r3_x5($state)
+ vmovdqu $A4, $_r4_x5($state)
+
+ movq \$1, 8*7($state)
+
+ ret
+.size poly1305_init_avx2,.-poly1305_init_avx2
+___
+}
+
+{
+
+my ($A0, $A1, $A2, $A3, $A4,
+ $T0, $T1, $R0, $R1, $R2,
+ $R3, $R4, $AND_MASK, $PERM_MASK, $SET_MASK)
+ =map("%ymm$_",(0..14));
+
+my ($A0_x, $A1_x, $A2_x, $A3_x, $A4_x,
+ $T0_x, $T1_x, $R0_x, $R1_x, $R2_x,
+ $R3_x, $R4_x, $AND_MASK_x, $PERM_MASK_x, $SET_MASK_x)
+ =map("%xmm$_",(0..14));
+
+my ($state, $in, $in_len, $hlp, $rsp_save)
+ =("%rdi", "%rsi", "%rdx", "%rcx", "%rax");
+
+$code.=<<___;
+
+###############################################################################
+# void poly1305_update_avx2(void* $state, void* in, uint64_t in_len)
+.globl poly1305_update_avx2
+.type poly1305_update_avx2, \@function, 2
+.align 64
+poly1305_update_avx2:
+
+
+ test $in_len, $in_len
+ jz 6f
+
+ cmpq \$0, 8*7($state)
+ jne 1f # This means we already started avx2, and must finish
+
+ # If we are here we need to check in_len is longer than the min for avx2
+
+ cmp \$512, $in_len
+ jge 2f
+
+ jmp poly1305_update_x64
+
+2:
+ call poly1305_init_avx2
+
+1:
+ vmovd $_A0_($state), $A0_x
+ vmovd $_A1_($state), $A1_x
+ vmovd $_A2_($state), $A2_x
+ vmovd $_A3_($state), $A3_x
+ vmovd $_A4_($state), $A4_x
+
+ vmovdqa .LandMask(%rip), $AND_MASK
+1:
+ cmp \$32*4, $in_len
+ jb 1f
+ sub \$32*2, $in_len
+
+ # load the next four blocks
+ vmovdqu 32*0($in), $R2
+ vmovdqu 32*1($in), $R3
+ add \$32*2, $in
+
+ vpunpcklqdq $R3, $R2, $R0
+ vpunpckhqdq $R3, $R2, $R1
+
+ vpermq \$0xD8, $R0, $R0 # it is possible to rearrange the precomputations, and save this shuffle
+ vpermq \$0xD8, $R1, $R1
+
+ vpsrlq \$26, $R0, $R2
+ vpand $AND_MASK, $R0, $R0
+ vpaddq $R0, $A0, $A0
+
+ vpsrlq \$26, $R2, $R0
+ vpand $AND_MASK, $R2, $R2
+ vpaddq $R2, $A1, $A1
+
+ vpsllq \$12, $R1, $R2
+ vpxor $R2, $R0, $R0
+ vpand $AND_MASK, $R0, $R0
+ vpaddq $R0, $A2, $A2
+
+ vpsrlq \$26, $R2, $R0
+ vpsrlq \$40, $R1, $R2
+ vpand $AND_MASK, $R0, $R0
+ vpxor .LsetBit(%rip), $R2, $R2
+ vpaddq $R0, $A3, $A3
+ vpaddq $R2, $A4, $A4
+
+ # Multiply input by R[0]
+ vpbroadcastq $_r0_($state), $T0
+ vpmuludq $T0, $A0, $R0
+ vpmuludq $T0, $A1, $R1
+ vpmuludq $T0, $A2, $R2
+ vpmuludq $T0, $A3, $R3
+ vpmuludq $T0, $A4, $R4
+ # Multiply input by R[1] (and R[1]*5)
+ vpbroadcastq $_r1_x5($state), $T0
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R0, $R0
+ vpbroadcastq $_r1_($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R4, $R4
+ # Etc
+ vpbroadcastq $_r2_x5($state), $T0
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R1, $R1
+ vpbroadcastq $_r2_($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R4, $R4
+
+ vpbroadcastq $_r3_x5($state), $T0
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R2, $R2
+ vpbroadcastq $_r3_($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R4, $R4
+
+ vpbroadcastq $_r4_x5($state), $T0
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R3, $R3
+ vpbroadcastq $_r4_($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R4, $R4
+ # Reduce
+ vpsrlq \$26, $R3, $T0
+ vpaddq $T0, $R4, $R4
+ vpand $AND_MASK, $R3, $R3
+
+ vpsrlq \$26, $R4, $T0
+ vpsllq \$2, $T0, $T1
+ vpaddq $T1, $T0, $T0
+ vpaddq $T0, $R0, $R0
+ vpand $AND_MASK, $R4, $R4
+
+ vpsrlq \$26, $R0, $T0
+ vpand $AND_MASK, $R0, $A0
+ vpaddq $T0, $R1, $R1
+ vpsrlq \$26, $R1, $T0
+ vpand $AND_MASK, $R1, $A1
+ vpaddq $T0, $R2, $R2
+ vpsrlq \$26, $R2, $T0
+ vpand $AND_MASK, $R2, $A2
+ vpaddq $T0, $R3, $R3
+ vpsrlq \$26, $R3, $T0
+ vpand $AND_MASK, $R3, $A3
+ vpaddq $T0, $R4, $A4
+ jmp 1b
+1:
+
+ cmp \$32*2, $in_len
+ jb 1f
+ sub \$32*2, $in_len
+ # load the next four blocks
+ vmovdqu 32*0($in), $R2
+ vmovdqu 32*1($in), $R3
+ add \$32*2, $in
+
+ vpunpcklqdq $R3, $R2, $R0
+ vpunpckhqdq $R3, $R2, $R1
+
+ vpermq \$0xD8, $R0, $R0
+ vpermq \$0xD8, $R1, $R1
+
+ vpsrlq \$26, $R0, $R2
+ vpand $AND_MASK, $R0, $R0
+ vpaddq $R0, $A0, $A0
+
+ vpsrlq \$26, $R2, $R0
+ vpand $AND_MASK, $R2, $R2
+ vpaddq $R2, $A1, $A1
+
+ vpsllq \$12, $R1, $R2
+ vpxor $R2, $R0, $R0
+ vpand $AND_MASK, $R0, $R0
+ vpaddq $R0, $A2, $A2
+
+ vpsrlq \$26, $R2, $R0
+ vpsrlq \$40, $R1, $R2
+ vpand $AND_MASK, $R0, $R0
+ vpxor .LsetBit(%rip), $R2, $R2
+ vpaddq $R0, $A3, $A3
+ vpaddq $R2, $A4, $A4
+
+ # Multiply input by R[0]
+ vmovdqu $_r0_($state), $T0
+ vpmuludq $T0, $A0, $R0
+ vpmuludq $T0, $A1, $R1
+ vpmuludq $T0, $A2, $R2
+ vpmuludq $T0, $A3, $R3
+ vpmuludq $T0, $A4, $R4
+ # Multiply input by R[1] (and R[1]*5)
+ vmovdqu $_r1_x5($state), $T0
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R0, $R0
+ vmovdqu $_r1_($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R4, $R4
+ # Etc
+ vmovdqu $_r2_x5($state), $T0
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R1, $R1
+ vmovdqu $_r2_($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R4, $R4
+
+ vmovdqu $_r3_x5($state), $T0
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R2, $R2
+ vmovdqu $_r3_($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R4, $R4
+
+ vmovdqu $_r4_x5($state), $T0
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R3, $R3
+ vmovdqu $_r4_($state), $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R4, $R4
+ # Reduce
+ vpsrlq \$26, $R3, $T0
+ vpaddq $T0, $R4, $R4
+ vpand $AND_MASK, $R3, $R3
+ vpsrlq \$26, $R4, $T0
+ vpsllq \$2, $T0, $T1
+ vpaddq $T1, $T0, $T0
+ vpaddq $T0, $R0, $R0
+ vpand $AND_MASK, $R4, $R4
+ vpsrlq \$26, $R0, $T0
+ vpand $AND_MASK, $R0, $A0
+ vpaddq $T0, $R1, $R1
+ vpsrlq \$26, $R1, $T0
+ vpand $AND_MASK, $R1, $A1
+ vpaddq $T0, $R2, $R2
+ vpsrlq \$26, $R2, $T0
+ vpand $AND_MASK, $R2, $A2
+ vpaddq $T0, $R3, $R3
+ vpsrlq \$26, $R3, $T0
+ vpand $AND_MASK, $R3, $A3
+ vpaddq $T0, $R4, $A4
+
+ vpsrldq \$8, $A0, $R0
+ vpsrldq \$8, $A1, $R1
+ vpsrldq \$8, $A2, $R2
+ vpsrldq \$8, $A3, $R3
+ vpsrldq \$8, $A4, $R4
+
+ vpaddq $R0, $A0, $A0
+ vpaddq $R1, $A1, $A1
+ vpaddq $R2, $A2, $A2
+ vpaddq $R3, $A3, $A3
+ vpaddq $R4, $A4, $A4
+
+ vpermq \$0xAA, $A0, $R0
+ vpermq \$0xAA, $A1, $R1
+ vpermq \$0xAA, $A2, $R2
+ vpermq \$0xAA, $A3, $R3
+ vpermq \$0xAA, $A4, $R4
+
+ vpaddq $R0, $A0, $A0
+ vpaddq $R1, $A1, $A1
+ vpaddq $R2, $A2, $A2
+ vpaddq $R3, $A3, $A3
+ vpaddq $R4, $A4, $A4
+1:
+ test $in_len, $in_len
+ jz 5f
+ # In case 1,2 or 3 blocks remain, we want to multiply them correctly
+ vmovq $A0_x, $A0_x
+ vmovq $A1_x, $A1_x
+ vmovq $A2_x, $A2_x
+ vmovq $A3_x, $A3_x
+ vmovq $A4_x, $A4_x
+
+ mov .LsetBit(%rip), $hlp
+ mov %rsp, $rsp_save
+ test \$15, $in_len
+ jz 1f
+ xor $hlp, $hlp
+ sub \$64, %rsp
+ vpxor $R0, $R0, $R0
+ vmovdqu $R0, (%rsp)
+ vmovdqu $R0, 32(%rsp)
+3:
+ movb ($in, $hlp), %r8b
+ movb %r8b, (%rsp, $hlp)
+ inc $hlp
+ cmp $hlp, $in_len
+ jne 3b
+
+ movb \$1, (%rsp, $hlp)
+ xor $hlp, $hlp
+ mov %rsp, $in
+
+1:
+
+ cmp \$16, $in_len
+ ja 2f
+ vmovq 8*0($in), $R0_x
+ vmovq 8*1($in), $R1_x
+ vmovq $hlp, $SET_MASK_x
+ vmovdqa .LpermFix(%rip), $PERM_MASK
+ jmp 1f
+2:
+ cmp \$32, $in_len
+ ja 2f
+ vmovdqu 16*0($in), $R2_x
+ vmovdqu 16*1($in), $R3_x
+ vmovq .LsetBit(%rip), $SET_MASK_x
+ vpinsrq \$1, $hlp, $SET_MASK_x, $SET_MASK_x
+ vmovdqa .LpermFix+32(%rip), $PERM_MASK
+
+ vpunpcklqdq $R3, $R2, $R0
+ vpunpckhqdq $R3, $R2, $R1
+ jmp 1f
+2:
+ cmp \$48, $in_len
+ ja 2f
+ vmovdqu 32*0($in), $R2
+ vmovdqu 32*1($in), $R3_x
+ vmovq .LsetBit(%rip), $SET_MASK_x
+ vpinsrq \$1, $hlp, $SET_MASK_x, $SET_MASK_x
+ vpermq \$0xc4, $SET_MASK, $SET_MASK
+ vmovdqa .LpermFix+64(%rip), $PERM_MASK
+
+ vpunpcklqdq $R3, $R2, $R0
+ vpunpckhqdq $R3, $R2, $R1
+ jmp 1f
+2:
+ vmovdqu 32*0($in), $R2
+ vmovdqu 32*1($in), $R3
+ vmovq .LsetBit(%rip), $SET_MASK_x
+ vpinsrq \$1, $hlp, $SET_MASK_x, $SET_MASK_x
+ vpermq \$0x40, $SET_MASK, $SET_MASK
+ vmovdqa .LpermFix+96(%rip), $PERM_MASK
+
+ vpunpcklqdq $R3, $R2, $R0
+ vpunpckhqdq $R3, $R2, $R1
+
+1:
+ mov $rsp_save, %rsp
+
+ vpsrlq \$26, $R0, $R2
+ vpand $AND_MASK, $R0, $R0
+ vpaddq $R0, $A0, $A0
+
+ vpsrlq \$26, $R2, $R0
+ vpand $AND_MASK, $R2, $R2
+ vpaddq $R2, $A1, $A1
+
+ vpsllq \$12, $R1, $R2
+ vpxor $R2, $R0, $R0
+ vpand $AND_MASK, $R0, $R0
+ vpaddq $R0, $A2, $A2
+
+ vpsrlq \$26, $R2, $R0
+ vpsrlq \$40, $R1, $R2
+ vpand $AND_MASK, $R0, $R0
+ vpxor $SET_MASK, $R2, $R2
+ vpaddq $R0, $A3, $A3
+ vpaddq $R2, $A4, $A4
+
+ # Multiply input by R[0]
+ vmovdqu $_r0_($state), $T0
+ vpermd $T0, $PERM_MASK, $T0
+ vpmuludq $T0, $A0, $R0
+ vpmuludq $T0, $A1, $R1
+ vpmuludq $T0, $A2, $R2
+ vpmuludq $T0, $A3, $R3
+ vpmuludq $T0, $A4, $R4
+ # Multiply input by R[1] (and R[1]*5)
+ vmovdqu $_r1_x5($state), $T0
+ vpermd $T0, $PERM_MASK, $T0
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R0, $R0
+ vmovdqu $_r1_($state), $T0
+ vpermd $T0, $PERM_MASK, $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R4, $R4
+ # Etc
+ vmovdqu $_r2_x5($state), $T0
+ vpermd $T0, $PERM_MASK, $T0
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R1, $R1
+ vmovdqu $_r2_($state), $T0
+ vpermd $T0, $PERM_MASK, $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R4, $R4
+
+ vmovdqu $_r3_x5($state), $T0
+ vpermd $T0, $PERM_MASK, $T0
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R2, $R2
+ vmovdqu $_r3_($state), $T0
+ vpermd $T0, $PERM_MASK, $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R3, $R3
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R4, $R4
+
+ vmovdqu $_r4_x5($state), $T0
+ vpermd $T0, $PERM_MASK, $T0
+ vpmuludq $T0, $A1, $T1
+ vpaddq $T1, $R0, $R0
+ vpmuludq $T0, $A2, $T1
+ vpaddq $T1, $R1, $R1
+ vpmuludq $T0, $A3, $T1
+ vpaddq $T1, $R2, $R2
+ vpmuludq $T0, $A4, $T1
+ vpaddq $T1, $R3, $R3
+ vmovdqu $_r4_($state), $T0
+ vpermd $T0, $PERM_MASK, $T0
+ vpmuludq $T0, $A0, $T1
+ vpaddq $T1, $R4, $R4
+ # Reduce
+ vpsrlq \$26, $R3, $T0
+ vpaddq $T0, $R4, $R4
+ vpand $AND_MASK, $R3, $R3
+ vpsrlq \$26, $R4, $T0
+ vpsllq \$2, $T0, $T1
+ vpaddq $T1, $T0, $T0
+ vpaddq $T0, $R0, $R0
+ vpand $AND_MASK, $R4, $R4
+ vpsrlq \$26, $R0, $T0
+ vpand $AND_MASK, $R0, $A0
+ vpaddq $T0, $R1, $R1
+ vpsrlq \$26, $R1, $T0
+ vpand $AND_MASK, $R1, $A1
+ vpaddq $T0, $R2, $R2
+ vpsrlq \$26, $R2, $T0
+ vpand $AND_MASK, $R2, $A2
+ vpaddq $T0, $R3, $R3
+ vpsrlq \$26, $R3, $T0
+ vpand $AND_MASK, $R3, $A3
+ vpaddq $T0, $R4, $A4
+
+ vpsrldq \$8, $A0, $R0
+ vpsrldq \$8, $A1, $R1
+ vpsrldq \$8, $A2, $R2
+ vpsrldq \$8, $A3, $R3
+ vpsrldq \$8, $A4, $R4
+
+ vpaddq $R0, $A0, $A0
+ vpaddq $R1, $A1, $A1
+ vpaddq $R2, $A2, $A2
+ vpaddq $R3, $A3, $A3
+ vpaddq $R4, $A4, $A4
+
+ vpermq \$0xAA, $A0, $R0
+ vpermq \$0xAA, $A1, $R1
+ vpermq \$0xAA, $A2, $R2
+ vpermq \$0xAA, $A3, $R3
+ vpermq \$0xAA, $A4, $R4
+
+ vpaddq $R0, $A0, $A0
+ vpaddq $R1, $A1, $A1
+ vpaddq $R2, $A2, $A2
+ vpaddq $R3, $A3, $A3
+ vpaddq $R4, $A4, $A4
+
+5:
+ vmovd $A0_x, $_A0_($state)
+ vmovd $A1_x, $_A1_($state)
+ vmovd $A2_x, $_A2_($state)
+ vmovd $A3_x, $_A3_($state)
+ vmovd $A4_x, $_A4_($state)
+6:
+ ret
+.size poly1305_update_avx2,.-poly1305_update_avx2
+###############################################################################
+# void poly1305_finish_avx2(void* $state, uint8_t mac[16]);
+.type poly1305_finish_avx2,\@function,2
+.globl poly1305_finish_avx2
+poly1305_finish_avx2:
+___
+my $mac="%rsi";
+my ($A0, $A1, $A2, $A3, $A4, $T0, $T1)
+ =map("%xmm$_",(0..6));
+
+$code.=<<___;
+ cmpq \$0, 8*7($state)
+ jne 1f
+ jmp poly1305_finish_x64
+
+1:
+ mov $_A0_($state), %r8d
+ mov $_A1_($state), %eax
+ mov $_A2_($state), %r9d
+ mov $_A3_($state), %ecx
+ mov $_A4_($state), %r10d
+
+ shl \$26, %rax
+ add %rax, %r8
+
+ mov %r9, %rax
+ shl \$52, %rax
+ shr \$12, %r9
+ add %rax, %r8
+ adc \$0, %r9
+
+ mov %r10, %rax
+ shl \$14, %rcx
+ shl \$40, %rax
+ shr \$24, %r10
+
+ add %rcx, %r9
+ add %rax, %r9
+ adc \$0, %r10
+
+ mov %r10, %rax
+ shr \$2, %rax
+ and \$3, %r10
+
+ mov %rax, %rcx
+ shl \$2, %rax
+ add %rcx, %rax
+
+ add %rax, %r8
+ adc \$0, %r9
+ adc \$0, %r10
+
+ mov %r8, %rax
+ mov %r9, %rcx
+ sub \$-5, %rax
+ sbb \$-1, %rcx
+ sbb \$3, %r10
+
+ cmovc %r8, %rax
+ cmovc %r9, %rcx
+ add $_k_($state), %rax
+ adc $_k_+8($state), %rcx
+ mov %rax, ($mac)
+ mov %rcx, 8($mac)
+
+ ret
+.size poly1305_finish_avx2,.-poly1305_finish_avx2
+___
+}
+}}
+
+$code =~ s/\`([^\`]*)\`/eval(\$1)/gem;
+print $code;
+close STDOUT;
+
diff --git a/crypto/chacha20poly1305/asm/poly1305_x64.pl b/crypto/chacha20poly1305/asm/poly1305_x64.pl
new file mode 100644
index 0000000..31c4c47
--- /dev/null
+++ b/crypto/chacha20poly1305/asm/poly1305_x64.pl
@@ -0,0 +1,281 @@
+
+##############################################################################
+# #
+# Copyright 2016 CloudFlare LTD #
+# #
+# Licensed under the Apache License, Version 2.0 (the "License"); #
+# you may not use this file except in compliance with the License. #
+# You may obtain a copy of the License at #
+# #
+# http://www.apache.org/licenses/LICENSE-2.0 #
+# #
+# Unless required by applicable law or agreed to in writing, software #
+# distributed under the License is distributed on an "AS IS" BASIS, #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
+# See the License for the specific language governing permissions and #
+# limitations under the License. #
+# #
+##############################################################################
+# #
+# Author: Vlad Krasnov #
+# #
+##############################################################################
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if ($win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if ($win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (`$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
+ my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
+ $avx = ($ver>=3.0) + ($ver>=3.01);
+}
+
+
+{
+{
+
+my ($state, $key)
+ =("%rdi", "%rsi");
+
+$code.=<<___;
+
+.LrSet:
+.align 16
+.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+###############################################################################
+# void poly1305_init_x64(void *state, uint8_t key[32])
+
+.globl poly1305_init_x64
+.type poly1305_init_x64, \@function, 2
+.align 64
+poly1305_init_x64:
+
+ xor %rax, %rax
+ mov %rax, 8*0($state)
+ mov %rax, 8*1($state)
+ mov %rax, 8*2($state)
+
+ movdqu 16*0($key), %xmm0
+ movdqu 16*1($key), %xmm1
+ pand .LrSet(%rip), %xmm0
+
+ movdqu %xmm0, 8*3($state)
+ movdqu %xmm1, 8*3+16($state)
+ movq \$0, 8*7($state)
+
+ ret
+.size poly1305_init_x64,.-poly1305_init_x64
+___
+}
+
+{
+
+my ($state, $inp)
+ =("%rdi", "%rsi");
+
+my ($acc0, $acc1, $acc2, $inl, $t0, $t1, $t2, $t3, $r0)
+ =("%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15");
+
+my ($r1)
+ =("8*4($state)");
+
+$code.=<<___;
+###############################################################################
+# void* poly1305_update_x64(void* state, void* in, uint64_t in_len)
+.globl poly1305_update_x64
+.type poly1305_update_x64, \@function, 2
+.align 64
+poly1305_update_x64:
+
+ push %r11
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov %rdx, $inl
+
+ mov 8*0($state), $acc0
+ mov 8*1($state), $acc1
+ mov 8*2($state), $acc2
+ mov 8*3($state), $r0
+
+ cmp \$16, $inl
+ jb 2f
+ jmp 1f
+
+.align 64
+1:
+############################
+ add 8*0($inp), $acc0
+ adc 8*1($inp), $acc1
+ lea 16($inp), $inp
+ adc \$1, $acc2
+
+5:
+ mov $r0, %rax
+ mulq $acc0
+ mov %rax, $t0
+ mov %rdx, $t1
+
+ mov $r0, %rax
+ mulq $acc1
+ add %rax, $t1
+ adc \$0, %rdx
+
+ mov $r0, $t2
+ imul $acc2, $t2
+ add %rdx, $t2
+############################
+ mov $r1, %rax
+ mulq $acc0
+ add %rax, $t1
+ adc \$0, %rdx
+ mov %rdx, $acc0
+
+ mov $r1, %rax
+ mulq $acc1
+ add $acc0, $t2
+ adc \$0, %rdx
+ add %rax, $t2
+ adc \$0, %rdx
+
+ mov $r1, $t3
+ imul $acc2, $t3
+ add %rdx, $t3
+############################
+
+ mov $t0, $acc0
+ mov $t1, $acc1
+ mov $t2, $acc2
+ and \$3, $acc2
+
+ mov $t2, $t0
+ mov $t3, $t1
+
+ and \$-4, $t0
+ shrd \$2, $t3, $t2
+ shr \$2, $t3
+
+ add $t0, $acc0
+ adc $t1, $acc1
+ adc \$0, $acc2
+
+ add $t2, $acc0
+ adc $t3, $acc1
+ adc \$0, $acc2
+
+ sub \$16, $inl
+ cmp \$16, $inl
+ jae 1b
+
+2:
+ test $inl, $inl
+ jz 3f
+
+ mov \$1, $t0
+ xor $t1, $t1
+ xor $t2, $t2
+ add $inl, $inp
+
+4:
+ shld \$8, $t0, $t1
+ shl \$8, $t0
+ movzxb -1($inp), $t2
+ xor $t2, $t0
+ dec $inp
+ dec $inl
+ jnz 4b
+
+ add $t0, $acc0
+ adc $t1, $acc1
+ adc \$0, $acc2
+
+ mov \$16, $inl
+ jmp 5b
+
+3:
+
+ mov $acc0, 8*0($state)
+ mov $acc1, 8*1($state)
+ mov $acc2, 8*2($state)
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %r11
+ ret
+.size poly1305_update_x64, .-poly1305_update_x64
+___
+}
+
+{
+
+my ($mac, $state)=("%rsi", "%rdi");
+
+my ($acc0, $acc1, $acc2, $t0, $t1, $t2)
+ =("%rcx", "%rax", "%rdx", "%r8", "%r9", "%r10");
+
+$code.=<<___;
+###############################################################################
+# void poly1305_finish_x64(void* state, uint64_t mac[2]);
+.type poly1305_finish_x64,\@function, 2
+.align 64
+.globl poly1305_finish_x64
+poly1305_finish_x64:
+
+ mov 8*0($state), $acc0
+ mov 8*1($state), $acc1
+ mov 8*2($state), $acc2
+
+ mov $acc0, $t0
+ mov $acc1, $t1
+ mov $acc2, $t2
+
+ sub \$-5, $acc0
+ sbb \$-1, $acc1
+ sbb \$3, $acc2
+
+ cmovc $t0, $acc0
+ cmovc $t1, $acc1
+ cmovc $t2, $acc2
+
+ add 8*5($state), $acc0
+ adc 8*6($state), $acc1
+ mov $acc0, ($mac)
+ mov $acc1, 8($mac)
+
+ ret
+.size poly1305_finish_x64, .-poly1305_finish_x64
+___
+}
+}
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/chacha20poly1305/chacha20.c b/crypto/chacha20poly1305/chacha20.c
new file mode 100644
index 0000000..3044751
--- /dev/null
+++ b/crypto/chacha20poly1305/chacha20.c
@@ -0,0 +1,162 @@
+/* Copyright (c) 2014, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+/* Adapted from the public domain, estream code by D. Bernstein. */
+
+#include "chacha20poly1305.h"
+
+/* sigma contains the ChaCha constants, which happen to be an ASCII string. */
+static const char sigma[16] = "expand 32-byte k";
+
+#define ROTATE(v, n) (((v) << (n)) | ((v) >> (32 - (n))))
+#define XOR(v, w) ((v) ^ (w))
+#define PLUS(x, y) ((x) + (y))
+#define PLUSONE(v) (PLUS((v), 1))
+
+#define U32TO8_LITTLE(p, v) \
+ { \
+ (p)[0] = (v >> 0) & 0xff; \
+ (p)[1] = (v >> 8) & 0xff; \
+ (p)[2] = (v >> 16) & 0xff; \
+ (p)[3] = (v >> 24) & 0xff; \
+ }
+
+#define U8TO32_LITTLE(p) \
+ (((uint32_t)((p)[0])) | ((uint32_t)((p)[1]) << 8) | \
+ ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))
+
+/* QUARTERROUND updates a, b, c, d with a ChaCha "quarter" round. */
+#define QUARTERROUND(a,b,c,d) \
+ x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \
+ x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \
+ x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \
+ x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7);
+
+/* chacha_core performs |num_rounds| rounds of ChaCha20 on the input words in
+ * |input| and writes the 64 output bytes to |output|. */
+static void chacha_core(uint8_t output[64], const uint32_t input[16]) {
+ uint32_t x[16];
+ int i;
+
+ memcpy(x, input, sizeof(uint32_t) * 16);
+ for (i = 20; i > 0; i -= 2) {
+ QUARTERROUND(0, 4, 8, 12)
+ QUARTERROUND(1, 5, 9, 13)
+ QUARTERROUND(2, 6, 10, 14)
+ QUARTERROUND(3, 7, 11, 15)
+ QUARTERROUND(0, 5, 10, 15)
+ QUARTERROUND(1, 6, 11, 12)
+ QUARTERROUND(2, 7, 8, 13)
+ QUARTERROUND(3, 4, 9, 14)
+ }
+
+ for (i = 0; i < 16; ++i) {
+ x[i] = PLUS(x[i], input[i]);
+ }
+ for (i = 0; i < 16; ++i) {
+ U32TO8_LITTLE(output + 4 * i, x[i]);
+ }
+}
+
+void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len,
+ uint8_t nonce[48]) {
+
+#ifdef CHAPOLY_x86_64_ASM
+ const int AVX2_threshold = 256;
+ const int AVX2_min_buf = 128;
+ const int AVX_min_buf = 64;
+
+ uint8_t buf[128];
+ size_t buf_size;
+
+ void (*core_func)(uint8_t *out, const uint8_t *in, size_t in_len,
+ uint8_t nonce[48]) = NULL;
+#else /* !CHAPOLY_x86_64_ASM */
+
+ uint8_t buf[64];
+
+#endif
+
+ uint32_t input[16];
+ size_t todo, i;
+
+#ifdef CHAPOLY_x86_64_ASM
+ if (in_len >= AVX2_threshold && ((OPENSSL_ia32cap_loc()[1] >> 5) & 1)) {
+ buf_size = AVX2_min_buf;
+ core_func = chacha_20_core_avx2;
+ } else if ((OPENSSL_ia32cap_loc()[0] >> 60) & 1) {
+ buf_size = AVX_min_buf;
+ core_func = chacha_20_core_avx;
+ } else goto do_legacy;
+
+ core_func(out, in, in_len, nonce);
+ todo = in_len & (buf_size - 1);
+
+ if(todo) {
+ out += in_len - todo;
+ in += in_len - todo;
+ memcpy(buf, in, todo);
+
+ core_func(buf, buf, buf_size, nonce);
+
+ memcpy(out, buf, todo);
+ memset(buf, 0, buf_size);
+ }
+ return;
+
+do_legacy:
+#endif /* CHAPOLY_x86_64_ASM */
+
+ input[0] = U8TO32_LITTLE(sigma + 0);
+ input[1] = U8TO32_LITTLE(sigma + 4);
+ input[2] = U8TO32_LITTLE(sigma + 8);
+ input[3] = U8TO32_LITTLE(sigma + 12);
+
+ input[4] = U8TO32_LITTLE(nonce + 0);
+ input[5] = U8TO32_LITTLE(nonce + 4);
+ input[6] = U8TO32_LITTLE(nonce + 8);
+ input[7] = U8TO32_LITTLE(nonce + 12);
+
+ input[8] = U8TO32_LITTLE(nonce + 16);
+ input[9] = U8TO32_LITTLE(nonce + 20);
+ input[10] = U8TO32_LITTLE(nonce + 24);
+ input[11] = U8TO32_LITTLE(nonce + 28);
+
+ input[12] = U8TO32_LITTLE(nonce + 32);
+ input[13] = U8TO32_LITTLE(nonce + 36);
+ input[14] = U8TO32_LITTLE(nonce + 40);
+ input[15] = U8TO32_LITTLE(nonce + 44);
+
+ while (in_len > 0) {
+ todo = 64;
+ if (in_len < todo) {
+ todo = in_len;
+ }
+
+ chacha_core(buf, input);
+ for (i = 0; i < todo; i++) {
+ out[i] = in[i] ^ buf[i];
+ }
+
+ out += todo;
+ in += todo;
+ in_len -= todo;
+
+ ((uint64_t*)input)[6]++;
+ }
+
+ U32TO8_LITTLE(nonce + 32, input[12]);
+ U32TO8_LITTLE(nonce + 36, input[13]);
+}
+
diff --git a/crypto/chacha20poly1305/chacha20poly1305.h b/crypto/chacha20poly1305/chacha20poly1305.h
new file mode 100644
index 0000000..09b0450
--- /dev/null
+++ b/crypto/chacha20poly1305/chacha20poly1305.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2014, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#ifndef OPENSSL_HEADER_POLY1305_H
+#define OPENSSL_HEADER_POLY1305_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+#include "crypto.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define POLY1305_MAC_LEN (16)
+#define POLY1305_PAD_LEN (16)
+
+typedef unsigned char poly1305_state[372];
+
+
+/* CRYPTO_poly1305_init sets up |state| so that it can be used to calculate an
+ * authentication tag with the one-time key |key|. Note that |key| is a
+ * one-time key and therefore there is no `reset' method because that would
+ * enable several messages to be authenticated with the same key. */
+void CRYPTO_poly1305_init(poly1305_state* state, const uint8_t key[32]);
+
+/* CRYPTO_poly1305_update processes |in_len| bytes from |in|. It can be called
+ * zero or more times after poly1305_init. */
+void CRYPTO_poly1305_update(poly1305_state* state, const uint8_t* in,
+ size_t in_len);
+
+/* CRYPTO_poly1305_finish completes the poly1305 calculation and writes a 16
+ * byte authentication tag to |mac|. */
+void CRYPTO_poly1305_finish(poly1305_state* state,
+ uint8_t mac[POLY1305_MAC_LEN]);
+
+/* CRYPTO_chacha_20 encrypts |in_len| bytes from |in| with the given key and
+ * nonce and writes the result to |out|, which may be equal to |in|. The
+ * initial block counter is specified by |counter|. */
+void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len,
+ uint8_t nonce[48]);
+
+#ifdef CHAPOLY_x86_64_ASM
+void poly1305_init_x64(poly1305_state* state, const uint8_t key[32]);
+void poly1305_update_x64(poly1305_state* state, const uint8_t *in, size_t in_len);
+void poly1305_finish_x64(poly1305_state* state, uint8_t mac[16]);
+
+void poly1305_init_avx(poly1305_state* state, const uint8_t key[32]);
+void poly1305_update_avx(poly1305_state* state, const uint8_t *in, size_t in_len);
+void poly1305_finish_avx(poly1305_state* state, uint8_t mac[16]);
+
+void poly1305_update_avx2(poly1305_state* state, const uint8_t *in, size_t in_len);
+void poly1305_finish_avx2(poly1305_state* state, uint8_t mac[16]);
+
+void chacha_20_core_avx(uint8_t *out, const uint8_t *in, size_t in_len,
+ uint8_t nonce[48]);
+
+void chacha_20_core_avx2(uint8_t *out, const uint8_t *in, size_t in_len,
+ uint8_t nonce[48]);
+#endif
+
+
+#if defined(__cplusplus)
+} /* extern C */
+#endif
+
+#endif /* OPENSSL_HEADER_POLY1305_H */
diff --git a/crypto/chacha20poly1305/chapolytest.c b/crypto/chacha20poly1305/chapolytest.c
new file mode 100644
index 0000000..7e2933f
--- /dev/null
+++ b/crypto/chacha20poly1305/chapolytest.c
@@ -0,0 +1,470 @@
+/* ====================================================================
+ * Copyright (c) 2011-2013 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include <openssl/chacha20poly1305.h>
+
+struct chacha_test {
+ const char *noncehex;
+ const char *outhex;
+};
+
+struct poly1305_test {
+ const char *inputhex;
+ const char *keyhex;
+ const char *outhex;
+};
+
+static const struct chacha_test chacha_tests[] = {
+ {
+ "00000000000000000000000000000000""00000000000000000000000000000000"
+ "00000000000000000000000000000000",
+ "76b8e0ada0f13d90405d6ae55386bd28""bdd219b8a08ded1aa836efcc8b770dc7"
+ "da41597c5157488d7724e03fb8d84a37""6a43b8f41518a11cc387b669b2ee6586",
+ },
+ {
+ "00000000000000000000000000000000""00000000000000000000000000000001"
+ "00000000000000000000000000000000",
+ "4540f05a9f1fb296d7736e7b208e3c96""eb4fe1834688d2604f450952ed432d41"
+ "bbe2a0b6ea7566d2a5d1e7e20d42af2c""53d792b1c43fea817e9ad275ae546963",
+ },
+ {
+ "00000000000000000000000000000000""00000000000000000000000000000000"
+ "00000000000000000000000000000001",
+ "de9cba7bf3d69ef5e786dc63973f653a""0b49e015adbff7134fcb7df137821031"
+ "e85a050278a7084527214f73efc7fa5b""5277062eb7a0433e445f41e31afab757",
+ },
+ {
+ "00000000000000000000000000000000""00000000000000000000000000000000"
+ "00000000000000000100000000000000",
+ "ef3fdfd6c61578fbf5cf35bd3dd33b80""09631634d21e42ac33960bd138e50d32"
+ "111e4caf237ee53ca8ad6426194a8854""5ddc497a0b466e7d6bbdb0041b2f586b",
+ },
+ {
+ "000102030405060708090a0b0c0d0e0f""101112131415161718191a1b1c1d1e1f"
+ "00000000000000000001020304050607",
+ "f798a189f195e66982105ffb640bb775""7f579da31602fc93ec01ac56f85ac3c1"
+ "34a4547b733b46413042c94400491769""05d3be59ea1c53f15916155c2be8241a"
+ "38008b9a26bc35941e2444177c8ade66""89de95264986d95889fb60e84629c9bd"
+ "9a5acb1cc118be563eb9b3a4a472f82e""09a7e778492b562ef7130e88dfe031c7"
+ "9db9d4f7c7a899151b9a475032b63fc3""85245fe054e3dd5a97a5f576fe064025"
+ "d3ce042c566ab2c507b138db853e3d69""59660996546cc9c4a6eafdc777c040d7"
+ "0eaf46f76dad3979e5c5360c3317166a""1c894c94a371876a94df7628fe4eaaf2"
+ "ccb27d5aaae0ad7ad0f9d4b6ad3b5409""8746d4524d38407a6deb",
+ },
+};
+
+static const struct poly1305_test poly1305_tests[] = {
+ {
+ "",
+ "c8afaac331ee372cd6082de134943b17""4710130e9f6fea8d72293850a667d86c",
+ "4710130e9f6fea8d72293850a667d86c",
+ },
+ {
+ "48656c6c6f20776f726c6421",
+ "746869732069732033322d6279746520""6b657920666f7220506f6c7931333035",
+ "a6f745008f81c916a20dcc74eef2b2f0",
+ },
+ {
+ "00000000000000000000000000000000""00000000000000000000000000000000",
+ "746869732069732033322d6279746520""6b657920666f7220506f6c7931333035",
+ "49ec78090e481ec6c26b33b91ccc0307",
+ },
+ {
+ "43727970746f6772617068696320466f""72756d2052657365617263682047726f"
+ "7570",
+ "85d6be7857556d337f4452fe42d506a8""0103808afb0db2fd4abff6af4149f51b",
+ "a8061dc1305136c6c22b8baf0c0127a9"
+ },
+ {
+ "f3f6",
+ "851fc40c3467ac0be05cc20404f3f700""580b3b0f9447bb1e69d095b5928b6dbc",
+ "f4c633c3044fc145f84f335cb81953de"
+ },
+ {
+ "",
+ "a0f3080000f46400d0c7e9076c834403""dd3fab2251f11ac759f0887129cc2ee7",
+ "dd3fab2251f11ac759f0887129cc2ee7"
+ },
+ {
+ "663cea190ffb83d89593f3f476b6bc24""d7e679107ea26adb8caf6652d0656136",
+ "48443d0bb0d21109c89a100b5ce2c208""83149c69b561dd88298a1798b10716ef",
+ "0ee1c16bb73f0f4fd19881753c01cdbe"
+ },
+ {
+ "ab0812724a7f1e342742cbed374d94d1""36c6b8795d45b3819830f2c04491faf0"
+ "990c62e48b8018b2c3e4a0fa3134cb67""fa83e158c994d961c4cb21095c1bf9",
+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57",
+ "5154ad0d2cb26e01274fc51148491f1b"
+ },
+ /*
+ * self-generated
+ */
+ {
+ "ab0812724a7f1e342742cbed374d94d1""36c6b8795d45b3819830f2c04491faf0"
+ "990c62e48b8018b2c3e4a0fa3134cb67""fa83e158c994d961c4cb21095c1bf9af",
+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57",
+ "812059a5da198637cac7c4a631bee466"
+ },
+ {
+ "ab0812724a7f1e342742cbed374d94d1""36c6b8795d45b3819830f2c04491faf0"
+ "990c62e48b8018b2c3e4a0fa3134cb67",
+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57",
+ "5b88d7f6228b11e2e28579a5c0c1f761"
+ },
+ {
+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0"
+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af"
+ "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136",
+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57",
+ "bbb613b2b6d753ba07395b916aaece15"
+ },
+ {
+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0"
+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af"
+ "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef"
+ "663cea190ffb83d89593f3f476b6bc24",
+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57",
+ "c794d7057d1778c4bbee0a39b3d97342"
+ },
+ {
+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0"
+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af"
+ "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef"
+ "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136",
+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57",
+ "ffbcb9b371423152d7fca5ad042fbaa9"
+ },
+ {
+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0"
+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af"
+ "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef"
+ "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136"
+ "812059a5da198637cac7c4a631bee466",
+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57",
+ "069ed6b8ef0f207b3e243bb1019fe632"
+ },
+ {
+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0"
+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af"
+ "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef"
+ "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136"
+ "812059a5da198637cac7c4a631bee4665b88d7f6228b11e2e28579a5c0c1f761",
+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57",
+ "cca339d9a45fa2368c2c68b3a4179133"
+ },
+ {
+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0"
+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af"
+ "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef"
+ "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136"
+ "812059a5da198637cac7c4a631bee4665b88d7f6228b11e2e28579a5c0c1f761"
+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0"
+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af"
+ "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef"
+ "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136",
+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57",
+ "53f6e828a2f0fe0ee815bf0bd5841a34"
+ },
+ {
+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0"
+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af"
+ "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef"
+ "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136"
+ "812059a5da198637cac7c4a631bee4665b88d7f6228b11e2e28579a5c0c1f761"
+ "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0"
+ "990c62e48b8018b2c3e4a0fa3134cb67fa83e158c994d961c4cb21095c1bf9af"
+ "48443d0bb0d21109c89a100b5ce2c20883149c69b561dd88298a1798b10716ef"
+ "663cea190ffb83d89593f3f476b6bc24d7e679107ea26adb8caf6652d0656136"
+ "812059a5da198637cac7c4a631bee4665b88d7f6228b11e2e28579a5c0c1f761",
+ "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57",
+ "b846d44e9bbd53cedffbfbb6b7fa4933"
+ },
+ {
+ /*
+ * poly1305_ieee754.c failed this in final stage
+ */
+ "842364e156336c0998b933a6237726180d9e3fdcbde4cd5d17080fc3beb49614"
+ "d7122c037463ff104d73f19c12704628d417c4c54a3fe30d3c3d7714382d43b0"
+ "382a50a5dee54be844b076e8df88201a1cd43b90eb21643fa96f39b518aa8340"
+ "c942ff3c31baf7c9bdbf0f31ae3fa096bf8c63030609829fe72e179824890bc8"
+ "e08c315c1cce2a83144dbbff09f74e3efc770b54d0984a8f19b14719e6363564"
+ "1d6b1eedf63efbf080e1783d32445412114c20de0b837a0dfa33d6b82825fff4"
+ "4c9a70ea54ce47f07df698e6b03323b53079364a5fc3e9dd034392bdde86dccd"
+ "da94321c5e44060489336cb65bf3989c36f7282c2f5d2b882c171e74",
+ "95d5c005503e510d8cd0aa072c4a4d06""6eabc52d11653df47fbf63ab198bcc26",
+ "f248312e578d9d58f8b7bb4d19105431"
+ },
+ /*
+ * test vectors from Google
+ */
+ {
+ "",
+ "c8afaac331ee372cd6082de134943b17""4710130e9f6fea8d72293850a667d86c",
+ "4710130e9f6fea8d72293850a667d86c",
+ },
+ {
+ "48656c6c6f20776f726c6421",
+ "746869732069732033322d6279746520""6b657920666f7220506f6c7931333035",
+ "a6f745008f81c916a20dcc74eef2b2f0"
+ },
+ {
+ "0000000000000000000000000000000000000000000000000000000000000000",
+ "746869732069732033322d6279746520""6b657920666f7220506f6c7931333035",
+ "49ec78090e481ec6c26b33b91ccc0307"
+ },
+ /*
+ * test vectors from Andrew Moon
+ */
+ { /* nacl */
+ "8e993b9f48681273c29650ba32fc76ce48332ea7164d96a4476fb8c531a1186a"
+ "c0dfc17c98dce87b4da7f011ec48c97271d2c20f9b928fe2270d6fb863d51738"
+ "b48eeee314a7cc8ab932164548e526ae90224368517acfeabd6bb3732bc0e9da"
+ "99832b61ca01b6de56244a9e88d5f9b37973f622a43d14a6599b1f654cb45a74"
+ "e355a5",
+ "eea6a7251c1e72916d11c2cb214d3c25""2539121d8e234e652d651fa4c8cff880",
+ "f3ffc7703f9400e52a7dfb4b3d3305d9"
+ },
+ { /* wrap 2^130-5 */
+ "ffffffffffffffffffffffffffffffff",
+ "02000000000000000000000000000000""00000000000000000000000000000000",
+ "03000000000000000000000000000000"
+ },
+ { /* wrap 2^128 */
+ "02000000000000000000000000000000",
+ "02000000000000000000000000000000""ffffffffffffffffffffffffffffffff",
+ "03000000000000000000000000000000"
+ },
+ { /* limb carry */
+ "fffffffffffffffffffffffffffffffff0ffffffffffffffffffffffffffffff"
+ "11000000000000000000000000000000",
+ "01000000000000000000000000000000""00000000000000000000000000000000",
+ "05000000000000000000000000000000"
+ },
+ { /* 2^130-5 */
+ "fffffffffffffffffffffffffffffffffbfefefefefefefefefefefefefefefe"
+ "01010101010101010101010101010101",
+ "01000000000000000000000000000000""00000000000000000000000000000000",
+ "00000000000000000000000000000000"
+ },
+ { /* 2^130-6 */
+ "fdffffffffffffffffffffffffffffff",
+ "02000000000000000000000000000000""00000000000000000000000000000000",
+ "faffffffffffffffffffffffffffffff"
+ },
+ { /* 5*H+L reduction intermediate */
+ "e33594d7505e43b900000000000000003394d7505e4379cd0100000000000000"
+ "0000000000000000000000000000000001000000000000000000000000000000",
+ "01000000000000000400000000000000""00000000000000000000000000000000",
+ "14000000000000005500000000000000"
+ },
+ { /* 5*H+L reduction final */
+ "e33594d7505e43b900000000000000003394d7505e4379cd0100000000000000"
+ "00000000000000000000000000000000",
+ "01000000000000000400000000000000""00000000000000000000000000000000",
+ "13000000000000000000000000000000"
+ }
+};
+
+static unsigned char hex_digit(char h)
+{
+ if (h >= '0' && h <= '9')
+ return h - '0';
+ else if (h >= 'a' && h <= 'f')
+ return h - 'a' + 10;
+ else if (h >= 'A' && h <= 'F')
+ return h - 'A' + 10;
+ else
+ abort();
+}
+
+static void hex_decode(unsigned char *out, const char* hex)
+{
+ size_t j = 0;
+
+ while (*hex != 0) {
+ unsigned char v = hex_digit(*hex++);
+ v <<= 4;
+ v |= hex_digit(*hex++);
+ out[j++] = v;
+ }
+}
+
+static void hexdump(unsigned char *a, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++) {
+ printf("%02x", a[i]);
+ }
+}
+
+/* misalign returns a pointer that points 0 to 15 bytes into |in| such that the
+ * returned pointer has alignment 1 mod 16. */
+static void* misalign(void* in)
+{
+ intptr_t x = (intptr_t) in;
+ x += (17 - (x % 16)) % 16;
+ return (void*) x;
+}
+
+int main()
+{
+
+ unsigned num_tests =
+ sizeof(chacha_tests) / sizeof(struct chacha_test);
+ unsigned i;
+ unsigned char nonce_bytes[48 + 16] = {0};
+
+ for (i = 0; i < num_tests; i++) {
+ unsigned char *nonce = misalign(nonce_bytes);
+
+ printf("ChaCha20 test #%d\n", i);
+ const struct chacha_test *test = &chacha_tests[i];
+ unsigned char *expected, *out_bytes, *zero_bytes, *out, *zeros;
+ size_t len = strlen(test->outhex);
+
+ if (strlen(test->noncehex) != 48*2 || (len & 1) == 1)
+ return 1;
+
+ len /= 2;
+
+ hex_decode(nonce, test->noncehex);
+
+ expected = malloc(len);
+ out_bytes = malloc(len+16);
+ zero_bytes = malloc(len+16);
+ /* Attempt to test unaligned inputs. */
+ out = misalign(out_bytes);
+ zeros = misalign(zero_bytes);
+ memset(zeros, 0, len);
+
+ hex_decode(expected, test->outhex);
+ CRYPTO_chacha_20(out, zeros, len, nonce);
+
+ if (memcmp(out, expected, len) != 0) {
+ printf("ChaCha20 test #%d failed.\n", i);
+ printf("got: ");
+ hexdump(out, len);
+ printf("\nexpected: ");
+ hexdump(expected, len);
+ printf("\n");
+ return 1;
+ }
+
+
+ free(expected);
+ free(zero_bytes);
+ free(out_bytes);
+ }
+
+ num_tests =
+ sizeof(poly1305_tests) / sizeof(struct poly1305_test);
+ unsigned char key[32], out[16], expected[16];
+ poly1305_state poly1305;
+
+ for (i = 0; i < num_tests; i++) {
+ printf("Poly1305 test #%d\n", i);
+ const struct poly1305_test *test = &poly1305_tests[i];
+ unsigned char *in;
+ size_t inlen = strlen(test->inputhex);
+
+ if (strlen(test->keyhex) != sizeof(key)*2 ||
+ strlen(test->outhex) != sizeof(out)*2 ||
+ (inlen & 1) == 1)
+ return 1;
+
+ inlen /= 2;
+
+ hex_decode(key, test->keyhex);
+ hex_decode(expected, test->outhex);
+
+ in = malloc(inlen);
+
+ hex_decode(in, test->inputhex);
+
+#ifdef CHAPOLY_x86_64_ASM
+ if((OPENSSL_ia32cap_loc()[1] >> 5) & 1) {
+ poly1305_init_x64(&poly1305, key);
+ poly1305_update_avx2(&poly1305, in, inlen);
+ poly1305_finish_avx2(&poly1305, out);
+ } else {
+ poly1305_init_x64(&poly1305, key);
+ poly1305_update_x64(&poly1305, in, inlen);
+ poly1305_finish_x64(&poly1305, out);
+ }
+#else
+ {
+ CRYPTO_poly1305_init(&poly1305, key);
+ CRYPTO_poly1305_update(&poly1305, in, inlen);
+ CRYPTO_poly1305_finish(&poly1305, out);
+ }
+#endif
+ if (memcmp(out, expected, sizeof(expected)) != 0) {
+ printf("Poly1305 test #%d failed.\n", i);
+ printf("got: ");
+ hexdump(out, sizeof(out));
+ printf("\nexpected: ");
+ hexdump(expected, sizeof(expected));
+ printf("\n");
+ return 1;
+ }
+
+ free(in);
+ }
+
+ printf("PASS\n");
+ return 0;
+}
+
diff --git a/crypto/chacha20poly1305/poly1305.c b/crypto/chacha20poly1305/poly1305.c
new file mode 100644
index 0000000..50bc4a0
--- /dev/null
+++ b/crypto/chacha20poly1305/poly1305.c
@@ -0,0 +1,287 @@
+/* Copyright (c) 2014, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+/* This implementation of poly1305 is by Andrew Moon
+ * (https://github.com/floodyberry/poly1305-donna) and released as public
+ * domain. */
+
+#include "chacha20poly1305.h"
+
+#include <string.h>
+
+#if !defined(B_ENDIAN)
+/* We can assume little-endian. */
+static uint32_t U8TO32_LE(const uint8_t *m) {
+ uint32_t r;
+ memcpy(&r, m, sizeof(r));
+ return r;
+}
+
+static void U32TO8_LE(uint8_t *m, uint32_t v) { memcpy(m, &v, sizeof(v)); }
+#else
+static uint32_t U8TO32_LE(const uint8_t *m) {
+ return (uint32_t)m[0] | (uint32_t)m[1] << 8 | (uint32_t)m[2] << 16 |
+ (uint32_t)m[3] << 24;
+}
+
+static void U32TO8_LE(uint8_t *m, uint32_t v) {
+ m[0] = v;
+ m[1] = v >> 8;
+ m[2] = v >> 16;
+ m[3] = v >> 24;
+}
+#endif
+
+static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; }
+
+struct poly1305_state_st {
+ uint32_t r0, r1, r2, r3, r4;
+ uint32_t s1, s2, s3, s4;
+ uint32_t h0, h1, h2, h3, h4;
+ uint8_t buf[16];
+ unsigned int buf_used;
+ uint8_t key[16];
+};
+
+/* poly1305_blocks updates |state| given some amount of input data. This
+ * function may only be called with a |len| that is not a multiple of 16 at the
+ * end of the data. Otherwise the input must be buffered into 16 byte blocks. */
+static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in,
+ size_t len) {
+ uint32_t t0, t1, t2, t3;
+ uint64_t t[5];
+ uint32_t b;
+ uint64_t c;
+ size_t j;
+ uint8_t mp[16];
+
+ if (len < 16) {
+ goto poly1305_donna_atmost15bytes;
+ }
+
+poly1305_donna_16bytes:
+ t0 = U8TO32_LE(in);
+ t1 = U8TO32_LE(in + 4);
+ t2 = U8TO32_LE(in + 8);
+ t3 = U8TO32_LE(in + 12);
+
+ in += 16;
+ len -= 16;
+
+ state->h0 += t0 & 0x3ffffff;
+ state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff;
+ state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff;
+ state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff;
+ state->h4 += (t3 >> 8) | (1 << 24);
+
+poly1305_donna_mul:
+ t[0] = mul32x32_64(state->h0, state->r0) + mul32x32_64(state->h1, state->s4) +
+ mul32x32_64(state->h2, state->s3) + mul32x32_64(state->h3, state->s2) +
+ mul32x32_64(state->h4, state->s1);
+ t[1] = mul32x32_64(state->h0, state->r1) + mul32x32_64(state->h1, state->r0) +
+ mul32x32_64(state->h2, state->s4) + mul32x32_64(state->h3, state->s3) +
+ mul32x32_64(state->h4, state->s2);
+ t[2] = mul32x32_64(state->h0, state->r2) + mul32x32_64(state->h1, state->r1) +
+ mul32x32_64(state->h2, state->r0) + mul32x32_64(state->h3, state->s4) +
+ mul32x32_64(state->h4, state->s3);
+ t[3] = mul32x32_64(state->h0, state->r3) + mul32x32_64(state->h1, state->r2) +
+ mul32x32_64(state->h2, state->r1) + mul32x32_64(state->h3, state->r0) +
+ mul32x32_64(state->h4, state->s4);
+ t[4] = mul32x32_64(state->h0, state->r4) + mul32x32_64(state->h1, state->r3) +
+ mul32x32_64(state->h2, state->r2) + mul32x32_64(state->h3, state->r1) +
+ mul32x32_64(state->h4, state->r0);
+
+ state->h0 = (uint32_t)t[0] & 0x3ffffff;
+ c = (t[0] >> 26);
+ t[1] += c;
+ state->h1 = (uint32_t)t[1] & 0x3ffffff;
+ b = (uint32_t)(t[1] >> 26);
+ t[2] += b;
+ state->h2 = (uint32_t)t[2] & 0x3ffffff;
+ b = (uint32_t)(t[2] >> 26);
+ t[3] += b;
+ state->h3 = (uint32_t)t[3] & 0x3ffffff;
+ b = (uint32_t)(t[3] >> 26);
+ t[4] += b;
+ state->h4 = (uint32_t)t[4] & 0x3ffffff;
+ b = (uint32_t)(t[4] >> 26);
+ state->h0 += b * 5;
+
+ if (len >= 16)
+ goto poly1305_donna_16bytes;
+
+/* final bytes */
+poly1305_donna_atmost15bytes:
+ if (!len)
+ return;
+
+ for (j = 0; j < len; j++)
+ mp[j] = in[j];
+ mp[j++] = 1;
+ for (; j < 16; j++)
+ mp[j] = 0;
+ len = 0;
+
+ t0 = U8TO32_LE(mp + 0);
+ t1 = U8TO32_LE(mp + 4);
+ t2 = U8TO32_LE(mp + 8);
+ t3 = U8TO32_LE(mp + 12);
+
+ state->h0 += t0 & 0x3ffffff;
+ state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff;
+ state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff;
+ state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff;
+ state->h4 += (t3 >> 8);
+
+ goto poly1305_donna_mul;
+}
+
+void CRYPTO_poly1305_init(poly1305_state *statep, const uint8_t key[32]) {
+ struct poly1305_state_st *state = (struct poly1305_state_st *)statep;
+ uint32_t t0, t1, t2, t3;
+
+ t0 = U8TO32_LE(key + 0);
+ t1 = U8TO32_LE(key + 4);
+ t2 = U8TO32_LE(key + 8);
+ t3 = U8TO32_LE(key + 12);
+
+ /* precompute multipliers */
+ state->r0 = t0 & 0x3ffffff;
+ t0 >>= 26;
+ t0 |= t1 << 6;
+ state->r1 = t0 & 0x3ffff03;
+ t1 >>= 20;
+ t1 |= t2 << 12;
+ state->r2 = t1 & 0x3ffc0ff;
+ t2 >>= 14;
+ t2 |= t3 << 18;
+ state->r3 = t2 & 0x3f03fff;
+ t3 >>= 8;
+ state->r4 = t3 & 0x00fffff;
+
+ state->s1 = state->r1 * 5;
+ state->s2 = state->r2 * 5;
+ state->s3 = state->r3 * 5;
+ state->s4 = state->r4 * 5;
+
+ /* init state */
+ state->h0 = 0;
+ state->h1 = 0;
+ state->h2 = 0;
+ state->h3 = 0;
+ state->h4 = 0;
+
+ state->buf_used = 0;
+ memcpy(state->key, key + 16, sizeof(state->key));
+}
+
+void CRYPTO_poly1305_update(poly1305_state *statep, const uint8_t *in,
+ size_t in_len) {
+ unsigned int i;
+ struct poly1305_state_st *state = (struct poly1305_state_st *)statep;
+
+ if (state->buf_used) {
+ unsigned int todo = 16 - state->buf_used;
+ if (todo > in_len)
+ todo = in_len;
+ for (i = 0; i < todo; i++)
+ state->buf[state->buf_used + i] = in[i];
+ state->buf_used += todo;
+ in_len -= todo;
+ in += todo;
+
+ if (state->buf_used == 16) {
+ poly1305_update(state, state->buf, 16);
+ state->buf_used = 0;
+ }
+ }
+
+ if (in_len >= 16) {
+ size_t todo = in_len & ~0xf;
+ poly1305_update(state, in, todo);
+ in += todo;
+ in_len &= 0xf;
+ }
+
+ if (in_len) {
+ for (i = 0; i < in_len; i++)
+ state->buf[i] = in[i];
+ state->buf_used = in_len;
+ }
+}
+
+void CRYPTO_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) {
+ struct poly1305_state_st *state = (struct poly1305_state_st *)statep;
+ uint64_t f0, f1, f2, f3;
+ uint32_t g0, g1, g2, g3, g4;
+ uint32_t b, nb;
+
+ if (state->buf_used)
+ poly1305_update(state, state->buf, state->buf_used);
+
+ b = state->h0 >> 26;
+ state->h0 = state->h0 & 0x3ffffff;
+ state->h1 += b;
+ b = state->h1 >> 26;
+ state->h1 = state->h1 & 0x3ffffff;
+ state->h2 += b;
+ b = state->h2 >> 26;
+ state->h2 = state->h2 & 0x3ffffff;
+ state->h3 += b;
+ b = state->h3 >> 26;
+ state->h3 = state->h3 & 0x3ffffff;
+ state->h4 += b;
+ b = state->h4 >> 26;
+ state->h4 = state->h4 & 0x3ffffff;
+ state->h0 += b * 5;
+
+ g0 = state->h0 + 5;
+ b = g0 >> 26;
+ g0 &= 0x3ffffff;
+ g1 = state->h1 + b;
+ b = g1 >> 26;
+ g1 &= 0x3ffffff;
+ g2 = state->h2 + b;
+ b = g2 >> 26;
+ g2 &= 0x3ffffff;
+ g3 = state->h3 + b;
+ b = g3 >> 26;
+ g3 &= 0x3ffffff;
+ g4 = state->h4 + b - (1 << 26);
+
+ b = (g4 >> 31) - 1;
+ nb = ~b;
+ state->h0 = (state->h0 & nb) | (g0 & b);
+ state->h1 = (state->h1 & nb) | (g1 & b);
+ state->h2 = (state->h2 & nb) | (g2 & b);
+ state->h3 = (state->h3 & nb) | (g3 & b);
+ state->h4 = (state->h4 & nb) | (g4 & b);
+
+ f0 = ((state->h0) | (state->h1 << 26)) + (uint64_t)U8TO32_LE(&state->key[0]);
+ f1 = ((state->h1 >> 6) | (state->h2 << 20)) +
+ (uint64_t)U8TO32_LE(&state->key[4]);
+ f2 = ((state->h2 >> 12) | (state->h3 << 14)) +
+ (uint64_t)U8TO32_LE(&state->key[8]);
+ f3 = ((state->h3 >> 18) | (state->h4 << 8)) +
+ (uint64_t)U8TO32_LE(&state->key[12]);
+
+ U32TO8_LE(&mac[0], f0);
+ f1 += (f0 >> 32);
+ U32TO8_LE(&mac[4], f1);
+ f2 += (f1 >> 32);
+ U32TO8_LE(&mac[8], f2);
+ f3 += (f2 >> 32);
+ U32TO8_LE(&mac[12], f3);
+}
+
diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c
index 1925428..3446c6a 100644
--- a/crypto/cryptlib.c
+++ b/crypto/cryptlib.c
@@ -656,16 +656,6 @@ const char *CRYPTO_get_lock_name(int type)
extern unsigned int OPENSSL_ia32cap_P[4];
unsigned long *OPENSSL_ia32cap_loc(void)
{
- if (sizeof(long) == 4)
- /*
- * If 32-bit application pulls address of OPENSSL_ia32cap_P[0]
- * clear second element to maintain the illusion that vector
- * is 32-bit.
- */
- OPENSSL_ia32cap_P[1] = 0;
-
- OPENSSL_ia32cap_P[2] = 0;
-
return (unsigned long *)OPENSSL_ia32cap_P;
}
diff --git a/crypto/evp/Makefile b/crypto/evp/Makefile
index fa138d0..7eee6c7 100644
--- a/crypto/evp/Makefile
+++ b/crypto/evp/Makefile
@@ -29,7 +29,8 @@ LIBSRC= encode.c digest.c evp_enc.c evp_key.c evp_acnf.c evp_cnf.c \
c_all.c c_allc.c c_alld.c evp_lib.c bio_ok.c \
evp_pkey.c evp_pbe.c p5_crpt.c p5_crpt2.c \
e_old.c pmeth_lib.c pmeth_fn.c pmeth_gn.c m_sigver.c \
- e_aes_cbc_hmac_sha1.c e_aes_cbc_hmac_sha256.c e_rc4_hmac_md5.c
+ e_aes_cbc_hmac_sha1.c e_aes_cbc_hmac_sha256.c e_rc4_hmac_md5.c \
+ e_chacha20poly1305.c
LIBOBJ= encode.o digest.o evp_enc.o evp_key.o evp_acnf.o evp_cnf.o \
e_des.o e_bf.o e_idea.o e_des3.o e_camellia.o\
@@ -42,7 +43,8 @@ LIBOBJ= encode.o digest.o evp_enc.o evp_key.o evp_acnf.o evp_cnf.o \
c_all.o c_allc.o c_alld.o evp_lib.o bio_ok.o \
evp_pkey.o evp_pbe.o p5_crpt.o p5_crpt2.o \
e_old.o pmeth_lib.o pmeth_fn.o pmeth_gn.o m_sigver.o \
- e_aes_cbc_hmac_sha1.o e_aes_cbc_hmac_sha256.o e_rc4_hmac_md5.o
+ e_aes_cbc_hmac_sha1.o e_aes_cbc_hmac_sha256.o e_rc4_hmac_md5.o \
+ e_chacha20poly1305.o
SRC= $(LIBSRC)
@@ -264,6 +266,7 @@ e_cast.o: ../../include/openssl/objects.h ../../include/openssl/opensslconf.h
e_cast.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
e_cast.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
e_cast.o: ../../include/openssl/symhacks.h ../cryptlib.h e_cast.c evp_locl.h
+e_chacha20poly1305.o: ../../include/openssl/chacha20poly1305.h e_chacha20poly1305.c
e_des.o: ../../e_os.h ../../include/openssl/asn1.h ../../include/openssl/bio.h
e_des.o: ../../include/openssl/buffer.h ../../include/openssl/crypto.h
e_des.o: ../../include/openssl/des.h ../../include/openssl/des_old.h
diff --git a/crypto/evp/e_chacha20poly1305.c b/crypto/evp/e_chacha20poly1305.c
new file mode 100644
index 0000000..17f8cb4
--- /dev/null
+++ b/crypto/evp/e_chacha20poly1305.c
@@ -0,0 +1,435 @@
+/* ====================================================================
+ * Copyright (c) 2001-2014 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ */
+
+#include <openssl/opensslconf.h>
+#ifndef OPENSSL_NO_CHACHA_POLY
+#include <openssl/evp.h>
+#include <openssl/err.h>
+#include <openssl/chacha20poly1305.h>
+#include "evp_locl.h"
+#include <openssl/rand.h>
+
+#define FILL_BUFFER ((size_t)128)
+
+typedef struct {
+ uint8_t iv[12];
+ uint8_t nonce[48];
+ size_t aad_l;
+ size_t ct_l;
+ unsigned valid:1;
+ unsigned draft:1;
+ uint8_t poly_buffer[FILL_BUFFER];
+ uint8_t chacha_buffer[FILL_BUFFER];
+ uint16_t poly_buffer_used;
+ uint16_t chacha_used;
+#ifdef CHAPOLY_x86_64_ASM
+ void (*poly1305_init_ptr)(poly1305_state *, const uint8_t *);
+ void (*poly1305_update_ptr)(poly1305_state *, const uint8_t *, size_t);
+ void (*poly1305_finish_ptr)(poly1305_state *, uint8_t *);
+ poly1305_state poly_state;
+ #define poly_init aead_ctx->poly1305_init_ptr
+ #define poly_update poly1305_update_wrapper
+ #define poly_finish poly1305_finish_wrapper
+#else
+ #define poly_init CRYPTO_poly1305_init
+ #define poly_update(c,i,l) CRYPTO_poly1305_update(&c->poly_state,i,l)
+ #define poly_finish(c,m) CRYPTO_poly1305_finish(&c->poly_state,m)
+#endif
+} EVP_CHACHA20_POLY1305_CTX;
+
+
+#ifdef CHAPOLY_x86_64_ASM
+#include <immintrin.h>
+
+static void poly1305_update_wrapper(EVP_CHACHA20_POLY1305_CTX *ctx,
+ const uint8_t *in,
+ size_t in_len)
+{
+ int todo;
+ /* Attempt to fill as many bytes as possible before calling the update
+ function */
+ if (in_len < FILL_BUFFER || ctx->poly_buffer_used) {
+ todo = FILL_BUFFER - ctx->poly_buffer_used;
+ todo = in_len < todo? in_len : todo;
+ memcpy(ctx->poly_buffer + ctx->poly_buffer_used, in, todo);
+ ctx->poly_buffer_used += todo;
+ in += todo;
+ in_len -= todo;
+
+ if (ctx->poly_buffer_used == FILL_BUFFER) {
+ ctx->poly1305_update_ptr(&ctx->poly_state,
+ ctx->poly_buffer,
+ FILL_BUFFER);
+ ctx->poly_buffer_used = 0;
+ }
+ }
+
+ if (in_len >= FILL_BUFFER) {
+ ctx->poly1305_update_ptr(&ctx->poly_state, in, in_len & (-FILL_BUFFER));
+ in += in_len & (-FILL_BUFFER);
+ in_len &= (FILL_BUFFER - 1);
+ }
+
+ if (in_len) {
+ memcpy(ctx->poly_buffer, in, in_len);
+ ctx->poly_buffer_used = in_len;
+ }
+}
+
+
+static void poly1305_finish_wrapper(EVP_CHACHA20_POLY1305_CTX *ctx,
+ uint8_t mac[POLY1305_MAC_LEN])
+{
+ if (ctx->poly_buffer_used) {
+
+ if (ctx->poly_buffer_used % POLY1305_PAD_LEN) {
+ memset(ctx->poly_buffer + ctx->poly_buffer_used, 0,
+ POLY1305_PAD_LEN - (ctx->poly_buffer_used % POLY1305_PAD_LEN));
+ }
+
+ ctx->poly1305_update_ptr(&ctx->poly_state,
+ ctx->poly_buffer,
+ ctx->poly_buffer_used);
+ }
+
+ ctx->poly1305_finish_ptr(&ctx->poly_state, mac);
+ memset(ctx->poly_buffer, 0, FILL_BUFFER);
+}
+#endif
+
+
+#ifdef CHAPOLY_x86_64_ASM
+static void EVP_chacha20_poly1305_cpuid(EVP_CHACHA20_POLY1305_CTX *ctx)
+{
+ if ((OPENSSL_ia32cap_loc()[1] >> 5) & 1) { /* AVX2 */
+ ctx->poly1305_init_ptr = poly1305_init_x64; /* Lazy init */
+ ctx->poly1305_update_ptr = poly1305_update_avx2;
+ ctx->poly1305_finish_ptr = poly1305_finish_avx2;
+/*
+ } else if (0 && (OPENSSL_ia32cap_loc()[0] >> 60) & 1) { // AVX -disabled
+ ctx->poly1305_init_ptr = poly1305_init_avx;
+ ctx->poly1305_update_ptr = poly1305_update_avx;
+ ctx->poly1305_finish_ptr = poly1305_finish_avx;
+*/
+ } else { /* x64 code */
+ ctx->poly1305_init_ptr = poly1305_init_x64;
+ ctx->poly1305_update_ptr = poly1305_update_x64;
+ ctx->poly1305_finish_ptr = poly1305_finish_x64;
+ }
+}
+#endif
+
+
+static int EVP_chacha20_poly1305_init_draft(EVP_CIPHER_CTX *ctx,
+ const unsigned char *key,
+ const unsigned char *iv,
+ int enc)
+{
+ EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data;
+ memcpy(aead_ctx->nonce, key, 32);
+ aead_ctx->valid = 0;
+ aead_ctx->draft = 1;
+
+#ifdef CHAPOLY_x86_64_ASM
+ EVP_chacha20_poly1305_cpuid(aead_ctx);
+#endif
+
+ return 1;
+}
+
+
+static int EVP_chacha20_poly1305_init(EVP_CIPHER_CTX *ctx,
+ const unsigned char *key,
+ const unsigned char *iv,
+ int enc)
+{
+ EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data;
+ memcpy(aead_ctx->nonce, key, 32);
+ memcpy(aead_ctx->iv, iv, 12);
+ aead_ctx->valid = 0;
+ aead_ctx->draft = 0;
+
+#ifdef CHAPOLY_x86_64_ASM
+ EVP_chacha20_poly1305_cpuid(aead_ctx);
+#endif
+
+ return 1;
+}
+
+
+static int EVP_chacha20_poly1305_cipher(EVP_CIPHER_CTX *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ size_t inl)
+{
+ EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data;
+ uint8_t poly_mac[POLY1305_MAC_LEN];
+ uint8_t zero[POLY1305_PAD_LEN] = {0};
+ uint64_t cmp;
+ int i, todo;
+
+ if (!aead_ctx->valid)
+ return 0;
+
+ if (inl < POLY1305_MAC_LEN)
+ return -1;
+
+ /* Fix for MAC */
+ inl -= POLY1305_MAC_LEN;
+
+ if (!ctx->encrypt) {
+ poly_update(aead_ctx, in, inl);
+ }
+
+ i = 0;
+ if (inl < 256) {
+ /* Consume the buffer we computed during poly initialization */
+ todo = inl > (FILL_BUFFER - aead_ctx->chacha_used) ?
+ FILL_BUFFER - aead_ctx->chacha_used :
+ inl;
+
+#ifdef CHAPOLY_x86_64_ASM
+ for (; i <= todo - 16; i+=16) {
+ _mm_storeu_si128((__m128i*)&out[i],
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)&in[i]),
+ _mm_loadu_si128((__m128i *)&aead_ctx->chacha_buffer[i + 64])));
+ }
+#endif
+ for (; i < todo; i++) {
+ out[i] = in[i] ^ aead_ctx->chacha_buffer[i + 64 /*aead_ctx->chacha_used*/];
+ }
+
+ } else {
+ /* For long messages don't use precomputed buffer */
+ ((uint64_t *)(aead_ctx->nonce))[4]--;
+ }
+
+ todo = inl - i;
+
+ if (todo) {
+ CRYPTO_chacha_20(&out[i], &in[i], todo, aead_ctx->nonce);
+ }
+
+ if (ctx->encrypt) {
+ poly_update(aead_ctx, out, inl);
+ }
+
+ aead_ctx->ct_l += inl;
+
+ if (!aead_ctx->draft) {
+ /* For RFC padd ciphertext with zeroes, then mac len(aad)||len(ct) */
+ todo = aead_ctx->ct_l % POLY1305_PAD_LEN ?
+ POLY1305_PAD_LEN - (aead_ctx->ct_l % POLY1305_PAD_LEN) :
+ 0;
+
+ if (todo) {
+ poly_update(aead_ctx, zero, todo);
+ }
+
+ poly_update(aead_ctx, (uint8_t*)&aead_ctx->aad_l, sizeof(uint64_t));
+ poly_update(aead_ctx, (uint8_t*)&aead_ctx->ct_l, sizeof(uint64_t));
+
+ } else {
+ /* For the draft don't pad, mac len(ct) */
+ poly_update(aead_ctx, (uint8_t*)&aead_ctx->ct_l, sizeof(uint64_t));
+ }
+ aead_ctx->valid = 0;
+
+ if (ctx->encrypt) {
+ poly_finish(aead_ctx, &out[inl]);
+ return inl + POLY1305_MAC_LEN;
+
+ } else { /* Decryption */
+ poly_finish(aead_ctx, poly_mac);
+ /* Constant time comparison */
+ cmp = (*(uint64_t *)(poly_mac)) ^ (*(uint64_t *)(in + inl));
+ cmp |= (*(uint64_t *)(poly_mac + 8)) ^ (*(uint64_t *)(in + inl + 8));
+
+ if (cmp) {
+ OPENSSL_cleanse(out, inl);
+ return -1;
+ }
+
+ return inl;
+ }
+}
+
+
+static int EVP_chacha20_poly1305_cleanup(EVP_CIPHER_CTX *ctx)
+{
+ return 1;
+}
+
+
+static int EVP_chacha20_poly1305_ctrl(EVP_CIPHER_CTX *ctx,
+ int type,
+ int arg,
+ void *ptr)
+{
+ EVP_CHACHA20_POLY1305_CTX *aead_ctx = ctx->cipher_data;
+ uint8_t aad[EVP_AEAD_TLS1_AAD_LEN + 8];
+ uint64_t thirteen = EVP_AEAD_TLS1_AAD_LEN;
+
+ switch (type) {
+ case EVP_CTRL_AEAD_TLS1_AAD:
+
+ if (arg != EVP_AEAD_TLS1_AAD_LEN)
+ return 0;
+
+ /* Initialize poly keys */
+ memset(aead_ctx->chacha_buffer, 0, FILL_BUFFER);
+
+ if (!aead_ctx->draft) {
+ /* RFC IV = (0 || iv) ^ seq_num */
+ memset(aead_ctx->nonce + 32, 0, 4);
+ memcpy(aead_ctx->nonce + 36, aead_ctx->iv, 12);
+ *(uint64_t *)(aead_ctx->nonce + 40) ^= *(uint64_t *)(ptr);
+
+ } else {
+ /* draft IV = 0 || seq_num */
+ memset(aead_ctx->nonce + 32, 0, 8);
+ memcpy(aead_ctx->nonce + 40, ptr, 8);
+ }
+ /* Poly keys = ENC(0) */
+ CRYPTO_chacha_20(aead_ctx->chacha_buffer,
+ aead_ctx->chacha_buffer,
+ FILL_BUFFER,
+ aead_ctx->nonce);
+
+ poly_init(&aead_ctx->poly_state, aead_ctx->chacha_buffer);
+
+ aead_ctx->chacha_used = 64;
+ aead_ctx->poly_buffer_used = 0;
+ aead_ctx->aad_l = arg;
+ aead_ctx->ct_l = 0;
+
+ /* Absorb AAD */
+ memcpy(aad, ptr, arg);
+ memset(aad + arg, 0, sizeof(aad) - arg);
+
+ /* If decrypting fix length for tag */
+ if (!ctx->encrypt) {
+ unsigned int len = (aad[arg-2] << 8) | aad[arg-1];
+ len -= POLY1305_MAC_LEN;
+ aad[arg-2] = len>>8;
+ aad[arg-1] = len & 0xff;
+ }
+
+ if (!aead_ctx->draft) {
+ /* In the RFC, AAD is padded with zeroes */
+ poly_update(aead_ctx, aad, POLY1305_PAD_LEN);
+
+ } else {
+ /* In the draft AAD is followed by len(AAD) */
+ memcpy(&aad[arg], &thirteen, sizeof(thirteen));
+ poly_update(aead_ctx, aad, arg + sizeof(thirteen));
+ }
+
+ aead_ctx->valid = 1;
+ return POLY1305_MAC_LEN;
+
+ break;
+
+ default:
+ return 0;
+ break;
+ }
+
+ return 0;
+}
+
+
+#define CUSTOM_FLAGS (\
+ EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \
+ | EVP_CIPH_ALWAYS_CALL_INIT \
+ | EVP_CIPH_CUSTOM_COPY)
+
+
+static const EVP_CIPHER chacha20_poly1305_d = {
+ 0, /* nid ??? */
+ 1, /* block size, sorta */
+ 32, /* key len */
+ 0, /* iv len */
+ CUSTOM_FLAGS|EVP_CIPH_FLAG_AEAD_CIPHER, /* flags */
+ EVP_chacha20_poly1305_init_draft,
+ EVP_chacha20_poly1305_cipher,
+ EVP_chacha20_poly1305_cleanup,
+ sizeof(EVP_CHACHA20_POLY1305_CTX), /* ctx size */
+ NULL,
+ NULL,
+ EVP_chacha20_poly1305_ctrl,
+ NULL
+ };
+
+
+static const EVP_CIPHER chacha20_poly1305 = {
+ 0, /* nid ??? */
+ 1, /* block size, sorta */
+ 32, /* key len */
+ 12, /* iv len */
+ CUSTOM_FLAGS|EVP_CIPH_FLAG_AEAD_CIPHER, /* flags */
+ EVP_chacha20_poly1305_init,
+ EVP_chacha20_poly1305_cipher,
+ EVP_chacha20_poly1305_cleanup,
+ sizeof(EVP_CHACHA20_POLY1305_CTX), /* ctx size */
+ NULL,
+ NULL,
+ EVP_chacha20_poly1305_ctrl,
+ NULL
+ };
+
+
+const EVP_CIPHER *EVP_chacha20_poly1305_draft(void)
+{ return &chacha20_poly1305_d; }
+
+
+const EVP_CIPHER *EVP_chacha20_poly1305(void)
+{ return &chacha20_poly1305; }
+#endif
diff --git a/crypto/evp/evp.h b/crypto/evp/evp.h
index 39ab793..53ed671 100644
--- a/crypto/evp/evp.h
+++ b/crypto/evp/evp.h
@@ -893,6 +893,10 @@ const EVP_CIPHER *EVP_camellia_256_cfb128(void);
# define EVP_camellia_256_cfb EVP_camellia_256_cfb128
const EVP_CIPHER *EVP_camellia_256_ofb(void);
# endif
+# ifndef OPENSSL_NO_CHACHA_POLY
+const EVP_CIPHER *EVP_chacha20_poly1305(void);
+const EVP_CIPHER *EVP_chacha20_poly1305_draft(void);
+# endif
# ifndef OPENSSL_NO_SEED
const EVP_CIPHER *EVP_seed_ecb(void);
diff --git a/ssl/s3_lib.c b/ssl/s3_lib.c
index 0385e03..a29f325 100644
--- a/ssl/s3_lib.c
+++ b/ssl/s3_lib.c
@@ -2945,6 +2945,111 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[] = {
256},
#endif
+#if !defined(OPENSSL_NO_CHACHA_POLY)
+ /* Draft ciphers */
+ {
+ 1,
+ TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305_D,
+ TLS1_CK_ECDHE_RSA_WITH_CHACHA20_POLY1305_D,
+ SSL_kEECDH,
+ SSL_aRSA,
+ SSL_CHACHA20POLY1305_D,
+ SSL_AEAD,
+ SSL_TLSV1_2,
+ SSL_HIGH,
+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256,
+ 256,
+ 256},
+ {
+ 1,
+ TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_D,
+ TLS1_CK_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_D,
+ SSL_kEECDH,
+ SSL_aECDSA,
+ SSL_CHACHA20POLY1305_D,
+ SSL_AEAD,
+ SSL_TLSV1_2,
+ SSL_HIGH,
+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256,
+ 256,
+ 256},
+ {
+ 1,
+ TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305_D,
+ TLS1_CK_DHE_RSA_WITH_CHACHA20_POLY1305_D,
+ SSL_kEDH,
+ SSL_aRSA,
+ SSL_CHACHA20POLY1305_D,
+ SSL_AEAD,
+ SSL_TLSV1_2,
+ SSL_HIGH,
+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256,
+ 256,
+ 256},
+ /* RFC ciphers */
+ /* Cipher CCA8 as per draft-ietf-tls-chacha20-poly1305-03 */
+ {
+ 1,
+ TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305,
+ TLS1_CK_ECDHE_RSA_WITH_CHACHA20_POLY1305,
+ SSL_kECDHE,
+ SSL_aRSA,
+ SSL_CHACHA20POLY1305,
+ SSL_AEAD,
+ SSL_TLSV1_2,
+ SSL_HIGH,
+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256,
+ 256,
+ 256,
+ },
+ /* Cipher CCA9 */
+ {
+ 1,
+ TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,
+ TLS1_CK_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,
+ SSL_kECDHE,
+ SSL_aECDSA,
+ SSL_CHACHA20POLY1305,
+ SSL_AEAD,
+ SSL_TLSV1_2,
+ SSL_HIGH,
+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256,
+ 256,
+ 256,
+ },
+ /* Cipher CCAA */
+ {
+ 1,
+ TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305,
+ TLS1_CK_DHE_RSA_WITH_CHACHA20_POLY1305,
+ SSL_kDHE,
+ SSL_aRSA,
+ SSL_CHACHA20POLY1305,
+ SSL_AEAD,
+ SSL_TLSV1_2,
+ SSL_HIGH,
+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256,
+ 256,
+ 256,
+ },
+ /* Cipher CCAB */
+ {
+ 1,
+ TLS1_TXT_PSK_WITH_CHACHA20_POLY1305,
+ TLS1_CK_PSK_WITH_CHACHA20_POLY1305,
+ SSL_kPSK,
+ SSL_aPSK,
+ SSL_CHACHA20POLY1305,
+ SSL_AEAD,
+ SSL_TLSV1_2,
+ SSL_HIGH,
+ SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256,
+ 256,
+ 256,
+ },
+#endif
+
+
/* end of list */
};
@@ -4090,6 +4195,7 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt,
int i, ii, ok;
CERT *cert;
unsigned long alg_k, alg_a, mask_k, mask_a, emask_k, emask_a;
+ int use_chacha = 0;
/* Let's see which ciphers we can support */
cert = s->cert;
@@ -4123,9 +4229,17 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt,
if (s->options & SSL_OP_CIPHER_SERVER_PREFERENCE || tls1_suiteb(s)) {
prio = srvr;
allow = clnt;
+ /* Use ChaCha20+Poly1305 iff it's client's most preferred cipher suite */
+ if (sk_SSL_CIPHER_num(clnt) > 0) {
+ c = sk_SSL_CIPHER_value(clnt, 0);
+ if (c->algorithm_enc == SSL_CHACHA20POLY1305 ||
+ c->algorithm_enc == SSL_CHACHA20POLY1305_D)
+ use_chacha = 1;
+ }
} else {
prio = clnt;
allow = srvr;
+ use_chacha = 1;
}
tls1_set_cert_validity(s);
@@ -4137,6 +4251,11 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt,
if ((c->algorithm_ssl & SSL_TLSV1_2) && !SSL_USE_TLS1_2_CIPHERS(s))
continue;
+ /* Skip ChaCha unless top client priority */
+ if ((c->algorithm_enc == SSL_CHACHA20POLY1305 ||
+ c->algorithm_enc == SSL_CHACHA20POLY1305_D) && !use_chacha)
+ continue;
+
ssl_set_cert_masks(cert, c);
mask_k = cert->mask_k;
mask_a = cert->mask_a;
diff --git a/ssl/ssl.h b/ssl/ssl.h
index 90aeb0c..dc1cde2 100644
--- a/ssl/ssl.h
+++ b/ssl/ssl.h
@@ -294,6 +294,8 @@ extern "C" {
# define SSL_TXT_AES256 "AES256"
# define SSL_TXT_AES "AES"
# define SSL_TXT_AES_GCM "AESGCM"
+# define SSL_TXT_CHACHA20_D "CHACHA20-draft"
+# define SSL_TXT_CHACHA20 "CHACHA20"
# define SSL_TXT_CAMELLIA128 "CAMELLIA128"
# define SSL_TXT_CAMELLIA256 "CAMELLIA256"
# define SSL_TXT_CAMELLIA "CAMELLIA"
diff --git a/ssl/ssl_ciph.c b/ssl/ssl_ciph.c
index 2ad8f43..09a162a 100644
--- a/ssl/ssl_ciph.c
+++ b/ssl/ssl_ciph.c
@@ -150,25 +150,27 @@
#endif
#include "ssl_locl.h"
-#define SSL_ENC_DES_IDX 0
-#define SSL_ENC_3DES_IDX 1
-#define SSL_ENC_RC4_IDX 2
-#define SSL_ENC_RC2_IDX 3
-#define SSL_ENC_IDEA_IDX 4
-#define SSL_ENC_NULL_IDX 5
-#define SSL_ENC_AES128_IDX 6
-#define SSL_ENC_AES256_IDX 7
-#define SSL_ENC_CAMELLIA128_IDX 8
-#define SSL_ENC_CAMELLIA256_IDX 9
-#define SSL_ENC_GOST89_IDX 10
-#define SSL_ENC_SEED_IDX 11
-#define SSL_ENC_AES128GCM_IDX 12
-#define SSL_ENC_AES256GCM_IDX 13
-#define SSL_ENC_NUM_IDX 14
+#define SSL_ENC_DES_IDX 0
+#define SSL_ENC_3DES_IDX 1
+#define SSL_ENC_RC4_IDX 2
+#define SSL_ENC_RC2_IDX 3
+#define SSL_ENC_IDEA_IDX 4
+#define SSL_ENC_NULL_IDX 5
+#define SSL_ENC_AES128_IDX 6
+#define SSL_ENC_AES256_IDX 7
+#define SSL_ENC_CAMELLIA128_IDX 8
+#define SSL_ENC_CAMELLIA256_IDX 9
+#define SSL_ENC_GOST89_IDX 10
+#define SSL_ENC_SEED_IDX 11
+#define SSL_ENC_AES128GCM_IDX 12
+#define SSL_ENC_AES256GCM_IDX 13
+#define SSL_ENC_CHACHA20POLY1305_DRAFT_IDX 14
+#define SSL_ENC_CHACHA20POLY1305_IDX 15
+#define SSL_ENC_NUM_IDX 16
static const EVP_CIPHER *ssl_cipher_methods[SSL_ENC_NUM_IDX] = {
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL
+ NULL, NULL, NULL, NULL
};
#define SSL_COMP_NULL_IDX 0
@@ -362,6 +364,9 @@ static const SSL_CIPHER cipher_aliases[] = {
{0, SSL3_TXT_DHE_RSA_DES_192_CBC3_SHA, 0,
SSL_kDHE, SSL_aRSA, SSL_3DES, SSL_SHA1, SSL_SSLV3,
SSL_NOT_EXP | SSL_HIGH | SSL_FIPS, 0, 0, 0,},
+
+ {0, SSL_TXT_CHACHA20_D, 0, 0, 0, SSL_CHACHA20POLY1305_D, 0, 0, 0, 0, 0, 0},
+ {0, SSL_TXT_CHACHA20, 0, 0, 0, SSL_CHACHA20POLY1305, 0, 0, 0, 0, 0, 0},
};
/*
@@ -431,6 +436,11 @@ void ssl_load_ciphers(void)
ssl_cipher_methods[SSL_ENC_AES256GCM_IDX] =
EVP_get_cipherbyname(SN_aes_256_gcm);
+ ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_DRAFT_IDX] =
+ EVP_chacha20_poly1305_draft();
+ ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_IDX] =
+ EVP_chacha20_poly1305();
+
ssl_digest_methods[SSL_MD_MD5_IDX] = EVP_get_digestbyname(SN_md5);
ssl_mac_secret_size[SSL_MD_MD5_IDX] =
EVP_MD_size(ssl_digest_methods[SSL_MD_MD5_IDX]);
@@ -581,6 +591,12 @@ int ssl_cipher_get_evp(const SSL_SESSION *s, const EVP_CIPHER **enc,
case SSL_AES256GCM:
i = SSL_ENC_AES256GCM_IDX;
break;
+ case SSL_CHACHA20POLY1305_D:
+ i = SSL_ENC_CHACHA20POLY1305_DRAFT_IDX;
+ break;
+ case SSL_CHACHA20POLY1305:
+ i = SSL_ENC_CHACHA20POLY1305_IDX;
+ break;
default:
i = -1;
break;
@@ -796,6 +812,12 @@ static void ssl_cipher_get_disabled(unsigned long *mkey, unsigned long *auth,
(ssl_cipher_methods[SSL_ENC_AES256GCM_IDX] ==
NULL) ? SSL_AES256GCM : 0;
*enc |=
+ (ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_DRAFT_IDX] ==
+ NULL) ? SSL_CHACHA20POLY1305_D : 0;
+ *enc |=
+ (ssl_cipher_methods[SSL_ENC_CHACHA20POLY1305_IDX] ==
+ NULL) ? SSL_CHACHA20POLY1305 : 0;
+ *enc |=
(ssl_cipher_methods[SSL_ENC_CAMELLIA128_IDX] ==
NULL) ? SSL_CAMELLIA128 : 0;
*enc |=
@@ -1812,6 +1834,12 @@ char *SSL_CIPHER_description(const SSL_CIPHER *cipher, char *buf, int len)
case SSL_AES256GCM:
enc = "AESGCM(256)";
break;
+ case SSL_CHACHA20POLY1305_D:
+ enc = "ChaCha20-Poly1305-draft";
+ break;
+ case SSL_CHACHA20POLY1305:
+ enc = "ChaCha20-Poly1305";
+ break;
case SSL_CAMELLIA128:
enc = "Camellia(128)";
break;
diff --git a/ssl/ssl_locl.h b/ssl/ssl_locl.h
index 6df725f..5e2fbe4 100644
--- a/ssl/ssl_locl.h
+++ b/ssl/ssl_locl.h
@@ -354,6 +354,8 @@
# define SSL_SEED 0x00000800L
# define SSL_AES128GCM 0x00001000L
# define SSL_AES256GCM 0x00002000L
+# define SSL_CHACHA20POLY1305_D 0x00004000L
+# define SSL_CHACHA20POLY1305 0x00080000U /* Value from openssl */
# define SSL_AES (SSL_AES128|SSL_AES256|SSL_AES128GCM|SSL_AES256GCM)
# define SSL_CAMELLIA (SSL_CAMELLIA128|SSL_CAMELLIA256)
diff --git a/ssl/tls1.h b/ssl/tls1.h
index 7e237d0..fb0c981 100644
--- a/ssl/tls1.h
+++ b/ssl/tls1.h
@@ -563,6 +563,20 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
# define TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256 0x0300C031
# define TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384 0x0300C032
+/* ChaCha20-Poly1305 ciphersuites draft-agl-tls-chacha20poly1305-01 */
+# define TLS1_CK_ECDHE_RSA_WITH_CHACHA20_POLY1305_D 0x0300CC13
+# define TLS1_CK_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_D 0x0300CC14
+# define TLS1_CK_DHE_RSA_WITH_CHACHA20_POLY1305_D 0x0300CC15
+
+/* ChaCha20-Poly1305 ciphersuites from RFC */
+# define TLS1_CK_ECDHE_RSA_WITH_CHACHA20_POLY1305 0x0300CCA8
+# define TLS1_CK_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 0x0300CCA9
+# define TLS1_CK_DHE_RSA_WITH_CHACHA20_POLY1305 0x0300CCAA
+# define TLS1_CK_PSK_WITH_CHACHA20_POLY1305 0x0300CCAB
+# define TLS1_CK_ECDHE_PSK_WITH_CHACHA20_POLY1305 0x0300CCAC
+# define TLS1_CK_DHE_PSK_WITH_CHACHA20_POLY1305 0x0300CCAD
+# define TLS1_CK_RSA_PSK_WITH_CHACHA20_POLY1305 0x0300CCAE
+
/*
* XXX * Backward compatibility alert: + * Older versions of OpenSSL gave
* some DHE ciphers names with "EDH" + * instead of "DHE". Going forward, we
@@ -713,6 +727,20 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
# define TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256 "ECDH-RSA-AES128-GCM-SHA256"
# define TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384 "ECDH-RSA-AES256-GCM-SHA384"
+/* ChaCha20-Poly1305 ciphersuites draft-agl-tls-chacha20poly1305-01 */
+# define TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305_D "ECDHE-RSA-CHACHA20-POLY1305-D"
+# define TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_D "ECDHE-ECDSA-CHACHA20-POLY1305-D"
+# define TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305_D "DHE-RSA-CHACHA20-POLY1305-D"
+
+/* Chacha20-Poly1305 ciphersuites from RFC */
+# define TLS1_TXT_ECDHE_RSA_WITH_CHACHA20_POLY1305 "ECDHE-RSA-CHACHA20-POLY1305"
+# define TLS1_TXT_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 "ECDHE-ECDSA-CHACHA20-POLY1305"
+# define TLS1_TXT_DHE_RSA_WITH_CHACHA20_POLY1305 "DHE-RSA-CHACHA20-POLY1305"
+# define TLS1_TXT_PSK_WITH_CHACHA20_POLY1305 "PSK-CHACHA20-POLY1305"
+# define TLS1_TXT_ECDHE_PSK_WITH_CHACHA20_POLY1305 "ECDHE-PSK-CHACHA20-POLY1305"
+# define TLS1_TXT_DHE_PSK_WITH_CHACHA20_POLY1305 "DHE-PSK-CHACHA20-POLY1305"
+# define TLS1_TXT_RSA_PSK_WITH_CHACHA20_POLY1305 "RSA-PSK-CHACHA20-POLY1305"
+
# define TLS_CT_RSA_SIGN 1
# define TLS_CT_DSS_SIGN 2
# define TLS_CT_RSA_FIXED_DH 3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment