Skip to content

Instantly share code, notes, and snippets.

@rsms
Last active August 11, 2016 16:42
Show Gist options
  • Save rsms/6433149 to your computer and use it in GitHub Desktop.
Save rsms/6433149 to your computer and use it in GitHub Desktop.
Base64 decoder with constant performance at around 770 MB/s on a 2.8 GHz Intel Core i7 (with clang4.2 -O3 -std=c++11 -DNDEBUG) This gist also includes a 16-byte special-purposed base64 encoder (which can easily be modified to accept variable length input.)
static int base64_decode(unsigned char* dst, size_t dstsize, const char* src, size_t srclen) {
//
// On a 2.8 GHz Intel Core i7 and clang 4.2 -O3 -std=c++11 this was measured to perform well:
// Avg nsec/invocation: 32
// Avg invocations/second: 31 649 841
// Iterations: 10 000 000
// These numbers include C function call overhead (pusing and popping the stack) and are for
// dstsize=16, srclen=22 (tested on decoding a igraph_id). After measuring many different source
// sizes ranging from 16 to 4096, raw performance stays constant (at around 770 MB/s).
//
size_t len = srclen;
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wchar-subscripts"
// We ignore "array subscript is of type 'char'" warnings as we know its safe
for (size_t i = 0, resi = 0;; ) {
size_t n = len - i;
assert(n);
if (n > 4) {
const long b11000000 = bitset(long(2),6);
dst[resi++] = (BASE64_VALUES[src[i++]] << 2) | (BASE64_VALUES[src[i]] >> 4);
dst[resi++] = (BASE64_VALUES[src[i++]] << 4) | (BASE64_VALUES[src[i]] >> 2);
dst[resi++] = ((BASE64_VALUES[src[i++]] << 6) & b11000000) | BASE64_VALUES[src[i++]];
} else {
if (resi + (n-1) > dstsize) {
// Destination is not big enough
return 0;
}
dst[resi] = BASE64_VALUES[src[i]] << 2;
if (n > 1) {
dst[resi++] |= BASE64_VALUES[src[++i]] >> 4;
if (n == 3) {
dst[resi++] = (BASE64_VALUES[src[i++]] << 4) | (BASE64_VALUES[src[i]] >> 2);
}
}
break;
}
}
#pragma GCC diagnostic pop
return 1;
}
__ZL13base64_decodePhmPKcm:
pushq %rbp
movq %rsp, %rbp
leaq 2(%rdi), %r10
movl $32, %r9d
leaq L_.str+3(%rip), %rdx
leaq __ZL13BASE64_VALUES(%rip), %r8
.align 4, 0x90
LBB1_1: ## =>This Inner Loop Header: Depth=1
movsbq -3(%rdx), %rax
movzbl (%rax,%r8), %ecx
shll $2, %ecx
movsbq -2(%rdx), %rax
movsbl (%rax,%r8), %eax
movl %eax, %esi
shrl $4, %esi
orl %ecx, %esi
movb %sil, -2(%r10)
shll $4, %eax
addq $-4, %r9
movsbq -1(%rdx), %rcx
movsbl (%rcx,%r8), %ecx
movl %ecx, %esi
shrl $2, %esi
orl %eax, %esi
movb %sil, -1(%r10)
shlb $6, %cl
movsbq (%rdx), %rax
leaq 4(%rdx), %rdx
orb (%rax,%r8), %cl
cmpq $4, %r9
movb %cl, (%r10)
leaq 3(%r10), %r10
ja LBB1_1
## BB#2:
movb $0, 21(%rdi)
popq %rbp
ret
typedef struct{unsigned char value[16];} B16;
void base64_encode_B16(B16 id, char buf[22]) {
// Base64 encoding is pretty simple, with a 6-bit stride and a repeating pattern every 3 bytes:
// ••••••|•• ••••|•••• ••|••••••| repeat
// --B-- 01234567 01234567 01234567 01234567 012345 01 2345 0123 45 012345
// 0..3 •••••••• •••••••• •••••••• •••••••• ••••••|•• ••••|•••• ••|••••••| ••••••|••
// 4..7 •••••••• •••••••• •••••••• •••••••• ••••|•••• ••|••••••| ••••••|•• ••••|••••
// 8..11 •••••••• •••••••• •••••••• •••••••• ••|••••••| ••••••|•• ••••|•••• ••|••••••
// 12..15 •••••••• •••••••• •••••••• •••••••• |••••••|•• ••••|•••• ••|••••••| ••••••••
//
// Bits in chunks: 012345 01 2345 0123 45 012345
// Chunk #: —— 0 —|—— 1 ——|—— 2 ——|—— 3 —|
// Chunk mask: c0 c1a c1b c2a c2b=c1a c3=c0
// 0..3 ••••••|•• ••••|•••• ••|••••••| Pass 1
// 4..7 ••••••|•• ••••|•••• ••|••••••| Pass 2
// 8..11 ••••••|•• ••••|•••• ••|••••••| Pass 3
// 12..15 ••••••|•• ••••|•••• ••|••••••| Pass 4
// 16..19 ••••••|•• ••••|•••• ••|••••••| Pass 5
// 20..21 ••••••|•• xxxx| Pass 6
// 22..23 xxxx xx|xxxxxx|
//
// On a 2.8 GHz Intel Core i7 and clang 4.2 -O3 -std=c++11 this was measured to perform well:
// Avg nsec/invocation: 39
// Avg invocations/second: 25 372 038
// Iterations: 10 000 000
// These numbers include C function call overhead (pusing and popping the stack).
// Basically this function becomes a sequence of a few logical instructions like SHL, AND, etc
// with a single CMP and a single JUMP (speaking of x86 here).
//
const long b00000011 = bitset(2, 0);
const long b11110000 = bitset(4, 4);
const long b00001111 = bitset(4, 0);
const long b11000000 = bitset(2, 6);
const long b00111111 = bitset(6, 0);
char* p = buf;
for (size_t srci = 0; ; srci += 3) {
long src0 = id.value[srci]; // Upsize from i8 to long (e.g. MOVZB)
*p++ = BASE64_CHARS[src0 >> 2];
// chunk0 = (mask src[0] 00111111)
if (srci == 15) {
// Last pass is partial
*p = BASE64_CHARS[(src0 & b00000011) << 4];
return;
}
long src1 = id.value[srci+1];
*p++ = BASE64_CHARS[ ((src0 & b00000011) << 4) | ((src1 & b11110000) >> 4) ];
// chunk1 = (combine
// (bit-downsize 4 (mask src[0] 00000011)) #-> 00110000
// (bit-upsize 4 (mask src[1] 11110000)) #-> ????1111
// )
long src2 = id.value[srci+2];
*p++ = BASE64_CHARS[ ((src1 & b00001111) << 2) | ((src2 & b11000000) >> 6) ];
*p++ = BASE64_CHARS[src2 & b00111111];
}
}
__ZL13base64_decodePhmPKcm:
pushq %rbp
movq %rsp, %rbp
leaq 2(%rdi), %r10
movl $32, %r9d
leaq L_.str+3(%rip), %rdx
leaq __ZL13BASE64_VALUES(%rip), %r8
.align 4, 0x90
LBB1_1: ## =>This Inner Loop Header: Depth=1
movsbq -3(%rdx), %rax
movzbl (%rax,%r8), %ecx
shll $2, %ecx
movsbq -2(%rdx), %rax
movsbl (%rax,%r8), %eax
movl %eax, %esi
shrl $4, %esi
orl %ecx, %esi
movb %sil, -2(%r10)
shll $4, %eax
addq $-4, %r9
movsbq -1(%rdx), %rcx
movsbl (%rcx,%r8), %ecx
movl %ecx, %esi
shrl $2, %esi
orl %eax, %esi
movb %sil, -1(%r10)
shlb $6, %cl
movsbq (%rdx), %rax
leaq 4(%rdx), %rdx
orb (%rax,%r8), %cl
cmpq $4, %r9
movb %cl, (%r10)
leaq 3(%r10), %r10
ja LBB1_1
## BB#2:
movb $0, 21(%rdi)
popq %rbp
ret
template <typename T, typename X> inline static T bitset(T n1, X n0) {
// Returns a value with specific bits set:
// n1 set bits followed by n0 unset bits
// bitset(char(4), 2) -> 00111100
return ~(~0 << n1) << n0;
// As seen in "Bitwise Operators" of "The C Programming Language, second edition", and
// "Space Efficiency" of "The Practice of Programming".
}
static const char BASE64_CHARS[] = {
'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F',
'G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V',
'W','X','Y','Z','a','b','c','d','e','f','g','h','i','j','k','l',
'm','n','o','p','q','r','s','t','u','v','w','x','y','z','-','_'};
static const char BASE64_VALUES[] = {
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,-1,-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,
-1,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,-1,-1,-1,-1,63,
-1,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1};
// See https://gist.github.com/rsms/6418070 for using different mappings
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment