Last active
July 11, 2018 15:03
-
-
Save bluetech/36ac1d0b21864a4f42fa723de569e5f8 to your computer and use it in GitHub Desktop.
WebSocket masking comparison
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// gcc -O3 -march=native mask.c -o mask
#include <assert.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <time.h> | |
#include <immintrin.h> | |
// Byte-at-a-time masking (the approach used by wsaccel): XOR each of the
// first `len` bytes of `data`, in place, with the mask byte whose index is
// the byte's position modulo 4.
static void mask_8_by_8(uint8_t *data, size_t len, const uint8_t mask[4]) {
    size_t pos = 0;
    while (pos < len) {
        const uint8_t key = mask[pos % 4];
        data[pos] = data[pos] ^ key;
        pos++;
    }
}
// Masks `data` in place one 32-bit word at a time, then finishes the
// trailing `len % 4` bytes with mask_8_by_8.
//
// `data` must be 4-byte aligned (asserted). `mask` needs no particular
// alignment.
static void mask_32_by_32(uint8_t *data, size_t len, const uint8_t mask[4]) {
    // For simplicity we assume that `data` is already 4-bytes aligned.
    // Real general code should handle the case where it's not aligned.
    assert((uintptr_t) data % 4 == 0);
    uint32_t *data32 = (uint32_t *) data;

    // Build the word-sized mask with memcpy. Dereferencing
    // `(uint32_t *) mask` would read a byte array through an incompatible
    // and possibly misaligned pointer (undefined behavior), and it would
    // also cast away `const`.
    uint32_t mask32;
    memcpy(&mask32, mask, sizeof mask32);

    const size_t words = len / 4;
    for (size_t i = 0; i < words; i++) {
        data32[i] ^= mask32;
    }

    // The tail begins at a multiple of 4 bytes, so it starts again at
    // mask[0], which is exactly what mask_8_by_8 does.
    const size_t tail = len % 4;
    mask_8_by_8(data + len - tail, tail, mask);
}
// Masks `data` in place one 64-bit word at a time, then finishes the
// trailing `len % 8` bytes with mask_8_by_8.
//
// `data` must be 8-byte aligned (asserted). `mask` needs no particular
// alignment.
static void mask_64_by_64(uint8_t *data, size_t len, const uint8_t mask[4]) {
    // For simplicity we assume that `data` is already 8-bytes aligned.
    // Real general code should handle the case where it's not aligned.
    assert((uintptr_t) data % 8 == 0);
    uint64_t *data64 = (uint64_t *) data;

    // Build the 32-bit mask with memcpy. Dereferencing `(uint32_t *) mask`
    // would read a byte array through an incompatible and possibly
    // misaligned pointer (undefined behavior), and would cast away `const`.
    uint32_t mask32;
    memcpy(&mask32, mask, sizeof mask32);

    // Repeat the 4-byte pattern in both halves of the 64-bit word. Because
    // both halves are identical, the result is the same on any byte order.
    const uint64_t mask64 = ((uint64_t) mask32 << 32) | mask32;

    const size_t words = len / 8;
    for (size_t i = 0; i < words; i++) {
        data64[i] ^= mask64;
    }

    // The tail begins at a multiple of 8 (hence of 4) bytes, so it starts
    // again at mask[0], which is exactly what mask_8_by_8 does.
    const size_t tail = len % 8;
    mask_8_by_8(data + len - tail, tail, mask);
}
// Masks `data` in place 16 bytes at a time using 128-bit SSE2 integer
// intrinsics, then finishes the trailing `len % 16` bytes with mask_8_by_8.
//
// `data` must be 16-byte aligned (asserted): the aligned load/store
// intrinsics used below require it. `mask` needs no particular alignment.
static void mask_128_by_128(uint8_t *data, size_t len, const uint8_t mask[4]) {
    // For simplicity we assume that `data` is already 16-bytes aligned.
    // Real general code should handle the case where it's not aligned.
    assert((uintptr_t) data % 16 == 0);
    __m128i *data128 = (__m128i *) data;

    // Build the 32-bit mask with memcpy. Dereferencing `(uint32_t *) mask`
    // would read a byte array through an incompatible and possibly
    // misaligned pointer (undefined behavior), and would cast away `const`.
    uint32_t mask32;
    memcpy(&mask32, mask, sizeof mask32);

    // Broadcast the 4-byte pattern to all four 32-bit lanes. The intrinsic
    // takes an `int`, so make the conversion explicit.
    const __m128i mask128 = _mm_set1_epi32((int) mask32);

    const size_t blocks = len / 16;
    for (size_t i = 0; i < blocks; i++) {
        const __m128i chunk = _mm_load_si128(data128 + i);
        _mm_store_si128(data128 + i, _mm_xor_si128(chunk, mask128));
    }

    // The tail begins at a multiple of 16 (hence of 4) bytes, so it starts
    // again at mask[0], which is exactly what mask_8_by_8 does.
    const size_t tail = len % 16;
    mask_8_by_8(data + len - tail, tail, mask);
}
// Masks `data` in place 32 bytes at a time using 256-bit AVX2 integer
// intrinsics, then finishes the trailing `len % 32` bytes with mask_8_by_8.
//
// `data` must be 32-byte aligned (asserted): the aligned load/store
// intrinsics used below require it. `mask` needs no particular alignment.
static void mask_256_by_256(uint8_t *data, size_t len, const uint8_t mask[4]) {
    // For simplicity we assume that `data` is already 32-bytes aligned.
    // Real general code should handle the case where it's not aligned.
    assert((uintptr_t) data % 32 == 0);
    __m256i *data256 = (__m256i *) data;

    // Build the 32-bit mask with memcpy. Dereferencing `(uint32_t *) mask`
    // would read a byte array through an incompatible and possibly
    // misaligned pointer (undefined behavior), and would cast away `const`.
    uint32_t mask32;
    memcpy(&mask32, mask, sizeof mask32);

    // Broadcast the 4-byte pattern to all eight 32-bit lanes. The intrinsic
    // takes an `int`, so make the conversion explicit.
    const __m256i mask256 = _mm256_set1_epi32((int) mask32);

    const size_t blocks = len / 32;
    for (size_t i = 0; i < blocks; i++) {
        const __m256i chunk = _mm256_load_si256(data256 + i);
        _mm256_store_si256(data256 + i, _mm256_xor_si256(chunk, mask256));
    }

    // The tail begins at a multiple of 32 (hence of 4) bytes, so it starts
    // again at mask[0], which is exactly what mask_8_by_8 does.
    const size_t tail = len % 32;
    mask_8_by_8(data + len - tail, tail, mask);
}
// Verification helper: asserts that every one of the first `len` bytes of
// `data` equals 0xff. Relies on assert(), so it checks nothing when the
// program is built with NDEBUG.
static void check(uint8_t *data, size_t len) {
    const uint8_t *byte = data;
    size_t remaining = len;
    while (remaining > 0) {
        assert(*byte == 0xff);
        byte++;
        remaining--;
    }
}
// Benchmark driver. For each masking routine it:
//   1. zeroes the buffer, applies the routine once and verifies the result
//      with check() (an all-0xff mask over zeroes must yield all 0xff);
//   2. zeroes the buffer again and times ITERS applications, printing the
//      elapsed clock() ticks.
// Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
int main(void) {
    const uint8_t mask[] = {0xff, 0xff, 0xff, 0xff};
    // SIZE is a multiple of the requested 32-byte alignment, as
    // aligned_alloc expects of its size argument.
    const size_t SIZE = 100000;
    const int ITERS = 10000;

    uint8_t *data = aligned_alloc(32, SIZE);
    if (data == NULL) {
        fprintf(stderr, "aligned_alloc failed\n");
        return EXIT_FAILURE;
    }

    clock_t start;

    memset(data, 0, SIZE);
    mask_8_by_8(data, SIZE, mask);
    check(data, SIZE);
    memset(data, 0, SIZE);
    start = clock();
    for (int i = 0; i < ITERS; i++) {
        mask_8_by_8(data, SIZE, mask);
    }
    // clock_t is an unspecified arithmetic type, so convert explicitly to
    // the type the format specifier names (the original used %u here).
    printf("8 by 8: %lu\n", (unsigned long) (clock() - start));

    memset(data, 0, SIZE);
    mask_32_by_32(data, SIZE, mask);
    check(data, SIZE);
    memset(data, 0, SIZE);
    start = clock();
    for (int i = 0; i < ITERS; i++) {
        mask_32_by_32(data, SIZE, mask);
    }
    printf("32 by 32 : %lu\n", (unsigned long) (clock() - start));

    memset(data, 0, SIZE);
    mask_64_by_64(data, SIZE, mask);
    check(data, SIZE);
    memset(data, 0, SIZE);
    start = clock();
    for (int i = 0; i < ITERS; i++) {
        mask_64_by_64(data, SIZE, mask);
    }
    printf("64 by 64 : %lu\n", (unsigned long) (clock() - start));

    memset(data, 0, SIZE);
    mask_128_by_128(data, SIZE, mask);
    check(data, SIZE);
    memset(data, 0, SIZE);
    start = clock();
    for (int i = 0; i < ITERS; i++) {
        mask_128_by_128(data, SIZE, mask);
    }
    printf("128 by 128: %lu\n", (unsigned long) (clock() - start));

    memset(data, 0, SIZE);
    mask_256_by_256(data, SIZE, mask);
    check(data, SIZE);
    memset(data, 0, SIZE);
    start = clock();
    for (int i = 0; i < ITERS; i++) {
        mask_256_by_256(data, SIZE, mask);
    }
    printf("256 by 256: %lu\n", (unsigned long) (clock() - start));

    free(data);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment