Created
December 11, 2021 19:24
-
-
Save mattsta/1242194ba7dc717b1bea57f77719cd6e to your computer and use it in GitHub Desktop.
A more optimized version of speedups.c from the Python websockets project, with a more efficient cascading XOR calculation for large data using Intel SIMD: 32-byte blocks first, then 16-byte blocks, then 8-byte blocks, then single bytes.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* C implementation of performance-sensitive functions. */
#define PY_SSIZE_T_CLEAN
#include <Python.h>

#include <assert.h>
#include <stdint.h> /* uint32_t, uint64_t */
#include <stdio.h>
#include <string.h> /* memcpy */

#if __AVX2__
#include <immintrin.h>
#endif
#if __SSE2__
#include <emmintrin.h>
#endif
/* Required length, in bytes, of the mask argument accepted by apply_mask. */
static const Py_ssize_t MASK_LEN = 4;
/* Similar to PyBytes_AsStringAndSize, but accepts more types */ | |
static int _PyBytesLike_AsStringAndSize(PyObject *obj, char **buffer, | |
Py_ssize_t *length) { | |
// This supports bytes, bytearrays, and C-contiguous memoryview objects, | |
// which are the most useful data structures for handling byte streams. | |
// websockets.framing.prepare_data() returns only values of these types. | |
// Any object implementing the buffer protocol could be supported, however | |
// that would require allocation or copying memory, which is expensive. | |
if (PyBytes_Check(obj)) { | |
*buffer = PyBytes_AS_STRING(obj); | |
*length = PyBytes_GET_SIZE(obj); | |
} else if (PyByteArray_Check(obj)) { | |
*buffer = PyByteArray_AS_STRING(obj); | |
*length = PyByteArray_GET_SIZE(obj); | |
} else if (PyMemoryView_Check(obj)) { | |
Py_buffer *mv_buf; | |
mv_buf = PyMemoryView_GET_BUFFER(obj); | |
if (PyBuffer_IsContiguous(mv_buf, 'C')) { | |
*buffer = mv_buf->buf; | |
*length = mv_buf->len; | |
} else { | |
PyErr_Format(PyExc_TypeError, "expected a contiguous memoryview"); | |
return -1; | |
} | |
} else { | |
PyErr_Format(PyExc_TypeError, | |
"expected a bytes-like object, %.200s found", | |
Py_TYPE(obj)->tp_name); | |
return -1; | |
} | |
return 0; | |
} | |
/* C implementation of websockets.utils.apply_mask */ | |
static PyObject *apply_mask(PyObject *self, PyObject *args, PyObject *kwds) { | |
// In order to support various bytes-like types, accept any Python object. | |
static char *kwlist[] = {"data", "mask", NULL}; | |
PyObject *input_obj; | |
PyObject *mask_obj; | |
// A pointer to a char * + length will be extracted from the data and mask | |
// arguments, possibly via a Py_buffer. | |
char *input; | |
Py_ssize_t input_len; | |
char *mask; | |
Py_ssize_t mask_len; | |
// Initialize a PyBytesObject then get a pointer to the underlying char * | |
// in order to avoid an extra memory copy in PyBytes_FromStringAndSize. | |
PyObject *result; | |
char *output; | |
// Other variables. | |
Py_ssize_t i = 0; | |
// Parse inputs. | |
if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO", kwlist, &input_obj, | |
&mask_obj)) { | |
return NULL; | |
} | |
if (_PyBytesLike_AsStringAndSize(input_obj, &input, &input_len) == -1) { | |
return NULL; | |
} | |
if (_PyBytesLike_AsStringAndSize(mask_obj, &mask, &mask_len) == -1) { | |
return NULL; | |
} | |
if (mask_len != MASK_LEN) { | |
PyErr_SetString(PyExc_ValueError, "mask must contain 4 bytes"); | |
return NULL; | |
} | |
// Create output. | |
result = PyBytes_FromStringAndSize(NULL, input_len); | |
if (result == NULL) { | |
return NULL; | |
} | |
// Since we juste created result, we don't need error checks. | |
output = PyBytes_AS_STRING(result); | |
// Perform the masking operation. | |
// Apparently GCC cannot figure out the following optimizations by itself. | |
// We need a new scope for MSVC 2010 (non C99 friendly) | |
{ | |
#if __AVX2__ | |
/* First do the xor by 32 byte increments... */ | |
Py_ssize_t input_len_256 = input_len & ~31; | |
__m256i mask_256 = _mm256_set1_epi32(*(uint32_t *)mask); | |
for (; i < input_len_256; i += 32) { | |
__m256i in_256 = _mm256_loadu_si256((__m256i *)(input + i)); | |
__m256i out_256 = _mm256_xor_si256(in_256, mask_256); | |
_mm256_storeu_si256((__m256i *)(output + i), out_256); | |
} | |
#endif | |
#if __SSE2__ | |
// With SSE2 support, XOR by blocks of 16 bytes = 128 bits. | |
// Since we cannot control the 16-bytes alignment of input and output | |
// buffers, we rely on loadu/storeu rather than load/store. | |
Py_ssize_t input_len_128 = (input_len - i) & ~15; | |
__m128i mask_128 = _mm_set1_epi32(*(uint32_t *)mask); | |
for (; i < input_len_128; i += 16) { | |
__m128i in_128 = _mm_loadu_si128((__m128i *)(input + i)); | |
__m128i out_128 = _mm_xor_si128(in_128, mask_128); | |
_mm_storeu_si128((__m128i *)(output + i), out_128); | |
} | |
#endif | |
// Without SSE2 support, XOR by blocks of 8 bytes = 64 bits. | |
// We assume the memory allocator aligns everything on 8 bytes | |
// boundaries. | |
Py_ssize_t input_len_64 = (input_len - i) & ~7; | |
uint32_t mask_32 = *(uint32_t *)mask; | |
uint64_t mask_64 = ((uint64_t)mask_32 << 32) | (uint64_t)mask_32; | |
for (; i < input_len_64; i += 8) { | |
*(uint64_t *)(output + i) = *(uint64_t *)(input + i) ^ mask_64; | |
} | |
} | |
// XOR the remainder of the input byte by byte. | |
for (; i < input_len; i++) { | |
output[i] = input[i] ^ mask[i & (MASK_LEN - 1)]; | |
} | |
return result; | |
} | |
static PyMethodDef speedups_methods[] = { | |
{ | |
"apply_mask", | |
(PyCFunction)apply_mask, | |
METH_VARARGS | METH_KEYWORDS, | |
"Apply masking to websocket message.", | |
}, | |
{NULL, NULL, 0, NULL}, /* Sentinel */ | |
}; | |
static struct PyModuleDef speedups_module = { | |
PyModuleDef_HEAD_INIT, "websocket.speedups", /* m_name */ | |
"C implementation of performance sensitive functions.", | |
/* m_doc */ | |
-1, /* m_size */ | |
speedups_methods, /* m_methods */ | |
NULL, NULL, NULL, NULL}; | |
PyMODINIT_FUNC PyInit_speedups(void) { | |
return PyModule_Create(&speedups_module); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment