Last active
October 8, 2018 08:24
-
-
Save pkorpine/0b3a768e018827039f862cd6228e5e7e to your computer and use it in GitHub Desktop.
memcpy 16-bit values to 32-bit values (zero extend using SSE4.1)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stddef.h>
#include <stdint.h>

#include <smmintrin.h>
/**
 * Copy an array of 16-bit values into an array of 32-bit values,
 * zero-extending every element (SSE4.1 for the bulk, scalar for the tail).
 *
 * @param pdst_ Destination; must have room for n / sizeof(uint16_t)
 *              uint32_t elements.
 * @param psrc_ Source array of uint16_t values.
 * @param n     Size of the source in BYTES (not elements). A trailing odd
 *              byte is ignored because the count is truncated to whole
 *              16-bit items.
 *
 * Neither pointer has to be 16-byte aligned: unaligned load/store
 * intrinsics are used. The buffers are expected not to overlap.
 */
void memcpy_16to32(uint32_t *pdst_, const uint16_t *psrc_, size_t n)
{
    size_t items = n / sizeof(uint16_t);
    const uint16_t *src = psrc_;
    uint32_t *dst = pdst_;

    /* Bulk path: 8 source items (16 bytes) in, 8 destination items (32 bytes) out. */
    while (items >= 8) {
        /* Unaligned load: callers pass ordinary uint16_t pointers. */
        __m128i v = _mm_loadu_si128((const __m128i *)src);

        /* Zero-extend the low four 16-bit lanes to 32 bits. */
        _mm_storeu_si128((__m128i *)dst, _mm_cvtepu16_epi32(v));

        /* Shift the high four lanes down by 8 bytes and zero-extend them too. */
        _mm_storeu_si128((__m128i *)(dst + 4),
                         _mm_cvtepu16_epi32(_mm_srli_si128(v, 8)));

        src += 8;
        dst += 8;
        items -= 8;
    }

    /* Scalar tail: the remaining 7 or fewer items. */
    while (items-- > 0) {
        *dst++ = *src++;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment