Last active
August 7, 2018 18:15
-
-
Save harubaru/1981d6517e499cd5eb611055de4b712f to your computer and use it in GitHub Desktop.
a wrapper made in C for SSE
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * sse.h - a wrapper made in C for SSE
 */
#ifndef INTRINSICS_H | |
#define INTRINSICS_H | |
/*
 * USE_INLINE - specifier placed in front of every function defined in this
 * header.
 *
 * The functions are defined (not merely declared) here, so they need internal
 * linkage: a plain C99 `inline` definition provides no external definition and
 * can fail to link when a call is not inlined, while a definition without any
 * specifier is duplicated in every translation unit that includes the header.
 * The `inline` keyword itself is only added when the compiler provides it.
 */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 199409L)
#define USE_INLINE static inline
#elif defined(__GNUC__)
#define USE_INLINE static __inline__
#else
#define USE_INLINE static
#endif
#include <x86intrin.h> | |
/* float */ | |
/*
 * sse_fadd - horizontal add of a packed-float vector.
 *
 * a: four packed floats, lanes a0 (lowest) .. a3 (highest).
 *
 * Returns (a0 + a2) + (a1 + a3).
 */
USE_INLINE float sse_fadd(__m128 a)
{
    /* Bring the upper pair (a2, a3) down next to the lower pair (a0, a1). */
    const __m128 upper = _mm_movehl_ps(a, a);
    /* lane 0 = a0 + a2, lane 1 = a1 + a3 */
    const __m128 pairs = _mm_add_ps(a, upper);
    /* Broadcast lane 1 so it can be combined with lane 0. */
    const __m128 lane1 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));

    return _mm_cvtss_f32(_mm_add_ss(pairs, lane1));
}
/*
 * sse_fsub - horizontal subtract of a packed-float vector.
 *
 * a: four packed floats, lanes a0 (lowest) .. a3 (highest).
 *
 * Returns (a0 - a2) - (a1 - a3).
 */
USE_INLINE float sse_fsub(__m128 a)
{
    /* Bring the upper pair (a2, a3) down next to the lower pair (a0, a1). */
    const __m128 upper = _mm_movehl_ps(a, a);
    /* lane 0 = a0 - a2, lane 1 = a1 - a3 */
    const __m128 pairs = _mm_sub_ps(a, upper);
    /* Broadcast lane 1 so it can be subtracted from lane 0. */
    const __m128 lane1 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));

    return _mm_cvtss_f32(_mm_sub_ss(pairs, lane1));
}
/*
 * sse_fmul - horizontal multiply of a packed-float vector.
 *
 * a: four packed floats, lanes a0 (lowest) .. a3 (highest).
 *
 * Returns (a0 * a2) * (a1 * a3).
 */
USE_INLINE float sse_fmul(__m128 a)
{
    /* Bring the upper pair (a2, a3) down next to the lower pair (a0, a1). */
    const __m128 upper = _mm_movehl_ps(a, a);
    /* lane 0 = a0 * a2, lane 1 = a1 * a3 */
    const __m128 pairs = _mm_mul_ps(a, upper);
    /* Broadcast lane 1 so it can be combined with lane 0. */
    const __m128 lane1 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));

    return _mm_cvtss_f32(_mm_mul_ss(pairs, lane1));
}
/*
 * sse_fmin - horizontal minimum of a packed-float vector.
 *
 * a: four packed floats, lanes a0 (lowest) .. a3 (highest).
 *
 * Returns min(min(a0, a2), min(a1, a3)) using the SSE min instructions.
 */
USE_INLINE float sse_fmin(__m128 a)
{
    /* Bring the upper pair (a2, a3) down next to the lower pair (a0, a1). */
    const __m128 upper = _mm_movehl_ps(a, a);
    /* lane 0 = min(a0, a2), lane 1 = min(a1, a3) */
    const __m128 pairs = _mm_min_ps(a, upper);
    /* Broadcast lane 1 so it can be compared with lane 0. */
    const __m128 lane1 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));

    return _mm_cvtss_f32(_mm_min_ss(pairs, lane1));
}
/*
 * sse_fmax - horizontal maximum of a packed-float vector.
 *
 * a: four packed floats, lanes a0 (lowest) .. a3 (highest).
 *
 * Returns max(max(a0, a2), max(a1, a3)) using the SSE max instructions.
 */
USE_INLINE float sse_fmax(__m128 a)
{
    /* Bring the upper pair (a2, a3) down next to the lower pair (a0, a1). */
    const __m128 upper = _mm_movehl_ps(a, a);
    /* lane 0 = max(a0, a2), lane 1 = max(a1, a3) */
    const __m128 pairs = _mm_max_ps(a, upper);
    /* Broadcast lane 1 so it can be compared with lane 0. */
    const __m128 lane1 = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));

    return _mm_cvtss_f32(_mm_max_ss(pairs, lane1));
}
/*
 * vec2_fadd - add two floats through sse_fadd().
 *
 * The two unused low lanes are filled with 0, so the helper evaluates
 * (0 + b) + (0 + a).
 */
USE_INLINE float vec2_fadd(float a, float b)
{
    const __m128 packed = _mm_set_ps(a, b, 0.0f, 0.0f);

    return sse_fadd(packed);
}
/*
 * vec2_fsub - subtract b from a through sse_fsub().
 *
 * The two unused low lanes are filled with 0, so the helper evaluates
 * (0 - b) - (0 - a).
 */
USE_INLINE float vec2_fsub(float a, float b)
{
    const __m128 packed = _mm_set_ps(a, b, 0.0f, 0.0f);

    return sse_fsub(packed);
}
/*
 * vec2_fmul - multiply two floats through sse_fmul().
 *
 * The two unused low lanes are filled with 1, so the helper evaluates
 * (1 * b) * (1 * a).
 */
USE_INLINE float vec2_fmul(float a, float b)
{
    const __m128 packed = _mm_set_ps(a, b, 1.0f, 1.0f);

    return sse_fmul(packed);
}
/*
 * vec2_fmax - larger of two floats, computed through sse_fmax().
 *
 * The two spare lanes repeat the inputs instead of holding a sentinel, so the
 * padding can never win the comparison. (The previous padding, 1.175494351e-38F,
 * is the smallest positive normal float and was returned whenever both inputs
 * were negative.)
 */
USE_INLINE float vec2_fmax(float a, float b)
{
    const __m128 packed = _mm_set_ps(a, b, a, b);

    return sse_fmax(packed);
}
/*
 * vec2_fmin - smaller of two floats, computed through sse_fmin().
 *
 * The two spare lanes repeat the inputs instead of holding a sentinel, so the
 * padding can never win the comparison. (The previous padding, 3.402823466e+38F,
 * was returned when both inputs were +infinity.)
 */
USE_INLINE float vec2_fmin(float a, float b)
{
    const __m128 packed = _mm_set_ps(a, b, a, b);

    return sse_fmin(packed);
}
/*
 * vec3_fadd - add three floats through sse_fadd().
 *
 * The unused lowest lane is filled with 0, so the helper evaluates
 * (0 + b) + (c + a).
 */
USE_INLINE float vec3_fadd(float a, float b, float c)
{
    const __m128 packed = _mm_set_ps(a, b, c, 0.0f);

    return sse_fadd(packed);
}
/*
 * vec3_fsub - subtract b and c from a through sse_fsub().
 *
 * The unused lowest lane is filled with 0, so the helper evaluates
 * (0 - b) - (c - a).
 */
USE_INLINE float vec3_fsub(float a, float b, float c)
{
    const __m128 packed = _mm_set_ps(a, b, c, 0.0f);

    return sse_fsub(packed);
}
/*
 * vec3_fmul - multiply three floats through sse_fmul().
 *
 * The unused lowest lane is filled with 1, so the helper evaluates
 * (1 * b) * (c * a).
 */
USE_INLINE float vec3_fmul(float a, float b, float c)
{
    const __m128 packed = _mm_set_ps(a, b, c, 1.0f);

    return sse_fmul(packed);
}
/*
 * vec3_fmax - largest of three floats, computed through sse_fmax().
 *
 * Previously this forwarded to sse_fmul() and therefore returned a product
 * rather than a maximum. The spare lane now repeats c instead of holding a
 * sentinel, so the padding can never win the comparison.
 */
USE_INLINE float vec3_fmax(float a, float b, float c)
{
    const __m128 packed = _mm_set_ps(a, b, c, c);

    return sse_fmax(packed);
}
/*
 * vec3_fmin - smallest of three floats, computed through sse_fmin().
 *
 * Previously this forwarded to sse_fmul() and therefore returned a product
 * rather than a minimum. The spare lane now repeats c instead of holding a
 * sentinel, so the padding can never win the comparison.
 */
USE_INLINE float vec3_fmin(float a, float b, float c)
{
    const __m128 packed = _mm_set_ps(a, b, c, c);

    return sse_fmin(packed);
}
/*
 * vec4_fadd - add four floats through sse_fadd().
 *
 * With a in the highest lane and d in the lowest, the helper evaluates
 * (d + b) + (c + a).
 */
USE_INLINE float vec4_fadd(float a, float b, float c, float d)
{
    const __m128 packed = _mm_set_ps(a, b, c, d);

    return sse_fadd(packed);
}
/*
 * vec4_fsub - combine four floats through sse_fsub().
 *
 * With a in the highest lane and d in the lowest, the helper evaluates
 * (d - b) - (c - a).
 *
 * NOTE(review): that expression equals a - b - c + d, not a - b - c - d;
 * confirm which result callers expect.
 */
USE_INLINE float vec4_fsub(float a, float b, float c, float d)
{
    const __m128 packed = _mm_set_ps(a, b, c, d);

    return sse_fsub(packed);
}
/*
 * vec4_fmul - multiply four floats through sse_fmul().
 *
 * With a in the highest lane and d in the lowest, the helper evaluates
 * (d * b) * (c * a).
 */
USE_INLINE float vec4_fmul(float a, float b, float c, float d)
{
    const __m128 packed = _mm_set_ps(a, b, c, d);

    return sse_fmul(packed);
}
/*
 * vec4_fmax - largest of four floats, computed through sse_fmax().
 *
 * Previously this forwarded to sse_fmul() and therefore returned the product
 * of the four inputs instead of their maximum.
 */
USE_INLINE float vec4_fmax(float a, float b, float c, float d)
{
    const __m128 packed = _mm_set_ps(a, b, c, d);

    return sse_fmax(packed);
}
/*
 * vec4_fmin - smallest of four floats, computed through sse_fmin().
 *
 * Previously this forwarded to sse_fmul() and therefore returned the product
 * of the four inputs instead of their minimum.
 */
USE_INLINE float vec4_fmin(float a, float b, float c, float d)
{
    const __m128 packed = _mm_set_ps(a, b, c, d);

    return sse_fmin(packed);
}
/* signed int */ | |
/*
 * sse_iadd - horizontal add of four packed 32-bit integers.
 *
 * a: lanes a0 (lowest) .. a3 (highest).
 *
 * Returns the low lane of (a0 + a2) + (a1 + a3), using the packed 32-bit add.
 */
USE_INLINE int sse_iadd(__m128i a)
{
    /* Shifting right by 8 bytes lines a2/a3 up with a0/a1. */
    const __m128i pairs = _mm_add_epi32(a, _mm_srli_si128(a, 8));
    /* Shifting right by 4 bytes lines lane 1 up with lane 0. */
    const __m128i total = _mm_add_epi32(pairs, _mm_srli_si128(pairs, 4));

    return _mm_cvtsi128_si32(total);
}
/*
 * sse_isub - horizontal subtract of four packed 32-bit integers.
 *
 * a: lanes a0 (lowest) .. a3 (highest).
 *
 * Returns the low lane of (a0 - a2) - (a1 - a3), using the packed 32-bit
 * subtract.
 */
USE_INLINE int sse_isub(__m128i a)
{
    /* Shifting right by 8 bytes lines a2/a3 up with a0/a1. */
    const __m128i pairs = _mm_sub_epi32(a, _mm_srli_si128(a, 8));
    /* Shifting right by 4 bytes lines lane 1 up with lane 0. */
    const __m128i total = _mm_sub_epi32(pairs, _mm_srli_si128(pairs, 4));

    return _mm_cvtsi128_si32(total);
}
/*
 * sse_imul - horizontal multiply of four packed 32-bit integers.
 *
 * a: lanes a0 (lowest) .. a3 (highest).
 *
 * Returns the low lane of (a0 * a2) * (a1 * a3); each step keeps the low
 * 32 bits of the product (_mm_mullo_epi32).
 */
USE_INLINE int sse_imul(__m128i a)
{
    /* Shifting right by 8 bytes lines a2/a3 up with a0/a1. */
    const __m128i pairs = _mm_mullo_epi32(a, _mm_srli_si128(a, 8));
    /* Shifting right by 4 bytes lines lane 1 up with lane 0. */
    const __m128i total = _mm_mullo_epi32(pairs, _mm_srli_si128(pairs, 4));

    return _mm_cvtsi128_si32(total);
}
/*
 * sse_imin - horizontal minimum of four packed signed 32-bit integers.
 *
 * a: lanes a0 (lowest) .. a3 (highest).
 *
 * Returns min(min(a0, a2), min(a1, a3)).
 *
 * Both reduction steps compare full 32-bit lanes. The second step used to
 * call _mm_min_epi16, which compares the 16-bit halves independently and
 * could return a value made of halves from two different inputs.
 */
USE_INLINE int sse_imin(__m128i a)
{
    /* Exchange the upper and lower 64-bit halves: (a2, a3) meets (a0, a1). */
    const __m128i halves_swapped = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2));
    /* lane 0 = min(a0, a2), lane 1 = min(a1, a3) */
    const __m128i pairs = _mm_min_epi32(a, halves_swapped);
    /* Moving 16-bit words 2,3 <-> 0,1 exchanges 32-bit lanes 0 and 1. */
    const __m128i lanes01_swapped = _mm_shufflelo_epi16(pairs, _MM_SHUFFLE(1, 0, 3, 2));

    return _mm_cvtsi128_si32(_mm_min_epi32(pairs, lanes01_swapped));
}
/*
 * sse_imax - horizontal maximum of four packed signed 32-bit integers.
 *
 * a: lanes a0 (lowest) .. a3 (highest).
 *
 * Returns max(max(a0, a2), max(a1, a3)).
 *
 * Both reduction steps compare full 32-bit lanes. The second step used to
 * call _mm_max_epi16, which compares the 16-bit halves independently and
 * could return a value made of halves from two different inputs.
 */
USE_INLINE int sse_imax(__m128i a)
{
    /* Exchange the upper and lower 64-bit halves: (a2, a3) meets (a0, a1). */
    const __m128i halves_swapped = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2));
    /* lane 0 = max(a0, a2), lane 1 = max(a1, a3) */
    const __m128i pairs = _mm_max_epi32(a, halves_swapped);
    /* Moving 16-bit words 2,3 <-> 0,1 exchanges 32-bit lanes 0 and 1. */
    const __m128i lanes01_swapped = _mm_shufflelo_epi16(pairs, _MM_SHUFFLE(1, 0, 3, 2));

    return _mm_cvtsi128_si32(_mm_max_epi32(pairs, lanes01_swapped));
}
/*
 * vec2_iadd - add two ints through sse_iadd().
 *
 * The two unused low lanes are filled with 0, so the helper evaluates
 * (0 + b) + (0 + a).
 */
USE_INLINE int vec2_iadd(int a, int b)
{
    const __m128i packed = _mm_set_epi32(a, b, 0, 0);

    return sse_iadd(packed);
}
/*
 * vec2_isub - subtract b from a through sse_isub().
 *
 * The two unused low lanes are filled with 0, so the helper evaluates
 * (0 - b) - (0 - a).
 */
USE_INLINE int vec2_isub(int a, int b)
{
    const __m128i packed = _mm_set_epi32(a, b, 0, 0);

    return sse_isub(packed);
}
/*
 * vec2_imul - multiply two ints through sse_imul().
 *
 * The two unused low lanes are filled with 1, so the helper evaluates
 * (1 * b) * (1 * a).
 */
USE_INLINE int vec2_imul(int a, int b)
{
    const __m128i packed = _mm_set_epi32(a, b, 1, 1);

    return sse_imul(packed);
}
/*
 * vec2_imin - smaller of two ints, computed through sse_imin().
 *
 * The two unused low lanes are filled with 0x7FFFFFFF, the largest 32-bit
 * signed value.
 */
USE_INLINE int vec2_imin(int a, int b)
{
    const __m128i packed = _mm_set_epi32(a, b, 0x7FFFFFFF, 0x7FFFFFFF);

    return sse_imin(packed);
}
/*
 * vec2_imax - larger of two ints, computed through sse_imax().
 *
 * The two unused low lanes are filled with the smallest 32-bit signed value.
 * It is written as an in-range expression: the literal 0x80000000 has an
 * unsigned type, and converting it to int is implementation-defined.
 */
USE_INLINE int vec2_imax(int a, int b)
{
    const int lowest = -0x7FFFFFFF - 1;
    const __m128i packed = _mm_set_epi32(a, b, lowest, lowest);

    return sse_imax(packed);
}
/*
 * vec3_iadd - add three ints through sse_iadd().
 *
 * The unused lowest lane is filled with 0, so the helper evaluates
 * (0 + b) + (c + a).
 */
USE_INLINE int vec3_iadd(int a, int b, int c)
{
    const __m128i packed = _mm_set_epi32(a, b, c, 0);

    return sse_iadd(packed);
}
/*
 * vec3_isub - subtract b and c from a through sse_isub().
 *
 * The unused lowest lane is filled with 0, so the helper evaluates
 * (0 - b) - (c - a).
 */
USE_INLINE int vec3_isub(int a, int b, int c)
{
    const __m128i packed = _mm_set_epi32(a, b, c, 0);

    return sse_isub(packed);
}
/*
 * vec3_imul - multiply three ints through sse_imul().
 *
 * The unused lowest lane is filled with 1, so the helper evaluates
 * (1 * b) * (c * a).
 */
USE_INLINE int vec3_imul(int a, int b, int c)
{
    const __m128i packed = _mm_set_epi32(a, b, c, 1);

    return sse_imul(packed);
}
/*
 * vec3_imin - smallest of three ints, computed through sse_imin().
 *
 * The unused lowest lane is filled with 0x7FFFFFFF, the largest 32-bit
 * signed value.
 */
USE_INLINE int vec3_imin(int a, int b, int c)
{
    const __m128i packed = _mm_set_epi32(a, b, c, 0x7FFFFFFF);

    return sse_imin(packed);
}
/*
 * vec3_imax - largest of three ints, computed through sse_imax().
 *
 * The unused lowest lane is filled with the smallest 32-bit signed value.
 * It is written as an in-range expression: the literal 0x80000000 has an
 * unsigned type, and converting it to int is implementation-defined.
 */
USE_INLINE int vec3_imax(int a, int b, int c)
{
    const int lowest = -0x7FFFFFFF - 1;
    const __m128i packed = _mm_set_epi32(a, b, c, lowest);

    return sse_imax(packed);
}
/*
 * vec4_iadd - add four ints through sse_iadd().
 *
 * With a in the highest lane and d in the lowest, the helper evaluates
 * (d + b) + (c + a).
 */
USE_INLINE int vec4_iadd(int a, int b, int c, int d)
{
    const __m128i packed = _mm_set_epi32(a, b, c, d);

    return sse_iadd(packed);
}
/*
 * vec4_isub - combine four ints through sse_isub().
 *
 * With a in the highest lane and d in the lowest, the helper evaluates
 * (d - b) - (c - a).
 *
 * NOTE(review): that expression equals a - b - c + d, not a - b - c - d;
 * confirm which result callers expect.
 */
USE_INLINE int vec4_isub(int a, int b, int c, int d)
{
    const __m128i packed = _mm_set_epi32(a, b, c, d);

    return sse_isub(packed);
}
/*
 * vec4_imul - multiply four ints through sse_imul().
 *
 * With a in the highest lane and d in the lowest, the helper evaluates
 * (d * b) * (c * a).
 */
USE_INLINE int vec4_imul(int a, int b, int c, int d)
{
    const __m128i packed = _mm_set_epi32(a, b, c, d);

    return sse_imul(packed);
}
/*
 * vec4_imin - pack four ints and reduce them with sse_imin().
 *
 * a goes to the highest lane and d to the lowest; the result is whatever
 * sse_imin() returns for that vector.
 */
USE_INLINE int vec4_imin(int a, int b, int c, int d)
{
    const __m128i packed = _mm_set_epi32(a, b, c, d);

    return sse_imin(packed);
}
/*
 * vec4_imax - pack four ints and reduce them with sse_imax().
 *
 * a goes to the highest lane and d to the lowest; the result is whatever
 * sse_imax() returns for that vector.
 */
USE_INLINE int vec4_imax(int a, int b, int c, int d)
{
    const __m128i packed = _mm_set_epi32(a, b, c, d);

    return sse_imax(packed);
}
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Fixed problem where it couldn't be compiled on compilers using ANSI C standard