diff --git a/deps/media-playback/CMakeLists.txt b/deps/media-playback/CMakeLists.txt | |
index d58d121..ae938fa 100644 | |
--- a/deps/media-playback/CMakeLists.txt | |
+++ b/deps/media-playback/CMakeLists.txt | |
@@ -28,6 +28,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64le") | |
PUBLIC | |
-mvsx) | |
add_compile_definitions(NO_WARN_X86_INTRINSICS) | |
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch).*") | |
elseif(NOT MSVC) | |
target_compile_options(media-playback | |
PUBLIC | |
diff --git a/libobs-opengl/CMakeLists.txt b/libobs-opengl/CMakeLists.txt | |
index a116c99..645bb12 100644 | |
--- a/libobs-opengl/CMakeLists.txt | |
+++ b/libobs-opengl/CMakeLists.txt | |
@@ -43,6 +43,7 @@ else() #This needs to change to be more specific to get ready for Wayland | |
set(libobs-opengl_PLATFORM_DEPS | |
${XCB_LIBRARIES} | |
+ ${OPENGL_LIBRARIES} | |
${X11_XCB_LIBRARIES}) | |
set(libobs-opengl_PLATFORM_SOURCES | |
diff --git a/libobs/CMakeLists.txt b/libobs/CMakeLists.txt | |
index 8681148..b439847 100644 | |
--- a/libobs/CMakeLists.txt | |
+++ b/libobs/CMakeLists.txt | |
@@ -13,6 +13,7 @@ endif() | |
if(UNIX) | |
if (NOT APPLE) | |
+ find_package(X11 REQUIRED) | |
find_package(X11_XCB REQUIRED) | |
find_package(XCB OPTIONAL_COMPONENTS XINPUT) | |
if (XCB_XINPUT_FOUND) | |
@@ -204,6 +205,7 @@ elseif(UNIX) | |
${X11_XCB_DEFINITIONS}) | |
set(libobs_PLATFORM_DEPS | |
${libobs_PLATFORM_DEPS} | |
+ ${X11_LIBRARIES} | |
${X11_XCB_LIBRARIES}) | |
if(USE_XINPUT) | |
@@ -470,6 +472,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64le") | |
PUBLIC | |
-mvsx) | |
add_compile_definitions(NO_WARN_X86_INTRINSICS) | |
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch).*") | |
elseif(NOT MSVC) | |
target_compile_options(libobs | |
PUBLIC | |
diff --git a/libobs/graphics/quat.h b/libobs/graphics/quat.h | |
index 23552ca..0610eb1 100644 | |
--- a/libobs/graphics/quat.h | |
+++ b/libobs/graphics/quat.h | |
@@ -20,7 +20,12 @@ | |
#include "../util/c99defs.h" | |
#include "math-defs.h" | |
#include "vec3.h" | |
-#include <xmmintrin.h> | |
+#if defined(__aarch64__) || defined(__arm__) | |
+# include <arm_neon.h> | |
+# include "sse2neon.h" | |
+#else | |
+# include <xmmintrin.h> | |
+#endif | |
/* | |
* Quaternion math | |
diff --git a/libobs/graphics/vec3.h b/libobs/graphics/vec3.h | |
index d6b9f8d..4ea72d5 100644 | |
--- a/libobs/graphics/vec3.h | |
+++ b/libobs/graphics/vec3.h | |
@@ -19,7 +19,12 @@ | |
#include "math-defs.h" | |
#include "vec4.h" | |
-#include <xmmintrin.h> | |
+#if defined(__aarch64__) || defined(__arm__) | |
+# include <arm_neon.h> | |
+# include "sse2neon.h" | |
+#else | |
+# include <xmmintrin.h> | |
+#endif | |
#ifdef __cplusplus | |
extern "C" { | |
diff --git a/libobs/graphics/vec4.h b/libobs/graphics/vec4.h | |
index 61143cc..c66a0a5 100644 | |
--- a/libobs/graphics/vec4.h | |
+++ b/libobs/graphics/vec4.h | |
@@ -18,7 +18,12 @@ | |
#pragma once | |
#include "math-defs.h" | |
-#include <xmmintrin.h> | |
+#if defined(__aarch64__) || defined(__arm__) | |
+# include <arm_neon.h> | |
+# include "sse2neon.h" | |
+#else | |
+# include <xmmintrin.h> | |
+#endif | |
#ifdef __cplusplus | |
extern "C" { | |
@@ -76,7 +81,11 @@ static inline void vec4_mul(struct vec4 *dst, const struct vec4 *v1, | |
static inline void vec4_div(struct vec4 *dst, const struct vec4 *v1, | |
const struct vec4 *v2) | |
{ | |
+#if defined(__aarch64__) || defined(__arm__) | |
+ dst->m = vdivq_f32(v1->m, v2->m); | |
+#else | |
dst->m = _mm_div_ps(v1->m, v2->m); | |
+#endif | |
} | |
static inline void vec4_addf(struct vec4 *dst, const struct vec4 *v, float f) | |
diff --git a/libobs/media-io/format-conversion.c b/libobs/media-io/format-conversion.c | |
index f054064..8233698 100644 | |
--- a/libobs/media-io/format-conversion.c | |
+++ b/libobs/media-io/format-conversion.c | |
@@ -16,8 +16,12 @@ | |
******************************************************************************/ | |
#include "format-conversion.h" | |
-#include <xmmintrin.h> | |
-#include <emmintrin.h> | |
+#if defined(__aarch64__) || defined(__arm__) | |
+# include "sse2neon.h" | |
+#else | |
+# include <xmmintrin.h> | |
+# include <emmintrin.h> | |
+#endif | |
/* ...surprisingly, if I don't use a macro to force inlining, it causes the | |
* CPU usage to boost by a tremendous amount in debug builds. */ | |
diff --git a/libobs/obs-audio-controls.c b/libobs/obs-audio-controls.c | |
index 81803ad..9106270 100644 | |
--- a/libobs/obs-audio-controls.c | |
+++ b/libobs/obs-audio-controls.c | |
@@ -16,7 +16,12 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. | |
*/ | |
#include <math.h> | |
-#include <xmmintrin.h> | |
+#if defined(__aarch64__) || defined(__arm__) | |
+# include "sse2neon.h" | |
+#else | |
+# include <xmmintrin.h> | |
+# include <emmintrin.h> | |
+#endif | |
#include "util/threading.h" | |
#include "util/bmem.h" | |
diff --git a/plugins/decklink/audio-repack.c b/plugins/decklink/audio-repack.c | |
index 8c59af5..c644969 100644 | |
--- a/plugins/decklink/audio-repack.c | |
+++ b/plugins/decklink/audio-repack.c | |
@@ -1,6 +1,10 @@ | |
#include "audio-repack.h" | |
-#include <emmintrin.h> | |
+#if defined(__aarch64__) || defined(__arm__) | |
+# include "sse2neon.h" | |
+#else | |
+# include <emmintrin.h> | |
+#endif | |
int check_buffer(struct audio_repack *repack, uint32_t frame_count) | |
{ |
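One caveat worth noting about the vec4.h hunk above: vdivq_f32 only exists on AArch64, so the defined(__arm__) side of that guard would not build on 32-bit ARM. A rough, untested sketch of what a 32-bit-safe fallback could look like, borrowing the reciprocal-estimate approach that _mm_div_ps in sse2neon.h below uses (field names taken from the hunk):

#if defined(__aarch64__)
	dst->m = vdivq_f32(v1->m, v2->m);           /* true vector divide on ARMv8 */
#elif defined(__arm__)
	/* 32-bit NEON has no vector divide; refine a reciprocal estimate instead. */
	float32x4_t r = vrecpeq_f32(v2->m);
	r = vmulq_f32(vrecpsq_f32(v2->m, r), r);    /* one Newton-Raphson refinement step */
	dst->m = vmulq_f32(v1->m, r);
#else
	dst->m = _mm_div_ps(v1->m, v2->m);
#endif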
// ===================================================================================== | |
// | |
// Filename: sse2neon.h | |
// | |
// Description: | |
// | |
// Version: 1.0 | |
// Created: 2016-10-14 21:58:08
// Revision: none | |
// Compiler: g++ | |
// | |
// Author: Su Junjie (USTC), [email protected] | |
// Company: | |
// | |
// ===================================================================================== | |
#ifndef SSE2NEON_H | |
#define SSE2NEON_H | |
//#define ENABLE_CPP_VERSION 0 | |
#define GCC 1 | |
#if GCC | |
#define INLINE inline __attribute__((always_inline)) | |
#else | |
#define INLINE inline | |
#endif | |
#include <arm_neon.h> | |
typedef float32x4_t __m128; | |
typedef float32x2_t __m64; | |
typedef int32x4_t __m128i; | |
/* expands to the following value */ | |
#define _MM_SHUFFLE(z, y, x, w) ( (z<<6) | (y<<4) | (x<<2) | w ) | |
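/* Example: with a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3},
 *     __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
 * gives r = {a0, a1, b2, b3}: the two low selector fields pick lanes from a,
 * the two high fields pick lanes from b (see _mm_shuffle_ps below). */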
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ | |
{ \ | |
__m128 tmp3, tmp2, tmp1, tmp0; \ | |
tmp0 = _mm_unpacklo_ps(row0, row1); \ | |
tmp2 = _mm_unpacklo_ps(row2, row3); \ | |
tmp1 = _mm_unpackhi_ps(row0, row1); \ | |
tmp3 = _mm_unpackhi_ps(row2, row3); \ | |
row0 = _mm_movelh_ps(tmp0, tmp2); \ | |
row1 = _mm_movehl_ps(tmp2, tmp0); \ | |
row2 = _mm_movelh_ps(tmp1, tmp3); \ | |
row3 = _mm_movehl_ps(tmp3, tmp1); \ | |
} | |
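/* Example: transposing a row-major 4x4 matrix m held in four __m128 rows
 * (r0..r3 are hypothetical locals):
 *     __m128 r0 = _mm_load_ps(&m[0]);
 *     __m128 r1 = _mm_load_ps(&m[4]);
 *     __m128 r2 = _mm_load_ps(&m[8]);
 *     __m128 r3 = _mm_load_ps(&m[12]);
 *     _MM_TRANSPOSE4_PS(r0, r1, r2, r3);   // r0..r3 now hold the columns of m
 */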
/*************************************************************************** | |
* max and min | |
***************************************************************************/ | |
INLINE __m128i _mm_max_epu8 (__m128i a, __m128i b) | |
{ | |
return (__m128i)vmaxq_u8((uint8x16_t) a, (uint8x16_t) b); | |
} | |
INLINE __m128i _mm_max_epi16 (__m128i a, __m128i b) | |
{ | |
return (__m128i)vmaxq_s16((int16x8_t) a, (int16x8_t) b); | |
} | |
/* Return value | |
* A 128-bit parameter that can be defined with the following equations: | |
* r0 := (a0 > b0) ? a0 : b0 | |
* r1 := (a1 > b1) ? a1 : b1 | |
* r2 := (a2 > b2) ? a2 : b2 | |
* r3 := (a3 > b3) ? a3 : b3 | |
* */ | |
INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) | |
{ | |
return vmaxq_s32(a,b); | |
} | |
/* TODO: behaviour differs from SSE when the inputs contain NaN;
 * needs more testing if NaN inputs ever occur in practice.
 */
INLINE __m128 _mm_max_ps(__m128 a, __m128 b) | |
{ | |
return vmaxq_f32(a, b); | |
} | |
INLINE __m128i _mm_min_epu8 (__m128i a, __m128i b) | |
{ | |
return (__m128i)vminq_u8((uint8x16_t) a, (uint8x16_t) b); | |
} | |
INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) | |
{ | |
return (__m128i)vminq_s16((int16x8_t)a, (int16x8_t)b); | |
} | |
/* Return value | |
* A 128-bit parameter that can be defined with the following equations: | |
* r0 := (a0 < b0) ? a0 : b0 | |
* r1 := (a1 < b1) ? a1 : b1 | |
* r2 := (a2 < b2) ? a2 : b2 | |
* r3 := (a3 < b3) ? a3 : b3 | |
* */ | |
INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) | |
{ | |
return vminq_s32(a,b); | |
} | |
/* TODO: behaviour differs from SSE when the inputs contain NaN;
 * needs more testing if NaN inputs ever occur in practice.
 */
INLINE __m128 _mm_min_ps(__m128 a, __m128 b) | |
{ | |
return vminq_f32(a, b); | |
} | |
/*************************************************************************** | |
* add and sub | |
***************************************************************************/ | |
/* Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit integers of a and saturates. | |
* r0 := UnsignedSaturate(a0 - b0) | |
* r1 := UnsignedSaturate(a1 - b1) | |
* ... | |
* r15 := UnsignedSaturate(a15 - b15) | |
* */ | |
INLINE __m128i _mm_subs_epu8 (__m128i a, __m128i b) | |
{ | |
return (__m128i)vqsubq_u8((uint8x16_t) a, (uint8x16_t) b); | |
} | |
INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) | |
{ | |
return (__m128i)vaddq_s16((int16x8_t)a, (int16x8_t)b); | |
} | |
INLINE __m128i _mm_sub_epi16 (__m128i a, __m128i b) | |
{ | |
return (__m128i)vsubq_s16((int16x8_t) a, (int16x8_t) b); | |
} | |
INLINE __m128i _mm_adds_epu16 (__m128i a, __m128i b) | |
{ | |
return (__m128i)vqaddq_u16((uint16x8_t) a, (uint16x8_t) b); | |
} | |
INLINE __m128i _mm_subs_epu16 (__m128i a, __m128i b) | |
{ | |
return (__m128i)vqsubq_u16((uint16x8_t) a, (uint16x8_t) b); | |
} | |
/* Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b and saturates. | |
* r0 := SignedSaturate(a0 + b0) | |
* r1 := SignedSaturate(a1 + b1) | |
* ... | |
* r7 := SignedSaturate(a7 + b7) | |
* */ | |
INLINE __m128i _mm_adds_epi16 (__m128i a, __m128i b) | |
{ | |
return (__m128i)vqaddq_s16((int16x8_t)a, (int16x8_t)b); | |
} | |
/* Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers of a and saturates. | |
* r0 := SignedSaturate(a0 - b0) | |
* r1 := SignedSaturate(a1 - b1) | |
* ... | |
* r7 := SignedSaturate(a7 - b7) | |
* */ | |
INLINE __m128i _mm_subs_epi16 (__m128i a, __m128i b) | |
{ | |
return (__m128i)vqsubq_s16((int16x8_t) a, (int16x8_t) b); | |
} | |
/* Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or unsigned 32-bit integers in b. | |
* r0 := a0 + b0 | |
* r1 := a1 + b1 | |
* r2 := a2 + b2 | |
* r3 := a3 + b3 | |
* */ | |
INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) | |
{ | |
return vaddq_s32(a, b); | |
} | |
INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) | |
{ | |
return (__m128i)vaddq_s64((int64x2_t)a, (int64x2_t)b); | |
} | |
/* Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or unsigned 32-bit integers of a. | |
* r0 := a0 - b0 | |
* r1 := a1 - b1 | |
* r2 := a2 - b2 | |
* r3 := a3 - b3 | |
* */ | |
INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) | |
{ | |
return vsubq_s32(a, b); | |
} | |
/* Adds the four single-precision, floating-point values of a and b. | |
* r0 := a0 + b0 | |
* r1 := a1 + b1 | |
* r2 := a2 + b2 | |
* r3 := a3 + b3 | |
* */ | |
INLINE __m128 _mm_add_ps(__m128 a, __m128 b) | |
{ | |
return vaddq_f32(a, b); | |
} | |
/* Subtracts the four single-precision, floating-point values of a and b. | |
* r0 := a0 - b0 | |
* r1 := a1 - b1 | |
* r2 := a2 - b2 | |
* r3 := a3 - b3 | |
* */ | |
INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) | |
{ | |
return vsubq_f32(a, b); | |
} | |
/* Performs a horizontal add: adjacent elements within each operand are added
 * together. Treating a as (A3, A2, A1, A0) and b as (B3, B2, B1, B0), the
 * result is (B3 + B2, B1 + B0, A3 + A2, A1 + A0).
 * This routine is only available as an intrinsic. */
INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) | |
{ | |
return vcombine_f32(vpadd_f32(vget_low_f32(a), vget_high_f32(a)), vpadd_f32(vget_low_f32(b), vget_high_f32(b))); | |
} | |
/*************************************************************************** | |
* Multiply | |
***************************************************************************/ | |
/* Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b. | |
* r0 := (a0 * b0)[31:16] | |
* r1 := (a1 * b1)[31:16] | |
* ... | |
* r7 := (a7 * b7)[31:16] | |
* */ | |
INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) | |
{ | |
return (__m128i)vshrq_n_s16(vqdmulhq_s16((int16x8_t)a, (int16x8_t)b), 1); | |
} | |
/* Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or unsigned 16-bit integers from b. | |
* r0 := (a0 * b0)[15:0] | |
* r1 := (a1 * b1)[15:0] | |
* ... | |
* r7 := (a7 * b7)[15:0] | |
* */ | |
INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) | |
{ | |
return (__m128i)vmulq_s16((int16x8_t)a, (int16x8_t)b); | |
} | |
/* Multiplies the four single-precision, floating-point values of a and b. | |
* r0 := a0 * b0 | |
* r1 := a1 * b1 | |
* r2 := a2 * b2 | |
* r3 := a3 * b3 | |
* */ | |
INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) | |
{ | |
// TODO: NEON flushes denormal results to zero, so vmulq_f32 can differ from SSE:
// NEON: (-2.33512e-28) * (-2.13992e-13) = 0
// SSE:  (-2.33512e-28) * (-2.13992e-13) = 4.99689e-41
// return vmulq_f32(a, b);
__m128 ret; | |
ret[0] = a[0]*b[0]; | |
ret[1] = a[1]*b[1]; | |
ret[2] = a[2]*b[2]; | |
ret[3] = a[3]*b[3]; | |
return ret; | |
} | |
/* Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b. | |
* __m128i _mm_madd_epi16 (__m128i a, __m128i b); | |
* PMADDWD | |
* Return Value | |
* Adds the signed 32-bit integer results pairwise and packs the 4 signed 32-bit integer results. | |
* r0 := (a0 * b0) + (a1 * b1) | |
* r1 := (a2 * b2) + (a3 * b3) | |
* r2 := (a4 * b4) + (a5 * b5) | |
* r3 := (a6 * b6) + (a7 * b7) | |
* */ | |
INLINE __m128i _mm_madd_epi16 (__m128i a, __m128i b) | |
{ | |
int32x4_t r_l = vmull_s16(vget_low_s16((int16x8_t)a), vget_low_s16((int16x8_t)b)); | |
int32x4_t r_h = vmull_s16(vget_high_s16((int16x8_t)a), vget_high_s16((int16x8_t)b)); | |
return vcombine_s32(vpadd_s32(vget_low_s32(r_l), vget_high_s32(r_l)), vpadd_s32(vget_low_s32(r_h), vget_high_s32(r_h))); | |
} | |
/*************************************************************************** | |
* absdiff | |
***************************************************************************/ | |
//#define _mm_absdiff_epu16(a,b) _mm_adds_epu16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)) | |
/* Computes the absolute difference of the 16 unsigned 8-bit integers from a and the 16 unsigned 8-bit integers from b. | |
* __m128i _mm_sad_epu8 (__m128i a, __m128i b); | |
* PSADBW | |
* Return Value | |
* Sums the upper 8 differences and lower 8 differences and packs the resulting 2 unsigned 16-bit integers into the upper and lower 64-bit elements. | |
* r0 := abs(a0 - b0) + abs(a1 - b1) +...+ abs(a7 - b7) | |
* r1 := 0x0 ; r2 := 0x0 ; r3 := 0x0 | |
* r4 := abs(a8 - b8) + abs(a9 - b9) +...+ abs(a15 - b15) | |
* r5 := 0x0 ; r6 := 0x0 ; r7 := 0x0 | |
* */ | |
INLINE __m128i _mm_sad_epu8 (__m128i a, __m128i b) | |
{ | |
uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); | |
uint16_t r0 = t[0]+t[1]+t[2]+t[3]; | |
uint16_t r4 = t[4]+t[5]+t[6]+t[7]; | |
uint16x8_t r = vsetq_lane_u16(r0,vdupq_n_u16(0),0); | |
return (__m128i)vsetq_lane_u16(r4,r,4); | |
} | |
/*************************************************************************** | |
* divides | |
***************************************************************************/ | |
/* r0 := a0 / b0 | |
* r1 := a1 / b1 | |
* r2 := a2 / b2 | |
* r3 := a3 / b3 | |
* */ | |
INLINE __m128 _mm_div_ps(__m128 a, __m128 b ) | |
{ | |
// get an initial estimate of 1/b. | |
float32x4_t reciprocal = vrecpeq_f32(b); | |
// use a couple Newton-Raphson steps to refine the estimate. Depending on your | |
// application's accuracy requirements, you may be able to get away with only | |
// one refinement (instead of the two used here). Be sure to test! | |
reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); | |
// reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal); | |
// and finally, compute a/b = a*(1/b) | |
float32x4_t result = vmulq_f32(a,reciprocal); | |
return result; | |
} | |
/* Computes the square roots of the four single-precision, floating-point values of a. | |
* r0 := sqrt(a0) | |
* r1 := sqrt(a1) | |
* r2 := sqrt(a2) | |
* r3 := sqrt(a3) | |
* */ | |
INLINE __m128 _mm_sqrt_ps(__m128 in) | |
{ | |
__m128 recipsq = vrsqrteq_f32(in); | |
__m128 sq = vrecpeq_f32(recipsq); | |
// ??? use step versions of both sqrt and recip for better accuracy? | |
//precision loss | |
// __m128 recipsq = vrsqrtsq_f32(in,vdupq_n_f32(1.0)); | |
// __m128 sq = vrecpsq_f32(recipsq,vdupq_n_f32(1.0)); | |
return sq; | |
} | |
/*************************************************************************** | |
* logic | |
***************************************************************************/ | |
/* Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. | |
* r := a | b | |
* */ | |
INLINE __m128i _mm_or_si128(__m128i a, __m128i b) | |
{ | |
return (__m128i)vorrq_s32(a, b); | |
} | |
/* Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b. | |
* r := a ^ b | |
* */ | |
INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) | |
{ | |
return veorq_s32(a, b); | |
} | |
/* Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the 128-bit value in a. | |
* r := (~a) & b | |
* */ | |
INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) | |
{ | |
return vbicq_s32(b, a); // *NOTE* argument swap | |
} | |
/* Computes the bitwise AND-NOT of the four single-precision, floating-point values of a and b. | |
* r0 := ~a0 & b0 | |
* r1 := ~a1 & b1 | |
* r2 := ~a2 & b2 | |
* r3 := ~a3 & b3 | |
* */ | |
INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) | |
{ | |
return (__m128)vbicq_s32((__m128i)b, (__m128i)a); // *NOTE* argument swap | |
} | |
/* Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b. | |
* r := a & b | |
* */ | |
INLINE __m128i _mm_and_si128(__m128i a, __m128i b) | |
{ | |
return vandq_s32(a, b); | |
} | |
/* Computes the bitwise AND of the four single-precision, floating-point values of a and b. | |
* r0 := a0 & b0 | |
* r1 := a1 & b1 | |
* r2 := a2 & b2 | |
* r3 := a3 & b3 | |
* */ | |
INLINE __m128 _mm_and_ps(__m128 a, __m128 b) | |
{ | |
return (__m128)vandq_s32((__m128i)a, (__m128i)b); | |
} | |
/* Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers in b for greater than. | |
* r0 := (a0 > b0) ? 0xff : 0x0 | |
* r1 := (a1 > b1) ? 0xff : 0x0 | |
* ... | |
* r15 := (a15 > b15) ? 0xff : 0x0 | |
*/ | |
INLINE __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b) | |
{ | |
return (__m128i)vcgtq_s8((int8x16_t) a,( int8x16_t) b); | |
} | |
/* Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers in b for greater than. | |
* r0 := (a0 > b0) ? 0xffff : 0x0 | |
* r1 := (a1 > b1) ? 0xffff : 0x0 | |
* ... | |
* r7 := (a7 > b7) ? 0xffff : 0x0 | |
* */ | |
INLINE __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b) | |
{ | |
return (__m128i)vcgtq_s16((int16x8_t) a,( int16x8_t) b); | |
} | |
/* Compares for greater than. | |
* r0 := (a0 > b0) ? 0xffffffff : 0x0 | |
* r1 := (a1 > b1) ? 0xffffffff : 0x0 | |
* r2 := (a2 > b2) ? 0xffffffff : 0x0 | |
* r3 := (a3 > b3) ? 0xffffffff : 0x0 | |
* */ | |
INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) | |
{ | |
return (__m128)vcgtq_f32(a, b); | |
} | |
/* Compares for less than or equal. | |
* r0 := (a0 <= b0) ? 0xffffffff : 0x0 | |
* r1 := (a1 <= b1) ? 0xffffffff : 0x0 | |
* r2 := (a2 <= b2) ? 0xffffffff : 0x0 | |
* r3 := (a3 <= b3) ? 0xffffffff : 0x0 | |
* */ | |
INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) | |
{ | |
return (__m128)vcleq_f32(a, b); | |
} | |
/*************************************************************************** | |
* load and store | |
***************************************************************************/ | |
INLINE __m128i _mm_load_si128(const __m128i *p) | |
{ | |
return vld1q_s32((int32_t *)p); | |
} | |
INLINE void _mm_store_si128(__m128i *p, __m128i a ) | |
{ | |
vst1q_s32((int32_t*) p,a); | |
} | |
INLINE __m128 _mm_load_ps(const float * p) | |
{ | |
return vld1q_f32(p); | |
} | |
INLINE void _mm_store_ps(float *p, __m128 a) | |
{ | |
vst1q_f32(p, a); | |
} | |
#define _mm_loadu_si128 _mm_load_si128 | |
#define _mm_storeu_si128 _mm_store_si128 | |
#define _mm_loadu_ps _mm_load_ps | |
#define _mm_storeu_ps _mm_store_ps | |
INLINE __m128i _mm_loadl_epi64(__m128i const*p) | |
{ | |
/* Load the lower 64 bits of the value pointed to by p into the lower 64 bits of the result, zeroing the upper 64 bits of the result. */ | |
return vcombine_s32(vld1_s32((int32_t const *)p),vcreate_s32(0)); | |
} | |
INLINE void _mm_storel_epi64(__m128i* a, __m128i b) | |
{ | |
/* Reads the lower 64 bits of b and stores them into the lower 64 bits of a. */ | |
// | |
//*a = (__m128i)vsetq_lane_s64((int64_t)vget_low_s32(b), *(int64x2_t*)a, 0); | |
//vst1_s64( (int64_t *) a, vget_low_s64((int64x2_t)b)); | |
vst1_s32( (int32_t *) a, vget_low_s32((int32x4_t)b)); | |
} | |
/* Sets the lower two single-precision, floating-point values with 64 bits of data loaded from the address p; the upper two values are passed through from a. | |
* __m128 _mm_loadl_pi( __m128 a , __m64 * p ); | |
* MOVLPS reg, mem | |
* Return Value | |
* r0 := *p0 | |
* r1 := *p1 | |
* r2 := a2 | |
* r3 := a3 | |
* */ | |
INLINE __m128 _mm_loadl_pi( __m128 a , __m64 const * p ) | |
{ | |
return vcombine_f32(vld1_f32((float32_t const *)p),vget_high_f32(a)); | |
} | |
/* Stores the lower two single-precision, floating-point values of a to the address p. | |
* *p0 := b0 | |
* *p1 := b1 | |
* */ | |
INLINE void _mm_storel_pi( __m64 * p , __m128 a ) | |
{ | |
vst1_f32((float32_t *)p, vget_low_f32((float32x4_t)a)); | |
} | |
INLINE __m128 _mm_load_ss(const float * p) | |
{ | |
/* Loads a single-precision, floating-point value into the low word and clears the upper three words. */
__m128 result = vdupq_n_f32(0); | |
return vsetq_lane_f32(*p, result, 0); | |
} | |
INLINE void _mm_store_ss(float *p, __m128 a) | |
{ | |
/* Stores the lower single-precision, floating-point value. */ | |
vst1q_lane_f32(p, a, 0); | |
} | |
/*************************************************************************** | |
* SET | |
***************************************************************************/ | |
/* Moves 32-bit integer a to the least significant 32 bits of an __m128 object, zero extending the upper bits. | |
* r0 := a | |
* r1 := 0x0 ; r2 := 0x0 ; r3 := 0x0 | |
* */ | |
INLINE __m128i _mm_cvtsi32_si128(int a) | |
{ | |
__m128i result = vdupq_n_s32(0); | |
return vsetq_lane_s32(a, result, 0); | |
} | |
/* Sets the 16 signed 8-bit integer values to b. | |
* r0 := b | |
* r1 := b | |
* ... | |
* r15 := b | |
* */ | |
INLINE __m128i _mm_set1_epi8 (char b) | |
{ | |
return (__m128i)vdupq_n_s8((int8_t)b); | |
} | |
/* Sets the 8 signed 16-bit integer values to w. | |
* r0 := w | |
* r1 := w | |
* ... | |
* r7 := w | |
* */ | |
INLINE __m128i _mm_set1_epi16 (short w) | |
{ | |
return (__m128i)vdupq_n_s16((int16_t)w); | |
} | |
/* Sets the 4 signed 32-bit integer values to i. | |
* r0 := i | |
* r1 := i | |
* r2 := i | |
* r3 := i
* */ | |
INLINE __m128i _mm_set1_epi32(int i) | |
{ | |
return vdupq_n_s32(i); | |
} | |
/* Sets the four single-precision, floating-point values to w | |
* r0 := r1 := r2 := r3 := w | |
* */ | |
INLINE __m128 _mm_set1_ps(float w) | |
{ | |
return vdupq_n_f32(w); | |
} | |
#define _mm_set_ps1 _mm_set1_ps | |
INLINE __m128 _mm_set_ps(float w, float z, float y, float x) | |
{ | |
float __attribute__((aligned(16))) data[4] = {x,y,z,w}; | |
return vld1q_f32(data); | |
} | |
/* Sets the 8 signed 16-bit integer values in reverse order. | |
* __m128i _mm_setr_epi16 (short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7); | |
* (composite) | |
* Return Value | |
* r0 := w0 | |
* r1 := w1 | |
* ... | |
* r7 := w7 | |
* */ | |
INLINE __m128i _mm_setr_epi16 (short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) | |
{ | |
short __attribute__ ((aligned (16))) data[8] = { w0, w1, w2, w3, w4, w5, w6, w7 }; | |
return (__m128i)vld1q_s16((int16_t*)data); | |
} | |
//todo ~~~~~~~~~~~~~~~~~~~~~~~~~ | |
/* Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. */
INLINE __m128i _mm_shuffle_epi32 (__m128i a, int imm) | |
{ | |
__m128i ret; | |
switch (imm) | |
{ | |
case 0 : | |
return vdupq_n_s32(vgetq_lane_s32(a, 0)); | |
break; | |
default: | |
// __m128i ret = vdupq_n_s32(0); | |
ret[0] = a[imm & 0x3]; | |
ret[1] = a[(imm >> 2) & 0x3]; | |
ret[2] = a[(imm >> 4) & 0x03]; | |
ret[3] = a[(imm >> 6) & 0x03]; | |
return ret; | |
} | |
} | |
INLINE __m128i _mm_shufflelo_epi16 (__m128i a, int imm)
{
// Shuffle only the lower four 16-bit words of a by imm; the upper four pass through unchanged.
int16x8_t v = (int16x8_t)a, r = v;
r[0] = v[imm & 0x3]; r[1] = v[(imm >> 2) & 0x3];
r[2] = v[(imm >> 4) & 0x3]; r[3] = v[(imm >> 6) & 0x3];
return (__m128i)r;
}
//todo ~~~~~~~~~~~~~~~~~~~~~~~~~ | |
/* Selects four specific single-precision, floating-point values from a and b, based on the mask i. */ | |
INLINE __m128 _mm_shuffle_ps(__m128 a , __m128 b , int i ) | |
{ | |
__m128 ret; | |
switch (i) | |
{ | |
// case 0 : | |
// return | |
// break; | |
default: | |
ret[0] = a[i & 0x3]; | |
ret[1] = a[(i >> 2) & 0x3]; | |
ret[2] = b[(i >> 4) & 0x03]; | |
ret[3] = b[(i >> 6) & 0x03]; | |
return ret; | |
} | |
} | |
/*************************************************************************** | |
* GET | |
***************************************************************************/ | |
INLINE int _mm_cvtsi128_si32(__m128i a) | |
{ | |
/* Moves the least significant 32 bits of a to a 32-bit integer. */ | |
return vgetq_lane_s32(a, 0); | |
} | |
/* Sets the 128-bit value to zero. | |
* r := 0x0 | |
* */ | |
INLINE __m128i _mm_setzero_si128() | |
{ | |
return vdupq_n_s32(0); | |
} | |
/* Clears the four single-precision, floating-point values. | |
* r0 := r1 := r2 := r3 := 0.0 | |
* */ | |
INLINE __m128 _mm_setzero_ps(void) | |
{ | |
return vdupq_n_f32(0); | |
} | |
/*************************************************************************** | |
* convert | |
***************************************************************************/ | |
/* Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers and saturates. | |
* r0 := SignedSaturate(a0) | |
* r1 := SignedSaturate(a1) | |
* r2 := SignedSaturate(a2) | |
* r3 := SignedSaturate(a3) | |
* r4 := SignedSaturate(b0) | |
* r5 := SignedSaturate(b1) | |
* r6 := SignedSaturate(b2) | |
* r7 := SignedSaturate(b3) | |
* */ | |
INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) | |
{ | |
return (__m128i)vcombine_s16(vqmovn_s32(a), vqmovn_s32(b)); | |
} | |
/* Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper 8 signed or unsigned 8-bit integers in b. | |
* r0 := a8 ; r1 := b8 | |
* r2 := a9 ; r3 := b9 | |
* ... | |
* r14 := a15 ; r15 := b15 | |
* */ | |
INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) | |
{ | |
int8x8_t a_h = vget_high_s8((int8x16_t)a); | |
int8x8_t b_h = vget_high_s8((int8x16_t)b); | |
int8x8x2_t r = vzip_s8(a_h, b_h); | |
return (__m128i)vcombine_s8(r.val[0],r.val[1]); | |
} | |
/* Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower 8 signed or unsigned 8-bit integers in b. | |
* r0 := a0 ; r1 := b0 | |
* r2 := a1 ; r3 := b1 | |
* ... | |
* r14 := a7 ; r15 := b7 | |
* */ | |
INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) | |
{ | |
int8x8_t a_l = vget_low_s8((int8x16_t)a); | |
int8x8_t b_l = vget_low_s8((int8x16_t)b); | |
int8x8x2_t r = vzip_s8(a_l, b_l); | |
return (__m128i)vcombine_s8(r.val[0],r.val[1]); | |
} | |
/* Interleaves the lower 2 signed or unsigned 32-bit integers in a with the lower 2 signed or unsigned 32-bit integers in b. | |
* r0 := a0 ; r1 := b0 | |
* r2 := a1 ; r3 := b1 | |
* */ | |
INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) | |
{ | |
int32x2_t a_l = vget_low_s32((int32x4_t)a); | |
int32x2_t b_l = vget_low_s32((int32x4_t)b); | |
int32x2x2_t r = vzip_s32(a_l, b_l); | |
return (__m128i)vcombine_s32(r.val[0],r.val[1]); | |
} | |
INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) | |
{ | |
int64x1_t a_l = vget_low_s64((int64x2_t)a); | |
int64x1_t b_l = vget_low_s64((int64x2_t)b); | |
return (__m128i)vcombine_s64(a_l, b_l); | |
} | |
/* Interleaves the upper signed or unsigned 64-bit integer in a with the upper signed or unsigned 64-bit integer in b. | |
* __m128i _mm_unpackhi_epi64 (__m128i a, __m128i b); | |
* PUNPCKHQDQ | |
* Return Value | |
* r0 := a1 ; r1 := b1 | |
* */ | |
INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) | |
{ | |
int64x1_t a_h = vget_high_s64((int64x2_t)a); | |
int64x1_t b_h = vget_high_s64((int64x2_t)b); | |
return (__m128i)vcombine_s64(a_h, b_h); | |
} | |
/* Interleaves the upper 4 signed or unsigned 16-bit integers in a with the upper 4 signed or unsigned 16-bit integers in b. | |
* r0 := a4 ; r1 := b4 | |
* r2 := a5 ; r3 := b5 | |
* r4 := a6 ; r5 := b6 | |
* r6 := a7 ; r7 := b7 | |
* */ | |
INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) | |
{ | |
int16x4_t a_h = vget_high_s16((int16x8_t)a); | |
int16x4_t b_h = vget_high_s16((int16x8_t)b); | |
int16x4x2_t result = vzip_s16(a_h, b_h); | |
return (__m128i)vcombine_s16(result.val[0], result.val[1]); | |
} | |
/* Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b. | |
* r0 := a0 ; r1 := b0 | |
* r2 := a1 ; r3 := b1 | |
* r4 := a2 ; r5 := b2 | |
* r6 := a3 ; r7 := b3 | |
* */ | |
INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) | |
{ | |
int16x4_t a_l = vget_low_s16((int16x8_t)a); | |
int16x4_t b_l = vget_low_s16((int16x8_t)b); | |
int16x4x2_t result = vzip_s16(a_l, b_l); | |
return (__m128i)vcombine_s16(result.val[0], result.val[1]); | |
} | |
INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) | |
{ | |
int32x2_t a1 = vget_high_s32(a); | |
int32x2_t b1 = vget_high_s32(b); | |
int32x2x2_t result = vzip_s32(a1, b1); | |
return vcombine_s32(result.val[0], result.val[1]); | |
} | |
/* Selects and interleaves the lower two single-precision, floating-point values from a and b. | |
* r0 := a0 | |
* r1 := b0 | |
* r2 := a1 | |
* r3 := b1 | |
* */ | |
INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) | |
{ | |
float32x2x2_t result = vzip_f32(vget_low_f32(a), vget_low_f32(b)); | |
return vcombine_f32(result.val[0], result.val[1]); | |
} | |
/* Selects and interleaves the upper two single-precision, floating-point values from a and b. | |
* r0 := a2 | |
* r1 := b2 | |
* r2 := a3 | |
* r3 := b3 | |
* */ | |
INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) | |
{ | |
float32x2x2_t result = vzip_f32(vget_high_f32(a), vget_high_f32(b)); | |
return vcombine_f32(result.val[0], result.val[1]); | |
} | |
INLINE __m128 _mm_cvtepi32_ps(__m128i a) | |
{ | |
return vcvtq_f32_s32(a); | |
} | |
/* Converts the four single-precision, floating-point values of a to signed 32-bit integer values. | |
* r0 := (int) a0 | |
* r1 := (int) a1 | |
* r2 := (int) a2 | |
* r3 := (int) a3 | |
* */ | |
INLINE __m128i _mm_cvtps_epi32(__m128 a) | |
{ | |
//todo:precision | |
//NaN -0 | |
// | |
return vcvtq_s32_f32(a); | |
// __m128i ret; | |
// ret[0] = (int64_t)a[0]; | |
// ret[1] = (int64_t)a[1]; | |
// ret[2] = (int64_t)a[2]; | |
// ret[3] = (int64_t)a[3]; | |
// return ret; | |
} | |
/* Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned integers and saturates. | |
* | |
* r0 := UnsignedSaturate(a0) | |
* r1 := UnsignedSaturate(a1) | |
* ... | |
* r7 := UnsignedSaturate(a7) | |
* r8 := UnsignedSaturate(b0) | |
* r9 := UnsignedSaturate(b1) | |
* ... | |
* r15 := UnsignedSaturate(b7) | |
* */ | |
INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) | |
{ | |
return (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)a), vqmovun_s16((int16x8_t)b)); | |
} | |
/* Moves the lower two single-precision, floating-point values of b to the upper two single-precision, floating-point values of the result. | |
* r3 := b1 | |
* r2 := b0 | |
* r1 := a1 | |
* r0 := a0 | |
* */ | |
INLINE __m128 _mm_movelh_ps( __m128 a, __m128 b ) | |
{ | |
return vcombine_f32(vget_low_f32(a),vget_low_f32(b)); | |
} | |
/* The upper two single-precision, floating-point values of a are passed through to the result. | |
* r3 := a3 | |
* r2 := a2 | |
* r1 := b3 | |
* r0 := b2 | |
* */ | |
INLINE __m128 _mm_movehl_ps( __m128 a, __m128 b ) | |
{ | |
return vcombine_f32(vget_high_f32(b),vget_high_f32(a)); | |
} | |
/*************************************************************************** | |
* shift | |
***************************************************************************/ | |
/* Shifts the 4 signed 32-bit integers in a right by count bits while shifting in the sign bit. | |
* r0 := a0 >> count | |
* r1 := a1 >> count | |
* r2 := a2 >> count | |
* r3 := a3 >> count | |
* immediate ,use #define _mm_srai_epi32(a, imm) vshrq_n_s32(a, imm) | |
* */ | |
INLINE __m128i _mm_srai_epi32 (__m128i a, int count) | |
{ | |
// return vshrq_n_s32(a, count); | |
// todo : | |
// if immediate | |
return vshlq_s32(a, vdupq_n_s32(-count)); | |
} | |
/* Shifts the 8 signed 16-bit integers in a right by count bits while shifting in the sign bit. | |
* r0 := a0 >> count | |
* r1 := a1 >> count | |
* ... | |
* r7 := a7 >> count | |
* */ | |
INLINE __m128i _mm_srai_epi16 (__m128i a, int count) | |
{ | |
// return vshrq_n_s16(a, count); | |
// todo : | |
// if immediate | |
return (__m128i)vshlq_s16((int16x8_t)a, vdupq_n_s16(-count)); | |
} | |
/* Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while shifting in zeros. | |
* r0 := a0 << count | |
* r1 := a1 << count | |
* ... | |
* r7 := a7 << count | |
* */ | |
INLINE __m128i _mm_slli_epi16(__m128i a, int count) | |
{ | |
// todo : | |
// if immediate | |
return (__m128i)vshlq_s16((int16x8_t)a, vdupq_n_s16(count)); | |
} | |
/* Shifts the 8 signed or unsigned 16-bit integers in a right by count bits while shifting in zeros. | |
* r0 := srl(a0, count) | |
* r1 := srl(a1, count) | |
* ... | |
* r7 := srl(a7, count) | |
* */ | |
INLINE __m128i _mm_srli_epi16(__m128i a, int count) | |
{ | |
// todo : | |
// if immediate | |
return (__m128i)vshlq_u16((uint16x8_t)a, vdupq_n_s16(-count)); | |
} | |
/* Shifts the 128-bit value in a right by imm bytes while shifting in zeros. imm must be an immediate. | |
* r := srl(a, imm*8) | |
* */ | |
#define _mm_srli_si128( a, imm ) (__m128i)vextq_s8((int8x16_t)a, vdupq_n_s8(0), (imm)) | |
/* Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm must be an immediate. | |
* r := a << (imm * 8)*/ | |
//todo ::imm =0, compile error | |
#define _mm_slli_si128( a, imm ) (__m128i)vextq_s8(vdupq_n_s8(0), (int8x16_t)a, 16 - (imm)) | |
#endif |
I'm not super comfortable with valgrind or gdb. If you have a gist of what I should run to get that info, that would be great. I'm setting up a fresh JetPack install and am going to go through it again just to make sure I didn't mess anything up while monkeying around.
cd build/rundir/RelWithDebInfo
LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libGL.so valgrind ./bin/64bit/obs
Then just paste the part of the output that shows where it segfaulted.
Or with gdb (type run at the prompt, then bt after it crashes):
LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libGL.so gdb ./bin/64bit/obs
run
bt
Then just paste that backtrace output into your comment here.
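If the interactive gdb prompt is awkward, one possible shortcut (assuming the stock gdb from the Ubuntu/JetPack repos) is batch mode, which launches obs, waits for the crash, and prints a full backtrace in one shot:
LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libGL.so gdb -batch -ex run -ex "bt full" ./bin/64bit/obs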
ldd bin/64bit/obs will tell us "some" things, but obs dlopens various libraries at runtime, and those won't show up in its output. Still, it'll tell us what the binary was linked against at build time, which could also be causing issues.
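To see what actually gets pulled in at runtime, including the dlopened modules, one option is glibc's loader tracing (the output is verbose), something like:
LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libGL.so LD_DEBUG=libs ./bin/64bit/obs 2>&1 | grep "find library"
Each "find library=" line names a library the dynamic loader resolved while obs was running.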
Thank you!