Skip to content

Instantly share code, notes, and snippets.

// Constant buffer exposing a single 4x4 float matrix named `camera` to the shader.
cbuffer CB_PROJ
{
	// `float4x4` is the explicit spelling of the default `matrix` type.
	float4x4 camera;
};
struct VOut
{
float3 position : POSITION;
float3 r_s : NORMAL;
uint bits : BLENDINDICES;
static class Program
{
// Make a random transaction between two people.
// Moves the smaller of 5 and the current value of `from` out of `from` and into `to`.
static void randomTransaction( ref int to, ref int from )
{
	const int maxTransfer = 5;

	// Start from the fixed cap and lower it when the sender holds less than that.
	int moved = maxTransfer;
	if( from < moved )
	{
		moved = from;
	}

	// Debit the sender first, then credit the receiver.
	from -= moved;
	to += moved;
}
__m128i convertMask( __m256d src )
{
// Bit-cast into fp32 vector, the intrinsic compiles into no instructions
const __m256 f32 = _mm256_castpd_ps( src );
// Split into high/low halves; casting is free, vextractf128 is not.
const __m128 low = _mm256_castps256_ps128( f32 );
const __m128 high = _mm256_extractf128_ps( f32, 1 );
// Combine 32-bit values into a single vector with correct order
// _mm_shuffle_ps takes the first 2 lanes from the first argument, and the last 2 lanes from the second argument.
const __m128 combined = _mm_shuffle_ps( low, high, _MM_SHUFFLE( 2, 0, 2, 0 ) );
#include "pch.h"
#include "RenderDeviceGLImpl.hpp"
#include "../../../../NetCore/ModeSet/API/eglContext.h"
#include <EGL/egl.h>
#include <EGL/eglext.h>
#define GL_GLEXT_PROTOTYPES
#include <GLES2/gl2.h>
#include <GLES2/gl2ext.h>
#include <libdrm/drm_fourcc.h>
#include <string>
void matVecMult81( float *pDst, const float *pMat, const float *pVec, size_t nRows = 90000 )
{
// 30 vector registers in total; ARM64 has 32 of them, so we're good.
float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31, vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63, vec64_67, vec68_71, vec72_75, vec76_79, vec80;
float32x4_t mat0, mat1, mat2, mat3, mat4;
float32x4_t res0, res1, res2, res3;
vec80 = mat4 = vdupq_n_f32( 0.0f );
// Load 16 numbers from pVec into 4 vector registers, incrementing the source pointer
#define LOAD_VEC_16( v0, v1, v2, v3 ) \
// Holder for a single signed divide-by-two helper.
static class C
{
	/// <summary>
	/// Divides <paramref name="i"/> by 2 with integer division.
	/// The quotient is truncated toward zero, so 7 gives 3 and -7 gives -3.
	/// </summary>
	/// <param name="i">Signed value to halve.</param>
	/// <returns>The truncated quotient <c>i / 2</c>.</returns>
	static int div2( int i ) => i / 2;
}
#include <vector>
#include <assert.h>
#include <immintrin.h>
struct data
{
std::vector<int8_t> byteVals; // byteVals[i] == -128 means look in intVals
std::vector<int> intVals; // length is number of -128 values in byteVals
void push_back( int v )
using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
static class MotionDetectNeon
{
/// <summary>Compute absolute difference between a and b, count the elements with difference above the threshold.</summary>
[MethodImpl( MethodImplOptions.AggressiveInlining )]
static Vector128<int> countAboveThreshold( Vector128<byte> a, Vector128<byte> b, Vector128<byte> threshold, Vector128<int> acc )
int NeonTest( const uint8_t* lhs, const uint8_t* rhs, size_t count )
{
// If the length is not a multiple of 16, you'll need more code to handle the remainder
assert( 0 == ( count % 16 ) );
const uint8_t* const lhsEnd = lhs + count;
int32x4_t acc = vdupq_n_s32( 0 );
// The threshold is a power of 2, so a bit test implements the comparison: v >= 16 exactly when any of the top 4 bits is set
const uint8x16_t thresholdBitMask = vdupq_n_u8( 0xF0 );
// Store 10-bit pieces from 16-bit lanes of the AVX2 vector, with truncation.
// The function writes 20 bytes to the pointer.
inline void store_10x16_avx2( __m256i v, uint8_t* rdi )
{
__m256i low, high;
// Pack pairs of 10 bits into 20
low = _mm256_slli_epi16( v, 6 );
v = _mm256_blend_epi16( v, low, 0b01010101 );
// Now the vector contains 32-bit lanes with 20 payload bits / each in the middle of them