This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Constant buffer CB_PROJ; it declares a single matrix member named `camera`.
// NOTE(review): presumably the camera / projection transform read by the shader stages - confirm against the code that uses it.
cbuffer CB_PROJ | |
{ | |
matrix camera; | |
}; | |
struct VOut | |
{ | |
float3 position : POSITION; | |
float3 r_s : NORMAL; | |
uint bits : BLENDINDICES; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
static class Program | |
{ | |
// Make a random transaction between two people. | |
static void randomTransaction( ref int to, ref int from )
{
    // Transfer up to 5 units, but never more than the current value of `from`.
    const int maxTransfer = 5;
    int moved = ( from < maxTransfer ) ? from : maxTransfer;

    // Debit first, then credit; the order is kept so aliased references behave the same.
    from -= moved;
    to += moved;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__m128i convertMask( __m256d src ) | |
{ | |
// Bit-cast into fp32 vector, the intrinsic compiles into no instructions | |
const __m256 f32 = _mm256_castpd_ps( src ); | |
// Split into high/low halves; casting is free, vextractf128 is not. | |
const __m128 low = _mm256_castps256_ps128( f32 ); | |
const __m128 high = _mm256_extractf128_ps( f32, 1 ); | |
// Combine 32-bit values into a single vector with correct order | |
// _mm_shuffle_ps takes first 2 lanes from the first argument, last 2 lanes from the second argument. | |
const __m128 combined = _mm_shuffle_ps( low, high, _MM_SHUFFLE( 2, 0, 2, 0 ) ); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "pch.h" | |
#include "RenderDeviceGLImpl.hpp" | |
#include "../../../../NetCore/ModeSet/API/eglContext.h" | |
#include <EGL/egl.h> | |
#include <EGL/eglext.h> | |
#define GL_GLEXT_PROTOTYPES | |
#include <GLES2/gl2.h> | |
#include <GLES2/gl2ext.h> | |
#include <libdrm/drm_fourcc.h> | |
#include <string> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void matVecMult81( float *pDst, const float *pMat, const float *pVec, size_t nRows = 90000 ) | |
{ | |
// 30 vector registers in total; ARM64 has 32 of them, so we're good. | |
float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31, vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63, vec64_67, vec68_71, vec72_75, vec76_79, vec80; | |
float32x4_t mat0, mat1, mat2, mat3, mat4; | |
float32x4_t res0, res1, res2, res3; | |
vec80 = mat4 = vdupq_n_f32( 0.0f ); | |
// Load 16 numbers from pVec into 4 vector registers, incrementing the source pointer | |
#define LOAD_VEC_16( v0, v1, v2, v3 ) \ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
static class C
{
    /// <summary>Returns <paramref name="i"/> divided by two, using integer division that truncates toward zero.</summary>
    static int div2( int i ) => i / 2;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <vector> | |
#include <assert.h> | |
#include <immintrin.h> | |
struct data | |
{ | |
std::vector<int8_t> byteVals; // byteVals[i] == -128 means look in intVals | |
std::vector<int> intVals; // length is number of -128 values in byteVals | |
void push_back( int v ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Runtime.CompilerServices; | |
using System.Runtime.Intrinsics; | |
using System.Runtime.Intrinsics.Arm; | |
static class MotionDetectNeon | |
{ | |
/// <summary>Compute absolute difference between a and b, count the elements with difference above the threshold.</summary> | |
[MethodImpl( MethodImplOptions.AggressiveInlining )] | |
static Vector128<int> countAboveThreshold( Vector128<byte> a, Vector128<byte> b, Vector128<byte> threshold, Vector128<int> acc ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
int NeonTest( const uint8_t* lhs, const uint8_t* rhs, size_t count ) | |
{ | |
// If the length is not a multiple of 16, more code is needed to handle the remainder | |
assert( 0 == ( count % 16 ) ); | |
const uint8_t* const lhsEnd = lhs + count; | |
int32x4_t acc = vdupq_n_s32( 0 ); | |
// The threshold is a power of 2, so a bit test implements the comparison v >= 16 | |
const uint8x16_t thresholdBitMask = vdupq_n_u8( 0xF0 ); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Store 10-bit pieces from 16-bit lanes of the AVX2 vector, with truncation. | |
// The function writes 20 bytes to the pointer. | |
inline void store_10x16_avx2( __m256i v, uint8_t* rdi ) | |
{ | |
__m256i low, high; | |
// Pack pairs of 10 bits into 20 | |
low = _mm256_slli_epi16( v, 6 ); | |
v = _mm256_blend_epi16( v, low, 0b01010101 ); | |
// Now the vector contains 32-bit lanes with 20 payload bits / each in the middle of them |