Skip to content

Instantly share code, notes, and snippets.

#include <stdlib.h>
#include <stdio.h>
#include <random>
#include <vector>
#include <unordered_map>
#include <algorithm>
#include <optional>
#include <intrin.h>
#include <inttypes.h>
using System.Linq.Expressions;
using System.Reflection;
using System.Runtime.CompilerServices;
static class ReflectTest
{
/// <summary>Generic method to call</summary>
public static T GetValue<T>( T value )
{
return value;
#include <stdlib.h>
#include <vector>
#include <intrin.h>
#include <stdint.h>
#include <inttypes.h>
std::vector<char> makeTestVector( bool random )
{
std::vector<char> result;
result.resize( 1024 * 16 );
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
#include <array>
#include <immintrin.h>
#include <assert.h>
#include <float.h>
// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi )
{
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
#include <array>
#include <immintrin.h>
#include <assert.h>
#include <float.h>
// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi )
{
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
#include <array>
#include <immintrin.h>
#include <assert.h>
#include <float.h>
// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi )
{
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
#include <array>
#include <immintrin.h>
// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi )
{
// Load 16 bytes from memory
__m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );
#include <immintrin.h>
// Compute the product of a width*16 column-major matrix and a vector of length `width`;
// the result is a vector of length 16
// BTW, according to godbolt.org, gcc does better than clang for this code.
void multiplyInner_avx16( const float* mat, const float* vec, size_t width, float* rdi )
{
// Using 4 accumulators per row, 4*16=64 scalars in 8 AVX vectors
__m256 a00 = _mm256_setzero_ps();
__m256 a01 = _mm256_setzero_ps();
#include <immintrin.h>
// Compute the product of a width*16 column-major matrix and a vector of length `width`;
// the result is a vector of length 16
void multiplyInner_avx16( const float* mat, const float* vec, size_t width, float* rdi )
{
// Using 2 accumulators per row to work around the data dependency on the accumulators
// Initialize the accumulators
__m256 a00 = _mm256_setzero_ps();
using System.Runtime.InteropServices;
using Whisper;
/// <summary>This class demonstrates how to implement the iAudioBuffer COM interface in C#, to supply audio samples produced by managed code</summary>
/// <remarks>The library requires these samples to be <c>float</c> numbers at a 16 kHz sample rate</remarks>
sealed class AudioBuffer: iAudioBuffer
{
void IDisposable.Dispose()
{
free();