Skip to content

Instantly share code, notes, and snippets.

// NOTE(review): this header region was duplicated by a bad paste — `sourceDataPath`
// was defined twice (a redefinition error) and the #define/#include/using group
// appeared twice. Deduplicated below; no names or values changed.
// _CRT_SECURE_NO_WARNINGS only has an effect when defined before the first CRT
// header is included, so it now precedes the includes.
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <chrono>
#include <immintrin.h>
#include <assert.h>
#include <stdint.h>
#include <emmintrin.h> // SSE 2
#include <tmmintrin.h> // SSSE 3
#include <smmintrin.h> // SSE 4.1
// NOTE(review): file-scope `using namespace` is an anti-pattern, but later
// fragments appear to rely on it, so it is kept as-is.
using namespace std;
using namespace std::chrono;
// Path of the CSV file holding the test vectors read by this benchmark.
static const char* const sourceDataPath = R"(C:\Temp\2remove\vectors.csv)";
// Number of elements in the test data set.
constexpr int SIZE = 640000;
// Vector constants for dot4Sse function
// NOTE(review): this struct is truncated in this paste — the closing `};` and any
// further members are missing; the fragment is interrupted by unrelated #includes.
struct ConstantVectorsSse
{
// Presumably per-lane coefficients for the dot product — TODO confirm against dot4Sse
__m128i abcd;
// Name suggests a mask isolating the low 4 bits of each byte (typical pshufb
// lookup setup) — NOTE(review): confirm; the initializer is not visible here.
__m128i lowNibbleMask;
#include <stdio.h>
#include <vector>
#include <set>
// Global toggle for diagnostic logging; disabled by default.
static bool s_log = false;
// Print a diagnostic line to stdout, but only when s_log is enabled.
// NOTE(review): truncated fragment — the closing braces of the `if` and of the
// function body are missing before the next pasted snippet begins.
void message( const char* what )
{
if( s_log )
{
printf( "%s\n", what );
// NOTE(review): truncated fragment — the preceding `message` function is never
// closed, `Invocation` is declared elsewhere (not visible here), and the body
// below stops mid-definition.
std::vector<std::string> someFunction( const Invocation& invocation )
{
// Define hash and comparison for string pointers, by value
// Functor usable as both Hash and KeyEqual for unordered containers keyed by
// `const std::string*`: hashes/compares the pointed-to strings, not the pointers.
struct StringPtrTraits
{
// Hash the pointed-to string's value (not the pointer address).
size_t operator()( const std::string* rsi ) const
{
return std::hash<std::string>()( *rsi );
}
// Equality of the pointed-to string values — NOTE(review): the body of this
// operator is missing in this paste; the fragment is cut here.
bool operator()( const std::string* a, const std::string* b ) const
#include <immintrin.h>
#include <stdint.h>
// 1 = use `vpgatherdq` to load 4 numbers with 1 instruction, 0 = load them with scalar loads
// It seems on AMD CPUs scalar loads are slightly faster
#define USE_GATHER_INSTUCTIONS 0
// Inclusive prefix sum of unsigned bytes = offsets of the end of the numbers
// When the sum of all bytes exceeds 0xFF, the output is garbage
// Which is fine here because our bytes are in [0..8] interval
#include <stdint.h>
#include <immintrin.h>
#include <intrin.h>
#include <stdio.h>
// Count of set bits in `plus` minus count of set bits in `minus`
// The result is in [ -32 .. +32 ] interval
// NOTE(review): truncated fragment — only the first popcount survives; the use
// of `minus` and the return statement are missing in this paste. Uses the MSVC
// __popcnt intrinsic (requires POPCNT hardware support).
inline int popCntDiff( uint32_t plus, uint32_t minus )
{
plus = __popcnt( plus );
// Transform 4 inputs with 4 lookup tables, making 4 outputs
// The 4 inputs are packed in uint32_t value, each byte is expected to be in [ 0 .. 15 ] interval
// The 4 tables are in a single AVX2 vector
// NOTE(review): truncated fragment — the body is cut right after the comment
// about computing shift amounts; the actual table lookup and the return are
// missing in this paste.
uint32_t applyLookup4( uint32_t i4, __m256i tables4 )
{
// Move 4 bytes into SSE vector
__m128i bytes = _mm_cvtsi32_si128( (int)i4 );
// Expand bytes into uint64_t lanes
__m256i v = _mm256_cvtepu8_epi64( bytes );
// Multiply them by 4 to get shift amounts in bits
#include <stdlib.h>
#include <stdio.h>
#include <random>
#include <vector>
#include <unordered_map>
#include <algorithm>
#include <optional>
#include <intrin.h>
#include <inttypes.h>
using System.Linq.Expressions;
using System.Reflection;
using System.Runtime.CompilerServices;
// NOTE(review): this is a C# fragment pasted into what is otherwise C++ source;
// it is also truncated — the closing braces of GetValue and of the class are
// missing. It does not belong in this file.
static class ReflectTest
{
/// <summary>Generic method to call</summary>
public static T GetValue<T>( T value )
{
return value;