Const-me · November 21, 2024 13:22
diff --git a/EigenDX.cpp b/EigenDX.cpp
 // Compiled with VS 2022: Release AMD64, AVX2 ISA, LTCG
 // RDTSC time on Ryzen 7 8700G for 1024 matrices: 15834 Eigen, 7224 DirectXMath
 // Compile-time switch read by multiplyMatrices(): true benchmarks the Eigen kernel,
 // false benchmarks the DirectXMath kernel.
 constexpr bool useEigen = true;

 // Eigen 3.4.0
 #include <Eigen/Eigen>
 // Multiplies two row-major 4x4 FP32 matrices with Eigen.
 //   rsi: 32 consecutive floats, the left-hand matrix followed by the right-hand matrix
 //   rdi: receives the 16 floats of the product
 // Both buffers are reinterpreted in place as Eigen matrices; nothing is copied.
 // NOTE(review): presumably relies on the buffers satisfying Eigen's alignment
 // requirement for fixed-size matrices - confirm against the callers.
 __forceinline void multiplyWithEigen( float* rdi, const float* rsi )
 {
 	using Matrix4 = Eigen::Matrix<float, 4, 4, Eigen::RowMajor>;
 	// The in-place reinterpretation is only meaningful when the matrix is exactly 16 packed floats
 	static_assert( sizeof( Matrix4 ) == 4 * 4 * sizeof( float ) );

 	// inputs[ 0 ] starts at rsi, inputs[ 1 ] starts 16 floats later
 	const Matrix4* const inputs = reinterpret_cast<const Matrix4*>( rsi );
 	Matrix4* const output = reinterpret_cast<Matrix4*>( rdi );

 	// noalias() states that the destination does not overlap the operands
 	output->noalias() = inputs[ 0 ] * inputs[ 1 ];
 }

 // DirectXMath current master
 #include "DirectXMath/DirectXMath.h"
 // Multiplies two 4x4 FP32 matrices with DirectXMath.
 //   rsi: 32 consecutive floats, the left-hand matrix followed by the right-hand matrix
 //   rdi: receives the 16 floats of the product
 __forceinline void multiplyWithDxMath( float* rdi, const float* rsi )
 {
 	using namespace DirectX;

 	// XMFLOAT4X4 holds 16 floats, so inputs + 1 is the matrix at rsi + 16
 	const XMFLOAT4X4* const inputs = reinterpret_cast<const XMFLOAT4X4*>( rsi );
 	const XMMATRIX a = XMLoadFloat4x4( inputs );
 	const XMMATRIX b = XMLoadFloat4x4( inputs + 1 );

 	XMStoreFloat4x4( reinterpret_cast<XMFLOAT4X4*>( rdi ), XMMatrixMultiply( a, b ) );
 }

 // Multiplies `count` pairs of 4x4 matrices with the kernel selected by useEigen.
 //   rsi: count * 32 floats; every 32 floats hold one [ lhs, rhs ] pair
 //   rdi: count * 16 floats; receives one product per pair, in the same order
 // Marked noinline, so the loop stays a separate function from its caller.
 static void __declspec( noinline ) multiplyMatrices( float* rdi, size_t count, const float* rsi )
 {
 	constexpr size_t floatsPerMatrix = 4 * 4;
 	constexpr size_t floatsPerPair = floatsPerMatrix * 2;

 	for( size_t i = 0; i < count; i++ )
 	{
 		float* const dest = rdi + i * floatsPerMatrix;
 		const float* const pair = rsi + i * floatsPerPair;

 		if constexpr( useEigen )
 			multiplyWithEigen( dest, pair );
 		else
 			multiplyWithDxMath( dest, pair );
 	}
 }

 #include <vector>
 #include <random>
 // Builds a vector of `length` random FP32 values.
 // Every element is drawn from std::uniform_real_distribution<float>( -1.0f, 1.0f ),
 // driven by a Mersenne Twister engine that is seeded once from std::random_device.
 std::vector<float> makeRandomVector( size_t length )
 {
 	std::random_device seedSource;
 	std::mt19937 engine( seedSource() );
 	std::uniform_real_distribution<float> unit( -1.0f, 1.0f );

 	std::vector<float> values( length );
 	for( size_t i = 0; i < length; i++ )
 		values[ i ] = unit( engine );
 	return values;
 }

 #include <stdio.h>
 // Benchmark entry point: multiplies 1024 pairs of random 4x4 matrices once, then prints
 // the difference between two RDTSC readings taken around the call, followed by the
 // last output value.
 int main()
 {
 	constexpr size_t countMatrices = 1024;
 	constexpr size_t floatsPerMatrix = 4 * 4;

 	// Two input matrices and one output matrix per product
 	const std::vector<float> source = makeRandomVector( floatsPerMatrix * 2 * countMatrices );
 	std::vector<float> result( floatsPerMatrix * countMatrices );

 	// NOTE(review): _ReadBarrier is presumably meant to keep the compiler from moving
 	// the timed call across the two RDTSC reads - confirm it is sufficient for that.
 	const uint64_t start = __rdtsc();
 	_ReadBarrier();

 	multiplyMatrices( result.data(), countMatrices, source.data() );

 	_ReadBarrier();
 	const uint64_t end = __rdtsc();

 	// The tick count is unsigned, so it is printed with %llu; the explicit cast keeps the
 	// argument type in sync with the format specifier.
 	// Printing one output value presumably keeps the computed results observable.
 	printf( "%llu cycles; %g\n", static_cast<unsigned long long>( end - start ), result.back() );
 	return 0;
 }
	// Compiled with VS 2022: Release AMD64, AVX2 ISA, LTCG
	// RDTSC time on Ryzen 7 8700G for 1024 matrices: 15834 Eigen, 7224 DirectXMath
	// Compile-time switch read by multiplyMatrices(): true benchmarks the Eigen kernel,
	// false benchmarks the DirectXMath kernel.
	constexpr bool useEigen = true;

	// Eigen 3.4.0
	#include <Eigen/Eigen>
	// Multiplies two row-major 4x4 FP32 matrices with Eigen.
	//   rsi: 32 consecutive floats, the left-hand matrix followed by the right-hand matrix
	//   rdi: receives the 16 floats of the product
	// NOTE(review): presumably relies on the buffers satisfying Eigen's alignment
	// requirement for fixed-size matrices - confirm against the callers.
	__forceinline void multiplyWithEigen( float* rdi, const float* rsi )
	{
		using Mat = Eigen::Matrix<float, 4, 4, Eigen::RowMajor>;
		// The in-place reinterpretation is only meaningful when the matrix is exactly 16 packed floats
		static_assert( sizeof( Mat ) == 4 * 4 * sizeof( float ) );

		// The references must alias the caller's buffers. Casting the pointer to a Mat
		// value instead would build a temporary matrix, and for `prod` that means the
		// product would be written to the temporary and never reach rdi.
		const Mat& lhs = *reinterpret_cast<const Mat*>( rsi );
		const Mat& rhs = *reinterpret_cast<const Mat*>( rsi + 16 );
		Mat& prod = *reinterpret_cast<Mat*>( rdi );

		// noalias() states that the destination does not overlap the operands
		prod.noalias() = lhs * rhs;
	}

	// DirectXMath current master
	#include "DirectXMath/DirectXMath.h"
	// Multiplies two 4x4 FP32 matrices with DirectXMath.
	//   rsi: 32 consecutive floats, the left-hand matrix followed by the right-hand matrix
	//   rdi: receives the 16 floats of the product
	__forceinline void multiplyWithDxMath( float* rdi, const float* rsi )
	{
		using namespace DirectX;

		// XMFLOAT4X4 holds 16 floats, so inputs + 1 is the matrix at rsi + 16
		const XMFLOAT4X4* const inputs = reinterpret_cast<const XMFLOAT4X4*>( rsi );
		const XMMATRIX a = XMLoadFloat4x4( inputs );
		const XMMATRIX b = XMLoadFloat4x4( inputs + 1 );

		XMStoreFloat4x4( reinterpret_cast<XMFLOAT4X4*>( rdi ), XMMatrixMultiply( a, b ) );
	}

	// Multiplies `count` pairs of 4x4 matrices with the kernel selected by useEigen.
	//   rsi: count * 32 floats; every 32 floats hold one [ lhs, rhs ] pair
	//   rdi: count * 16 floats; receives one product per pair, in the same order
	// Marked noinline, so the loop stays a separate function from its caller.
	static void __declspec( noinline ) multiplyMatrices( float* rdi, size_t count, const float* rsi )
	{
		constexpr size_t floatsPerMatrix = 4 * 4;
		constexpr size_t floatsPerPair = floatsPerMatrix * 2;

		for( size_t i = 0; i < count; i++ )
		{
			float* const dest = rdi + i * floatsPerMatrix;
			const float* const pair = rsi + i * floatsPerPair;

			if constexpr( useEigen )
				multiplyWithEigen( dest, pair );
			else
				multiplyWithDxMath( dest, pair );
		}
	}

	#include <vector>
	#include <random>
	// Builds a vector of `length` random FP32 values.
	// Every element is drawn from std::uniform_real_distribution<float>( -1.0f, 1.0f ),
	// driven by a Mersenne Twister engine that is seeded once from std::random_device.
	std::vector<float> makeRandomVector( size_t length )
	{
		std::random_device seedSource;
		std::mt19937 engine( seedSource() );
		std::uniform_real_distribution<float> unit( -1.0f, 1.0f );

		std::vector<float> values( length );
		for( size_t i = 0; i < length; i++ )
			values[ i ] = unit( engine );
		return values;
	}

	#include <stdio.h>
	// Benchmark entry point: multiplies 1024 pairs of random 4x4 matrices once, then prints
	// the difference between two RDTSC readings taken around the call, followed by the
	// last output value.
	int main()
	{
		constexpr size_t countMatrices = 1024;
		constexpr size_t floatsPerMatrix = 4 * 4;

		// Two input matrices and one output matrix per product
		const std::vector<float> source = makeRandomVector( floatsPerMatrix * 2 * countMatrices );
		std::vector<float> result( floatsPerMatrix * countMatrices );

		// NOTE(review): _ReadBarrier is presumably meant to keep the compiler from moving
		// the timed call across the two RDTSC reads - confirm it is sufficient for that.
		const uint64_t start = __rdtsc();
		_ReadBarrier();

		multiplyMatrices( result.data(), countMatrices, source.data() );

		_ReadBarrier();
		const uint64_t end = __rdtsc();

		// The tick count is unsigned, so it is printed with %llu; the explicit cast keeps the
		// argument type in sync with the format specifier.
		// Printing one output value presumably keeps the computed results observable.
		printf( "%llu cycles; %g\n", static_cast<unsigned long long>( end - start ), result.back() );
		return 0;
	}