OlivierLi · August 29, 2015 14:25
diff --git a/avx2.cpp b/avx2.cpp
 #include "stdafx.h"

 #include <chrono>
 #include <cstddef>
 #include <cstdint>
 #include <immintrin.h>
 #include <iostream>
 #include <memory>
 #include <stdexcept>
 #include <utility>

 // Reports whether the address stored in `pointer` is an exact multiple of
 // `byte_count`. `byte_count` is used as a divisor and must therefore be non-zero.
 static inline bool isAligned( const void *pointer, size_t byte_count )
 {
 	const uintptr_t address = reinterpret_cast<uintptr_t>( pointer );
 	const auto remainder = address % byte_count;
 	return remainder == 0;
 }

 void test_sse( int *output, const int *input )
 {

 	bool alligned = isAligned( input, 32 );
 	if( !alligned )
 	{
 		throw std::exception( "Memory is not alligned!!" ); //Get alligned memory then remove this, then noThrow()?
 	}

 	//SSE -> AVX transition
 	//_mm256_zeroall();

 	//Load the integers into an xmm register
 	__m256i loaded_ints = _mm256_loadu_si256( ( const __m256i* )input );

 	//Convert the integers to float values
 	__m256 values_reg = _mm256_cvtepi32_ps( loaded_ints );

 	//Load the factors
 	__m256 factors_reg = _mm256_set1_ps( 255.f / 1024.f );

 	//Apply the multiplication
 	__m256 result_reg = _mm256_mul_ps( values_reg, factors_reg );

 	//Convert results back to int
 	__m256i final_results = _mm256_cvtps_epi32( result_reg );

 	//Add 5 to the result
 	__m256i fives = _mm256_set1_epi32( 5 );
 	final_results = _mm256_add_epi32( final_results, fives );

 	//Store results
 	_mm256_store_si256( ( __m256i* )output, final_results );

 	//AVX -> SSE transition
 	//_mm256_zeroall();
 }

 // Scalar reference kernel: for each of the eight inputs, multiplies by
 // 255/1024 in single precision, truncates the product to int and adds 5.
 // All eight products are computed before `output` is first written.
 void test( int *output, const int *input )
 {
 	static const float factor = 255 / 1024.f;

 	float scaled[8] = { 0.0f };
 	int lane = 0;
 	while( lane < 8 )
 	{
 		scaled[lane] = input[lane] * factor;
 		++lane;
 	}

 	for( lane = 0; lane != 8; ++lane )
 	{
 		output[lane] = static_cast<int>( scaled[lane] ) + 5;
 	}
 }

 // Integer-only variant: for each of the eight inputs, stores
 // ( input[i] * 249 ) >> 10 narrowed to unsigned char.
 void test2( unsigned char *output, const int *input )
 {
 	const int *src = input;
 	unsigned char *dst = output;
 	for( unsigned char *const stop = output + 8; dst != stop; ++dst, ++src )
 	{
 		const int scaled = ( *src * 249 ) >> 10; // Approximation, for testing!
 		*dst = static_cast<unsigned char>( scaled );
 	}
 }

 // Allocates raw byte storage large enough for `count` objects of type T and
 // locates the first address inside it that is a multiple of `allignment`.
 //
 // count      - number of T elements the aligned region must hold (>= 0).
 // allignment - required alignment in bytes (> 0; need not be a power of two).
 //
 // Returns { block, aligned }: `block` is the pointer obtained from new[] and
 // is the one that must later be passed to delete[]; `aligned` points into
 // that same block. No T objects are constructed.
 //
 // Throws std::invalid_argument for a negative count or a non-positive
 // alignment.
 template <typename T>
 std::pair<uint8_t*, T*> get_aligned_offset_of_type( int count , int allignment )
 {
 	if( count < 0 || allignment <= 0 )
 	{
 		throw std::invalid_argument( "get_aligned_offset_of_type: count must be >= 0 and allignment must be > 0" );
 	}

 	const size_t alignment_bytes = static_cast<size_t>( allignment );

 	// In the worst case the block starts one byte past an aligned address,
 	// so (alignment - 1) bytes of slack always leave room for `count` elements.
 	const size_t slack = alignment_bytes - 1;
 	uint8_t *memory_block = new uint8_t[static_cast<size_t>( count ) * sizeof( T ) + slack];

 	// Distance from the start of the block to the next aligned address.
 	const uintptr_t misalignment = reinterpret_cast<uintptr_t>( memory_block ) % alignment_bytes;
 	const size_t offset = ( misalignment == 0 ) ? 0 : alignment_bytes - static_cast<size_t>( misalignment );

 	return std::make_pair( memory_block, reinterpret_cast<T*>( memory_block + offset ) );
 }

 // Calls `kernel( results, values )` `iterations` times, re-filling the eight
 // input values before every call, and returns the elapsed time in milliseconds.
 template <typename Kernel>
 static std::chrono::milliseconds::rep time_kernel_ms( Kernel kernel, int *results, int *values, int iterations )
 {
 	const auto start = std::chrono::high_resolution_clock::now();

 	for( int i = 0; i < iterations; ++i )
 	{
 		values[0] = 1;
 		values[1] = 254;
 		values[2] = 777;
 		values[3] = 1024;
 		values[4] = 1;
 		values[5] = 254;
 		values[6] = 777;
 		values[7] = 1024;

 		kernel( results, values );
 	}

 	const auto end = std::chrono::high_resolution_clock::now();

 	// Read the outputs back through a volatile so that the values written by
 	// the kernel are actually consumed by the program.
 	volatile int sink = 0;
 	for( int i = 0; i < 8; ++i )
 	{
 		sink = sink + results[i];
 	}

 	return std::chrono::duration_cast<std::chrono::milliseconds>( end - start ).count();
 }

 // Times the scalar test() and then the AVX2 test_sse() on the same eight
 // inputs and prints each elapsed time in milliseconds on its own line.
 // Returns 0 on success, or 1 after printing the message to stderr if a
 // kernel throws.
 int main( int argc, const char * argv[] )
 {
 	static_cast<void>( argc );
 	static_cast<void>( argv );

 	// Each raw block is handed to a unique_ptr immediately so it is released
 	// on every exit path, including exceptions.
 	std::pair<uint8_t*, int*> alligned_block_values = get_aligned_offset_of_type<int>( 8, 32 );
 	const std::unique_ptr<uint8_t[]> values_owner( alligned_block_values.first );

 	std::pair<uint8_t*, int*> alligned_block_results = get_aligned_offset_of_type<int>( 8, 32 );
 	const std::unique_ptr<uint8_t[]> results_owner( alligned_block_results.first );

 	int *values  = alligned_block_values.second;
 	int *results = alligned_block_results.second;

 	const int iterationCount = 100000000;

 	try
 	{
 		// Lambdas (rather than function pointers) keep the calls direct.
 		std::cout << time_kernel_ms( []( int *out, const int *in ) { test( out, in ); }, results, values, iterationCount ) << std::endl;
 		std::cout << time_kernel_ms( []( int *out, const int *in ) { test_sse( out, in ); }, results, values, iterationCount ) << std::endl;
 	}
 	catch( const std::exception &error )
 	{
 		std::cerr << error.what() << std::endl;
 		return 1;
 	}

 	return 0;
 }
	#include "stdafx.h"

	#include <iostream>
	#include <chrono>
	#include <immintrin.h>
	#include <memory>

	// Reports whether the address stored in `pointer` is an exact multiple of
	// `byte_count`. `byte_count` is used as a divisor and must therefore be non-zero.
	static inline bool isAligned( const void *pointer, size_t byte_count )
	{
		const uintptr_t address = reinterpret_cast<uintptr_t>( pointer );
		const auto remainder = address % byte_count;
		return remainder == 0;
	}

	void test_sse( int output, const int input )
	{

	bool alligned = isAligned( input, 32 );
	if( !alligned )
	{
	throw std::exception( "Memory is not alligned!!" ); //Get alligned memory then remove this, then noThrow()?
	}

	//SSE -> AVX transition
	//_mm256_zeroall();

	//Load the integers into an xmm register
	__m256i loaded_ints = _mm256_loadu_si256( ( const __m256i* )input );

	//Convert the integers to float values
	__m256 values_reg = _mm256_cvtepi32_ps( loaded_ints );

	//Load the factors
	__m256 factors_reg = _mm256_set1_ps( 255.f / 1024.f );

	//Apply the multiplication
	__m256 result_reg = _mm256_mul_ps( values_reg, factors_reg );

	//Convert results back to int
	__m256i final_results = _mm256_cvtps_epi32( result_reg );

	//Add 5 to the result
	__m256i fives = _mm256_set1_epi32( 5 );
	final_results = _mm256_add_epi32( final_results, fives );

	//Store results
	_mm256_store_si256( ( __m256i* )output, final_results );

	//AVX -> SSE transition
	//_mm256_zeroall();
	}

	// Scalar reference kernel: for each of the eight inputs, multiplies by
	// 255/1024 in single precision, truncates the product to int and adds 5.
	//
	// output - destination for eight ints.
	// input  - source of eight ints.
	//
	// All eight products are computed before `output` is first written.
	void test( int *output, const int *input )
	{
		float results[8] = { 0.0f };

		static const float factor = 255 / 1024.f;
		for( int i = 0; i < 8; ++i )
		{
			results[i] = input[i] * factor;
		}

		for( int i = 0; i < 8; ++i )
		{
			output[i] = static_cast<int>( results[i] ) + 5;
		}
	}

	// Integer-only variant: for each of the eight inputs, stores
	// ( input[i] * 249 ) >> 10 narrowed to unsigned char.
	//
	// output - destination for eight bytes.
	// input  - source of eight ints.
	void test2( unsigned char *output, const int *input )
	{
		for( int i = 0; i < 8; ++i )
		{
			output[i] = static_cast<unsigned char>( ( input[i] * 249 ) >> 10 ); // Approximation, for testing!
		}
	}

	// Allocates raw byte storage large enough for `count` objects of type T and
	// locates the first address inside it that is a multiple of `allignment`.
	//
	// count      - number of T elements the aligned region must hold (>= 0).
	// allignment - required alignment in bytes (> 0; need not be a power of two).
	//
	// Returns { block, aligned }: `block` is the pointer obtained from new[] and
	// is the one that must later be passed to delete[]; `aligned` points into
	// that same block. No T objects are constructed.
	//
	// Throws std::invalid_argument for a negative count or a non-positive
	// alignment.
	template <typename T>
	std::pair<uint8_t*, T*> get_aligned_offset_of_type( int count , int allignment )
	{
		if( count < 0 || allignment <= 0 )
		{
			throw std::invalid_argument( "get_aligned_offset_of_type: count must be >= 0 and allignment must be > 0" );
		}

		const size_t alignment_bytes = static_cast<size_t>( allignment );

		// In the worst case the block starts one byte past an aligned address,
		// so (alignment - 1) bytes of slack always leave room for `count` elements.
		const size_t slack = alignment_bytes - 1;
		uint8_t *memory_block = new uint8_t[static_cast<size_t>( count ) * sizeof( T ) + slack];

		// Distance from the start of the block to the next aligned address.
		const uintptr_t misalignment = reinterpret_cast<uintptr_t>( memory_block ) % alignment_bytes;
		const size_t offset = ( misalignment == 0 ) ? 0 : alignment_bytes - static_cast<size_t>( misalignment );

		return std::make_pair( memory_block, reinterpret_cast<T*>( memory_block + offset ) );
	}

	// Calls `kernel( results, values )` `iterations` times, re-filling the eight
	// input values before every call, and returns the elapsed time in milliseconds.
	template <typename Kernel>
	static std::chrono::milliseconds::rep time_kernel_ms( Kernel kernel, int *results, int *values, int iterations )
	{
		const auto start = std::chrono::high_resolution_clock::now();

		for( int i = 0; i < iterations; ++i )
		{
			values[0] = 1;
			values[1] = 254;
			values[2] = 777;
			values[3] = 1024;
			values[4] = 1;
			values[5] = 254;
			values[6] = 777;
			values[7] = 1024;

			kernel( results, values );
		}

		const auto end = std::chrono::high_resolution_clock::now();

		// Read the outputs back through a volatile so that the values written by
		// the kernel are actually consumed by the program.
		volatile int sink = 0;
		for( int i = 0; i < 8; ++i )
		{
			sink = sink + results[i];
		}

		return std::chrono::duration_cast<std::chrono::milliseconds>( end - start ).count();
	}

	// Times the scalar test() and then the AVX2 test_sse() on the same eight
	// inputs and prints each elapsed time in milliseconds on its own line.
	// Returns 0 on success, or 1 after printing the message to stderr if a
	// kernel throws.
	int main( int argc, const char * argv[] )
	{
		static_cast<void>( argc );
		static_cast<void>( argv );

		// Each raw block is handed to a unique_ptr immediately so it is released
		// on every exit path, including exceptions.
		std::pair<uint8_t*, int*> alligned_block_values = get_aligned_offset_of_type<int>( 8, 32 );
		const std::unique_ptr<uint8_t[]> values_owner( alligned_block_values.first );

		std::pair<uint8_t*, int*> alligned_block_results = get_aligned_offset_of_type<int>( 8, 32 );
		const std::unique_ptr<uint8_t[]> results_owner( alligned_block_results.first );

		int *values = alligned_block_values.second;
		int *results = alligned_block_results.second;

		const int iterationCount = 100000000;

		try
		{
			// Lambdas (rather than function pointers) keep the calls direct.
			std::cout << time_kernel_ms( []( int *out, const int *in ) { test( out, in ); }, results, values, iterationCount ) << std::endl;
			std::cout << time_kernel_ms( []( int *out, const int *in ) { test_sse( out, in ); }, results, values, iterationCount ) << std::endl;
		}
		catch( const std::exception &error )
		{
			std::cerr << error.what() << std::endl;
			return 1;
		}

		return 0;
	}