Skip to content

Instantly share code, notes, and snippets.

@OlivierLi
Last active August 29, 2015 14:25
Show Gist options
  • Save OlivierLi/dbae6038de31504a16e2 to your computer and use it in GitHub Desktop.
Save OlivierLi/dbae6038de31504a16e2 to your computer and use it in GitHub Desktop.
#include "stdafx.h"

#include <immintrin.h>

#include <chrono>
#include <cstddef>
#include <cstdint>
#include <exception>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <utility>
// Reports whether the numeric value of `pointer` is an exact multiple of
// `byte_count`. The pointer is only inspected as an integer, never dereferenced.
// `byte_count` must be non-zero because it is used as the divisor of a modulo.
static inline bool isAligned( const void *pointer, size_t byte_count )
{
    const uintptr_t address = reinterpret_cast<uintptr_t>( pointer );
    const auto bytes_past_boundary = address % byte_count;
    return bytes_past_boundary == 0;
}
void test_sse( int *output, const int *input )
{
bool alligned = isAligned( input, 32 );
if( !alligned )
{
throw std::exception( "Memory is not alligned!!" ); //Get alligned memory then remove this, then noThrow()?
}
//SSE -> AVX transition
//_mm256_zeroall();
//Load the integers into an xmm register
__m256i loaded_ints = _mm256_loadu_si256( ( const __m256i* )input );
//Convert the integers to float values
__m256 values_reg = _mm256_cvtepi32_ps( loaded_ints );
//Load the factors
__m256 factors_reg = _mm256_set1_ps( 255.f / 1024.f );
//Apply the multiplication
__m256 result_reg = _mm256_mul_ps( values_reg, factors_reg );
//Convert results back to int
__m256i final_results = _mm256_cvtps_epi32( result_reg );
//Add 5 to the result
__m256i fives = _mm256_set1_epi32( 5 );
final_results = _mm256_add_epi32( final_results, fives );
//Store results
_mm256_store_si256( ( __m256i* )output, final_results );
//AVX -> SSE transition
//_mm256_zeroall();
}
// Scalar reference kernel: for each of the 8 elements,
//     output[i] = static_cast<int>( input[i] * ( 255 / 1024.f ) ) + 5
// The float -> int conversion truncates toward zero.
//
// output: receives 8 ints.
// input:  supplies 8 ints.
void test( int *output, const int *input )
{
    constexpr int kElementCount = 8;
    constexpr float kScale = 255 / 1024.f;

    // Phase 1: read and scale every input into a local buffer before any
    // output element is written.
    float scaled[kElementCount] = {};
    for( int index = 0; index != kElementCount; ++index )
    {
        scaled[index] = input[index] * kScale;
    }

    // Phase 2: truncate to int, add the constant offset and write out.
    for( int index = 0; index != kElementCount; ++index )
    {
        output[index] = static_cast<int>( scaled[index] ) + 5;
    }
}
// Integer-only variant: each of the 8 inputs is multiplied by 249, shifted
// right by 10 bits and narrowed to unsigned char.
// Approximation, for testing only.
// NOTE(review): 249/1024 is not the 255/1024 factor used by the other kernels,
// and no +5 offset is applied here - confirm this is intentional.
//
// output: receives 8 bytes.
// input:  supplies 8 ints.
void test2( unsigned char *output, const int *input )
{
    const int *const input_end = input + 8;
    while( input != input_end )
    {
        const int scaled = ( *input * 249 ) >> 10;
        *output = static_cast<unsigned char>( scaled );
        ++input;
        ++output;
    }
}
// Allocates raw byte storage large enough for `count` objects of type T plus
// alignment padding, and locates the first address inside it that is a
// multiple of `allignment`.
//
// No T objects are constructed; the storage comes from new uint8_t[].
//
// count:      number of T elements that must fit from the aligned address on;
//             must not be negative.
// allignment: required alignment in bytes; must be greater than zero (it does
//             not have to be a power of two).
// Returns a pair whose `first` is the start of the allocation (the pointer the
// caller must eventually pass to delete[]) and whose `second` is the aligned
// address inside that allocation, typed as T*.
// Throws std::invalid_argument when `count` is negative or `allignment` is not
// positive.
template <typename T>
std::pair<uint8_t*, T*> get_aligned_offset_of_type( int count , int allignment )
{
    if( count < 0 || allignment <= 0 )
    {
        throw std::invalid_argument( "get_aligned_offset_of_type: count must be >= 0 and allignment must be > 0" );
    }

    const size_t alignment_bytes = static_cast<size_t>( allignment );

    // In the worst case the allocation starts one byte past an aligned address,
    // so up to (alignment - 1) bytes are skipped. Sizing the padding from the
    // requested alignment (rather than a fixed 31) keeps the aligned region
    // inside the allocation for alignments larger than 32 as well.
    const size_t padding = alignment_bytes - 1;
    uint8_t *memory_block = new uint8_t[static_cast<size_t>( count ) * sizeof( T ) + padding];

    // Distance from the block start to the next multiple of the alignment.
    const size_t misalignment = static_cast<size_t>( reinterpret_cast<uintptr_t>( memory_block ) % alignment_bytes );
    const size_t offset = ( misalignment == 0 ) ? 0 : alignment_bytes - misalignment;

    return std::make_pair( memory_block, reinterpret_cast<T*>( memory_block + offset ) );
}
// Writes the eight sample inputs that every benchmark iteration starts from.
static void fill_sample_values( int *values )
{
    static const int samples[8] = { 1, 254, 777, 1024, 1, 254, 777, 1024 };
    for( int i = 0; i < 8; ++i )
    {
        values[i] = samples[i];
    }
}

// Calls kernel( results, values ) iterationCount times, refreshing the inputs
// before each call, and returns the elapsed wall-clock time in milliseconds.
template <typename Kernel>
static std::chrono::milliseconds::rep time_kernel( int iterationCount, int *results, int *values, Kernel kernel )
{
    const auto start = std::chrono::high_resolution_clock::now();
    for( int i = 0; i < iterationCount; ++i )
    {
        fill_sample_values( values );
        kernel( results, values );
    }
    const auto end = std::chrono::high_resolution_clock::now();
    return std::chrono::duration_cast<std::chrono::milliseconds>( end - start ).count();
}

// Benchmark driver: times the scalar kernel test() and then the AVX kernel
// test_sse() over the same number of iterations and prints each duration in
// milliseconds on its own line.
//
// Returns 0 on success, 1 if a std::exception escaped the benchmark (for
// example the alignment error thrown by test_sse).
int main( int argc, const char * argv[] )
{
    // Command-line arguments are not used.
    (void)argc;
    (void)argv;

    try
    {
        const int iterationCount = 100000000;

        // Each allocation is handed to a unique_ptr immediately so the raw
        // blocks are released on every exit path, including when a kernel or
        // the second allocation throws.
        std::pair<uint8_t*, int*> alligned_block_values = get_aligned_offset_of_type<int>( 8, 32 );
        std::unique_ptr<uint8_t[]> values_owner( alligned_block_values.first );
        std::pair<uint8_t*, int*> alligned_block_results = get_aligned_offset_of_type<int>( 8, 32 );
        std::unique_ptr<uint8_t[]> results_owner( alligned_block_results.first );

        int *values = alligned_block_values.second;
        int *results = alligned_block_results.second;

        // NOTE(review): `results` is never read after the loops, so an
        // optimizing compiler may remove part of the measured work - confirm
        // the build settings or consume the results before trusting the numbers.

        // Lambdas (rather than function pointers) give each kernel its own
        // instantiation of time_kernel so the call can be inlined.
        // std::endl is deliberate: the first timing is flushed before the
        // second, long-running loop starts.
        std::cout << time_kernel( iterationCount, results, values,
                                  []( int *out, const int *in ) { test( out, in ); } )
                  << std::endl;
        std::cout << time_kernel( iterationCount, results, values,
                                  []( int *out, const int *in ) { test_sse( out, in ); } )
                  << std::endl;

        // Debug aid: print the last computed results.
        //for( int i = 0; i<8; ++i )
        //{
        //    std::cout << results[i] << std::endl;
        //}
    }
    catch( const std::exception &error )
    {
        std::cerr << "Benchmark aborted: " << error.what() << std::endl;
        return 1;
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment