Last active
August 29, 2015 14:25
-
-
Save OlivierLi/dbae6038de31504a16e2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "stdafx.h"

#include <immintrin.h>

#include <chrono>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <utility>
/// Reports whether `pointer` lies on a `byte_count`-byte boundary.
///
/// @param pointer     Address to inspect; it is never dereferenced.
/// @param byte_count  Required alignment in bytes.
/// @return true when the numeric address is a multiple of `byte_count`;
///         false otherwise, including when `byte_count` is zero (no address
///         can satisfy a zero-byte alignment, and the modulo would be
///         undefined).
static inline bool isAligned( const void *pointer, size_t byte_count )
{
    if( byte_count == 0 )
    {
        return false;
    }
    return reinterpret_cast<uintptr_t>( pointer ) % byte_count == 0;
}
| void test_sse( int *output, const int *input ) | |
| { | |
| bool alligned = isAligned( input, 32 ); | |
| if( !alligned ) | |
| { | |
| throw std::exception( "Memory is not alligned!!" ); //Get alligned memory then remove this, then noThrow()? | |
| } | |
| //SSE -> AVX transition | |
| //_mm256_zeroall(); | |
| //Load the integers into an xmm register | |
| __m256i loaded_ints = _mm256_loadu_si256( ( const __m256i* )input ); | |
| //Convert the integers to float values | |
| __m256 values_reg = _mm256_cvtepi32_ps( loaded_ints ); | |
| //Load the factors | |
| __m256 factors_reg = _mm256_set1_ps( 255.f / 1024.f ); | |
| //Apply the multiplication | |
| __m256 result_reg = _mm256_mul_ps( values_reg, factors_reg ); | |
| //Convert results back to int | |
| __m256i final_results = _mm256_cvtps_epi32( result_reg ); | |
| //Add 5 to the result | |
| __m256i fives = _mm256_set1_epi32( 5 ); | |
| final_results = _mm256_add_epi32( final_results, fives ); | |
| //Store results | |
| _mm256_store_si256( ( __m256i* )output, final_results ); | |
| //AVX -> SSE transition | |
| //_mm256_zeroall(); | |
| } | |
/// Scalar reference kernel: for each of eight lanes, multiplies the input by
/// 255/1024 in single precision, truncates the product to int and adds 5.
///
/// @param output  Receives eight ints.
/// @param input   Supplies eight ints.
void test( int *output, const int *input )
{
    const int lane_count = 8;
    const float scale = 255.0f / 1024.0f;

    // Every product is staged before anything is written to `output`, so all
    // inputs are read before the first output lane changes.
    float scaled[lane_count] = { 0.0f };
    for( int lane = 0; lane != lane_count; ++lane )
    {
        scaled[lane] = static_cast<float>( input[lane] ) * scale;
    }

    for( int lane = 0; lane != lane_count; ++lane )
    {
        output[lane] = static_cast<int>( scaled[lane] ) + 5;
    }
}
/// Integer-only variant: for each of eight lanes, multiplies the input by 249,
/// shifts right by 10 bits and narrows the result to unsigned char.
///
/// @param output  Receives eight bytes.
/// @param input   Supplies eight ints.
void test2( unsigned char *output, const int *input )
{
    const int *const input_end = input + 8;
    while( input != input_end )
    {
        // Approximation, for testing purposes!
        const int scaled = ( *input * 249 ) >> 10;
        *output = static_cast<unsigned char>( scaled );
        ++input;
        ++output;
    }
}
/// Allocates raw storage for `count` objects of type T and locates the first
/// address inside it that is a multiple of `allignment`.
///
/// The block is over-allocated by `allignment - 1` bytes so that an aligned
/// start followed by `count * sizeof(T)` bytes always fits, whatever address
/// `new[]` returns.
///
/// @param count       Number of T elements the aligned region must hold (>= 0).
/// @param allignment  Required alignment in bytes (> 0).
/// @return `first`  - start of the allocation; the caller owns it and must
///                    release it with `delete[]`.
///         `second` - aligned pointer inside that allocation; never pass this
///                    one to `delete[]`.
/// @throws std::invalid_argument when `count` is negative or `allignment` is
///         not positive.
template <typename T>
std::pair<uint8_t*, T*> get_aligned_offset_of_type( int count , int allignment )
{
    if( count < 0 || allignment <= 0 )
    {
        throw std::invalid_argument( "count must be non-negative and allignment must be positive" );
    }

    const size_t payload_bytes = static_cast<size_t>( count ) * sizeof( T );
    const size_t boundary = static_cast<size_t>( allignment );

    // Worst case the allocation starts one byte past a boundary, so up to
    // boundary - 1 bytes are skipped before the aligned region begins.
    uint8_t *memory_block = new uint8_t[payload_bytes + boundary - 1];

    const size_t misalignment = reinterpret_cast<uintptr_t>( memory_block ) % boundary;
    const size_t skip = ( misalignment == 0 ) ? 0 : boundary - misalignment;

    return std::make_pair( memory_block, reinterpret_cast<T*>( memory_block + skip ) );
}
| int main( int argc, const char * argv[] ) | |
| { | |
| std::pair<uint8_t*, int*> alligned_block_values = get_aligned_offset_of_type<int>( 8, 32 ); | |
| std::pair<uint8_t*, int*> alligned_block_results = get_aligned_offset_of_type<int>( 8, 32 ); | |
| int *values = alligned_block_values.second; | |
| int *results = alligned_block_results.second; | |
| auto start = std::chrono::high_resolution_clock::now(); | |
| int iterationCount = 100000000; | |
| for( int i = 0; i < iterationCount; ++i ) | |
| { | |
| values[0] = 1; | |
| values[1] = 254; | |
| values[2] = 777; | |
| values[3] = 1024; | |
| values[4] = 1; | |
| values[5] = 254; | |
| values[6] = 777; | |
| values[7] = 1024; | |
| test( results, values ); | |
| //test2( output, values ); | |
| //test_sse( output, values ); | |
| } | |
| auto end = std::chrono::high_resolution_clock::now(); | |
| auto duration = end - start; | |
| std::cout << std::chrono::duration_cast<std::chrono::milliseconds>( duration ).count() << std::endl; | |
| start = std::chrono::high_resolution_clock::now(); | |
| for( int i = 0; i < iterationCount; ++i ) | |
| { | |
| values[0] = 1; | |
| values[1] = 254; | |
| values[2] = 777; | |
| values[3] = 1024; | |
| values[4] = 1; | |
| values[5] = 254; | |
| values[6] = 777; | |
| values[7] = 1024; | |
| //test( output, values ); | |
| //test2( output, values ); | |
| test_sse( results, values ); | |
| } | |
| end = std::chrono::high_resolution_clock::now(); | |
| duration = end - start; | |
| std::cout << std::chrono::duration_cast<std::chrono::milliseconds>( duration ).count() << std::endl; | |
| //for( int i = 0; i<8; ++i ) | |
| //{ | |
| // std::cout << results[i] << std::endl; | |
| //} | |
| delete[] alligned_block_results.first; | |
| delete[] alligned_block_values.first; | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment