Created
November 22, 2012 14:05
-
-
Save questor/4131337 to your computer and use it in GitHub Desktop.
More SSE tests with expression templates and a latency-hiding strategy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
I came across an article on optimising SIMD code further; to test it I modified your testbed and wanted to share the results. | |
I tested your template benchmark code using two SSE registers in parallel to break up this pattern: | |
load v0 | |
process v0 | |
store v0 | |
load v1 | |
process v1 | |
store v1 | |
into | |
load v0 | |
load v1 | |
process v0 | |
process v1 | |
store v0 | |
store v1 | |
Below is the source to make this happen, and here are some numbers from my computer: | |
dot_product: | |
Total times | |
------------------------ | |
Float array time: 0.577373468875 secs | |
SSE array time: 0.575501309400 secs | |
AVX array time: 0.585318814476 secs | |
NEON array time: 0.000000000000 secs | |
Naive valarray time: 2.399326313377 secs | |
Float valarray time: 1.146172549986 secs | |
SSE valarray time: 0.614939387875 secs | |
AVX valarray time: 0.620804318710 secs | |
NEON valarray time: 0.000000000000 secs | |
SSE_2 valarray time: 0.599064946572 secs | |
vector_length: | |
Total times | |
------------------------ | |
Float array time: 0.510715525838 secs | |
SSE array time: 0.371821220316 secs | |
AVX array time: 0.368442524105 secs | |
NEON array time: 0.000000000000 secs | |
Naive valarray time: 3.048511081576 secs | |
Float valarray time: 1.370276624605 secs | |
SSE valarray time: 0.413611685559 secs | |
AVX valarray time: 0.404943647059 secs | |
NEON valarray time: 0.000000000000 secs | |
SSE_2 valarray time: 0.396866164117 secs | |
I could not get it to work with 4 __m128 values because of a strange MSVC compiler error :/ (Visual Studio 2012 Express), but I have looked at the assembly and it's really working :) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include "sse_array.hpp" | |
namespace expression_template_latencyhider { | |
/// Pair of values of type T that are carried and processed together.
/// The two halves are exposed directly as the public members v0 and v1.
template<typename T>
class MultipleValues_2 {
public:
    T v0;  ///< first value of the pair
    T v1;  ///< second value of the pair

    /// Copies `first` into v0 and `second` into v1.
    /// The arguments are taken by const reference so each value is copied
    /// only once, and the parameters no longer shadow the members.
    /// NOTE(review): 32-bit MSVC restricts over-aligned by-value parameters
    /// (error C2719); presumably relevant if this is widened to more
    /// __m128 values -- confirm.
    MultipleValues_2(const T& first, const T& second)
        : v0(first)
        , v1(second)
    {}
};
/// Adds two value pairs half by half using expression_template_simd::add.
/// Note that for each half the rhs operand is passed first and the lhs
/// operand second.
template<typename T>
INLINE MultipleValues_2<T> add(const MultipleValues_2<T> &lhs, const MultipleValues_2<T> &rhs) {
    typedef MultipleValues_2<T> pair_type;
    return pair_type(
        expression_template_simd::add(rhs.v0, lhs.v0),
        expression_template_simd::add(rhs.v1, lhs.v1)
    );
}
/// Multiplies two value pairs half by half using
/// expression_template_simd::mul. For each half the rhs operand is passed
/// first and the lhs operand second.
template<typename T>
INLINE MultipleValues_2<T> mul(const MultipleValues_2<T> &lhs, const MultipleValues_2<T> &rhs) {
    typedef MultipleValues_2<T> pair_type;
    return pair_type(
        expression_template_simd::mul(rhs.v0, lhs.v0),
        expression_template_simd::mul(rhs.v1, lhs.v1)
    );
}
/// Applies expression_template_simd::madd to the matching halves of three
/// value pairs: (a.v0, b.v0, c.v0) and (a.v1, b.v1, c.v1).
template<typename T>
INLINE MultipleValues_2<T> madd(const MultipleValues_2<T> &a, const MultipleValues_2<T> &b, const MultipleValues_2<T> &c) {
    typedef MultipleValues_2<T> pair_type;
    return pair_type(
        expression_template_simd::madd(a.v0, b.v0, c.v0),
        expression_template_simd::madd(a.v1, b.v1, c.v1)
    );
}
/// Applies expression_template_simd::square_root to each half of the pair
/// and returns the two results as a new pair.
template<typename T>
INLINE MultipleValues_2<T> square_root(const MultipleValues_2<T> &v) {
    typedef MultipleValues_2<T> pair_type;
    return pair_type(
        expression_template_simd::square_root(v.v0),
        expression_template_simd::square_root(v.v1)
    );
}
/// Reads a single float out of a pair of SSE registers.
///
/// @param value  pair of __m128 registers holding eight floats in total
/// @param i      lane index; 0..3 selects a lane of value.v0, 4..7 a lane
///               of value.v1 (lane i-4)
/// @return the float stored in the selected lane
///
/// The register is spilled with _mm_storeu_ps instead of reading the
/// `.m128_f32` member, because that member only exists in MSVC's
/// definition of __m128.
INLINE float get(const MultipleValues_2<__m128>& value, std::size_t i) {
    // Only eight lanes exist; larger indices previously read out of bounds.
    assert(i < 8);
    const bool upper_half = (i >= 4);
    float lanes[4];
    _mm_storeu_ps(lanes, upper_half ? value.v1 : value.v0);
    return lanes[upper_half ? i - 4 : i];
}
using namespace expression_template_simd; | |
template <typename Real> class valarray_rep_sse_times2; | |
template <> class valarray_rep_sse_times2<float> { | |
public: | |
typedef float value_type; | |
typedef MultipleValues_2<__m128> element_type; | |
INLINE valarray_rep_sse_times2(std::size_t size) | |
: _size(size) | |
, _elements((size / element_size()) + ((size % element_size() == 0) ? 0 : 1)) | |
{ | |
_values = (element_type*)_mm_malloc(_elements * sizeof(element_type), alignment()); | |
} | |
INLINE valarray_rep_sse_times2(std::size_t size, value_type value) | |
: _size(size) | |
, _elements((size / element_size()) + ((size % element_size() == 0) ? 0 : 1)) | |
{ | |
_values = (element_type*)_mm_malloc(_elements * sizeof(element_type), alignment()); | |
const __m128 value_sse = _mm_set1_ps(value); | |
for (std::size_t i = 0; i < _elements; ++i) | |
_values[i] = MultipleValues_2<__m128>(value_sse, value_sse); | |
} | |
INLINE ~valarray_rep_sse_times2() { | |
_mm_free(_values); | |
} | |
INLINE valarray_rep_sse_times2(const valarray_rep_sse_times2& copy) | |
: _size(copy._size) | |
, _elements(copy._elements) | |
{ | |
_values = (element_type*)_mm_malloc(_elements, alignment()); | |
swap(copy); | |
} | |
INLINE valarray_rep_sse_times2& operator= (const valarray_rep_sse_times2& copy) { | |
swap(copy); | |
return *this; | |
} | |
INLINE element_type operator() (std::size_t i) const { | |
assert(i < _elements); | |
return _values[i]; | |
} | |
INLINE element_type& operator() (std::size_t i) { | |
assert(i < _elements); | |
return _values[i]; | |
} | |
INLINE float operator[] (std::size_t i) const { | |
assert(i < _size); | |
const std::size_t element = i / element_size(); | |
const std::size_t index = i % element_size(); | |
return get(_values[element], index); | |
} | |
INLINE std::size_t size() const { | |
return _size; | |
} | |
INLINE std::size_t elements() const { | |
return _elements; | |
} | |
INLINE static std::size_t alignment() { | |
return sizeof(element_type); | |
} | |
INLINE static std::size_t element_size() { | |
return sizeof(element_type) / sizeof(value_type); | |
} | |
INLINE void swap(const valarray_rep_sse_times2& copy) { | |
assert(_size == copy._size); | |
for (std::size_t i = 0; i < _elements; ++i) | |
_values[i] = copy._values[i]; | |
} | |
private: | |
std::size_t _size; | |
std::size_t _elements; | |
element_type* _values; | |
} ; // end class valarray_rep_sse_times2<float> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment