Skip to content

Instantly share code, notes, and snippets.

@questor
Created November 22, 2012 14:05
Show Gist options
  • Save questor/4131337 to your computer and use it in GitHub Desktop.
Save questor/4131337 to your computer and use it in GitHub Desktop.
More SSE tests with expression templates and a latency-hiding strategy
I came across an article on optimising SIMD code further. To test the idea, I modified your testbed, and I wanted to share the results.
I tested your template benchmark code using two SSE registers in parallel, to break up this pattern:
load v0
process v0
store v0
load v1
process v1
store v1
into
load v0
load v1
process v0
process v1
store v0
store v1
Below is the source that makes this happen, and here are some numbers from my computer:
dot_product:
Total times
------------------------
Float array time: 0.577373468875 secs
SSE array time: 0.575501309400 secs
AVX array time: 0.585318814476 secs
NEON array time: 0.000000000000 secs
Naive valarray time: 2.399326313377 secs
Float valarray time: 1.146172549986 secs
SSE valarray time: 0.614939387875 secs
AVX valarray time: 0.620804318710 secs
NEON valarray time: 0.000000000000 secs
SSE_2 valarray time: 0.599064946572 secs
vector_length:
Total times
------------------------
Float array time: 0.510715525838 secs
SSE array time: 0.371821220316 secs
AVX array time: 0.368442524105 secs
NEON array time: 0.000000000000 secs
Naive valarray time: 3.048511081576 secs
Float valarray time: 1.370276624605 secs
SSE valarray time: 0.413611685559 secs
AVX valarray time: 0.404943647059 secs
NEON valarray time: 0.000000000000 secs
SSE_2 valarray time: 0.396866164117 secs
I could not get it to work with four __m128 values because of a strange MSVC compiler error :/ (Visual Studio 2012 Express), but I have looked at the assembly and it's really working :)
#pragma once
#include "sse_array.hpp"
namespace expression_template_latencyhider {
// Bundles two values of the same type so that one expression can operate on
// both of them side by side (see the load/load/process/process/store/store
// pattern this file is built around).
template<typename T> class MultipleValues_2 {
public:
    // Stores `first` in v0 and `second` in v1.
    MultipleValues_2(T first, T second)
        : v0(first)
        , v1(second)
    {
    }

    T v0;  // first value of the pair
    T v1;  // second value of the pair
};
// Pairwise addition: applies expression_template_simd::add to the v0 members
// and, independently, to the v1 members of the two operands.
template<typename T> INLINE MultipleValues_2<T> add(const MultipleValues_2<T> &lhs, const MultipleValues_2<T> &rhs) {
    // Operands are forwarded as (rhs, lhs), matching the established call order.
    const T sum0 = expression_template_simd::add(rhs.v0, lhs.v0);
    const T sum1 = expression_template_simd::add(rhs.v1, lhs.v1);
    return MultipleValues_2<T>(sum0, sum1);
}
// Pairwise multiplication: applies expression_template_simd::mul to the v0
// members and, independently, to the v1 members of the two operands.
template<typename T> INLINE MultipleValues_2<T> mul(const MultipleValues_2<T> &lhs, const MultipleValues_2<T> &rhs) {
    // Operands are forwarded as (rhs, lhs), matching the established call order.
    const T product0 = expression_template_simd::mul(rhs.v0, lhs.v0);
    const T product1 = expression_template_simd::mul(rhs.v1, lhs.v1);
    return MultipleValues_2<T>(product0, product1);
}
// Pairwise multiply-add: forwards (a, b, c) to expression_template_simd::madd
// once for the v0 members and once for the v1 members.
template<typename T> INLINE MultipleValues_2<T> madd(const MultipleValues_2<T> &a, const MultipleValues_2<T> &b, const MultipleValues_2<T> &c) {
    const T result0 = expression_template_simd::madd(a.v0, b.v0, c.v0);
    const T result1 = expression_template_simd::madd(a.v1, b.v1, c.v1);
    return MultipleValues_2<T>(result0, result1);
}
// Pairwise square root: applies expression_template_simd::square_root to v0
// and to v1 of the argument and returns the two results as a new pair.
template<typename T> INLINE MultipleValues_2<T> square_root(const MultipleValues_2<T> &v) {
    const T root0 = expression_template_simd::square_root(v.v0);
    const T root1 = expression_template_simd::square_root(v.v1);
    return MultipleValues_2<T>(root0, root1);
}
// Returns the i-th float of a register pair.
//
// Indices 0..3 select a lane of value.v0, indices 4..7 select a lane of
// value.v1. `i` must be less than 8 (checked with assert).
//
// The selected register is stored to a small local array with _mm_storeu_ps
// rather than read through the `m128_f32` member, because that member is
// specific to MSVC's definition of __m128 and does not compile elsewhere.
INLINE float get(const MultipleValues_2<__m128>& value, std::size_t i) {
    assert(i < 8);
    const bool upper = (i >= 4);
    float lanes[4];
    _mm_storeu_ps(lanes, upper ? value.v1 : value.v0);
    return lanes[upper ? (i - 4) : i];
}
// Makes the names from expression_template_simd visible without qualification
// for the remainder of this namespace.
using namespace expression_template_simd;
// Primary template: declared only, not defined here. The code that follows
// provides an explicit specialization for float.
template <typename Real> class valarray_rep_sse_times2;
// Aligned storage for a float valarray whose storage unit is a pair of
// __m128 registers (MultipleValues_2<__m128>).
//
// `size()` is the number of floats requested by the caller; `elements()` is
// the number of register pairs allocated, i.e. `size()` rounded up to a whole
// number of element_type objects. The buffer is obtained from _mm_malloc with
// alignment() bytes of alignment and released with _mm_free.
template <> class valarray_rep_sse_times2<float> {
public:
    typedef float value_type;
    typedef MultipleValues_2<__m128> element_type;

    // Allocates storage for `size` floats. The contents are left uninitialised.
    INLINE valarray_rep_sse_times2(std::size_t size)
        : _size(size)
        , _elements(elements_for(size))
        , _values(allocate(_elements))
    {
    }

    // Allocates storage for `size` floats and sets every lane, including the
    // padding lanes of the last element, to `value`.
    INLINE valarray_rep_sse_times2(std::size_t size, value_type value)
        : _size(size)
        , _elements(elements_for(size))
        , _values(allocate(_elements))
    {
        const __m128 value_sse = _mm_set1_ps(value);
        for (std::size_t i = 0; i < _elements; ++i)
            _values[i] = MultipleValues_2<__m128>(value_sse, value_sse);
    }

    INLINE ~valarray_rep_sse_times2() {
        _mm_free(_values);
    }

    // Deep copy. The buffer is sized in bytes (_elements * sizeof(element_type));
    // passing the bare element count to _mm_malloc would under-allocate and
    // let the element-wise copy below write past the end of the buffer.
    INLINE valarray_rep_sse_times2(const valarray_rep_sse_times2& copy)
        : _size(copy._size)
        , _elements(copy._elements)
        , _values(allocate(copy._elements))
    {
        swap(copy);
    }

    // Deep-copy assignment. When the element counts differ the buffer is
    // replaced by one of the right size first, so the copy can neither
    // overrun nor under-fill it. Self-assignment is a no-op.
    INLINE valarray_rep_sse_times2& operator= (const valarray_rep_sse_times2& copy) {
        if (this == &copy)
            return *this;
        if (_elements != copy._elements) {
            // Allocate before freeing so the old buffer stays valid until
            // its replacement exists.
            element_type* fresh = allocate(copy._elements);
            _mm_free(_values);
            _values = fresh;
            _elements = copy._elements;
        }
        _size = copy._size;
        swap(copy);
        return *this;
    }

    // Returns a copy of the i-th register pair; i must be < elements().
    INLINE element_type operator() (std::size_t i) const {
        assert(i < _elements);
        return _values[i];
    }

    // Returns a writable reference to the i-th register pair; i must be < elements().
    INLINE element_type& operator() (std::size_t i) {
        assert(i < _elements);
        return _values[i];
    }

    // Returns the i-th float; i must be < size().
    INLINE float operator[] (std::size_t i) const {
        assert(i < _size);
        const std::size_t element = i / element_size();
        const std::size_t index = i % element_size();
        return get(_values[element], index);
    }

    // Number of floats the array was created for.
    INLINE std::size_t size() const {
        return _size;
    }

    // Number of element_type objects in the buffer.
    INLINE std::size_t elements() const {
        return _elements;
    }

    // Alignment, in bytes, requested from _mm_malloc.
    INLINE static std::size_t alignment() {
        return sizeof(element_type);
    }

    // Number of floats held by one element_type.
    INLINE static std::size_t element_size() {
        return sizeof(element_type) / sizeof(value_type);
    }

    // Despite the name, this copies every element of `copy` into this
    // object's existing buffer and leaves `copy` untouched. Both objects
    // must already have the same size (checked with assert).
    INLINE void swap(const valarray_rep_sse_times2& copy) {
        assert(_size == copy._size);
        for (std::size_t i = 0; i < _elements; ++i)
            _values[i] = copy._values[i];
    }

private:
    // Number of element_type objects needed to hold `size` floats (rounds up).
    INLINE static std::size_t elements_for(std::size_t size) {
        return (size / element_size()) + ((size % element_size() == 0) ? 0 : 1);
    }

    // Allocates an aligned, uninitialised buffer of `count` element_type objects.
    INLINE static element_type* allocate(std::size_t count) {
        element_type* block = static_cast<element_type*>(
            _mm_malloc(count * sizeof(element_type), alignment()));
        // A null result is only acceptable for an empty request.
        assert(block != 0 || count == 0);
        return block;
    }

    std::size_t _size;
    std::size_t _elements;
    element_type* _values;
} ; // end class valarray_rep_sse_times2<float>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment