// Load four consecutive floats starting at a into the result r, preserving
// order: lane i of r receives a[i].
// NOTE(review): the real intrinsic is documented as requiring a to be 16-byte
// aligned; that is not expressed in this pseudo-code — confirm before relying on it.
__m128 _mm_load_ps (float * a)
{
r[0] = a[0]
r[1] = a[1]
r[2] = a[2]
r[3] = a[3]
}
// Load four consecutive floats starting at a into the result r in reversed
// order: r[0] receives a[3] and r[3] receives a[0].
__m128 _mm_loadr_ps (float * a)
{
r[0] = a[3]
r[1] = a[2]
r[2] = a[1]
r[3] = a[0]
}
// Broadcast the single float a[0] into all four lanes of the result r.
// Only a[0] is read; a[1..3] are never accessed.
__m128 _mm_load1_ps (float * a)
{
r[0] = a[0]
r[1] = a[0]
r[2] = a[0]
r[3] = a[0]
}
// Load the single float a[0] into lane 0 of the result r and set lanes 1-3
// to zero. Only a[0] is read.
__m128 _mm_load_ss (float * a)
{
r[0] = a[0]
r[1] = 0
r[2] = 0
r[3] = 0
}
// Build the result r from four scalars, with the FIRST argument landing in the
// HIGHEST lane: x -> r[3], y -> r[2], z -> r[1], w -> r[0].
__m128 _mm_set_ps (float x, float y, float z, float w)
{
r[0] = w
r[1] = z
r[2] = y
r[3] = x
}
// Build the result r from four scalars in argument order, i.e. the first
// argument lands in the lowest lane: x -> r[0], y -> r[1], z -> r[2], w -> r[3].
__m128 _mm_setr_ps (float x, float y, float z, float w)
{
r[0] = x
r[1] = y
r[2] = z
r[3] = w
}
// Broadcast the scalar x into all four lanes of the result r.
__m128 _mm_set1_ps (float x)
{
r[0] = x
r[1] = x
r[2] = x
r[3] = x
}
// Place the scalar x in lane 0 of the result r and set lanes 1-3 to zero.
__m128 _mm_set_ss (float x)
{
r[0] = x
r[1] = 0
r[2] = 0
r[3] = 0
}
// Produce a result r whose four lanes are all zero. Takes no arguments.
__m128 _mm_setzero_ps ()
{
r[0] = 0
r[1] = 0
r[2] = 0
r[3] = 0
}
// Store the four lanes of a to v[0..3], preserving order: v[i] receives a[i].
// NOTE(review): the real intrinsic is documented as requiring v to be 16-byte
// aligned; that is not expressed in this pseudo-code — confirm before relying on it.
void _mm_store_ps (float * v, __m128 a)
{
v[0] = a[0]
v[1] = a[1]
v[2] = a[2]
v[3] = a[3]
}
// Store the four lanes of a to v[0..3] in reversed order: v[0] receives a[3]
// and v[3] receives a[0].
void _mm_storer_ps (float * v, __m128 a)
{
v[0] = a[3]
v[1] = a[2]
v[2] = a[1]
v[3] = a[0]
}
// Write lane 0 of a to each of v[0..3]; lanes 1-3 of a are not used.
void _mm_store1_ps (float * v, __m128 a)
{
v[0] = a[0]
v[1] = a[0]
v[2] = a[0]
v[3] = a[0]
}
// Write lane 0 of a to v[0]. No other element of v is written and lanes 1-3
// of a are not used.
void _mm_store_ss (float * v, __m128 a)
{
v[0] = a[0]
}
// Build the result r by picking lanes by index: the two low lanes of r are
// taken from a, the two high lanes from b. The third parameter is written here
// as the _MM_SHUFFLE (i, j, k, l) expression itself; its four arguments are the
// lane indices used below, and the LAST argument (l) selects the LOWEST lane.
// NOTE(review): each index is presumably in the range 0..3 — confirm.
__m128 _mm_shuffle_ps (__m128 a, __m128 b, _MM_SHUFFLE (i, j, k, l))
{
r[0] = a[l]
r[1] = a[k]
r[2] = b[j]
r[3] = b[i]
}
// Interleave the two low lanes (0 and 1) of a and b, a first:
// r = (a[0], b[0], a[1], b[1]). Lanes 2-3 of both inputs are not used.
__m128 _mm_unpacklo_ps (__m128 a, __m128 b)
{
r[0] = a[0]
r[1] = b[0]
r[2] = a[1]
r[3] = b[1]
}
// Interleave the two high lanes (2 and 3) of a and b, a first:
// r = (a[2], b[2], a[3], b[3]). Lanes 0-1 of both inputs are not used.
__m128 _mm_unpackhi_ps (__m128 a, __m128 b)
{
r[0] = a[2]
r[1] = b[2]
r[2] = a[3]
r[3] = b[3]
}
// Copy a into the result r, replacing lane 0 with lane 0 of b.
// Lanes 1-3 of b are not used.
__m128 _mm_move_ss (__m128 a, __m128 b)
{
r[0] = b[0]
r[1] = a[1]
r[2] = a[2]
r[3] = a[3]
}
// "Move low to high": the low half of r (lanes 0-1) is the low half of a, and
// the high half of r (lanes 2-3) is the LOW half of b.
__m128 _mm_movelh_ps (__m128 a, __m128 b)
{
r[0] = a[0]
r[1] = a[1]
r[2] = b[0]
r[3] = b[1]
}
// "Move high to low": the low half of r (lanes 0-1) is the HIGH half of b, and
// the high half of r (lanes 2-3) is the high half of a. Note that b, not a,
// supplies the low lanes.
__m128 _mm_movehl_ps (__m128 a, __m128 b)
{
r[0] = b[2]
r[1] = b[3]
r[2] = a[2]
r[3] = a[3]
}
// Duplicate the even-indexed lanes of a: r = (a[0], a[0], a[2], a[2]).
// Lanes 1 and 3 of a are not used.
__m128 _mm_moveldup_ps (__m128 a)
{
r[0] = a[0]
r[1] = a[0]
r[2] = a[2]
r[3] = a[2]
}
// Duplicate the odd-indexed lanes of a: r = (a[1], a[1], a[3], a[3]).
// Lanes 0 and 2 of a are not used.
__m128 _mm_movehdup_ps (__m128 a)
{
r[0] = a[1]
r[1] = a[1]
r[2] = a[3]
r[3] = a[3]
}
// Lane-wise addition: r[i] = a[i] + b[i] for each of the four lanes.
__m128 _mm_add_ps (__m128 a, __m128 b)
{
r[0] = a[0] + b[0]
r[1] = a[1] + b[1]
r[2] = a[2] + b[2]
r[3] = a[3] + b[3]
}
// Scalar addition: only lane 0 is computed (a[0] + b[0]); lanes 1-3 of the
// result are copied unchanged from a. Lanes 1-3 of b are not used.
__m128 _mm_add_ss (__m128 a, __m128 b)
{
r[0] = a[0] + b[0]
r[1] = a[1]
r[2] = a[2]
r[3] = a[3]
}
// Lane-wise subtraction: r[i] = a[i] - b[i] for each of the four lanes.
__m128 _mm_sub_ps (__m128 a, __m128 b)
{
r[0] = a[0] - b[0]
r[1] = a[1] - b[1]
r[2] = a[2] - b[2]
r[3] = a[3] - b[3]
}
// Scalar subtraction: only lane 0 is computed (a[0] - b[0]); lanes 1-3 of the
// result are copied unchanged from a. Lanes 1-3 of b are not used.
__m128 _mm_sub_ss (__m128 a, __m128 b)
{
r[0] = a[0] - b[0]
r[1] = a[1]
r[2] = a[2]
r[3] = a[3]
}
// Horizontal add: sums adjacent lane pairs WITHIN each input rather than
// across inputs. The two pair-sums of a fill r[0..1]; the two pair-sums of b
// fill r[2..3].
__m128 _mm_hadd_ps (__m128 a, __m128 b)
{
r[0] = a[0] + a[1]
r[1] = a[2] + a[3]
r[2] = b[0] + b[1]
r[3] = b[2] + b[3]
}
// Horizontal subtract: for each adjacent lane pair WITHIN an input, computes
// (lower-indexed lane) - (higher-indexed lane). The two differences of a fill
// r[0..1]; the two differences of b fill r[2..3].
__m128 _mm_hsub_ps (__m128 a, __m128 b)
{
r[0] = a[0] - a[1]
r[1] = a[2] - a[3]
r[2] = b[0] - b[1]
r[3] = b[2] - b[3]
}
// Alternating subtract/add: even lanes (0 and 2) compute a[i] - b[i], odd
// lanes (1 and 3) compute a[i] + b[i].
__m128 _mm_addsub_ps (__m128 a, __m128 b)
{
r[0] = a[0] - b[0]
r[1] = a[1] + b[1]
r[2] = a[2] - b[2]
r[3] = a[3] + b[3]
}
// Lane-wise multiplication: r[i] = a[i] * b[i] for each of the four lanes.
__m128 _mm_mul_ps (__m128 a, __m128 b)
{
r[0] = a[0] * b[0]
r[1] = a[1] * b[1]
r[2] = a[2] * b[2]
r[3] = a[3] * b[3]
}
// Scalar multiplication: only lane 0 is computed (a[0] * b[0]); lanes 1-3 of
// the result are copied unchanged from a. Lanes 1-3 of b are not used.
__m128 _mm_mul_ss (__m128 a, __m128 b)
{
r[0] = a[0] * b[0]
r[1] = a[1]
r[2] = a[2]
r[3] = a[3]
}
// Lane-wise division: r[i] = a[i] / b[i] for each of the four lanes
// (a is the dividend, b the divisor).
__m128 _mm_div_ps (__m128 a, __m128 b)
{
r[0] = a[0] / b[0]
r[1] = a[1] / b[1]
r[2] = a[2] / b[2]
r[3] = a[3] / b[3]
}
// Scalar division: only lane 0 is computed (a[0] / b[0]); lanes 1-3 of the
// result are copied unchanged from a. Lanes 1-3 of b are not used.
__m128 _mm_div_ss (__m128 a, __m128 b)
{
r[0] = a[0] / b[0]
r[1] = a[1]
r[2] = a[2]
r[3] = a[3]
}
// TODO (open item; what remains to be done is not stated)
// Lane-wise square root: r[i] = sqrt (a[i]) for each of the four lanes.
__m128 _mm_sqrt_ps (__m128 a)
{
r[0] = sqrt (a[0])
r[1] = sqrt (a[1])
r[2] = sqrt (a[2])
r[3] = sqrt (a[3])
}
// Scalar square root: only lane 0 is computed (sqrt of a[0]); lanes 1-3 of
// the result are copied unchanged from a.
__m128 _mm_sqrt_ss (__m128 a)
{
r[0] = sqrt (a[0])
r[1] = a[1]
r[2] = a[2]
r[3] = a[3]
}
// Approximate reciprocal of each of the four lanes of a. The lines below write
// the exact quotient 1.0 / a[i] as the value being approximated; the intrinsic
// itself yields only an approximation of it.
// NOTE(review): Intel documents the maximum relative error as below
// 1.5 * 2^-12 — confirm before depending on a specific precision.
__m128 _mm_rcp_ps (__m128 a)
{
r[0] = 1.0 / a[0]
r[1] = 1.0 / a[1]
r[2] = 1.0 / a[2]
r[3] = 1.0 / a[3]
}
// Approximate reciprocal of lane 0 only; lanes 1-3 of the result are copied
// unchanged from a. The line for r[0] writes the exact quotient 1.0 / a[0] as
// the value being approximated; the intrinsic itself yields only an
// approximation of it.
// NOTE(review): Intel documents the maximum relative error as below
// 1.5 * 2^-12 — confirm before depending on a specific precision.
__m128 _mm_rcp_ss (__m128 a)
{
r[0] = 1.0 / a[0]
r[1] = a[1]
r[2] = a[2]
r[3] = a[3]
}
// Approximate reciprocal square root of each of the four lanes of a. The lines
// below write the exact value 1.0 / sqrt (a[i]) as the quantity being
// approximated; the intrinsic itself yields only an approximation of it.
// NOTE(review): Intel documents the maximum relative error as below
// 1.5 * 2^-12 — confirm before depending on a specific precision.
__m128 _mm_rsqrt_ps (__m128 a)
{
r[0] = 1.0 / sqrt (a[0])
r[1] = 1.0 / sqrt (a[1])
r[2] = 1.0 / sqrt (a[2])
r[3] = 1.0 / sqrt (a[3])
}
// Approximate reciprocal square root of lane 0 only; lanes 1-3 of the result
// are copied unchanged from a. The line for r[0] writes the exact value
// 1.0 / sqrt (a[0]) as the quantity being approximated; the intrinsic itself
// yields only an approximation of it.
// NOTE(review): Intel documents the maximum relative error as below
// 1.5 * 2^-12 — confirm before depending on a specific precision.
__m128 _mm_rsqrt_ss (__m128 a)
{
r[0] = 1.0 / sqrt (a[0])
r[1] = a[1]
r[2] = a[2]
r[3] = a[3]
}
// Lane-wise minimum: r[i] = min (a[i], b[i]) for each of the four lanes.
// NOTE(review): min is not defined in these notes, so the result for NaN
// inputs or for +0.0 versus -0.0 is unspecified here — check the Intel guide.
__m128 _mm_min_ps (__m128 a, __m128 b)
{
r[0] = min (a[0], b[0])
r[1] = min (a[1], b[1])
r[2] = min (a[2], b[2])
r[3] = min (a[3], b[3])
}
// Scalar minimum: only lane 0 is computed (min of a[0] and b[0]); lanes 1-3 of
// the result are copied unchanged from a. Lanes 1-3 of b are not used.
// NOTE(review): min is not defined in these notes, so the result for NaN
// inputs or for +0.0 versus -0.0 is unspecified here — check the Intel guide.
__m128 _mm_min_ss (__m128 a, __m128 b)
{
r[0] = min (a[0], b[0])
r[1] = a[1]
r[2] = a[2]
r[3] = a[3]
}
// Lane-wise maximum: r[i] = max (a[i], b[i]) for each of the four lanes.
// NOTE(review): max is not defined in these notes, so the result for NaN
// inputs or for +0.0 versus -0.0 is unspecified here — check the Intel guide.
__m128 _mm_max_ps (__m128 a, __m128 b)
{
r[0] = max (a[0], b[0])
r[1] = max (a[1], b[1])
r[2] = max (a[2], b[2])
r[3] = max (a[3], b[3])
}
// Scalar maximum: only lane 0 is computed (max of a[0] and b[0]); lanes 1-3 of
// the result are copied unchanged from a. Lanes 1-3 of b are not used.
// NOTE(review): max is not defined in these notes, so the result for NaN
// inputs or for +0.0 versus -0.0 is unspecified here — check the Intel guide.
__m128 _mm_max_ss (__m128 a, __m128 b)
{
r[0] = max (a[0], b[0])
r[1] = a[1]
r[2] = a[2]
r[3] = a[3]
}