Skip to content

Instantly share code, notes, and snippets.

@pedropgusmao
Created April 7, 2017 08:49
Show Gist options
  • Select an option

  • Save pedropgusmao/fe283d613a3f47ea57b3bf6f81f85fed to your computer and use it in GitHub Desktop.

Select an option

Save pedropgusmao/fe283d613a3f47ea57b3bf6f81f85fed to your computer and use it in GitHub Desktop.
VSX.c
#ifdef __PPC64__
#include <altivec.h>
#include <stddef.h>
//--------------------------------------------------------------------------------------------------
// THDoubleVector_fill_VSX:
//--------------------------------------------------------------------------------------------------
static void THDoubleVector_fill_VSX(double *x, const double c, const ptrdiff_t n)
{
ptrdiff_t i;
double val[2] = {c, c};
vector double fp64vec2 = vec_xl(0, val);
for (i = 0; i <= n-128; i += 128)
{
vec_xst(fp64vec2, 0, x+(i ));
vec_xst(fp64vec2, 0, x+(i+2 ));
vec_xst(fp64vec2, 0, x+(i+4 ));
vec_xst(fp64vec2, 0, x+(i+6 ));
vec_xst(fp64vec2, 0, x+(i+8 ));
vec_xst(fp64vec2, 0, x+(i+10 ));
vec_xst(fp64vec2, 0, x+(i+12 ));
vec_xst(fp64vec2, 0, x+(i+14 ));
vec_xst(fp64vec2, 0, x+(i+16 ));
vec_xst(fp64vec2, 0, x+(i+18 ));
vec_xst(fp64vec2, 0, x+(i+20 ));
vec_xst(fp64vec2, 0, x+(i+22 ));
vec_xst(fp64vec2, 0, x+(i+24 ));
vec_xst(fp64vec2, 0, x+(i+26 ));
vec_xst(fp64vec2, 0, x+(i+28 ));
vec_xst(fp64vec2, 0, x+(i+30 ));
vec_xst(fp64vec2, 0, x+(i+32 ));
vec_xst(fp64vec2, 0, x+(i+34 ));
vec_xst(fp64vec2, 0, x+(i+36 ));
vec_xst(fp64vec2, 0, x+(i+38 ));
vec_xst(fp64vec2, 0, x+(i+40 ));
vec_xst(fp64vec2, 0, x+(i+42 ));
vec_xst(fp64vec2, 0, x+(i+44 ));
vec_xst(fp64vec2, 0, x+(i+46 ));
vec_xst(fp64vec2, 0, x+(i+48 ));
vec_xst(fp64vec2, 0, x+(i+50 ));
vec_xst(fp64vec2, 0, x+(i+52 ));
vec_xst(fp64vec2, 0, x+(i+54 ));
vec_xst(fp64vec2, 0, x+(i+56 ));
vec_xst(fp64vec2, 0, x+(i+58 ));
vec_xst(fp64vec2, 0, x+(i+60 ));
vec_xst(fp64vec2, 0, x+(i+62 ));
vec_xst(fp64vec2, 0, x+(i+64 ));
vec_xst(fp64vec2, 0, x+(i+66 ));
vec_xst(fp64vec2, 0, x+(i+68 ));
vec_xst(fp64vec2, 0, x+(i+70 ));
vec_xst(fp64vec2, 0, x+(i+72 ));
vec_xst(fp64vec2, 0, x+(i+74 ));
vec_xst(fp64vec2, 0, x+(i+76 ));
vec_xst(fp64vec2, 0, x+(i+78 ));
vec_xst(fp64vec2, 0, x+(i+80 ));
vec_xst(fp64vec2, 0, x+(i+82 ));
vec_xst(fp64vec2, 0, x+(i+84 ));
vec_xst(fp64vec2, 0, x+(i+86 ));
vec_xst(fp64vec2, 0, x+(i+88 ));
vec_xst(fp64vec2, 0, x+(i+90 ));
vec_xst(fp64vec2, 0, x+(i+92 ));
vec_xst(fp64vec2, 0, x+(i+94 ));
vec_xst(fp64vec2, 0, x+(i+96 ));
vec_xst(fp64vec2, 0, x+(i+98 ));
vec_xst(fp64vec2, 0, x+(i+100));
vec_xst(fp64vec2, 0, x+(i+102));
vec_xst(fp64vec2, 0, x+(i+104));
vec_xst(fp64vec2, 0, x+(i+106));
vec_xst(fp64vec2, 0, x+(i+108));
vec_xst(fp64vec2, 0, x+(i+110));
vec_xst(fp64vec2, 0, x+(i+112));
vec_xst(fp64vec2, 0, x+(i+114));
vec_xst(fp64vec2, 0, x+(i+116));
vec_xst(fp64vec2, 0, x+(i+118));
vec_xst(fp64vec2, 0, x+(i+120));
vec_xst(fp64vec2, 0, x+(i+122));
vec_xst(fp64vec2, 0, x+(i+124));
vec_xst(fp64vec2, 0, x+(i+126));
}
for (; i <= n-16; i += 16)
{
vec_xst(fp64vec2, 0, x+(i ));
vec_xst(fp64vec2, 0, x+(i+2 ));
vec_xst(fp64vec2, 0, x+(i+4 ));
vec_xst(fp64vec2, 0, x+(i+6 ));
vec_xst(fp64vec2, 0, x+(i+8 ));
vec_xst(fp64vec2, 0, x+(i+10 ));
vec_xst(fp64vec2, 0, x+(i+12 ));
vec_xst(fp64vec2, 0, x+(i+14 ));
}
for (; i <= n-2; i += 2)
vec_xst(fp64vec2, 0, x+(i ));
for (; i < n; i++)
x[i] = c;
}
//--------------------------------------------------------------------------------------------------
// THDoubleVector_cadds_VSX:
//--------------------------------------------------------------------------------------------------
static void THDoubleVector_cadd_VSX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n)
{
ptrdiff_t i;
double val[2] = {c, c};
vector double c_fp64vec2 = vec_xl(0, val);
vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
for (i = 0; i <= n-24; i += 24)
{
y0_fp64vec2 = vec_xl(0, y+(i ));
y1_fp64vec2 = vec_xl(0, y+(i+2 ));
y2_fp64vec2 = vec_xl(0, y+(i+4 ));
y3_fp64vec2 = vec_xl(0, y+(i+6 ));
y4_fp64vec2 = vec_xl(0, y+(i+8 ));
y5_fp64vec2 = vec_xl(0, y+(i+10));
y6_fp64vec2 = vec_xl(0, y+(i+12));
y7_fp64vec2 = vec_xl(0, y+(i+14));
y8_fp64vec2 = vec_xl(0, y+(i+16));
y9_fp64vec2 = vec_xl(0, y+(i+18));
y10_fp64vec2 = vec_xl(0, y+(i+20));
y11_fp64vec2 = vec_xl(0, y+(i+22));
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+2 ));
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
x3_fp64vec2 = vec_xl(0, x+(i+6 ));
x4_fp64vec2 = vec_xl(0, x+(i+8 ));
x5_fp64vec2 = vec_xl(0, x+(i+10));
x6_fp64vec2 = vec_xl(0, x+(i+12));
x7_fp64vec2 = vec_xl(0, x+(i+14));
x8_fp64vec2 = vec_xl(0, x+(i+16));
x9_fp64vec2 = vec_xl(0, x+(i+18));
x10_fp64vec2 = vec_xl(0, x+(i+20));
x11_fp64vec2 = vec_xl(0, x+(i+22));
y0_fp64vec2 = vec_madd(y0_fp64vec2, c_fp64vec2, x0_fp64vec2);
y1_fp64vec2 = vec_madd(y1_fp64vec2, c_fp64vec2, x1_fp64vec2);
y2_fp64vec2 = vec_madd(y2_fp64vec2, c_fp64vec2, x2_fp64vec2);
y3_fp64vec2 = vec_madd(y3_fp64vec2, c_fp64vec2, x3_fp64vec2);
y4_fp64vec2 = vec_madd(y4_fp64vec2, c_fp64vec2, x4_fp64vec2);
y5_fp64vec2 = vec_madd(y5_fp64vec2, c_fp64vec2, x5_fp64vec2);
y6_fp64vec2 = vec_madd(y6_fp64vec2, c_fp64vec2, x6_fp64vec2);
y7_fp64vec2 = vec_madd(y7_fp64vec2, c_fp64vec2, x7_fp64vec2);
y8_fp64vec2 = vec_madd(y8_fp64vec2, c_fp64vec2, x8_fp64vec2);
y9_fp64vec2 = vec_madd(y9_fp64vec2, c_fp64vec2, x9_fp64vec2);
y10_fp64vec2 = vec_madd(y10_fp64vec2, c_fp64vec2,x10_fp64vec2);
y11_fp64vec2 = vec_madd(y11_fp64vec2, c_fp64vec2,x11_fp64vec2);
vec_xst(y0_fp64vec2, 0, z+(i ));
vec_xst(y1_fp64vec2, 0, z+(i+2 ));
vec_xst(y2_fp64vec2, 0, z+(i+4 ));
vec_xst(y3_fp64vec2, 0, z+(i+6 ));
vec_xst(y4_fp64vec2, 0, z+(i+8 ));
vec_xst(y5_fp64vec2, 0, z+(i+10));
vec_xst(y6_fp64vec2, 0, z+(i+12));
vec_xst(y7_fp64vec2, 0, z+(i+14));
vec_xst(y8_fp64vec2, 0, z+(i+16));
vec_xst(y9_fp64vec2, 0, z+(i+18));
vec_xst(y10_fp64vec2, 0, z+(i+20));
vec_xst(y11_fp64vec2, 0, z+(i+22));
}
for (; i <= n-8; i += 8)
{
y0_fp64vec2 = vec_xl(0, y+(i ));
y1_fp64vec2 = vec_xl(0, y+(i+2 ));
y2_fp64vec2 = vec_xl(0, y+(i+4 ));
y3_fp64vec2 = vec_xl(0, y+(i+6 ));
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+2 ));
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
x3_fp64vec2 = vec_xl(0, x+(i+6 ));
y0_fp64vec2 = vec_madd(y0_fp64vec2, c_fp64vec2, x0_fp64vec2);
y1_fp64vec2 = vec_madd(y1_fp64vec2, c_fp64vec2, x1_fp64vec2);
y2_fp64vec2 = vec_madd(y2_fp64vec2, c_fp64vec2, x2_fp64vec2);
y3_fp64vec2 = vec_madd(y3_fp64vec2, c_fp64vec2, x3_fp64vec2);
vec_xst(y0_fp64vec2, 0, z+(i ));
vec_xst(y1_fp64vec2, 0, z+(i+2 ));
vec_xst(y2_fp64vec2, 0, z+(i+4 ));
vec_xst(y3_fp64vec2, 0, z+(i+6 ));
}
for (; i <= n-2; i += 2)
{
y0_fp64vec2 = vec_xl(0, y+(i ));
x0_fp64vec2 = vec_xl(0, x+(i ));
y0_fp64vec2 = vec_madd(y0_fp64vec2, c_fp64vec2, x0_fp64vec2);
vec_xst(y0_fp64vec2, 0, z+(i ));
}
for (; i < n; i++)
z[i] = x[i] + c* y[i];
}
//--------------------------------------------------------------------------------------------------
// THDoubleVector_adds_VSX:
//--------------------------------------------------------------------------------------------------
static void THDoubleVector_adds_VSX(double *y, const double *x, const double c, const ptrdiff_t n)
{
ptrdiff_t i;
double val[2] = {c, c};
vector double c_fp64vec2 = vec_xl(0, val);
vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
for (i = 0; i <= n-24; i += 24)
{
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+2 ));
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
x3_fp64vec2 = vec_xl(0, x+(i+6 ));
x4_fp64vec2 = vec_xl(0, x+(i+8 ));
x5_fp64vec2 = vec_xl(0, x+(i+10));
x6_fp64vec2 = vec_xl(0, x+(i+12));
x7_fp64vec2 = vec_xl(0, x+(i+14));
x8_fp64vec2 = vec_xl(0, x+(i+16));
x9_fp64vec2 = vec_xl(0, x+(i+18));
x10_fp64vec2 = vec_xl(0, x+(i+20));
x11_fp64vec2 = vec_xl(0, x+(i+22));
y0_fp64vec2 = vec_add(x0_fp64vec2, c_fp64vec2);
y1_fp64vec2 = vec_add(x1_fp64vec2, c_fp64vec2);
y2_fp64vec2 = vec_add(x2_fp64vec2, c_fp64vec2);
y3_fp64vec2 = vec_add(x3_fp64vec2, c_fp64vec2);
y4_fp64vec2 = vec_add(x4_fp64vec2, c_fp64vec2);
y5_fp64vec2 = vec_add(x5_fp64vec2, c_fp64vec2);
y6_fp64vec2 = vec_add(x6_fp64vec2, c_fp64vec2);
y7_fp64vec2 = vec_add(x7_fp64vec2, c_fp64vec2);
y8_fp64vec2 = vec_add(x8_fp64vec2, c_fp64vec2);
y9_fp64vec2 = vec_add(x9_fp64vec2, c_fp64vec2);
y10_fp64vec2 = vec_add(x10_fp64vec2, c_fp64vec2);
y11_fp64vec2 = vec_add(x11_fp64vec2, c_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
vec_xst(y1_fp64vec2, 0, y+(i+2 ));
vec_xst(y2_fp64vec2, 0, y+(i+4 ));
vec_xst(y3_fp64vec2, 0, y+(i+6 ));
vec_xst(y4_fp64vec2, 0, y+(i+8 ));
vec_xst(y5_fp64vec2, 0, y+(i+10));
vec_xst(y6_fp64vec2, 0, y+(i+12));
vec_xst(y7_fp64vec2, 0, y+(i+14));
vec_xst(y8_fp64vec2, 0, y+(i+16));
vec_xst(y9_fp64vec2, 0, y+(i+18));
vec_xst(y10_fp64vec2, 0, y+(i+20));
vec_xst(y11_fp64vec2, 0, y+(i+22));
}
for (; i <= n-8; i += 8)
{
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+2 ));
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
x3_fp64vec2 = vec_xl(0, x+(i+6 ));
y0_fp64vec2 = vec_add(x0_fp64vec2, c_fp64vec2);
y1_fp64vec2 = vec_add(x1_fp64vec2, c_fp64vec2);
y2_fp64vec2 = vec_add(x2_fp64vec2, c_fp64vec2);
y3_fp64vec2 = vec_add(x3_fp64vec2, c_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
vec_xst(y1_fp64vec2, 0, y+(i+2 ));
vec_xst(y2_fp64vec2, 0, y+(i+4 ));
vec_xst(y3_fp64vec2, 0, y+(i+6 ));
}
for (; i <= n-2; i += 2)
{
x0_fp64vec2 = vec_xl(0, x+(i ));
y0_fp64vec2 = vec_add(x0_fp64vec2, c_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
}
for (; i < n; i++)
y[i] = x[i] +c;
}
//--------------------------------------------------------------------------------------------------
// THDoubleVector_cmul_VSX:
//--------------------------------------------------------------------------------------------------
static void THDoubleVector_cmul_VSX(double *z, const double *x, const double *y, const ptrdiff_t n)
{
ptrdiff_t i;
vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
for (i = 0; i <= n-24; i += 24)
{
y0_fp64vec2 = vec_xl(0, y+(i ));
y1_fp64vec2 = vec_xl(0, y+(i+2 ));
y2_fp64vec2 = vec_xl(0, y+(i+4 ));
y3_fp64vec2 = vec_xl(0, y+(i+6 ));
y4_fp64vec2 = vec_xl(0, y+(i+8 ));
y5_fp64vec2 = vec_xl(0, y+(i+10));
y6_fp64vec2 = vec_xl(0, y+(i+12));
y7_fp64vec2 = vec_xl(0, y+(i+14));
y8_fp64vec2 = vec_xl(0, y+(i+16));
y9_fp64vec2 = vec_xl(0, y+(i+18));
y10_fp64vec2 = vec_xl(0, y+(i+20));
y11_fp64vec2 = vec_xl(0, y+(i+22));
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+2 ));
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
x3_fp64vec2 = vec_xl(0, x+(i+6 ));
x4_fp64vec2 = vec_xl(0, x+(i+8 ));
x5_fp64vec2 = vec_xl(0, x+(i+10));
x6_fp64vec2 = vec_xl(0, x+(i+12));
x7_fp64vec2 = vec_xl(0, x+(i+14));
x8_fp64vec2 = vec_xl(0, x+(i+16));
x9_fp64vec2 = vec_xl(0, x+(i+18));
x10_fp64vec2 = vec_xl(0, x+(i+20));
x11_fp64vec2 = vec_xl(0, x+(i+22));
y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2);
y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2);
y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2);
y4_fp64vec2 = vec_mul(y4_fp64vec2, x4_fp64vec2);
y5_fp64vec2 = vec_mul(y5_fp64vec2, x5_fp64vec2);
y6_fp64vec2 = vec_mul(y6_fp64vec2, x6_fp64vec2);
y7_fp64vec2 = vec_mul(y7_fp64vec2, x7_fp64vec2);
y8_fp64vec2 = vec_mul(y8_fp64vec2, x8_fp64vec2);
y9_fp64vec2 = vec_mul(y9_fp64vec2, x9_fp64vec2);
y10_fp64vec2 = vec_mul(y10_fp64vec2, x10_fp64vec2);
y11_fp64vec2 = vec_mul(y11_fp64vec2, x11_fp64vec2);
vec_xst(y0_fp64vec2, 0, z+(i ));
vec_xst(y1_fp64vec2, 0, z+(i+2 ));
vec_xst(y2_fp64vec2, 0, z+(i+4 ));
vec_xst(y3_fp64vec2, 0, z+(i+6 ));
vec_xst(y4_fp64vec2, 0, z+(i+8 ));
vec_xst(y5_fp64vec2, 0, z+(i+10));
vec_xst(y6_fp64vec2, 0, z+(i+12));
vec_xst(y7_fp64vec2, 0, z+(i+14));
vec_xst(y8_fp64vec2, 0, z+(i+16));
vec_xst(y9_fp64vec2, 0, z+(i+18));
vec_xst(y10_fp64vec2, 0, z+(i+20));
vec_xst(y11_fp64vec2, 0, z+(i+22));
}
for (; i <= n-8; i += 8)
{
y0_fp64vec2 = vec_xl(0, y+(i ));
y1_fp64vec2 = vec_xl(0, y+(i+2 ));
y2_fp64vec2 = vec_xl(0, y+(i+4 ));
y3_fp64vec2 = vec_xl(0, y+(i+6 ));
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+2 ));
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
x3_fp64vec2 = vec_xl(0, x+(i+6 ));
y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2);
y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2);
y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2);
vec_xst(y0_fp64vec2, 0, z+(i ));
vec_xst(y1_fp64vec2, 0, z+(i+2 ));
vec_xst(y2_fp64vec2, 0, z+(i+4 ));
vec_xst(y3_fp64vec2, 0, z+(i+6 ));
}
for (; i <= n-2; i += 2)
{
y0_fp64vec2 = vec_xl(0, y+(i ));
x0_fp64vec2 = vec_xl(0, x+(i ));
y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2);
vec_xst(y0_fp64vec2, 0, z+(i ));
}
for (; i < n; i++)
z[i] = x[i] * y[i];
}
//--------------------------------------------------------------------------------------------------
// THDoubleVector_muls_VSX:
//--------------------------------------------------------------------------------------------------
static void THDoubleVector_muls_VSX(double *y, const double *x, const double c, const ptrdiff_t n)
{
ptrdiff_t i;
double val[2] = {c, c};
vector double c_fp64vec2 = vec_xl(0, val);
vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
for (i = 0; i <= n-24; i += 24)
{
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+2 ));
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
x3_fp64vec2 = vec_xl(0, x+(i+6 ));
x4_fp64vec2 = vec_xl(0, x+(i+8 ));
x5_fp64vec2 = vec_xl(0, x+(i+10));
x6_fp64vec2 = vec_xl(0, x+(i+12));
x7_fp64vec2 = vec_xl(0, x+(i+14));
x8_fp64vec2 = vec_xl(0, x+(i+16));
x9_fp64vec2 = vec_xl(0, x+(i+18));
x10_fp64vec2 = vec_xl(0, x+(i+20));
x11_fp64vec2 = vec_xl(0, x+(i+22));
y0_fp64vec2 = vec_mul(x0_fp64vec2, c_fp64vec2);
y1_fp64vec2 = vec_mul(x1_fp64vec2, c_fp64vec2);
y2_fp64vec2 = vec_mul(x2_fp64vec2, c_fp64vec2);
y3_fp64vec2 = vec_mul(x3_fp64vec2, c_fp64vec2);
y4_fp64vec2 = vec_mul(x4_fp64vec2, c_fp64vec2);
y5_fp64vec2 = vec_mul(x5_fp64vec2, c_fp64vec2);
y6_fp64vec2 = vec_mul(x6_fp64vec2, c_fp64vec2);
y7_fp64vec2 = vec_mul(x7_fp64vec2, c_fp64vec2);
y8_fp64vec2 = vec_mul(x8_fp64vec2, c_fp64vec2);
y9_fp64vec2 = vec_mul(x9_fp64vec2, c_fp64vec2);
y10_fp64vec2 = vec_mul(x10_fp64vec2, c_fp64vec2);
y11_fp64vec2 = vec_mul(x11_fp64vec2, c_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
vec_xst(y1_fp64vec2, 0, y+(i+2 ));
vec_xst(y2_fp64vec2, 0, y+(i+4 ));
vec_xst(y3_fp64vec2, 0, y+(i+6 ));
vec_xst(y4_fp64vec2, 0, y+(i+8 ));
vec_xst(y5_fp64vec2, 0, y+(i+10));
vec_xst(y6_fp64vec2, 0, y+(i+12));
vec_xst(y7_fp64vec2, 0, y+(i+14));
vec_xst(y8_fp64vec2, 0, y+(i+16));
vec_xst(y9_fp64vec2, 0, y+(i+18));
vec_xst(y10_fp64vec2, 0, y+(i+20));
vec_xst(y11_fp64vec2, 0, y+(i+22));
}
for (; i <= n-8; i += 8)
{
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+2 ));
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
x3_fp64vec2 = vec_xl(0, x+(i+6 ));
y0_fp64vec2 = vec_mul(x0_fp64vec2, c_fp64vec2);
y1_fp64vec2 = vec_mul(x1_fp64vec2, c_fp64vec2);
y2_fp64vec2 = vec_mul(x2_fp64vec2, c_fp64vec2);
y3_fp64vec2 = vec_mul(x3_fp64vec2, c_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
vec_xst(y1_fp64vec2, 0, y+(i+2 ));
vec_xst(y2_fp64vec2, 0, y+(i+4 ));
vec_xst(y3_fp64vec2, 0, y+(i+6 ));
}
for (; i <= n-2; i += 2)
{
x0_fp64vec2 = vec_xl(0, x+(i ));
y0_fp64vec2 = vec_mul(x0_fp64vec2, c_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
}
for (; i < n; i++)
y[i] = c * x[i];
}
//--------------------------------------------------------------------------------------------------
// THDoubleVector_cdiv_VSX:
//--------------------------------------------------------------------------------------------------
static void THDoubleVector_cdiv_VSX(double *z, const double *x, const double *y, const ptrdiff_t n)
{
ptrdiff_t i;
vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
for (i = 0; i <= n-24; i += 24)
{
y0_fp64vec2 = vec_xl(0, y+(i ));
y1_fp64vec2 = vec_xl(0, y+(i+2 ));
y2_fp64vec2 = vec_xl(0, y+(i+4 ));
y3_fp64vec2 = vec_xl(0, y+(i+6 ));
y4_fp64vec2 = vec_xl(0, y+(i+8 ));
y5_fp64vec2 = vec_xl(0, y+(i+10));
y6_fp64vec2 = vec_xl(0, y+(i+12));
y7_fp64vec2 = vec_xl(0, y+(i+14));
y8_fp64vec2 = vec_xl(0, y+(i+16));
y9_fp64vec2 = vec_xl(0, y+(i+18));
y10_fp64vec2 = vec_xl(0, y+(i+20));
y11_fp64vec2 = vec_xl(0, y+(i+22));
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+2 ));
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
x3_fp64vec2 = vec_xl(0, x+(i+6 ));
x4_fp64vec2 = vec_xl(0, x+(i+8 ));
x5_fp64vec2 = vec_xl(0, x+(i+10));
x6_fp64vec2 = vec_xl(0, x+(i+12));
x7_fp64vec2 = vec_xl(0, x+(i+14));
x8_fp64vec2 = vec_xl(0, x+(i+16));
x9_fp64vec2 = vec_xl(0, x+(i+18));
x10_fp64vec2 = vec_xl(0, x+(i+20));
x11_fp64vec2 = vec_xl(0, x+(i+22));
y0_fp64vec2 = vec_div(x0_fp64vec2, y0_fp64vec2);
y1_fp64vec2 = vec_div(x1_fp64vec2, y1_fp64vec2);
y2_fp64vec2 = vec_div(x2_fp64vec2, y2_fp64vec2);
y3_fp64vec2 = vec_div(x3_fp64vec2, y3_fp64vec2);
y4_fp64vec2 = vec_div(x4_fp64vec2, y4_fp64vec2);
y5_fp64vec2 = vec_div(x5_fp64vec2, y5_fp64vec2);
y6_fp64vec2 = vec_div(x6_fp64vec2, y6_fp64vec2);
y7_fp64vec2 = vec_div(x7_fp64vec2, y7_fp64vec2);
y8_fp64vec2 = vec_div(x8_fp64vec2, y8_fp64vec2);
y9_fp64vec2 = vec_div(x9_fp64vec2, y9_fp64vec2);
y10_fp64vec2 = vec_div(x10_fp64vec2, y10_fp64vec2);
y11_fp64vec2 = vec_div(x11_fp64vec2, y11_fp64vec2);
vec_xst(y0_fp64vec2, 0, z+(i ));
vec_xst(y1_fp64vec2, 0, z+(i+2 ));
vec_xst(y2_fp64vec2, 0, z+(i+4 ));
vec_xst(y3_fp64vec2, 0, z+(i+6 ));
vec_xst(y4_fp64vec2, 0, z+(i+8 ));
vec_xst(y5_fp64vec2, 0, z+(i+10));
vec_xst(y6_fp64vec2, 0, z+(i+12));
vec_xst(y7_fp64vec2, 0, z+(i+14));
vec_xst(y8_fp64vec2, 0, z+(i+16));
vec_xst(y9_fp64vec2, 0, z+(i+18));
vec_xst(y10_fp64vec2, 0, z+(i+20));
vec_xst(y11_fp64vec2, 0, z+(i+22));
}
for (; i <= n-8; i += 8)
{
y0_fp64vec2 = vec_xl(0, y+(i ));
y1_fp64vec2 = vec_xl(0, y+(i+2 ));
y2_fp64vec2 = vec_xl(0, y+(i+4 ));
y3_fp64vec2 = vec_xl(0, y+(i+6 ));
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+2 ));
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
x3_fp64vec2 = vec_xl(0, x+(i+6 ));
y0_fp64vec2 = vec_div(x0_fp64vec2, y0_fp64vec2);
y1_fp64vec2 = vec_div(x1_fp64vec2, y1_fp64vec2);
y2_fp64vec2 = vec_div(x2_fp64vec2, y2_fp64vec2);
y3_fp64vec2 = vec_div(x3_fp64vec2, y3_fp64vec2);
vec_xst(y0_fp64vec2, 0, z+(i ));
vec_xst(y1_fp64vec2, 0, z+(i+2 ));
vec_xst(y2_fp64vec2, 0, z+(i+4 ));
vec_xst(y3_fp64vec2, 0, z+(i+6 ));
}
for (; i <= n-2; i += 2)
{
y0_fp64vec2 = vec_xl(0, y+(i ));
x0_fp64vec2 = vec_xl(0, x+(i ));
y0_fp64vec2 = vec_div(x0_fp64vec2, y0_fp64vec2);
vec_xst(y0_fp64vec2, 0, z+(i ));
}
for (; i < n; i++)
z[i] = x[i] / y[i];
}
//--------------------------------------------------------------------------------------------------
// THDoubleVector_divs_VSX:
//--------------------------------------------------------------------------------------------------
static void THDoubleVector_divs_VSX(double *y, const double *x, const double c, const ptrdiff_t n)
{
ptrdiff_t i;
double val[2] = {c, c};
vector double c_fp64vec2 = vec_xl(0, val);
vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
for (i = 0; i <= n-24; i += 24)
{
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+2 ));
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
x3_fp64vec2 = vec_xl(0, x+(i+6 ));
x4_fp64vec2 = vec_xl(0, x+(i+8 ));
x5_fp64vec2 = vec_xl(0, x+(i+10));
x6_fp64vec2 = vec_xl(0, x+(i+12));
x7_fp64vec2 = vec_xl(0, x+(i+14));
x8_fp64vec2 = vec_xl(0, x+(i+16));
x9_fp64vec2 = vec_xl(0, x+(i+18));
x10_fp64vec2 = vec_xl(0, x+(i+20));
x11_fp64vec2 = vec_xl(0, x+(i+22));
y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2);
y1_fp64vec2 = vec_div(x1_fp64vec2, c_fp64vec2);
y2_fp64vec2 = vec_div(x2_fp64vec2, c_fp64vec2);
y3_fp64vec2 = vec_div(x3_fp64vec2, c_fp64vec2);
y4_fp64vec2 = vec_div(x4_fp64vec2, c_fp64vec2);
y5_fp64vec2 = vec_div(x5_fp64vec2, c_fp64vec2);
y6_fp64vec2 = vec_div(x6_fp64vec2, c_fp64vec2);
y7_fp64vec2 = vec_div(x7_fp64vec2, c_fp64vec2);
y8_fp64vec2 = vec_div(x8_fp64vec2, c_fp64vec2);
y9_fp64vec2 = vec_div(x9_fp64vec2, c_fp64vec2);
y10_fp64vec2 = vec_div(x10_fp64vec2, c_fp64vec2);
y11_fp64vec2 = vec_div(x11_fp64vec2, c_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
vec_xst(y1_fp64vec2, 0, y+(i+2 ));
vec_xst(y2_fp64vec2, 0, y+(i+4 ));
vec_xst(y3_fp64vec2, 0, y+(i+6 ));
vec_xst(y4_fp64vec2, 0, y+(i+8 ));
vec_xst(y5_fp64vec2, 0, y+(i+10));
vec_xst(y6_fp64vec2, 0, y+(i+12));
vec_xst(y7_fp64vec2, 0, y+(i+14));
vec_xst(y8_fp64vec2, 0, y+(i+16));
vec_xst(y9_fp64vec2, 0, y+(i+18));
vec_xst(y10_fp64vec2, 0, y+(i+20));
vec_xst(y11_fp64vec2, 0, y+(i+22));
}
for (; i <= n-8; i += 8)
{
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+2 ));
x2_fp64vec2 = vec_xl(0, x+(i+4 ));
x3_fp64vec2 = vec_xl(0, x+(i+6 ));
y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2);
y1_fp64vec2 = vec_div(x1_fp64vec2, c_fp64vec2);
y2_fp64vec2 = vec_div(x2_fp64vec2, c_fp64vec2);
y3_fp64vec2 = vec_div(x3_fp64vec2, c_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
vec_xst(y1_fp64vec2, 0, y+(i+2 ));
vec_xst(y2_fp64vec2, 0, y+(i+4 ));
vec_xst(y3_fp64vec2, 0, y+(i+6 ));
vec_xst(y0_fp64vec2, 0, y+(i ));
vec_xst(y1_fp64vec2, 0, y+(i+2 ));
vec_xst(y2_fp64vec2, 0, y+(i+4 ));
vec_xst(y3_fp64vec2, 0, y+(i+6 ));
}
for (; i <= n-2; i += 2)
{
x0_fp64vec2 = vec_xl(0, x+(i ));
y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
}
for (; i < n; i++)
y[i] = x[i] / c;
}
//--------------------------------------------------------------------------------------------------
// THFloatVector_fill_VSX:
//--------------------------------------------------------------------------------------------------
static void THFloatVector_fill_VSX(float *x, const float c, const ptrdiff_t n)
{
ptrdiff_t i;
float val[4] = {c, c, c, c};
vector float fp32vec4 = vec_xl(0, val);
for (i = 0; i <= n-256; i += 256)
{
vec_xst(fp32vec4, 0, x+(i ));
vec_xst(fp32vec4, 0, x+(i+4 ));
vec_xst(fp32vec4, 0, x+(i+8 ));
vec_xst(fp32vec4, 0, x+(i+12 ));
vec_xst(fp32vec4, 0, x+(i+16 ));
vec_xst(fp32vec4, 0, x+(i+20 ));
vec_xst(fp32vec4, 0, x+(i+24 ));
vec_xst(fp32vec4, 0, x+(i+28 ));
vec_xst(fp32vec4, 0, x+(i+32 ));
vec_xst(fp32vec4, 0, x+(i+36 ));
vec_xst(fp32vec4, 0, x+(i+40 ));
vec_xst(fp32vec4, 0, x+(i+44 ));
vec_xst(fp32vec4, 0, x+(i+48 ));
vec_xst(fp32vec4, 0, x+(i+52 ));
vec_xst(fp32vec4, 0, x+(i+56 ));
vec_xst(fp32vec4, 0, x+(i+60 ));
vec_xst(fp32vec4, 0, x+(i+64 ));
vec_xst(fp32vec4, 0, x+(i+68 ));
vec_xst(fp32vec4, 0, x+(i+72 ));
vec_xst(fp32vec4, 0, x+(i+76 ));
vec_xst(fp32vec4, 0, x+(i+80 ));
vec_xst(fp32vec4, 0, x+(i+84 ));
vec_xst(fp32vec4, 0, x+(i+88 ));
vec_xst(fp32vec4, 0, x+(i+92 ));
vec_xst(fp32vec4, 0, x+(i+96 ));
vec_xst(fp32vec4, 0, x+(i+100));
vec_xst(fp32vec4, 0, x+(i+104));
vec_xst(fp32vec4, 0, x+(i+108));
vec_xst(fp32vec4, 0, x+(i+112));
vec_xst(fp32vec4, 0, x+(i+116));
vec_xst(fp32vec4, 0, x+(i+120));
vec_xst(fp32vec4, 0, x+(i+124));
vec_xst(fp32vec4, 0, x+(i+128));
vec_xst(fp32vec4, 0, x+(i+132));
vec_xst(fp32vec4, 0, x+(i+136));
vec_xst(fp32vec4, 0, x+(i+140));
vec_xst(fp32vec4, 0, x+(i+144));
vec_xst(fp32vec4, 0, x+(i+148));
vec_xst(fp32vec4, 0, x+(i+152));
vec_xst(fp32vec4, 0, x+(i+156));
vec_xst(fp32vec4, 0, x+(i+160));
vec_xst(fp32vec4, 0, x+(i+164));
vec_xst(fp32vec4, 0, x+(i+168));
vec_xst(fp32vec4, 0, x+(i+172));
vec_xst(fp32vec4, 0, x+(i+176));
vec_xst(fp32vec4, 0, x+(i+180));
vec_xst(fp32vec4, 0, x+(i+184));
vec_xst(fp32vec4, 0, x+(i+188));
vec_xst(fp32vec4, 0, x+(i+192));
vec_xst(fp32vec4, 0, x+(i+196));
vec_xst(fp32vec4, 0, x+(i+200));
vec_xst(fp32vec4, 0, x+(i+204));
vec_xst(fp32vec4, 0, x+(i+208));
vec_xst(fp32vec4, 0, x+(i+212));
vec_xst(fp32vec4, 0, x+(i+216));
vec_xst(fp32vec4, 0, x+(i+220));
vec_xst(fp32vec4, 0, x+(i+224));
vec_xst(fp32vec4, 0, x+(i+228));
vec_xst(fp32vec4, 0, x+(i+232));
vec_xst(fp32vec4, 0, x+(i+236));
vec_xst(fp32vec4, 0, x+(i+240));
vec_xst(fp32vec4, 0, x+(i+244));
vec_xst(fp32vec4, 0, x+(i+248));
vec_xst(fp32vec4, 0, x+(i+252));
}
for (; i <= n-32; i += 32)
{
vec_xst(fp32vec4, 0, x+(i ));
vec_xst(fp32vec4, 0, x+(i+4 ));
vec_xst(fp32vec4, 0, x+(i+8 ));
vec_xst(fp32vec4, 0, x+(i+12 ));
vec_xst(fp32vec4, 0, x+(i+16 ));
vec_xst(fp32vec4, 0, x+(i+20 ));
vec_xst(fp32vec4, 0, x+(i+24 ));
vec_xst(fp32vec4, 0, x+(i+28 ));
}
for (; i <= n-4; i += 4)
vec_xst(fp32vec4, 0, x+(i ));
for (; i < n; i++)
x[i] = c;
}
//--------------------------------------------------------------------------------------------------
// THFloatVector_cadd_VSX:
//--------------------------------------------------------------------------------------------------
static void THFloatVector_cadd_VSX(float *z, const float *x, const float *y, const float c, const ptrdiff_t n)
{
ptrdiff_t i;
float val[4] = {c, c, c, c};
vector float c_fp32vec4 = vec_xl(0, val);
vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
for (i = 0; i <= n-48; i += 48)
{
y0_fp32vec4 = vec_xl(0, y+(i ));
y1_fp32vec4 = vec_xl(0, y+(i+4 ));
y2_fp32vec4 = vec_xl(0, y+(i+8 ));
y3_fp32vec4 = vec_xl(0, y+(i+12));
y4_fp32vec4 = vec_xl(0, y+(i+16 ));
y5_fp32vec4 = vec_xl(0, y+(i+20));
y6_fp32vec4 = vec_xl(0, y+(i+24));
y7_fp32vec4 = vec_xl(0, y+(i+28));
y8_fp32vec4 = vec_xl(0, y+(i+32));
y9_fp32vec4 = vec_xl(0, y+(i+36));
y10_fp32vec4 = vec_xl(0, y+(i+40));
y11_fp32vec4 = vec_xl(0, y+(i+44));
x0_fp32vec4 = vec_xl(0, x+(i ));
x1_fp32vec4 = vec_xl(0, x+(i+4 ));
x2_fp32vec4 = vec_xl(0, x+(i+8 ));
x3_fp32vec4 = vec_xl(0, x+(i+12 ));
x4_fp32vec4 = vec_xl(0, x+(i+16 ));
x5_fp32vec4 = vec_xl(0, x+(i+20));
x6_fp32vec4 = vec_xl(0, x+(i+24));
x7_fp32vec4 = vec_xl(0, x+(i+28));
x8_fp32vec4 = vec_xl(0, x+(i+32));
x9_fp32vec4 = vec_xl(0, x+(i+36));
x10_fp32vec4 = vec_xl(0, x+(i+40));
x11_fp32vec4 = vec_xl(0, x+(i+44));
y0_fp32vec4 = vec_madd(y0_fp32vec4, c_fp32vec4, x0_fp32vec4);
y1_fp32vec4 = vec_madd(y1_fp32vec4, c_fp32vec4, x1_fp32vec4);
y2_fp32vec4 = vec_madd(y2_fp32vec4, c_fp32vec4, x2_fp32vec4);
y3_fp32vec4 = vec_madd(y3_fp32vec4, c_fp32vec4, x3_fp32vec4);
y4_fp32vec4 = vec_madd(y4_fp32vec4, c_fp32vec4, x4_fp32vec4);
y5_fp32vec4 = vec_madd(y5_fp32vec4, c_fp32vec4, x5_fp32vec4);
y6_fp32vec4 = vec_madd(y6_fp32vec4, c_fp32vec4, x6_fp32vec4);
y7_fp32vec4 = vec_madd(y7_fp32vec4, c_fp32vec4, x7_fp32vec4);
y8_fp32vec4 = vec_madd(y8_fp32vec4, c_fp32vec4, x8_fp32vec4);
y9_fp32vec4 = vec_madd(y9_fp32vec4, c_fp32vec4, x9_fp32vec4);
y10_fp32vec4 = vec_madd(y10_fp32vec4, c_fp32vec4, x10_fp32vec4);
y11_fp32vec4 = vec_madd(y11_fp32vec4, c_fp32vec4, x11_fp32vec4);
vec_xst(y0_fp32vec4, 0, z+(i ));
vec_xst(y1_fp32vec4, 0, z+(i+4 ));
vec_xst(y2_fp32vec4, 0, z+(i+8 ));
vec_xst(y3_fp32vec4, 0, z+(i+12 ));
vec_xst(y4_fp32vec4, 0, z+(i+16 ));
vec_xst(y5_fp32vec4, 0, z+(i+20));
vec_xst(y6_fp32vec4, 0, z+(i+24));
vec_xst(y7_fp32vec4, 0, z+(i+28));
vec_xst(y8_fp32vec4, 0, z+(i+32));
vec_xst(y9_fp32vec4, 0, z+(i+36));
vec_xst(y10_fp32vec4, 0, z+(i+40));
vec_xst(y11_fp32vec4, 0, z+(i+44));
}
for (; i <= n-16; i += 16)
{
y0_fp32vec4 = vec_xl(0, y+(i ));
y1_fp32vec4 = vec_xl(0, y+(i+4 ));
y2_fp32vec4 = vec_xl(0, y+(i+8 ));
y3_fp32vec4 = vec_xl(0, y+(i+12 ));
x0_fp32vec4 = vec_xl(0, x+(i ));
x1_fp32vec4 = vec_xl(0, x+(i+4 ));
x2_fp32vec4 = vec_xl(0, x+(i+8 ));
x3_fp32vec4 = vec_xl(0, x+(i+12 ));
y0_fp32vec4 = vec_madd(y0_fp32vec4, c_fp32vec4, x0_fp32vec4);
y1_fp32vec4 = vec_madd(y1_fp32vec4, c_fp32vec4, x1_fp32vec4);
y2_fp32vec4 = vec_madd(y2_fp32vec4, c_fp32vec4, x2_fp32vec4);
y3_fp32vec4 = vec_madd(y3_fp32vec4, c_fp32vec4, x3_fp32vec4);
vec_xst(y0_fp32vec4, 0, z+(i ));
vec_xst(y1_fp32vec4, 0, z+(i+4 ));
vec_xst(y2_fp32vec4, 0, z+(i+8 ));
vec_xst(y3_fp32vec4, 0, z+(i+12 ));
}
for (; i <= n-4; i += 4)
{
y0_fp32vec4 = vec_xl(0, y+(i ));
x0_fp32vec4 = vec_xl(0, x+(i ));
y0_fp32vec4 = vec_madd(y0_fp32vec4, c_fp32vec4, x0_fp32vec4);
vec_xst(y0_fp32vec4, 0, z+(i ));
}
for (; i < n; i++)
z[i] = x[i] + c* y[i];
}
//--------------------------------------------------------------------------------------------------
// THFloatVector_adds_VSX:
//--------------------------------------------------------------------------------------------------
static void THFloatVector_adds_VSX(float *y, const float *x, const float c, const ptrdiff_t n)
{
ptrdiff_t i;
float val[4] = {c, c, c, c};
vector float c_fp32vec4 = vec_xl(0, val);
vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
for (i = 0; i <= n-48; i += 48)
{
x0_fp32vec4 = vec_xl(0, x+(i ));
x1_fp32vec4 = vec_xl(0, x+(i+4 ));
x2_fp32vec4 = vec_xl(0, x+(i+8 ));
x3_fp32vec4 = vec_xl(0, x+(i+12));
x4_fp32vec4 = vec_xl(0, x+(i+16));
x5_fp32vec4 = vec_xl(0, x+(i+20));
x6_fp32vec4 = vec_xl(0, x+(i+24));
x7_fp32vec4 = vec_xl(0, x+(i+28));
x8_fp32vec4 = vec_xl(0, x+(i+32));
x9_fp32vec4 = vec_xl(0, x+(i+36));
x10_fp32vec4 = vec_xl(0, x+(i+40));
x11_fp32vec4 = vec_xl(0, x+(i+44));
y0_fp32vec4 = vec_add(x0_fp32vec4, c_fp32vec4);
y1_fp32vec4 = vec_add(x1_fp32vec4, c_fp32vec4);
y2_fp32vec4 = vec_add(x2_fp32vec4, c_fp32vec4);
y3_fp32vec4 = vec_add(x3_fp32vec4, c_fp32vec4);
y4_fp32vec4 = vec_add(x4_fp32vec4, c_fp32vec4);
y5_fp32vec4 = vec_add(x5_fp32vec4, c_fp32vec4);
y6_fp32vec4 = vec_add(x6_fp32vec4, c_fp32vec4);
y7_fp32vec4 = vec_add(x7_fp32vec4, c_fp32vec4);
y8_fp32vec4 = vec_add(x8_fp32vec4, c_fp32vec4);
y9_fp32vec4 = vec_add(x9_fp32vec4, c_fp32vec4);
y10_fp32vec4 = vec_add(x10_fp32vec4, c_fp32vec4);
y11_fp32vec4 = vec_add(x11_fp32vec4, c_fp32vec4);
vec_xst(y0_fp32vec4, 0, y+(i ));
vec_xst(y1_fp32vec4, 0, y+(i+4 ));
vec_xst(y2_fp32vec4, 0, y+(i+8 ));
vec_xst(y3_fp32vec4, 0, y+(i+12));
vec_xst(y4_fp32vec4, 0, y+(i+16));
vec_xst(y5_fp32vec4, 0, y+(i+20));
vec_xst(y6_fp32vec4, 0, y+(i+24));
vec_xst(y7_fp32vec4, 0, y+(i+28));
vec_xst(y8_fp32vec4, 0, y+(i+32));
vec_xst(y9_fp32vec4, 0, y+(i+36));
vec_xst(y10_fp32vec4, 0, y+(i+40));
vec_xst(y11_fp32vec4, 0, y+(i+44));
}
for (; i <= n-16; i += 16)
{
x0_fp32vec4 = vec_xl(0, x+(i ));
x1_fp32vec4 = vec_xl(0, x+(i+4 ));
x2_fp32vec4 = vec_xl(0, x+(i+8 ));
x3_fp32vec4 = vec_xl(0, x+(i+12));
y0_fp32vec4 = vec_add(x0_fp32vec4, c_fp32vec4);
y1_fp32vec4 = vec_add(x1_fp32vec4, c_fp32vec4);
y2_fp32vec4 = vec_add(x2_fp32vec4, c_fp32vec4);
y3_fp32vec4 = vec_add(x3_fp32vec4, c_fp32vec4);
vec_xst(y0_fp32vec4, 0, y+(i ));
vec_xst(y1_fp32vec4, 0, y+(i+4 ));
vec_xst(y2_fp32vec4, 0, y+(i+8 ));
vec_xst(y3_fp32vec4, 0, y+(i+12));
}
for (; i <= n-4; i += 4)
{
x0_fp32vec4 = vec_xl(0, x+(i ));
y0_fp32vec4 = vec_add(x0_fp32vec4, c_fp32vec4);
vec_xst(y0_fp32vec4, 0, y+(i ));
}
for (; i < n; i++)
y[i] = c + x[i];
}
//--------------------------------------------------------------------------------------------------
// THFloatVector_cmul_VSX:
//--------------------------------------------------------------------------------------------------
static void THFloatVector_cmul_VSX(float *z, const float *y, const float *x, const ptrdiff_t n)
{
ptrdiff_t i;
vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
for (i = 0; i <= n-48; i += 48)
{
y0_fp32vec4 = vec_xl(0, y+(i ));
y1_fp32vec4 = vec_xl(0, y+(i+4 ));
y2_fp32vec4 = vec_xl(0, y+(i+8 ));
y3_fp32vec4 = vec_xl(0, y+(i+12 ));
y4_fp32vec4 = vec_xl(0, y+(i+16 ));
y5_fp32vec4 = vec_xl(0, y+(i+20));
y6_fp32vec4 = vec_xl(0, y+(i+24));
y7_fp32vec4 = vec_xl(0, y+(i+28));
y8_fp32vec4 = vec_xl(0, y+(i+32));
y9_fp32vec4 = vec_xl(0, y+(i+36));
y10_fp32vec4 = vec_xl(0, y+(i+40));
y11_fp32vec4 = vec_xl(0, y+(i+44));
x0_fp32vec4 = vec_xl(0, x+(i ));
x1_fp32vec4 = vec_xl(0, x+(i+4 ));
x2_fp32vec4 = vec_xl(0, x+(i+8 ));
x3_fp32vec4 = vec_xl(0, x+(i+12 ));
x4_fp32vec4 = vec_xl(0, x+(i+16 ));
x5_fp32vec4 = vec_xl(0, x+(i+20));
x6_fp32vec4 = vec_xl(0, x+(i+24));
x7_fp32vec4 = vec_xl(0, x+(i+28));
x8_fp32vec4 = vec_xl(0, x+(i+32));
x9_fp32vec4 = vec_xl(0, x+(i+36));
x10_fp32vec4 = vec_xl(0, x+(i+40));
x11_fp32vec4 = vec_xl(0, x+(i+44));
y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
y1_fp32vec4 = vec_mul(y1_fp32vec4, x1_fp32vec4);
y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4);
y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4);
y4_fp32vec4 = vec_mul(y4_fp32vec4, x4_fp32vec4);
y5_fp32vec4 = vec_mul(y5_fp32vec4, x5_fp32vec4);
y6_fp32vec4 = vec_mul(y6_fp32vec4, x6_fp32vec4);
y7_fp32vec4 = vec_mul(y7_fp32vec4, x7_fp32vec4);
y8_fp32vec4 = vec_mul(y8_fp32vec4, x8_fp32vec4);
y9_fp32vec4 = vec_mul(y9_fp32vec4, x9_fp32vec4);
y10_fp32vec4 = vec_mul(y10_fp32vec4, x10_fp32vec4);
y11_fp32vec4 = vec_mul(y11_fp32vec4, x11_fp32vec4);
vec_xst(y0_fp32vec4, 0, z+(i ));
vec_xst(y1_fp32vec4, 0, z+(i+4 ));
vec_xst(y2_fp32vec4, 0, z+(i+8 ));
vec_xst(y3_fp32vec4, 0, z+(i+12 ));
vec_xst(y4_fp32vec4, 0, z+(i+16 ));
vec_xst(y5_fp32vec4, 0, z+(i+20));
vec_xst(y6_fp32vec4, 0, z+(i+24));
vec_xst(y7_fp32vec4, 0, z+(i+28));
vec_xst(y8_fp32vec4, 0, z+(i+32));
vec_xst(y9_fp32vec4, 0, z+(i+36));
vec_xst(y10_fp32vec4, 0, z+(i+40));
vec_xst(y11_fp32vec4, 0, z+(i+44));
}
for (; i <= n-16; i += 16)
{
y0_fp32vec4 = vec_xl(0, y+(i ));
y1_fp32vec4 = vec_xl(0, y+(i+4 ));
y2_fp32vec4 = vec_xl(0, y+(i+8 ));
y3_fp32vec4 = vec_xl(0, y+(i+12 ));
x0_fp32vec4 = vec_xl(0, x+(i ));
x1_fp32vec4 = vec_xl(0, x+(i+4 ));
x2_fp32vec4 = vec_xl(0, x+(i+8 ));
x3_fp32vec4 = vec_xl(0, x+(i+12 ));
y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
y1_fp32vec4 = vec_mul(y1_fp32vec4, x1_fp32vec4);
y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4);
y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4);
vec_xst(y0_fp32vec4, 0, z+(i ));
vec_xst(y1_fp32vec4, 0, z+(i+4 ));
vec_xst(y2_fp32vec4, 0, z+(i+8 ));
vec_xst(y3_fp32vec4, 0, z+(i+12 ));
}
for (; i <= n-4; i += 4)
{
y0_fp32vec4 = vec_xl(0, y+(i ));
x0_fp32vec4 = vec_xl(0, x+(i ));
y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4);
vec_xst(y0_fp32vec4, 0, z+(i ));
}
for (; i < n; i++)
z[i] = y[i] * x[i];
}
//--------------------------------------------------------------------------------------------------
// THFloatVector_muls_VSX:
//--------------------------------------------------------------------------------------------------
static void THFloatVector_muls_VSX(float *y, const float *x, const float c, const ptrdiff_t n)
{
ptrdiff_t i;
float val[4] = {c, c, c, c};
vector float c_fp32vec4 = vec_xl(0, val);
vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
for (i = 0; i <= n-48; i += 48)
{
x0_fp32vec4 = vec_xl(0, x+(i ));
x1_fp32vec4 = vec_xl(0, x+(i+4 ));
x2_fp32vec4 = vec_xl(0, x+(i+8 ));
x3_fp32vec4 = vec_xl(0, x+(i+12));
x4_fp32vec4 = vec_xl(0, x+(i+16));
x5_fp32vec4 = vec_xl(0, x+(i+20));
x6_fp32vec4 = vec_xl(0, x+(i+24));
x7_fp32vec4 = vec_xl(0, x+(i+28));
x8_fp32vec4 = vec_xl(0, x+(i+32));
x9_fp32vec4 = vec_xl(0, x+(i+36));
x10_fp32vec4 = vec_xl(0, x+(i+40));
x11_fp32vec4 = vec_xl(0, x+(i+44));
y0_fp32vec4 = vec_mul(x0_fp32vec4, c_fp32vec4);
y1_fp32vec4 = vec_mul(x1_fp32vec4, c_fp32vec4);
y2_fp32vec4 = vec_mul(x2_fp32vec4, c_fp32vec4);
y3_fp32vec4 = vec_mul(x3_fp32vec4, c_fp32vec4);
y4_fp32vec4 = vec_mul(x4_fp32vec4, c_fp32vec4);
y5_fp32vec4 = vec_mul(x5_fp32vec4, c_fp32vec4);
y6_fp32vec4 = vec_mul(x6_fp32vec4, c_fp32vec4);
y7_fp32vec4 = vec_mul(x7_fp32vec4, c_fp32vec4);
y8_fp32vec4 = vec_mul(x8_fp32vec4, c_fp32vec4);
y9_fp32vec4 = vec_mul(x9_fp32vec4, c_fp32vec4);
y10_fp32vec4 = vec_mul(x10_fp32vec4, c_fp32vec4);
y11_fp32vec4 = vec_mul(x11_fp32vec4, c_fp32vec4);
vec_xst(y0_fp32vec4, 0, y+(i ));
vec_xst(y1_fp32vec4, 0, y+(i+4 ));
vec_xst(y2_fp32vec4, 0, y+(i+8 ));
vec_xst(y3_fp32vec4, 0, y+(i+12));
vec_xst(y4_fp32vec4, 0, y+(i+16));
vec_xst(y5_fp32vec4, 0, y+(i+20));
vec_xst(y6_fp32vec4, 0, y+(i+24));
vec_xst(y7_fp32vec4, 0, y+(i+28));
vec_xst(y8_fp32vec4, 0, y+(i+32));
vec_xst(y9_fp32vec4, 0, y+(i+36));
vec_xst(y10_fp32vec4, 0, y+(i+40));
vec_xst(y11_fp32vec4, 0, y+(i+44));
}
for (; i <= n-16; i += 16)
{
x0_fp32vec4 = vec_xl(0, x+(i ));
x1_fp32vec4 = vec_xl(0, x+(i+4 ));
x2_fp32vec4 = vec_xl(0, x+(i+8 ));
x3_fp32vec4 = vec_xl(0, x+(i+12));
y0_fp32vec4 = vec_mul(x0_fp32vec4, c_fp32vec4);
y1_fp32vec4 = vec_mul(x1_fp32vec4, c_fp32vec4);
y2_fp32vec4 = vec_mul(x2_fp32vec4, c_fp32vec4);
y3_fp32vec4 = vec_mul(x3_fp32vec4, c_fp32vec4);
vec_xst(y0_fp32vec4, 0, y+(i ));
vec_xst(y1_fp32vec4, 0, y+(i+4 ));
vec_xst(y2_fp32vec4, 0, y+(i+8 ));
vec_xst(y3_fp32vec4, 0, y+(i+12));
}
for (; i <= n-4; i += 4)
{
x0_fp32vec4 = vec_xl(0, x+(i ));
y0_fp32vec4 = vec_mul(x0_fp32vec4, c_fp32vec4);
vec_xst(y0_fp32vec4, 0, y+(i ));
}
for (; i < n; i++)
y[i] = c * x[i];
}
//--------------------------------------------------------------------------------------------------
// THFloatVector_cdiv_VSX:
//--------------------------------------------------------------------------------------------------
static void THFloatVector_cdiv_VSX(float *z, const float *x, const float *y, const ptrdiff_t n)
{
ptrdiff_t i;
vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4;
vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4;
vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4;
vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4;
for (i = 0; i <= n-48; i += 48)
{
y0_fp32vec4 = vec_xl(0, y+(i ));
y1_fp32vec4 = vec_xl(0, y+(i+4));
y2_fp32vec4 = vec_xl(0, y+(i+8));
y3_fp32vec4 = vec_xl(0, y+(i+12));
y4_fp32vec4 = vec_xl(0, y+(i+16));
y5_fp32vec4 = vec_xl(0, y+(i+20));
y6_fp32vec4 = vec_xl(0, y+(i+24));
y7_fp32vec4 = vec_xl(0, y+(i+28));
y8_fp32vec4 = vec_xl(0, y+(i+32));
y9_fp32vec4 = vec_xl(0, y+(i+36));
y10_fp32vec4 = vec_xl(0, y+(i+40));
y11_fp32vec4 = vec_xl(0, y+(i+44));
x0_fp32vec4 = vec_xl(0, x+(i ));
x1_fp32vec4 = vec_xl(0, x+(i+4 ));
x2_fp32vec4 = vec_xl(0, x+(i+8 ));
x3_fp32vec4 = vec_xl(0, x+(i+12 ));
x4_fp32vec4 = vec_xl(0, x+(i+16 ));
x5_fp32vec4 = vec_xl(0, x+(i+20));
x6_fp32vec4 = vec_xl(0, x+(i+24));
x7_fp32vec4 = vec_xl(0, x+(i+28));
x8_fp32vec4 = vec_xl(0, x+(i+32));
x9_fp32vec4 = vec_xl(0, x+(i+36));
x10_fp32vec4 = vec_xl(0, x+(i+40));
x11_fp32vec4 = vec_xl(0, x+(i+44));
y0_fp32vec4 = vec_div(x0_fp32vec4, y0_fp32vec4);
y1_fp32vec4 = vec_div(x1_fp32vec4, y1_fp32vec4);
y2_fp32vec4 = vec_div(x2_fp32vec4, y2_fp32vec4);
y3_fp32vec4 = vec_div(x3_fp32vec4, y3_fp32vec4);
y4_fp32vec4 = vec_div(x4_fp32vec4, y4_fp32vec4);
y5_fp32vec4 = vec_div(x5_fp32vec4, y5_fp32vec4);
y6_fp32vec4 = vec_div(x6_fp32vec4, y6_fp32vec4);
y7_fp32vec4 = vec_div(x7_fp32vec4, y7_fp32vec4);
y8_fp32vec4 = vec_div(x8_fp32vec4, y8_fp32vec4);
y9_fp32vec4 = vec_div(x9_fp32vec4, y9_fp32vec4);
y10_fp32vec4 = vec_div(x10_fp32vec4, y10_fp32vec4);
y11_fp32vec4 = vec_div(x11_fp32vec4, y11_fp32vec4);
vec_xst(y0_fp32vec4, 0, z+(i ));
vec_xst(y1_fp32vec4, 0, z+(i+4 ));
vec_xst(y2_fp32vec4, 0, z+(i+8 ));
vec_xst(y3_fp32vec4, 0, z+(i+12 ));
vec_xst(y4_fp32vec4, 0, z+(i+16 ));
vec_xst(y5_fp32vec4, 0, z+(i+20));
vec_xst(y6_fp32vec4, 0, z+(i+24));
vec_xst(y7_fp32vec4, 0, z+(i+28));
vec_xst(y8_fp32vec4, 0, z+(i+32));
vec_xst(y9_fp32vec4, 0, z+(i+36));
vec_xst(y10_fp32vec4, 0, z+(i+40));
vec_xst(y11_fp32vec4, 0, z+(i+44));
}
for (; i <= n-16; i += 16)
{
y0_fp32vec4 = vec_xl(0, y+(i ));
y1_fp32vec4 = vec_xl(0, y+(i+4 ));
y2_fp32vec4 = vec_xl(0, y+(i+8 ));
y3_fp32vec4 = vec_xl(0, y+(i+12 ));
x0_fp32vec4 = vec_xl(0, x+(i ));
x1_fp32vec4 = vec_xl(0, x+(i+4 ));
x2_fp32vec4 = vec_xl(0, x+(i+8 ));
x3_fp32vec4 = vec_xl(0, x+(i+12 ));
y0_fp32vec4 = vec_div(x0_fp32vec4, y0_fp32vec4);
y1_fp32vec4 = vec_div(x1_fp32vec4, y1_fp32vec4);
y2_fp32vec4 = vec_div(x2_fp32vec4, y2_fp32vec4);
y3_fp32vec4 = vec_div(x3_fp32vec4, y3_fp32vec4);
vec_xst(y0_fp32vec4, 0, z+(i ));
vec_xst(y1_fp32vec4, 0, z+(i+4 ));
vec_xst(y2_fp32vec4, 0, z+(i+8 ));
vec_xst(y3_fp32vec4, 0, z+(i+12 ));
}
for (; i <= n-4; i += 4)
{
y0_fp32vec4 = vec_xl(0, y+(i ));
x0_fp32vec4 = vec_xl(0, x+(i ));
y0_fp32vec4 = vec_div(x0_fp32vec4, y0_fp32vec4);
vec_xst(y0_fp32vec4, 0, z+(i ));
}
for (; i < n; i++)
z[i] = x[i] / y[i];
}
//--------------------------------------------------------------------------------------------------
// THFloatVector_divs_VSX:
//--------------------------------------------------------------------------------------------------
static void THFloatVector_divs_VSX(float *y, const float*x, const float c, const ptrdiff_t n)
{
ptrdiff_t i;
float val[4] = {c, c, c, c};
vector float c_fp64vec2 = vec_xl(0, val);
vector float y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2;
vector float y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2;
vector float x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2;
vector float x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2;
for (i = 0; i <= n-48; i += 48)
{
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+4 ));
x2_fp64vec2 = vec_xl(0, x+(i+8 ));
x3_fp64vec2 = vec_xl(0, x+(i+12 ));
x4_fp64vec2 = vec_xl(0, x+(i+16 ));
x5_fp64vec2 = vec_xl(0, x+(i+20));
x6_fp64vec2 = vec_xl(0, x+(i+24));
x7_fp64vec2 = vec_xl(0, x+(i+28));
x8_fp64vec2 = vec_xl(0, x+(i+32));
x9_fp64vec2 = vec_xl(0, x+(i+36));
x10_fp64vec2 = vec_xl(0, x+(i+40));
x11_fp64vec2 = vec_xl(0, x+(i+44));
y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2);
y1_fp64vec2 = vec_div(x1_fp64vec2, c_fp64vec2);
y2_fp64vec2 = vec_div(x2_fp64vec2, c_fp64vec2);
y3_fp64vec2 = vec_div(x3_fp64vec2, c_fp64vec2);
y4_fp64vec2 = vec_div(x4_fp64vec2, c_fp64vec2);
y5_fp64vec2 = vec_div(x5_fp64vec2, c_fp64vec2);
y6_fp64vec2 = vec_div(x6_fp64vec2, c_fp64vec2);
y7_fp64vec2 = vec_div(x7_fp64vec2, c_fp64vec2);
y8_fp64vec2 = vec_div(x8_fp64vec2, c_fp64vec2);
y9_fp64vec2 = vec_div(x9_fp64vec2, c_fp64vec2);
y10_fp64vec2 = vec_div(x10_fp64vec2, c_fp64vec2);
y11_fp64vec2 = vec_div(x11_fp64vec2, c_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
vec_xst(y1_fp64vec2, 0, y+(i+4 ));
vec_xst(y2_fp64vec2, 0, y+(i+8 ));
vec_xst(y3_fp64vec2, 0, y+(i+12 ));
vec_xst(y4_fp64vec2, 0, y+(i+16 ));
vec_xst(y5_fp64vec2, 0, y+(i+20));
vec_xst(y6_fp64vec2, 0, y+(i+24));
vec_xst(y7_fp64vec2, 0, y+(i+28));
vec_xst(y8_fp64vec2, 0, y+(i+32));
vec_xst(y9_fp64vec2, 0, y+(i+36));
vec_xst(y10_fp64vec2, 0, y+(i+40));
vec_xst(y11_fp64vec2, 0, y+(i+44));
}
for (; i <= n-16; i += 16)
{
x0_fp64vec2 = vec_xl(0, x+(i ));
x1_fp64vec2 = vec_xl(0, x+(i+4 ));
x2_fp64vec2 = vec_xl(0, x+(i+8 ));
x3_fp64vec2 = vec_xl(0, x+(i+12 ));
y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2);
y1_fp64vec2 = vec_div(x1_fp64vec2, c_fp64vec2);
y2_fp64vec2 = vec_div(x2_fp64vec2, c_fp64vec2);
y3_fp64vec2 = vec_div(x3_fp64vec2, c_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
vec_xst(y1_fp64vec2, 0, y+(i+4 ));
vec_xst(y2_fp64vec2, 0, y+(i+8 ));
vec_xst(y3_fp64vec2, 0, y+(i+12 ));
vec_xst(y0_fp64vec2, 0, y+(i ));
vec_xst(y1_fp64vec2, 0, y+(i+4 ));
vec_xst(y2_fp64vec2, 0, y+(i+8 ));
vec_xst(y3_fp64vec2, 0, y+(i+16 ));
}
for (; i <= n-4; i += 4)
{
x0_fp64vec2 = vec_xl(0, x+(i ));
y0_fp64vec2 = vec_div(x0_fp64vec2, c_fp64vec2);
vec_xst(y0_fp64vec2, 0, y+(i ));
}
for (; i < n; i++)
y[i] = x[i] / c;
}
//------------------------------------------------
//
// Testing for correctness and performance
//
// If you want to run these tests, compile this
// file with -DRUN_VSX_TESTS on a Power machine,
// and then run the executable that is generated.
//
//------------------------------------------------
//
// Example passing run (from a Power8 machine):
//
// $ gcc VSX.c -O2 -D RUN_VSX_TESTS -o vsxtest
// $ ./vsxtest
//
// TODO
//
//
// Finished runnning all tests. All tests PASSED.
//
//------------------------------------------------
#ifdef RUN_VSX_TESTS
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <assert.h>
#include <math.h>
#define VSX_PERF_NUM_TEST_ELEMENTS 100000000
#define VSX_FUNC_NUM_TEST_ELEMENTS 2507
//--------------------------------------------------------------------------------------------------
// Standard implementations:
//--------------------------------------------------------------------------------------------------
static void standardDouble_fill(double *x, const double c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
x[i] = c;
}
static void standardFloat_fill(float *x, const float c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
x[i] = c;
}
static void standardDouble_cadd(double *z, const double *x, const double *y, const double c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
z[i] = x[i] + c * y[i];
}
static void standardFloat_cadd(float *z, const float *x, const float *y, const float c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
z[i] = x[i] + c * y[i];
}
static void standardDouble_adds(double *y, const double *x, const double c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
y[i] = c + x[i];
}
static void standardFloat_adds(float *y, const float *x, const float c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
y[i] = c + x[i];
}
static void standardDouble_cmul(double *z, const double *x, const double *y, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
z[i] = x[i] * y[i];
}
static void standardFloat_cmul(float *z, const float *x, const float *y, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
z[i] = x[i] * y[i];
}
static void standardDouble_muls(double *y, const double *x, const double c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
y[i] = c * x[i];
}
static void standardFloat_muls(float *y, const float *x, const float c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
y[i] = c * x[i];
}
static void standardDouble_cdiv(double *z, const double *x, const double *y, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
z[i] = x[i] / y[i];
}
static void standardFloat_cdiv(float *z, const float *x, const float *y, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
z[i] = x[i] / y[i];
}
static void standardDouble_divs(double *y, const double *x, const double c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
y[i] = x[i] / c;
}
static void standardFloat_divs(float *y, const float *x, const float c, const ptrdiff_t n)
{
for (ptrdiff_t i = 0; i < n; i++)
y[i] = x[i] / c;
}
double randDouble()
{
return (double)(rand()%100)/(double)(rand()%100) * (rand()%2 ? -1.0 : 1.0);
}
int near(double a, double b)
{
int aClass = fpclassify(a);
int bClass = fpclassify(b);
if(aClass != bClass) // i.e. is it NAN, infinite, or finite...?
return 0;
if(aClass == FP_INFINITE) // if it is infinite, the sign must be the same, i.e. positive infinity is not near negative infinity
return (signbit(a) == signbit(b));
else if(aClass == FP_NORMAL) // if it is a normal number then check the magnitude of the difference between the numbers
return fabs(a - b) < 0.001;
else // if both number are of the same class as each other and are of any other class (i.e. such as NAN), then they are near to each other.
return 1;
}
//--------------------------------------------------------------------------------------------------
// Standard tests:
//--------------------------------------------------------------------------------------------------
void test_THDoubleVector_fill_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
double *x_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *x_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double yVal0 = 17.2;
double yVal1 = 8.2;
double yVal2 = 5.1;
double yVal3 = -0.9;
//-------------------------------------------------
// Performance Test
//-------------------------------------------------
start = clock();
standardDouble_fill(x_standard, yVal0, VSX_PERF_NUM_TEST_ELEMENTS );
standardDouble_fill(x_standard, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1);
standardDouble_fill(x_standard, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2);
standardDouble_fill(x_standard, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
printf("standardDouble_fill() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
THDoubleVector_fill_VSX(x_optimized, yVal0, VSX_PERF_NUM_TEST_ELEMENTS );
THDoubleVector_fill_VSX(x_optimized, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1);
THDoubleVector_fill_VSX(x_optimized, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2);
THDoubleVector_fill_VSX(x_optimized, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
printf("THDoubleVector_fill_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
yVal0 += 1.0;
yVal1 += 1.0;
yVal2 += 1.0;
yVal3 -= 1.0;
standardDouble_fill( x_standard, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS);
THDoubleVector_fill_VSX(x_optimized, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS);
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
assert(x_optimized[i] == yVal0);
standardDouble_fill( x_standard+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2);
THDoubleVector_fill_VSX(x_optimized+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2);
standardDouble_fill( x_standard+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4);
THDoubleVector_fill_VSX(x_optimized+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4);
standardDouble_fill( x_standard+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6);
THDoubleVector_fill_VSX(x_optimized+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6);
standardDouble_fill( x_standard+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
THDoubleVector_fill_VSX(x_optimized+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
standardDouble_fill( x_standard+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
THDoubleVector_fill_VSX(x_optimized+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
assert(x_optimized[i] == x_standard[i]);
printf("All assertions PASSED for THDoubleVector_fill_VSX() test.\n\n");
free(x_standard);
free(x_optimized);
}
void test_THFloatVector_fill_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
float *x_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *x_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float yVal0 = 17.2;
float yVal1 = 8.2;
float yVal2 = 5.1;
float yVal3 = -0.9;
//-------------------------------------------------
// Performance Test
//-------------------------------------------------
start = clock();
standardFloat_fill(x_standard, yVal0, VSX_PERF_NUM_TEST_ELEMENTS );
standardFloat_fill(x_standard, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1);
standardFloat_fill(x_standard, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2);
standardFloat_fill(x_standard, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
printf("standardFloat_fill() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
THFloatVector_fill_VSX(x_optimized, yVal0, VSX_PERF_NUM_TEST_ELEMENTS );
THFloatVector_fill_VSX(x_optimized, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1);
THFloatVector_fill_VSX(x_optimized, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2);
THFloatVector_fill_VSX(x_optimized, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
printf("THFloatVector_fill_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
yVal0 += 1.0;
yVal1 += 1.0;
yVal2 += 1.0;
yVal3 -= 1.0;
standardFloat_fill( x_standard, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS);
THFloatVector_fill_VSX(x_optimized, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS);
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
assert(x_optimized[i] == yVal0);
standardFloat_fill( x_standard+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2);
THFloatVector_fill_VSX(x_optimized+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2);
standardFloat_fill( x_standard+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4);
THFloatVector_fill_VSX(x_optimized+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4);
standardFloat_fill( x_standard+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6);
THFloatVector_fill_VSX(x_optimized+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6);
standardFloat_fill( x_standard+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
THFloatVector_fill_VSX(x_optimized+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
standardFloat_fill( x_standard+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
THFloatVector_fill_VSX(x_optimized+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
assert(x_optimized[i] == x_standard[i]);
printf("All assertions PASSED for THFloatVector_fill_VSX() test.\n\n");
free(x_standard);
free(x_optimized);
}
void test_THDoubleVector_cadd_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double c = randDouble();
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
x[i] = randDouble();
y[i] = randDouble();
}
//-------------------------------------------------
// Performance Test
//-------------------------------------------------
start = clock();
standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS );
standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
standardDouble_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
printf("standardDouble_cadd() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS );
THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
THDoubleVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
printf("THDoubleVector_cadd_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
standardDouble_cadd( z_standard+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
THDoubleVector_cadd_VSX(z_optimized+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
standardDouble_cadd( z_standard+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
THDoubleVector_cadd_VSX(z_optimized+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
standardDouble_cadd( z_standard+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
THDoubleVector_cadd_VSX(z_optimized+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
standardDouble_cadd( z_standard+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
THDoubleVector_cadd_VSX(z_optimized+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
standardDouble_cadd( z_standard+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
THDoubleVector_cadd_VSX(z_optimized+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(z_optimized[i], z_standard[i]))
printf("%d %f %f\n", i, z_optimized[i], z_standard[i]);
assert(near(z_optimized[i], z_standard[i]));
}
printf("All assertions PASSED for THDoubleVector_cadd_VSX() test.\n\n");
free(z_standard);
free(z_optimized);
free(x);
}
void test_THFloatVector_cadd_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float c = (float)randDouble();
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
x[i] = (float)randDouble();
y[i] = (float)randDouble();
}
//-------------------------------------------------
// Performance Test
//-------------------------------------------------
start = clock();
standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS );
standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
standardFloat_cadd(z_standard, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
printf("standardFloat_cadd() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS );
THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
THFloatVector_cadd_VSX(z_optimized, x, y, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
printf("THFloatVector_cadd_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
standardFloat_cadd( z_standard+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
THFloatVector_cadd_VSX(z_optimized+1, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
standardFloat_cadd( z_standard+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
THFloatVector_cadd_VSX(z_optimized+2, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
standardFloat_cadd( z_standard+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
THFloatVector_cadd_VSX(z_optimized+3, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
standardFloat_cadd( z_standard+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
THFloatVector_cadd_VSX(z_optimized+517, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
standardFloat_cadd( z_standard+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
THFloatVector_cadd_VSX(z_optimized+517+r, x, y, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(z_optimized[i], z_standard[i]))
printf("%d %f %f\n", i, z_optimized[i], z_standard[i]);
assert(near(z_optimized[i], z_standard[i]));
}
printf("All assertions PASSED for THFloatVector_cadd_VSX() test.\n\n");
free(z_standard);
free(z_optimized);
free(x);
}
void test_THDoubleVector_adds_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double c = randDouble();
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
x[i] = randDouble();
//-------------------------------------------------
// Performance Test
//-------------------------------------------------
start = clock();
standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
standardDouble_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
printf("standardDouble_adds() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
THDoubleVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
printf("THDoubleVector_adds_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
standardDouble_adds( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
THDoubleVector_adds_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
standardDouble_adds( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
THDoubleVector_adds_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
standardDouble_adds( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
THDoubleVector_adds_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
standardDouble_adds( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
THDoubleVector_adds_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
standardDouble_adds( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
THDoubleVector_adds_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(y_optimized[i], y_standard[i]))
printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
assert(near(y_optimized[i], y_standard[i]));
}
printf("All assertions PASSED for THDoubleVector_adds_VSX() test.\n\n");
free(y_standard);
free(y_optimized);
free(x);
}
void test_THFloatVector_adds_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float c = (float)randDouble();
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
x[i] = (float)randDouble();
//-------------------------------------------------
// Performance Test
//-------------------------------------------------
start = clock();
standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
standardFloat_adds(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
printf("standardFloat_adds() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
THFloatVector_adds_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
printf("THFloatVector_adds_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
standardFloat_adds( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
THFloatVector_adds_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
standardFloat_adds( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
THFloatVector_adds_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
standardFloat_adds( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
THFloatVector_adds_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
standardFloat_adds( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
THFloatVector_adds_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
standardFloat_adds( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
THFloatVector_adds_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(y_optimized[i], y_standard[i]))
printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
assert(near(y_optimized[i], y_standard[i]));
}
printf("All assertions PASSED for THFloatVector_adds_VSX() test.\n\n");
free(y_standard);
free(y_optimized);
free(x);
}
void test_THDoubleVector_cmul_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
x[i] = randDouble();
y[i] = randDouble();
}
//-------------------------------------------------
// Performance Test
//-------------------------------------------------
start = clock();
standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS );
standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1);
standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2);
standardDouble_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
printf("standardDouble_cmul() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS );
THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1);
THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2);
THDoubleVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
printf("THDoubleVector_cmul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
standardDouble_cmul( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2);
THDoubleVector_cmul_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2);
standardDouble_cmul( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4);
THDoubleVector_cmul_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4);
standardDouble_cmul( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6);
THDoubleVector_cmul_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6);
standardDouble_cmul( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
THDoubleVector_cmul_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
standardDouble_cmul( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
THDoubleVector_cmul_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(z_optimized[i], z_standard[i]))
printf("%d %f %f\n", i, z_optimized[i], z_standard[i]);
assert(near(z_optimized[i], z_standard[i]));
}
printf("All assertions PASSED for THDoubleVector_cmul_VSX() test.\n\n");
free(z_standard);
free(z_optimized);
free(x);
}
void test_THFloatVector_cmul_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
x[i] = (float)randDouble();
y[i] = (float)randDouble();
}
//-------------------------------------------------
// Performance Test
//-------------------------------------------------
start = clock();
standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS );
standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1);
standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2);
standardFloat_cmul(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
printf("standardFloat_cmul() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS );
THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1);
THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2);
THFloatVector_cmul_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
printf("THFloatVector_cmul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
standardFloat_cmul( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2);
THFloatVector_cmul_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2);
standardFloat_cmul( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4);
THFloatVector_cmul_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4);
standardFloat_cmul( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6);
THFloatVector_cmul_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6);
standardFloat_cmul( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
THFloatVector_cmul_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
standardFloat_cmul( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
THFloatVector_cmul_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(z_optimized[i], z_standard[i]))
printf("%d %f %f\n", i, z_optimized[i], z_standard[i]);
assert(near(z_optimized[i], z_standard[i]));
}
printf("All assertions PASSED for THFloatVector_cmul_VSX() test.\n\n");
free(z_standard);
free(z_optimized);
free(x);
}
void test_THDoubleVector_muls_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double c = randDouble();
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
x[i] = randDouble();
}
//-------------------------------------------------
// Performance Test
//-------------------------------------------------
start = clock();
standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
standardDouble_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
printf("standardDouble_muls() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
THDoubleVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
printf("THDoubleVector_muls_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
standardDouble_muls( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
THDoubleVector_muls_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
standardDouble_muls( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
THDoubleVector_muls_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
standardDouble_muls( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
THDoubleVector_muls_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
standardDouble_muls( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
THDoubleVector_muls_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
standardDouble_muls( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
THDoubleVector_muls_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(y_optimized[i], y_standard[i]))
printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
assert(near(y_optimized[i], y_standard[i]));
}
printf("All assertions PASSED for THDoubleVector_muls_VSX() test.\n\n");
free(y_standard);
free(y_optimized);
free(x);
}
void test_THFloatVector_muls_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float c = (float)randDouble();
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
x[i] = (float)randDouble();
}
//-------------------------------------------------
// Performance Test
//-------------------------------------------------
start = clock();
standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
standardFloat_muls(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
printf("standardFloat_muls() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
THFloatVector_muls_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
printf("THFloatVector_muls_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
standardFloat_muls( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
THFloatVector_muls_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
standardFloat_muls( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
THFloatVector_muls_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
standardFloat_muls( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
THFloatVector_muls_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
standardFloat_muls( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
THFloatVector_muls_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
standardFloat_muls( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
THFloatVector_muls_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(y_optimized[i], y_standard[i]))
printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
assert(near(y_optimized[i], y_standard[i]));
}
printf("All assertions PASSED for THFloatVector_muls_VSX() test.\n\n");
free(y_standard);
free(y_optimized);
free(x);
}
void test_THDoubleVector_cdiv_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
x[i] = randDouble();
y[i] = randDouble();
}
//-------------------------------------------------
// Performance Test
//-------------------------------------------------
start = clock();
standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS );
standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1);
standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2);
standardDouble_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
printf("standardDouble_cdiv() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS );
THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1);
THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2);
THDoubleVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
printf("THDoubleVector_cdiv_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
standardDouble_cdiv( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2);
THDoubleVector_cdiv_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2);
standardDouble_cdiv( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4);
THDoubleVector_cdiv_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4);
standardDouble_cdiv( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6);
THDoubleVector_cdiv_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6);
standardDouble_cdiv( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
THDoubleVector_cdiv_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
standardDouble_cdiv( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
THDoubleVector_cdiv_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(z_optimized[i], z_standard[i]))
printf("%d %f %f\n", i, z_optimized[i], z_standard[i]);
assert(near(z_optimized[i], z_standard[i]));
}
printf("All assertions PASSED for THDoubleVector_cdiv_VSX() test.\n\n");
free(z_standard);
free(z_optimized);
free(x);
}
void test_THFloatVector_cdiv_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
x[i] = (float)randDouble();
y[i] = (float)randDouble();
}
//-------------------------------------------------
// Performance Test
//-------------------------------------------------
start = clock();
standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS );
standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1);
standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2);
standardFloat_cdiv(z_standard, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
printf("standardFloat_cdiv() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS );
THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-1);
THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-2);
THFloatVector_cdiv_VSX(z_optimized, x, y, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
printf("THFloatVector_cdiv_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
standardFloat_cdiv( z_standard+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2);
THFloatVector_cdiv_VSX(z_optimized+1, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-2);
standardFloat_cdiv( z_standard+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4);
THFloatVector_cdiv_VSX(z_optimized+2, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-4);
standardFloat_cdiv( z_standard+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6);
THFloatVector_cdiv_VSX(z_optimized+3, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-6);
standardFloat_cdiv( z_standard+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
THFloatVector_cdiv_VSX(z_optimized+517, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
standardFloat_cdiv( z_standard+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
THFloatVector_cdiv_VSX(z_optimized+517+r, x, y, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(z_optimized[i], z_standard[i]))
printf("%d %f %f\n", i, z_optimized[i], z_standard[i]);
assert(near(z_optimized[i], z_standard[i]));
}
printf("All assertions PASSED for THFloatVector_cdiv_VSX() test.\n\n");
free(z_standard);
free(z_optimized);
free(x);
}
void test_THDoubleVector_divs_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double));
double c = randDouble();
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
x[i] = randDouble();
}
//-------------------------------------------------
// Performance Test
//-------------------------------------------------
start = clock();
standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
standardDouble_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
printf("standardDouble_divs() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
THDoubleVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
printf("THDoubleVector_divs_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
standardDouble_divs( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
THDoubleVector_divs_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
standardDouble_divs( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
THDoubleVector_divs_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
standardDouble_divs( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
THDoubleVector_divs_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
standardDouble_divs( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
THDoubleVector_divs_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
standardDouble_divs( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
THDoubleVector_divs_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(y_optimized[i], y_standard[i]))
printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
assert(near(y_optimized[i], y_standard[i]));
}
printf("All assertions PASSED for THDoubleVector_divs_VSX() test.\n\n");
free(y_standard);
free(y_optimized);
free(x);
}
void test_THFloatVector_divs_VSX()
{
clock_t start, end;
double elapsedSeconds_optimized, elapsedSeconds_standard;
float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float));
float c = (float)randDouble();
// Initialize randomly
for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++)
{
x[i] = randDouble();
}
//-------------------------------------------------
// Performance Test
//-------------------------------------------------
start = clock();
standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
standardFloat_divs(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC;
printf("standardFloat_divs() test took %.5lf seconds\n", elapsedSeconds_standard);
start = clock();
THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS );
THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1);
THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2);
THFloatVector_divs_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3);
end = clock();
elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC;
printf("THFloatVector_divs_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized);
//-------------------------------------------------
// Correctness Test
//-------------------------------------------------
standardFloat_divs( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
THFloatVector_divs_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2);
standardFloat_divs( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
THFloatVector_divs_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4);
standardFloat_divs( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
THFloatVector_divs_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6);
standardFloat_divs( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
THFloatVector_divs_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029);
int r = rand() % 258;
standardFloat_divs( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
THFloatVector_divs_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100));
for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++)
{
if(!near(y_optimized[i], y_standard[i]))
printf("%d %f %f\n", i, y_optimized[i], y_standard[i]);
assert(near(y_optimized[i], y_standard[i]));
}
printf("All assertions PASSED for THFloatVector_divs_VSX() test.\n\n");
free(y_standard);
free(y_optimized);
free(x);
}
//--------------------------------------------------------------------------------------------------
// Run tests:
//--------------------------------------------------------------------------------------------------
int main()
{
printf("\n");
// First test utility functions
assert(!near(0.1, -0.1));
assert(!near(0.1f, -0.1f));
assert(!near(9, 10));
assert(near(0.1, 0.1000001));
assert(near(0.1f, 0.1000001f));
assert(near(100.764, 100.764));
assert(!near(NAN, 0.0));
assert(!near(-9.5, NAN));
assert(!near(NAN, 100));
assert(!near(-0.0, NAN));
assert(near(NAN, NAN));
assert(near(INFINITY, INFINITY));
assert(near(-INFINITY, -INFINITY));
assert(!near(INFINITY, NAN));
assert(!near(0, INFINITY));
assert(!near(-999.4324, INFINITY));
assert(!near(INFINITY, 982374.1));
assert(!near(-INFINITY, INFINITY));
// Then test each vectorized function
test_THDoubleVector_fill_VSX();
test_THFloatVector_fill_VSX();
test_THDoubleVector_cadd_VSX();
test_THFloatVector_cadd_VSX();
test_THDoubleVector_adds_VSX();
test_THFloatVector_adds_VSX();
test_THDoubleVector_cmul_VSX();
test_THFloatVector_cmul_VSX();
test_THDoubleVector_muls_VSX();
test_THFloatVector_muls_VSX();
test_THDoubleVector_cdiv_VSX();
test_THFloatVector_cdiv_VSX();
test_THDoubleVector_divs_VSX();
test_THFloatVector_divs_VSX();
printf("Finished runnning all tests. All tests PASSED.\n");
return 0;
}
#endif // defined RUN_VSX_TESTS
#endif // defined __PPC64__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment