strokerConvexFillAA (benchmark)
#include <stdint.h>
#include <inttypes.h>
#include <math.h>
#include <malloc.h>
#include <memory.h>
#include <Windows.h>
#include <stdio.h>
#include <xmmintrin.h>
#include <immintrin.h>
#include "iacaMarks.h"
#define INSERT_IACA_MARKERS 0
#define CREATE_VERTEX_BUFFER 0
#define CREATE_INDEX_BUFFER 1
#ifdef _DEBUG
#define NUM_ITERATIONS 1
#else
#if CREATE_VERTEX_BUFFER
#define NUM_ITERATIONS 1000000
#else
#define NUM_ITERATIONS 10000000
#endif
#endif
#define NUM_VERTICES 1024
#define SIMD_INDEX_BUFFER 1
#define TEST_SIMD 3
#define VERIFY_SIMD 1
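// TEST_SIMD selects the routine benchmarked in main(): 0 = scalar strokerConvexFillAA,
// 1/2/3 = strokerConvexFillAA_SIMD/_SIMD2/_SIMD3. VERIFY_SIMD compares the SIMD output
// against the scalar reference after the timed loop. SIMD_INDEX_BUFFER toggles the SIMD
// index-buffer generation inside strokerConvexFillAA_SIMD3.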
// 0: 1.0 / sqrt
// 1: _mm_rsqrt_ss
// 2: Newton/Raphson
#define RSQRT_ALGORITHM 2
// 0: 1.0 / a
// 1: _mm_rcp_ss
// 2: Newton/Raphson
#define RCP_ALGORITHM 1
#define VG_EPSILON 1e-5f
#define PI 3.1415926f
struct Vec2
{
float x, y;
};
static inline float fsign(float a)
{
return a < 0.0f ? -1.0f : 1.0f;
}
static const __m128 xmm_half = _mm_set_ps1(0.5f);
static const __m128 xmm_one = _mm_set_ps1(1.0f);
static const __m128 xmm_three = _mm_set_ps1(3.0f);
static const __m128 oneish = _mm_castsi128_ps(_mm_set1_epi32(0x3f800001));
static const __m128 vec2_perpCCW_xorMask = _mm_castsi128_ps(_mm_set_epi32(0, 0, 0, 0x80000000));
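// 'oneish' is 1.0f plus one ULP (0x3f800001); it is used by the Newton/Raphson rcp refinement.
// vec2_perpCCW_xorMask carries the sign bit in the x lane only, so XORing { y, x, DC, DC } with it
// yields { -y, x, DC, DC }, i.e. the CCW perpendicular of a vec2.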
static inline float rsqrt(float a)
{
#if RSQRT_ALGORITHM == 0
return 1.0f / sqrtf(a);
#elif RSQRT_ALGORITHM == 1
float res;
__m128 rsqrtRes = _mm_rsqrt_ss(_mm_load_ss(&a));
_mm_store_ss(&res, rsqrtRes);
return res;
#elif RSQRT_ALGORITHM == 2
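// One Newton/Raphson step on the rsqrtss estimate: y' = 0.5 * y * (3 - a * y * y)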
const __m128 xmm_a = _mm_load_ss(&a);
const __m128 rsqrtEst = _mm_rsqrt_ss(xmm_a);
const __m128 iter0 = _mm_mul_ss(xmm_a, rsqrtEst);
const __m128 iter1 = _mm_mul_ss(iter0, rsqrtEst);
const __m128 half_rsqrt = _mm_mul_ss(xmm_half, rsqrtEst);
const __m128 three_sub_iter1 = _mm_sub_ss(xmm_three, iter1);
const __m128 result = _mm_mul_ss(half_rsqrt, three_sub_iter1);
float res;
_mm_store_ss(&res, result);
return res;
#endif
}
static inline float rcp(float a)
{
#if RCP_ALGORITHM == 0
return 1.0f / a;
#elif RCP_ALGORITHM == 1
float res;
__m128 rcpRes = _mm_rcp_ss(_mm_load_ss(&a));
_mm_store_ss(&res, rcpRes);
return res;
#elif RCP_ALGORITHM == 2
const __m128 xmm_a = _mm_load_ss(&a);
const __m128 est = _mm_rcp_ss(xmm_a);
// One Newton/Raphson step: result = est * (oneish - a * est) + est ~= est * (2 - a * est)
const __m128 tmp1 = _mm_sub_ss(oneish, _mm_mul_ss(xmm_a, est));
const __m128 result = _mm_add_ss(_mm_mul_ss(tmp1, est), est);
float res;
_mm_store_ss(&res, result);
return res;
#endif
}
inline Vec2 vec2Add(const Vec2& a, const Vec2& b) { return{ a.x + b.x, a.y + b.y }; }
inline Vec2 vec2Sub(const Vec2& a, const Vec2& b) { return{ a.x - b.x, a.y - b.y }; }
inline Vec2 vec2Scale(const Vec2& a, float s) { return{ a.x * s, a.y * s }; }
inline Vec2 vec2PerpCCW(const Vec2& a) { return{ -a.y, a.x }; }
inline Vec2 vec2PerpCW(const Vec2& a) { return{ a.y, -a.x }; }
inline float vec2Cross(const Vec2& a, const Vec2& b) { return a.x * b.y - b.x * a.y; }
inline float vec2Dot(const Vec2& a, const Vec2& b) { return a.x * b.x + a.y * b.y; }
// Direction from a to b
inline Vec2 vec2Dir(const Vec2& a, const Vec2& b)
{
const float dx = b.x - a.x;
const float dy = b.y - a.y;
const float lenSqr = dx * dx + dy * dy;
#if TEST_SIMD && VERIFY_SIMD
const float invLen = lenSqr < VG_EPSILON ? 0.0f : 1.0f / sqrtf(lenSqr);
#else
const float invLen = lenSqr < VG_EPSILON ? 0.0f : rsqrt(lenSqr);
#endif
return{ dx * invLen, dy * invLen };
}
inline Vec2 calcExtrusionVector(const Vec2& d01, const Vec2& d12)
{
// v is the vector from the path point to the outline point, assuming a stroke width of 1.0.
// Equation obtained by solving the intersection of the 2 line segments. d01 and d12 are
// assumed to be normalized.
Vec2 v = vec2PerpCCW(d01);
const float cross = vec2Cross(d12, d01);
if (fabsf(cross) > VG_EPSILON) {
#if TEST_SIMD && VERIFY_SIMD
v = vec2Scale(vec2Sub(d01, d12), 1.0f / cross);
#else
v = vec2Scale(vec2Sub(d01, d12), rcp(cross));
#endif
}
return v;
}
struct Stroker
{
float m_FringeWidth;
uint32_t m_NumVertices;
uint32_t m_NumIndices;
uint32_t m_VertexCapacity;
Vec2* m_PosBuffer;
uint32_t* m_ColorBuffer;
uint16_t* m_IndexBuffer;
uint32_t m_IndexCapacity;
};
static void resetGeometry(Stroker* stroker)
{
stroker->m_NumVertices = 0;
stroker->m_NumIndices = 0;
}
static void reallocVB(Stroker* stroker, uint32_t n)
{
stroker->m_VertexCapacity += n;
stroker->m_PosBuffer = (Vec2*)_aligned_realloc(stroker->m_PosBuffer, sizeof(Vec2) * stroker->m_VertexCapacity, 16);
stroker->m_ColorBuffer = (uint32_t*)_aligned_realloc(stroker->m_ColorBuffer, sizeof(uint32_t) * stroker->m_VertexCapacity, 16);
#if 0
memset(stroker->m_PosBuffer, 0xFF, sizeof(Vec2) * stroker->m_VertexCapacity);
#endif
}
static void expandVB(Stroker* stroker, uint32_t n)
{
if (stroker->m_NumVertices + n > stroker->m_VertexCapacity) {
reallocVB(stroker, n);
}
}
static void reallocIB(Stroker* stroker, uint32_t n)
{
stroker->m_IndexCapacity += n;
stroker->m_IndexBuffer = (uint16_t*)_aligned_realloc(stroker->m_IndexBuffer, sizeof(uint16_t) * stroker->m_IndexCapacity, 16);
}
static void expandIB(Stroker* stroker, uint32_t n)
{
if (stroker->m_NumIndices + n > stroker->m_IndexCapacity) {
reallocIB(stroker, n);
}
}
static void strokerConvexFillAA(Stroker* stroker, const float* vertexList, uint32_t numVertices)
{
const Vec2* vtx = (const Vec2*)vertexList;
const float cross = vec2Cross(vec2Sub(vtx[1], vtx[0]), vec2Sub(vtx[2], vtx[0]));
const float aa = stroker->m_FringeWidth * 0.5f * fsign(cross);
const uint32_t numTris =
(numVertices - 2) + // Triangle fan
(numVertices * 2); // AA fringes
const uint32_t numDrawVertices = numVertices * 2; // original polygon point + AA fringe point.
const uint32_t numDrawIndices = numTris * 3;
resetGeometry(stroker);
#if CREATE_VERTEX_BUFFER
// Vertex buffer
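// Draw vertices are interleaved: source vertex i produces draw vertices 2*i and 2*i+1,
// extruded to p + v*aa and p - v*aa respectively (the two sides of the AA fringe).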
{
expandVB(stroker, numDrawVertices);
Vec2 d01 = vec2Dir(vtx[numVertices - 1], vtx[0]);
Vec2* dstPos = stroker->m_PosBuffer;
for (uint32_t iSegment = 0; iSegment < numVertices; ++iSegment) {
const Vec2& p1 = vtx[iSegment];
const Vec2& p2 = vtx[iSegment == numVertices - 1 ? 0 : iSegment + 1];
const Vec2 d12 = vec2Dir(p1, p2);
const Vec2 v = calcExtrusionVector(d01, d12);
const Vec2 v_aa = vec2Scale(v, aa);
dstPos[0] = vec2Add(p1, v_aa);
dstPos[1] = vec2Sub(p1, v_aa);
dstPos += 2;
d01 = d12;
}
stroker->m_NumVertices += numDrawVertices;
}
#endif // CREATE_VERTEX_BUFFER
#if CREATE_INDEX_BUFFER
// Index buffer
{
expandIB(stroker, numDrawIndices);
uint16_t* dstIndex = stroker->m_IndexBuffer;
// First fringe quad
dstIndex[0] = 0; dstIndex[1] = 1; dstIndex[2] = 3;
dstIndex[3] = 0; dstIndex[4] = 3; dstIndex[5] = 2;
dstIndex += 6;
const uint32_t numFanTris = numVertices - 2;
uint16_t secondTriVertex = 2;
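// Each iteration emits one fan triangle plus the fringe quad of edge (i+1, i+2):
// id0/id1 are the draw vertices of source vertex i+1, id2/id3 those of vertex i+2.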
for (uint32_t i = 0; i < numFanTris; ++i) {
const uint16_t id0 = secondTriVertex;
const uint16_t id1 = secondTriVertex + 1;
const uint16_t id2 = secondTriVertex + 2;
const uint16_t id3 = secondTriVertex + 3;
// Fan triangle
dstIndex[0] = 0;
dstIndex[1] = id0;
dstIndex[2] = id2;
// Fringe quad
dstIndex[3] = id0;
dstIndex[4] = id1;
dstIndex[5] = id3;
dstIndex[6] = id0;
dstIndex[7] = id3;
dstIndex[8] = id2;
dstIndex += 9;
secondTriVertex += 2;
}
// Last fringe quad
const uint16_t lastID = (uint16_t)((numVertices - 1) << 1);
dstIndex[0] = lastID;
dstIndex[1] = lastID + 1;
dstIndex[2] = 1;
dstIndex[3] = lastID;
dstIndex[4] = 1;
dstIndex[5] = 0;
stroker->m_NumIndices += numDrawIndices;
}
#endif // CREATE_INDEX_BUFFER
}
static inline float _mm_vec2_cross(const __m128 a, const __m128 b) | |
{ | |
const __m128 axy_bxy = _mm_movelh_ps(a, b); // { a.x, a.y, b.x, b.y } | |
const __m128 byx_ayx = _mm_shuffle_ps(axy_bxy, axy_bxy, _MM_SHUFFLE(0, 1, 2, 3)); // { b.y, b.x, a.y, a.x } | |
const __m128 axby_aybx = _mm_mul_ps(axy_bxy, byx_ayx); // { a.x * b.y, a.y * b.x, b.x * a.y, b.y * a.x } | |
const __m128 bxay = _mm_shuffle_ps(axby_aybx, axby_aybx, _MM_SHUFFLE(1, 1, 1, 1)); // { a.y * b.x, a.y * b.x, a.y * b.x, a.y * b.x } | |
const __m128 cross = _mm_sub_ss(axby_aybx, bxay); | |
return _mm_cvtss_f32(cross); | |
} | |
static inline __m128 _mm_vec2_dir(const __m128 a, const __m128 b) | |
{ | |
const __m128 dxy = _mm_sub_ps(b, a); // { dx, dy, DC, DC } | |
const __m128 dxySqr = _mm_mul_ps(dxy, dxy); // { dx * dx, dy * dy, DC, DC } | |
const __m128 dySqr = _mm_shuffle_ps(dxySqr, dxySqr, _MM_SHUFFLE(1, 1, 1, 1)); // { dy * dy, dy * dy, dy * dy, dy * dy } | |
const float lenSqr = _mm_cvtss_f32(_mm_add_ss(dxySqr, dySqr)); | |
__m128 dir = _mm_setzero_ps(); | |
if (lenSqr >= VG_EPSILON) { | |
const __m128 invLen = _mm_set_ps1(rsqrt(lenSqr)); | |
dir = _mm_mul_ps(dxy, invLen); | |
} | |
return dir; | |
} | |
static inline __m128 _mm_vec2_rotCCW90(const __m128 a) | |
{ | |
__m128 ayx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)); // { a.y, a.x, DC, DC } | |
return _mm_xor_ps(ayx, vec2_perpCCW_xorMask); // { -a.y, a.x, DC, DC } | |
} | |
static inline __m128 calcExtrusionVector(const __m128 d01, const __m128 d12) | |
{ | |
const float cross = _mm_vec2_cross(d12, d01);
return (fabsf(cross) > VG_EPSILON) ? _mm_mul_ps(_mm_sub_ps(d01, d12), _mm_set_ps1(rcp(cross))) : _mm_vec2_rotCCW90(d01);
} | |
static void strokerConvexFillAA_SIMD(Stroker* stroker, const float* vertexList, uint32_t numVertices) | |
{ | |
const uint32_t lastVertexID = numVertices - 1; | |
const __m128 vtx0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)vertexList); | |
const __m128 vtx1 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + 2)); | |
const __m128 vtx2 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + 4)); | |
const float cross = _mm_vec2_cross(_mm_sub_ps(vtx1, vtx0), _mm_sub_ps(vtx2, vtx0)); | |
const float aa = stroker->m_FringeWidth * 0.5f * fsign(cross); | |
const __m128 xmm_aa = _mm_set_ps1(aa); | |
const uint32_t numTris = | |
(numVertices - 2) + // Triangle fan | |
(numVertices * 2); // AA fringes | |
const uint32_t numDrawVertices = numVertices * 2; // original polygon point + AA fringe point. | |
const uint32_t numDrawIndices = numTris * 3; | |
resetGeometry(stroker); | |
#if CREATE_VERTEX_BUFFER | |
// Vertex buffer | |
{ | |
expandVB(stroker, numDrawVertices); | |
const __m128 vtxLast = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + (lastVertexID << 1))); | |
__m128 d01 = _mm_vec2_dir(vtxLast, vtx0); | |
__m128 p1 = vtx0; | |
const float* srcPos = vertexList + 2; | |
float* dstPos = &stroker->m_PosBuffer->x; | |
const uint32_t numIter = lastVertexID >> 1; | |
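// Two source vertices are processed per iteration; the odd remainder and the closing segment are handled after the loop.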
for (uint32_t iSegment = 0; iSegment < numIter; ++iSegment) { | |
// srcPos alignment unknown | |
const __m128 p23 = _mm_loadu_ps(srcPos); // { p2.x, p2.y, p3.x, p3.y } | |
const __m128 p2 = p23; // { p2.x, p2.y, DC, DC } | |
const __m128 p3 = _mm_movehl_ps(p23, p23); // { p3.x, p3.y, DC, DC } | |
const __m128 d12 = _mm_vec2_dir(p1, p2); // { d12.x, d12.y, DC, DC } | |
const __m128 d23 = _mm_vec2_dir(p2, p3); // { d23.x, d23.y, DC, DC } | |
const __m128 v012 = calcExtrusionVector(d01, d12); // { v012.x, v012.y, DC, DC } | |
const __m128 v123 = calcExtrusionVector(d12, d23); // { v123.x, v123.y, DC, DC } | |
const __m128 v012_123 = _mm_movelh_ps(v012, v123); // { v012.x, v012.y, v123.x, v123.y } | |
const __m128 v012_v123_aa = _mm_mul_ps(v012_123, xmm_aa); // { v012.x * aa, v012.y * aa, v123.x * aa, v123.y * aa } | |
const __m128 p12 = _mm_movelh_ps(p1, p2); // { p1.x, p1.y, p2.x, p2.y } | |
const __m128 posEdge = _mm_add_ps(p12, v012_v123_aa); // { p1.x + v012.x * aa, p1.y + v012.y * aa, p2.x + v123.x * aa, p2.y + v123.y * aa } | |
const __m128 negEdge = _mm_sub_ps(p12, v012_v123_aa); // { p1.x - v012.x * aa, p1.y - v012.y * aa, p2.x - v123.x * aa, p2.y - v123.y * aa } | |
const __m128 packed0 = _mm_shuffle_ps(posEdge, negEdge, _MM_SHUFFLE(1, 0, 1, 0)); // { p1.x + v012.x * aa, p1.y + v012.y * aa, p1.x - v012.x * aa, p1.y - v012.y * aa } | |
const __m128 packed1 = _mm_shuffle_ps(posEdge, negEdge, _MM_SHUFFLE(3, 2, 3, 2)); // { p2.x + v123.x * aa, p2.y + v123.y * aa, p2.x - v123.x * aa, p2.y - v123.y * aa } | |
// Aligned stores because dstPos is 16-byte aligned | |
_mm_store_ps(dstPos, packed0); | |
_mm_store_ps(dstPos + 4, packed1); | |
dstPos += 8; | |
srcPos += 4; | |
d01 = d23; | |
p1 = p3; | |
} | |
const uint32_t rem = (lastVertexID & 1); | |
if (rem) { | |
const __m128 p2 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)srcPos); | |
const __m128 d12 = _mm_vec2_dir(p1, p2); | |
const __m128 v_aa = _mm_mul_ps(calcExtrusionVector(d01, d12), xmm_aa); | |
const __m128 packed = _mm_movelh_ps(_mm_add_ps(p1, v_aa), _mm_sub_ps(p1, v_aa)); | |
_mm_store_ps(dstPos, packed); | |
dstPos += 4; | |
srcPos += 2; | |
d01 = d12; | |
p1 = p2; | |
} | |
// Last segment | |
{ | |
const __m128 v_aa = _mm_mul_ps(calcExtrusionVector(d01, _mm_vec2_dir(p1, vtx0)), xmm_aa); | |
const __m128 packed = _mm_movelh_ps(_mm_add_ps(p1, v_aa), _mm_sub_ps(p1, v_aa)); | |
_mm_storeu_ps(dstPos, packed); | |
} | |
stroker->m_NumVertices += numDrawVertices; | |
} | |
#endif // CREATE_VERTEX_BUFFER | |
#if CREATE_INDEX_BUFFER | |
// Index buffer | |
{ | |
expandIB(stroker, numDrawIndices); | |
uint16_t* dstIndex = stroker->m_IndexBuffer; | |
// First fringe quad | |
dstIndex[0] = 0; dstIndex[1] = 1; dstIndex[2] = 3; | |
dstIndex[3] = 0; dstIndex[4] = 3; dstIndex[5] = 2; | |
dstIndex += 6; | |
const uint32_t numFanTris = numVertices - 2; | |
uint16_t secondTriVertex = 2; | |
for (uint32_t i = 0; i < numFanTris; ++i) { | |
const uint16_t id0 = secondTriVertex; | |
const uint16_t id1 = secondTriVertex + 1; | |
const uint16_t id2 = secondTriVertex + 2; | |
const uint16_t id3 = secondTriVertex + 3; | |
// Fan triangle | |
dstIndex[0] = 0; | |
dstIndex[1] = id0; | |
dstIndex[2] = id2; | |
// Fringe quad | |
dstIndex[3] = id0; | |
dstIndex[4] = id1; | |
dstIndex[5] = id3; | |
dstIndex[6] = id0; | |
dstIndex[7] = id3; | |
dstIndex[8] = id2; | |
dstIndex += 9; | |
secondTriVertex += 2; | |
} | |
// Last fringe quad | |
const uint16_t lastID = (uint16_t)((numVertices - 1) << 1); | |
dstIndex[0] = lastID; | |
dstIndex[1] = lastID + 1; | |
dstIndex[2] = 1; | |
dstIndex[3] = lastID; | |
dstIndex[4] = 1; | |
dstIndex[5] = 0; | |
stroker->m_NumIndices += numDrawIndices; | |
} | |
#endif | |
} | |
static inline __m128 xmm_rsqrt(__m128 a) | |
{ | |
#if RSQRT_ALGORITHM == 0 | |
const __m128 res = _mm_div_ps(xmm_one, _mm_sqrt_ps(a)); | |
#elif RSQRT_ALGORITHM == 1 | |
const __m128 res = _mm_rsqrt_ps(a); | |
#elif RSQRT_ALGORITHM == 2 | |
// Newton/Raphson | |
const __m128 rsqrtEst = _mm_rsqrt_ps(a); | |
const __m128 iter0 = _mm_mul_ps(a, rsqrtEst); | |
const __m128 iter1 = _mm_mul_ps(iter0, rsqrtEst); | |
const __m128 half_rsqrt = _mm_mul_ps(xmm_half, rsqrtEst); | |
const __m128 three_sub_iter1 = _mm_sub_ps(xmm_three, iter1); | |
const __m128 res = _mm_mul_ps(half_rsqrt, three_sub_iter1); | |
#endif | |
return res; | |
} | |
static inline __m128 xmm_rcp(__m128 a) | |
{ | |
#if RCP_ALGORITHM == 0 | |
const __m128 inv_a = _mm_div_ps(xmm_one, a); | |
#elif RCP_ALGORITHM == 1 | |
const __m128 inv_a = _mm_rcp_ps(a); | |
#elif RCP_ALGORITHM == 2
// Newton/Raphson, mirroring the scalar rcp() above: inv_a = est * (oneish - a * est) + est
const __m128 est = _mm_rcp_ps(a);
const __m128 corr = _mm_sub_ps(oneish, _mm_mul_ps(a, est));
const __m128 inv_a = _mm_add_ps(_mm_mul_ps(corr, est), est);
#endif
return inv_a; | |
} | |
static void strokerConvexFillAA_SIMD2(Stroker* stroker, const float* vertexList, uint32_t numVertices) | |
{ | |
const uint32_t lastVertexID = numVertices - 1; | |
const __m128 vtx0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)vertexList); | |
const __m128 vtx1 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + 2)); | |
const __m128 vtx2 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + 4)); | |
const float cross = _mm_vec2_cross(_mm_sub_ps(vtx1, vtx0), _mm_sub_ps(vtx2, vtx0)); | |
const float aa = stroker->m_FringeWidth * 0.5f * fsign(cross); | |
const __m128 xmm_aa = _mm_set_ps1(aa); | |
const uint32_t numTris = | |
(numVertices - 2) + // Triangle fan | |
(numVertices * 2); // AA fringes | |
const uint32_t numDrawVertices = numVertices * 2; // original polygon point + AA fringe point. | |
const uint32_t numDrawIndices = numTris * 3; | |
resetGeometry(stroker); | |
#if CREATE_VERTEX_BUFFER | |
// Vertex buffer | |
{ | |
expandVB(stroker, numDrawVertices); | |
const __m128 vtxLast = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + (lastVertexID << 1))); | |
__m128 d01 = _mm_vec2_dir(vtxLast, vtx0); | |
__m128 p1 = vtx0; | |
const float* srcPos = vertexList + 2; | |
float* dstPos = &stroker->m_PosBuffer->x; | |
const __m128 xmm_epsilon = _mm_set_ps1(VG_EPSILON); | |
const __m128 vec2x2_perpCCW_xorMask = _mm_castsi128_ps(_mm_set_epi32(0, 0x80000000, 0, 0x80000000)); | |
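// Sign mask for lanes 0 and 2: XORing { y0, x0, y1, x1 } with it gives { -y0, x0, -y1, x1 }, i.e. the CCW perpendiculars of two packed vec2s.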
const uint32_t numIter = lastVertexID >> 1; | |
for (uint32_t i = 0; i < numIter; ++i) { | |
#if TEST_SIMD == 2 && INSERT_IACA_MARKERS | |
IACA_VC64_START; | |
#endif | |
// Even if initial srcPos is 16-byte aligned we just skipped 8 bytes so it's not aligned anymore. | |
const __m128 p23 = _mm_loadu_ps(srcPos); // { p2.x, p2.y, p3.x, p3.y } | |
const __m128 p12 = _mm_movelh_ps(p1, p23); // { p1.x, p1.y, p2.x, p2.y } | |
__m128 d12, d23; | |
{ | |
const __m128 d12_23 = _mm_sub_ps(p23, p12); // { p2.x - p1.x, p2.y - p1.y, p3.x - p2.x, p3.y - p2.y } | |
const __m128 d12_23_xy_sqr = _mm_mul_ps(d12_23, d12_23); // { d12.x * d12.x, d12.y * d12.y, d23.x * d23.x, d23.y * d23.y } | |
const __m128 d12_23_yx_sqr = _mm_shuffle_ps(d12_23_xy_sqr, d12_23_xy_sqr, _MM_SHUFFLE(2, 3, 0, 1)); // { d12.y * d12.y, d12.x * d12.x, d23.y * d23.y, d23.x * d23.x } | |
const __m128 len12_23_sqr = _mm_add_ps(d12_23_xy_sqr, d12_23_yx_sqr); // { len12_sqr, len12_sqr, len23_sqr, len23_sqr } | |
const __m128 lenSqr_ge_eps = _mm_cmpge_ps(len12_23_sqr, xmm_epsilon); // { len12_sqr >= eps ? 0xFFFFFFFF : 0, ... } | |
const __m128 invLen12_23 = xmm_rsqrt(len12_23_sqr); | |
const __m128 invLen12_23_masked = _mm_and_ps(invLen12_23, lenSqr_ge_eps); // { len12_sqr >= eps ? rsqrt(len12_sqr) : 0, ... } | |
const __m128 d12_23_norm = _mm_mul_ps(d12_23, invLen12_23_masked); // | |
d12 = _mm_movelh_ps(d12_23_norm, d12_23_norm); | |
d23 = _mm_movehl_ps(d12_23_norm, d12_23_norm); | |
} | |
__m128 v012_123; | |
{ | |
const __m128 d12xy_d01xy = _mm_movelh_ps(d12, d01); // { d12.x, d12.y, d01.x, d01.y } | |
const __m128 d23xy_d12xy = _mm_movelh_ps(d23, d12); // { d23.x, d23.y, d12.x, d12.y } | |
const __m128 d01yx_d12yx = _mm_shuffle_ps(d12xy_d01xy, d12xy_d01xy, _MM_SHUFFLE(0, 1, 2, 3)); // { d01.y, d01.x, d12.y, d12.x } | |
const __m128 d12yx_d23yx = _mm_shuffle_ps(d23xy_d12xy, d23xy_d12xy, _MM_SHUFFLE(0, 1, 2, 3)); // { d12.y, d12.x, d23.y, d23.x } | |
const __m128 d12xd01y_d12yd01x = _mm_mul_ps(d12xy_d01xy, d01yx_d12yx); // { d12.x * d01.y, d12.y * d01.x, d01.x * d12.y, d01.y * d12.x } | |
const __m128 d23xd12y_d23yd12x = _mm_mul_ps(d23xy_d12xy, d12yx_d23yx); // { d23.x * d12.y, d23.y * d12.x, d12.x * d23.y, d12.y * d23.x } | |
const __m128 d12yd01x_d23yd12x = _mm_shuffle_ps(d12xd01y_d12yd01x, d23xd12y_d23yd12x, _MM_SHUFFLE(1, 1, 1, 1)); // { d12.y * d01.x, d12.y * d01.x, d23.y * d12.x, d23.y * d12.x } | |
const __m128 d12xd01y_d23xd12x = _mm_shuffle_ps(d12xd01y_d12yd01x, d23xd12y_d23yd12x, _MM_SHUFFLE(0, 0, 0, 0)); // { d12.x * d01.y, d12.x * d01.y, d23.x * d12.y, d23.x * d12.y }
const __m128 cross012_123 = _mm_sub_ps(d12xd01y_d23xd12x, d12yd01x_d23yd12x); // { cross(d12, d01), cross(d12, d01), cross(d23, d12), cross(d23, d12) } | |
const __m128 inv_cross012_123 = xmm_rcp(cross012_123); | |
const __m128 v012_123_fake = _mm_xor_ps(d01yx_d12yx, vec2x2_perpCCW_xorMask); | |
const __m128 d01xy_d12xy = _mm_shuffle_ps(d12xy_d01xy, d12xy_d01xy, _MM_SHUFFLE(1, 0, 3, 2)); | |
const __m128 d12xy_d23xy = _mm_shuffle_ps(d23xy_d12xy, d23xy_d12xy, _MM_SHUFFLE(1, 0, 3, 2)); | |
const __m128 d012xy_d123xy = _mm_sub_ps(d01xy_d12xy, d12xy_d23xy); | |
const __m128 v012_123_true = _mm_mul_ps(d012xy_d123xy, inv_cross012_123); | |
const __m128 cross_gt_eps = _mm_cmpge_ps(cross012_123, xmm_epsilon); | |
const __m128 v012_123_true_masked = _mm_and_ps(cross_gt_eps, v012_123_true); | |
const __m128 v012_123_fake_masked = _mm_andnot_ps(cross_gt_eps, v012_123_fake); | |
v012_123 = _mm_or_ps(v012_123_true_masked, v012_123_fake_masked); | |
} | |
const __m128 v012_v123_aa = _mm_mul_ps(v012_123, xmm_aa); // { v012.x * aa, v012.y * aa, v123.x * aa, v123.y * aa } | |
const __m128 posEdge = _mm_add_ps(p12, v012_v123_aa); // { p1.x + v012.x * aa, p1.y + v012.y * aa, p2.x + v123.x * aa, p2.y + v123.y * aa } | |
const __m128 negEdge = _mm_sub_ps(p12, v012_v123_aa); // { p1.x - v012.x * aa, p1.y - v012.y * aa, p2.x - v123.x * aa, p2.y - v123.y * aa } | |
const __m128 packed0 = _mm_shuffle_ps(posEdge, negEdge, _MM_SHUFFLE(1, 0, 1, 0)); // { p1.x + v012.x * aa, p1.y + v012.y * aa, p1.x - v012.x * aa, p1.y - v012.y * aa } | |
const __m128 packed1 = _mm_shuffle_ps(posEdge, negEdge, _MM_SHUFFLE(3, 2, 3, 2)); // { p2.x + v123.x * aa, p2.y + v123.y * aa, p2.x - v123.x * aa, p2.y - v123.y * aa } | |
// Aligned stores because dstPos is 16-byte aligned | |
_mm_store_ps(dstPos, packed0); | |
_mm_store_ps(dstPos + 4, packed1); | |
dstPos += 8; | |
srcPos += 4; | |
d01 = d23; | |
p1 = _mm_movehl_ps(p23, p23); | |
} | |
#if TEST_SIMD == 2 && INSERT_IACA_MARKERS | |
IACA_VC64_END; | |
#endif | |
const uint32_t rem = (lastVertexID & 1); | |
if (rem) { | |
const __m128 p2 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)srcPos); | |
const __m128 d12 = _mm_vec2_dir(p1, p2); | |
const __m128 v_aa = _mm_mul_ps(calcExtrusionVector(d01, d12), xmm_aa); | |
const __m128 packed = _mm_movelh_ps(_mm_add_ps(p1, v_aa), _mm_sub_ps(p1, v_aa)); | |
_mm_store_ps(dstPos, packed); | |
dstPos += 4; | |
srcPos += 2; | |
d01 = d12; | |
p1 = p2; | |
} | |
// Last segment | |
{ | |
const __m128 v_aa = _mm_mul_ps(calcExtrusionVector(d01, _mm_vec2_dir(p1, vtx0)), xmm_aa); | |
const __m128 packed = _mm_movelh_ps(_mm_add_ps(p1, v_aa), _mm_sub_ps(p1, v_aa)); | |
_mm_storeu_ps(dstPos, packed); | |
} | |
stroker->m_NumVertices += numDrawVertices; | |
} | |
#endif // CREATE_VERTEX_BUFFER | |
#if CREATE_INDEX_BUFFER | |
// Index buffer | |
{ | |
expandIB(stroker, numDrawIndices); | |
uint16_t* dstIndex = stroker->m_IndexBuffer; | |
// First fringe quad | |
dstIndex[0] = 0; dstIndex[1] = 1; dstIndex[2] = 3; | |
dstIndex[3] = 0; dstIndex[4] = 3; dstIndex[5] = 2; | |
dstIndex += 6; | |
const uint32_t numFanTris = numVertices - 2; | |
uint16_t secondTriVertex = 2; | |
for (uint32_t i = 0; i < numFanTris; ++i) { | |
const uint16_t id0 = secondTriVertex; | |
const uint16_t id1 = secondTriVertex + 1; | |
const uint16_t id2 = secondTriVertex + 2; | |
const uint16_t id3 = secondTriVertex + 3; | |
// Fan triangle | |
dstIndex[0] = 0; | |
dstIndex[1] = id0; | |
dstIndex[2] = id2; | |
// Fringe quad | |
dstIndex[3] = id0; | |
dstIndex[4] = id1; | |
dstIndex[5] = id3; | |
dstIndex[6] = id0; | |
dstIndex[7] = id3; | |
dstIndex[8] = id2; | |
dstIndex += 9; | |
secondTriVertex += 2; | |
} | |
// Last fringe quad | |
const uint16_t lastID = (uint16_t)((numVertices - 1) << 1); | |
dstIndex[0] = lastID; | |
dstIndex[1] = lastID + 1; | |
dstIndex[2] = 1; | |
dstIndex[3] = lastID; | |
dstIndex[4] = 1; | |
dstIndex[5] = 0; | |
stroker->m_NumIndices += numDrawIndices; | |
} | |
#endif | |
} | |
static void strokerConvexFillAA_SIMD3(Stroker* stroker, const float* vertexList, uint32_t numVertices) | |
{ | |
const uint32_t lastVertexID = numVertices - 1; | |
const __m128 vtx0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)vertexList); | |
const __m128 vtx1 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + 2)); | |
const __m128 vtx2 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + 4)); | |
const float cross = _mm_vec2_cross(_mm_sub_ps(vtx1, vtx0), _mm_sub_ps(vtx2, vtx0)); | |
const float aa = stroker->m_FringeWidth * 0.5f * fsign(cross); | |
const __m128 xmm_aa = _mm_set_ps1(aa); | |
const uint32_t numTris = | |
(numVertices - 2) + // Triangle fan | |
(numVertices * 2); // AA fringes | |
const uint32_t numDrawVertices = numVertices * 2; // original polygon point + AA fringe point. | |
const uint32_t numDrawIndices = numTris * 3; | |
resetGeometry(stroker); | |
#if CREATE_VERTEX_BUFFER | |
// Vertex buffer | |
{ | |
expandVB(stroker, numDrawVertices); | |
const __m128 vtxLast = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)(vertexList + (lastVertexID << 1))); | |
__m128 d01 = _mm_vec2_dir(vtxLast, vtx0); | |
__m128 p1 = vtx0; | |
const float* srcPos = vertexList + 2; | |
float* dstPos = &stroker->m_PosBuffer->x; | |
const __m128 xmm_epsilon = _mm_set_ps1(VG_EPSILON); | |
const __m128 vec2x2_perpCCW_xorMask = _mm_castsi128_ps(_mm_set_epi32(0, 0x80000000, 0, 0x80000000)); | |
const uint32_t numIter = lastVertexID >> 2; | |
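// Four source vertices are processed per iteration; the remaining 0-3 vertices and the closing segment are handled after the loop.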
for (uint32_t i = 0; i < numIter; ++i) { | |
// Load 4 points. With p1 from previous loop iteration make up 4 segments | |
const __m128 p23 = _mm_loadu_ps(srcPos); // { p2.x, p2.y, p3.x, p3.y } | |
const __m128 p45 = _mm_loadu_ps(srcPos + 4); // { p4.x, p4.y, p5.x, p5.y } | |
const __m128 p12 = _mm_movelh_ps(p1, p23); // { p1.x, p1.y, p2.x, p2.y } | |
const __m128 p34 = _mm_movelh_ps(_mm_movehl_ps(p23, p23), p45); // { p3.x, p3.y, p4.x, p4.y } | |
// Calculate the direction vector of the 4 segments | |
// NOTE: Tried to calc all 4 rsqrt in 1 call but it ends up being slower. Kept this version for now. | |
const __m128 d12_23_unorm = _mm_sub_ps(p23, p12); | |
const __m128 d34_45_unorm = _mm_sub_ps(p45, p34); | |
const __m128 d12_23_xy_sqr = _mm_mul_ps(d12_23_unorm, d12_23_unorm); | |
const __m128 d34_45_xy_sqr = _mm_mul_ps(d34_45_unorm, d34_45_unorm); | |
const __m128 d12_23_yx_sqr = _mm_shuffle_ps(d12_23_xy_sqr, d12_23_xy_sqr, _MM_SHUFFLE(2, 3, 0, 1)); | |
const __m128 d34_45_yx_sqr = _mm_shuffle_ps(d34_45_xy_sqr, d34_45_xy_sqr, _MM_SHUFFLE(2, 3, 0, 1)); | |
const __m128 len12_23_sqr = _mm_add_ps(d12_23_xy_sqr, d12_23_yx_sqr); | |
const __m128 len34_45_sqr = _mm_add_ps(d34_45_xy_sqr, d34_45_yx_sqr); | |
const __m128 lenSqr123_ge_eps = _mm_cmpge_ps(len12_23_sqr, xmm_epsilon); | |
const __m128 lenSqr345_ge_eps = _mm_cmpge_ps(len34_45_sqr, xmm_epsilon); | |
const __m128 invLen12_23 = xmm_rsqrt(len12_23_sqr); | |
const __m128 invLen34_45 = xmm_rsqrt(len34_45_sqr); | |
const __m128 invLen12_23_masked = _mm_and_ps(invLen12_23, lenSqr123_ge_eps); | |
const __m128 invLen34_45_masked = _mm_and_ps(invLen34_45, lenSqr345_ge_eps); | |
const __m128 d12_23 = _mm_mul_ps(d12_23_unorm, invLen12_23_masked); | |
const __m128 d34_45 = _mm_mul_ps(d34_45_unorm, invLen34_45_masked); | |
// Calculate the 4 extrusion vectors for the 4 points based on the equation
// abs(cross(d12, d01)) > epsilon ? ((d01 - d12) / cross(d12, d01)) : rot90CCW(d01)
const __m128 v012_123_fake = _mm_xor_ps(_mm_shuffle_ps(d01, d12_23, _MM_SHUFFLE(0, 1, 0, 1)), vec2x2_perpCCW_xorMask); | |
const __m128 v234_345_fake = _mm_xor_ps(_mm_shuffle_ps(d12_23, d34_45, _MM_SHUFFLE(0, 1, 2, 3)), vec2x2_perpCCW_xorMask); | |
// cross012 = d12.x * d01.y - d12.y * d01.x | |
// cross123 = d23.x * d12.y - d23.y * d12.x | |
// cross234 = d34.x * d23.y - d34.y * d23.x | |
// cross345 = d45.x * d34.y - d45.y * d34.x | |
const __m128 dxy01_12 = _mm_shuffle_ps(d01, d12_23, _MM_SHUFFLE(1, 0, 1, 0)); | |
const __m128 dxy12_23 = d12_23; | |
const __m128 dxy23_34 = _mm_shuffle_ps(d12_23, d34_45, _MM_SHUFFLE(1, 0, 3, 2)); | |
const __m128 dxy34_45 = d34_45; | |
const __m128 dx01_12_23_34 = _mm_shuffle_ps(dxy01_12, dxy23_34, _MM_SHUFFLE(2, 0, 2, 0)); | |
const __m128 dy01_12_23_34 = _mm_shuffle_ps(dxy01_12, dxy23_34, _MM_SHUFFLE(3, 1, 3, 1)); | |
const __m128 dx12_23_34_45 = _mm_shuffle_ps(dxy12_23, dxy34_45, _MM_SHUFFLE(2, 0, 2, 0)); | |
const __m128 dy12_23_34_45 = _mm_shuffle_ps(dxy12_23, dxy34_45, _MM_SHUFFLE(3, 1, 3, 1)); | |
const __m128 crossx012_123_234_345 = _mm_mul_ps(dx12_23_34_45, dy01_12_23_34); | |
const __m128 crossy012_123_234_345 = _mm_mul_ps(dy12_23_34_45, dx01_12_23_34); | |
const __m128 cross012_123_234_345 = _mm_sub_ps(crossx012_123_234_345, crossy012_123_234_345); | |
const __m128 inv_cross012_123_234_345 = xmm_rcp(cross012_123_234_345); | |
const __m128 cross_gt_eps012_123_234_345 = _mm_cmpge_ps(cross012_123_234_345, xmm_epsilon); | |
const __m128 inv_cross012_123 = _mm_shuffle_ps(inv_cross012_123_234_345, inv_cross012_123_234_345, _MM_SHUFFLE(1, 1, 0, 0)); | |
const __m128 inv_cross234_345 = _mm_shuffle_ps(inv_cross012_123_234_345, inv_cross012_123_234_345, _MM_SHUFFLE(3, 3, 2, 2)); | |
const __m128 cross012_123_gt_eps = _mm_shuffle_ps(cross_gt_eps012_123_234_345, cross_gt_eps012_123_234_345, _MM_SHUFFLE(1, 1, 0, 0)); | |
const __m128 cross234_345_gt_eps = _mm_shuffle_ps(cross_gt_eps012_123_234_345, cross_gt_eps012_123_234_345, _MM_SHUFFLE(3, 3, 2, 2)); | |
const __m128 dxy012_123 = _mm_sub_ps(dxy01_12, dxy12_23); | |
const __m128 dxy234_345 = _mm_sub_ps(dxy23_34, dxy34_45); | |
const __m128 v012_123_true = _mm_mul_ps(dxy012_123, inv_cross012_123); | |
const __m128 v234_345_true = _mm_mul_ps(dxy234_345, inv_cross234_345); | |
const __m128 v012_123_true_masked = _mm_and_ps(cross012_123_gt_eps, v012_123_true); | |
const __m128 v234_345_true_masked = _mm_and_ps(cross234_345_gt_eps, v234_345_true); | |
const __m128 v012_123_fake_masked = _mm_andnot_ps(cross012_123_gt_eps, v012_123_fake); | |
const __m128 v245_345_fake_masked = _mm_andnot_ps(cross234_345_gt_eps, v234_345_fake); | |
const __m128 v012_123 = _mm_or_ps(v012_123_true_masked, v012_123_fake_masked); | |
const __m128 v234_345 = _mm_or_ps(v234_345_true_masked, v245_345_fake_masked); | |
const __m128 v012_v123_aa = _mm_mul_ps(v012_123, xmm_aa); | |
const __m128 v234_v345_aa = _mm_mul_ps(v234_345, xmm_aa); | |
// Calculate the 2 fringe points for each of p1, p2, p3 and p4 | |
const __m128 posEdge12 = _mm_add_ps(p12, v012_v123_aa); | |
const __m128 negEdge12 = _mm_sub_ps(p12, v012_v123_aa); | |
const __m128 posEdge34 = _mm_add_ps(p34, v234_v345_aa); | |
const __m128 negEdge34 = _mm_sub_ps(p34, v234_v345_aa); | |
const __m128 p1_in_out = _mm_shuffle_ps(posEdge12, negEdge12, _MM_SHUFFLE(1, 0, 1, 0)); | |
const __m128 p2_in_out = _mm_shuffle_ps(posEdge12, negEdge12, _MM_SHUFFLE(3, 2, 3, 2)); | |
const __m128 p3_in_out = _mm_shuffle_ps(posEdge34, negEdge34, _MM_SHUFFLE(1, 0, 1, 0)); | |
const __m128 p4_in_out = _mm_shuffle_ps(posEdge34, negEdge34, _MM_SHUFFLE(3, 2, 3, 2)); | |
// Store the fringe points | |
_mm_store_ps(dstPos + 0, p1_in_out); | |
_mm_store_ps(dstPos + 4, p2_in_out); | |
_mm_store_ps(dstPos + 8, p3_in_out); | |
_mm_store_ps(dstPos + 12, p4_in_out); | |
// Move on to the next iteration. | |
d01 = _mm_movehl_ps(d34_45, d34_45); | |
p1 = _mm_movehl_ps(p45, p45); // p1 = p5 | |
srcPos += 8; | |
dstPos += 16; | |
} | |
uint32_t rem = (lastVertexID & 3); | |
if (rem >= 2) { | |
const __m128 p23 = _mm_loadu_ps(srcPos); | |
const __m128 p12 = _mm_movelh_ps(p1, p23); | |
const __m128 d12_23 = _mm_sub_ps(p23, p12); | |
const __m128 d12_23_xy_sqr = _mm_mul_ps(d12_23, d12_23); | |
const __m128 d12_23_yx_sqr = _mm_shuffle_ps(d12_23_xy_sqr, d12_23_xy_sqr, _MM_SHUFFLE(2, 3, 0, 1)); | |
const __m128 len12_23_sqr = _mm_add_ps(d12_23_xy_sqr, d12_23_yx_sqr); | |
const __m128 lenSqr_ge_eps = _mm_cmpge_ps(len12_23_sqr, xmm_epsilon); | |
const __m128 invLen12_23 = xmm_rsqrt(len12_23_sqr); | |
const __m128 invLen12_23_masked = _mm_and_ps(invLen12_23, lenSqr_ge_eps); | |
const __m128 d12_23_norm = _mm_mul_ps(d12_23, invLen12_23_masked); | |
const __m128 d12 = _mm_movelh_ps(d12_23_norm, d12_23_norm); | |
const __m128 d23 = _mm_movehl_ps(d12_23_norm, d12_23_norm); | |
const __m128 d12xy_d01xy = _mm_movelh_ps(d12, d01); | |
const __m128 d23xy_d12xy = _mm_movelh_ps(d23, d12); | |
const __m128 d01yx_d12yx = _mm_shuffle_ps(d12xy_d01xy, d12xy_d01xy, _MM_SHUFFLE(0, 1, 2, 3)); | |
const __m128 d12yx_d23yx = _mm_shuffle_ps(d23xy_d12xy, d23xy_d12xy, _MM_SHUFFLE(0, 1, 2, 3)); | |
const __m128 d12xd01y_d12yd01x = _mm_mul_ps(d12xy_d01xy, d01yx_d12yx); | |
const __m128 d23xd12y_d23yd12x = _mm_mul_ps(d23xy_d12xy, d12yx_d23yx); | |
const __m128 d12yd01x_d23yd12x = _mm_shuffle_ps(d12xd01y_d12yd01x, d23xd12y_d23yd12x, _MM_SHUFFLE(1, 1, 1, 1)); | |
const __m128 d12xd01y_d23xd12x = _mm_shuffle_ps(d12xd01y_d12yd01x, d23xd12y_d23yd12x, _MM_SHUFFLE(0, 0, 0, 0)); | |
const __m128 cross012_123 = _mm_sub_ps(d12xd01y_d23xd12x, d12yd01x_d23yd12x); | |
const __m128 inv_cross012_123 = xmm_rcp(cross012_123); | |
const __m128 v012_123_fake = _mm_xor_ps(d01yx_d12yx, vec2x2_perpCCW_xorMask); | |
const __m128 d01xy_d12xy = _mm_shuffle_ps(d12xy_d01xy, d12xy_d01xy, _MM_SHUFFLE(1, 0, 3, 2)); | |
const __m128 d12xy_d23xy = _mm_shuffle_ps(d23xy_d12xy, d23xy_d12xy, _MM_SHUFFLE(1, 0, 3, 2)); | |
const __m128 d012xy_d123xy = _mm_sub_ps(d01xy_d12xy, d12xy_d23xy); | |
const __m128 v012_123_true = _mm_mul_ps(d012xy_d123xy, inv_cross012_123); | |
const __m128 cross_gt_eps = _mm_cmpge_ps(cross012_123, xmm_epsilon); | |
const __m128 v012_123_true_masked = _mm_and_ps(cross_gt_eps, v012_123_true); | |
const __m128 v012_123_fake_masked = _mm_andnot_ps(cross_gt_eps, v012_123_fake); | |
const __m128 v012_123 = _mm_or_ps(v012_123_true_masked, v012_123_fake_masked); | |
const __m128 v012_v123_aa = _mm_mul_ps(v012_123, xmm_aa); | |
const __m128 posEdge = _mm_add_ps(p12, v012_v123_aa); | |
const __m128 negEdge = _mm_sub_ps(p12, v012_v123_aa); | |
const __m128 packed0 = _mm_shuffle_ps(posEdge, negEdge, _MM_SHUFFLE(1, 0, 1, 0)); | |
const __m128 packed1 = _mm_shuffle_ps(posEdge, negEdge, _MM_SHUFFLE(3, 2, 3, 2)); | |
_mm_store_ps(dstPos, packed0); | |
_mm_store_ps(dstPos + 4, packed1); | |
dstPos += 8; | |
srcPos += 4; | |
d01 = d23; | |
p1 = _mm_movehl_ps(p23, p23); | |
rem -= 2; | |
} | |
if (rem) { | |
const __m128 p2 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)srcPos); | |
const __m128 d12 = _mm_vec2_dir(p1, p2); | |
const __m128 v_aa = _mm_mul_ps(calcExtrusionVector(d01, d12), xmm_aa); | |
const __m128 packed = _mm_movelh_ps(_mm_add_ps(p1, v_aa), _mm_sub_ps(p1, v_aa)); | |
_mm_store_ps(dstPos, packed); | |
dstPos += 4; | |
srcPos += 2; | |
d01 = d12; | |
p1 = p2; | |
} | |
// Last segment | |
{ | |
const __m128 v_aa = _mm_mul_ps(calcExtrusionVector(d01, _mm_vec2_dir(p1, vtx0)), xmm_aa); | |
const __m128 packed = _mm_movelh_ps(_mm_add_ps(p1, v_aa), _mm_sub_ps(p1, v_aa)); | |
_mm_storeu_ps(dstPos, packed); | |
} | |
stroker->m_NumVertices += numDrawVertices; | |
} | |
#endif // CREATE_VERTEX_BUFFER | |
#if CREATE_INDEX_BUFFER | |
// Index buffer | |
{ | |
expandIB(stroker, numDrawIndices); | |
uint16_t* dstIndex = stroker->m_IndexBuffer; | |
// First fringe quad | |
dstIndex[0] = 0; dstIndex[1] = 1; dstIndex[2] = 3; | |
dstIndex[3] = 0; dstIndex[4] = 3; dstIndex[5] = 2; | |
dstIndex += 6; | |
const uint32_t numFanTris = numVertices - 2; | |
#if !SIMD_INDEX_BUFFER | |
uint16_t secondTriVertex = 2; | |
for (uint32_t i = 0; i < numFanTris; ++i) { | |
const uint16_t id0 = secondTriVertex; | |
const uint16_t id1 = secondTriVertex + 1; | |
const uint16_t id2 = secondTriVertex + 2; | |
const uint16_t id3 = secondTriVertex + 3; | |
// Fan triangle | |
dstIndex[0] = 0; | |
dstIndex[1] = id0; | |
dstIndex[2] = id2; | |
// Fringe quad | |
dstIndex[3] = id0; | |
dstIndex[4] = id1; | |
dstIndex[5] = id3; | |
dstIndex[6] = id0; | |
dstIndex[7] = id3; | |
dstIndex[8] = id2; | |
dstIndex += 9; | |
secondTriVertex += 2; | |
} | |
#else | |
#if 0 | |
static const uint16_t delta0[8] = { 0, 2, 0, 1, 3, 0, 3, 2 }; | |
static const uint16_t delta1[8] = { 2, 4, 2, 3, 5, 2, 5, 4 }; | |
static const uint16_t delta2[8] = { 4, 6, 4, 5, 7, 4, 7, 6 }; | |
static const uint16_t delta3[8] = { 6, 8, 6, 7, 9, 6, 9, 8 }; | |
const __m128i xmm_delta0 = _mm_loadu_si128((const __m128i*)delta0); | |
const __m128i xmm_delta1 = _mm_loadu_si128((const __m128i*)delta1); | |
const __m128i xmm_delta2 = _mm_loadu_si128((const __m128i*)delta2); | |
const __m128i xmm_delta3 = _mm_loadu_si128((const __m128i*)delta3); | |
const __m128i xmm_stv_delta = _mm_set1_epi16(8); | |
__m128i xmm_stv = _mm_set1_epi16(2); | |
const uint32_t numIter = numFanTris >> 2; | |
for (uint32_t i = 0; i < numIter; ++i) { | |
#if TEST_SIMD == 3 && INSERT_IACA_MARKERS | |
IACA_VC64_START; | |
#endif | |
// { 0, stv + 0, stv + 2, stv + 0, stv + 1, stv + 3, stv + 0, stv + 3 } | |
// { stv + 2, 0, stv + 2, stv + 4, stv + 2, stv + 3, stv + 5, stv + 2 } | |
// { stv + 5, stv + 4, 0, stv + 4, stv + 6, stv + 4, stv + 5, stv + 7 } | |
// { stv + 4, stv + 7, stv + 6, 0, stv + 6, stv + 8, stv + 6, stv + 7 } | |
// { stv + 9, stv + 6, stv + 9, stv + 8 } | |
const __m128i xmm_id0 = _mm_add_epi16(xmm_stv, xmm_delta0); // { stv + 0, stv + 2, stv + 0, stv + 1, stv + 3, stv + 0, stv + 3, stv + 2 } | |
const __m128i xmm_id1 = _mm_add_epi16(xmm_stv, xmm_delta1); // { stv + 2, stv + 4, stv + 2, stv + 3, stv + 5, stv + 2, stv + 5, stv + 4 } | |
const __m128i xmm_id2 = _mm_add_epi16(xmm_stv, xmm_delta2); // { stv + 4, stv + 6, stv + 4, stv + 5, stv + 7, stv + 4, stv + 7, stv + 6 } | |
const __m128i xmm_id3 = _mm_add_epi16(xmm_stv, xmm_delta3); // { stv + 6, stv + 8, stv + 6, stv + 7, stv + 9, stv + 6, stv + 9, stv + 8 } | |
dstIndex[0] = 0; | |
dstIndex[9] = 0; | |
dstIndex[18] = 0; | |
dstIndex[27] = 0; | |
_mm_storeu_si128((__m128i*)(dstIndex + 1), xmm_id0); | |
_mm_storeu_si128((__m128i*)(dstIndex + 10), xmm_id1); | |
_mm_storeu_si128((__m128i*)(dstIndex + 19), xmm_id2); | |
_mm_storeu_si128((__m128i*)(dstIndex + 28), xmm_id3); | |
dstIndex += 36; | |
xmm_stv = _mm_add_epi16(xmm_stv, xmm_stv_delta); | |
} | |
#if TEST_SIMD == 3 && INSERT_IACA_MARKERS | |
IACA_VC64_END; | |
#endif | |
uint32_t rem = numFanTris & 3; | |
if (rem >= 2) { | |
const __m128i xmm_id0 = _mm_add_epi16(xmm_stv, xmm_delta0); | |
const __m128i xmm_id1 = _mm_add_epi16(xmm_stv, xmm_delta1); | |
dstIndex[0] = 0; | |
dstIndex[9] = 0; | |
_mm_storeu_si128((__m128i*)(dstIndex + 1), xmm_id0); | |
_mm_storeu_si128((__m128i*)(dstIndex + 10), xmm_id1); | |
dstIndex += 18; | |
xmm_stv = _mm_add_epi16(xmm_stv, _mm_set1_epi16(4)); | |
rem -= 2; | |
} | |
if (rem) { | |
const __m128i xmm_id0 = _mm_add_epi16(xmm_stv, xmm_delta0); | |
dstIndex[0] = 0; | |
_mm_storeu_si128((__m128i*)(dstIndex + 1), xmm_id0); | |
dstIndex += 9; | |
} | |
#else | |
__m128i xmm_stv = _mm_set1_epi16(2); | |
{ | |
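// Each fan triangle + fringe quad group is 9 indices; delta0..delta4 pack 4 such groups (36 indices)
// as offsets from secondTriVertex (xmm_stv). The fan-center index 0 is patched in afterwards via _mm_insert_epi16.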
static const uint16_t delta0[8] = { 0, 0, 2, 0, 1, 3, 0, 3 }; | |
static const uint16_t delta1[8] = { 2, 0, 2, 4, 2, 3, 5, 2 }; | |
static const uint16_t delta2[8] = { 5, 4, 0, 4, 6, 4, 5, 7 }; | |
static const uint16_t delta3[8] = { 4, 7, 6, 0, 6, 8, 6, 7 }; | |
static const uint16_t delta4[8] = { 9, 6, 9, 8, 0, 0, 0, 0 }; | |
const __m128i xmm_delta0 = _mm_loadu_si128((const __m128i*)delta0); | |
const __m128i xmm_delta1 = _mm_loadu_si128((const __m128i*)delta1); | |
const __m128i xmm_delta2 = _mm_loadu_si128((const __m128i*)delta2); | |
const __m128i xmm_delta3 = _mm_loadu_si128((const __m128i*)delta3); | |
const __m128i xmm_delta4 = _mm_loadu_si128((const __m128i*)delta4); | |
const __m128i xmm_stv_delta = _mm_set1_epi16(8); | |
const uint32_t numIter = numFanTris >> 2; | |
for (uint32_t i = 0; i < numIter; ++i) { | |
#if TEST_SIMD == 3 && INSERT_IACA_MARKERS | |
IACA_VC64_START; | |
#endif | |
// { 0, stv + 0, stv + 2, stv + 0, stv + 1, stv + 3, stv + 0, stv + 3 } | |
// { stv + 2, 0, stv + 2, stv + 4, stv + 2, stv + 3, stv + 5, stv + 2 } | |
// { stv + 5, stv + 4, 0, stv + 4, stv + 6, stv + 4, stv + 5, stv + 7 } | |
// { stv + 4, stv + 7, stv + 6, 0, stv + 6, stv + 8, stv + 6, stv + 7 } | |
// { stv + 9, stv + 6, stv + 9, stv + 8 } | |
const __m128i xmm_id0 = _mm_add_epi16(xmm_stv, xmm_delta0); | |
const __m128i xmm_id1 = _mm_add_epi16(xmm_stv, xmm_delta1); | |
const __m128i xmm_id2 = _mm_add_epi16(xmm_stv, xmm_delta2); | |
const __m128i xmm_id3 = _mm_add_epi16(xmm_stv, xmm_delta3); | |
const __m128i xmm_id4 = _mm_add_epi16(xmm_stv, xmm_delta4); | |
_mm_storeu_si128((__m128i*)(dstIndex + 0), _mm_insert_epi16(xmm_id0, 0, 0)); | |
_mm_storeu_si128((__m128i*)(dstIndex + 8), _mm_insert_epi16(xmm_id1, 0, 1)); | |
_mm_storeu_si128((__m128i*)(dstIndex + 16), _mm_insert_epi16(xmm_id2, 0, 2)); | |
_mm_storeu_si128((__m128i*)(dstIndex + 24), _mm_insert_epi16(xmm_id3, 0, 3)); | |
_mm_storel_epi64((__m128i*)(dstIndex + 32), xmm_id4); | |
dstIndex += 36; | |
xmm_stv = _mm_add_epi16(xmm_stv, xmm_stv_delta); | |
} | |
#if TEST_SIMD == 3 && INSERT_IACA_MARKERS | |
IACA_VC64_END; | |
#endif | |
} | |
{ | |
static const uint16_t delta0[8] = { 0, 2, 0, 1, 3, 0, 3, 2 }; | |
static const uint16_t delta1[8] = { 2, 4, 2, 3, 5, 2, 5, 4 }; | |
const __m128i xmm_delta0 = _mm_loadu_si128((const __m128i*)delta0); | |
const __m128i xmm_delta1 = _mm_loadu_si128((const __m128i*)delta1); | |
uint32_t rem = numFanTris & 3; | |
if (rem >= 2) { | |
const __m128i xmm_id0 = _mm_add_epi16(xmm_stv, xmm_delta0); | |
const __m128i xmm_id1 = _mm_add_epi16(xmm_stv, xmm_delta1); | |
dstIndex[0] = 0; | |
_mm_storeu_si128((__m128i*)(dstIndex + 1), xmm_id0); | |
dstIndex[9] = 0; | |
_mm_storeu_si128((__m128i*)(dstIndex + 10), xmm_id1); | |
dstIndex += 18; | |
xmm_stv = _mm_add_epi16(xmm_stv, _mm_set1_epi16(4)); | |
rem -= 2; | |
} | |
if (rem) { | |
const __m128i xmm_id0 = _mm_add_epi16(xmm_stv, xmm_delta0); | |
dstIndex[0] = 0; | |
_mm_storeu_si128((__m128i*)(dstIndex + 1), xmm_id0); | |
dstIndex += 9; | |
} | |
} | |
#endif // 0 | |
#endif // SIMD_INDEX_BUFFER | |
// Last fringe quad | |
const uint16_t lastID = (uint16_t)((numVertices - 1) << 1); | |
dstIndex[0] = lastID; | |
dstIndex[1] = lastID + 1; | |
dstIndex[2] = 1; | |
dstIndex[3] = lastID; | |
dstIndex[4] = 1; | |
dstIndex[5] = 0; | |
stroker->m_NumIndices += numDrawIndices; | |
} | |
#endif | |
} | |
static void generatePath(float* vtx, uint32_t numVertices)
{
// Circle
const float cx = 0.0f;
const float cy = 0.0f;
const float r = 100.0f;
vtx[0] = cx + r;
vtx[1] = cy;
vtx += 2;
const float dtheta = -(2.0f * PI) / (float)numVertices;
const float cos_dtheta = cosf(dtheta);
const float sin_dtheta = sinf(dtheta);
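// Each vertex is rotated from the previous one using the angle-addition identities, avoiding a sinf/cosf call per vertex.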
float ca = 1.0f;
float sa = 0.0f;
for (uint32_t i = 1; i < numVertices; ++i) {
const float nextSin = sin_dtheta * ca + cos_dtheta * sa;
const float nextCos = cos_dtheta * ca - sin_dtheta * sa;
ca = nextCos;
sa = nextSin;
vtx[0] = cx + r * ca;
vtx[1] = cy + r * sa;
vtx += 2;
}
}
static inline int64_t getHPCounter()
{
LARGE_INTEGER li;
QueryPerformanceCounter(&li);
int64_t i64 = li.QuadPart;
return i64;
}
static inline int64_t getHPFrequency()
{
LARGE_INTEGER li;
QueryPerformanceFrequency(&li);
return li.QuadPart;
}
int main()
{
SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS);
SetThreadAffinityMask(GetCurrentThread(), 0x00000001);
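// Raise process priority and pin the thread to a single core to reduce timing noise.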
float* vertexList = (float*)_aligned_malloc(sizeof(float) * 2 * NUM_VERTICES, 16);
generatePath(vertexList, NUM_VERTICES);
Stroker stroker;
memset(&stroker, 0, sizeof(Stroker));
stroker.m_FringeWidth = 1.0f;
#if TEST_SIMD == 1
strokerConvexFillAA_SIMD(&stroker, vertexList, NUM_VERTICES);
#elif TEST_SIMD == 2
strokerConvexFillAA_SIMD2(&stroker, vertexList, NUM_VERTICES);
#elif TEST_SIMD == 3
strokerConvexFillAA_SIMD3(&stroker, vertexList, NUM_VERTICES);
#else
strokerConvexFillAA(&stroker, vertexList, NUM_VERTICES);
#endif
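// The untimed call above grows the stroker's vertex/index buffers once, so the timed loop below does not include the initial _aligned_realloc calls.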
int64_t start = getHPCounter();
for (uint32_t i = 0; i < NUM_ITERATIONS; ++i) {
#if TEST_SIMD == 1
strokerConvexFillAA_SIMD(&stroker, vertexList, NUM_VERTICES);
#elif TEST_SIMD == 2
strokerConvexFillAA_SIMD2(&stroker, vertexList, NUM_VERTICES);
#elif TEST_SIMD == 3
strokerConvexFillAA_SIMD3(&stroker, vertexList, NUM_VERTICES);
#else
strokerConvexFillAA(&stroker, vertexList, NUM_VERTICES);
#endif
}
int64_t elapsed = getHPCounter() - start;
printf("Elapsed (raw): %" PRId64 "\n", elapsed);
const int64_t freq = getHPFrequency();
double elapsed_msec = 1000.0 * (double)elapsed / (double)freq;
printf("Elapsed time: %f msec (%f usec / call)\n", elapsed_msec, elapsed_msec * 1000.0 / (double)NUM_ITERATIONS);
#if CREATE_VERTEX_BUFFER && TEST_SIMD && VERIFY_SIMD
{
printf("Verifying SIMD vertex buffer...\n");
Stroker strokerRef;
memset(&strokerRef, 0, sizeof(Stroker));
strokerRef.m_FringeWidth = 1.0f;
strokerConvexFillAA(&strokerRef, vertexList, NUM_VERTICES);
double maxError = -1.0;
const float* simdVtx = &stroker.m_PosBuffer->x;
const float* refVtx = &strokerRef.m_PosBuffer->x;
for (uint32_t i = 0; i < stroker.m_NumVertices; ++i) {
const double dx = (double)simdVtx[0] - (double)refVtx[0];
const double dy = (double)simdVtx[1] - (double)refVtx[1];
const double len = dx * dx + dy * dy;
if (len > maxError) {
maxError = len;
}
simdVtx += 2;
refVtx += 2;
}
printf("- Max error: %g\n", maxError);
}
#endif
#if CREATE_INDEX_BUFFER && TEST_SIMD && VERIFY_SIMD
{
printf("Verifying SIMD index buffer...\n");
Stroker strokerRef;
memset(&strokerRef, 0, sizeof(Stroker));
strokerRef.m_FringeWidth = 1.0f;
strokerConvexFillAA(&strokerRef, vertexList, NUM_VERTICES);
double maxError = 0.0;
const uint16_t* simdID = stroker.m_IndexBuffer;
const uint16_t* refID = strokerRef.m_IndexBuffer;
bool err = false;
for (uint32_t i = 0; i < stroker.m_NumIndices; ++i) {
if (simdID[i] != refID[i]) {
printf("Index %u is wrong!\n", i);
err = true;
break;
}
}
if (!err) {
printf("SIMD index buffer is correct\n");
}
}
#endif
return 0;
}