Created
January 4, 2013 22:20
-
-
Save BenjaminPoulain/4457933 to your computer and use it in GitHub Desktop.
SSE2 TransformationMatrix
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/Source/WebCore/ChangeLog b/Source/WebCore/ChangeLog | |
index 48379ba..27872ed 100644 | |
--- a/Source/WebCore/ChangeLog | |
+++ b/Source/WebCore/ChangeLog | |
@@ -1,3 +1,31 @@ | |
+2013-01-04 Benjamin Poulain <[email protected]> | |
+ | |
+ Optimize TransformationMatrix::multiply() for x86_64 | |
+ https://bugs.webkit.org/show_bug.cgi?id=105719 | |
+ | |
+ Reviewed by NOBODY (OOPS!). | |
+ | |
+ On x86_64, we have access to 16 XMM registers. Together, they can hold 32 double values. | |
+ We can use that in two ways to optimize matrix multiplications: | |
+ -Keep the source matrix completely in registers. Write the result directly in | |
+ the source matrix's memory. This avoids the memcpy at the end of the multiplication | |
+ and various memory operations. | |
+ -Use SIMD with SSE to perform 2 operations at a time. | |
+ | |
+ The parameters from the second matrix are loaded one by one into XMM registers. | |
+ Loading them with SSE and then shuffling the values performs worse than loading | |
+ them one by one. | |
+ | |
+ This is only enabled on 64-bit, as 32-bit x86 only has access to 8 XMM registers and | |
+ the function would need to be written differently. | |
+ | |
+ On an i5, TransformationMatrix::multiply() performs about 3 times faster with the change. | |
+ | |
+ * platform/graphics/transforms/TransformationMatrix.cpp: | |
+ (WebCore::TransformationMatrix::multiply): | |
+ * platform/graphics/transforms/TransformationMatrix.h: | |
+ (TransformationMatrix): Fix an incorrect comment. Unify the comment with the cpp file. | |
+ | |
2013-01-04 Zoltan Horvath <[email protected]> | |
[CSS Regions] @region rules inside media queries are ignored | |
diff --git a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp | |
index adcb48b..bfbc355 100644 | |
--- a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp | |
+++ b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp | |
@@ -36,6 +36,10 @@ | |
#include <wtf/Assertions.h> | |
#include <wtf/MathExtras.h> | |
+#if CPU(X86_64) | |
+#include <emmintrin.h> | |
+#endif | |
+ | |
using namespace std; | |
namespace WebCore { | |
@@ -968,9 +972,7 @@ TransformationMatrix TransformationMatrix::rectToRect(const FloatRect& from, con | |
to.y() - from.y()); | |
} | |
-// | |
-// *this = mat * *this | |
-// | |
+// this = mat * this. | |
TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix& mat) | |
{ | |
#if CPU(APPLE_ARMV7S) | |
@@ -1115,6 +1117,129 @@ TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix& | |
} | |
#undef MATRIX_MULTIPLY_ONE_LINE | |
+#elif defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2) | |
+ // x86_64 has 16 XMM registers which is enough to do the multiplication fully in registers. | |
+ __m128d matrixBlockA = _mm_load_pd(&(m_matrix[0][0])); | |
+ __m128d matrixBlockC = _mm_load_pd(&(m_matrix[1][0])); | |
+ __m128d matrixBlockE = _mm_load_pd(&(m_matrix[2][0])); | |
+ __m128d matrixBlockG = _mm_load_pd(&(m_matrix[3][0])); | |
+ | |
+ // First row. | |
+ __m128d otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[0][0]); | |
+ __m128d otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[0][1]); | |
+ __m128d otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[0][2]); | |
+ __m128d otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[0][3]); | |
+ | |
+ // output00 and output01. | |
+ __m128d accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam); | |
+ __m128d temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam); | |
+ __m128d temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam); | |
+ __m128d temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam); | |
+ | |
+ __m128d matrixBlockB = _mm_load_pd(&(m_matrix[0][2])); | |
+ __m128d matrixBlockD = _mm_load_pd(&(m_matrix[1][2])); | |
+ __m128d matrixBlockF = _mm_load_pd(&(m_matrix[2][2])); | |
+ __m128d matrixBlockH = _mm_load_pd(&(m_matrix[3][2])); | |
+ | |
+ accumulator = _mm_add_pd(accumulator, temp1); | |
+ accumulator = _mm_add_pd(accumulator, temp2); | |
+ accumulator = _mm_add_pd(accumulator, temp3); | |
+ _mm_store_pd(&m_matrix[0][0], accumulator); | |
+ | |
+ // output02 and output03. | |
+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam); | |
+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam); | |
+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam); | |
+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam); | |
+ | |
+ accumulator = _mm_add_pd(accumulator, temp1); | |
+ accumulator = _mm_add_pd(accumulator, temp2); | |
+ accumulator = _mm_add_pd(accumulator, temp3); | |
+ _mm_store_pd(&m_matrix[0][2], accumulator); | |
+ | |
+ // Second row. | |
+ otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[1][0]); | |
+ otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[1][1]); | |
+ otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[1][2]); | |
+ otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[1][3]); | |
+ | |
+ // output10 and output11. | |
+ accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam); | |
+ temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam); | |
+ temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam); | |
+ temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam); | |
+ | |
+ accumulator = _mm_add_pd(accumulator, temp1); | |
+ accumulator = _mm_add_pd(accumulator, temp2); | |
+ accumulator = _mm_add_pd(accumulator, temp3); | |
+ _mm_store_pd(&m_matrix[1][0], accumulator); | |
+ | |
+ // output12 and output13. | |
+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam); | |
+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam); | |
+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam); | |
+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam); | |
+ | |
+ accumulator = _mm_add_pd(accumulator, temp1); | |
+ accumulator = _mm_add_pd(accumulator, temp2); | |
+ accumulator = _mm_add_pd(accumulator, temp3); | |
+ _mm_store_pd(&m_matrix[1][2], accumulator); | |
+ | |
+ // Third row. | |
+ otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[2][0]); | |
+ otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[2][1]); | |
+ otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[2][2]); | |
+ otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[2][3]); | |
+ | |
+ // output20 and output21. | |
+ accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam); | |
+ temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam); | |
+ temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam); | |
+ temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam); | |
+ | |
+ accumulator = _mm_add_pd(accumulator, temp1); | |
+ accumulator = _mm_add_pd(accumulator, temp2); | |
+ accumulator = _mm_add_pd(accumulator, temp3); | |
+ _mm_store_pd(&m_matrix[2][0], accumulator); | |
+ | |
+ // output22 and output23. | |
+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam); | |
+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam); | |
+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam); | |
+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam); | |
+ | |
+ accumulator = _mm_add_pd(accumulator, temp1); | |
+ accumulator = _mm_add_pd(accumulator, temp2); | |
+ accumulator = _mm_add_pd(accumulator, temp3); | |
+ _mm_store_pd(&m_matrix[2][2], accumulator); | |
+ | |
+ // Fourth row. | |
+ otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[3][0]); | |
+ otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[3][1]); | |
+ otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[3][2]); | |
+ otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[3][3]); | |
+ | |
+ // output30 and output31. | |
+ accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam); | |
+ temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam); | |
+ temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam); | |
+ temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam); | |
+ | |
+ accumulator = _mm_add_pd(accumulator, temp1); | |
+ accumulator = _mm_add_pd(accumulator, temp2); | |
+ accumulator = _mm_add_pd(accumulator, temp3); | |
+ _mm_store_pd(&m_matrix[3][0], accumulator); | |
+ | |
+ // output32 and output33. | |
+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam); | |
+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam); | |
+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam); | |
+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam); | |
+ | |
+ accumulator = _mm_add_pd(accumulator, temp1); | |
+ accumulator = _mm_add_pd(accumulator, temp2); | |
+ accumulator = _mm_add_pd(accumulator, temp3); | |
+ _mm_store_pd(&m_matrix[3][2], accumulator); | |
#else | |
Matrix4 tmp; | |
diff --git a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h | |
index 181e033..f324c35 100644 | |
--- a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h | |
+++ b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h | |
@@ -69,10 +69,14 @@ class LayoutRect; | |
class FloatRect; | |
class FloatQuad; | |
+#if CPU(X86_64) && !PLATFORM(WINDOWS) | |
+#define TRANSFORMATION_MATRIX_USE_X86_64_SSE2 | |
+#endif | |
+ | |
class TransformationMatrix { | |
WTF_MAKE_FAST_ALLOCATED; | |
public: | |
-#if CPU(APPLE_ARMV7S) | |
+#if CPU(APPLE_ARMV7S) || defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2) | |
typedef double Matrix4[4][4] __attribute__((aligned (16))); | |
#else | |
typedef double Matrix4[4][4]; | |
@@ -226,7 +230,7 @@ public: | |
double f() const { return m_matrix[3][1]; } | |
void setF(double f) { m_matrix[3][1] = f; } | |
- // this = this * mat | |
+ // this = mat * this. | |
TransformationMatrix& multiply(const TransformationMatrix&); | |
TransformationMatrix& scale(double); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment