BenjaminPoulain · January 4, 2013 22:20
diff --git a/gistfile1.diff b/gistfile1.diff
 diff --git a/Source/WebCore/ChangeLog b/Source/WebCore/ChangeLog
 index 48379ba..27872ed 100644
 --- a/Source/WebCore/ChangeLog
 +++ b/Source/WebCore/ChangeLog
 @@ -1,3 +1,31 @@
 +2013-01-04  Benjamin Poulain  <[email protected]>
 +
 +        Optimize TransformationMatrix::multiply() for x86_64
 +        https://bugs.webkit.org/show_bug.cgi?id=105719
 +
 +        Reviewed by NOBODY (OOPS!).
 +
 +        On x86_64, we have access to 16 XMM registers. These can hold 32 double values.
 +        We can use that in two ways to optimize matrix multiplications:
 +        -Keep the source matrix completely in registers. Write the result directly in
 +         the source matrix's memory. This avoids the memcpy at the end of the multiplication
 +         and various memory operations.
 +        -Use SIMD with SSE to perform 2 operations at a time.
 +
 +        The parameters from the second matrix are loaded one by one into XMM registers.
 +        Loading them with SSE and then shuffling the values performs worse than loading
 +        them one by one.
 +
 +        This is only enabled on 64-bit builds, as 32-bit x86 only has access to 8 XMM
 +        registers and the function would need to be written differently there.
 +
 +        On an i5, TransformationMatrix::multiply() performs about 3 times faster with the change.
 +
 +        * platform/graphics/transforms/TransformationMatrix.cpp:
 +        (WebCore::TransformationMatrix::multiply):
 +        * platform/graphics/transforms/TransformationMatrix.h:
 +        (TransformationMatrix): Fix an incorrect comment. Unify the comment with the cpp file.
 +
 2013-01-04  Zoltan Horvath  <[email protected]>
 
         [CSS Regions] @region rules inside media queries are ignored
 diff --git a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
 index adcb48b..bfbc355 100644
 --- a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
 +++ b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
 @@ -36,6 +36,10 @@
 #include <wtf/Assertions.h>
 #include <wtf/MathExtras.h>
 
 +#if CPU(X86_64)
 +#include <emmintrin.h>
 +#endif
 +
 using namespace std;
 
 namespace WebCore {
 @@ -968,9 +972,7 @@ TransformationMatrix TransformationMatrix::rectToRect(const FloatRect& from, con
                                 to.y() - from.y());
 }
 
 -//
 -// *this = mat * *this
 -//
 +// this = mat * this.
 TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix& mat)
 {
 #if CPU(APPLE_ARMV7S)
 @@ -1115,6 +1117,129 @@ TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix&
     }
 #undef MATRIX_MULTIPLY_ONE_LINE
 
 +#elif defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2)
 +    // x86_64 has 16 XMM registers which is enough to do the multiplication fully in registers.
 +    __m128d matrixBlockA = _mm_load_pd(&(m_matrix[0][0]));
 +    __m128d matrixBlockC = _mm_load_pd(&(m_matrix[1][0]));
 +    __m128d matrixBlockE = _mm_load_pd(&(m_matrix[2][0]));
 +    __m128d matrixBlockG = _mm_load_pd(&(m_matrix[3][0]));
 +
 +    // First row.
 +    __m128d otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[0][0]);
 +    __m128d otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[0][1]);
 +    __m128d otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[0][2]);
 +    __m128d otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[0][3]);
 +
 +    // output00 and output01.
 +    __m128d accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
 +    __m128d temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
 +    __m128d temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
 +    __m128d temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
 +
 +    __m128d matrixBlockB = _mm_load_pd(&(m_matrix[0][2]));
 +    __m128d matrixBlockD = _mm_load_pd(&(m_matrix[1][2]));
 +    __m128d matrixBlockF = _mm_load_pd(&(m_matrix[2][2]));
 +    __m128d matrixBlockH = _mm_load_pd(&(m_matrix[3][2]));
 +
 +    accumulator = _mm_add_pd(accumulator, temp1);
 +    accumulator = _mm_add_pd(accumulator, temp2);
 +    accumulator = _mm_add_pd(accumulator, temp3);
 +    _mm_store_pd(&m_matrix[0][0], accumulator);
 +
 +    // output02 and output03.
 +    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
 +    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
 +    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
 +    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
 +
 +    accumulator = _mm_add_pd(accumulator, temp1);
 +    accumulator = _mm_add_pd(accumulator, temp2);
 +    accumulator = _mm_add_pd(accumulator, temp3);
 +    _mm_store_pd(&m_matrix[0][2], accumulator);
 +
 +    // Second row.
 +    otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[1][0]);
 +    otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[1][1]);
 +    otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[1][2]);
 +    otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[1][3]);
 +
 +    // output10 and output11.
 +    accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
 +    temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
 +    temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
 +    temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
 +
 +    accumulator = _mm_add_pd(accumulator, temp1);
 +    accumulator = _mm_add_pd(accumulator, temp2);
 +    accumulator = _mm_add_pd(accumulator, temp3);
 +    _mm_store_pd(&m_matrix[1][0], accumulator);
 +
 +    // output12 and output13.
 +    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
 +    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
 +    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
 +    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
 +
 +    accumulator = _mm_add_pd(accumulator, temp1);
 +    accumulator = _mm_add_pd(accumulator, temp2);
 +    accumulator = _mm_add_pd(accumulator, temp3);
 +    _mm_store_pd(&m_matrix[1][2], accumulator);
 +
 +    // Third row.
 +    otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[2][0]);
 +    otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[2][1]);
 +    otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[2][2]);
 +    otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[2][3]);
 +
 +    // output20 and output21.
 +    accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
 +    temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
 +    temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
 +    temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
 +
 +    accumulator = _mm_add_pd(accumulator, temp1);
 +    accumulator = _mm_add_pd(accumulator, temp2);
 +    accumulator = _mm_add_pd(accumulator, temp3);
 +    _mm_store_pd(&m_matrix[2][0], accumulator);
 +
 +    // output22 and output23.
 +    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
 +    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
 +    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
 +    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
 +
 +    accumulator = _mm_add_pd(accumulator, temp1);
 +    accumulator = _mm_add_pd(accumulator, temp2);
 +    accumulator = _mm_add_pd(accumulator, temp3);
 +    _mm_store_pd(&m_matrix[2][2], accumulator);
 +
 +    // Fourth row.
 +    otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[3][0]);
 +    otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[3][1]);
 +    otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[3][2]);
 +    otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[3][3]);
 +
 +    // output30 and output31.
 +    accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
 +    temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
 +    temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
 +    temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
 +
 +    accumulator = _mm_add_pd(accumulator, temp1);
 +    accumulator = _mm_add_pd(accumulator, temp2);
 +    accumulator = _mm_add_pd(accumulator, temp3);
 +    _mm_store_pd(&m_matrix[3][0], accumulator);
 +
 +    // output32 and output33.
 +    accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
 +    temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
 +    temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
 +    temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
 +
 +    accumulator = _mm_add_pd(accumulator, temp1);
 +    accumulator = _mm_add_pd(accumulator, temp2);
 +    accumulator = _mm_add_pd(accumulator, temp3);
 +    _mm_store_pd(&m_matrix[3][2], accumulator);
 #else
     Matrix4 tmp;
     
 diff --git a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h
 index 181e033..f324c35 100644
 --- a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h
 +++ b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h
 @@ -69,10 +69,14 @@ class LayoutRect;
 class FloatRect;
 class FloatQuad;
 
 +#if CPU(X86_64) && !PLATFORM(WINDOWS)
 +#define TRANSFORMATION_MATRIX_USE_X86_64_SSE2
 +#endif
 +
 class TransformationMatrix {
     WTF_MAKE_FAST_ALLOCATED;
 public:
 -#if CPU(APPLE_ARMV7S)
 +#if CPU(APPLE_ARMV7S) || defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2)
     typedef double Matrix4[4][4] __attribute__((aligned (16)));
 #else
     typedef double Matrix4[4][4];
 @@ -226,7 +230,7 @@ public:
     double f() const { return m_matrix[3][1]; }
     void setF(double f) { m_matrix[3][1] = f; }
 
 -    // this = this * mat
 +    // this = mat * this.
     TransformationMatrix& multiply(const TransformationMatrix&);
 
     TransformationMatrix& scale(double);
	diff --git a/Source/WebCore/ChangeLog b/Source/WebCore/ChangeLog
	index 48379ba..27872ed 100644
	--- a/Source/WebCore/ChangeLog
	+++ b/Source/WebCore/ChangeLog
	@@ -1,3 +1,31 @@
	+2013-01-04 Benjamin Poulain <[email protected]>
	+
	+ Optimize TransformationMatrix::multiply() for x86_64
	+ https://bugs.webkit.org/show_bug.cgi?id=105719
	+
	+ Reviewed by NOBODY (OOPS!).
	+
	+ On x86_64, we have access to 16 XMM registers. These can hold 32 double values.
	+ We can use that in two ways to optimize matrix multiplications:
	+ -Keep the source matrix completely in registers. Write the result directly in
	+ the source matrix's memory. This avoids the memcpy at the end of the multiplication
	+ and various memory operations.
	+ -Use SIMD with SSE to perform 2 operations at a time.
	+
	+ The parameters from the second matrix are loaded one by one into XMM registers.
	+ Loading them with SSE and then shuffling the values performs worse than loading
	+ them one by one.
	+
	+ This is only enabled on 64-bit builds, as 32-bit x86 only has access to 8 XMM
	+ registers and the function would need to be written differently there.
	+
	+ On an i5, TransformationMatrix::multiply() performs about 3 times faster with the change.
	+
	+ * platform/graphics/transforms/TransformationMatrix.cpp:
	+ (WebCore::TransformationMatrix::multiply):
	+ * platform/graphics/transforms/TransformationMatrix.h:
	+ (TransformationMatrix): Fix an incorrect comment. Unify the comment with the cpp file.
	+
	2013-01-04 Zoltan Horvath <[email protected]>

	[CSS Regions] @region rules inside media queries are ignored
	diff --git a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
	index adcb48b..bfbc355 100644
	--- a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
	+++ b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
	@@ -36,6 +36,10 @@
	#include <wtf/Assertions.h>
	#include <wtf/MathExtras.h>

	+#if CPU(X86_64)
	+#include <emmintrin.h>
	+#endif
	+
	using namespace std;

	namespace WebCore {
	@@ -968,9 +972,7 @@ TransformationMatrix TransformationMatrix::rectToRect(const FloatRect& from, con
	to.y() - from.y());
	}

	-//
	-// *this = mat * *this
	-//
	+// this = mat * this.
	TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix& mat)
	{
	#if CPU(APPLE_ARMV7S)
	@@ -1115,6 +1117,129 @@ TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix&
	}
	#undef MATRIX_MULTIPLY_ONE_LINE

	+#elif defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2)
	+ // x86_64 has 16 XMM registers which is enough to do the multiplication fully in registers.
	+ __m128d matrixBlockA = _mm_load_pd(&(m_matrix[0][0]));
	+ __m128d matrixBlockC = _mm_load_pd(&(m_matrix[1][0]));
	+ __m128d matrixBlockE = _mm_load_pd(&(m_matrix[2][0]));
	+ __m128d matrixBlockG = _mm_load_pd(&(m_matrix[3][0]));
	+
	+ // First row.
	+ __m128d otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[0][0]);
	+ __m128d otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[0][1]);
	+ __m128d otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[0][2]);
	+ __m128d otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[0][3]);
	+
	+ // output00 and output01.
	+ __m128d accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
	+ __m128d temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
	+ __m128d temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
	+ __m128d temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
	+
	+ __m128d matrixBlockB = _mm_load_pd(&(m_matrix[0][2]));
	+ __m128d matrixBlockD = _mm_load_pd(&(m_matrix[1][2]));
	+ __m128d matrixBlockF = _mm_load_pd(&(m_matrix[2][2]));
	+ __m128d matrixBlockH = _mm_load_pd(&(m_matrix[3][2]));
	+
	+ accumulator = _mm_add_pd(accumulator, temp1);
	+ accumulator = _mm_add_pd(accumulator, temp2);
	+ accumulator = _mm_add_pd(accumulator, temp3);
	+ _mm_store_pd(&m_matrix[0][0], accumulator);
	+
	+ // output02 and output03.
	+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
	+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
	+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
	+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
	+
	+ accumulator = _mm_add_pd(accumulator, temp1);
	+ accumulator = _mm_add_pd(accumulator, temp2);
	+ accumulator = _mm_add_pd(accumulator, temp3);
	+ _mm_store_pd(&m_matrix[0][2], accumulator);
	+
	+ // Second row.
	+ otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[1][0]);
	+ otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[1][1]);
	+ otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[1][2]);
	+ otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[1][3]);
	+
	+ // output10 and output11.
	+ accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
	+ temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
	+ temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
	+ temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
	+
	+ accumulator = _mm_add_pd(accumulator, temp1);
	+ accumulator = _mm_add_pd(accumulator, temp2);
	+ accumulator = _mm_add_pd(accumulator, temp3);
	+ _mm_store_pd(&m_matrix[1][0], accumulator);
	+
	+ // output12 and output13.
	+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
	+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
	+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
	+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
	+
	+ accumulator = _mm_add_pd(accumulator, temp1);
	+ accumulator = _mm_add_pd(accumulator, temp2);
	+ accumulator = _mm_add_pd(accumulator, temp3);
	+ _mm_store_pd(&m_matrix[1][2], accumulator);
	+
	+ // Third row.
	+ otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[2][0]);
	+ otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[2][1]);
	+ otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[2][2]);
	+ otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[2][3]);
	+
	+ // output20 and output21.
	+ accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
	+ temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
	+ temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
	+ temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
	+
	+ accumulator = _mm_add_pd(accumulator, temp1);
	+ accumulator = _mm_add_pd(accumulator, temp2);
	+ accumulator = _mm_add_pd(accumulator, temp3);
	+ _mm_store_pd(&m_matrix[2][0], accumulator);
	+
	+ // output22 and output23.
	+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
	+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
	+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
	+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
	+
	+ accumulator = _mm_add_pd(accumulator, temp1);
	+ accumulator = _mm_add_pd(accumulator, temp2);
	+ accumulator = _mm_add_pd(accumulator, temp3);
	+ _mm_store_pd(&m_matrix[2][2], accumulator);
	+
	+ // Fourth row.
	+ otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[3][0]);
	+ otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[3][1]);
	+ otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[3][2]);
	+ otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[3][3]);
	+
	+ // output30 and output31.
	+ accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
	+ temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
	+ temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
	+ temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);
	+
	+ accumulator = _mm_add_pd(accumulator, temp1);
	+ accumulator = _mm_add_pd(accumulator, temp2);
	+ accumulator = _mm_add_pd(accumulator, temp3);
	+ _mm_store_pd(&m_matrix[3][0], accumulator);
	+
	+ // output32 and output33.
	+ accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
	+ temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
	+ temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
	+ temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);
	+
	+ accumulator = _mm_add_pd(accumulator, temp1);
	+ accumulator = _mm_add_pd(accumulator, temp2);
	+ accumulator = _mm_add_pd(accumulator, temp3);
	+ _mm_store_pd(&m_matrix[3][2], accumulator);
	#else
	Matrix4 tmp;

	diff --git a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h
	index 181e033..f324c35 100644
	--- a/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h
	+++ b/Source/WebCore/platform/graphics/transforms/TransformationMatrix.h
	@@ -69,10 +69,14 @@ class LayoutRect;
	class FloatRect;
	class FloatQuad;

	+#if CPU(X86_64) && !PLATFORM(WINDOWS)
	+#define TRANSFORMATION_MATRIX_USE_X86_64_SSE2
	+#endif
	+
	class TransformationMatrix {
	WTF_MAKE_FAST_ALLOCATED;
	public:
	-#if CPU(APPLE_ARMV7S)
	+#if CPU(APPLE_ARMV7S) || defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2)
	typedef double Matrix4[4][4] __attribute__((aligned (16)));
	#else
	typedef double Matrix4[4][4];
	@@ -226,7 +230,7 @@ public:
	double f() const { return m_matrix[3][1]; }
	void setF(double f) { m_matrix[3][1] = f; }

	- // this = this * mat
	+ // this = mat * this.
	TransformationMatrix& multiply(const TransformationMatrix&);

	TransformationMatrix& scale(double);