diff --git a/rts/System/Matrix44f.cpp b/rts/System/Matrix44f.cpp index 6368561aca2..c2eaccdac46 100644 --- a/rts/System/Matrix44f.cpp +++ b/rts/System/Matrix44f.cpp @@ -326,12 +326,9 @@ CMatrix44f& CMatrix44f::Translate(const float x, const float y, const float z) return *this; } - - __FORCE_ALIGN_STACK__ static inline void MatrixMatrixMultiplySSE(const CMatrix44f& m1, const CMatrix44f& m2, CMatrix44f* mout) { - //alignof guarantees 16 byte alignment required by SSE2 const __m128 m1c1 = _mm_load_ps(&m1.md[0][0]); const __m128 m1c2 = _mm_load_ps(&m1.md[1][0]); const __m128 m1c3 = _mm_load_ps(&m1.md[2][0]); @@ -342,43 +339,35 @@ static inline void MatrixMatrixMultiplySSE(const CMatrix44f& m1, const CMatrix44 assert(m2.m[7] == 0.0f); // assert(m2.m[11] == 0.0f); in case of a gluPerspective it's -1 - const __m128 m2i0 = _mm_load1_ps(&m2.m[0]); - const __m128 m2i1 = _mm_load1_ps(&m2.m[1]); - const __m128 m2i2 = _mm_load1_ps(&m2.m[2]); - //const __m128 m2i3 = _mm_load1_ps(&m2.m[3]); - const __m128 m2i4 = _mm_load1_ps(&m2.m[4]); - const __m128 m2i5 = _mm_load1_ps(&m2.m[5]); - const __m128 m2i6 = _mm_load1_ps(&m2.m[6]); - //const __m128 m2i7 = _mm_load1_ps(&m2.m[7]); - const __m128 m2i8 = _mm_load1_ps(&m2.m[8]); - const __m128 m2i9 = _mm_load1_ps(&m2.m[9]); - const __m128 m2i10 = _mm_load1_ps(&m2.m[10]); - const __m128 m2i11 = _mm_load1_ps(&m2.m[11]); - const __m128 m2i12 = _mm_load1_ps(&m2.m[12]); - const __m128 m2i13 = _mm_load1_ps(&m2.m[13]); - const __m128 m2i14 = _mm_load1_ps(&m2.m[14]); - const __m128 m2i15 = _mm_load1_ps(&m2.m[15]); - - __m128 moutc1, moutc2, moutc3, moutc4; - moutc1 = _mm_mul_ps(m1c1, m2i0); - moutc2 = _mm_mul_ps(m1c1, m2i4); - moutc3 = _mm_mul_ps(m1c1, m2i8); - moutc4 = _mm_mul_ps(m1c1, m2i12); - - moutc1 = _mm_add_ps(moutc1, _mm_mul_ps(m1c2, m2i1)); - moutc2 = _mm_add_ps(moutc2, _mm_mul_ps(m1c2, m2i5)); - moutc3 = _mm_add_ps(moutc3, _mm_mul_ps(m1c2, m2i9)); - moutc4 = _mm_add_ps(moutc4, _mm_mul_ps(m1c2, m2i13)); - - moutc1 = _mm_add_ps(moutc1, _mm_mul_ps(m1c3, m2i2)); - moutc2 = _mm_add_ps(moutc2, _mm_mul_ps(m1c3, m2i6)); - moutc3 = _mm_add_ps(moutc3, _mm_mul_ps(m1c3, m2i10)); - moutc4 = _mm_add_ps(moutc4, _mm_mul_ps(m1c3, m2i14)); - - //moutc1 = _mm_add_ps(moutc1, _mm_mul_ps(m1c4, _mm_load1_ps(&m2.m[3]))); - //moutc2 = _mm_add_ps(moutc2, _mm_mul_ps(m1c4, _mm_load1_ps(&m2.m[7]))); - moutc3 = _mm_add_ps(moutc3, _mm_mul_ps(m1c4, m2i11)); - moutc4 = _mm_add_ps(moutc4, _mm_mul_ps(m1c4, m2i15)); + // Load each column of m2 as a full vector, then use _mm_shuffle_ps to broadcast + // each element — avoids 12 separate scalar loads with _mm_load1_ps + const __m128 m2c0 = _mm_load_ps(&m2.m[0]); // [m00, m10, m20, m30=0] + const __m128 m2c1 = _mm_load_ps(&m2.m[4]); // [m01, m11, m21, m31=0] + const __m128 m2c2 = _mm_load_ps(&m2.m[8]); // [m02, m12, m22, m32] + const __m128 m2c3 = _mm_load_ps(&m2.m[12]); // [m03, m13, m23, m33] + + #define SPLAT(v, i) _mm_shuffle_ps(v, v, _MM_SHUFFLE(i,i,i,i)) + + __m128 moutc1 = _mm_mul_ps(m1c1, SPLAT(m2c0, 0)); + __m128 moutc2 = _mm_mul_ps(m1c1, SPLAT(m2c1, 0)); + __m128 moutc3 = _mm_mul_ps(m1c1, SPLAT(m2c2, 0)); + __m128 moutc4 = _mm_mul_ps(m1c1, SPLAT(m2c3, 0)); + + moutc1 = _mm_add_ps(moutc1, _mm_mul_ps(m1c2, SPLAT(m2c0, 1))); + moutc2 = _mm_add_ps(moutc2, _mm_mul_ps(m1c2, SPLAT(m2c1, 1))); + moutc3 = _mm_add_ps(moutc3, _mm_mul_ps(m1c2, SPLAT(m2c2, 1))); + moutc4 = _mm_add_ps(moutc4, _mm_mul_ps(m1c2, SPLAT(m2c3, 1))); + + moutc1 = _mm_add_ps(moutc1, _mm_mul_ps(m1c3, SPLAT(m2c0, 2))); + moutc2 = _mm_add_ps(moutc2, _mm_mul_ps(m1c3, SPLAT(m2c1, 2))); + moutc3 = _mm_add_ps(moutc3, _mm_mul_ps(m1c3, SPLAT(m2c2, 2))); + moutc4 = _mm_add_ps(moutc4, _mm_mul_ps(m1c3, SPLAT(m2c3, 2))); + + // m2.m[3] and m2.m[7] are zero — skip those terms + moutc3 = _mm_add_ps(moutc3, _mm_mul_ps(m1c4, SPLAT(m2c2, 3))); + moutc4 = _mm_add_ps(moutc4, _mm_mul_ps(m1c4, SPLAT(m2c3, 3))); + + #undef SPLAT _mm_store_ps(&mout->md[0][0], moutc1); _mm_store_ps(&mout->md[1][0], moutc2); diff --git a/test/engine/System/testMatrix44f.cpp b/test/engine/System/testMatrix44f.cpp index 9ddc7457e9c..53ca5de1863 100644 --- a/test/engine/System/testMatrix44f.cpp +++ b/test/engine/System/testMatrix44f.cpp @@ -1,5 +1,7 @@ /* This file is part of the Spring engine (GPL v2 or later), see LICENSE.html */ +#include + #include "System/simd_compat.h" #include "System/Matrix44f.h" #include "System/float4.h" @@ -99,7 +101,7 @@ static const int testRuns = 40000000; } -_noinline static void MatrixMatrixMultiply(CMatrix44f* m1, const CMatrix44f& m2) +_noinline static void MatrixMatrixMultiplySSEOld(CMatrix44f* m1, const CMatrix44f& m2) { assert(long(&m1->m[0]) % 16 == 0); // 16byte aligned @@ -206,11 +208,12 @@ _noinline static int TestMMSSE() ScopedOnceTimer timer("Matrix-Matrix-Mult: sse"); CMatrix44f m1(m_); for (int i = 0; i < testRuns; ++i) { - MatrixMatrixMultiply(&m1, m); + MatrixMatrixMultiplySSEOld(&m1, m); } return spring::LiteHash(&m1, sizeof(CMatrix44f), 0); } + _noinline static int TestSpring() { ScopedOnceTimer timer("Matrix-Vector-Mult: spring"); @@ -287,3 +290,82 @@ TEST_CASE("Matrix44MatrixMultiply") } } } + +TEST_CASE("Matrix44MatrixMultiplySSE") +{ + for (int i = 0; i < 16; ++i) { + if ((i != 7) && (i != 3)) { + m[i] = float(i + 1) / 31.3125f; + m_[i] = float(i + 1) / 31.3125f; + } else { + m[i] = 0.0f; + m_[i] = 0.0f; + } + } + + ScopedOnceTimer timer("Matrix-Matrix-Mult: sse"); + CMatrix44f m1(m_); + for (int i = 0; i < testRuns; ++i) { + MatrixMatrixMultiplySSEOld(&m1, m); + } + spring::LiteHash(&m1, sizeof(CMatrix44f), 0); + + spring_clock::PopTickRate(); +} + +TEST_CASE("Matrix44MatrixMultiplySSEOldVsSSENew") +{ + const int numTests = 100000; + bool allMatch = true; + std::mt19937 rng(12345); + std::uniform_real_distribution dist(-10.0f, 10.0f); + + for (int t = 0; t < numTests; ++t) { + for (int i = 0; i < 16; ++i) { + m[i] = dist(rng); + } + + for (int i = 0; i < 16; ++i) { + if (i == 3 || i == 7) { + m_[i] = 0.0f; + } else { + m_[i] = dist(rng); + } + } + + assert(m_[3] == 0.0f); + assert(m_[7] == 0.0f); + + CMatrix44f resultNew = m * m_; + MatrixMatrixMultiplySSEOld(&m, m_); + + if (!(m == resultNew)) { + allMatch = false; + break; + } + } + + CHECK(allMatch == true); +} + +TEST_CASE("Matrix44MatrixMultiplySSE_Opt") +{ + for (int i = 0; i < 16; ++i) { + if ((i != 7) && (i != 3)) { + m[i] = float(i + 1) / 31.3125f; + m_[i] = float(i + 1) / 31.3125f; + } else { + m[i] = 0.0f; + m_[i] = 0.0f; + } + } + + ScopedOnceTimer timer("Matrix-Matrix-Mult: sse_new"); + CMatrix44f m1(m_); + for (int i = 0; i < testRuns; ++i) { + m1 = m1 * m_; + } + spring::LiteHash(&m1, sizeof(CMatrix44f), 0); + + spring_clock::PopTickRate(); +}