So I have this SSE3 implementation for matrix multiplication:
/**
 * Loop is unrolled for performance
 * @attention Unlike the non-SIMD multiplication, this one works in column-major order
 */
inline void multiply(const float *__restrict affector, const float *__restrict affected, float *__restrict result)
{
// std::cout << "INSIDE SSE3" << std::endl;
// Load rows of the affector matrix (used directly as rows)
__m128 a0 = _mm_load_ps(&affector[0]);
__m128 a1 = _mm_load_ps(&affector[4]);
__m128 a2 = _mm_load_ps(&affector[8]);
__m128 a3 = _mm_load_ps(&affector[12]);
// Load rows of the affected matrix; these get transposed to columns below
__m128 b0 = _mm_load_ps(&affected[0]);
__m128 b1 = _mm_load_ps(&affected[4]);
__m128 b2 = _mm_load_ps(&affected[8]);
__m128 b3 = _mm_load_ps(&affected[12]);
// b0 = [1, 2, 3, 4]
// b1 = [5, 6, 7, 8]
// b2 = [9, 10, 11, 12]
// b3 = [13, 14, 15, 16]
// need to arrive at-> b0 = [1, 5, 9, 13]
// need to arrive at-> b1 = [2, 6, 10, 14]
// need to arrive at-> b2 = [3, 7, 11, 15]
// need to arrive at-> b3 = [4, 8, 12, 16]
// tmp0 = [1, 5, 2, 6]
__m128 tmp0 = _mm_unpacklo_ps(b0, b1);
// tmp1 = [3, 7, 4, 8]
__m128 tmp1 = _mm_unpackhi_ps(b0, b1);
// tmp2 = [9, 13, 10, 14]
__m128 tmp2 = _mm_unpacklo_ps(b2, b3);
// tmp3 = [11, 15, 12, 16]
__m128 tmp3 = _mm_unpackhi_ps(b2, b3);
// b0 = [1, 5, 9, 13] = low halves of tmp0 and tmp2
b0 = _mm_movelh_ps(tmp0, tmp2);
// b1 = [2, 6, 10, 14] = high halves of tmp0 and tmp2
b1 = _mm_movehl_ps(tmp2, tmp0);
// b2 = [3, 7, 11, 15] = low halves of tmp1 and tmp3
b2 = _mm_movelh_ps(tmp1, tmp3);
// b3 = [4, 8, 12, 16] = high halves of tmp1 and tmp3
b3 = _mm_movehl_ps(tmp3, tmp1);
// Need the dot product [x, y, z, d] . [1, 5, 9, 13]
// The element-wise multiply gives [x*1, y*5, z*9, d*13]
__m128 mul = _mm_mul_ps(a0, b0);
// Horizontal add to sum these values
// This results in [x*1 + y*5, z*9 + d*13, x*1 + y*5, z*9 + d*13]
mul = _mm_hadd_ps(mul, mul);
// This results in the full dot product broadcast to all four lanes
mul = _mm_hadd_ps(mul, mul);
// Retrieve the result into result[0]
result[0] = _mm_cvtss_f32(mul);
// Perform the same for the rest of the matrix elements
mul = _mm_mul_ps(a0, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[1] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a0, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[2] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a0, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[3] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a1, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[4] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a1, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[5] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a1, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[6] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a1, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[7] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a2, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[8] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a2, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[9] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a2, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[10] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a2, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[11] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a3, b0);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[12] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a3, b1);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[13] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a3, b2);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[14] = _mm_cvtss_f32(mul);
mul = _mm_mul_ps(a3, b3);
mul = _mm_hadd_ps(mul, mul);
mul = _mm_hadd_ps(mul, mul);
result[15] = _mm_cvtss_f32(mul);
}
Running this function 1,000,000 times results in a speed of ~0.04 seconds.
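For reference, the timing comes from a loop roughly like this (a simplified sketch, not the exact harness; the arrays are 16-byte aligned because _mm_load_ps requires it, and multiply from above is assumed visible):
#include <chrono>
#include <cstdio>

int main()
{
    // _mm_load_ps requires 16-byte alignment
    alignas(16) float a[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
    alignas(16) float b[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
    alignas(16) float c[16];
    auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < 1000000; i++)
        multiply(a, b, c);
    auto end = std::chrono::steady_clock::now();
    // Print an element of the result so the optimizer can't delete the loop
    // entirely (it may still hoist the call, since the inputs never change)
    std::printf("%f s, c[0] = %f\n",
                std::chrono::duration<double>(end - start).count(), c[0]);
}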
Now, I was thinking that using a dot product would speed things up, since I wouldn't have to:
1. Multiply
2. Do horizontal addition
3. Do another horizontal addition
but instead just:
1. Single Dot product
Here is the SSE4.1 implementation:
/**
 * Loop is unrolled for performance
 * @attention Unlike the non-SIMD multiplication, this one works in column-major order
 */
inline void multiply(const float *__restrict affector, const float *__restrict affected, float *__restrict result)
{
// std::cout << "INSIDE SSE4" << std::endl;
// Load rows of the affector matrix (used directly as rows)
__m128 a0 = _mm_load_ps(&affector[0]);
__m128 a1 = _mm_load_ps(&affector[4]);
__m128 a2 = _mm_load_ps(&affector[8]);
__m128 a3 = _mm_load_ps(&affector[12]);
// Load rows of the affected matrix; these get transposed to columns below
__m128 b0 = _mm_load_ps(&affected[0]);
__m128 b1 = _mm_load_ps(&affected[4]);
__m128 b2 = _mm_load_ps(&affected[8]);
__m128 b3 = _mm_load_ps(&affected[12]);
// b0 = [1, 2, 3, 4]
// b1 = [5, 6, 7, 8]
// b2 = [9, 10, 11, 12]
// b3 = [13, 14, 15, 16]
// need to arrive at-> b0 = [1, 5, 9, 13]
// need to arrive at-> b1 = [2, 6, 10, 14]
// need to arrive at-> b2 = [3, 7, 11, 15]
// need to arrive at-> b3 = [4, 8, 12, 16]
// tmp0 = [1, 5, 2, 6]
__m128 tmp0 = _mm_unpacklo_ps(b0, b1);
// tmp1 = [3, 7, 4, 8]
__m128 tmp1 = _mm_unpackhi_ps(b0, b1);
// tmp2 = [9, 13, 10, 14]
__m128 tmp2 = _mm_unpacklo_ps(b2, b3);
// tmp3 = [11, 15, 12, 16]
__m128 tmp3 = _mm_unpackhi_ps(b2, b3);
// b0 = [1, 5, 9, 13] = low halves of tmp0 and tmp2
b0 = _mm_movelh_ps(tmp0, tmp2);
// b1 = [2, 6, 10, 14] = high halves of tmp0 and tmp2
b1 = _mm_movehl_ps(tmp2, tmp0);
// b2 = [3, 7, 11, 15] = low halves of tmp1 and tmp3
b2 = _mm_movelh_ps(tmp1, tmp3);
// b3 = [4, 8, 12, 16] = high halves of tmp1 and tmp3
b3 = _mm_movehl_ps(tmp3, tmp1);
__m128 mul;
// Perform the matrix multiplication for each element
mul = _mm_dp_ps(a0, b0, 0xF1); // Dot product of a0 and b0; mask 0xF1 = multiply all four lanes, write the sum to lane 0 only
result[0] = _mm_cvtss_f32(mul); // Store result
mul = _mm_dp_ps(a0, b1, 0xF1); // Dot product of a0 and b1
result[1] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a0, b2, 0xF1); // Dot product of a0 and b2
result[2] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a0, b3, 0xF1); // Dot product of a0 and b3
result[3] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a1, b0, 0xF1); // Dot product of a1 and b0
result[4] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a1, b1, 0xF1); // Dot product of a1 and b1
result[5] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a1, b2, 0xF1); // Dot product of a1 and b2
result[6] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a1, b3, 0xF1); // Dot product of a1 and b3
result[7] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a2, b0, 0xF1); // Dot product of a2 and b0
result[8] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a2, b1, 0xF1); // Dot product of a2 and b1
result[9] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a2, b2, 0xF1); // Dot product of a2 and b2
result[10] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a2, b3, 0xF1); // Dot product of a2 and b3
result[11] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a3, b0, 0xF1); // Dot product of a3 and b0
result[12] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a3, b1, 0xF1); // Dot product of a3 and b1
result[13] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a3, b2, 0xF1); // Dot product of a3 and b2
result[14] = _mm_cvtss_f32(mul);
mul = _mm_dp_ps(a3, b3, 0xF1); // Dot product of a3 and b3
result[15] = _mm_cvtss_f32(mul);
}
The result was ~0.15 seconds!!! This is even slower than my implementation that doesn't use intrinsics (~0.11 - 0.12 seconds) and the one that uses SSE2 (~0.10 - 0.90 seconds). What's going on? Is it due to how the dot product is implemented at the hardware level, or am I doing something wrong?
All four versions (non-intrinsic, SSE2, SSE3, SSE4.1) were compiled with -O2.
haddps is also slow: 3 uops. Never use it with the same input for both operands unless you're optimizing for code-size, not speed. (It has use-cases for a transpose-and-reduce with different inputs.)
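For example, applied to the variables in your code (a0 and the transposed b0..b3), the different-inputs pattern produces a whole row of outputs per reduction instead of one scalar at a time. A sketch (still not the recommended approach, just less wasteful with hadd):
__m128 m0 = _mm_mul_ps(a0, b0);     // products for result[0]
__m128 m1 = _mm_mul_ps(a0, b1);     // products for result[1]
__m128 m2 = _mm_mul_ps(a0, b2);     // products for result[2]
__m128 m3 = _mm_mul_ps(a0, b3);     // products for result[3]
// hadd(x, y) = [x0+x1, x2+x3, y0+y1, y2+y3]: each hadd combines two vectors
__m128 h01 = _mm_hadd_ps(m0, m1);   // [pairs of m0 | pairs of m1]
__m128 h23 = _mm_hadd_ps(m2, m3);   // [pairs of m2 | pairs of m3]
__m128 row = _mm_hadd_ps(h01, h23); // [dot0, dot1, dot2, dot3]
_mm_store_ps(&result[0], row);      // one vector store instead of four scalar stores
That's three hadds and one store per row of results, instead of your eight hadds and four scalar stores.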
But dpps has gotten slower on recent CPUs: 6 uops on Alder Lake P-cores vs. 4 on SKL (https://uops.info/). It's bad on recent AMD, too: 8 uops on Zen 4. You haven't said anything about what CPU you tested this on, but on some of them, yeah, it makes sense that the dpps version is even worse than the hadd version.
Anyway, horizontally summing each element separately, even with the best possible shuffles, is still definitely not close to the most efficient way to do a 4x4 matmul.
You should be shuffling together (or vertically adding) multiple source vectors as you reduce, to produce a whole row or column of outputs. Efficient 4x4 matrix multiplication (C vs assembly) shows a pretty clean version that loads all 4 rows of one matrix into 4 vectors, then broadcasts each scalar element of the other matrix separately, 4 at a time.
There's no single instruction for a broadcast-load until AVX1 vbroadcastss, so with only SSE the best bet is probably a vector load and 4x _mm_shuffle_ps(v, v, _MM_SHUFFLE(0,0,0,0)), _MM_SHUFFLE(1,1,1,1), etc. Or maybe a compiler will do that for you with _mm_set1_ps(A[4*i + 1]) etc.
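A sketch of that broadcast strategy with plain SSE1 intrinsics, keeping your function signature and row-major layout (multiply_broadcast is just a placeholder name):
#include <immintrin.h>

inline void multiply_broadcast(const float *__restrict affector, const float *__restrict affected, float *__restrict result)
{
    // Keep all 4 rows of the right-hand matrix in registers; no transpose needed
    __m128 b0 = _mm_load_ps(&affected[0]);
    __m128 b1 = _mm_load_ps(&affected[4]);
    __m128 b2 = _mm_load_ps(&affected[8]);
    __m128 b3 = _mm_load_ps(&affected[12]);
    for (int i = 0; i < 4; i++) {
        // Broadcast each scalar of row i of the left-hand matrix, then use only
        // vertical multiplies and adds: no hadd, no dpps, no shuffling to reduce
        __m128 row = _mm_mul_ps(_mm_set1_ps(affector[4 * i + 0]), b0);
        row = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(affector[4 * i + 1]), b1));
        row = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(affector[4 * i + 2]), b2));
        row = _mm_add_ps(row, _mm_mul_ps(_mm_set1_ps(affector[4 * i + 3]), b3));
        _mm_store_ps(&result[4 * i], row); // whole output row with one store
    }
}
A compiler should fully unroll the 4-iteration loop at -O2, and each _mm_set1_ps from memory becomes a load plus a shufps (or, with AVX, a single vbroadcastss).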
None of the SSE3 or SSE4 instructions are useful for an efficient 4x4 matmul.