https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87077
--- Comment #6 from Richard Biener <rguenth at gcc dot gnu.org> ---
Just to quote: with the inner loop forced to stay not-unrolled we get
<bb 2> [local count: 53687093]:
vect__1.11_14 = MEM <const vector(4) float> [(float *)mtx_12(D)];
vect__2.14_15 = MEM <const vector(4) float> [(float *)vec_13(D)];
vect__3.15_21 = vect__1.11_14 * vect__2.14_15;
_37 = .REDUC_PLUS (vect__3.15_21);
vectp_mtx.10_46 = mtx_12(D) + 32;
vect__1.11_47 = MEM <const vector(4) float> [(float *)vectp_mtx.10_46];
vect__3.15_49 = vect__2.14_15 * vect__1.11_47;
_52 = .REDUC_PLUS (vect__3.15_49);
vectp_mtx.10_61 = mtx_12(D) + 64;
vect__1.11_62 = MEM <const vector(4) float> [(float *)vectp_mtx.10_61];
vect__3.15_64 = vect__2.14_15 * vect__1.11_62;
_67 = .REDUC_PLUS (vect__3.15_64);
vectp_mtx.10_17 = mtx_12(D) + 96;
vect__1.11_5 = MEM <const vector(4) float> [(float *)vectp_mtx.10_17];
vect__3.15_30 = vect__1.11_5 * vect__2.14_15;
_33 = .REDUC_PLUS (vect__3.15_30);
so there are 4 optimally vectorized executions of the inner loop, and then
_27 = {_37, _52, _67, _33};
MEM <vector(4) float> [(float *)&<retval>] = _27;
the BB store vectorized.
This results in
vmovaps (%rdx), %xmm1
vmulps (%rsi), %xmm1, %xmm0
movq %rdi, %rax
vmovhlps %xmm0, %xmm0, %xmm2
vaddps %xmm0, %xmm2, %xmm2
vshufps $85, %xmm2, %xmm2, %xmm0
vaddps %xmm2, %xmm0, %xmm0
vmulps 32(%rsi), %xmm1, %xmm2
vmovhlps %xmm2, %xmm2, %xmm3
vaddps %xmm2, %xmm3, %xmm3
vshufps $85, %xmm3, %xmm3, %xmm2
vaddps %xmm3, %xmm2, %xmm2
vmovaps %xmm2, %xmm3
vmulps 64(%rsi), %xmm1, %xmm2
vunpcklps %xmm3, %xmm0, %xmm0
vmulps 96(%rsi), %xmm1, %xmm1
vmovhlps %xmm2, %xmm2, %xmm4
vaddps %xmm2, %xmm4, %xmm4
vshufps $85, %xmm4, %xmm4, %xmm2
vaddps %xmm4, %xmm2, %xmm2
vmovhlps %xmm1, %xmm1, %xmm4
vaddps %xmm1, %xmm4, %xmm4
vshufps $85, %xmm4, %xmm4, %xmm1
vaddps %xmm4, %xmm1, %xmm1
vunpcklps %xmm1, %xmm2, %xmm2
vmovlhps %xmm2, %xmm0, %xmm0
vmovaps %xmm0, (%rdi)
ret
which I think is quite optimal — using hadd would likely be slower, unless
its built-in permutation handling could be cleverly re-used.