https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93930
Bug ID: 93930
Summary: Unnecessary broadcast instructions for AVX512
Product: gcc
Version: 9.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: fredrik987 at gmail dot com
Target Milestone: ---

Created attachment 47908
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=47908&action=edit
Test case

The code below generates unnecessary broadcast instructions for AVX512 when compiled with "-Ofast -march=skylake-avx512". This occurs for gcc trunk and 9.2/8.3 but not 7.5. Most constants are read from memory via vbroadcastss, except two, which are read as scalars and then broadcast within the loop. For gcc 7.5 all constants are read via vbroadcastss. The problem seems to be more frequent for larger functions.

---
Compiler output for gcc 9.2:
...
.L3:
        vmovaps zmm0, ZMMWORD PTR [rdi]
        add rdi, 64
        vmovaps zmm3, zmm0
        vmovaps zmm1, zmm0
        vmulps zmm2, zmm0, zmm0
        vfmadd132ps zmm3, zmm11, zmm12
        vfmadd132ps zmm1, zmm13, zmm14
        vmovaps zmm4, zmm0
        vfmadd132ps zmm4, zmm7, zmm8
        sub rsi, -128
        vfmadd132ps zmm1, zmm3, zmm2
        vmovaps zmm3, zmm0
        vfmadd132ps zmm3, zmm9, zmm10
        vfmadd132ps zmm3, zmm4, zmm2
        vbroadcastss zmm4, xmm15        <--- Broadcast within loop
        vmulps zmm3, zmm3, zmm1
        vmovaps ZMMWORD PTR [rsi-128], zmm3
        vbroadcastss zmm3, xmm16        <--- Broadcast within loop
        vfmadd132ps zmm3, zmm4, zmm0
        vfmadd132ps zmm0, zmm5, zmm6
        vfmadd132ps zmm0, zmm3, zmm2
        vmulps zmm1, zmm1, zmm0
        vmovaps ZMMWORD PTR [rsi-64], zmm1
        cmp rdi, rax
        jne .L3
...
---
/*
 * Test case for the report: three small polynomial kernels (f, g, h) that
 * each broadcast four scalar constants with _mm512_set1_ps, plus a loop
 * (test) that uses all three. Twelve distinct constants are live in the
 * loop in total.
 *
 * The statements are intentionally left exactly as reported; only layout
 * and comments were added, since the issue concerns the code GCC generates
 * for this specific input.
 */
#include <immintrin.h>

/*
 * Evaluates, per lane, y * (x * a + b) + (x * c + d) with y = x * x and the
 * broadcast constants a = 11, b = 12, c = 13, d = 14.
 * (_mm512_fmadd_ps(p, q, r) computes p * q + r.)
 */
static __m512 f(__m512 x)
{
    __m512 a = _mm512_set1_ps(11);
    __m512 b = _mm512_set1_ps(12);
    __m512 c = _mm512_set1_ps(13);
    __m512 d = _mm512_set1_ps(14);
    __m512 y = _mm512_mul_ps(x, x);
    return _mm512_fmadd_ps(y, _mm512_fmadd_ps(x, a, b), _mm512_fmadd_ps(x, c, d));
}

/* Same expression shape as f, with constants a = 21, b = 22, c = 23, d = 24. */
static __m512 g(__m512 x)
{
    __m512 a = _mm512_set1_ps(21);
    __m512 b = _mm512_set1_ps(22);
    __m512 c = _mm512_set1_ps(23);
    __m512 d = _mm512_set1_ps(24);
    __m512 y = _mm512_mul_ps(x, x);
    return _mm512_fmadd_ps(y, _mm512_fmadd_ps(x, a, b), _mm512_fmadd_ps(x, c, d));
}

/* Same expression shape as f, with constants a = 31, b = 32, c = 33, d = 34. */
static __m512 h(__m512 x)
{
    __m512 a = _mm512_set1_ps(31);
    __m512 b = _mm512_set1_ps(32);
    __m512 c = _mm512_set1_ps(33);
    __m512 d = _mm512_set1_ps(34);
    __m512 y = _mm512_mul_ps(x, x);
    return _mm512_fmadd_ps(y, _mm512_fmadd_ps(x, a, b), _mm512_fmadd_ps(x, c, d));
}

/*
 * Loop under test.
 *
 * For each of the n input vectors read from x, stores two result vectors to
 * y: f(u) * h(u) followed by g(u) * h(u). The output therefore advances by
 * two vectors per iteration, so y is written at 2 * n consecutive __m512
 * slots.
 *
 * x: input vectors, read through a __m512 pointer and advanced once per
 *    iteration.
 * y: output vectors, advanced twice per iteration.
 * n: number of input vectors; the loop body does not run when n <= 0.
 */
void test(__m512 *x, __m512 *y, int n)
{
    for (int i = 0; i < n; i++) {
        __m512 u = *x++;
        /* h(u) is shared by both stored products. */
        __m512 v = h(u);
        *y++ = _mm512_mul_ps(f(u), v);
        *y++ = _mm512_mul_ps(g(u), v);
    }
}