https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111874
--- Comment #1 from Hongtao.liu <crazylht at gmail dot com> ---
For the integer case, we have _mm512_mask_reduce_add_epi32 defined as:
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_add_epi32 (__mmask16 __U, __m512i __A)
{
  __A = _mm512_maskz_mov_epi32 (__U, __A);
  __MM512_REDUCE_OP (+);
}
#undef __MM512_REDUCE_OP
#define __MM512_REDUCE_OP(op) \
  __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1); \
  __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0); \
  __m256i __T3 = (__m256i) (__T1 op __T2); \
  __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1); \
  __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0); \
  __v4si __T6 = __T4 op __T5; \
  __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); \
  __v4si __T8 = __T6 op __T7; \
  return __T8[0] op __T8[1]
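So masked-off lanes are zeroed by _mm512_maskz_mov_epi32 first, and the reduction then halves the vector step by step (512 -> 256 -> 128 bits, then a lane shuffle, then two scalar elements). As a quick sanity check (my own illustration, not from the header; needs -mavx512f and an AVX512F machine), the result should match a scalar loop that simply skips the masked-off lanes, since order of adds doesn't matter for integers:

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  int src[16];
  for (int i = 0; i < 16; i++)
    src[i] = i + 1;

  __m512i v = _mm512_loadu_si512 (src);
  __mmask16 m = 0x00FF;                /* keep only the low 8 lanes */

  int vec = _mm512_mask_reduce_add_epi32 (m, v);

  int scalar = 0;
  for (int i = 0; i < 16; i++)
    if (m & (1 << i))
      scalar += src[i];

  /* Both should print 36 here.  */
  printf ("vec=%d scalar=%d\n", vec, scalar);
  return vec != scalar;
}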
There's a corresponding floating-point version, but it doesn't do in-order adds.
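For comparison, a masked in-order float reduction would have to add the kept lanes strictly in ascending lane order, so the rounding matches a sequential scalar loop. A minimal sketch of that (my own illustration, masked_reduce_add_ps_inorder is a made-up name, not something in the header):

#include <immintrin.h>

static inline float
masked_reduce_add_ps_inorder (__mmask16 m, __m512 v)
{
  float lanes[16];
  _mm512_storeu_ps (lanes, v);

  float sum = 0.0f;
  for (int i = 0; i < 16; i++)
    if (m & (1u << i))
      sum += lanes[i];        /* strictly left-to-right adds */
  return sum;
}

_mm512_mask_reduce_add_ps instead reduces by halving, like the integer macro above, so its rounding can differ from such a sequential loop.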