https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120138
Bug ID: 120138
Summary: (14/15/16) -Wmaybe-uninitialized triggered after specific optimizations
Product: gcc
Version: 16.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c++
Assignee: unassigned at gcc dot gnu.org
Reporter: tcpreimesberger at gmail dot com
Target Milestone: ---

Created attachment 61344
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=61344&action=edit
preprocessed source, compressed

https://godbolt.org/z/vhKa5sh6e

This is reduced from an fbgemm kernel.

The attached code triggers the tree-uninit1 warning at -O3, but not at -O1. In particular, the optimization options -O1 -fenable-tree-cunrolli -fenable-tree-threadfull1 -fenable-tree-uninit1 trigger the uninit1 warning. This does not happen on gcc 13.3.

The preprocessed source is attached (and is much longer).

Reduced testcase:

#include <immintrin.h>
#include <cstdint>

template <typename T>
void transpose_avx512_contiguous_thin(
    const int64_t M,
    const T* src,
    int64_t ld_src,
    T* dst,
    int64_t ld_dst);

static inline void transpose_contiguous_16x2_block(
    const float* src,
    float* dst,
    int64_t ld_dst,
    int mrem = 16) {
  __m512i r[2], d[2];
  int i = 0;
  for (; (i + 1) * 16 <= mrem * 2; i++) {
    // normal load
    r[i] = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src + i * 16));
  }
  if (i * 16 < mrem * 2) {
    __mmask16 mask_mrem_v = (1ULL << (mrem * 2 - i * 16)) - 1;
    r[i] = _mm512_maskz_loadu_epi32(mask_mrem_v, src + i * 16);
  }
  // transpose
  __m512i index1 = _mm512_set_epi32(
      0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10,
      0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00);
  __m512i index2 = _mm512_set_epi32(
      0x1f, 0x1d, 0x1b, 0x19, 0x17, 0x15, 0x13, 0x11,
      0x0f, 0x0d, 0x0b, 0x09, 0x07, 0x05, 0x03, 0x01);
  // a0--p0
  // a1--p1
  d[0] = _mm512_permutex2var_epi32(r[0], index1, r[1]);
  d[1] = _mm512_permutex2var_epi32(r[0], index2, r[1]);
  // store
  if (mrem < 16) {
    __mmask16 mask_rem_v = (1ULL << mrem) - 1;
    // mask store
    _mm512_mask_storeu_epi32(dst, mask_rem_v, d[0]);
    _mm512_mask_storeu_epi32(dst + ld_dst, mask_rem_v, d[1]);
  }
  else {
    // normal store
    _mm512_storeu_si512(dst, d[0]);
    _mm512_storeu_si512(dst + ld_dst, d[1]);
  }
}

template <>
void transpose_avx512_contiguous_thin(
    int64_t M,
    const float* src,
    int64_t ld_src,
    float* dst,
    int64_t ld_dst) {
  int64_t i = 0;
  int mrem = M - i;
  if (mrem > 0) {
    transpose_contiguous_16x2_block(src + i * ld_src, dst + i, ld_dst, mrem);
  }
}