https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120138

            Bug ID: 120138
           Summary: (14/15/16) -Wmaybe-uninitialized triggered after
                    specific optimizations
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tcpreimesberger at gmail dot com
  Target Milestone: ---

Created attachment 61344
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=61344&action=edit
preprocessed source, compressed

https://godbolt.org/z/vhKa5sh6e

This is reduced from an fbgemm kernel.

The attached code triggers the tree-uninit1 warning at -O3, but not at -O1. In
particular, the optimization options

-O1 -fenable-tree-cunrolli -fenable-tree-threadfull1 -fenable-tree-uninit1

trigger the uninit1 warning.

The warning does not appear with gcc 13.3.

Preprocessed source is attached (and much longer...)

Reduced testcase:

#include <immintrin.h>
#include <cstdint>

// Primary template: declaration only. No generic definition is visible in
// this testcase; the only definition shown is the explicit specialization for
// T = float further down.
template <typename T>
void transpose_avx512_contiguous_thin(
    const int64_t M,
    const T* src,
    int64_t ld_src,
    T* dst,
    int64_t ld_dst);

// Reads 2 * mrem consecutive 32-bit elements from `src` into up to two
// 512-bit vectors, separates the even-indexed and odd-indexed elements, and
// writes the even ones to `dst` and the odd ones to `dst + ld_dst`.
//
// src:    source elements, read with unaligned (optionally masked) loads.
// dst:    destination of the even-indexed elements.
// ld_dst: element offset from `dst` at which the odd-indexed elements go.
// mrem:   number of 32-bit lanes written per output vector when < 16;
//         otherwise all 16 lanes are written. Defaults to 16.
//
// NOTE(review): `r` has only two slots, so mrem > 16 would make the code
// below write r[2]; presumably callers keep mrem in [1, 16] -- confirm.
static inline void transpose_contiguous_16x2_block(
    const float* src,
    float* dst,
    int64_t ld_dst,
    int mrem = 16) {
  // r: loaded input vectors; d: permuted output vectors. Neither array is
  // initialized at declaration.
  __m512i r[2], d[2];
  int i = 0;
  // Load every full 16-element vector that fits within the 2 * mrem inputs.
  for (; (i + 1) * 16 <= mrem * 2; i++) {
    // normal load
    r[i] = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src + i * 16));
  }
  // Partial tail: load the remaining (mrem * 2 - i * 16) elements with a
  // zeroing masked load, so lanes beyond the tail become 0.
  if (i * 16 < mrem * 2) {
    __mmask16 mask_mrem_v = (1ULL << (mrem * 2 - i * 16)) - 1;
    r[i] = _mm512_maskz_loadu_epi32(mask_mrem_v, src + i * 16);
  }
  // transpose
  // index1 selects the even positions 0x00..0x1e of the 32-element
  // concatenation of the two permute sources (set_epi32 lists the highest
  // lane first).
  __m512i index1 = _mm512_set_epi32(
      0x1e,
      0x1c,
      0x1a,
      0x18,
      0x16,
      0x14,
      0x12,
      0x10,
      0x0e,
      0x0c,
      0x0a,
      0x08,
      0x06,
      0x04,
      0x02,
      0x00);
  // index2 selects the odd positions 0x01..0x1f.
  __m512i index2 = _mm512_set_epi32(
      0x1f,
      0x1d,
      0x1b,
      0x19,
      0x17,
      0x15,
      0x13,
      0x11,
      0x0f,
      0x0d,
      0x0b,
      0x09,
      0x07,
      0x05,
      0x03,
      0x01);

  // a0--p0
  // a1--p1
  // NOTE(review): when mrem <= 8 neither the loop nor the masked tail load
  // above writes r[1], yet it is read here. Output lanes 8..15 are the ones
  // sourced from r[1], and the masked store below (low mrem bits only) does
  // not write them, so the stored result does not depend on r[1] in that case.
  d[0] = _mm512_permutex2var_epi32(r[0], index1, r[1]);
  d[1] = _mm512_permutex2var_epi32(r[0], index2, r[1]);

  // store
  if (mrem < 16) {
    // Only the low mrem lanes of each output vector are written.
    __mmask16 mask_rem_v = (1ULL << mrem) - 1;
    // mask store
    _mm512_mask_storeu_epi32(dst, mask_rem_v, d[0]);
    _mm512_mask_storeu_epi32(dst + ld_dst, mask_rem_v, d[1]);
  } else {
    // normal store
    _mm512_storeu_si512(dst, d[0]);
    _mm512_storeu_si512(dst + ld_dst, d[1]);
  }
}

// Explicit specialization for float. In this reduced form it makes at most
// one call to transpose_contiguous_16x2_block, passing M (narrowed to int) as
// the remainder count, and does nothing when that count is not positive.
//
// NOTE(review): M is not clamped before the call, so a value above 16 is
// forwarded as mrem unchanged -- confirm this matches the intended contract.
template <>
void transpose_avx512_contiguous_thin(
    int64_t M,
    const float* src,
    int64_t ld_src,
    float* dst,
    int64_t ld_dst) {
    // i is always 0 here, so src + i * ld_src == src and dst + i == dst.
    int64_t i = 0;
    int mrem = M - i;
    if (mrem > 0) {
      transpose_contiguous_16x2_block(src + i * ld_src, dst + i, ld_dst, mrem);
  }
}

Reply via email to