https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100267
Bug ID: 100267 Summary: gcc -O2 for avx512 intrinsics generates extra warnings and fewer optimizations Product: gcc Version: 10.2.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: middle-end Assignee: unassigned at gcc dot gnu.org Reporter: konstantin.ananyev at intel dot com Target Milestone: --- The code snippet below compiles ok with '-O2' for gcc-9. But with gcc-10 (and gcc-11) it generates -Wuninitialized warnings. Another thing (which is probably worse) is that 'gcc-10 -O2' generates code with unnecessary loads for ymm registers from the uninitialized portion of the stack. As I understand it, that's where these -Wuninitialized warnings come from: for some reason gcc-10 wants to put the local '__m256i pdata[2]' variable on the stack. Note that only '-O2' is affected; '-O3' looks good for all versions I tried (gcc-9, gcc-10, gcc-11). ===================== $ cat tavx512u5.c #include <stddef.h> #include <stdint.h> #include <x86intrin.h> struct flow_avx512 { uint32_t num_packets; uint32_t total_packets; const uint8_t **idata; }; static inline void start_flow_avx512x8(const struct flow_avx512 *flow, uint32_t num, uint32_t msk, __m256i pdata[2]) { uint32_t n, m[2], nm[2]; __m256i nd[2]; m[0] = msk & 0xF; m[1] = msk >> 4; n = __builtin_popcount(m[0]); nm[0] = (1 << n) - 1; nm[1] = (1 << (num - n)) - 1; nd[0] = _mm256_maskz_loadu_epi64(nm[0], flow->idata + flow->num_packets); nd[1] = _mm256_maskz_loadu_epi64(nm[1], flow->idata + flow->num_packets + n); pdata[0] = _mm256_mask_expand_epi64(pdata[0], m[0], nd[0]); pdata[1] = _mm256_mask_expand_epi64(pdata[1], m[1], nd[1]); } __m256i dummyf1_avx512x8(const struct flow_avx512 *flow) { __m256i pdata[2]; start_flow_avx512x8(flow, 8, 0xFF, pdata); return _mm256_add_epi64(pdata[0], pdata[1]); } ==================== Good version (gcc-9) first: $ gcc-9 -m64 -mavx512f -mavx512vl -mavx512cd -mavx512bw -Wall -O2 -o tavx512u5.gcc9-O2.o -c tavx512u5.c $ objdump -d tavx512u5.gcc9-O2.o tavx512u5.gcc9-O2.o: file format elf64-x86-64 
Disassembly of section .text: 0000000000000000 <dummyf1_avx512x8>: 0: f3 0f 1e fa endbr64 4: 8b 17 mov (%rdi),%edx 6: 48 8b 47 08 mov 0x8(%rdi),%rax a: b9 0f 00 00 00 mov $0xf,%ecx f: c5 f8 92 c9 kmovw %ecx,%k1 13: 62 f2 fd a9 89 0c d0 vpexpandq (%rax,%rdx,8),%ymm1{%k1}{z} 1a: 62 f2 fd a9 89 44 d0 vpexpandq 0x20(%rax,%rdx,8),%ymm0{%k1}{z} 21: 04 22: c5 f5 d4 c0 vpaddq %ymm0,%ymm1,%ymm0 26: c3 retq ======================= Now gcc-10: $ gcc-10 -m64 -mavx512f -mavx512vl -mavx512cd -mavx512bw -Wall -O2 -o tavx512u5.gcc10-O2.o -c tavx512u5.c tavx512u5.c: In function ‘dummyf1_avx512x8’: tavx512u5.c:32:13: warning: ‘pdata’ is used uninitialized in this function [-Wuninitialized] 32 | pdata[0] = _mm256_mask_expand_epi64(pdata[0], m[0], nd[0]); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ tavx512u5.c:33:13: warning: ‘*((void *)&pdata+32)’ is used uninitialized in this function [-Wuninitialized] 33 | pdata[1] = _mm256_mask_expand_epi64(pdata[1], m[1], nd[1]); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ $ objdump -d tavx512u5.gcc10-O2.o tavx512u5.gcc10-O2.o: file format elf64-x86-64 0000000000000000 <dummyf1_avx512x8>: 0: f3 0f 1e fa endbr64 4: 55 push %rbp 5: b9 0f 00 00 00 mov $0xf,%ecx a: c5 f8 92 c9 kmovw %ecx,%k1 e: 48 89 e5 mov %rsp,%rbp 11: 48 83 e4 e0 and $0xffffffffffffffe0,%rsp 15: 48 83 ec 60 sub $0x60,%rsp 19: 8b 17 mov (%rdi),%edx 1b: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax 22: 00 00 24: 48 89 44 24 58 mov %rax,0x58(%rsp) 29: 31 c0 xor %eax,%eax 2b: 48 8b 47 08 mov 0x8(%rdi),%rax 2f: c5 fd 6f 04 24 vmovdqa (%rsp),%ymm0 <=== load uninit data 34: c5 fd 6f 4c 24 20 vmovdqa 0x20(%rsp),%ymm1 <=== from stack 3a: 62 f2 fd 29 89 04 d0 vpexpandq (%rax,%rdx,8),%ymm0{%k1} 41: 62 f2 fd 29 89 4c d0 vpexpandq 0x20(%rax,%rdx,8),%ymm1{%k1} 48: 04 49: c5 fd d4 c1 vpaddq %ymm1,%ymm0,%ymm0 4d: 48 8b 44 24 58 mov 0x58(%rsp),%rax 52: 64 48 2b 04 25 28 00 sub %fs:0x28,%rax 59: 00 00 5b: 75 02 jne 5f <dummyf1_avx512x8+0x5f> 5d: c9 leaveq 5e: c3 retq 5f: c5 f8 77 vzeroupper 62: 
e8 00 00 00 00 callq 67 <dummyf1_avx512x8+0x67> ================ Running gcc-10 with -fdump-tree-optimized shows a similar picture (as far as I can understand, it wants to put pdata[2] on the stack): $ cat tavx512u5.gcc10-O2.optimized ;; Function dummyf1_avx512x8 (dummyf1_avx512x8, funcdef_no=5593, decl_uid=32966, cgraph_uid=5594, symbol_order=5593) dummyf1_avx512x8 (const struct flow_avx512 * flow) { __m256i pdata[2]; vector(4) long long unsigned int _6; vector(4) long long unsigned int _8; vector(4) long long unsigned int _9; vector(4) long long int _10; const uint8_t * * _22; unsigned int _23; long unsigned int _24; long unsigned int _25; const uint8_t * * _26; vector(4) long long int _29; const uint8_t * * _30; unsigned int _31; sizetype _32; sizetype _34; sizetype _35; const uint8_t * * _36; vector(4) long long int _39; vector(4) long long int _41; vector(4) long long int _42; vector(4) long long int _45; vector(4) long long int _46; <bb 2> [local count: 1073741824]: _22 = flow_4(D)->idata; _23 = flow_4(D)->num_packets; _24 = (long unsigned int) _23; _25 = _24 * 8; _26 = _22 + _25; _29 = __builtin_ia32_loaddqudi256_mask (_26, { 0, 0, 0, 0 }, 15); _30 = flow_4(D)->idata; _31 = flow_4(D)->num_packets; _32 = (sizetype) _31; _34 = _32 + 4; _35 = _34 * 8; _36 = _30 + _35; _39 = __builtin_ia32_loaddqudi256_mask (_36, { 0, 0, 0, 0 }, 15); _41 = MEM[(__m256i * {ref-all})&pdata]; _42 = __builtin_ia32_expanddi256_mask (_29, _41, 15); _45 = MEM[(__m256i * {ref-all})&pdata + 32B]; _46 = __builtin_ia32_expanddi256_mask (_39, _45, 15); _6 = VIEW_CONVERT_EXPR<vector(4) long long unsigned int>(_42); _8 = VIEW_CONVERT_EXPR<vector(4) long long unsigned int>(_46); _9 = _6 + _8; _10 = VIEW_CONVERT_EXPR<__m256i>(_9); pdata ={v} {CLOBBER}; return _10; } ========================= While gcc-9: $ cat tavx512u5.gcc9-O2.optimized ;; Function dummyf1_avx512x8 (dummyf1_avx512x8, funcdef_no=5525, decl_uid=32562, cgraph_uid=5526, symbol_order=5525) dummyf1_avx512x8 (const struct flow_avx512 * flow) { 
vector(4) long long int pdata$32; vector(4) long long int pdata; vector(4) long long unsigned int _3; vector(4) long long unsigned int _5; vector(4) long long unsigned int _6; vector(4) long long int _7; const uint8_t * * _9; unsigned int _10; long unsigned int _11; long unsigned int _12; const uint8_t * * _13; vector(4) long long int _14; const uint8_t * * _15; unsigned int _16; sizetype _17; sizetype _18; sizetype _19; const uint8_t * * _20; vector(4) long long int _21; vector(4) long long int _22; vector(4) long long int _23; <bb 2> [local count: 1073741824]: _9 = MEM[(const uint8_t * * const *)flow_2(D) + 8B]; _10 = MEM[(const uint32_t *)flow_2(D)]; _11 = (long unsigned int) _10; _12 = _11 * 8; _13 = _9 + _12; _14 = __builtin_ia32_loaddqudi256_mask (_13, { 0, 0, 0, 0 }, 15); _15 = MEM[(const uint8_t * * const *)flow_2(D) + 8B]; _16 = MEM[(const uint32_t *)flow_2(D)]; _17 = (sizetype) _16; _18 = _17 + 4; _19 = _18 * 8; _20 = _15 + _19; _21 = __builtin_ia32_loaddqudi256_mask (_20, { 0, 0, 0, 0 }, 15); _22 = __builtin_ia32_expanddi256_mask (_14, pdata_4(D), 15); _23 = __builtin_ia32_expanddi256_mask (_21, pdata$32_8(D), 15); _3 = VIEW_CONVERT_EXPR<vector(4) long long unsigned int>(_22); _5 = VIEW_CONVERT_EXPR<vector(4) long long unsigned int>(_23); _6 = _3 + _5; _7 = VIEW_CONVERT_EXPR<__m256i>(_6); return _7; }