https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100267
Bug ID: 100267
Summary: gcc -O2 for avx512 intrinsics generates extra warnings
and fewer optimizations
Product: gcc
Version: 10.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: konstantin.ananyev at intel dot com
Target Milestone: ---
The code snippet below compiles ok with '-O2' for gcc-9.
But with gcc-10 (and gcc-11) it generates -Wuninitialized warnings.
Another thing (which is probably worse): 'gcc-10 -O2' generates code with
unnecessary loads into ymm registers from the uninitialized portion of the stack.
As I understand it, that's where these -Wuninitialized warnings come from:
for some reason gcc-10 wants to put the local '__m256i pdata[2]' variable
on the stack.
Note that only '-O2' is affected; '-O3' looks good for all versions I tried
(gcc-9, gcc-10, gcc-11).
=====================
$ cat tavx512u5.c
#include <stddef.h>
#include <stdint.h>
#include <x86intrin.h>
/* Flow state read by start_flow_avx512x8() in this test case. */
struct flow_avx512 {
uint32_t num_packets; /* element offset into idata where the masked loads start */
uint32_t total_packets; /* not referenced by the code in this snippet */
const uint8_t **idata; /* array of byte pointers; its entries are loaded as 64-bit lanes */
};
/*
 * Load 64-bit entries from flow->idata, starting at index
 * flow->num_packets, and expand them into the lanes of pdata[0] and
 * pdata[1] that are selected by msk.
 *
 * flow:  supplies the idata array and the num_packets start index.
 * num:   total number of entries to load across both vectors
 *        (n for the first vector, num - n for the second).
 * msk:   lane mask; bits 0-3 select lanes of pdata[0], the bits above
 *        them select lanes of pdata[1].
 * pdata: in/out pair of vectors. Each element is also passed as the merge
 *        source of its expand, so the array is read as well as written.
 */
static inline void
start_flow_avx512x8(const struct flow_avx512 *flow, uint32_t num,
uint32_t msk, __m256i pdata[2])
{
uint32_t n, m[2], nm[2];
__m256i nd[2];
m[0] = msk & 0xF; /* low four mask bits: lanes of pdata[0] */
m[1] = msk >> 4; /* remaining mask bits: lanes of pdata[1] */
n = __builtin_popcount(m[0]); /* number of entries destined for pdata[0] */
nm[0] = (1 << n) - 1; /* low n bits set: load mask for the first vector */
/* low (num - n) bits set. NOTE(review): assumes num >= n -- confirm with callers */
nm[1] = (1 << (num - n)) - 1;
/* Zero-masking loads: n entries at idata[num_packets], then the next
 * (num - n) entries starting n elements further on. */
nd[0] = _mm256_maskz_loadu_epi64(nm[0],
flow->idata + flow->num_packets);
nd[1] = _mm256_maskz_loadu_epi64(nm[1],
flow->idata + flow->num_packets + n);
/* Expand the contiguously loaded entries into the lanes chosen by m[];
 * the previous pdata[] value is the merge source for the other lanes. */
pdata[0] = _mm256_mask_expand_epi64(pdata[0], m[0], nd[0]);
pdata[1] = _mm256_mask_expand_epi64(pdata[1], m[1], nd[1]);
}
/*
 * Reproducer entry point: runs start_flow_avx512x8() with num = 8 and
 * msk = 0xFF on a local pdata[2], then returns the 64-bit lane-wise sum
 * of the two resulting vectors.
 */
__m256i
dummyf1_avx512x8(const struct flow_avx512 *flow)
{
__m256i pdata[2]; /* not initialized before being handed to the helper */
/* msk = 0xFF yields m[0] = m[1] = 0xF inside the helper, i.e. all four
 * lanes of each vector are selected by the expand masks. */
start_flow_avx512x8(flow, 8, 0xFF, pdata);
return _mm256_add_epi64(pdata[0], pdata[1]);
}
====================
Good version (gcc-9) first:
gcc-9 -m64 -mavx512f -mavx512vl -mavx512cd -mavx512bw -Wall -O2 -o
tavx512u5.gcc9-O2.o -c tavx512u5.c
$ objdump -d tavx512u5.gcc9-O2.o
tavx512u5.gcc9-O2.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <dummyf1_avx512x8>:
0: f3 0f 1e fa endbr64
4: 8b 17 mov (%rdi),%edx
6: 48 8b 47 08 mov 0x8(%rdi),%rax
a: b9 0f 00 00 00 mov $0xf,%ecx
f: c5 f8 92 c9 kmovw %ecx,%k1
13: 62 f2 fd a9 89 0c d0 vpexpandq (%rax,%rdx,8),%ymm1{%k1}{z}
1a: 62 f2 fd a9 89 44 d0 vpexpandq 0x20(%rax,%rdx,8),%ymm0{%k1}{z}
21: 04
22: c5 f5 d4 c0 vpaddq %ymm0,%ymm1,%ymm0
26: c3 retq
=======================
Now gcc-10:
$ gcc-10 -m64 -mavx512f -mavx512vl -mavx512cd -mavx512bw -Wall -O2 -o
tavx512u5.gcc10-O2.o -c tavx512u5.c
tavx512u5.c: In function ‘dummyf1_avx512x8’:
tavx512u5.c:32:13: warning: ‘pdata’ is used uninitialized in this function
[-Wuninitialized]
32 | pdata[0] = _mm256_mask_expand_epi64(pdata[0], m[0], nd[0]);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
tavx512u5.c:33:13: warning: ‘*((void *)&pdata+32)’ is used uninitialized in
this function [-Wuninitialized]
33 | pdata[1] = _mm256_mask_expand_epi64(pdata[1], m[1], nd[1]);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
$ objdump -d tavx512u5.gcc10-O2.o
tavx512u5.gcc10-O2.o: file format elf64-x86-64
0000000000000000 <dummyf1_avx512x8>:
0: f3 0f 1e fa endbr64
4: 55 push %rbp
5: b9 0f 00 00 00 mov $0xf,%ecx
a: c5 f8 92 c9 kmovw %ecx,%k1
e: 48 89 e5 mov %rsp,%rbp
11: 48 83 e4 e0 and $0xffffffffffffffe0,%rsp
15: 48 83 ec 60 sub $0x60,%rsp
19: 8b 17 mov (%rdi),%edx
1b: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax
22: 00 00
24: 48 89 44 24 58 mov %rax,0x58(%rsp)
29: 31 c0 xor %eax,%eax
2b: 48 8b 47 08 mov 0x8(%rdi),%rax
2f: c5 fd 6f 04 24 vmovdqa (%rsp),%ymm0 <=== load uninit data
34: c5 fd 6f 4c 24 20 vmovdqa 0x20(%rsp),%ymm1 <=== from stack
3a: 62 f2 fd 29 89 04 d0 vpexpandq (%rax,%rdx,8),%ymm0{%k1}
41: 62 f2 fd 29 89 4c d0 vpexpandq 0x20(%rax,%rdx,8),%ymm1{%k1}
48: 04
49: c5 fd d4 c1 vpaddq %ymm1,%ymm0,%ymm0
4d: 48 8b 44 24 58 mov 0x58(%rsp),%rax
52: 64 48 2b 04 25 28 00 sub %fs:0x28,%rax
59: 00 00
5b: 75 02 jne 5f <dummyf1_avx512x8+0x5f>
5d: c9 leaveq
5e: c3 retq
5f: c5 f8 77 vzeroupper
62: e8 00 00 00 00 callq 67 <dummyf1_avx512x8+0x67>
================
Running gcc-10 with -fdump-tree-optimized shows a similar picture
(as far as I can understand, it wants to put pdata[2] on the stack):
$ cat tavx512u5.gcc10-O2.optimized
;; Function dummyf1_avx512x8 (dummyf1_avx512x8, funcdef_no=5593,
decl_uid=32966, cgraph_uid=5594, symbol_order=5593)
dummyf1_avx512x8 (const struct flow_avx512 * flow)
{
__m256i pdata[2];
vector(4) long long unsigned int _6;
vector(4) long long unsigned int _8;
vector(4) long long unsigned int _9;
vector(4) long long int _10;
const uint8_t * * _22;
unsigned int _23;
long unsigned int _24;
long unsigned int _25;
const uint8_t * * _26;
vector(4) long long int _29;
const uint8_t * * _30;
unsigned int _31;
sizetype _32;
sizetype _34;
sizetype _35;
const uint8_t * * _36;
vector(4) long long int _39;
vector(4) long long int _41;
vector(4) long long int _42;
vector(4) long long int _45;
vector(4) long long int _46;
<bb 2> [local count: 1073741824]:
_22 = flow_4(D)->idata;
_23 = flow_4(D)->num_packets;
_24 = (long unsigned int) _23;
_25 = _24 * 8;
_26 = _22 + _25;
_29 = __builtin_ia32_loaddqudi256_mask (_26, { 0, 0, 0, 0 }, 15);
_30 = flow_4(D)->idata;
_31 = flow_4(D)->num_packets;
_32 = (sizetype) _31;
_34 = _32 + 4;
_35 = _34 * 8;
_36 = _30 + _35;
_39 = __builtin_ia32_loaddqudi256_mask (_36, { 0, 0, 0, 0 }, 15);
_41 = MEM[(__m256i * {ref-all})&pdata];
_42 = __builtin_ia32_expanddi256_mask (_29, _41, 15);
_45 = MEM[(__m256i * {ref-all})&pdata + 32B];
_46 = __builtin_ia32_expanddi256_mask (_39, _45, 15);
_6 = VIEW_CONVERT_EXPR<vector(4) long long unsigned int>(_42);
_8 = VIEW_CONVERT_EXPR<vector(4) long long unsigned int>(_46);
_9 = _6 + _8;
_10 = VIEW_CONVERT_EXPR<__m256i>(_9);
pdata ={v} {CLOBBER};
return _10;
}
=========================
While gcc-9:
$ cat tavx512u5.gcc9-O2.optimized
;; Function dummyf1_avx512x8 (dummyf1_avx512x8, funcdef_no=5525,
decl_uid=32562, cgraph_uid=5526, symbol_order=5525)
dummyf1_avx512x8 (const struct flow_avx512 * flow)
{
vector(4) long long int pdata$32;
vector(4) long long int pdata;
vector(4) long long unsigned int _3;
vector(4) long long unsigned int _5;
vector(4) long long unsigned int _6;
vector(4) long long int _7;
const uint8_t * * _9;
unsigned int _10;
long unsigned int _11;
long unsigned int _12;
const uint8_t * * _13;
vector(4) long long int _14;
const uint8_t * * _15;
unsigned int _16;
sizetype _17;
sizetype _18;
sizetype _19;
const uint8_t * * _20;
vector(4) long long int _21;
vector(4) long long int _22;
vector(4) long long int _23;
<bb 2> [local count: 1073741824]:
_9 = MEM[(const uint8_t * * const *)flow_2(D) + 8B];
_10 = MEM[(const uint32_t *)flow_2(D)];
_11 = (long unsigned int) _10;
_12 = _11 * 8;
_13 = _9 + _12;
_14 = __builtin_ia32_loaddqudi256_mask (_13, { 0, 0, 0, 0 }, 15);
_15 = MEM[(const uint8_t * * const *)flow_2(D) + 8B];
_16 = MEM[(const uint32_t *)flow_2(D)];
_17 = (sizetype) _16;
_18 = _17 + 4;
_19 = _18 * 8;
_20 = _15 + _19;
_21 = __builtin_ia32_loaddqudi256_mask (_20, { 0, 0, 0, 0 }, 15);
_22 = __builtin_ia32_expanddi256_mask (_14, pdata_4(D), 15);
_23 = __builtin_ia32_expanddi256_mask (_21, pdata$32_8(D), 15);
_3 = VIEW_CONVERT_EXPR<vector(4) long long unsigned int>(_22);
_5 = VIEW_CONVERT_EXPR<vector(4) long long unsigned int>(_23);
_6 = _3 + _5;
_7 = VIEW_CONVERT_EXPR<__m256i>(_6);
return _7;
}