https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99397
Bug ID: 99397 Summary: s152 benchmark of TSVC is vectorized by clang and not by gcc Product: gcc Version: 11.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: middle-end Assignee: unassigned at gcc dot gnu.org Reporter: hubicka at gcc dot gnu.org Target Milestone: --- s152 is: void s152s(real_t a[LEN_1D], real_t b[LEN_1D], real_t c[LEN_1D], int i) { a[i] += b[i] * c[i]; } real_t s152(struct args_t * func_args) { // interprocedural data flow analysis // collecting information from a subroutine initialise_arrays(__func__); gettimeofday(&func_args->t1, NULL); for (int nl = 0; nl < iterations; nl++) { for (int i = 0; i < LEN_1D; i++) { b[i] = d[i] * e[i]; s152s(a, b, c, i); } dummy(a, b, c, d, e, aa, bb, cc, 0.); } gettimeofday(&func_args->t2, NULL); return calc_checksum(__func__); } and clang11 vectorizes it as: 00000000004048b0 <s152>: 4048b0: 41 56 push %r14 4048b2: 53 push %rbx 4048b3: 50 push %rax 4048b4: 49 89 fe mov %rdi,%r14 4048b7: bf b7 e1 42 00 mov $0x42e1b7,%edi 4048bc: e8 4f 2d 01 00 call 417610 <initialise_arrays> 4048c1: 31 db xor %ebx,%ebx 4048c3: 4c 89 f7 mov %r14,%rdi 4048c6: 31 f6 xor %esi,%esi 4048c8: e8 93 c7 ff ff call 401060 <gettimeofday@plt> 4048cd: 0f 1f 00 nopl (%rax) 4048d0: 31 c0 xor %eax,%eax 4048d2: 66 2e 0f 1f 84 00 00 cs nopw 0x0(%rax,%rax,1) 4048d9: 00 00 00 4048dc: 0f 1f 40 00 nopl 0x0(%rax) 4048e0: c5 fc 28 80 00 01 4b vmovaps 0x4b0100(%rax),%ymm0 4048e7: 00 4048e8: c5 fc 28 88 20 01 4b vmovaps 0x4b0120(%rax),%ymm1 4048ef: 00 4048f0: c5 fc 59 80 00 0d 49 vmulps 0x490d00(%rax),%ymm0,%ymm0 4048f7: 00 4048f8: c5 f4 59 88 20 0d 49 vmulps 0x490d20(%rax),%ymm1,%ymm1 4048ff: 00 404900: c5 fc 29 80 00 31 43 vmovaps %ymm0,0x433100(%rax) 404907: 00 404908: c5 fc 29 88 20 31 43 vmovaps %ymm1,0x433120(%rax) 40490f: 00 404910: c5 fc 28 90 00 19 47 vmovaps 0x471900(%rax),%ymm2 404917: 00 404918: c5 fc 28 98 20 19 47 vmovaps 0x471920(%rax),%ymm3 40491f: 00 404920: c4 e2 7d a8 90 00 25 vfmadd213ps 0x452500(%rax),%ymm0,%ymm2 404927: 45 
00 404929: c4 e2 75 a8 98 20 25 vfmadd213ps 0x452520(%rax),%ymm1,%ymm3 404930: 45 00 404932: c5 fc 29 90 00 25 45 vmovaps %ymm2,0x452500(%rax) 404939: 00 40493a: c5 fc 29 98 20 25 45 vmovaps %ymm3,0x452520(%rax) 404941: 00 404942: 48 83 c0 40 add $0x40,%rax 404946: 48 3d 00 f4 01 00 cmp $0x1f400,%rax 40494c: 75 92 jne 4048e0 <s152+0x30> 40494e: bf 00 25 45 00 mov $0x452500,%edi 404953: be 00 31 43 00 mov $0x433100,%esi 404958: ba 00 19 47 00 mov $0x471900,%edx 40495d: b9 00 0d 49 00 mov $0x490d00,%ecx 404962: 41 b8 00 01 4b 00 mov $0x4b0100,%r8d 404968: 41 b9 00 f5 4c 00 mov $0x4cf500,%r9d 40496e: c5 f8 57 c0 vxorps %xmm0,%xmm0,%xmm0 404972: 68 00 f5 54 00 push $0x54f500 404977: 68 00 f5 50 00 push $0x50f500 40497c: c5 f8 77 vzeroupper 40497f: e8 3c 10 01 00 call 4159c0 <dummy> 404984: 48 83 c4 10 add $0x10,%rsp 404988: 83 c3 01 add $0x1,%ebx 40498b: 81 fb a0 86 01 00 cmp $0x186a0,%ebx 404991: 0f 85 39 ff ff ff jne 4048d0 <s152+0x20> 404997: 49 83 c6 10 add $0x10,%r14 40499b: 4c 89 f7 mov %r14,%rdi 40499e: 31 f6 xor %esi,%esi 4049a0: e8 bb c6 ff ff call 401060 <gettimeofday@plt> 4049a5: bf b7 e1 42 00 mov $0x42e1b7,%edi 4049aa: 48 83 c4 08 add $0x8,%rsp 4049ae: 5b pop %rbx 4049af: 41 5e pop %r14 4049b1: e9 4a 26 02 00 jmp 427000 <calc_checksum> 4049b6: 66 2e 0f 1f 84 00 00 cs nopw 0x0(%rax,%rax,1) 4049bd: 00 00 00 We get: real_t s152 (struct args_t * func_args) { int i; int nl; static const char __func__[5] = "s152"; struct timeval * _1; float _2; float _3; float _4; struct timeval * _5; real_t _16; long unsigned int _21; long unsigned int _22; real_t * _23; float _24; real_t * _25; float _26; real_t * _27; float _28; float _29; float _30; unsigned int ivtmp_48; unsigned int ivtmp_49; unsigned int ivtmp_50; unsigned int ivtmp_51; <bb 2> [local count: 108459]: initialise_arrays (&__func__); _1 = &func_args_12(D)->t1; gettimeofday (_1, 0B); goto <bb 5>; [100.00%] <bb 8> [local count: 1052266996]: <bb 3> [local count: 1063004409]: # i_40 = PHI <i_20(8), 0(5)> # 
ivtmp_51 = PHI <ivtmp_50(8), 32000(5)> _2 = d[i_40]; _3 = e[i_40]; _4 = _2 * _3; b[i_40] = _4; _21 = (long unsigned int) i_40; _22 = _21 * 4; _23 = &a + _22; _24 = *_23; _25 = &b + _22; _26 = *_25; _27 = &c + _22; _28 = *_27; _29 = _26 * _28; _30 = _24 + _29; *_23 = _30; i_20 = i_40 + 1; ivtmp_50 = ivtmp_51 - 1; if (ivtmp_50 != 0) goto <bb 8>; [98.99%] else goto <bb 4>; [1.01%] <bb 4> [local count: 10737416]: dummy (&a, &b, &c, &d, &e, &aa, &bb, &cc, 0.0); nl_18 = nl_39 + 1; ivtmp_48 = ivtmp_49 - 1; if (ivtmp_48 != 0) goto <bb 7>; [98.99%] else goto <bb 6>; [1.01%] <bb 7> [local count: 10628957]: <bb 5> [local count: 10737416]: # nl_39 = PHI <nl_18(7), 0(2)> # ivtmp_49 = PHI <ivtmp_48(7), 100000(2)> goto <bb 3>; [100.00%] <bb 6> [local count: 108459]: _5 = &func_args_12(D)->t2; gettimeofday (_5, 0B); _16 = calc_checksum (&__func__); return _16; } and fail at: tsvc.c:699:27: note: can tell at compile time that b[i_40] and *_25 alias tsvc.c:686:14: missed: not vectorized: compilation time alias: b[i_40] = _4; _26 = *_25; tsvc.c:699:27: note: ***** Analysis failed with vector mode V8QI tsvc.c:699:27: missed: couldn't vectorize loop tsvc.c:689:8: note: vectorized 0 loops in function. So it seems we get confused by: b[i_40] = _4; and _21 = (long unsigned int) i_40; _22 = _21 * 4; _25 = &b + _22; accessing the same location...