https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99397

            Bug ID: 99397
           Summary: s152 benchmark of TSVC is vectorized by clang and not
                    by gcc
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

s152 is:
// Helper called once per element from the inner loop of s152:
// adds the product b[i] * c[i] into a[i]. Only index i is touched.
void s152s(real_t a[LEN_1D], real_t b[LEN_1D], real_t c[LEN_1D], int i)
{
    a[i] += b[i] * c[i];
}

// Timed driver for the s152 kernel. Stores timestamps into func_args->t1
// and func_args->t2 around the loop nest and returns the value produced by
// calc_checksum(__func__).
// a, b, c, d, e, aa, bb, cc, iterations and LEN_1D are declared outside
// this snippet (presumably file-scope benchmark data -- confirm in tsvc.c).
real_t s152(struct args_t * func_args)
{

//    interprocedural data flow analysis
//    collecting information from a subroutine

    initialise_arrays(__func__);
    gettimeofday(&func_args->t1, NULL);

    for (int nl = 0; nl < iterations; nl++) {
        for (int i = 0; i < LEN_1D; i++) {
            // b[i] is stored here and then read back inside s152s via its
            // array parameter, so the same element is written and read in
            // one iteration.
            b[i] = d[i] * e[i];
            s152s(a, b, c, i);
        }
        // Opaque call made once per outer iteration; presumably there to
        // keep the compiler from collapsing the repeated loop -- TODO confirm.
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }

    gettimeofday(&func_args->t2, NULL);
    return calc_checksum(__func__);
}


and clang11 vectorizes it as:
00000000004048b0 <s152>:
  4048b0:       41 56                   push   %r14
  4048b2:       53                      push   %rbx
  4048b3:       50                      push   %rax
  4048b4:       49 89 fe                mov    %rdi,%r14
  4048b7:       bf b7 e1 42 00          mov    $0x42e1b7,%edi
  4048bc:       e8 4f 2d 01 00          call   417610 <initialise_arrays>
  4048c1:       31 db                   xor    %ebx,%ebx
  4048c3:       4c 89 f7                mov    %r14,%rdi
  4048c6:       31 f6                   xor    %esi,%esi
  4048c8:       e8 93 c7 ff ff          call   401060 <gettimeofday@plt>
  4048cd:       0f 1f 00                nopl   (%rax)
  4048d0:       31 c0                   xor    %eax,%eax
  4048d2:       66 2e 0f 1f 84 00 00    cs nopw 0x0(%rax,%rax,1)
  4048d9:       00 00 00
  4048dc:       0f 1f 40 00             nopl   0x0(%rax)
  4048e0:       c5 fc 28 80 00 01 4b    vmovaps 0x4b0100(%rax),%ymm0
  4048e7:       00
  4048e8:       c5 fc 28 88 20 01 4b    vmovaps 0x4b0120(%rax),%ymm1
  4048ef:       00
  4048f0:       c5 fc 59 80 00 0d 49    vmulps 0x490d00(%rax),%ymm0,%ymm0
  4048f7:       00
  4048f8:       c5 f4 59 88 20 0d 49    vmulps 0x490d20(%rax),%ymm1,%ymm1
  4048ff:       00
  404900:       c5 fc 29 80 00 31 43    vmovaps %ymm0,0x433100(%rax)
  404907:       00
  404908:       c5 fc 29 88 20 31 43    vmovaps %ymm1,0x433120(%rax)
  40490f:       00
  404910:       c5 fc 28 90 00 19 47    vmovaps 0x471900(%rax),%ymm2
  404917:       00
  404918:       c5 fc 28 98 20 19 47    vmovaps 0x471920(%rax),%ymm3
  40491f:       00
  404920:       c4 e2 7d a8 90 00 25    vfmadd213ps 0x452500(%rax),%ymm0,%ymm2
  404927:       45 00
  404929:       c4 e2 75 a8 98 20 25    vfmadd213ps 0x452520(%rax),%ymm1,%ymm3
  404930:       45 00
  404932:       c5 fc 29 90 00 25 45    vmovaps %ymm2,0x452500(%rax)
  404939:       00
  40493a:       c5 fc 29 98 20 25 45    vmovaps %ymm3,0x452520(%rax)
  404941:       00
  404942:       48 83 c0 40             add    $0x40,%rax
  404946:       48 3d 00 f4 01 00       cmp    $0x1f400,%rax
  40494c:       75 92                   jne    4048e0 <s152+0x30>
  40494e:       bf 00 25 45 00          mov    $0x452500,%edi
  404953:       be 00 31 43 00          mov    $0x433100,%esi
  404958:       ba 00 19 47 00          mov    $0x471900,%edx
  40495d:       b9 00 0d 49 00          mov    $0x490d00,%ecx
  404962:       41 b8 00 01 4b 00       mov    $0x4b0100,%r8d
  404968:       41 b9 00 f5 4c 00       mov    $0x4cf500,%r9d
  40496e:       c5 f8 57 c0             vxorps %xmm0,%xmm0,%xmm0
  404972:       68 00 f5 54 00          push   $0x54f500
  404977:       68 00 f5 50 00          push   $0x50f500
  40497c:       c5 f8 77                vzeroupper 
  40497f:       e8 3c 10 01 00          call   4159c0 <dummy>
  404984:       48 83 c4 10             add    $0x10,%rsp
  404988:       83 c3 01                add    $0x1,%ebx
  40498b:       81 fb a0 86 01 00       cmp    $0x186a0,%ebx
  404991:       0f 85 39 ff ff ff       jne    4048d0 <s152+0x20>
  404997:       49 83 c6 10             add    $0x10,%r14
  40499b:       4c 89 f7                mov    %r14,%rdi
  40499e:       31 f6                   xor    %esi,%esi
  4049a0:       e8 bb c6 ff ff          call   401060 <gettimeofday@plt>
  4049a5:       bf b7 e1 42 00          mov    $0x42e1b7,%edi
  4049aa:       48 83 c4 08             add    $0x8,%rsp
  4049ae:       5b                      pop    %rbx
  4049af:       41 5e                   pop    %r14
  4049b1:       e9 4a 26 02 00          jmp    427000 <calc_checksum>
  4049b6:       66 2e 0f 1f 84 00 00    cs nopw 0x0(%rax,%rax,1)
  4049bd:       00 00 00 


We get:
real_t s152 (struct args_t * func_args)
{
  int i;
  int nl;
  static const char __func__[5] = "s152";
  struct timeval * _1;
  float _2;
  float _3;
  float _4;
  struct timeval * _5;
  real_t _16;
  long unsigned int _21;
  long unsigned int _22;
  real_t * _23;
  float _24;
  real_t * _25;
  float _26;
  real_t * _27;
  float _28;
  float _29;
  float _30;
  unsigned int ivtmp_48;
  unsigned int ivtmp_49;
  unsigned int ivtmp_50;
  unsigned int ivtmp_51;

  <bb 2> [local count: 108459]:
  initialise_arrays (&__func__);
  _1 = &func_args_12(D)->t1;
  gettimeofday (_1, 0B);
  goto <bb 5>; [100.00%]

  <bb 8> [local count: 1052266996]:

  <bb 3> [local count: 1063004409]:
  # i_40 = PHI <i_20(8), 0(5)>
  # ivtmp_51 = PHI <ivtmp_50(8), 32000(5)>
  _2 = d[i_40];
  _3 = e[i_40];
  _4 = _2 * _3;
  b[i_40] = _4;
  _21 = (long unsigned int) i_40;
  _22 = _21 * 4;
  _23 = &a + _22;
  _24 = *_23;
  _25 = &b + _22;
  _26 = *_25;
  _27 = &c + _22;
  _28 = *_27;
  _29 = _26 * _28;
  _30 = _24 + _29;
  *_23 = _30;
  i_20 = i_40 + 1;
  ivtmp_50 = ivtmp_51 - 1;
  if (ivtmp_50 != 0)
    goto <bb 8>; [98.99%]
  else
    goto <bb 4>; [1.01%]

  <bb 4> [local count: 10737416]:
  dummy (&a, &b, &c, &d, &e, &aa, &bb, &cc, 0.0);
  nl_18 = nl_39 + 1;
  ivtmp_48 = ivtmp_49 - 1;
  if (ivtmp_48 != 0)
    goto <bb 7>; [98.99%]
  else
    goto <bb 6>; [1.01%]

  <bb 7> [local count: 10628957]:

  <bb 5> [local count: 10737416]:
  # nl_39 = PHI <nl_18(7), 0(2)>
  # ivtmp_49 = PHI <ivtmp_48(7), 100000(2)>
  goto <bb 3>; [100.00%]

  <bb 6> [local count: 108459]:
  _5 = &func_args_12(D)->t2;
  gettimeofday (_5, 0B);
  _16 = calc_checksum (&__func__);
  return _16;
}

and the vectorizer fails at:
tsvc.c:699:27: note:   can tell at compile time that b[i_40] and *_25 alias
tsvc.c:686:14: missed:   not vectorized: compilation time alias: b[i_40] = _4;
_26 = *_25;
tsvc.c:699:27: note:  ***** Analysis failed with vector mode V8QI
tsvc.c:699:27: missed: couldn't vectorize loop
tsvc.c:689:8: note: vectorized 0 loops in function.

So it seems we get confused by:
  b[i_40] = _4;

and
  _21 = (long unsigned int) i_40;
  _22 = _21 * 4;
  _25 = &b + _22;

accessing the same location...

Reply via email to