https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97770

--- Comment #3 from Hongtao.liu <crazylht at gmail dot com> ---
> But for vector byte/word/quadword, vectorizer still use vpopcntd, but not
> vpopcnt{b,w,q}, missing corresponding ifn?

We don't have __builtin_popcount{w,b}, but we have __builtin_popcountl.

For this testcase:
---
/* Reproducer for this report: for each of the four elements, store the
   population count of src[i] into dest[i].  */
void
fooq(unsigned long long* __restrict dest, unsigned long long* src)
{
  /* Fixed trip count of 4 elements.  */
  for (int i = 0; i != 4; i++)
    /* NOTE(review): in the vect dump quoted in this report the builtin's
       result appears as an `int` temporary (_5) that is then converted to
       `long long unsigned int` (_7) before the store -- presumably the
       source of the pack/unpack sequence being questioned; confirm.  */
    dest[i] = __builtin_popcountl (src[i]);
}
----

icc/clang generate
---
_Z4fooqPxS_:                            # @_Z4fooqPxS_
        vpopcntq        ymm0, ymmword ptr [rsi]
        vmovdqu ymmword ptr [rdi], ymm0
        vzeroupper
        ret
---

But gcc generates
---
fooq:
.LFB0:
        .cfi_startproc
        vpopcntq        16(%rsi), %xmm1
        vpopcntq        (%rsi), %xmm0
        vshufps $136, %xmm1, %xmm0, %xmm0
        vpmovsxdq       %xmm0, %xmm1
        vpsrldq $8, %xmm0, %xmm0
        vpmovsxdq       %xmm0, %xmm0
        vmovdqu %xmm1, (%rdi)
        vmovdqu %xmm0, 16(%rdi)
        ret
        .cfi_endproc
---

dump for 164.vect

---
;; Function fooq (fooq, funcdef_no=0, decl_uid=4228, cgraph_uid=1, symbol_order=0)

Merging blocks 2 and 6
fooq (long long unsigned int * restrict dest, long long unsigned int * src)
{
  vector(2) long long unsigned int * vectp_dest.10;
  vector(2) long long unsigned int * vectp_dest.9;
  vector(2) long long unsigned int vect__7.8;
  vector(4) int vect__5.7;
  vector(2) long long unsigned int vect__4.6;
  vector(2) long long unsigned int vect__4.5;
  vector(2) long long unsigned int * vectp_src.4;
  vector(2) long long unsigned int * vectp_src.3;
  int i;
  long unsigned int _1;
  long unsigned int _2;
  long long unsigned int * _3;
  long long unsigned int _4;
  int _5;
  long long unsigned int * _6;
  long long unsigned int _7;
  vector(2) long long unsigned int _8;
  vector(2) long long unsigned int _26;
  unsigned int ivtmp_30;
  unsigned int ivtmp_31;
  unsigned int ivtmp_36;
  unsigned int ivtmp_37;

  <bb 2> [local count: 214748368]:

  <bb 3> [local count: 214748371]:
  # i_18 = PHI <i_14(5), 0(2)>
  # ivtmp_31 = PHI <ivtmp_30(5), 4(2)>
  # vectp_src.3_20 = PHI <vectp_src.3_17(5), src_11(D)(2)>
  # vectp_dest.9_24 = PHI <vectp_dest.9_32(5), dest_12(D)(2)>
  # ivtmp_36 = PHI <ivtmp_37(5), 0(2)>
  _1 = (long unsigned int) i_18;
  _2 = _1 * 8;
  _3 = src_11(D) + _2;
  vect__4.5_16 = MEM <vector(2) long long unsigned int> [(long long unsigned int *)vectp_src.3_20];
  vectp_src.3_15 = vectp_src.3_20 + 16;
  vect__4.6_9 = MEM <vector(2) long long unsigned int> [(long long unsigned int *)vectp_src.3_15];
  _4 = *_3;
  _8 = .POPCOUNT (vect__4.5_16);
  _26 = .POPCOUNT (vect__4.6_9);
  vect__5.7_22 = VEC_PACK_TRUNC_EXPR <_8, _26>;   <--- Why do we do this?
  _5 = 0;
  _6 = dest_12(D) + _2;
  vect__7.8_23 = [vec_unpack_lo_expr] vect__5.7_22;
  vect__7.8_25 = [vec_unpack_hi_expr] vect__5.7_22;
  _7 = (long long unsigned int) _5;
  MEM <vector(2) long long unsigned int> [(long long unsigned int *)vectp_dest.9_24] = vect__7.8_23;
  vectp_dest.9_34 = vectp_dest.9_24 + 16;
  MEM <vector(2) long long unsigned int> [(long long unsigned int *)vectp_dest.9_34] = vect__7.8_25;
  i_14 = i_18 + 1;
  ivtmp_30 = ivtmp_31 - 1;
  vectp_src.3_17 = vectp_src.3_15 + 16;
  vectp_dest.9_32 = vectp_dest.9_34 + 16;
  ivtmp_37 = ivtmp_36 + 1;
  if (ivtmp_37 < 1)
    goto <bb 5>; [0.00%]
  else
    goto <bb 4>; [100.00%]

  <bb 5> [local count: 0]:
  goto <bb 3>; [100.00%]

  <bb 4> [local count: 214748368]:
  return;

}
---

Reply via email to