https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97770
--- Comment #3 from Hongtao.liu <crazylht at gmail dot com> ---
> But for vector byte/word/quadword, vectorizer still use vpopcntd, but not
> vpopcnt{b,w,q}, missing corresponding ifn?

We don't have __builtin_popcount{w,b}, but we have __builtin_popcountl.

For the testcase
---
void
fooq(unsigned long long* __restrict dest, unsigned long long* src)
{
  for (int i = 0; i != 4; i++)
    dest[i] = __builtin_popcountl (src[i]);
}
----
icc/clang generate
---
_Z4fooqPxS_:                            # @_Z4fooqPxS_
        vpopcntq        ymm0, ymmword ptr [rsi]
        vmovdqu ymmword ptr [rdi], ymm0
        vzeroupper
        ret
---
But gcc generates
---
fooq:
.LFB0:
        .cfi_startproc
        vpopcntq        16(%rsi), %xmm1
        vpopcntq        (%rsi), %xmm0
        vshufps $136, %xmm1, %xmm0, %xmm0
        vpmovsxdq       %xmm0, %xmm1
        vpsrldq $8, %xmm0, %xmm0
        vpmovsxdq       %xmm0, %xmm0
        vmovdqu %xmm1, (%rdi)
        vmovdqu %xmm0, 16(%rdi)
        ret
        .cfi_endproc
---
Dump for 164.vect
---
;; Function fooq (fooq, funcdef_no=0, decl_uid=4228, cgraph_uid=1, symbol_order=0)

Merging blocks 2 and 6
fooq (long long unsigned int * restrict dest, long long unsigned int * src)
{
  vector(2) long long unsigned int * vectp_dest.10;
  vector(2) long long unsigned int * vectp_dest.9;
  vector(2) long long unsigned int vect__7.8;
  vector(4) int vect__5.7;
  vector(2) long long unsigned int vect__4.6;
  vector(2) long long unsigned int vect__4.5;
  vector(2) long long unsigned int * vectp_src.4;
  vector(2) long long unsigned int * vectp_src.3;
  int i;
  long unsigned int _1;
  long unsigned int _2;
  long long unsigned int * _3;
  long long unsigned int _4;
  int _5;
  long long unsigned int * _6;
  long long unsigned int _7;
  vector(2) long long unsigned int _8;
  vector(2) long long unsigned int _26;
  unsigned int ivtmp_30;
  unsigned int ivtmp_31;
  unsigned int ivtmp_36;
  unsigned int ivtmp_37;

  <bb 2> [local count: 214748368]:

  <bb 3> [local count: 214748371]:
  # i_18 = PHI <i_14(5), 0(2)>
  # ivtmp_31 = PHI <ivtmp_30(5), 4(2)>
  # vectp_src.3_20 = PHI <vectp_src.3_17(5), src_11(D)(2)>
  # vectp_dest.9_24 = PHI <vectp_dest.9_32(5), dest_12(D)(2)>
  # ivtmp_36 = PHI <ivtmp_37(5), 0(2)>
  _1 = (long unsigned int) i_18;
  _2 = _1 * 8;
  _3 = src_11(D) + _2;
  vect__4.5_16 = MEM <vector(2) long long unsigned int> [(long long unsigned int *)vectp_src.3_20];
  vectp_src.3_15 = vectp_src.3_20 + 16;
  vect__4.6_9 = MEM <vector(2) long long unsigned int> [(long long unsigned int *)vectp_src.3_15];
  _4 = *_3;
  _8 = .POPCOUNT (vect__4.5_16);
  _26 = .POPCOUNT (vect__4.6_9);
  vect__5.7_22 = VEC_PACK_TRUNC_EXPR <_8, _26>;   --- Why do we do this?
  _5 = 0;
  _6 = dest_12(D) + _2;
  vect__7.8_23 = [vec_unpack_lo_expr] vect__5.7_22;
  vect__7.8_25 = [vec_unpack_hi_expr] vect__5.7_22;
  _7 = (long long unsigned int) _5;
  MEM <vector(2) long long unsigned int> [(long long unsigned int *)vectp_dest.9_24] = vect__7.8_23;
  vectp_dest.9_34 = vectp_dest.9_24 + 16;
  MEM <vector(2) long long unsigned int> [(long long unsigned int *)vectp_dest.9_34] = vect__7.8_25;
  i_14 = i_18 + 1;
  ivtmp_30 = ivtmp_31 - 1;
  vectp_src.3_17 = vectp_src.3_15 + 16;
  vectp_dest.9_32 = vectp_dest.9_34 + 16;
  ivtmp_37 = ivtmp_36 + 1;
  if (ivtmp_37 < 1)
    goto <bb 5>; [0.00%]
  else
    goto <bb 4>; [100.00%]

  <bb 5> [local count: 0]:
  goto <bb 3>; [100.00%]

  <bb 4> [local count: 214748368]:
  return;

}
---