https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104268
Bug ID: 104268
Summary: 390: inefficient vec_popcnt for 16-bit for z13
Product: gcc
Version: 10.2.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: jens.seifert at de dot ibm.com
Target Milestone: ---

#include <vecintrin.h>

vector unsigned short popcnt(vector unsigned short a)
{
   return vec_popcnt(a);
}

Generates with -march=z13

_Z6popcntDv8_t:
.LFB1:
        .cfi_startproc
        vzero   %v0
        vpopct  %v24,%v24,0
        vleib   %v0,8,7
        vsrlb   %v0,%v24,%v0
        vab     %v24,%v24,%v0
        vgbm    %v0,21845
        vn      %v24,%v24,%v0
        br      %r14
        .cfi_endproc

Optimal sequence would be:

vector unsigned short popcnt_opt(vector unsigned short a)
{
   vector unsigned short r = (vector unsigned short)vec_popcnt((vector unsigned char)a);
   vector unsigned short b = vec_rli(r, 8);
   r = r + b;
   r = r >> 8;
   return r;
}

_Z10popcnt_optDv8_t:
.LFB3:
        .cfi_startproc
        vpopct  %v24,%v24,0
        verllh  %v0,%v24,8
        vah     %v24,%v0,%v24
        vesrlh  %v24,%v24,8
        br      %r14
        .cfi_endproc