Issue 127789
Summary [AVX-512] Vector-NOT + masked `vpmovwb` loses merge-mask because backend flips order
Labels new issue
Assignees
Reporter Validark
    [Zig Godbolt](https://zig.godbo.lt/#g:!((g:!((g:!((h:codeEditor,i:(filename:'1',fontScale:14,fontUsePx:'0',j:1,lang:zig,selection:(endColumn:29,endLineNumber:1,positionColumn:29,positionLineNumber:1,selectionStartColumn:25,selectionStartLineNumber:1,startColumn:25,startLineNumber:1),source:'const+DEFAULT_VERSION+%3D+true%3B%0A%0Aexport+fn+foo(matched:+u32,+unused:+@Vector(64,+u8),+kw_table_indices:+@Vector(64,+u8))+@Vector(32,+u8)+%7B%0A++++_+%3D+unused%3B%0A++++const+keyword_kinds+%3D+@select(%0A++++++++u8,%0A++++++++@as(@Vector(32,+bool),+@bitCast(matched)),%0A++++++++@as(@Vector(32,+u8),+@truncate(@as(@Vector(32,+u16),+@bitCast(vpnot(kw_table_indices))))),%0A++++++++@as(@Vector(32,+u8),+@splat(252)),%0A++++)%3B%0A++++return+keyword_kinds%3B%0A%7D%0A%0Afn+vpnot(a_:+anytype)+@TypeOf(a_)+%7B%0A++++if+(DEFAULT_VERSION)+return+~a_%3B%0A++++const+a:+@Vector(8,+u64)+%3D+@bitCast(a_)%3B%0A++++return+@bitCast(vpternlog(a,+a,+a,+51))%3B%0A%7D%0A%0Afn+vpternlog(vec_1:+@Vector(8,+u64),+vec_2:+@Vector(8,+u64),+vec_3:+@Vector(8,+u64),+comptime+i:+i32)+@Vector(8,+u64)+%7B%0A++++var+result+%3D+vec_1%3B%0A++++asm+volatile+(%0A++++++++%5C%5C+vpternlogq+%25%5Bimm%5D,+%25%5Bvec_3%5D,+%25%5Bvec_2%5D,+%25%5Bvec_1%5D%0A++++++++:+%5Bvec_1%5D+%22%2Bx%22+(result)%0A++++++++:+%5Bvec_2%5D+%22x%22+(vec_2),+%5Bvec_3%5D+%22x%22+(vec_3),+%5Bimm%5D+%22n%22+(i),%0A++++++++:%0A++++)%3B%0A++++return+result%3B%0A%7D%0A'),l:'5',n:'0',o:'Zig+source+%231',t:'0')),k:50,l:'4',n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:ztrunk,filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'1',intel:'0',libraryCode:'0',trim:'1',verboseDemangling:'0'),flagsViewOpen:'1',fontScale:14,fontUsePx:'0',j:2,lang:zig,libs:!(),options:'-O+ReleaseFast+-target+x86_64-linux+-mcpu%3Dznver5',overrides:!(),selection:(endColumn:1,endLineNumber:1,positionColumn:1,positionLineNumber:1,selectionStartColumn:1,selectionStartLineNumber:1,startColumn:1,startLineNumber:1),source:1),l:'5',n:'0',o:'+zig+trunk+(Editor+%231)',t:'0')),header:(),k:50,l:'4',m:100,n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4)

The backend currently emits this assembly:

```asm
vpmovwb         ymm1, zmm1
vpternlogq      ymm1, ymm1, ymm1, 15
vmovdqu8        ymm0 {k1}, ymm1
```

It should instead perform the NOT first, so that the merge mask can be folded directly into `vpmovwb`:

```asm
vpternlogq      zmm1, zmm1, zmm1, 51
vpmovwb         ymm0 {k1}, zmm1
```

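(Both immediates 15 and 51 perform a bitwise NOT here: when all three `vpternlogq` operands are the same register, only bits 0 and 7 of the immediate are ever selected, and both 15 (`0x0F`) and 51 (`0x33`) have bit 0 set and bit 7 clear.)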

This is somewhat similar to https://github.com/llvm/llvm-project/issues/113400

Optimized LLVM IR:

```llvm
; ModuleID = 'BitcodeBuffer'
source_filename = "root"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux4.19.0-musl"

; Function Attrs: mustprogress nofree norecurse nosanitize_coverage nosync nounwind skipprofile willreturn memory(none) uwtable
define dso_local <32 x i8> @foo(i32 %0, <64 x i8> %1, <64 x i8> %2) local_unnamed_addr #0 {
  %4 = bitcast i32 %0 to <32 x i1>
  %5 = bitcast <64 x i8> %2 to <32 x i16>
  %6 = trunc <32 x i16> %5 to <32 x i8>
  %7 = xor <32 x i8> %6, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %8 = select <32 x i1> %4, <32 x i8> %7, <32 x i8> <i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4, i8 -4>
  ret <32 x i8> %8
}

attributes #0 = { mustprogress nofree norecurse nosanitize_coverage nosync nounwind skipprofile willreturn memory(none) uwtable "frame-pointer"="all" "target-cpu"="znver5" "target-features"="+64bit,+adx,+aes,+allow-light-256-bit,+avx,+avx2,+avx512bf16,+avx512bitalg,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512ifma,+avx512vbmi,+avx512vbmi2,+avx512vl,+avx512vnni,+avx512vp2intersect,+avx512vpopcntdq,+avxvnni,+bmi,+bmi2,+branchfusion,+clflushopt,+clwb,+clzero,+cmov,+crc32,+cx16,+cx8,+evex512,+f16c,+fast-15bytenop,+fast-bextr,+fast-dpwssd,+fast-imm16,+fast-lzcnt,+fast-movbe,+fast-scalar-fsqrt,+fast-scalar-shift-masks,+fast-variable-perlane-shuffle,+fast-vector-fsqrt,+fma,+fsgsbase,+fsrm,+fxsr,+gfni,+idivq-to-divl,+invpcid,+lzcnt,+macrofusion,+mmx,+movbe,+movdir64b,+movdiri,+mwaitx,+nopl,+pclmul,+pku,+popcnt,+prefetchi,+prfchw,+rdpid,+rdpru,+rdrnd,+rdseed,+sahf,+sbb-dep-breaking,+sha,+shstk,+slow-shld,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,+ssse3,+vaes,+vpclmulqdq,+vzeroupper,+wbnoinvd,+x87,+xsave,+xsavec,+xsaveopt,+xsaves,-16bit-mode,-32bit-mode,-amx-bf16,-amx-complex,-amx-fp16,-amx-int8,-amx-tile,-avx10.1-256,-avx10.1-512,-avx512fp16,-avxifma,-avxneconvert,-avxvnniint16,-avxvnniint8,-branch-hint,-ccmp,-cf,-cldemote,-cmpccxadd,-egpr,-enqcmd,-ermsb,-false-deps-getmant,-false-deps-lzcnt-tzcnt,-false-deps-mulc,-false-deps-mullq,-false-deps-perm,-false-deps-popcnt,-false-deps-range,-fast-11bytenop,-fast-7bytenop,-fast-gather,-fast-hops,-fast-shld-rotate,-fast-variable-crosslane-shuffle,-fast-vector-shift-masks,-faster-shift-than-shuffle,-fma4,-harden-sls-ijmp,-harden-sls-ret,-hreset,-idivl-to-divb,-inline-asm-use-gpr32,-kl,-lea-sp,-lea-uses-ag,-lvi-cfi,-lvi-load-hardening,-lwp,-ndd,-nf,-no-bypass-delay,-no-bypass-delay-blend,-no-bypass-delay-mov,-no-bypass-delay-shuffle,-pad-short-functions,-pconfig,-ppx,-prefer-128-bit,-prefer-256-bit,-prefer-mask-registers,-prefer-movmsk-over-vtest,-prefer-no-gather,-prefer-no-scatter,-ptwrite,-push2pop2,-raoint,-retpoline,-retpoline-ext
ernal-thunk,-retpoline-indirect-branches,-retpoline-indirect-calls,-rtm,-serialize,-seses,-sgx,-sha512,-slow-3ops-lea,-slow-incdec,-slow-lea,-slow-pmaddwd,-slow-pmulld,-slow-two-mem-ops,-slow-unaligned-mem-16,-slow-unaligned-mem-32,-sm3,-sm4,-soft-float,-sse-unaligned-mem,-tagged-globals,-tbm,-tsxldtrk,-tuning-fast-imm-vector-shift,-uintr,-use-glm-div-sqrt-costs,-use-slm-arith-costs,-usermsr,-waitpkg,-widekl,-xop,-zu" }

!llvm.module.flags = !{}
```
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to