https://github.com/ahmednoursphinx updated https://github.com/llvm/llvm-project/pull/166178
>From 3cdf74e3eee0e206a3588acb1b4e53fd66eb9517 Mon Sep 17 00:00:00 2001 From: ahmed <[email protected]> Date: Mon, 3 Nov 2025 17:16:29 +0200 Subject: [PATCH 1/4] fix: rematerialize smaller predicate masks --- llvm/lib/Target/X86/X86InstrAVX512.td | 25 ++++++ llvm/lib/Target/X86/X86InstrInfo.cpp | 6 ++ llvm/test/CodeGen/X86/avx512-mask-set-opt.ll | 93 ++++++++++++++++++++ 3 files changed, 124 insertions(+) create mode 100644 llvm/test/CodeGen/X86/avx512-mask-set-opt.ll diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 1b748b7355716..9fae602974242 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3161,6 +3161,12 @@ multiclass avx512_mask_setop_w<SDPatternOperator Val> { defm KSET0 : avx512_mask_setop_w<immAllZerosV>; defm KSET1 : avx512_mask_setop_w<immAllOnesV>; +// 8-bit mask set operations for AVX512DQ +let Predicates = [HasDQI] in { + defm KSET0B : avx512_mask_setop<VK8, v8i1, immAllZerosV>; + defm KSET1B : avx512_mask_setop<VK8, v8i1, immAllOnesV>; +} + // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. let Predicates = [HasAVX512] in { def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; @@ -3173,6 +3179,25 @@ let Predicates = [HasAVX512] in { def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>; } +// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper bits +let Predicates = [HasDQI] in { + def : Pat<(v8i1 immAllZerosV), (KSET0B)>; + def : Pat<(v8i1 immAllOnesV), (KSET1B)>; +} + +// Optimize bitconvert of all-ones constants to use kxnor instructions +let Predicates = [HasDQI] in { + def : Pat<(v8i1 (bitconvert (i8 255))), (KSET1B)>; + def : Pat<(v16i1 (bitconvert (i16 255))), (COPY_TO_REGCLASS (KSET1B), VK16)>; +} +let Predicates = [HasAVX512] in { + def : Pat<(v16i1 (bitconvert (i16 65535))), (KSET1W)>; +} +let Predicates = [HasBWI] in { + def : Pat<(v32i1 (bitconvert (i32 -1))), (KSET1D)>; + def : Pat<(v64i1 (bitconvert (i64 -1))), (KSET1Q)>; +} + // Patterns for kmask insert_subvector/extract_subvector to/from index=0 multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subVT, RegisterClass RC, ValueType VT> { diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 6b2a7a4ec3583..3eadac4f827bc 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -789,9 +789,11 @@ bool X86InstrInfo::isReMaterializableImpl( case X86::FsFLD0SS: case X86::FsFLD0SH: case X86::FsFLD0F128: + case X86::KSET0B: case X86::KSET0D: case X86::KSET0Q: case X86::KSET0W: + case X86::KSET1B: case X86::KSET1D: case X86::KSET1Q: case X86::KSET1W: @@ -6352,12 +6354,16 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { // registers, since it is not usable as a write mask. // FIXME: A more advanced approach would be to choose the best input mask // register based on context. + case X86::KSET0B: + return Expand2AddrKreg(MIB, get(X86::KXORBkk), X86::K0); case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0); case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0); case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0); + case X86::KSET1B: + return Expand2AddrKreg(MIB, get(X86::KXNORBkk), X86::K0); case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0); case X86::KSET1D: diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll new file mode 100644 index 0000000000000..6a1a0af05d05c --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQBW + +declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>) + +; Test case 1: v16i1 with all bits set (should use kxnorw on all targets) +define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) { +; AVX512F-LABEL: gather_all: +; AVX512F: # %bb.0: +; AVX512F-NEXT: kxnorw %k0, %k0, %k1 +; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512F-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: gather_all: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: kxnorw %k0, %k0, %k1 +; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512DQ-NEXT: vmovaps %zmm1, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: gather_all: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kxnorw %k0, %k0, %k1 +; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: gather_all: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: kxnorw %k0, %k0, %k1 +; AVX512DQBW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512DQBW-NEXT: retq + %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float>undef) + ret <16 x float> %res +} + +; Test case 2: v8i1 with lower 8 bits set (should use kxnorb on AVX512DQ targets) +define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) { +; AVX512F-LABEL: gather_lower: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: movw $255, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512F-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: gather_lower: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQ-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512DQ-NEXT: vmovaps %zmm1, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: gather_lower: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: movw $255, %ax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQBW-LABEL: gather_lower: +; AVX512DQBW: # %bb.0: +; AVX512DQBW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1 +; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} +; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512DQBW-NEXT: retq + %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + %sext_ind = sext <16 x i32> %ind to <16 x i64> + %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float>undef) + ret <16 x float> %res +} + + >From 4d2cfe334a511eb4e6baa7c98a34ea3e51ecd62d Mon Sep 17 00:00:00 2001 From: ahmed <[email protected]> Date: Mon, 3 Nov 2025 17:20:20 +0200 Subject: [PATCH 2/4] chore: update formatting --- llvm/lib/Target/X86/X86InstrAVX512.td | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 9fae602974242..8a06296751f0d 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3179,23 +3179,24 @@ let Predicates = [HasAVX512] in { def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>; } -// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper bits +// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper +// bits let Predicates = [HasDQI] in { def : Pat<(v8i1 immAllZerosV), (KSET0B)>; - def : Pat<(v8i1 immAllOnesV), (KSET1B)>; + def : Pat<(v8i1 immAllOnesV), (KSET1B)>; } // Optimize bitconvert of all-ones constants to use kxnor instructions let Predicates = [HasDQI] in { - def : Pat<(v8i1 (bitconvert (i8 255))), (KSET1B)>; - def : Pat<(v16i1 (bitconvert (i16 255))), (COPY_TO_REGCLASS (KSET1B), VK16)>; + def : Pat<(v8i1(bitconvert(i8 255))), (KSET1B)>; + def : Pat<(v16i1(bitconvert(i16 255))), (COPY_TO_REGCLASS(KSET1B), VK16)>; } let Predicates = [HasAVX512] in { - def : Pat<(v16i1 (bitconvert (i16 65535))), (KSET1W)>; + def : Pat<(v16i1(bitconvert(i16 65535))), (KSET1W)>; } let Predicates = [HasBWI] in { - def : Pat<(v32i1 (bitconvert (i32 -1))), (KSET1D)>; - def : Pat<(v64i1 (bitconvert (i64 -1))), (KSET1Q)>; + def : Pat<(v32i1(bitconvert(i32 -1))), (KSET1D)>; + def : Pat<(v64i1(bitconvert(i64 -1))), (KSET1Q)>; } // Patterns for kmask insert_subvector/extract_subvector to/from index=0 >From b46db3285690008f1e97561c65aa2f3eccbcbc37 Mon Sep 17 00:00:00 2001 From: ahmed <[email protected]> Date: Mon, 3 Nov 2025 17:24:48 +0200 Subject: [PATCH 3/4] fix: Use poison values for placeholders --- llvm/test/CodeGen/X86/avx512-mask-set-opt.ll | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll index 6a1a0af05d05c..485ffe6ee07b6 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll @@ -39,11 +39,11 @@ define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) { ; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} ; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0 ; AVX512DQBW-NEXT: retq - %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 - %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind - %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float>undef) + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> poison) ret <16 x float> %res } @@ -82,11 +82,11 @@ define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) { ; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} ; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0 ; AVX512DQBW-NEXT: retq - %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 - %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer + %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0 + %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer %sext_ind = sext <16 x i32> %ind to <16 x i64> %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind - %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float>undef) + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> poison) ret <16 x float> %res } >From 25a8351e3af8fd8430e00ee53d97c25c3295163e Mon Sep 17 00:00:00 2001 From: ahmed <[email protected]> Date: Mon, 3 Nov 2025 17:26:43 +0200 Subject: [PATCH 4/4] fix: Update formatting --- clang/include/clang/Basic/DiagnosticLexKinds.td | 14 +++++++------- clang/include/clang/Driver/Options.td | 7 ++++--- llvm/lib/Target/PowerPC/PPCInstrFuture.td | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index 417187222e448..e3796e3637742 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -90,13 +90,13 @@ def err_unterminated___pragma : Error<"missing terminating ')' character">; def err_conflict_marker : Error<"version control conflict marker in file">; -def err_counter_overflow : Error< - "'__COUNTER__' value cannot exceed 2'147'483'647">; -def ext_counter : Extension< - "'__COUNTER__' is a C2y extension">, InGroup<C2y>; -def warn_counter : Warning< - "'__COUNTER__' is incompatible with standards before C2y">, - InGroup<CPre2yCompat>, DefaultIgnore; +def err_counter_overflow + : Error<"'__COUNTER__' value cannot exceed 2'147'483'647">; +def ext_counter : Extension<"'__COUNTER__' is a C2y extension">, InGroup<C2y>; +def warn_counter + : Warning<"'__COUNTER__' is incompatible with standards before C2y">, + InGroup<CPre2yCompat>, + DefaultIgnore; def err_raw_delim_too_long : Error< "raw string delimiter longer than 16 characters" diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 20955ef1b852e..af254bc0a7cf8 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -8445,9 +8445,10 @@ def aligned_alloc_unavailable : Flag<["-"], "faligned-alloc-unavailable">, MarshallingInfoFlag<LangOpts<"AlignedAllocationUnavailable">>, ShouldParseIf<faligned_allocation.KeyPath>; -def finitial_counter_value_EQ : Joined<["-"], "finitial-counter-value=">, - HelpText<"Sets the initial value for __COUNTER__, defaults to 0.">, - MarshallingInfoInt<PreprocessorOpts<"InitialCounterValue">, "0">; +def finitial_counter_value_EQ + : Joined<["-"], "finitial-counter-value=">, + HelpText<"Sets the initial value for __COUNTER__, defaults to 0.">, + MarshallingInfoInt<PreprocessorOpts<"InitialCounterValue">, "0">; } // let Visibility = [CC1Option] diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index 0c2e44e18f463..424f0e06cc3d3 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -362,7 +362,7 @@ let Predicates = [HasVSX, IsISAFuture] in { "lxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; def LXVPB32X : XForm_XTp5_RAB5<31, 877, (outs vsrprc:$XTp), - (ins (memr $RA):$addr, g8rc:$RB), + (ins(memr $RA):$addr, g8rc:$RB), "lxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
