[clang] [llvm] [X86][AVX512] rematerialize smaller predicate masks (PR #166178)

Ahmed Nour via cfe-commits Mon, 03 Nov 2025 07:27:04 -0800

https://github.com/ahmednoursphinx updated 
https://github.com/llvm/llvm-project/pull/166178


>From 3cdf74e3eee0e206a3588acb1b4e53fd66eb9517 Mon Sep 17 00:00:00 2001
From: ahmed <[email protected]>
Date: Mon, 3 Nov 2025 17:16:29 +0200
Subject: [PATCH 1/4] fix: rematerialize smaller predicate masks

---
 llvm/lib/Target/X86/X86InstrAVX512.td        | 25 ++++++
 llvm/lib/Target/X86/X86InstrInfo.cpp         |  6 ++
 llvm/test/CodeGen/X86/avx512-mask-set-opt.ll | 93 ++++++++++++++++++++
 3 files changed, 124 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/avx512-mask-set-opt.ll

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td 
b/llvm/lib/Target/X86/X86InstrAVX512.td
index 1b748b7355716..9fae602974242 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3161,6 +3161,12 @@ multiclass avx512_mask_setop_w<SDPatternOperator Val> {
 defm KSET0 : avx512_mask_setop_w<immAllZerosV>;
 defm KSET1 : avx512_mask_setop_w<immAllOnesV>;
 
+// 8-bit mask set operations for AVX512DQ
+let Predicates = [HasDQI] in {
+  defm KSET0B : avx512_mask_setop<VK8, v8i1, immAllZerosV>;
+  defm KSET1B : avx512_mask_setop<VK8, v8i1, immAllOnesV>;
+}
+
 // With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
 let Predicates = [HasAVX512] in {
   def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
@@ -3173,6 +3179,25 @@ let Predicates = [HasAVX512] in {
   def : Pat<(v1i1 immAllOnesV),  (COPY_TO_REGCLASS (KSET1W), VK1)>;
 }
 
+// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper 
bits
+let Predicates = [HasDQI] in {
+  def : Pat<(v8i1 immAllZerosV), (KSET0B)>;
+  def : Pat<(v8i1 immAllOnesV),  (KSET1B)>;
+}
+
+// Optimize bitconvert of all-ones constants to use kxnor instructions
+let Predicates = [HasDQI] in {
+  def : Pat<(v8i1 (bitconvert (i8 255))), (KSET1B)>;
+  def : Pat<(v16i1 (bitconvert (i16 255))), (COPY_TO_REGCLASS (KSET1B), VK16)>;
+}
+let Predicates = [HasAVX512] in {
+  def : Pat<(v16i1 (bitconvert (i16 65535))), (KSET1W)>;
+}
+let Predicates = [HasBWI] in {
+  def : Pat<(v32i1 (bitconvert (i32 -1))), (KSET1D)>;
+  def : Pat<(v64i1 (bitconvert (i64 -1))), (KSET1Q)>;
+}
+
 // Patterns for kmask insert_subvector/extract_subvector to/from index=0
 multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType 
subVT,
                                              RegisterClass RC, ValueType VT> {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp 
b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 6b2a7a4ec3583..3eadac4f827bc 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -789,9 +789,11 @@ bool X86InstrInfo::isReMaterializableImpl(
   case X86::FsFLD0SS:
   case X86::FsFLD0SH:
   case X86::FsFLD0F128:
+  case X86::KSET0B:
   case X86::KSET0D:
   case X86::KSET0Q:
   case X86::KSET0W:
+  case X86::KSET1B:
   case X86::KSET1D:
   case X86::KSET1Q:
   case X86::KSET1W:
@@ -6352,12 +6354,16 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) 
const {
   // registers, since it is not usable as a write mask.
   // FIXME: A more advanced approach would be to choose the best input mask
   // register based on context.
+  case X86::KSET0B:
+    return Expand2AddrKreg(MIB, get(X86::KXORBkk), X86::K0);
   case X86::KSET0W:
     return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0);
   case X86::KSET0D:
     return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0);
   case X86::KSET0Q:
     return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0);
+  case X86::KSET1B:
+    return Expand2AddrKreg(MIB, get(X86::KXNORBkk), X86::K0);
   case X86::KSET1W:
     return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0);
   case X86::KSET1D:
diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll 
b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
new file mode 100644
index 0000000000000..6a1a0af05d05c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s 
--check-prefixes=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | 
FileCheck %s --check-prefixes=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | 
FileCheck %s --check-prefixes=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-mattr=+avx512f,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQBW
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x 
i1>, <16 x float>)
+
+; Test case 1: v16i1 with all bits set (should use kxnorw on all targets)
+define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) {
+; AVX512F-LABEL: gather_all:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    kxnorw %k0, %k0, %k1
+; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512F-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512DQ-LABEL: gather_all:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    kxnorw %k0, %k0, %k1
+; AVX512DQ-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512DQ-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: gather_all:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    kxnorw %k0, %k0, %k1
+; AVX512BW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQBW-LABEL: gather_all:
+; AVX512DQBW:       # %bb.0:
+; AVX512DQBW-NEXT:    kxnorw %k0, %k0, %k1
+; AVX512DQBW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512DQBW-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512DQBW-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512DQBW-NEXT:    retq
+  %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+  %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x 
ptr> undef, <16 x i32> zeroinitializer
+  %sext_ind = sext <16 x i32> %ind to <16 x i64>
+  %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> 
%sext_ind
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> 
%gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 
true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, 
i1 true, i1 true>, <16 x float>undef)
+  ret <16 x float> %res
+}
+
+; Test case 2: v8i1 with lower 8 bits set (should use kxnorb on AVX512DQ 
targets)
+define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) {
+; AVX512F-LABEL: gather_lower:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    movw $255, %ax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512F-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512DQ-LABEL: gather_lower:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT:    kxnorb %k0, %k0, %k1
+; AVX512DQ-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512DQ-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: gather_lower:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    movw $255, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQBW-LABEL: gather_lower:
+; AVX512DQBW:       # %bb.0:
+; AVX512DQBW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512DQBW-NEXT:    kxnorb %k0, %k0, %k1
+; AVX512DQBW-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512DQBW-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512DQBW-NEXT:    retq
+  %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
+  %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x 
ptr> undef, <16 x i32> zeroinitializer
+  %sext_ind = sext <16 x i32> %ind to <16 x i64>
+  %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> 
%sext_ind
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> 
%gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 
true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 
false, i1 false, i1 false>, <16 x float>undef)
+  ret <16 x float> %res
+}
+
+

>From 4d2cfe334a511eb4e6baa7c98a34ea3e51ecd62d Mon Sep 17 00:00:00 2001
From: ahmed <[email protected]>
Date: Mon, 3 Nov 2025 17:20:20 +0200
Subject: [PATCH 2/4] chore: update formatting

---
 llvm/lib/Target/X86/X86InstrAVX512.td | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td 
b/llvm/lib/Target/X86/X86InstrAVX512.td
index 9fae602974242..8a06296751f0d 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3179,23 +3179,24 @@ let Predicates = [HasAVX512] in {
   def : Pat<(v1i1 immAllOnesV),  (COPY_TO_REGCLASS (KSET1W), VK1)>;
 }
 
-// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper 
bits
+// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper
+// bits
 let Predicates = [HasDQI] in {
   def : Pat<(v8i1 immAllZerosV), (KSET0B)>;
-  def : Pat<(v8i1 immAllOnesV),  (KSET1B)>;
+  def : Pat<(v8i1 immAllOnesV), (KSET1B)>;
 }
 
 // Optimize bitconvert of all-ones constants to use kxnor instructions
 let Predicates = [HasDQI] in {
-  def : Pat<(v8i1 (bitconvert (i8 255))), (KSET1B)>;
-  def : Pat<(v16i1 (bitconvert (i16 255))), (COPY_TO_REGCLASS (KSET1B), VK16)>;
+  def : Pat<(v8i1(bitconvert(i8 255))), (KSET1B)>;
+  def : Pat<(v16i1(bitconvert(i16 255))), (COPY_TO_REGCLASS(KSET1B), VK16)>;
 }
 let Predicates = [HasAVX512] in {
-  def : Pat<(v16i1 (bitconvert (i16 65535))), (KSET1W)>;
+  def : Pat<(v16i1(bitconvert(i16 65535))), (KSET1W)>;
 }
 let Predicates = [HasBWI] in {
-  def : Pat<(v32i1 (bitconvert (i32 -1))), (KSET1D)>;
-  def : Pat<(v64i1 (bitconvert (i64 -1))), (KSET1Q)>;
+  def : Pat<(v32i1(bitconvert(i32 -1))), (KSET1D)>;
+  def : Pat<(v64i1(bitconvert(i64 -1))), (KSET1Q)>;
 }
 
 // Patterns for kmask insert_subvector/extract_subvector to/from index=0

>From b46db3285690008f1e97561c65aa2f3eccbcbc37 Mon Sep 17 00:00:00 2001
From: ahmed <[email protected]>
Date: Mon, 3 Nov 2025 17:24:48 +0200
Subject: [PATCH 3/4] fix: Use poison values for placeholders

---
 llvm/test/CodeGen/X86/avx512-mask-set-opt.ll | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll 
b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
index 6a1a0af05d05c..485ffe6ee07b6 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
@@ -39,11 +39,11 @@ define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, 
i16 %mask) {
 ; AVX512DQBW-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; AVX512DQBW-NEXT:    vmovaps %zmm1, %zmm0
 ; AVX512DQBW-NEXT:    retq
-  %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
-  %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x 
ptr> undef, <16 x i32> zeroinitializer
+  %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
+  %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x 
ptr> poison, <16 x i32> zeroinitializer
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> 
%sext_ind
-  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> 
%gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 
true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, 
i1 true, i1 true>, <16 x float>undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> 
%gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 
true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, 
i1 true, i1 true>, <16 x float> poison)
   ret <16 x float> %res
 }
 
@@ -82,11 +82,11 @@ define <16 x float> @gather_lower(ptr %base, <16 x i32> 
%ind, i16 %mask) {
 ; AVX512DQBW-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; AVX512DQBW-NEXT:    vmovaps %zmm1, %zmm0
 ; AVX512DQBW-NEXT:    retq
-  %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
-  %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x 
ptr> undef, <16 x i32> zeroinitializer
+  %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
+  %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x 
ptr> poison, <16 x i32> zeroinitializer
   %sext_ind = sext <16 x i32> %ind to <16 x i64>
   %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> 
%sext_ind
-  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> 
%gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 
true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 
false, i1 false, i1 false>, <16 x float>undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> 
%gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 
true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 
false, i1 false, i1 false>, <16 x float> poison)
   ret <16 x float> %res
 }
 

>From 25a8351e3af8fd8430e00ee53d97c25c3295163e Mon Sep 17 00:00:00 2001
From: ahmed <[email protected]>
Date: Mon, 3 Nov 2025 17:26:43 +0200
Subject: [PATCH 4/4] fix: Update formatting

---
 clang/include/clang/Basic/DiagnosticLexKinds.td | 14 +++++++-------
 clang/include/clang/Driver/Options.td           |  7 ++++---
 llvm/lib/Target/PowerPC/PPCInstrFuture.td       |  2 +-
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td 
b/clang/include/clang/Basic/DiagnosticLexKinds.td
index 417187222e448..e3796e3637742 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -90,13 +90,13 @@ def err_unterminated___pragma : Error<"missing terminating 
')' character">;
 
 def err_conflict_marker : Error<"version control conflict marker in file">;
 
-def err_counter_overflow : Error<
-  "'__COUNTER__' value cannot exceed 2'147'483'647">;
-def ext_counter : Extension<
-  "'__COUNTER__' is a C2y extension">, InGroup<C2y>;
-def warn_counter : Warning<
-  "'__COUNTER__' is incompatible with standards before C2y">,
-  InGroup<CPre2yCompat>, DefaultIgnore;
+def err_counter_overflow
+    : Error<"'__COUNTER__' value cannot exceed 2'147'483'647">;
+def ext_counter : Extension<"'__COUNTER__' is a C2y extension">, InGroup<C2y>;
+def warn_counter
+    : Warning<"'__COUNTER__' is incompatible with standards before C2y">,
+      InGroup<CPre2yCompat>,
+      DefaultIgnore;
 
 def err_raw_delim_too_long : Error<
   "raw string delimiter longer than 16 characters"
diff --git a/clang/include/clang/Driver/Options.td 
b/clang/include/clang/Driver/Options.td
index 20955ef1b852e..af254bc0a7cf8 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -8445,9 +8445,10 @@ def aligned_alloc_unavailable : Flag<["-"], 
"faligned-alloc-unavailable">,
   MarshallingInfoFlag<LangOpts<"AlignedAllocationUnavailable">>,
   ShouldParseIf<faligned_allocation.KeyPath>;
 
-def finitial_counter_value_EQ : Joined<["-"], "finitial-counter-value=">,
-  HelpText<"Sets the initial value for __COUNTER__, defaults to 0.">,
-  MarshallingInfoInt<PreprocessorOpts<"InitialCounterValue">, "0">;
+def finitial_counter_value_EQ
+    : Joined<["-"], "finitial-counter-value=">,
+      HelpText<"Sets the initial value for __COUNTER__, defaults to 0.">,
+      MarshallingInfoInt<PreprocessorOpts<"InitialCounterValue">, "0">;
 
 } // let Visibility = [CC1Option]
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td 
b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index 0c2e44e18f463..424f0e06cc3d3 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -362,7 +362,7 @@ let Predicates = [HasVSX, IsISAFuture] in {
                                   "lxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>;
     def LXVPB32X
         : XForm_XTp5_RAB5<31, 877, (outs vsrprc:$XTp),
-                          (ins (memr $RA):$addr, g8rc:$RB),
+                          (ins(memr $RA):$addr, g8rc:$RB),
                           "lxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>;
   }
 

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [llvm] [X86][AVX512] rematerialize smaller predicate masks (PR #166178)

Reply via email to