[PATCH] i386: Support APX NF and NDD for imul/mul

2024-07-01 Thread kong lingling
Add some missing APX NF and NDD support for imul and mul.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.

Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.md (*imulhizu): Added APX
NF support.
(*imulhizu): New define_insn.
(*mulsi3_1_zext): Ditto.
(*mul3_1): Ditto.
(*mulqihi3_1): Ditto.
(*mul3_1): Added APX NDD support.
(*mulv4): Ditto.
(*mulvhi4): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-ndd.c: Add test for imul ndd.
---
 gcc/config/i386/i386.md | 98 +
 gcc/testsuite/gcc.target/i386/apx-ndd.c |  8 ++
 2 files changed, 61 insertions(+), 45 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index fd48e764469..c1f29fee412 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -6488,8 +6488,8 @@
 (define_subst_attr "nf_nonf_x64_attr" "nf_subst" "noapx_nf" "x64")

 (define_subst "nf_subst"
-  [(set (match_operand:SWI 0)
-   (match_operand:SWI 1))]
+  [(set (match_operand:SWIDWI 0)
+   (match_operand:SWIDWI 1))]
   ""
   [(set (match_dup 0)
(match_dup 1))
@@ -10028,24 +10028,26 @@
 ;; On BDVER1, all HI MULs use DoublePath

 (define_insn "*mul3_1"
-  [(set (match_operand:SWIM248 0 "register_operand" "=r,r,r")
+  [(set (match_operand:SWIM248 0 "register_operand" "=r,r,r,r")
(mult:SWIM248
- (match_operand:SWIM248 1 "nonimmediate_operand" "%rm,rm,0")
- (match_operand:SWIM248 2 "" "K,,r")))]
+ (match_operand:SWIM248 1 "nonimmediate_operand" "%rm,rm,0,r")
+ (match_operand:SWIM248 2 "" "K,,r,r")))]
   "!(MEM_P (operands[1]) && MEM_P (operands[2]))
&& "
   "@
imul{}\t{%2, %1, %0|%0, %1, %2}
imul{}\t{%2, %1, %0|%0, %1, %2}
-   imul{}\t{%2, %0|%0, %2}"
+   imul{}\t{%2, %0|%0, %2}
+   imul{}\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "imul")
-   (set_attr "prefix_0f" "0,0,1")
+   (set_attr "prefix_0f" "0,0,1,1")
+   (set_attr "isa" "*,*,*,apx_ndd")
(set (attr "athlon_decode")
(cond [(eq_attr "cpu" "athlon")
  (const_string "vector")
   (eq_attr "alternative" "1")
  (const_string "vector")
-  (and (eq_attr "alternative" "2")
+  (and (eq_attr "alternative" "2,3")
(ior (match_test "mode == HImode")
 (match_operand 1 "memory_operand")))
  (const_string "vector")]
@@ -10063,33 +10065,34 @@
(const_string "direct")))
(set_attr "mode" "")])

-(define_insn "*imulhizu"
+(define_insn "*imulhizu"
   [(set (match_operand:SWI48x 0 "register_operand" "=r,r")
(zero_extend:SWI48x
  (mult:HI (match_operand:HI 1 "nonimmediate_operand" "%rm,rm")
-  (match_operand:HI 2 "immediate_operand" "K,n"
-   (clobber (reg:CC FLAGS_REG))]
-  "TARGET_APX_ZU"
+  (match_operand:HI 2 "immediate_operand" "K,n"]
+  "TARGET_APX_ZU && "
   "@
-   imulzu{w}\t{%2, %1, %w0|%w0, %1, %2}
-   imulzu{w}\t{%2, %1, %w0|%w0, %1, %2}"
+   imulzu{w}\t{%2, %1, %w0|%w0, %1, %2}
+   imulzu{w}\t{%2, %1, %w0|%w0, %1, %2}"
   [(set_attr "type" "imul")
(set_attr "mode" "HI")])

-(define_insn "*mulsi3_1_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r,r,r")
+(define_insn "*mulsi3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r,r,r,r")
(zero_extend:DI
- (mult:SI (match_operand:SI 1 "nonimmediate_operand" "%rm,rm,0")
-  (match_operand:SI 2 "x86_64_general_operand"
"K,e,BMr"
-   (clobber (reg:CC FLAGS_REG))]
+ (mult:SI (match_operand:SI 1 "nonimmediate_operand" "%rm,rm,0,r")
+  (match_operand:SI 2 "x86_64_general_operand"
"K,e,BMr,BMr"]
   "TARGET_64BIT
-   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))
+   && "
   "@
-   imul{l}\t{%2, %1, %k0|%k0, %1, %2}
-   imul{l}\t{%2, %1, %k0|%k0, %1, %2}
-   imul{l}\t{%2, %k0|%k0, %2}"
+   imul{l}\t{%2, %1, %k0|%k0, %1, %2}
+   imul{l}\t{%2, %1, %k0|%k0, %1, %2}
+   imul{l}\t{%2, %k0|%k0, %2}
+   imul{l}\t{%2, %1, %k0|%k0, %1, %2}"
   [(set_attr "type" "imul")
-   (set_attr "prefix_0f" "0,0,1")
+   (set_attr "prefix_0f" "0,0,1,1")
+   (set_attr "isa" "*,*,*,apx_ndd")
(set (attr "athlon_decode")
(cond [(eq_attr "cpu" "athlon")
  (const_string "vector")
@@ -10158,30 +10161,32 @@
   [(set (reg:CCO FLAGS_REG)
(eq:CCO (mult:
   (sign_extend:
- (match_operand:SWI48 1 "nonimmediate_operand"
"%rm,0"))
+ (match_operand:SWI48 1 "nonimmediate_operand"
"%rm,0,r"))
   (sign_extend:
- (match_operand:SWI48 2 "x86_64_sext_operand"
"We,mr")))
+ (match_operand:SWI48 2 "x86_64_sext_operand"
"We,mr,mr")))
(sign_extend:
   (mult:SWI48 (match_dup 1) (match_dup 2)
-   (set (match

[PATCH] i386: Remove report error for -mapxf/-muintr with -m32

2024-07-17 Thread kong lingling
Also add a comment listing the CPUID features that are not supported in 32-bit mode.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.

Ready push to trunk.

gcc/ChangeLog:

* config/i386/i386-options.cc (ix86_option_override_internal):
Remove compiler report error for -mapxf or -muintr with -m32.
And just disable them in 32-bit code.
* config/i386/i386.opt: Add a comment to the target help for the
-m32 flag, listing CPUID features like APX that are not supported
in 32-bit mode.
* doc/invoke.texi: Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-1.c: Removed error test.
---
 gcc/config/i386/i386-options.cc   | 26 --
 gcc/config/i386/i386.opt  |  3 ++-
 gcc/doc/invoke.texi   |  4 +++-
 gcc/testsuite/gcc.target/i386/apx-1.c |  1 -
 4 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386-options.cc
b/gcc/config/i386/i386-options.cc
index 059ef3ae6ad..3afc5107b69 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2142,16 +2142,22 @@ ix86_option_override_internal (bool main_args_p,
   opts->x_ix86_stringop_alg = no_stringop;
 }

-  if (TARGET_APX_F_P (opts->x_ix86_isa_flags2)
-  && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
-error ("%<-mapxf%> is not supported for 32-bit code");
-  else if (opts->x_ix86_apx_features != apx_none
-  && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
-error ("%<-mapx-features=%> option is not supported for 32-bit code");
-
-  if (TARGET_UINTR_P (opts->x_ix86_isa_flags2)
-  && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
-error ("%<-muintr%> not supported for 32-bit code");
+  /* Some features like apx are not available in 32-bit code.  */
+  if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
+{
+  opts->x_ix86_apx_features = apx_none;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_APX_F;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_TILE;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_INT8;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_BF16;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_FP16;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_COMPLEX;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_CMPCCXADD;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_CX16;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_PREFETCHI;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_UINTR;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_USER_MSR;
+}

   if (ix86_lam_type && !TARGET_LP64_P (opts->x_ix86_isa_flags))
 error ("%<-mlam=%> option: [u48|u57] not supported for 32-bit code");
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 353fffb2343..4f9110f05f9 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -667,7 +667,8 @@ Optimize noreturn functions by not saving callee-saved
registers used in the fun

 m32
 Target RejectNegative Negative(m64) InverseMask(ISA_64BIT)
Var(ix86_isa_flags) Save
-Generate 32bit i386 code.
+Generate 32bit i386 code. mapxf, mamx, mcx16, mcmpccxadd, mprefetchi,
muintr and
+musermsr are not supported in 32-bit mode.

 m64
 Target RejectNegative Negative(mx32) Mask(ABI_64) Var(ix86_isa_flags) Save
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 403ea9da1ab..41410b9ee76 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -36427,7 +36427,9 @@ on x86-64 processors in 64-bit environments.
 Generate code for a 16-bit, 32-bit or 64-bit environment.
 The @option{-m32} option sets @code{int}, @code{long}, and pointer types
 to 32 bits, and
-generates code that runs in 32-bit mode.
+generates code that runs in 32-bit mode.  The @option{-mapxf},
@option{-mamx},
+@option{-mcx16}, @option{-mcmpccxadd}, @option{-mprefetchi},
@option{-muintr}
+and @option{-musermsr} are not supported in 32-bit mode.

 The @option{-m64} option sets @code{int} to 32 bits and @code{long} and
pointer
 types to 64 bits, and generates code for the x86-64 architecture.
diff --git a/gcc/testsuite/gcc.target/i386/apx-1.c
b/gcc/testsuite/gcc.target/i386/apx-1.c
index 4e580ecdf37..3853534cd84 100644
--- a/gcc/testsuite/gcc.target/i386/apx-1.c
+++ b/gcc/testsuite/gcc.target/i386/apx-1.c
@@ -1,6 +1,5 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mapxf" } */
-/* { dg-error "'-mapxf' is not supported for 32-bit code" "" { target ia32
} 0 } */

 void
 apx_hanlder ()
-- 
2.31.1


RE: [PATCH] i386: Remove report error for -mapxf/-muintr with -m32

2024-07-17 Thread Kong, Lingling
On Thu, Jul 18, 2024, 10:00 AM kong lingling 
mailto:lingling.ko...@gmail.com>> wrote:
Also add a comment listing the CPUID features that are not supported in 32-bit mode.


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.

Ready push to trunk.

gcc/ChangeLog:

* config/i386/i386-options.cc (ix86_option_override_internal):
Remove compiler report error for -mapxf or -muintr with -m32.
And just disable them in 32-bit code.
* config/i386/i386.opt: Add a comment to the target help for the
-m32 flag, listing CPUID features like APX that are not supported
in 32-bit mode.
* doc/invoke.texi: Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-1.c: Removed error test.
---
 gcc/config/i386/i386-options.cc   | 26 --
 gcc/config/i386/i386.opt  |  3 ++-
 gcc/doc/invoke.texi   |  4 +++-
 gcc/testsuite/gcc.target/i386/apx-1.c |  1 -
 4 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 059ef3ae6ad..3afc5107b69 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2142,16 +2142,22 @@ ix86_option_override_internal (bool main_args_p,
   opts->x_ix86_stringop_alg = no_stringop;
 }

-  if (TARGET_APX_F_P (opts->x_ix86_isa_flags2)
-  && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
-error ("%<-mapxf%> is not supported for 32-bit code");
-  else if (opts->x_ix86_apx_features != apx_none
-  && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
-error ("%<-mapx-features=%> option is not supported for 32-bit code");
-
-  if (TARGET_UINTR_P (opts->x_ix86_isa_flags2)
-  && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
-error ("%<-muintr%> not supported for 32-bit code");
+  /* Some features like apx are not available in 32-bit code.  */
+  if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
+{
+  opts->x_ix86_apx_features = apx_none;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_APX_F;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_TILE;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_INT8;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_BF16;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_FP16;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AMX_COMPLEX;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_CMPCCXADD;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_CX16;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_PREFETCHI;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_UINTR;
+  opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_USER_MSR;
+}

   if (ix86_lam_type && !TARGET_LP64_P (opts->x_ix86_isa_flags))
 error ("%<-mlam=%> option: [u48|u57] not supported for 32-bit code");
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 353fffb2343..4f9110f05f9 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -667,7 +667,8 @@ Optimize noreturn functions by not saving callee-saved 
registers used in the fun

 m32
 Target RejectNegative Negative(m64) InverseMask(ISA_64BIT) Var(ix86_isa_flags) 
Save
-Generate 32bit i386 code.
+Generate 32bit i386 code. mapxf, mamx, mcx16, mcmpccxadd, mprefetchi, muintr 
and
+musermsr are not supported in 32-bit mode.

 m64
 Target RejectNegative Negative(mx32) Mask(ABI_64) Var(ix86_isa_flags) Save
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 403ea9da1ab..41410b9ee76 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -36427,7 +36427,9 @@ on x86-64 processors in 64-bit environments.
 Generate code for a 16-bit, 32-bit or 64-bit environment.
 The @option{-m32} option sets @code{int}, @code{long}, and pointer types
 to 32 bits, and
-generates code that runs in 32-bit mode.
+generates code that runs in 32-bit mode.  The @option{-mapxf}, @option{-mamx},
+@option{-mcx16}, @option{-mcmpccxadd}, @option{-mprefetchi}, @option{-muintr}
+and @option{-musermsr} are not supported in 32-bit mode.

 The @option{-m64} option sets @code{int} to 32 bits and @code{long} and pointer
 types to 64 bits, and generates code for the x86-64 architecture.
diff --git a/gcc/testsuite/gcc.target/i386/apx-1.c 
b/gcc/testsuite/gcc.target/i386/apx-1.c
index 4e580ecdf37..3853534cd84 100644
--- a/gcc/testsuite/gcc.target/i386/apx-1.c
+++ b/gcc/testsuite/gcc.target/i386/apx-1.c
@@ -1,6 +1,5 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mapxf" } */
-/* { dg-error "'-mapxf' is not supported for 32-bit code" "" { target ia32 } 0 
} */

This looks odd.  Please open a GCC bug if there is a real issue.

Thanks.

Yes, I open a bug in  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115978.

 void
 apx_hanlder ()
--
2.31.1



[PATCH] x86: Don't enable APX_F in 32-bit mode.

2024-07-18 Thread Kong, Lingling
I adjusted my patch based on the comments by H.J.
I will also add a testcase like gcc.target/i386/pr101395-1.c once the -march
value for APX is determined.

Ok for trunk?

Thanks,
Lingling

gcc/ChangeLog:

PR target/115978
* config/i386/driver-i386.cc (host_detect_local_cpu): Enable
APX_F only for 64-bit codegen.
* config/i386/i386-options.cc (DEF_PTA): Skip PTA_APX_F if
not in 64-bit mode.

gcc/testsuite/ChangeLog:

PR target/115978
* gcc.target/i386/pr115978-1.c: New test.
* gcc.target/i386/pr115978-2.c: Ditto.
---
 gcc/config/i386/driver-i386.cc |  3 ++-
 gcc/config/i386/i386-options.cc|  3 ++-
 gcc/testsuite/gcc.target/i386/pr115978-1.c | 22 ++ 
 gcc/testsuite/gcc.target/i386/pr115978-2.c |  6 ++
 4 files changed, 32 insertions(+), 2 deletions(-) 
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115978-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115978-2.c

diff --git a/gcc/config/i386/driver-i386.cc b/gcc/config/i386/driver-i386.cc 
index 11470eaea12..445f5640155 100644
--- a/gcc/config/i386/driver-i386.cc
+++ b/gcc/config/i386/driver-i386.cc
@@ -900,7 +900,8 @@ const char *host_detect_local_cpu (int argc, const char 
**argv)
if (has_feature (isa_names_table[i].feature))
  {
if (codegen_x86_64
-   || isa_names_table[i].feature != FEATURE_UINTR)
+   || (isa_names_table[i].feature != FEATURE_UINTR
+   && isa_names_table[i].feature != FEATURE_APX_F))
  options = concat (options, " ",
isa_names_table[i].option, NULL);
  }
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc 
index 059ef3ae6ad..1c8f7835af2 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2351,7 +2351,8 @@ ix86_option_override_internal (bool main_args_p,  #define 
DEF_PTA(NAME) \
if (((processor_alias_table[i].flags & PTA_ ## NAME) != 0) \
&& PTA_ ## NAME != PTA_64BIT \
-   && (TARGET_64BIT || PTA_ ## NAME != PTA_UINTR) \
+   && (TARGET_64BIT || (PTA_ ## NAME != PTA_UINTR \
+&& PTA_ ## NAME != PTA_APX_F))\
&& !TARGET_EXPLICIT_ ## NAME ## _P (opts)) \
  SET_TARGET_ ## NAME (opts);
 #include "i386-isa.def"
diff --git a/gcc/testsuite/gcc.target/i386/pr115978-1.c 
b/gcc/testsuite/gcc.target/i386/pr115978-1.c
new file mode 100644
index 000..18a1c5f153a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115978-1.c
@@ -0,0 +1,22 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -march=native" } */
+
+int
+main ()
+{
+  if (__builtin_cpu_supports ("apxf"))
+{
+#ifdef __x86_64__
+# ifndef __APX_F__
+  __builtin_abort ();
+# endif
+#else
+# ifdef __APX_F__
+  __builtin_abort ();
+# endif
+#endif
+  return 0;
+}
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr115978-2.c 
b/gcc/testsuite/gcc.target/i386/pr115978-2.c
new file mode 100644
index 000..900d6eb096a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115978-2.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=native -mno-apxf" } */
+
+#ifdef __APX_F__
+# error APX_F should be disabled
+#endif
--
2.31.1



[PATCH 1/8] [APX NF]: Support APX NF add

2024-05-15 Thread Kong, Lingling
From: Hongyu Wang 

The APX NF (no flags) feature suppresses the update of the status flags for
arithmetic operations.

For NF add, it is not clear whether NF add can be faster than LEA. If so, the
pattern needs to be adjusted to prefer LEA generation.

gcc/ChangeLog:

* config/i386/i386-opts.h (enum apx_features): Add nf
enumeration.
* config/i386/i386.h (TARGET_APX_NF): New.
* config/i386/i386.md (*add_1_nf): New define_insn.
* config/i386/i386.opt: Add apx_nf enumeration.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-ndd.c: Fixed test.
* gcc.target/i386/apx-nf.c: New test.

Co-authored-by: Lingling Kong 

Bootstrapped and regtested on x86_64-linux-gnu. SPEC 2017 also ran
normally on the Intel Software Development Emulator.
Ok for trunk?

---
 gcc/config/i386/i386-opts.h |  3 +-
 gcc/config/i386/i386.h  |  1 +
 gcc/config/i386/i386.md | 42 +
 gcc/config/i386/i386.opt|  3 ++
 gcc/testsuite/gcc.target/i386/apx-ndd.c |  2 +-
 gcc/testsuite/gcc.target/i386/apx-nf.c  |  6 
 6 files changed, 55 insertions(+), 2 deletions(-)  create mode 100644 
gcc/testsuite/gcc.target/i386/apx-nf.c

diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h index 
ef2825803b3..60176ce609f 100644
--- a/gcc/config/i386/i386-opts.h
+++ b/gcc/config/i386/i386-opts.h
@@ -140,7 +140,8 @@ enum apx_features {
   apx_push2pop2 = 1 << 1,
   apx_ndd = 1 << 2,
   apx_ppx = 1 << 3,
-  apx_all = apx_egpr | apx_push2pop2 | apx_ndd | apx_ppx,
+  apx_nf = 1<< 4,
+  apx_all = apx_egpr | apx_push2pop2 | apx_ndd | apx_ppx | apx_nf,
 };
 
 #endif
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 
529edff93a4..f20ae4726da 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -55,6 +55,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see  #define TARGET_APX_PUSH2POP2 (ix86_apx_features & apx_push2pop2)  
#define TARGET_APX_NDD (ix86_apx_features & apx_ndd)  #define TARGET_APX_PPX 
(ix86_apx_features & apx_ppx)
+#define TARGET_APX_NF (ix86_apx_features & apx_nf)
 
 #include "config/vxworks-dummy.h"
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
764bfe20ff2..4a9e35c4990 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -6233,6 +6233,48 @@
 }
 })
 

+;; NF instructions.
+
+(define_insn "*add_1_nf"
+  [(set (match_operand:SWI 0 "nonimmediate_operand" "=rm,rje,r,r,r,r,r,r")
+   (plus:SWI
+ (match_operand:SWI 1 "nonimmediate_operand" "%0,0,0,r,r,rje,jM,r")
+ (match_operand:SWI 2 "x86_64_general_operand" 
+"r,e,BM,0,le,r,e,BM")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (PLUS, mode, operands,
+   TARGET_APX_NDD)"
+{
+  bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
+  if (which_alternative == 3)
+  std::swap (operands[1], operands[2]);
+
+  if (operands[2] == const1_rtx)
+return use_ndd
+ ? "%{nf%} inc{}\t{%1, %0|%0, %1}"
+ : "%{nf%} inc{}\t{%0|%0}";
+
+  if (operands[2] == constm1_rtx)
+return use_ndd
+ ? "%{nf%} dec{}\t{%1, %0|%0, %1}"
+ : "%{nf%} dec{}\t{%0|%0}";
+
+  return use_ndd
+? "%{nf%} add{}\t{%2, %1, %0|%0, %1, %2}"
+: "%{nf%} add{}\t{%2, %0|%0, %2}"; }
+  [(set_attr "isa" "*,*,*,*,apx_ndd,apx_ndd,apx_ndd,apx_ndd")
+   (set (attr "type")
+ (cond [(eq_attr "alternative" "4")
+  (const_string "lea")
+  ]
+  (const_string "alu")))
+   (set (attr "length_immediate")
+  (if_then_else
+   (and (eq_attr "type" "alu") (match_operand 2 "const128_operand"))
+   (const_string "1")
+   (const_string "*")))
+   (set_attr "mode" "")])
+
 ;; Load effective address instructions
 
 (define_insn "*lea"
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 
d5f793a9e8b..66021d59d4e 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1356,6 +1356,9 @@ Enum(apx_features) String(ndd) Value(apx_ndd) Set(4)  
EnumValue
 Enum(apx_features) String(ppx) Value(apx_ppx) Set(5)
 
+EnumValue
+Enum(apx_features) String(nf) Value(apx_nf) Set(6)
+
 EnumValue
 Enum(apx_features) String(all) Value(apx_all) Set(1)
 
diff --git a/gcc/testsuite/gcc.target/i386/apx-ndd.c 
b/gcc/testsuite/gcc.target/i386/apx-ndd.c
index 0eb751ad225..0ff4df0780c 100644
--- a/gcc/testsuite/gcc.target/i386/apx-ndd.c
+++ b/gcc/testsuite/gcc.target/i386/apx-ndd.c
@@ -1,5 +1,5 @@
 /* { dg-do compile { target { ! ia32 } } } */
-/* { dg-options "-mapxf -march=x86-64 -O2" } */
+/* { dg-options "-mapx-features=egpr,push2pop2,ndd,ppx -march=x86-64 
+-O2" } */
 /* { dg-final { scan-assembler-not "movl"} } */
 
 #include 
diff --git a/gcc/testsuite/gcc.target/i386/apx-nf.c 
b/gcc/testsuite/gcc.target/i386/apx-nf.c
new file mode 100644
index 000..3adc7a27902
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/apx-nf.c
@@ -0,0 +1,6 @@
+/* { dg

[PATCH 2/8] [APX NF] Support APX NF for {sub/and/or/xor/neg}

2024-05-15 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*sub_1_nf): New define_insn.
(*anddi_1_nf): Ditto.
(*and_1_nf): Ditto.
(*qi_1_nf): Ditto.
(*_1_nf): Ditto.
(*neg_1_nf): Ditto.
* config/i386/sse.md : New define_split.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-nf.c: Add test.
---
 gcc/config/i386/i386.md| 129 +
 gcc/config/i386/sse.md |  11 +++
 gcc/testsuite/gcc.target/i386/apx-nf.c |   9 ++
 3 files changed, 149 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
4a9e35c4990..66dc5e1035f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -7888,6 +7888,24 @@
   "split_double_mode (mode, &operands[0], 2, &operands[0], &operands[3]);"
 [(set_attr "isa" "*,*,apx_ndd,apx_ndd")])
 
+(define_insn "*sub_1_nf"
+  [(set (match_operand:SWI 0 "nonimmediate_operand" "=m,rjM,,r,r,r")
+   (minus:SWI
+ (match_operand:SWI 1 "nonimmediate_operand" "0,0,0,rm,rjM,r")
+ (match_operand:SWI 2 "" ",,,r,,")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (MINUS, mode, operands, TARGET_APX_NDD)"
+  "@
+  %{nf%} sub{}\t{%2, %0|%0, %2}
+  %{nf%} sub{}\t{%2, %0|%0, %2}
+  %{nf%} sub{}\t{%2, %0|%0, %2}
+  %{nf%} sub{}\t{%2, %1, %0|%0, %1, %2}
+  %{nf%} sub{}\t{%2, %1, %0|%0, %1, %2}
+  %{nf%} sub{}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*,*,*,apx_ndd,apx_ndd,apx_ndd")
+   (set_attr "type" "alu")
+   (set_attr "mode" "")])
+
 (define_insn "*sub_1"
   [(set (match_operand:SWI 0 "nonimmediate_operand" "=m,,r,r,r")
(minus:SWI
@@ -11790,6 +11808,27 @@
 }
 [(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd,apx_ndd_64,apx_ndd")])
 
+(define_insn "*anddi_1_nf"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,rm,rjM,r,r,r,r,?k")
+   (and:DI
+(match_operand:DI 1 "nonimmediate_operand" "%0,r,0,0,0,rm,rjM,r,k")
+(match_operand:DI 2 "x86_64_szext_general_operand" 
+"Z,Z,r,e,m,r,e,m,k")))]
+  "TARGET_APX_NF
+   && ix86_binary_operator_ok (AND, DImode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} and{l}\t{%k2, %k0|%k0, %k2}
+   %{nf%} and{l}\t{%k2, %k1, %k0|%k0, %k1, %k2}
+   %{nf%} and{q}\t{%2, %0|%0, %2}
+   %{nf%} and{q}\t{%2, %0|%0, %2}
+   %{nf%} and{q}\t{%2, %0|%0, %2}
+   %{nf%} and{q}\t{%2, %1, %0|%0, %1, %2}
+   %{nf%} and{q}\t{%2, %1, %0|%0, %1, %2}
+   %{nf%} and{q}\t{%2, %1, %0|%0, %1, %2}
+   #"
+  [(set_attr "isa" "*,apx_ndd,*,*,*,apx_ndd,apx_ndd,apx_ndd,avx512bw")
+   (set_attr "type" "alu,alu,alu,alu,alu,alu,alu,alu,msklog")
+   (set_attr "mode" "SI,SI,DI,DI,DI,DI,DI,DI,DI")])
+
 (define_insn "*anddi_1"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,rm,r,r,r,r,r,?k")
(and:DI
@@ -11889,6 +11928,33 @@
(set_attr "isa" "*,apx_ndd,apx_ndd,apx_ndd")
(set_attr "mode" "SI")])
 
+(define_insn "*and_1_nf"
+  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,rjM,r,r,r,r,?k")
+   (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" 
"%0,0,0,rm,rjM,r,k")
+  (match_operand:SWI24 2 "" 
+"r,,,r,,,k")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (AND, mode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} and{}\t{%2, %0|%0, %2}
+   %{nf%} and{}\t{%2, %0|%0, %2}
+   %{nf%} and{}\t{%2, %0|%0, %2}
+   %{nf%} and{}\t{%2, %1, %0|%0, %1, %2}
+   %{nf%} and{}\t{%2, %1, %0|%0, %1, %2}
+   %{nf%} and{}\t{%2, %1, %0|%0, %1, %2}
+   #"
+  [(set (attr "isa")
+   (cond [(eq_attr "alternative" "3,4,5")
+(const_string "apx_ndd")
+  (eq_attr "alternative" "6")
+(if_then_else (eq_attr "mode" "SI")
+  (const_string "avx512bw")
+  (const_string "avx512f"))
+ ]
+ (const_string "*")))
+   (set_attr "type" "alu,alu,alu,alu,alu,alu,msklog")
+   (set_attr "type" "alu")
+   (set_attr "mode" "")])
+
 (define_insn "*and_1"
   [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,r,r,r,Ya,?k")
(and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" 
"%0,0,rm,rjM,r,qm,k") @@ -11923,6 +11989,37 @@
(const_string "*")))
(set_attr "mode" ",SI,")])
 
+;; NF for and,or,xor
+
+(define_insn "*qi_1_nf"
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,r,r,?k")
+   (any_logic:QI (match_operand:QI 1 "nonimmediate_operand" 
"%0,0,0,rm,r,k")
+  (match_operand:QI 2 "general_operand" "qn,m,rn,rn,m,k")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (, QImode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} {b}\t{%2, %0|%0, %2}
+   %{nf%} {b}\t{%2, %0|%0, %2}
+   %{nf%} {l}\t{%k2, %k0|%k0, %k2}
+   %{nf%} {b}\t{%2, %1, %0|%0, %1, %2}
+   %{nf%} {b}\t{%2, %1, %0|%0, %1, %2}
+   #"
+  [(set_attr "isa" "*,*,*,apx_ndd,apx_ndd,avx512f")
+   (set_attr "type" "alu,alu,alu,alu,alu,msklog")
+   (set (attr "mode")
+   (cond [(eq_attr "alternative" "2")
+(const_string "SI")
+   (and (eq_attr "alternative" "5")
+   

[PATCH 3/8] [APX NF] Support APX NF for left shift insns

2024-05-15 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*ashl3_1_nf): New.
(*ashlhi3_1_nf): Ditto.
(*ashlqi3_1_nf): Ditto.
* config/i386/sse.md: New define_split.
---
 gcc/config/i386/i386.md | 175 
 gcc/config/i386/sse.md  |  13 +++
 2 files changed, 188 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
66dc5e1035f..9ffdb3fe71a 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -15126,6 +15126,54 @@
   [(set_attr "type" "ishiftx")
(set_attr "mode" "")])
 
+(define_insn "*ashl3_1_nf"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,?k,r")
+   (ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" 
"0,l,rm,k,rm")
+ (match_operand:QI 2 "nonmemory_operand" 
+"c,M,r,,c")))]
+  "TARGET_APX_NF &&
+  ix86_binary_operator_ok (ASHIFT, mode, operands, TARGET_APX_NDD)"
+{
+  bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
+  switch (get_attr_type (insn))
+{
+case TYPE_LEA:
+case TYPE_ISHIFTX:
+case TYPE_MSKLOG:
+  return "#";
+
+case TYPE_ALU:
+  gcc_assert (operands[2] == const1_rtx);
+  gcc_assert (rtx_equal_p (operands[0], operands[1]));
+  return "%{nf%} add{}\t%0, %0";
+
+default:
+  return use_ndd ? "%{nf%} sal{}\t{%2, %1, %0|%0, %1, %2}"
+: "%{nf%} sal{}\t{%2, %0|%0, %2}";
+}
+}
+  [(set_attr "isa" "*,*,bmi2,avx512bw,apx_ndd")
+   (set (attr "type")
+ (cond [(eq_attr "alternative" "1")
+ (const_string "lea")
+   (eq_attr "alternative" "2")
+ (const_string "ishiftx")
+   (eq_attr "alternative" "4")
+ (const_string "ishift")
+(and (and (match_test "TARGET_DOUBLE_WITH_ADD")
+ (match_operand 0 "register_operand"))
+(match_operand 2 "const1_operand"))
+ (const_string "alu")
+   (eq_attr "alternative" "3")
+ (const_string "msklog")
+  ]
+  (const_string "ishift")))
+   (set (attr "length_immediate")
+ (if_then_else
+   (eq_attr "type" "alu")
+   (const_string "0")
+   (const_string "*")))
+   (set_attr "mode" "")])
+
 (define_insn "*ashl3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,?k,r")
(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" 
"0,l,rm,k,rm") @@ -15187,6 +15235,17 @@
(set_attr "mode" "")])
 
 ;; Convert shift to the shiftx pattern to avoid flags dependency.
+;; For NF/NDD doesn't support shift count as r, it just support c, 
+;; but it has no flag.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+   (ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+ (match_operand:QI 2 "register_operand")))]
+  "TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+   (ashift:SWI48 (match_dup 1) (match_dup 2)))]
+  "operands[2] = gen_lowpart (mode, operands[2]);")
+
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand") @@ 
-15273,6 +15332,50 @@
(zero_extend:DI (ashift:SI (match_dup 1) (match_dup 2]
   "operands[2] = gen_lowpart (SImode, operands[2]);")
 
+(define_insn "*ashlhi3_1_nf"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,Yp,?k,r")
+   (ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0,l,k,rm")
+  (match_operand:QI 2 "nonmemory_operand" "cI,M,Ww,cI")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (ASHIFT, HImode, operands, TARGET_APX_NDD)"
+{
+  bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
+  switch (get_attr_type (insn))
+{
+case TYPE_LEA:
+case TYPE_MSKLOG:
+  return "#";
+
+case TYPE_ALU:
+  gcc_assert (operands[2] == const1_rtx);
+  return "%{nf%} add{w}\t%0, %0";
+
+default:
+  return use_ndd ? "%{nf%} sal{w}\t{%2, %1, %0|%0, %1, %2}"
+: "%{nf%} sal{w}\t{%2, %0|%0, %2}";
+}
+}
+  [(set_attr "isa" "*,*,avx512f,apx_ndd")
+   (set (attr "type")
+ (cond [(eq_attr "alternative" "1")
+ (const_string "lea")
+   (eq_attr "alternative" "2")
+ (const_string "msklog")
+   (eq_attr "alternative" "3")
+ (const_string "ishift")
+(and (and (match_test "TARGET_DOUBLE_WITH_ADD")
+ (match_operand 0 "register_operand"))
+(match_operand 2 "const1_operand"))
+ (const_string "alu")
+  ]
+  (const_string "ishift")))
+   (set (attr "length_immediate")
+ (if_then_else
+   (eq_attr "type" "alu")
+   (const_string "0")
+   (const_string "*")))
+   (set_attr "mode" "HI,SI,HI,HI")])
+
 (define_insn "*ashlhi3_1"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,Yp,?k,r")
(ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0,l,k,rm") @@ 
-15326,6 +15429,61 @@
(const_string "*"))

[PATCH 4/8] [APX NF] Support APX NF for right shift insns

2024-05-15 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*ashr3_1_nf): New.
(*lshr3_1_nf): Ditto.
(*lshrqi3_1_nf): Ditto.
(*lshrhi3_1_nf): Ditto.
---
 gcc/config/i386/i386.md | 85 +
 1 file changed, 85 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
9ffdb3fe71a..adcb09fcdd0 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16569,6 +16569,21 @@
   [(set_attr "type" "ishiftx")
(set_attr "mode" "")])
 
+(define_insn "*ashr3_1_nf"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
+   (ashiftrt:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,rm")
+ (match_operand:QI 2 "nonmemory_operand" "c,r,c")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} sar{}\t{%2, %0|%0, %2}
+   #
+   %{nf%} sar{}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*,bmi2,apx_ndd")
+   (set_attr "type" "ishift,ishiftx,ishift")
+   (set_attr "mode" "")])
+
 (define_insn "*ashr3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
(ashiftrt:SWI48
@@ -16630,6 +16645,21 @@
 }
 [(set_attr "isa" "*,*,*,apx_ndd")])
 
+(define_insn "*lshr3_1_nf"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,?k,r")
+   (lshiftrt:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,k,rm")
+ (match_operand:QI 2 "nonmemory_operand" "c,r,,c")))]
+  "TARGET_APX_NF &&
+  ix86_binary_operator_ok (LSHIFTRT, mode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} shr{}\t{%2, %0|%0, %2}
+   #
+   #
+   %{nf%} shr{}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*,bmi2,avx512bw,apx_ndd")
+   (set_attr "type" "ishift,ishiftx,msklog,ishift")
+   (set_attr "mode" "")])
 
 (define_insn "*lshr3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,?k,r") @@ 
-16669,6 +16699,17 @@
(set_attr "mode" "")])
 
 ;; Convert shift to the shiftx pattern to avoid flags dependency.
+;; NF/NDD does not support the shift count in a register (r), only in cl (c),
+;; but it has no flags.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+   (any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+  (match_operand:QI 2 "register_operand")))]
+  "TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+   (any_shiftrt:SWI48 (match_dup 1) (match_dup 2)))]
+  "operands[2] = gen_lowpart (mode, operands[2]);")
+
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
(any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand") @@ 
-16737,6 +16778,20 @@
(zero_extend:DI (any_shiftrt:SI (match_dup 1) (match_dup 2]
   "operands[2] = gen_lowpart (SImode, operands[2]);")
 
+(define_insn "*ashr3_1_nf"
+  [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m, r")
+   (ashiftrt:SWI12
+ (match_operand:SWI12 1 "nonimmediate_operand" "0, rm")
+ (match_operand:QI 2 "nonmemory_operand" "c, c")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} sar{}\t{%2, %0|%0, %2}
+   %{nf%} sar{}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*, apx_ndd")
+   (set_attr "type" "ishift")
+   (set_attr "mode" "")])
+
 (define_insn "*ashr3_1"
   [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m, r")
(ashiftrt:SWI12
@@ -16765,6 +16820,21 @@
(const_string "*")))
(set_attr "mode" "")])
 
+(define_insn "*lshrqi3_1_nf"
+  [(set (match_operand:QI 0 "nonimmediate_operand"  "=qm,?k,r")
+   (lshiftrt:QI
+ (match_operand:QI 1 "nonimmediate_operand" "0,k,rm")
+ (match_operand:QI 2 "nonmemory_operand""cI,Wb,cI")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (LSHIFTRT, QImode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} shr{b}\t{%2, %0|%0, %2}
+   #
+   %{nf%} shr{b}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*,avx512dq,apx_ndd")
+   (set_attr "type" "ishift,msklog,ishift")
+   (set_attr "mode" "QI")])
+
 (define_insn "*lshrqi3_1"
   [(set (match_operand:QI 0 "nonimmediate_operand"  "=qm,?k,r")
(lshiftrt:QI
@@ -16802,6 +16872,21 @@
(const_string "*")))
(set_attr "mode" "QI")])
 
+(define_insn "*lshrhi3_1_nf"
+  [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,?k,r")
+   (lshiftrt:HI
+ (match_operand:HI 1 "nonimmediate_operand" "0,k,rm")
+ (match_operand:QI 2 "nonmemory_operand" "cI,Ww,cI")))]
+  "TARGET_APX_NF &&
+   ix86_binary_operator_ok (LSHIFTRT, HImode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} shr{w}\t{%2, %0|%0, %2}
+   #
+   %{nf%} shr{w}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*, avx512f, apx_ndd")
+   (set_attr "type" "ishift,msklog,ishift")
+   (set_attr "mode" "HI")])
+
 (define_insn "*lshrhi3_1"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=rm, ?k, r")
(lshiftrt:HI
--
2.31.1



[PATCH 5/8] [APX NF] Support APX NF for rotate insns

2024-05-15 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (ashr3_cvt_nf): New define_insn.
(*3_1_nf): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-nf.c: Add NF test for rotate insns.
---
 gcc/config/i386/i386.md| 80 ++
 gcc/testsuite/gcc.target/i386/apx-nf.c |  5 ++
 2 files changed, 85 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
adcb09fcdd0..ff44154b26b 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16491,6 +16491,25 @@
 (define_mode_attr cvt_mnemonic
   [(SI "{cltd|cdq}") (DI "{cqto|cqo}")])
 
+(define_insn "ashr3_cvt_nf"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r")
+   (ashiftrt:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "0,rm")
+ (match_operand:QI 2 "const_int_operand")))]
+  "TARGET_APX_NF &&
+   INTVAL (operands[2]) == GET_MODE_BITSIZE (mode)-1
+   && (TARGET_USE_CLTD || optimize_function_for_size_p (cfun))
+   && ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+  "@
+   %{nf%} sar{}\t{%2, %0|%0, %2}
+   %{nf%} sar{}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*,apx_ndd")
+   (set_attr "type" "ishift")
+   (set_attr "prefix_0f" "*")
+   (set_attr "length_immediate" "*")
+   (set_attr "modrm" "1")
+   (set_attr "mode" "")])
+
 (define_insn "ashr3_cvt"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=*d,rm,r")
(ashiftrt:SWI48
@@ -17430,6 +17449,39 @@
   [(set_attr "type" "rotatex")
(set_attr "mode" "")])
 
+(define_insn "*3_1_nf"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r")
+   (any_rotate:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "0,rm")
+ (match_operand:QI 2 "nonmemory_operand" "c,c")))]
+  "TARGET_APX_NF &&
+  ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)"
+{
+  bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
+  if (operands[2] == const1_rtx
+  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
+  && !use_ndd)
+return "%{nf%} {}\t%0";
+  else
+return use_ndd ? "%{nf%} {}\t{%2, %1, %0|%0, %1, %2}"
+  : "%{nf%} {}\t{%2, %0|%0, %2}"; }
+  [(set_attr "isa" "*,apx_ndd")
+   (set_attr "type" "rotate")
+   (set (attr "preferred_for_size")
+ (cond [(eq_attr "alternative" "0")
+ (symbol_ref "true")]
+  (symbol_ref "false")))
+   (set (attr "length_immediate")
+ (if_then_else
+   (and (eq_attr "type" "rotate")
+   (and (match_operand 2 "const1_operand")
+(ior (match_test "TARGET_SHIFT1")
+ (match_test "optimize_function_for_size_p (cfun)"
+   (const_string "0")
+   (const_string "*")))
+   (set_attr "mode" "")])
+
 (define_insn "*3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
(any_rotate:SWI48
@@ -17572,6 +17624,34 @@
   [(set (match_dup 0)
(zero_extend:DI (rotatert:SI (match_dup 1) (match_dup 2])
 
+(define_insn "*3_1_nf"
+  [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m,r")
+   (any_rotate:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "0,rm")
+ (match_operand:QI 2 "nonmemory_operand" 
"c,c")))]
+  "TARGET_APX_NF &&
+  ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)"
+{
+  bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
+  if (operands[2] == const1_rtx
+  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
+  && !use_ndd)
+return "%{nf%} {}\t%0";
+  else
+return use_ndd
+  ? "%{nf%} {}\t{%2, %1, %0|%0, %1, %2}"
+  : "%{nf%} {}\t{%2, %0|%0, %2}"; }
+  [(set_attr "isa" "*,apx_ndd")
+   (set_attr "type" "rotate")
+   (set (attr "length_immediate")
+ (if_then_else
+   (and (match_operand 2 "const1_operand")
+   (ior (match_test "TARGET_SHIFT1")
+(match_test "optimize_function_for_size_p (cfun)")))
+   (const_string "0")
+   (const_string "*")))
+   (set_attr "mode" "")])
+
 (define_insn "*3_1"
   [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m,r")
(any_rotate:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "0,rm") 
diff --git a/gcc/testsuite/gcc.target/i386/apx-nf.c 
b/gcc/testsuite/gcc.target/i386/apx-nf.c
index 608dbf8f5f7..6e59803be64 100644
--- a/gcc/testsuite/gcc.target/i386/apx-nf.c
+++ b/gcc/testsuite/gcc.target/i386/apx-nf.c
@@ -3,6 +3,7 @@
 /* { dg-final { scan-assembler-times "\{nf\} add" 4 } } */
 /* { dg-final { scan-assembler-times "\{nf\} and" 1 } } */
 /* { dg-final { scan-assembler-times "\{nf\} or" 1 } } */
+/* { dg-final { scan-assembler-times "\{nf\} rol" 4 } } */
 
 #include "apx-ndd.c"
 
@@ -13,3 +14,7 @@ foo (struct B *b)
 {
 b->bit0 = b->bit0 | b->bit1;
 }
+long int f1 (int x) { return ~(1ULL << (x & 0x3f)); }
+long int f2 (int x) { return ~(1ULL << x); }
+long int f3 (unsigned char *x) { return ~(1ULL << (x[0] & 0x3f)); }
+long int f4 (unsigned char *x) { return ~(1ULL << x[0]); }
--
2.31.1

[PATCH 7/8] [APX NF] Support APX NF for mul/div

2024-05-15 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*mul3_1_nf): New define_insn.
(*mulqi3_1_nf): Ditto.
(*divmod4_noext_nf): Ditto.
(divmodhiqi3_nf): Ditto.
---
 gcc/config/i386/i386.md | 86 +
 1 file changed, 86 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
f9a62fba0c4..55f65a31b16 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -9907,6 +9907,42 @@
 ;;
 ;; On BDVER1, all HI MULs use DoublePath
 
+(define_insn "*mul3_1_nf"
+  [(set (match_operand:SWIM248 0 "register_operand" "=r,r,r")
+   (mult:SWIM248
+ (match_operand:SWIM248 1 "nonimmediate_operand" "%rm,rm,0")
+ (match_operand:SWIM248 2 "" "K,,r")))]
+  "TARGET_APX_NF &&
+  !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "@
+   %{nf%} imul{}\t{%2, %1, %0|%0, %1, %2}
+   %{nf%} imul{}\t{%2, %1, %0|%0, %1, %2}
+   %{nf%} imul{}\t{%2, %0|%0, %2}"
+  [(set_attr "type" "imul")
+   (set_attr "prefix_0f" "0,0,1")
+   (set (attr "athlon_decode")
+   (cond [(eq_attr "cpu" "athlon")
+ (const_string "vector")
+  (eq_attr "alternative" "1")
+ (const_string "vector")
+  (and (eq_attr "alternative" "2")
+   (ior (match_test "mode == HImode")
+(match_operand 1 "memory_operand")))
+ (const_string "vector")]
+ (const_string "direct")))
+   (set (attr "amdfam10_decode")
+   (cond [(and (eq_attr "alternative" "0,1")
+   (ior (match_test "mode == HImode")
+(match_operand 1 "memory_operand")))
+ (const_string "vector")]
+ (const_string "direct")))
+   (set (attr "bdver1_decode")
+   (if_then_else
+ (match_test "mode == HImode")
+   (const_string "double")
+   (const_string "direct")))
+   (set_attr "mode" "")])
+
 (define_insn "*mul3_1"
   [(set (match_operand:SWIM248 0 "register_operand" "=r,r,r")
(mult:SWIM248
@@ -9978,6 +10014,24 @@
 ;; MUL reg8Direct
 ;; MUL mem8Direct
 
+(define_insn "*mulqi3_1_nf"
+  [(set (match_operand:QI 0 "register_operand" "=a")
+   (mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0")
+(match_operand:QI 2 "nonimmediate_operand" "qm")))]
+  "TARGET_APX_NF &&
+  TARGET_QIMODE_MATH
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
+  "%{nf%} mul{b}\t%2"
+  [(set_attr "type" "imul")
+   (set_attr "length_immediate" "0")
+   (set (attr "athlon_decode")
+ (if_then_else (eq_attr "cpu" "athlon")
+(const_string "vector")
+(const_string "direct")))
+   (set_attr "amdfam10_decode" "direct")
+   (set_attr "bdver1_decode" "direct")
+   (set_attr "mode" "QI")])
+
 (define_insn "*mulqi3_1"
   [(set (match_operand:QI 0 "register_operand" "=a")
(mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0") @@ -11128,6 
+11182,19 @@
   [(set_attr "type" "multi")
(set_attr "mode" "SI")])
 
+(define_insn "*divmod4_noext_nf"
+  [(set (match_operand:SWIM248 0 "register_operand" "=a")
+   (any_div:SWIM248
+ (match_operand:SWIM248 2 "register_operand" "0")
+ (match_operand:SWIM248 3 "nonimmediate_operand" "rm")))
+   (set (match_operand:SWIM248 1 "register_operand" "=d")
+   (:SWIM248 (match_dup 2) (match_dup 3)))
+   (use (match_operand:SWIM248 4 "register_operand" "1"))]
+  "TARGET_APX_NF"
+  "%{nf%} div{}\t%3"
+  [(set_attr "type" "idiv")
+   (set_attr "mode" "")])
+
 (define_insn "*divmod4_noext"
   [(set (match_operand:SWIM248 0 "register_operand" "=a")
(any_div:SWIM248
@@ -11275,6 +11342,25 @@
 ;; Change div/mod to HImode and extend the second argument to HImode  ;; so 
that mode of div/mod matches with mode of arguments.  Otherwise  ;; combine may 
fail.
+(define_insn "divmodhiqi3_nf"
+  [(set (match_operand:HI 0 "register_operand" "=a")
+   (ior:HI
+ (ashift:HI
+   (zero_extend:HI
+ (truncate:QI
+   (mod:HI (match_operand:HI 1 "register_operand" "0")
+   (any_extend:HI
+ (match_operand:QI 2 "nonimmediate_operand" "qm")
+   (const_int 8))
+ (zero_extend:HI
+   (truncate:QI
+ (div:HI (match_dup 1) (any_extend:HI (match_dup 2)))]
+  "TARGET_APX_NF
+  && TARGET_QIMODE_MATH"
+  "%{nf%} div{b}\t%2"
+  [(set_attr "type" "idiv")
+   (set_attr "mode" "QI")])
+
 (define_insn "divmodhiqi3"
   [(set (match_operand:HI 0 "register_operand" "=a")
(ior:HI
--
2.31.1



[PATCH 6/8] [APX NF] Support APX NF for shld/shrd

2024-05-15 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (x86_64_shld_nf): New define_insn.
(x86_64_shld_ndd_nf): Ditto.
(x86_64_shld_1_nf): Ditto.
(x86_64_shld_ndd_1_nf): Ditto.
(*x86_64_shld_shrd_1_nozext_nf): Ditto.
(x86_shld_nf): Ditto.
(x86_shld_ndd_nf): Ditto.
(x86_shld_1_nf): Ditto.
(x86_shld_ndd_1_nf): Ditto.
(*x86_shld_shrd_1_nozext_nf): Ditto.
(3_doubleword_lowpart_nf): Ditto.
(x86_64_shrd_nf): Ditto.
(x86_64_shrd_ndd_nf): Ditto.
(x86_64_shrd_1_nf): Ditto.
(x86_64_shrd_ndd_1_nf): Ditto.
(*x86_64_shrd_shld_1_nozext_nf): Ditto.
(x86_shrd_nf): Ditto.
(x86_shrd_ndd_nf): Ditto.
(x86_shrd_1_nf): Ditto.
(x86_shrd_ndd_1_nf): Ditto.
(*x86_shrd_shld_1_nozext_nf): Ditto.
---
 gcc/config/i386/i386.md | 518 
 1 file changed, 518 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
ff44154b26b..f9a62fba0c4 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -14666,6 +14666,26 @@
   DONE;
 })
 
+(define_insn "x86_64_shld_nf"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
+(ior:DI (ashift:DI (match_dup 0)
+ (and:QI (match_operand:QI 2 "nonmemory_operand" "Jc")
+ (const_int 63)))
+   (subreg:DI
+ (lshiftrt:TI
+   (zero_extend:TI
+ (match_operand:DI 1 "register_operand" "r"))
+   (minus:QI (const_int 64)
+ (and:QI (match_dup 2) (const_int 63 0)))]
+  "TARGET_APX_NF"
+  "%{nf%} shld{q}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ishift")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "DI")
+   (set_attr "athlon_decode" "vector")
+   (set_attr "amdfam10_decode" "vector")
+   (set_attr "bdver1_decode" "vector")])
+
 (define_insn "x86_64_shld"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
 (ior:DI (ashift:DI (match_dup 0) @@ -14687,6 +14707,22 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
 
+(define_insn "x86_64_shld_ndd_nf"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+(ior:DI (ashift:DI (match_operand:DI 1 "nonimmediate_operand" "rm")
+ (and:QI (match_operand:QI 3 "nonmemory_operand" "Jc")
+ (const_int 63)))
+   (subreg:DI
+ (lshiftrt:TI
+   (zero_extend:TI
+ (match_operand:DI 2 "register_operand" "r"))
+   (minus:QI (const_int 64)
+ (and:QI (match_dup 3) (const_int 63 0)))]
+  "TARGET_APX_NDD && TARGET_APX_NF"
+  "%{nf%} shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ishift")
+   (set_attr "mode" "DI")])
+
 (define_insn "x86_64_shld_ndd"
   [(set (match_operand:DI 0 "register_operand" "=r")
 (ior:DI (ashift:DI (match_operand:DI 1 "nonimmediate_operand" "rm") @@ 
-14704,6 +14740,43 @@
   [(set_attr "type" "ishift")
(set_attr "mode" "DI")])
 
+(define_insn "x86_64_shld_1_nf"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
+(ior:DI (ashift:DI (match_dup 0)
+  (match_operand:QI 2 "const_0_to_63_operand"))
+   (subreg:DI
+ (lshiftrt:TI
+   (zero_extend:TI
+ (match_operand:DI 1 "register_operand" "r"))
+   (match_operand:QI 3 "const_0_to_255_operand")) 0)))]
+  "TARGET_64BIT
+   && INTVAL (operands[3]) == 64 - INTVAL (operands[2])
+   && TARGET_APX_NF"
+  "%{nf%} shld{q}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ishift")
+   (set_attr "prefix_0f" "1")
+   (set_attr "mode" "DI")
+   (set_attr "length_immediate" "1")
+   (set_attr "athlon_decode" "vector")
+   (set_attr "amdfam10_decode" "vector")
+   (set_attr "bdver1_decode" "vector")])
+
+(define_insn "x86_64_shld_ndd_1_nf"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+(ior:DI (ashift:DI (match_operand:DI 1 "nonimmediate_operand" "rm")
+  (match_operand:QI 3 "const_0_to_63_operand"))
+   (subreg:DI
+ (lshiftrt:TI
+   (zero_extend:TI
+ (match_operand:DI 2 "register_operand" "r"))
+   (match_operand:QI 4 "const_0_to_255_operand")) 0)))]
+  "TARGET_APX_NDD && TARGET_APX_NF
+   && INTVAL (operands[4]) == 64 - INTVAL (operands[3])"
+  "%{nf%} shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ishift")
+   (set_attr "mode" "DI")
+   (set_attr "length_immediate" "1")])
+
 (define_insn "x86_64_shld_1"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
 (ior:DI (ashift:DI (match_dup 0) @@ -14742,6 +14815,58 @@
(set_attr "mode" "DI")
(set_attr "length_immediate" "1")])
 
+(define_insn_and_split "*x86_64_shld_shrd_1_nozext_nf"
+

[PATCH 8/8] [APX NF] Support APX NF for lzcnt/tzcnt/popcnt

2024-05-15 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (clz2_lzcnt_nf): New define_insn.
(*clz2_lzcnt_falsedep_nf): Ditto.
(__nf): Ditto.
(*__falsedep_nf): Ditto.
(_hi_nf): Ditto.
(popcount2_nf): Ditto.
(*popcount2_falsedep_nf): Ditto.
(popcounthi2_nf): Ditto.
---
 gcc/config/i386/i386.md | 132 
 1 file changed, 132 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
55f65a31b16..ddde83e57f5 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -21029,6 +21029,24 @@
   operands[3] = gen_reg_rtx (mode);
 })
 
+(define_insn_and_split "clz2_lzcnt_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (clz:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))]
+  "TARGET_APX_NF && TARGET_LZCNT"
+  "%{nf%} lzcnt{}\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+[(set (match_dup 0)
+ (clz:SWI48 (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "clz2_lzcnt"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(clz:SWI48
@@ -21052,6 +21070,18 @@
 ; False dependency happens when destination is only updated by tzcnt,  ; lzcnt 
or popcnt.  There is no false dependency when destination is  ; also used in 
source.
+(define_insn "*clz2_lzcnt_falsedep_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (clz:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+  UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_APX_NF && TARGET_LZCNT"
+  "%{nf%} lzcnt{}\t{%1, %0|%0, %1}"
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "")])
+
 (define_insn "*clz2_lzcnt_falsedep"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(clz:SWI48
@@ -21158,6 +21188,25 @@
 ;; Version of lzcnt/tzcnt that is expanded from intrinsics.  This version  ;; 
provides operand size as output when source operand is zero. 
 
+(define_insn_and_split "__nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (unspec:SWI48
+ [(match_operand:SWI48 1 "nonimmediate_operand" "rm")] LT_ZCNT))]
+  "TARGET_APX_NF"
+  "%{nf%} {}\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+[(set (match_dup 0)
+ (unspec:SWI48 [(match_dup 1)] LT_ZCNT))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "type" "")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "_"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(unspec:SWI48
@@ -21182,6 +21231,20 @@
 ; False dependency happens when destination is only updated by tzcnt,  ; lzcnt 
or popcnt.  There is no false dependency when destination is  ; also used in 
source.
+; also used in source.
+(define_insn "*__falsedep_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (unspec:SWI48
+ [(match_operand:SWI48 1 "nonimmediate_operand" "rm")] LT_ZCNT))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+  UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_APX_NF"
+  "%{nf%} {}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "")])
+
 (define_insn "*__falsedep"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(unspec:SWI48
@@ -21196,6 +21259,17 @@
(set_attr "prefix_rep" "1")
(set_attr "mode" "")])
 
+(define_insn "_hi_nf"
+  [(set (match_operand:HI 0 "register_operand" "=r")
+   (unspec:HI
+ [(match_operand:HI 1 "nonimmediate_operand" "rm")] LT_ZCNT))]
+  "TARGET_APX_NF"
+  "%{nf%} {w}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "HI")])
+
 (define_insn "_hi"
   [(set (match_operand:HI 0 "register_operand" "=r")
(unspec:HI
@@ -21620,6 +21694,30 @@
   [(set_attr "type" "bitmanip")
(set_attr "mode" "")])
 
+(define_insn_and_split "popcount2_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (popcount:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))]
+  "TARGET_APX_NF && TARGET_POPCNT"
+{
+#if TARGET_MACHO
+  return "%{nf%} popcnt\t{%1, %0|%0, %1}";
+#else
+  return "%{nf%} popcnt{}\t{%1, %0|%0, %1}";
+#endif
+}
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned

RE: [PATCH 1/8] [APX NF]: Support APX NF add

2024-05-15 Thread Kong, Lingling
> -Original Message-
> From: Uros Bizjak 
> Sent: Wednesday, May 15, 2024 4:15 PM
> To: Kong, Lingling 
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; Wang,
> Hongyu 
> Subject: Re: [PATCH 1/8] [APX NF]: Support APX NF add
> 
> On Wed, May 15, 2024 at 9:43 AM Kong, Lingling 
> wrote:
> >
> > From: Hongyu Wang 
> >
> > APX NF(no flags) feature implements suppresses the update of status flags 
> > for
> arithmetic operations.
> >
> > For NF add, it is not clear whether NF add can be faster than lea. If so, 
> > the
> pattern needs to be adjusted to prefer LEA generation.
> 
> > diff --git a/gcc/testsuite/gcc.target/i386/apx-ndd.c
> > b/gcc/testsuite/gcc.target/i386/apx-ndd.c
> > index 0eb751ad225..0ff4df0780c 100644
> > --- a/gcc/testsuite/gcc.target/i386/apx-ndd.c
> > +++ b/gcc/testsuite/gcc.target/i386/apx-ndd.c
> > @@ -1,5 +1,5 @@
> >  /* { dg-do compile { target { ! ia32 } } } */
> > -/* { dg-options "-mapxf -march=x86-64 -O2" } */
> > +/* { dg-options "-mapx-features=egpr,push2pop2,ndd,ppx -march=x86-64
> > +-O2" } */
> 
> Please do not split options to a separate line; here and in other places.
> 
> Uros.

Sorry, my git send-email adjusted some formatting incorrectly; I have added the patches as attachments instead.

Thanks, 
Lingling



0004-APX-NF-Support-APX-NF-for-right-shift-insns.patch
Description: 0004-APX-NF-Support-APX-NF-for-right-shift-insns.patch


0005-APX-NF-Support-APX-NF-for-rotate-insns.patch
Description: 0005-APX-NF-Support-APX-NF-for-rotate-insns.patch


0006-APX-NF-Support-APX-NF-for-shld-shrd.patch
Description: 0006-APX-NF-Support-APX-NF-for-shld-shrd.patch


0007-APX-NF-Support-APX-NF-for-mul-div.patch
Description: 0007-APX-NF-Support-APX-NF-for-mul-div.patch


0008-APX-NF-Support-APX-NF-for-lzcnt-tzcnt-popcnt.patch
Description: 0008-APX-NF-Support-APX-NF-for-lzcnt-tzcnt-popcnt.patch


0001-APX-NF-Support-APX-NF-add.patch
Description: 0001-APX-NF-Support-APX-NF-add.patch


0002-APX-NF-Support-APX-NF-for-sub-and-or-xor-neg.patch
Description: 0002-APX-NF-Support-APX-NF-for-sub-and-or-xor-neg.patch


0003-APX-NF-Support-APX-NF-for-left-shift-insns.patch
Description: 0003-APX-NF-Support-APX-NF-for-left-shift-insns.patch


[PATCH v2 1/8] [APX NF]: Support APX NF add

2024-05-22 Thread Kong, Lingling
> I wonder if we can use "define_subst" to conditionally add flags clobber
> for !TARGET_APX_NF targets. Even the example for "Define Subst" uses the insn
> w/ and w/o the clobber, so I think it is worth considering this approach.
> 
> Uros.

Good suggestion. I defined a new subst for no flags, and bootstrapped and
regtested on x86_64-linux-gnu. SPEC 2017 also runs normally on the Intel
software development emulator.
Ok for trunk?

Thanks,
Lingling

Subject: [PATCH v2 1/8] [APX NF]: Support APX NF add
The APX NF (no flags) feature suppresses the update of status flags
for arithmetic operations.

For NF add, it is not clear whether NF add can be faster than lea. If so,
the pattern needs to be adjusted to prefer lea generation.

gcc/ChangeLog:

* config/i386/i386-opts.h (enum apx_features): Add nf
enumeration.
* config/i386/i386.h (TARGET_APX_NF): New.
* config/i386/i386.md (nf_subst): New define_subst.
(nf_name): New subst_attr.
(nf_prefix): Ditto.
(nf_condition): Ditto.
(nf_mem_constraint): Ditto.
(nf_applied): Ditto.
(*add_1_nf): New define_insn.
(addhi_1_nf): Ditto.
(addqi_1_nf): Ditto.
* config/i386/i386.opt: Add apx_nf enumeration.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-ndd.c: Fixed test.
* gcc.target/i386/apx-nf.c: New test.

Co-authored-by: Lingling Kong 
---
 gcc/config/i386/i386-opts.h |   3 +-
 gcc/config/i386/i386.h  |   1 +
 gcc/config/i386/i386.md | 179 +++-
 gcc/config/i386/i386.opt|   3 +
 gcc/testsuite/gcc.target/i386/apx-ndd.c |   2 +-
 gcc/testsuite/gcc.target/i386/apx-nf.c  |   6 +
 6 files changed, 126 insertions(+), 68 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-nf.c

diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h
index ef2825803b3..60176ce609f 100644
--- a/gcc/config/i386/i386-opts.h
+++ b/gcc/config/i386/i386-opts.h
@@ -140,7 +140,8 @@ enum apx_features {
   apx_push2pop2 = 1 << 1,
   apx_ndd = 1 << 2,
   apx_ppx = 1 << 3,
-  apx_all = apx_egpr | apx_push2pop2 | apx_ndd | apx_ppx,
+  apx_nf = 1<< 4,
+  apx_all = apx_egpr | apx_push2pop2 | apx_ndd | apx_ppx | apx_nf,
 };
 
 #endif
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 529edff93a4..f20ae4726da 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -55,6 +55,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see
 #define TARGET_APX_PUSH2POP2 (ix86_apx_features & apx_push2pop2)
 #define TARGET_APX_NDD (ix86_apx_features & apx_ndd)
 #define TARGET_APX_PPX (ix86_apx_features & apx_ppx)
+#define TARGET_APX_NF (ix86_apx_features & apx_nf)
 
 #include "config/vxworks-dummy.h"
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 764bfe20ff2..bae344518bd 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -6233,28 +6233,6 @@
 }
 })
 

-;; Load effective address instructions
-
-(define_insn "*lea"
-  [(set (match_operand:SWI48 0 "register_operand" "=r")
-   (match_operand:SWI48 1 "address_no_seg_operand" "Ts"))]
-  "ix86_hardreg_mov_ok (operands[0], operands[1])"
-{
-  if (SImode_address_operand (operands[1], VOIDmode))
-{
-  gcc_assert (TARGET_64BIT);
-  return "lea{l}\t{%E1, %k0|%k0, %E1}";
-}
-  else
-return "lea{}\t{%E1, %0|%0, %E1}";
-}
-  [(set_attr "type" "lea")
-   (set (attr "mode")
- (if_then_else
-   (match_operand 1 "SImode_address_operand")
-   (const_string "SI")
-   (const_string "")))])
-
 (define_peephole2
   [(set (match_operand:SWI48 0 "register_operand")
(match_operand:SWI48 1 "address_no_seg_operand"))]
@@ -6290,6 +6268,13 @@
   [(parallel [(set (match_dup 0) (ashift:SWI48 (match_dup 0) (match_dup 1)))
   (clobber (reg:CC FLAGS_REG))])]
   "operands[1] = GEN_INT (exact_log2 (INTVAL (operands[1])));")
+
+(define_split
+  [(set (match_operand:SWI48 0 "general_reg_operand")
+   (mult:SWI48 (match_dup 0) (match_operand:SWI48 1 "const1248_operand")))]
+  "TARGET_APX_NF && reload_completed"
+  [(set (match_dup 0) (ashift:SWI48 (match_dup 0) (match_dup 1)))]
+  "operands[1] = GEN_INT (exact_log2 (INTVAL (operands[1])));")
 

 ;; Add instructions
 
@@ -6437,48 +6422,65 @@
  (clobber (reg:CC FLAGS_REG))])]
  "split_double_mode (mode, &operands[0], 1, &operands[0], &operands[5]);")
 
-(define_insn "*add_1"
-  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,r,r,r,r")
+(define_subst_attr "nf_name" "nf_subst" "_nf" "")
+(define_subst_attr "nf_prefix" "nf_subst" "%{nf%} " "")
+(define_subst_attr "nf_condition" "nf_subst" "TARGET_APX_NF" "true")
+(define_subst_attr "nf_mem_constraint" "nf_subst" "je" "m")
+(define_subst_attr "nf_applied" "nf_subst" "true" "false")
+
+(define_subst "nf_subst"
+  [(set (match_operand:SWI 0)
+(match_operand:SWI 1))]
+  ""
+  [(set (match_dup 0)
+ 

[PATCH v2 2/8] [APX NF] Support APX NF for {sub/and/or/xor/neg}

2024-05-22 Thread Kong, Lingling
gcc/ChangeLog:

   * config/i386/i386.md (nf_and_applied): New subst_attr.
   (nf_x64_and_applied): Ditto.
   (*sub_1_nf): New define_insn.
   (*anddi_1_nf): Ditto.
   (*and_1_nf): Ditto.
   (*qi_1_nf): Ditto.
   (*_1_nf): Ditto.
   (*neg_1_nf): Ditto.
   * config/i386/sse.md : New define_split.

gcc/testsuite/ChangeLog:

   * gcc.target/i386/apx-nf.c: Add test.
---
gcc/config/i386/i386.md| 174 +
gcc/config/i386/sse.md |  11 ++
gcc/testsuite/gcc.target/i386/apx-nf.c |   9 ++
3 files changed, 112 insertions(+), 82 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index bae344518bd..099d7f35c8f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -575,7 +575,7 @@
 
noavx512dq,fma_or_avx512vl,avx512vl,noavx512vl,avxvnni,
 
avx512vnnivl,avx512fp16,avxifma,avx512ifmavl,avxneconvert,
 
avx512bf16vl,vpclmulqdqvl,avx_noavx512f,avx_noavx512vl,
- vaes_avx512vl"
+vaes_avx512vl,noapx_nf"
   (const_string "base"))
 ;; The (bounding maximum) length of an instruction immediate.
@@ -981,6 +981,7 @@
 (symbol_ref "TARGET_MMX_WITH_SSE && !TARGET_AVX")
   (eq_attr "mmx_isa" "avx")
 (symbol_ref "TARGET_MMX_WITH_SSE && TARGET_AVX")
+ (eq_attr "isa" "noapx_nf") (symbol_ref "!TARGET_APX_NF")
  ]
  (const_int 1)))
@@ -7893,20 +7894,21 @@
   "split_double_mode (mode, &operands[0], 2, &operands[0], &operands[3]);"
[(set_attr "isa" "*,*,apx_ndd,apx_ndd")])
-(define_insn "*sub_1"
-  [(set (match_operand:SWI 0 "nonimmediate_operand" "=m,,r,r,r")
+(define_insn "*sub_1"
+  [(set (match_operand:SWI 0 "nonimmediate_operand" 
"=m,r,,r,r,r")
  (minus:SWI
-(match_operand:SWI 1 "nonimmediate_operand" "0,0,rm,rjM,r")
-(match_operand:SWI 2 "" 
",,r,,")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (MINUS, mode, operands, TARGET_APX_NDD)"
+   (match_operand:SWI 1 "nonimmediate_operand" "0,0,0,rm,rjM,r")
+   (match_operand:SWI 2 "" 
",,,r,,")))]
+  "ix86_binary_operator_ok (MINUS, mode, operands, TARGET_APX_NDD)
+  && "
   "@
-  sub{}\t{%2, %0|%0, %2}
-  sub{}\t{%2, %0|%0, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd")
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*,*,*,apx_ndd,apx_ndd,apx_ndd")
(set_attr "type" "alu")
(set_attr "mode" "")])
@@ -11795,27 +11797,31 @@
}
[(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd,apx_ndd_64,apx_ndd")])
-(define_insn "*anddi_1"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,rm,r,r,r,r,r,?k")
+(define_subst_attr "nf_and_applied" "nf_subst"  "noapx_nf" "*")
+(define_subst_attr "nf_x64_and_applied" "nf_subst" "noapx_nf" "x64")
+
+(define_insn "*anddi_1"
+  [(set (match_operand:DI 0 "nonimmediate_operand" 
"=r,r,rm,r,r,r,r,r,r,?k")
  (and:DI
-  (match_operand:DI 1 "nonimmediate_operand" 
"%0,r,0,0,rm,rjM,r,qm,k")
-  (match_operand:DI 2 "x86_64_szext_general_operand" 
"Z,Z,re,m,r,e,m,L,k")))
-   (clobber (reg:CC FLAGS_REG))]
+ (match_operand:DI 1 "nonimmediate_operand" 
"%0,r,0,0,0,rm,rjM,r,qm,k")
+ (match_operand:DI 2 "x86_64_szext_general_operand" 
"Z,Z,r,e,m,r,e,m,L,k")))]
   "TARGET_64BIT
-   && ix86_binary_operator_ok (AND, DImode, operands, TARGET_APX_NDD)"
+   && ix86_binary_operator_ok (AND, DImode, operands, TARGET_APX_NDD)
+   && "
   "@
-   and{l}\t{%k2, %k0|%k0, %k2}
-   and{l}\t{%k2, %k1, %k0|%k0, %k1, %k2}
-   and{q}\t{%2, %0|%0, %2}
-   and{q}\t{%2, %0|%0, %2}
-   and{q}\t{%2, %1, %0|%0, %1, %2}
-   and{q}\t{%2, %1, %0|%0, %1, %2}
-   and{q}\t{%2, %1, %0|%0, %1, %2}
+   and{l}\t{%k2, %k0|%k0, %k2}
+   and{l}\t{%k2, %k1, %k0|%k0, %k1, %k2}
+   and{q}\t{%2, %0|%0, %2}
+   and{q}\t{%2, %0|%0, %2}
+   and{q}\t{%2, %0|%0, %2}
+   and{q}\t{%2, %1, %0|%0, %1, %2}
+   and{q}\t{%2, %1, %0|%0, %1, %2}
+   and{q}\t{%2, %1, %0|%0, %1, %2}
#
#"
-  [(set_attr "isa" "x64,apx_ndd,x64,x64,apx_ndd,apx_ndd,apx_ndd,x64,avx512bw")
-   (set_attr "type" "alu,alu,alu,alu,alu,alu,alu,imovx,msklog")
-   (set_attr "length_immediate" "*,*,*,*,*,*,*,0,*")
+  [(set_attr "isa" 
"x64,apx_ndd,x64,x64,x64,apx_ndd,apx_ndd,apx_ndd,,avx512bw")
+   (set_attr "type" "alu,alu,alu,alu,alu,alu,alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,*,*,*,*,*,*,0,*")
(set (attr "prefix_rex")
  (if_then_else
(and (eq_attr "type" "imovx")
@@ -11

[PATCH v2 3/8] [APX NF] Support APX NF for left shift insns

2024-05-22 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*ashl3_1_nf): New.
(*ashlhi3_1_nf): Ditto.
(*ashlqi3_1_nf): Ditto.
* config/i386/sse.md: New define_split.
---
 gcc/config/i386/i386.md | 80 +++--
 gcc/config/i386/sse.md  | 13 +++
 2 files changed, 67 insertions(+), 26 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
099d7f35c8f..271d449d7c4 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -15012,12 +15012,12 @@
   [(set_attr "type" "ishiftx")
(set_attr "mode" "")])
 
-(define_insn "*ashl3_1"
+(define_insn "*ashl3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,?k,r")
(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" 
"0,l,rm,k,rm")
- (match_operand:QI 2 "nonmemory_operand" 
"c,M,r,,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFT, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" 
"c,M,r,,c")))]
+  "ix86_binary_operator_ok (ASHIFT, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
@@ -15030,7 +15030,7 @@
 case TYPE_ALU:
   gcc_assert (operands[2] == const1_rtx);
   gcc_assert (rtx_equal_p (operands[0], operands[1]));
-  return "add{}\t%0, %0";
+  return "add{}\t%0, %0";
 
 default:
   if (operands[2] == const1_rtx
@@ -15038,11 +15038,11 @@
  /* For NDD form instructions related to TARGET_SHIFT1, the $1
 immediate do not need to be omitted as assembler will map it
 to use shorter encoding. */
- && !use_ndd)
+ && !use_ndd && !)
return "sal{}\t%0";
   else
-   return use_ndd ? "sal{}\t{%2, %1, %0|%0, %1, %2}"
-  : "sal{}\t{%2, %0|%0, %2}";
+   return use_ndd ? "sal{}\t{%2, %1, %0|%0, %1, 
%2}"
+  : "sal{}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,*,bmi2,avx512bw,apx_ndd") @@ -15073,6 +15073,17 @@
(set_attr "mode" "")])
 
 ;; Convert shift to the shiftx pattern to avoid flags dependency.
+;; For NF/NDD doesn't support shift count as r, it just support c, 
+;; but it has no flag.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+   (ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+ (match_operand:QI 2 "register_operand")))]
+  "TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+   (ashift:SWI48 (match_dup 1) (match_dup 2)))]
+  "operands[2] = gen_lowpart (mode, operands[2]);")
+
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand") @@ 
-15159,12 +15170,12 @@
(zero_extend:DI (ashift:SI (match_dup 1) (match_dup 2]
   "operands[2] = gen_lowpart (SImode, operands[2]);")
 
-(define_insn "*ashlhi3_1"
+(define_insn "*ashlhi3_1"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,Yp,?k,r")
(ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0,l,k,rm")
-  (match_operand:QI 2 "nonmemory_operand" "cI,M,Ww,cI")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFT, HImode, operands, TARGET_APX_NDD)"
+  (match_operand:QI 2 "nonmemory_operand" "cI,M,Ww,cI")))]
+  "ix86_binary_operator_ok (ASHIFT, HImode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
@@ -15175,16 +15186,16 @@
 
 case TYPE_ALU:
   gcc_assert (operands[2] == const1_rtx);
-  return "add{w}\t%0, %0";
+  return "add{w}\t%0, %0";
 
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
- && !use_ndd)
+ && !use_ndd && !)
return "sal{w}\t%0";
   else
-   return use_ndd ? "sal{w}\t{%2, %1, %0|%0, %1, %2}"
-  : "sal{w}\t{%2, %0|%0, %2}";
+   return use_ndd ? "sal{w}\t{%2, %1, %0|%0, %1, %2}"
+  : "sal{w}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,*,avx512f,apx_ndd") @@ -15212,12 +15223,12 @@
(const_string "*")))
(set_attr "mode" "HI,SI,HI,HI")])
 
-(define_insn "*ashlqi3_1"
+(define_insn "*ashlqi3_1"
   [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,Yp,?k,r")
(ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,l,k,rm")
-  (match_operand:QI 2 "nonmemory_operand" "cI,cI,M,Wb,cI")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFT, QImode, operands, TARGET_APX_NDD)"
+  (match_operand:QI 2 "nonmemory_operand" "cI,cI,M,Wb,cI")))]
+  "ix86_binary_operator_ok (ASHIFT, QImode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
@@ -15229,14 +15240,14 @@
 case TYPE_ALU:
   gcc_as

[PATCH v2 4/8] [APX NF] Support APX NF for right shift insns

2024-05-22 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*ashr3_1_nf): New.
(*lshr3_1_nf): Ditto.
(*lshrqi3_1_nf): Ditto.
(*lshrhi3_1_nf): Ditto.
---
 gcc/config/i386/i386.md | 82 +++--
 1 file changed, 46 insertions(+), 36 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
271d449d7c4..7f191749342 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16308,13 +16308,13 @@
   [(set_attr "type" "ishiftx")
(set_attr "mode" "")])
 
-(define_insn "*ashr3_1"
+(define_insn "*ashr3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
(ashiftrt:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,rm")
- (match_operand:QI 2 "nonmemory_operand" "c,r,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c,r,c")))]
+  "ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
@@ -16325,11 +16325,11 @@
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
- && !use_ndd)
+ && !use_ndd && !)
return "sar{}\t%0";
   else
-   return use_ndd ? "sar{}\t{%2, %1, %0|%0, %1, %2}"
-  : "sar{}\t{%2, %0|%0, %2}";
+   return use_ndd ? "sar{}\t{%2, %1, %0|%0, %1, 
%2}"
+  : "sar{}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,bmi2,apx_ndd")
@@ -16369,14 +16369,13 @@
 }
 [(set_attr "isa" "*,*,*,apx_ndd")])
 
-
-(define_insn "*lshr3_1"
+(define_insn "*lshr3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,?k,r")
(lshiftrt:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,k,rm")
- (match_operand:QI 2 "nonmemory_operand" "c,r,,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (LSHIFTRT, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c,r,,c")))]
+  "ix86_binary_operator_ok (LSHIFTRT, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
@@ -16388,11 +16387,11 @@
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
- && !use_ndd)
+ && !use_ndd && !)
return "shr{}\t%0";
   else
-   return use_ndd ? "shr{}\t{%2, %1, %0|%0, %1, %2}"
-  : "shr{}\t{%2, %0|%0, %2}";
+   return use_ndd ? "shr{}\t{%2, %1, %0|%0, %1, 
%2}"
+  : "shr{}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,bmi2,avx512bw,apx_ndd") @@ -16408,6 +16407,17 @@
(set_attr "mode" "")])
 
 ;; Convert shift to the shiftx pattern to avoid flags dependency.
+;; For NF/NDD doesn't support shift count as r, it just support c, 
+;; but it has no flag.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+   (any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+  (match_operand:QI 2 "register_operand")))]
+  "TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+   (any_shiftrt:SWI48 (match_dup 1) (match_dup 2)))]
+  "operands[2] = gen_lowpart (mode, operands[2]);")
+
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
(any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand") @@ 
-16476,22 +16486,22 @@
(zero_extend:DI (any_shiftrt:SI (match_dup 1) (match_dup 2]
   "operands[2] = gen_lowpart (SImode, operands[2]);")
 
-(define_insn "*ashr3_1"
+(define_insn "*ashr3_1"
   [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m, r")
(ashiftrt:SWI12
  (match_operand:SWI12 1 "nonimmediate_operand" "0, rm")
- (match_operand:QI 2 "nonmemory_operand" "c, c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c, c")))]
+  "ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   if (operands[2] == const1_rtx
   && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
-  && !use_ndd)
+  && !use_ndd && !)
 return "sar{}\t%0";
   else
-return use_ndd ? "sar{}\t{%2, %1, %0|%0, %1, %2}"
-  : "sar{}\t{%2, %0|%0, %2}";
+return use_ndd ? "sar{}\t{%2, %1, %0|%0, %1, %2}"
+  : "sar{}\t{%2, %0|%0, %2}";
 }
   [(set_attr "isa" "*, apx_ndd")
(set_attr "type" "ishift")
@@ -16504,13 +16514,13 @@
(const_string "*")))
(set_attr "mode" "")])
 
-(define_insn "*lshrqi3_1"
+(define_insn "*lshrqi3_1"
   [(set (match_operand:QI 0 "nonimmediate_operand"  "=qm,?k,r")
(lshiftrt:QI
  (match_operand

[PATCH v2 5/8] [APX NF] Support APX NF for rotate insns

2024-05-22 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (ashr3_cvt_nf): New define_insn.
(*3_1_nf): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-nf.c: Add NF test for rotate insns.
---
 gcc/config/i386/i386.md| 53 --
 gcc/testsuite/gcc.target/i386/apx-nf.c |  5 +++
 2 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
7f191749342..731eb12d13a 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16230,19 +16230,19 @@
 (define_mode_attr cvt_mnemonic
   [(SI "{cltd|cdq}") (DI "{cqto|cqo}")])
 
-(define_insn "ashr3_cvt"
+(define_insn "ashr3_cvt"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=*d,rm,r")
(ashiftrt:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "*a,0,rm")
- (match_operand:QI 2 "const_int_operand")))
-   (clobber (reg:CC FLAGS_REG))]
+ (match_operand:QI 2 "const_int_operand")))]
   "INTVAL (operands[2]) == GET_MODE_BITSIZE (mode)-1
&& (TARGET_USE_CLTD || optimize_function_for_size_p (cfun))
-   && ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+   && ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)
+   && "
   "@

-   sar{}\t{%2, %0|%0, %2}
-   sar{}\t{%2, %1, %0|%0, %1, %2}"
+   sar{}\t{%2, %0|%0, %2}
+   sar{}\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "isa" "*,*,apx_ndd")
(set_attr "type" "imovx,ishift,ishift")
(set_attr "prefix_0f" "0,*,*")
@@ -17094,13 +17094,13 @@
   [(set_attr "type" "rotatex")
(set_attr "mode" "")])
 
-(define_insn "*3_1"
+(define_insn "*3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
(any_rotate:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,rm")
- (match_operand:QI 2 "nonmemory_operand" "c,,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c,,c")))]
+  "ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
@@ -17111,11 +17111,11 @@
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
- && !use_ndd)
+ && !use_ndd && !)
return "{}\t%0";
   else
-   return use_ndd ? "{}\t{%2, %1, %0|%0, %1, %2}"
-  : "{}\t{%2, %0|%0, %2}";
+   return use_ndd ? "{}\t{%2, %1, %0|%0, 
%1, %2}"
+  : "{}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,bmi2,apx_ndd")
@@ -17135,6 +17135,19 @@
(set_attr "mode" "")])
 
 ;; Convert rotate to the rotatex pattern to avoid flags dependency.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+   (rotate:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+ (match_operand:QI 2 "const_int_operand")))]
+  "TARGET_BMI2 && reload_completed && !optimize_function_for_size_p (cfun)"
+  [(set (match_dup 0)
+   (rotatert:SWI48 (match_dup 1) (match_dup 2)))] {
+  int bitsize = GET_MODE_BITSIZE (mode);
+
+  operands[2] = GEN_INT ((bitsize - INTVAL (operands[2])) % bitsize);
+})
+
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
(rotate:SWI48 (match_operand:SWI48 1 "nonimmediate_operand") @@ 
-17236,22 +17249,22 @@
   [(set (match_dup 0)
(zero_extend:DI (rotatert:SI (match_dup 1) (match_dup 2])
 
-(define_insn "*3_1"
+(define_insn "*3_1"
   [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m,r")
(any_rotate:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "0,rm")
- (match_operand:QI 2 "nonmemory_operand" "c,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" 
"c,c")))]
+  "ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   if (operands[2] == const1_rtx
   && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
-  && !use_ndd)
+  && !use_ndd && !)
 return "{}\t%0";
   else
 return use_ndd
-  ? "{}\t{%2, %1, %0|%0, %1, %2}"
-  : "{}\t{%2, %0|%0, %2}";
+  ? "{}\t{%2, %1, %0|%0, %1, %2}"
+  : "{}\t{%2, %0|%0, %2}";
 }
   [(set_attr "isa" "*,apx_ndd")
(set_attr "type" "rotate")
diff --git a/gcc/testsuite/gcc.target/i386/apx-nf.c 
b/gcc/testsuite/gcc.target/i386/apx-nf.c
index 608dbf8f5f7..6e59803be64 100644
--- a/gcc/testsuite/gcc.target/i386/apx-nf.c
+++ b/gcc/testsuite/gcc.target/i386/apx-nf.c
@@ -3,6 +3,7 @@
 /* { dg-final { scan-assembler-times "\{nf\} add" 4 } } */
 /* { dg-final { scan-assembler-times "\{nf\} and" 1 } } */
 /* { dg-final { scan-assembler-times "\{nf\} or" 1 } } */
+/* { dg-final { scan-assembler-times "\{nf\} rol" 4 } } */
 
 #

[PATCH v2 6/8] [APX NF] Support APX NF for shld/shrd

2024-05-22 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (x86_64_shld_nf): New define_insn.
(x86_64_shld_ndd_nf): Ditto.
(x86_64_shld_1_nf): Ditto.
(x86_64_shld_ndd_1_nf): Ditto.
(*x86_64_shld_shrd_1_nozext_nf): Ditto.
(x86_shld_nf): Ditto.
(x86_shld_ndd_nf): Ditto.
(x86_shld_1_nf): Ditto.
(x86_shld_ndd_1_nf): Ditto.
(*x86_shld_shrd_1_nozext_nf): Ditto.
(3_doubleword_lowpart_nf): Ditto.
(x86_64_shrd_nf): Ditto.
(x86_64_shrd_ndd_nf): Ditto.
(x86_64_shrd_1_nf): Ditto.
(x86_64_shrd_ndd_1_nf): Ditto.
(*x86_64_shrd_shld_1_nozext_nf): Ditto.
(x86_shrd_nf): Ditto.
(x86_shrd_ndd_nf): Ditto.
(x86_shrd_1_nf): Ditto.
(x86_shrd_ndd_1_nf): Ditto.
(*x86_shrd_shld_1_nozext_nf): Ditto.
---
 gcc/config/i386/i386.md | 377 +++-
 1 file changed, 296 insertions(+), 81 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
731eb12d13a..4d684e8d919 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -14552,7 +14552,7 @@
   DONE;
 })
 
-(define_insn "x86_64_shld"
+(define_insn "x86_64_shld"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
 (ior:DI (ashift:DI (match_dup 0)
  (and:QI (match_operand:QI 2 "nonmemory_operand" "Jc") @@ 
-14562,10 +14562,9 @@
(zero_extend:TI
  (match_operand:DI 1 "register_operand" "r"))
(minus:QI (const_int 64)
- (and:QI (match_dup 2) (const_int 63 0)))
-   (clobber (reg:CC FLAGS_REG))]
-  "TARGET_64BIT"
-  "shld{q}\t{%2, %1, %0|%0, %1, %2}"
+ (and:QI (match_dup 2) (const_int 63 0)))]
+  "TARGET_64BIT && "
+  "shld{q}\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "ishift")
(set_attr "prefix_0f" "1")
(set_attr "mode" "DI")
@@ -14573,7 +14572,7 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
 
-(define_insn "x86_64_shld_ndd"
+(define_insn "x86_64_shld_ndd"
   [(set (match_operand:DI 0 "register_operand" "=r")
 (ior:DI (ashift:DI (match_operand:DI 1 "nonimmediate_operand" "rm")
  (and:QI (match_operand:QI 3 "nonmemory_operand" "Jc") @@ 
-14583,14 +14582,13 @@
(zero_extend:TI
  (match_operand:DI 2 "register_operand" "r"))
(minus:QI (const_int 64)
- (and:QI (match_dup 3) (const_int 63 0)))
-   (clobber (reg:CC FLAGS_REG))]
-  "TARGET_APX_NDD"
-  "shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ (and:QI (match_dup 3) (const_int 63 0)))]
+  "TARGET_APX_NDD && "
+  "shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ishift")
(set_attr "mode" "DI")])
 
-(define_insn "x86_64_shld_1"
+(define_insn "x86_64_shld_1"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
 (ior:DI (ashift:DI (match_dup 0)
   (match_operand:QI 2 "const_0_to_63_operand")) @@ 
-14598,11 +14596,11 @@
  (lshiftrt:TI
(zero_extend:TI
  (match_operand:DI 1 "register_operand" "r"))
-   (match_operand:QI 3 "const_0_to_255_operand")) 0)))
-   (clobber (reg:CC FLAGS_REG))]
+   (match_operand:QI 3 "const_0_to_255_operand")) 0)))]
   "TARGET_64BIT
-   && INTVAL (operands[3]) == 64 - INTVAL (operands[2])"
-  "shld{q}\t{%2, %1, %0|%0, %1, %2}"
+   && INTVAL (operands[3]) == 64 - INTVAL (operands[2])
+   && "
+  "shld{q}\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "ishift")
(set_attr "prefix_0f" "1")
(set_attr "mode" "DI")
@@ -14611,7 +14609,7 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
 
-(define_insn "x86_64_shld_ndd_1"
+(define_insn "x86_64_shld_ndd_1"
   [(set (match_operand:DI 0 "register_operand" "=r")
 (ior:DI (ashift:DI (match_operand:DI 1 "nonimmediate_operand" "rm")
   (match_operand:QI 3 "const_0_to_63_operand")) @@ 
-14619,15 +14617,66 @@
  (lshiftrt:TI
(zero_extend:TI
  (match_operand:DI 2 "register_operand" "r"))
-   (match_operand:QI 4 "const_0_to_255_operand")) 0)))
-   (clobber (reg:CC FLAGS_REG))]
+   (match_operand:QI 4 "const_0_to_255_operand")) 0)))]
   "TARGET_APX_NDD
-   && INTVAL (operands[4]) == 64 - INTVAL (operands[3])"
-  "shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+   && INTVAL (operands[4]) == 64 - INTVAL (operands[3])
+   && "
+  "shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ishift")
(set_attr "mode" "DI")
(set_attr "length_immediate" "1")])
 
+(define_insn_and_split "*x86_64_shld_shrd_1_nozext_nf"
+  [(set (match_operand:DI 0 "nonimmediate_operand")
+   (ior:DI (ashift:DI (match_operand:DI 4 "nonimmediate_oper

[PATCH v2 7/8] [APX NF] Support APX NF for mul/div

2024-05-22 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*mul3_1_nf): New define_insn.
(*mulqi3_1_nf): Ditto.
(*divmod4_noext_nf): Ditto.
(divmodhiqi3_nf): Ditto.
---
 gcc/config/i386/i386.md | 47 ++---
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
4d684e8d919..087761e5b3a 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -9896,17 +9896,17 @@
 ;;
 ;; On BDVER1, all HI MULs use DoublePath
 
-(define_insn "*mul3_1"
+(define_insn "*mul3_1"
   [(set (match_operand:SWIM248 0 "register_operand" "=r,r,r")
(mult:SWIM248
  (match_operand:SWIM248 1 "nonimmediate_operand" "%rm,rm,0")
- (match_operand:SWIM248 2 "" "K,,r")))
-   (clobber (reg:CC FLAGS_REG))]
-  "!(MEM_P (operands[1]) && MEM_P (operands[2]))"
+ (match_operand:SWIM248 2 "" "K,,r")))]
+  "!(MEM_P (operands[1]) && MEM_P (operands[2]))
+   && "
   "@
-   imul{}\t{%2, %1, %0|%0, %1, %2}
-   imul{}\t{%2, %1, %0|%0, %1, %2}
-   imul{}\t{%2, %0|%0, %2}"
+   imul{}\t{%2, %1, %0|%0, %1, %2}
+   imul{}\t{%2, %1, %0|%0, %1, %2}
+   imul{}\t{%2, %0|%0, %2}"
   [(set_attr "type" "imul")
(set_attr "prefix_0f" "0,0,1")
(set (attr "athlon_decode")
@@ -9967,14 +9967,14 @@
 ;; MUL reg8Direct
 ;; MUL mem8Direct
 
-(define_insn "*mulqi3_1"
+(define_insn "*mulqi3_1"
   [(set (match_operand:QI 0 "register_operand" "=a")
(mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0")
-(match_operand:QI 2 "nonimmediate_operand" "qm")))
-   (clobber (reg:CC FLAGS_REG))]
+(match_operand:QI 2 "nonimmediate_operand" "qm")))]
   "TARGET_QIMODE_MATH
-   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
-  "mul{b}\t%2"
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))
+   && "
+  "mul{b}\t%2"
   [(set_attr "type" "imul")
(set_attr "length_immediate" "0")
(set (attr "athlon_decode")
@@ -7,6 +7,19 @@
   [(set_attr "type" "multi")
(set_attr "mode" "SI")])
 
+(define_insn "*divmod4_noext_nf"
+  [(set (match_operand:SWIM248 0 "register_operand" "=a")
+   (any_div:SWIM248
+ (match_operand:SWIM248 2 "register_operand" "0")
+ (match_operand:SWIM248 3 "nonimmediate_operand" "rm")))
+   (set (match_operand:SWIM248 1 "register_operand" "=d")
+   (:SWIM248 (match_dup 2) (match_dup 3)))
+   (use (match_operand:SWIM248 4 "register_operand" "1"))]
+  "TARGET_APX_NF"
+  "%{nf%} div{}\t%3"
+  [(set_attr "type" "idiv")
+   (set_attr "mode" "")])
+
 (define_insn "*divmod4_noext"
   [(set (match_operand:SWIM248 0 "register_operand" "=a")
(any_div:SWIM248
@@ -11264,7 +11277,7 @@
 ;; Change div/mod to HImode and extend the second argument to HImode  ;; so 
that mode of div/mod matches with mode of arguments.  Otherwise  ;; combine may 
fail.
-(define_insn "divmodhiqi3"
+(define_insn "divmodhiqi3"
   [(set (match_operand:HI 0 "register_operand" "=a")
(ior:HI
  (ashift:HI
@@ -11276,10 +11289,10 @@
(const_int 8))
  (zero_extend:HI
(truncate:QI
- (div:HI (match_dup 1) (any_extend:HI (match_dup 2)))
-   (clobber (reg:CC FLAGS_REG))]
-  "TARGET_QIMODE_MATH"
-  "div{b}\t%2"
+ (div:HI (match_dup 1) (any_extend:HI (match_dup 2)))]
+  "TARGET_QIMODE_MATH
+   && "
+  "div{b}\t%2"
   [(set_attr "type" "idiv")
(set_attr "mode" "QI")])
 
--
2.31.1



[PATCH v2 8/8] [APX NF] Support APX NF for lzcnt/tzcnt/popcnt

2024-05-22 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (clz2_lzcnt_nf): New define_insn.
(*clz2_lzcnt_falsedep_nf): Ditto.
(__nf): Ditto.
(*__falsedep_nf): Ditto.
(_hi_nf): Ditto.
(popcount2_nf): Ditto.
(*popcount2_falsedep_nf): Ditto.
(popcounthi2_nf): Ditto.
---
 gcc/config/i386/i386.md | 124 
 1 file changed, 113 insertions(+), 11 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
087761e5b3a..c9a3a99ca70 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -20250,6 +20250,24 @@
   operands[3] = gen_reg_rtx (mode);
 })
 
+(define_insn_and_split "clz2_lzcnt_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (clz:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))]
+  "TARGET_APX_NF && TARGET_LZCNT"
+  "%{nf%} lzcnt{}\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+[(set (match_dup 0)
+ (clz:SWI48 (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "clz2_lzcnt"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(clz:SWI48
@@ -20273,6 +20291,18 @@
 ; False dependency happens when destination is only updated by tzcnt,  ; lzcnt 
or popcnt.  There is no false dependency when destination is  ; also used in 
source.
+(define_insn "*clz2_lzcnt_falsedep_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (clz:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+  UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_APX_NF && TARGET_LZCNT"
+  "%{nf%} lzcnt{}\t{%1, %0|%0, %1}"
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "")])
+
 (define_insn "*clz2_lzcnt_falsedep"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(clz:SWI48
@@ -20379,6 +20409,25 @@
 ;; Version of lzcnt/tzcnt that is expanded from intrinsics.  This version  ;; 
provides operand size as output when source operand is zero. 
 
+(define_insn_and_split "__nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (unspec:SWI48
+ [(match_operand:SWI48 1 "nonimmediate_operand" "rm")] LT_ZCNT))]
+  "TARGET_APX_NF"
+  "%{nf%} {}\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+[(set (match_dup 0)
+ (unspec:SWI48 [(match_dup 1)] LT_ZCNT))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "type" "")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "_"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(unspec:SWI48
@@ -20403,6 +20452,19 @@
 ; False dependency happens when destination is only updated by tzcnt,  ; lzcnt 
or popcnt.  There is no false dependency when destination is  ; also used in 
source.
+(define_insn "*__falsedep_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (unspec:SWI48
+ [(match_operand:SWI48 1 "nonimmediate_operand" "rm")] LT_ZCNT))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+  UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_APX_NF"
+  "%{nf%} {}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "")])
+
 (define_insn "*__falsedep"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(unspec:SWI48
@@ -20417,13 +20479,12 @@
(set_attr "prefix_rep" "1")
(set_attr "mode" "")])
 
-(define_insn "_hi"
+(define_insn "_hi"
   [(set (match_operand:HI 0 "register_operand" "=r")
(unspec:HI
- [(match_operand:HI 1 "nonimmediate_operand" "rm")] LT_ZCNT))
-   (clobber (reg:CC FLAGS_REG))]
-  ""
-  "{w}\t{%1, %0|%0, %1}"
+ [(match_operand:HI 1 "nonimmediate_operand" "rm")] LT_ZCNT))]
+  ""
+  "{w}\t{%1, %0|%0, %1}"
   [(set_attr "type" "")
(set_attr "prefix_0f" "1")
(set_attr "prefix_rep" "1")
@@ -20841,6 +20902,30 @@
   [(set_attr "type" "bitmanip")
(set_attr "mode" "")])
 
+(define_insn_and_split "popcount2_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (popcount:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))]
+  "TARGET_APX_NF && TARGET_POPCNT"
+{
+#if TARGET_MACHO
+  return "%{nf%} popcnt\t{%1, %0|%0, %1}"; #else
+  return "%{nf%} popcnt{}\t{%1, %0|%0, %1}"; #endif }
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_menti

RE: [PATCH v2 2/8] [APX NF] Support APX NF for {sub/and/or/xor/neg}

2024-05-22 Thread Kong, Lingling
Cc Uros.

From: Kong, Lingling 
Sent: Wednesday, May 22, 2024 4:35 PM
To: gcc-patches@gcc.gnu.org
Cc: Liu, Hongtao ; Kong, Lingling 

Subject: [PATCH v2 2/8] [APX NF] Support APX NF for {sub/and/or/xor/neg}

gcc/ChangeLog:

   * config/i386/i386.md (nf_and_applied): New subst_attr.
   (nf_x64_and_applied): Ditto.
   (*sub_1_nf): New define_insn.
   (*anddi_1_nf): Ditto.
   (*and_1_nf): Ditto.
   (*qi_1_nf): Ditto.
   (*_1_nf): Ditto.
   (*neg_1_nf): Ditto.
   * config/i386/sse.md : New define_split.

gcc/testsuite/ChangeLog:

   * gcc.target/i386/apx-nf.c: Add test.
---
gcc/config/i386/i386.md| 174 +
gcc/config/i386/sse.md |  11 ++
gcc/testsuite/gcc.target/i386/apx-nf.c |   9 ++
3 files changed, 112 insertions(+), 82 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index bae344518bd..099d7f35c8f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -575,7 +575,7 @@
 
noavx512dq,fma_or_avx512vl,avx512vl,noavx512vl,avxvnni,
 
avx512vnnivl,avx512fp16,avxifma,avx512ifmavl,avxneconvert,
 
avx512bf16vl,vpclmulqdqvl,avx_noavx512f,avx_noavx512vl,
- vaes_avx512vl"
+vaes_avx512vl,noapx_nf"
   (const_string "base"))

 ;; The (bounding maximum) length of an instruction immediate.
@@ -981,6 +981,7 @@
 (symbol_ref "TARGET_MMX_WITH_SSE && !TARGET_AVX")
   (eq_attr "mmx_isa" "avx")
 (symbol_ref "TARGET_MMX_WITH_SSE && TARGET_AVX")
+ (eq_attr "isa" "noapx_nf") (symbol_ref "!TARGET_APX_NF")
  ]
  (const_int 1)))

@@ -7893,20 +7894,21 @@
   "split_double_mode (mode, &operands[0], 2, &operands[0], &operands[3]);"
[(set_attr "isa" "*,*,apx_ndd,apx_ndd")])

-(define_insn "*sub_1"
-  [(set (match_operand:SWI 0 "nonimmediate_operand" "=m,,r,r,r")
+(define_insn "*sub_1"
+  [(set (match_operand:SWI 0 "nonimmediate_operand" 
"=m,r,,r,r,r")
  (minus:SWI
-(match_operand:SWI 1 "nonimmediate_operand" "0,0,rm,rjM,r")
-(match_operand:SWI 2 "" 
",,r,,")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (MINUS, mode, operands, TARGET_APX_NDD)"
+   (match_operand:SWI 1 "nonimmediate_operand" "0,0,0,rm,rjM,r")
+   (match_operand:SWI 2 "" 
",,,r,,")))]
+  "ix86_binary_operator_ok (MINUS, mode, operands, TARGET_APX_NDD)
+  && "
   "@
-  sub{}\t{%2, %0|%0, %2}
-  sub{}\t{%2, %0|%0, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd")
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*,*,*,apx_ndd,apx_ndd,apx_ndd")
(set_attr "type" "alu")
(set_attr "mode" "")])

@@ -11795,27 +11797,31 @@
}
[(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd,apx_ndd_64,apx_ndd")])

-(define_insn "*anddi_1"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,rm,r,r,r,r,r,?k")
+(define_subst_attr "nf_and_applied" "nf_subst"  "noapx_nf" "*")
+(define_subst_attr "nf_x64_and_applied" "nf_subst" "noapx_nf" "x64")
+
+(define_insn "*anddi_1"
+  [(set (match_operand:DI 0 "nonimmediate_operand" 
"=r,r,rm,r,r,r,r,r,r,?k")
  (and:DI
-  (match_operand:DI 1 "nonimmediate_operand" 
"%0,r,0,0,rm,rjM,r,qm,k")
-  (match_operand:DI 2 "x86_64_szext_general_operand" 
"Z,Z,re,m,r,e,m,L,k")))
-   (clobber (reg:CC FLAGS_REG))]
+ (match_operand:DI 1 "nonimmediate_operand" 
"%0,r,0,0,0,rm,rjM,r,qm,k")
+ (match_operand:DI 2 "x86_64_szext_general_operand" 
"Z,Z,r,e,m,r,e,m,L,k")))]
   "TARGET_64BIT
-   && ix86_binary_operator_ok (AND, DImode, operands, TARGET_APX_NDD)"
+   && ix86_binary_operator_ok (AND, DImode, operands, TARGET_APX_NDD)
+   && "
   "@
-   and{l}\t{%k2, %k0|%k0, %k2}
-   and{l}\t{%k2, %k1, %k0|%k0, %k1, %k2}
-   and{q}\t{%2, %0|%0, %2

[PATCH v3 1/8] [APX NF]: Support APX NF add

2024-05-28 Thread Kong, Lingling
Hi, compared with v2, these patches restored the original lea pattern position 
and addressed hongtao's comment. 

The APX NF (no flags) feature suppresses the update of status flags
for arithmetic operations.

For NF add, it is not clear whether nf add can be faster than lea. If so,
the pattern needs to be adjusted to prefer lea generation.

gcc/ChangeLog:

* config/i386/i386-opts.h (enum apx_features): Add nf
enumeration.
* config/i386/i386.h (TARGET_APX_NF): New.
* config/i386/i386.md (*add_1_nf): New define_insn.
* config/i386/i386.opt: Add apx_nf enumeration.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-ndd.c: Fixed test.

Co-authored-by: Lingling Kong 
---
 gcc/config/i386/i386-opts.h |   3 +-
 gcc/config/i386/i386.h  |   1 +
 gcc/config/i386/i386.md | 135 
 gcc/config/i386/i386.opt|   3 +
 gcc/testsuite/gcc.target/i386/apx-ndd.c |   2 +-
 5 files changed, 98 insertions(+), 46 deletions(-)

diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h
index ef2825803b3..60176ce609f 100644
--- a/gcc/config/i386/i386-opts.h
+++ b/gcc/config/i386/i386-opts.h
@@ -140,7 +140,8 @@ enum apx_features {
   apx_push2pop2 = 1 << 1,
   apx_ndd = 1 << 2,
   apx_ppx = 1 << 3,
-  apx_all = apx_egpr | apx_push2pop2 | apx_ndd | apx_ppx,
+  apx_nf = 1<< 4,
+  apx_all = apx_egpr | apx_push2pop2 | apx_ndd | apx_ppx | apx_nf,
 };
 
 #endif
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 359a8408263..969391d3013 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -55,6 +55,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see
 #define TARGET_APX_PUSH2POP2 (ix86_apx_features & apx_push2pop2)
 #define TARGET_APX_NDD (ix86_apx_features & apx_ndd)
 #define TARGET_APX_PPX (ix86_apx_features & apx_ppx)
+#define TARGET_APX_NF (ix86_apx_features & apx_nf)
 
 #include "config/vxworks-dummy.h"
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e8073f5a200..1eeadaddeba 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -6290,6 +6290,13 @@
   [(parallel [(set (match_dup 0) (ashift:SWI48 (match_dup 0) (match_dup 1)))
   (clobber (reg:CC FLAGS_REG))])]
   "operands[1] = GEN_INT (exact_log2 (INTVAL (operands[1])));")
+
+(define_split
+  [(set (match_operand:SWI48 0 "general_reg_operand")
+   (mult:SWI48 (match_dup 0) (match_operand:SWI48 1 "const1248_operand")))]
+  "TARGET_APX_NF && reload_completed"
+  [(set (match_dup 0) (ashift:SWI48 (match_dup 0) (match_dup 1)))]
+  "operands[1] = GEN_INT (exact_log2 (INTVAL (operands[1])));")
 

 ;; Add instructions
 
@@ -6437,48 +6444,65 @@
  (clobber (reg:CC FLAGS_REG))])]
  "split_double_mode (mode, &operands[0], 1, &operands[0], &operands[5]);")
 
-(define_insn "*add_1"
-  [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,r,r,r,r")
+(define_subst_attr "nf_name" "nf_subst" "_nf" "")
+(define_subst_attr "nf_prefix" "nf_subst" "%{nf%} " "")
+(define_subst_attr "nf_condition" "nf_subst" "TARGET_APX_NF" "true")
+(define_subst_attr "nf_mem_constraint" "nf_subst" "je" "m")
+(define_subst_attr "nf_applied" "nf_subst" "true" "false")
+
+(define_subst "nf_subst"
+  [(set (match_operand:SWI 0)
+(match_operand:SWI 1))]
+  ""
+  [(set (match_dup 0)
+   (match_dup 1))
+   (clobber (reg:CC FLAGS_REG))])
+
+(define_insn "*add_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" 
"=rm,r,r,r,r,r,r,r")
(plus:SWI48
- (match_operand:SWI48 1 "nonimmediate_operand" "%0,0,r,r,rje,jM,r")
- (match_operand:SWI48 2 "x86_64_general_operand" "re,BM,0,le,r,e,BM")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (PLUS, mode, operands, TARGET_APX_NDD)"
+ (match_operand:SWI48 1 "nonimmediate_operand" "%0,0,0,r,r,rje,jM,r")
+ (match_operand:SWI48 2 "x86_64_general_operand" 
"r,e,BM,0,le,r,e,BM")))]
+  "ix86_binary_operator_ok (PLUS, mode, operands, TARGET_APX_NDD)
+  && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
 {
 case TYPE_LEA:
-  return "#";
+  if (TARGET_APX_NDD && )
+   return "%{nf%} add{}\t{%2, %1, %0|%0, %1, %2}";
+  else
+   return "#";
 
 case TYPE_INCDEC:
   if (operands[2] == const1_rtx)
-return use_ndd ? "inc{}\t{%1, %0|%0, %1}"
- : "inc{}\t%0";
+return use_ndd ? "inc{}\t{%1, %0|%0, %1}"
+ : "inc{}\t%0";
   else
 {
  gcc_assert (operands[2] == constm1_rtx);
- return use_ndd ? "dec{}\t{%1, %0|%0, %1}"
-   : "dec{}\t%0";
+ return use_ndd ? "dec{}\t{%1, %0|%0, %1}"
+   : "dec{}\t%0";
}
 
 default:
   /* For most processors, ADD is faster than LEA.  This alternative
 was added to use ADD as much as possible.  */
-  if (whic

[PATCH v3 3/8] [APX NF] Support APX NF for left shift insns

2024-05-28 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*ashl3_1_nf): New.
(*ashlhi3_1_nf): Ditto.
(*ashlqi3_1_nf): Ditto.
* config/i386/sse.md: New define_split.
---
 gcc/config/i386/i386.md | 96 ++---
 gcc/config/i386/sse.md  | 13 ++
 2 files changed, 83 insertions(+), 26 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index d3cb224abad..4c06c243cc3 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -15011,17 +15011,22 @@
   [(set_attr "type" "ishiftx")
(set_attr "mode" "")])
 
-(define_insn "*ashl3_1"
+(define_insn "*ashl3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,?k,r")
(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" 
"0,l,rm,k,rm")
- (match_operand:QI 2 "nonmemory_operand" 
"c,M,r,,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFT, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" 
"c,M,r,,c")))]
+  "ix86_binary_operator_ok (ASHIFT, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
 {
 case TYPE_LEA:
+  if (TARGET_APX_NDD && )
+   return "%{nf%} sal{}\t{%2, %1, %0|%0, %1, %2}";
+  else
+   return "#";
+
 case TYPE_ISHIFTX:
 case TYPE_MSKLOG:
   return "#";
@@ -15029,7 +15034,7 @@
 case TYPE_ALU:
   gcc_assert (operands[2] == const1_rtx);
   gcc_assert (rtx_equal_p (operands[0], operands[1]));
-  return "add{}\t%0, %0";
+  return "add{}\t%0, %0";
 
 default:
   if (operands[2] == const1_rtx
@@ -15037,11 +15042,11 @@
  /* For NDD form instructions related to TARGET_SHIFT1, the $1
 immediate do not need to be omitted as assembler will map it
 to use shorter encoding. */
- && !use_ndd)
+ && !use_ndd && !)
return "sal{}\t%0";
   else
-   return use_ndd ? "sal{}\t{%2, %1, %0|%0, %1, %2}"
-  : "sal{}\t{%2, %0|%0, %2}";
+   return use_ndd ? "sal{}\t{%2, %1, %0|%0, %1, 
%2}"
+  : "sal{}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,*,bmi2,avx512bw,apx_ndd")
@@ -15072,6 +15077,17 @@
(set_attr "mode" "")])
 
 ;; Convert shift to the shiftx pattern to avoid flags dependency.
+;; For NF/NDD doesn't support shift count as r, it just support c,
+;; and it has no flag.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+   (ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+ (match_operand:QI 2 "register_operand")))]
+  "TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+   (ashift:SWI48 (match_dup 1) (match_dup 2)))]
+  "operands[2] = gen_lowpart (mode, operands[2]);")
+
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
(ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
@@ -15158,32 +15174,37 @@
(zero_extend:DI (ashift:SI (match_dup 1) (match_dup 2]
   "operands[2] = gen_lowpart (SImode, operands[2]);")
 
-(define_insn "*ashlhi3_1"
+(define_insn "*ashlhi3_1"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=rm,Yp,?k,r")
(ashift:HI (match_operand:HI 1 "nonimmediate_operand" "0,l,k,rm")
-  (match_operand:QI 2 "nonmemory_operand" "cI,M,Ww,cI")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFT, HImode, operands, TARGET_APX_NDD)"
+  (match_operand:QI 2 "nonmemory_operand" "cI,M,Ww,cI")))]
+  "ix86_binary_operator_ok (ASHIFT, HImode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
 {
 case TYPE_LEA:
+  if (TARGET_APX_NDD && )
+   return "%{nf%} sal{w}\t{%2, %1, %0|%0, %1, %2}";
+  else
+   return "#";
+
 case TYPE_MSKLOG:
   return "#";
 
 case TYPE_ALU:
   gcc_assert (operands[2] == const1_rtx);
-  return "add{w}\t%0, %0";
+  return "add{w}\t%0, %0";
 
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
- && !use_ndd)
+ && !use_ndd && !)
return "sal{w}\t%0";
   else
-   return use_ndd ? "sal{w}\t{%2, %1, %0|%0, %1, %2}"
-  : "sal{w}\t{%2, %0|%0, %2}";
+   return use_ndd ? "sal{w}\t{%2, %1, %0|%0, %1, %2}"
+  : "sal{w}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,*,avx512f,apx_ndd")
@@ -15211,31 +15232,36 @@
(const_string "*")))
(set_attr "mode" "HI,SI,HI,HI")])
 
-(define_insn "*ashlqi3_1"
+(define_insn "*ashlqi3_1"
   [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,Yp,?k,r")
(ashift:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,l,k,rm")
-  (match_operand:QI 2 "nonmemory_operand" "cI,cI,M,Wb,cI")))
-   (clobber (reg:CC FLAGS_REG))]
-  "i

[PATCH v3 2/8] [APX NF] Support APX NF for {sub/and/or/xor/neg}

2024-05-28 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (nf_nonf_attr): New subst_attr.
(nf_nonf_x64_attr): Ditto.
(*sub_1_nf): New define_insn.
(*anddi_1_nf): Ditto.
(*and_1_nf): Ditto.
(*qi_1_nf): Ditto.
(*_1_nf): Ditto.
(*neg_1_nf): Ditto.
* config/i386/sse.md : New define_split.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-nf.c: Add test.
---
 gcc/config/i386/i386.md| 173 +
 gcc/config/i386/sse.md |  11 ++
 gcc/testsuite/gcc.target/i386/apx-nf.c |  12 ++
 3 files changed, 114 insertions(+), 82 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-nf.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 1eeadaddeba..d3cb224abad 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -575,7 +575,7 @@
noavx512dq,fma_or_avx512vl,avx512vl,noavx512vl,avxvnni,
avx512vnnivl,avx512fp16,avxifma,avx512ifmavl,avxneconvert,
avx512bf16vl,vpclmulqdqvl,avx_noavx512f,avx_noavx512vl,
-   vaes_avx512vl"
+   vaes_avx512vl,noapx_nf"
   (const_string "base"))
 
 ;; The (bounding maximum) length of an instruction immediate.
@@ -981,6 +981,7 @@
   (symbol_ref "TARGET_MMX_WITH_SSE && !TARGET_AVX")
 (eq_attr "mmx_isa" "avx")
   (symbol_ref "TARGET_MMX_WITH_SSE && TARGET_AVX")
+(eq_attr "isa" "noapx_nf") (symbol_ref "!TARGET_APX_NF")
]
(const_int 1)))
 
@@ -6449,6 +6450,8 @@
 (define_subst_attr "nf_condition" "nf_subst" "TARGET_APX_NF" "true")
 (define_subst_attr "nf_mem_constraint" "nf_subst" "je" "m")
 (define_subst_attr "nf_applied" "nf_subst" "true" "false")
+(define_subst_attr "nf_nonf_attr" "nf_subst"  "noapx_nf" "*")
+(define_subst_attr "nf_nonf_x64_attr" "nf_subst" "noapx_nf" "x64")
 
 (define_subst "nf_subst"
   [(set (match_operand:SWI 0)
@@ -7893,20 +7896,21 @@
   "split_double_mode (mode, &operands[0], 2, &operands[0], &operands[3]);"
 [(set_attr "isa" "*,*,apx_ndd,apx_ndd")])
 
-(define_insn "*sub_1"
-  [(set (match_operand:SWI 0 "nonimmediate_operand" "=m,,r,r,r")
+(define_insn "*sub_1"
+  [(set (match_operand:SWI 0 "nonimmediate_operand" 
"=m,r,,r,r,r")
(minus:SWI
- (match_operand:SWI 1 "nonimmediate_operand" "0,0,rm,rjM,r")
- (match_operand:SWI 2 "" ",,r,,")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (MINUS, mode, operands, TARGET_APX_NDD)"
+ (match_operand:SWI 1 "nonimmediate_operand" "0,0,0,rm,rjM,r")
+ (match_operand:SWI 2 "" ",,,r,,")))]
+  "ix86_binary_operator_ok (MINUS, mode, operands, TARGET_APX_NDD)
+  && "
   "@
-  sub{}\t{%2, %0|%0, %2}
-  sub{}\t{%2, %0|%0, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}
-  sub{}\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd")
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %0|%0, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}
+  sub{}\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "*,*,*,apx_ndd,apx_ndd,apx_ndd")
(set_attr "type" "alu")
(set_attr "mode" "")])
 
@@ -11795,27 +11799,28 @@
 }
 [(set_attr "isa" "*,*,apx_ndd,apx_ndd,apx_ndd,apx_ndd_64,apx_ndd")])
 
-(define_insn "*anddi_1"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,r,rm,r,r,r,r,r,?k")
+(define_insn "*anddi_1"
+  [(set (match_operand:DI 0 "nonimmediate_operand" 
"=r,r,rm,r,r,r,r,r,r,?k")
(and:DI
-(match_operand:DI 1 "nonimmediate_operand" "%0,r,0,0,rm,rjM,r,qm,k")
-(match_operand:DI 2 "x86_64_szext_general_operand" 
"Z,Z,re,m,r,e,m,L,k")))
-   (clobber (reg:CC FLAGS_REG))]
+(match_operand:DI 1 "nonimmediate_operand" "%0,r,0,0,0,rm,rjM,r,qm,k")
+(match_operand:DI 2 "x86_64_szext_general_operand" 
"Z,Z,r,e,m,r,e,m,L,k")))]
   "TARGET_64BIT
-   && ix86_binary_operator_ok (AND, DImode, operands, TARGET_APX_NDD)"
+   && ix86_binary_operator_ok (AND, DImode, operands, TARGET_APX_NDD)
+   && "
   "@
-   and{l}\t{%k2, %k0|%k0, %k2}
-   and{l}\t{%k2, %k1, %k0|%k0, %k1, %k2}
-   and{q}\t{%2, %0|%0, %2}
-   and{q}\t{%2, %0|%0, %2}
-   and{q}\t{%2, %1, %0|%0, %1, %2}
-   and{q}\t{%2, %1, %0|%0, %1, %2}
-   and{q}\t{%2, %1, %0|%0, %1, %2}
+   and{l}\t{%k2, %k0|%k0, %k2}
+   and{l}\t{%k2, %k1, %k0|%k0, %k1, %k2}
+   and{q}\t{%2, %0|%0, %2}
+   and{q}\t{%2, %0|%0, %2}
+   and{q}\t{%2, %0|%0, %2}
+   and{q}\t{%2, %1, %0|%0, %1, %2}
+   and{q}\t{%2, %1, %0|%0, %1, %2}
+   and{q}\t{%2, %1, %0|%0, %1, %2}
#
#"
-  [(set_attr "isa" "x64,apx_ndd,x64,x64,apx_ndd,apx_ndd,apx_ndd,x64,avx512bw")
-   (set_attr "type" "alu,alu,alu,alu,alu,alu,alu,imovx,msklog")
-   (set_attr "length_immediate" "*,*,*,*,*,*,*,0,*")
+  [(set_attr "isa" 
"x64,apx_ndd,x64,x64,x64,apx_ndd,apx_ndd,apx_ndd,,avx512bw")
+   (set_attr "type" "alu,alu,alu,alu,alu,alu,alu,alu,imovx,msklog")
+   (set_attr "length_immediate" "*,*,*,*,*,*,*,*

[PATCH v3 5/8] [APX NF] Support APX NF for rotate insns

2024-05-28 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (ashr3_cvt_nf): New define_insn.
(*3_1_nf): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-nf.c: Add NF test for rotate insns.
---
 gcc/config/i386/i386.md| 59 +-
 gcc/testsuite/gcc.target/i386/apx-nf.c |  5 +++
 2 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index d10caf04fcc..9d518e90d07 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16245,19 +16245,19 @@
 (define_mode_attr cvt_mnemonic
   [(SI "{cltd|cdq}") (DI "{cqto|cqo}")])
 
-(define_insn "ashr3_cvt"
+(define_insn "ashr3_cvt"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=*d,rm,r")
(ashiftrt:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "*a,0,rm")
- (match_operand:QI 2 "const_int_operand")))
-   (clobber (reg:CC FLAGS_REG))]
+ (match_operand:QI 2 "const_int_operand")))]
   "INTVAL (operands[2]) == GET_MODE_BITSIZE (mode)-1
&& (TARGET_USE_CLTD || optimize_function_for_size_p (cfun))
-   && ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+   && ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)
+   && "
   "@

-   sar{}\t{%2, %0|%0, %2}
-   sar{}\t{%2, %1, %0|%0, %1, %2}"
+   sar{}\t{%2, %0|%0, %2}
+   sar{}\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "isa" "*,*,apx_ndd")
(set_attr "type" "imovx,ishift,ishift")
(set_attr "prefix_0f" "0,*,*")
@@ -17109,28 +17109,31 @@
   [(set_attr "type" "rotatex")
(set_attr "mode" "")])
 
-(define_insn "*3_1"
+(define_insn "*3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
(any_rotate:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,rm")
- (match_operand:QI 2 "nonmemory_operand" "c,,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c,,c")))]
+  "ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
 {
 case TYPE_ROTATEX:
-  return "#";
+  if (TARGET_APX_NDD && )
+   return "%{nf%} {}\t{%2, %1, %0|%0, %1, %2}";
+  else
+   return "#";
 
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
- && !use_ndd)
+ && !use_ndd && !)
return "{}\t%0";
   else
-   return use_ndd ? "{}\t{%2, %1, %0|%0, %1, %2}"
-  : "{}\t{%2, %0|%0, %2}";
+   return use_ndd ? "{}\t{%2, %1, %0|%0, 
%1, %2}"
+  : "{}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,bmi2,apx_ndd")
@@ -17164,6 +17167,20 @@
   operands[2] = GEN_INT ((bitsize - INTVAL (operands[2])) % bitsize);
 })
 
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+   (rotate:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+ (match_operand:QI 2 "const_int_operand")))]
+  "TARGET_BMI2 && reload_completed && !optimize_function_for_size_p (cfun)
+   && !TARGET_APX_NDD"
+  [(set (match_dup 0)
+   (rotatert:SWI48 (match_dup 1) (match_dup 2)))]
+{
+  int bitsize = GET_MODE_BITSIZE (mode);
+
+  operands[2] = GEN_INT ((bitsize - INTVAL (operands[2])) % bitsize);
+})
+
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
(rotatert:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
@@ -17251,22 +17268,22 @@
   [(set (match_dup 0)
(zero_extend:DI (rotatert:SI (match_dup 1) (match_dup 2])
 
-(define_insn "*3_1"
+(define_insn "*3_1"
   [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m,r")
(any_rotate:SWI12 (match_operand:SWI12 1 "nonimmediate_operand" "0,rm")
- (match_operand:QI 2 "nonmemory_operand" "c,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" 
"c,c")))]
+  "ix86_binary_operator_ok (, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   if (operands[2] == const1_rtx
   && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
-  && !use_ndd)
+  && !use_ndd && !)
 return "{}\t%0";
   else
 return use_ndd
-  ? "{}\t{%2, %1, %0|%0, %1, %2}"
-  : "{}\t{%2, %0|%0, %2}";
+  ? "{}\t{%2, %1, %0|%0, %1, %2}"
+  : "{}\t{%2, %0|%0, %2}";
 }
   [(set_attr "isa" "*,apx_ndd")
(set_attr "type" "rotate")
diff --git a/gcc/testsuite/gcc.target/i386/apx-nf.c 
b/gcc/testsuite/gcc.target/i386/apx-nf.c
index f33a994f0b7..ed859b399b8 100644
--- a/gcc/testsuite/gcc.target/i386/apx-nf.c
+++ b/gcc/testsuite/gcc.target/i386/apx-nf.c
@@ -2,6 +2,7 @@
 /* { dg-options "-mapx-features=egpr,push2pop2,ndd,ppx,nf -march=x86-64 -O2" } 
*/
 /* { dg-final

[PATCH v3 8/8] [APX NF] Support APX NF for lzcnt/tzcnt/popcnt

2024-05-28 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (clz2_lzcnt_nf): New define_insn.
(*clz2_lzcnt_falsedep_nf): Ditto.
(__nf): Ditto.
(*__falsedep_nf): Ditto.
(_hi_nf): Ditto.
(popcount2_nf): Ditto.
(*popcount2_falsedep_nf): Ditto.
(popcounthi2_nf): Ditto.
---
 gcc/config/i386/i386.md | 124 
 1 file changed, 113 insertions(+), 11 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e688e92785e..b0eb497cd23 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -20269,6 +20269,24 @@
   operands[3] = gen_reg_rtx (mode);
 })
 
+(define_insn_and_split "clz2_lzcnt_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (clz:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))]
+  "TARGET_APX_NF && TARGET_LZCNT"
+  "%{nf%} lzcnt{}\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+[(set (match_dup 0)
+ (clz:SWI48 (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "clz2_lzcnt"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(clz:SWI48
@@ -20292,6 +20310,18 @@
 ; False dependency happens when destination is only updated by tzcnt,
 ; lzcnt or popcnt.  There is no false dependency when destination is
 ; also used in source.
+(define_insn "*clz2_lzcnt_falsedep_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (clz:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+  UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_APX_NF && TARGET_LZCNT"
+  "%{nf%} lzcnt{}\t{%1, %0|%0, %1}"
+  [(set_attr "prefix_rep" "1")
+   (set_attr "type" "bitmanip")
+   (set_attr "mode" "")])
+
 (define_insn "*clz2_lzcnt_falsedep"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(clz:SWI48
@@ -20398,6 +20428,25 @@
 ;; Version of lzcnt/tzcnt that is expanded from intrinsics.  This version
 ;; provides operand size as output when source operand is zero. 
 
+(define_insn_and_split "__nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (unspec:SWI48
+ [(match_operand:SWI48 1 "nonimmediate_operand" "rm")] LT_ZCNT))]
+  "TARGET_APX_NF"
+  "%{nf%} {}\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+[(set (match_dup 0)
+ (unspec:SWI48 [(match_dup 1)] LT_ZCNT))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "type" "")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "_"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(unspec:SWI48
@@ -20422,6 +20471,19 @@
 ; False dependency happens when destination is only updated by tzcnt,
 ; lzcnt or popcnt.  There is no false dependency when destination is
 ; also used in source.
+(define_insn "*__falsedep_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (unspec:SWI48
+ [(match_operand:SWI48 1 "nonimmediate_operand" "rm")] LT_ZCNT))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+  UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_APX_NF"
+  "%{nf%} {}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "")
+   (set_attr "prefix_0f" "1")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "")])
+
 (define_insn "*__falsedep"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
(unspec:SWI48
@@ -20436,13 +20498,12 @@
(set_attr "prefix_rep" "1")
(set_attr "mode" "")])
 
-(define_insn "_hi"
+(define_insn "_hi"
   [(set (match_operand:HI 0 "register_operand" "=r")
(unspec:HI
- [(match_operand:HI 1 "nonimmediate_operand" "rm")] LT_ZCNT))
-   (clobber (reg:CC FLAGS_REG))]
-  ""
-  "{w}\t{%1, %0|%0, %1}"
+ [(match_operand:HI 1 "nonimmediate_operand" "rm")] LT_ZCNT))]
+  ""
+  "{w}\t{%1, %0|%0, %1}"
   [(set_attr "type" "")
(set_attr "prefix_0f" "1")
(set_attr "prefix_rep" "1")
@@ -20860,6 +20921,30 @@
   [(set_attr "type" "bitmanip")
(set_attr "mode" "")])
 
+(define_insn_and_split "popcount2_nf"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+   (popcount:SWI48
+ (match_operand:SWI48 1 "nonimmediate_operand" "rm")))]
+  "TARGET_APX_NF && TARGET_POPCNT"
+{
+#if TARGET_MACHO
+  return "%{nf%} popcnt\t{%1, %0|%0, %1}";
+#else
+  return "%{nf%} popcnt{}\t{%1, %0|%0, %1}";
+#endif
+}
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentione

[PATCH v3 4/8] [APX NF] Support APX NF for right shift insns

2024-05-28 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*ashr3_1_nf): New.
(*lshr3_1_nf): Ditto.
(*lshrqi3_1_nf): Ditto.
(*lshrhi3_1_nf): Ditto.
---
 gcc/config/i386/i386.md | 82 +++--
 1 file changed, 46 insertions(+), 36 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 4c06c243cc3..d10caf04fcc 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16323,13 +16323,13 @@
   [(set_attr "type" "ishiftx")
(set_attr "mode" "")])
 
-(define_insn "*ashr3_1"
+(define_insn "*ashr3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r")
(ashiftrt:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,rm")
- (match_operand:QI 2 "nonmemory_operand" "c,r,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c,r,c")))]
+  "ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
@@ -16340,11 +16340,11 @@
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
- && !use_ndd)
+ && !use_ndd && !)
return "sar{}\t%0";
   else
-   return use_ndd ? "sar{}\t{%2, %1, %0|%0, %1, %2}"
-  : "sar{}\t{%2, %0|%0, %2}";
+   return use_ndd ? "sar{}\t{%2, %1, %0|%0, %1, 
%2}"
+  : "sar{}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,bmi2,apx_ndd")
@@ -16384,14 +16384,13 @@
 }
 [(set_attr "isa" "*,*,*,apx_ndd")])
 
-
-(define_insn "*lshr3_1"
+(define_insn "*lshr3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,?k,r")
(lshiftrt:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "0,rm,k,rm")
- (match_operand:QI 2 "nonmemory_operand" "c,r,,c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (LSHIFTRT, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c,r,,c")))]
+  "ix86_binary_operator_ok (LSHIFTRT, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
@@ -16403,11 +16402,11 @@
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
- && !use_ndd)
+ && !use_ndd && !)
return "shr{}\t%0";
   else
-   return use_ndd ? "shr{}\t{%2, %1, %0|%0, %1, %2}"
-  : "shr{}\t{%2, %0|%0, %2}";
+   return use_ndd ? "shr{}\t{%2, %1, %0|%0, %1, 
%2}"
+  : "shr{}\t{%2, %0|%0, %2}";
 }
 }
   [(set_attr "isa" "*,bmi2,avx512bw,apx_ndd")
@@ -16423,6 +16422,17 @@
(set_attr "mode" "")])
 
 ;; Convert shift to the shiftx pattern to avoid flags dependency.
+;; For NF/NDD doesn't support shift count as r, it just support c,
+;; and it has no flag.
+(define_split
+  [(set (match_operand:SWI48 0 "register_operand")
+   (any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
+  (match_operand:QI 2 "register_operand")))]
+  "TARGET_BMI2 && reload_completed"
+  [(set (match_dup 0)
+   (any_shiftrt:SWI48 (match_dup 1) (match_dup 2)))]
+  "operands[2] = gen_lowpart (mode, operands[2]);")
+
 (define_split
   [(set (match_operand:SWI48 0 "register_operand")
(any_shiftrt:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")
@@ -16491,22 +16501,22 @@
(zero_extend:DI (any_shiftrt:SI (match_dup 1) (match_dup 2]
   "operands[2] = gen_lowpart (SImode, operands[2]);")
 
-(define_insn "*ashr3_1"
+(define_insn "*ashr3_1"
   [(set (match_operand:SWI12 0 "nonimmediate_operand" "=m, r")
(ashiftrt:SWI12
  (match_operand:SWI12 1 "nonimmediate_operand" "0, rm")
- (match_operand:QI 2 "nonmemory_operand" "c, c")))
-   (clobber (reg:CC FLAGS_REG))]
-  "ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)"
+ (match_operand:QI 2 "nonmemory_operand" "c, c")))]
+  "ix86_binary_operator_ok (ASHIFTRT, mode, operands, TARGET_APX_NDD)
+   && "
 {
   bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   if (operands[2] == const1_rtx
   && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
-  && !use_ndd)
+  && !use_ndd && !)
 return "sar{}\t%0";
   else
-return use_ndd ? "sar{}\t{%2, %1, %0|%0, %1, %2}"
-  : "sar{}\t{%2, %0|%0, %2}";
+return use_ndd ? "sar{}\t{%2, %1, %0|%0, %1, %2}"
+  : "sar{}\t{%2, %0|%0, %2}";
 }
   [(set_attr "isa" "*, apx_ndd")
(set_attr "type" "ishift")
@@ -16519,13 +16529,13 @@
(const_string "*")))
(set_attr "mode" "")])
 
-(define_insn "*lshrqi3_1"
+(define_insn "*lshrqi3_1"
   [(set (match_operand:QI 0 "nonimmediate_operand"  "=qm,?k,r")
(lshiftrt:QI
  (match_operand:QI

[PATCH v3 7/8] [APX NF] Support APX NF for mul/div

2024-05-28 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (*mul3_1_nf): New define_insn.
(*mulqi3_1_nf): Ditto.
(*divmod4_noext_nf): Ditto.
(divmodhiqi3_nf): Ditto.
---
 gcc/config/i386/i386.md | 47 ++---
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 719cce7d3ef..e688e92785e 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -9898,17 +9898,17 @@
 ;;
 ;; On BDVER1, all HI MULs use DoublePath
 
-(define_insn "*mul3_1"
+(define_insn "*mul3_1"
   [(set (match_operand:SWIM248 0 "register_operand" "=r,r,r")
(mult:SWIM248
  (match_operand:SWIM248 1 "nonimmediate_operand" "%rm,rm,0")
- (match_operand:SWIM248 2 "" "K,,r")))
-   (clobber (reg:CC FLAGS_REG))]
-  "!(MEM_P (operands[1]) && MEM_P (operands[2]))"
+ (match_operand:SWIM248 2 "" "K,,r")))]
+  "!(MEM_P (operands[1]) && MEM_P (operands[2]))
+   && "
   "@
-   imul{}\t{%2, %1, %0|%0, %1, %2}
-   imul{}\t{%2, %1, %0|%0, %1, %2}
-   imul{}\t{%2, %0|%0, %2}"
+   imul{}\t{%2, %1, %0|%0, %1, %2}
+   imul{}\t{%2, %1, %0|%0, %1, %2}
+   imul{}\t{%2, %0|%0, %2}"
   [(set_attr "type" "imul")
(set_attr "prefix_0f" "0,0,1")
(set (attr "athlon_decode")
@@ -9969,14 +9969,14 @@
 ;; MUL reg8Direct
 ;; MUL mem8Direct
 
-(define_insn "*mulqi3_1"
+(define_insn "*mulqi3_1"
   [(set (match_operand:QI 0 "register_operand" "=a")
(mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0")
-(match_operand:QI 2 "nonimmediate_operand" "qm")))
-   (clobber (reg:CC FLAGS_REG))]
+(match_operand:QI 2 "nonimmediate_operand" "qm")))]
   "TARGET_QIMODE_MATH
-   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"
-  "mul{b}\t%2"
+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))
+   && "
+  "mul{b}\t%2"
   [(set_attr "type" "imul")
(set_attr "length_immediate" "0")
(set (attr "athlon_decode")
@@ -9,6 +9,19 @@
   [(set_attr "type" "multi")
(set_attr "mode" "SI")])
 
+(define_insn "*divmod4_noext_nf"
+  [(set (match_operand:SWIM248 0 "register_operand" "=a")
+   (any_div:SWIM248
+ (match_operand:SWIM248 2 "register_operand" "0")
+ (match_operand:SWIM248 3 "nonimmediate_operand" "rm")))
+   (set (match_operand:SWIM248 1 "register_operand" "=d")
+   (:SWIM248 (match_dup 2) (match_dup 3)))
+   (use (match_operand:SWIM248 4 "register_operand" "1"))]
+  "TARGET_APX_NF"
+  "%{nf%} div{}\t%3"
+  [(set_attr "type" "idiv")
+   (set_attr "mode" "")])
+
 (define_insn "*divmod4_noext"
   [(set (match_operand:SWIM248 0 "register_operand" "=a")
(any_div:SWIM248
@@ -11266,7 +11279,7 @@
 ;; Change div/mod to HImode and extend the second argument to HImode
 ;; so that mode of div/mod matches with mode of arguments.  Otherwise
 ;; combine may fail.
-(define_insn "divmodhiqi3"
+(define_insn "divmodhiqi3"
   [(set (match_operand:HI 0 "register_operand" "=a")
(ior:HI
  (ashift:HI
@@ -11278,10 +11291,10 @@
(const_int 8))
  (zero_extend:HI
(truncate:QI
- (div:HI (match_dup 1) (any_extend:HI (match_dup 2)))
-   (clobber (reg:CC FLAGS_REG))]
-  "TARGET_QIMODE_MATH"
-  "div{b}\t%2"
+ (div:HI (match_dup 1) (any_extend:HI (match_dup 2)))]
+  "TARGET_QIMODE_MATH
+   && "
+  "div{b}\t%2"
   [(set_attr "type" "idiv")
(set_attr "mode" "QI")])
 
-- 
2.31.1



[PATCH v3 6/8] [APX NF] Support APX NF for shld/shrd

2024-05-28 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386.md (x86_64_shld_nf): New define_insn.
(x86_64_shld_ndd_nf): Ditto.
(x86_64_shld_1_nf): Ditto.
(x86_64_shld_ndd_1_nf): Ditto.
(*x86_64_shld_shrd_1_nozext_nf): Ditto.
(x86_shld_nf): Ditto.
(x86_shld_ndd_nf): Ditto.
(x86_shld_1_nf): Ditto.
(x86_shld_ndd_1_nf): Ditto.
(*x86_shld_shrd_1_nozext_nf): Ditto.
(3_doubleword_lowpart_nf): Ditto.
(x86_64_shrd_nf): Ditto.
(x86_64_shrd_ndd_nf): Ditto.
(x86_64_shrd_1_nf): Ditto.
(x86_64_shrd_ndd_1_nf): Ditto.
(*x86_64_shrd_shld_1_nozext_nf): Ditto.
(x86_shrd_nf): Ditto.
(x86_shrd_ndd_nf): Ditto.
(x86_shrd_1_nf): Ditto.
(x86_shrd_ndd_1_nf): Ditto.
(*x86_shrd_shld_1_nozext_nf): Ditto.
---
 gcc/config/i386/i386.md | 377 +++-
 1 file changed, 296 insertions(+), 81 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 9d518e90d07..719cce7d3ef 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -14551,7 +14551,7 @@
   DONE;
 })
 
-(define_insn "x86_64_shld"
+(define_insn "x86_64_shld"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
 (ior:DI (ashift:DI (match_dup 0)
  (and:QI (match_operand:QI 2 "nonmemory_operand" "Jc")
@@ -14561,10 +14561,9 @@
(zero_extend:TI
  (match_operand:DI 1 "register_operand" "r"))
(minus:QI (const_int 64)
- (and:QI (match_dup 2) (const_int 63 0)))
-   (clobber (reg:CC FLAGS_REG))]
-  "TARGET_64BIT"
-  "shld{q}\t{%2, %1, %0|%0, %1, %2}"
+ (and:QI (match_dup 2) (const_int 63 0)))]
+  "TARGET_64BIT && "
+  "shld{q}\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "ishift")
(set_attr "prefix_0f" "1")
(set_attr "mode" "DI")
@@ -14572,7 +14571,7 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
 
-(define_insn "x86_64_shld_ndd"
+(define_insn "x86_64_shld_ndd"
   [(set (match_operand:DI 0 "register_operand" "=r")
 (ior:DI (ashift:DI (match_operand:DI 1 "nonimmediate_operand" "rm")
  (and:QI (match_operand:QI 3 "nonmemory_operand" "Jc")
@@ -14582,14 +14581,13 @@
(zero_extend:TI
  (match_operand:DI 2 "register_operand" "r"))
(minus:QI (const_int 64)
- (and:QI (match_dup 3) (const_int 63 0)))
-   (clobber (reg:CC FLAGS_REG))]
-  "TARGET_APX_NDD"
-  "shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ (and:QI (match_dup 3) (const_int 63 0)))]
+  "TARGET_APX_NDD && "
+  "shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ishift")
(set_attr "mode" "DI")])
 
-(define_insn "x86_64_shld_1"
+(define_insn "x86_64_shld_1"
   [(set (match_operand:DI 0 "nonimmediate_operand" "+r*m")
 (ior:DI (ashift:DI (match_dup 0)
   (match_operand:QI 2 "const_0_to_63_operand"))
@@ -14597,11 +14595,11 @@
  (lshiftrt:TI
(zero_extend:TI
  (match_operand:DI 1 "register_operand" "r"))
-   (match_operand:QI 3 "const_0_to_255_operand")) 0)))
-   (clobber (reg:CC FLAGS_REG))]
+   (match_operand:QI 3 "const_0_to_255_operand")) 0)))]
   "TARGET_64BIT
-   && INTVAL (operands[3]) == 64 - INTVAL (operands[2])"
-  "shld{q}\t{%2, %1, %0|%0, %1, %2}"
+   && INTVAL (operands[3]) == 64 - INTVAL (operands[2])
+   && "
+  "shld{q}\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "ishift")
(set_attr "prefix_0f" "1")
(set_attr "mode" "DI")
@@ -14610,7 +14608,7 @@
(set_attr "amdfam10_decode" "vector")
(set_attr "bdver1_decode" "vector")])
 
-(define_insn "x86_64_shld_ndd_1"
+(define_insn "x86_64_shld_ndd_1"
   [(set (match_operand:DI 0 "register_operand" "=r")
 (ior:DI (ashift:DI (match_operand:DI 1 "nonimmediate_operand" "rm")
   (match_operand:QI 3 "const_0_to_63_operand"))
@@ -14618,15 +14616,66 @@
  (lshiftrt:TI
(zero_extend:TI
  (match_operand:DI 2 "register_operand" "r"))
-   (match_operand:QI 4 "const_0_to_255_operand")) 0)))
-   (clobber (reg:CC FLAGS_REG))]
+   (match_operand:QI 4 "const_0_to_255_operand")) 0)))]
   "TARGET_APX_NDD
-   && INTVAL (operands[4]) == 64 - INTVAL (operands[3])"
-  "shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+   && INTVAL (operands[4]) == 64 - INTVAL (operands[3])
+   && "
+  "shld{q}\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ishift")
(set_attr "mode" "DI")
(set_attr "length_immediate" "1")])
 
+(define_insn_and_split "*x86_64_shld_shrd_1_nozext_nf"
+  [(set (match_operand:DI 0 "nonimmediate_operand")
+   (ior:DI (ashift:DI (match_operand:DI 4 "nonimmediate_operand")

PING [PATCH v2 1/2] [APX CFCMOV] Support APX CFCMOV in if_convert pass

2024-08-08 Thread Kong, Lingling
Hi,

Gently ping.

Thanks,
Lingling
From: Kong, Lingling 
Sent: Tuesday, June 25, 2024 2:46 PM
To: gcc-patches@gcc.gnu.org
Cc: Alexander Monakov ; Uros Bizjak ; 
lingling.ko...@gmail.com; Hongtao Liu ; Jeff Law 
; Richard Biener 
Subject: RE: [PATCH v2 1/2] [APX CFCMOV] Support APX CFCMOV in if_convert pass

Hi,


Gently ping for this.

This version has removed the target hook and added a new optab for cfcmov.

Thanks,
Lingling

From: Kong, Lingling mailto:lingling.k...@intel.com>>
Sent: Tuesday, June 18, 2024 3:41 PM
To: gcc-patches@gcc.gnu.org<mailto:gcc-patches@gcc.gnu.org>
Cc: Alexander Monakov mailto:amona...@ispras.ru>>; Uros 
Bizjak mailto:ubiz...@gmail.com>>; 
lingling.ko...@gmail.com<mailto:lingling.ko...@gmail.com>; Hongtao Liu 
mailto:crazy...@gmail.com>>; Jeff Law 
mailto:jeffreya...@gmail.com>>; Richard Biener 
mailto:richard.guent...@gmail.com>>
Subject: [PATCH v2 1/2] [APX CFCMOV] Support APX CFCMOV in if_convert pass


The APX CFCMOV feature implements conditional faulting, which means

that all memory faults are suppressed when the condition code

evaluates to false while loading or storing a memory operand. Now we

can load or store a memory operand that may trap or fault for a

conditional move.



In the middle end, we currently don't support a conditional move if we know

that a load from A or B could trap or fault. To enable CFCMOV, we

added a new optab.



Conditional move with fault suppression for a conditional mem store would not

move any arithmetic calculations. For a conditional mem load, we currently only

support a conditional move between one trapping mem operand and one

non-trapping, non-mem operand.



gcc/ChangeLog:



   * ifcvt.cc (noce_try_cmove_load_mem_notrap): Allow convert

   to cfcmov for conditional load.

   (noce_try_cmove_store_mem_notrap): Convert to conditional store.

   (noce_process_if_block): Ditto.

   * optabs.def (OPTAB_D): New optab.

---

gcc/ifcvt.cc   | 246 -

gcc/optabs.def |   1 +

2 files changed, 246 insertions(+), 1 deletion(-)



diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc

index 58ed42673e5..65c069b8cc6 100644

--- a/gcc/ifcvt.cc

+++ b/gcc/ifcvt.cc

@@ -783,6 +783,8 @@ static rtx noce_emit_cmove (struct noce_if_info *, rtx, 
enum rtx_code, rtx,

 rtx, rtx, rtx, rtx = NULL, rtx 
= NULL);

static bool noce_try_cmove (struct noce_if_info *);

static bool noce_try_cmove_arith (struct noce_if_info *);

+static bool noce_try_cmove_load_mem_notrap (struct noce_if_info *);

+static bool noce_try_cmove_store_mem_notrap (struct noce_if_info *, rtx *, 
rtx);

static rtx noce_get_alt_condition (struct noce_if_info *, rtx, rtx_insn **);

static bool noce_try_minmax (struct noce_if_info *);

static bool noce_try_abs (struct noce_if_info *);

@@ -2401,6 +2403,233 @@ noce_try_cmove_arith (struct noce_if_info *if_info)

   return false;

}



+/* When the target supports suppressing memory faults, try more complex cases

+   where the conditional move's source or dest may trap or fault.  */

+

+static bool

+noce_try_cmove_load_mem_notrap (struct noce_if_info *if_info)

+{

+  rtx a = if_info->a;

+  rtx b = if_info->b;

+  rtx x = if_info->x;

+

+  if (MEM_P (x))

+return false;

+  /* Just handle a conditional move from one trap MEM + other non_trap,

+ non mem cases.  */

+  if (!(MEM_P (a) ^ MEM_P (b)))

+  return false;

+  bool a_trap = may_trap_or_fault_p (a);

+  bool b_trap = may_trap_or_fault_p (b);

+

+  if (!(a_trap ^ b_trap))

+return false;

+  if (a_trap && !MEM_P (a))

+return false;

+  if (b_trap && !MEM_P (b))

+return false;

+

+  rtx orig_b;

+  rtx_insn *insn_a, *insn_b;

+  bool a_simple = if_info->then_simple;

+  bool b_simple = if_info->else_simple;

+  basic_block then_bb = if_info->then_bb;

+  basic_block else_bb = if_info->else_bb;

+  rtx target;

+  enum rtx_code code;

+  rtx cond = if_info->cond;

+  rtx_insn *ifcvt_seq;

+

+  /* if (test) x = *a; else x = c - d;

+ => x = c - d;

+ if (test)

+   x = *a;

+  */

+

+  code = GET_CODE (cond);

+  insn_a = if_info->insn_a;

+  insn_b = if_info->insn_b;

+  machine_mode x_mode = GET_MODE (x);

+

+  /* Because we only handle one trap MEM + other non_trap, non mem cases,

+ just move one trap MEM always in then_bb.  */

+  if (noce_reversed_cond_code (if_info) != UNKNOWN)

+{

+  bool reversep = false;

+  if (b_trap)

+ reversep = true;

+

+  if (reversep)

+ {

+   if (if_info->rev_cond)

+ {

+   cond = if_info->rev_cond;

+   code = GET_CODE (cond);

+ }

+   else

+ code = reversed_comparison_code (cond, if_info->jump);

+   std::swap (a, b);

+   std::

[PATCH 1/4] i386: Optimization for APX NDD is always zero-uppered for ADD

2024-08-12 Thread kong lingling
For an APX instruction with an NDD, the destination GPR will get the
instruction’s result in bits [OSIZE-1:0] and, if OSIZE < 64b, have its
upper bits [63:OSIZE] zeroed. Now supporting other NDD instructions.


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.

Ok for trunk?


gcc/ChangeLog:



   PR target/113729

   * config/i386/i386.md (*addqi_1_zext): New

   define.

   (*addhi_1_zext): Ditto.



gcc/testsuite/ChangeLog:



   * gcc.target/i386/pr113729.c: New test.

---

 gcc/config/i386/i386.md  | 80 

 gcc/testsuite/gcc.target/i386/pr113729.c | 27 

 2 files changed, 107 insertions(+)

 create mode 100644 gcc/testsuite/gcc.target/i386/pr113729.c



diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md

index 6207036a2a0..b1cf0868efe 100644

--- a/gcc/config/i386/i386.md

+++ b/gcc/config/i386/i386.md

@@ -6571,6 +6571,86 @@

(set_attr "has_nf" "1")

(set_attr "mode" "")])



+;; For an APX instruction with an NDD, the destination GPR will get the

+;; instruction’s result in bits [OSIZE-1:0] and, if OSIZE < 64b, have

+;; its upper bits [63:OSIZE] zeroed.

+

+(define_insn "*addqi_1_zext"

+  [(set (match_operand:SWI248x 0 "register_operand" "=r,r")

+ (zero_extend:SWI248x

+   (plus:QI (match_operand:QI 1 "nonimmediate_operand" "%rm,r")

+   (match_operand:QI 2 "general_operand"
"rn,m"]

+  "TARGET_APX_NDD && 

+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"

+{

+  switch (get_attr_type (insn))

+{

+case TYPE_INCDEC:

+  if (operands[2] == const1_rtx)

+ return "inc{b}\t{%1, %b0|%b0, %1}";

+  else

+ {

+   gcc_assert (operands[2] == constm1_rtx);

+   return "dec{b}\t{%1, %b0|%b0, %1}";

+ }

+

+default:

+  if (x86_maybe_negate_const_int (&operands[2], QImode))

+ return "sub{b}\t{%2, %1, %b0|%b0, %1, %2}";

+  return "add{b}\t{%2, %1, %b0|%b0, %1, %2}";

+}

+}

+  [(set (attr "type")

+ (cond [(match_operand:QI 2 "incdec_operand")

+   (const_string "incdec")

+]

+(const_string "alu")))

+   (set (attr "length_immediate")

+  (if_then_else

+ (and (eq_attr "type" "alu") (match_operand 2
"const128_operand"))

+ (const_string "1")

+ (const_string "*")))

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "QI")])

+

+(define_insn "*addhi_1_zext"

+  [(set (match_operand:SWI48x 0 "register_operand" "=r,r")

+ (zero_extend:SWI48x

+   (plus:HI (match_operand:HI 1 "nonimmediate_operand" "%rm,r")

+   (match_operand:HI 2 "general_operand"
"rn,m"]

+  "TARGET_APX_NDD && 

+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"

+{

+  switch (get_attr_type (insn))

+{

+case TYPE_INCDEC:

+  if (operands[2] == const1_rtx)

+ return "inc{w}\t{%1, %w0|%w0, %1}";

+  else

+ {

+   gcc_assert (operands[2] == constm1_rtx);

+   return "dec{w}\t{%1, %w0|%w0, %1}";

+ }

+

+default:

+  if (x86_maybe_negate_const_int (&operands[2], HImode))

+ return "sub{w}\t{%2, %1, %w0|%w0, %1, %2}";

+  return "add{w}\t{%2, %1, %w0|%w0, %1, %2}";

+}

+}

+  [(set (attr "type")

+ (cond [(match_operand:QI 2 "incdec_operand")

+   (const_string "incdec")

+]

+(const_string "alu")))

+   (set (attr "length_immediate")

+  (if_then_else

+ (and (eq_attr "type" "alu") (match_operand 2
"const128_operand"))

+ (const_string "1")

+ (const_string "*")))

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "HI")])

+

 ;; It may seem that nonimmediate operand is proper one for operand 1.

 ;; The addsi_1 pattern allows nonimmediate operand at that place and

 ;; we take care in ix86_binary_operator_ok to not allow two memory

diff --git a/gcc/testsuite/gcc.target/i386/pr113729.c
b/gcc/testsuite/gcc.target/i386/pr113729.c

new file mode 100644

index 000..74f65506ad4

--- /dev/null

+++ b/gcc/testsuite/gcc.target/i386/pr113729.c

@@ -0,0 +1,27 @@

+/* { dg-do compile { target { ! ia32 } } } */

+/* { dg-options "-mapx-features=ndd -march=x86-64 -O2" } */

+/* { dg-final { scan-assembler-not "movz"} } */

+

+#include 

+

+#define F(TYPE1, TYPE2, OP_NAME, OP)\

+TYPE1
   \

+__attribute__ ((noipa))
   \

+f_##OP_NAME##_##TYPE2##_##TYPE1 (unsigned TYPE2 b)  \

+{
 \

+  return (unsigned TYPE2) (200 OP b);
\

+}
 \

+TYPE1
   \

+__attribute__ ((noipa))
   \

+f1_##OP_NAME##_##TYPE2##_##TYPE1

[PATCH 2/4] i386: Optimization for APX NDD is always zero-uppered for sub/adc/sbb

2024-08-12 Thread kong lingling
gcc/ChangeLog:



   PR target/113729

   * config/i386/i386.md (*subqi_1_zext): New

   define_insn.

   (*subhi_1_zext): Ditto.

   (*addqi3_carry_zext): Ditto.

   (*addhi3_carry_zext): Ditto.

   (*addqi3_carry_zext_0): Ditto.

   (*addhi3_carry_zext_0): Ditto.

   (*addqi3_carry_zext_0r): Ditto.

   (*addhi3_carry_zext_0r): Ditto.

   (*subqi3_carry_zext): Ditto.

   (*subhi3_carry_zext): Ditto.

   (*subqi3_carry_zext_0): Ditto.

   (*subhi3_carry_zext_0): Ditto.

   (*subqi3_carry_zext_0r): Ditto.

   (*subhi3_carry_zext_0r): Ditto.



gcc/testsuite/ChangeLog:



   * gcc.target/i386/pr113729.c: Add test for sub.

   * gcc.target/i386/pr113729-adc-sbb.c: New test.

---

 gcc/config/i386/i386.md   | 244 +-

 gcc/testsuite/gcc.target/i386/pr113729-adc-sbb.c|  63 +

 gcc/testsuite/gcc.target/i386/pr113729.c  |   6 +

 3 files changed, 305 insertions(+), 8 deletions(-)  create mode 100644
gcc/testsuite/gcc.target/i386/pr113729-adc-sbb.c



diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index
b1cf0868efe..bb90a67b16d 100644

--- a/gcc/config/i386/i386.md

+++ b/gcc/config/i386/i386.md

@@ -8052,6 +8052,34 @@

(set_attr "has_nf" "1")

(set_attr "mode" "")])



+(define_insn "*subqi_1_zext"

+  [(set (match_operand:SWI248x 0 "register_operand" "=r,r")

+ (zero_extend:SWI248x

+   (minus:QI (match_operand:QI 1 "nonimmediate_operand" "rm,r")

+(match_operand:QI 2
"x86_64_general_operand" "rn,m"]

+  "TARGET_APX_NDD && 

+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"

+  "@

+  sub{b}\t{%2, %1, %b0|%b0, %1, %2}

+  sub{b}\t{%2, %1, %b0|%b0, %1, %2}"

+  [(set_attr "type" "alu")

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "QI")])

+

+(define_insn "*subhi_1_zext"

+  [(set (match_operand:SWI48x 0 "register_operand" "=r,r")

+ (zero_extend:SWI48x

+   (minus:HI (match_operand:HI 1 "nonimmediate_operand" "rm,r")

+(match_operand:HI 2
"x86_64_general_operand" "rn,m"]

+  "TARGET_APX_NDD && 

+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"

+  "@

+  sub{w}\t{%2, %1, %w0|%w0, %1, %2}

+  sub{w}\t{%2, %1, %w0|%w0, %1, %2}"

+  [(set_attr "type" "alu")

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "HI")])

+

 (define_insn "*subsi_1_zext"

   [(set (match_operand:DI 0 "register_operand" "=r,r,r")

   (zero_extend:DI

@@ -8771,6 +8799,44 @@

(set_attr "pent_pair" "pu")

(set_attr "mode" "")])



+(define_insn "*addqi3_carry_zext"

+  [(set (match_operand:SWI248x 0 "register_operand" "=r,r")

+ (zero_extend:SWI248x

+   (plus:QI

+ (plus:QI (match_operator:QI 3 "ix86_carry_flag_operator"

+  [(reg FLAGS_REG) (const_int 0)])

+ (match_operand:QI 1
"nonimmediate_operand" "%rm,r"))

+ (match_operand:QI 2 "x86_64_general_operand" "rn,m"

+   (clobber (reg:CC FLAGS_REG))]

+  "TARGET_APX_NDD

+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"

+  "@

+  adc{b}\t{%2, %1, %b0|%b0, %1, %2}

+  adc{b}\t{%2, %1, %b0|%b0, %1, %2}"

+  [(set_attr "type" "alu")

+   (set_attr "use_carry" "1")

+   (set_attr "pent_pair" "pu")

+   (set_attr "mode" "QI")])

+

+(define_insn "*addhi3_carry_zext"

+  [(set (match_operand:SWI48x 0 "register_operand" "=r,r")

+ (zero_extend:SWI48x

+   (plus:HI

+ (plus:HI (match_operator:HI 3 "ix86_carry_flag_operator"

+  [(reg FLAGS_REG) (const_int 0)])

+ (match_operand:HI 1
"nonimmediate_operand" "%rm,r"))

+ (match_operand:HI 2 "x86_64_general_operand" "rn,m"

+   (clobber (reg:CC FLAGS_REG))]

+  "TARGET_APX_NDD

+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"

+  "@

+  adc{w}\t{%2, %1, %w0|%w0, %1, %2}

+  adc{w}\t{%2, %1, %w0|%w0, %1, %2}"

+  [(set_attr "type" "alu")

+   (set_attr "use_carry" "1")

+   (set_attr "pent_pair" "pu")

+   (set_attr "mode" "HI")])

+

 (define_insn "*addsi3_carry_zext"

   [(set (match_operand:DI 0 "register_operand" "=r,r,r")

   (zero_extend:DI

@@ -8792,6 +8858,34 @@

(set_attr "pent_pair" "pu")

(set_attr "mode" "SI")])



+(define_insn "*addqi3_carry_zext_0"

+  [(set (match_operand:SWI248x 0 "register_operand" "=r")

+ (zero_extend:SWI248x

+   (plus:QI (match_operator:QI 2 "ix86_carry_flag_operator"

+[(reg FLAGS_REG) (const_int 0)])

+   (match_operand:QI 1 "nonimmediate_operand"
"rm"

+   (clobber (reg:CC FLAGS_REG))]

+  "TARGET_APX_NDD"

+  "adc{

[PATCH 3/4] i386: Optimization for APX NDD is always zero-uppered for logic

2024-08-12 Thread kong lingling
gcc/ChangeLog:


   PR target/113729

   * config/i386/i386.md (*andqi_1_zext):

   New define_insn.

   (*andhi_1_zext): Ditto.

   (*qi_1_zext): Ditto.

   (*hi_1_zext): Ditto.

   (*negqi_1_zext): Ditto.

   (*neghi_1_zext): Ditto.

   (*one_cmplqi2_1_zext): Ditto.

   (*one_cmplhi2_1_zext): Ditto.



gcc/testsuite/ChangeLog:



   * gcc.target/i386/pr113729.c: Add new test for logic.

---

 gcc/config/i386/i386.md  | 94 

 gcc/testsuite/gcc.target/i386/pr113729.c | 40 ++

 2 files changed, 134 insertions(+)



diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index
bb90a67b16d..944ec55a61d 100644

--- a/gcc/config/i386/i386.md

+++ b/gcc/config/i386/i386.md

@@ -12487,6 +12487,34 @@

 operands[2] = gen_lowpart (SImode, operands[2]);

 })



+(define_insn "*andqi_1_zext"

+  [(set (match_operand:SWI248x 0 "register_operand" "=r,r")

+ (zero_extend:SWI248x

+   (and:QI (match_operand:QI 1 "nonimmediate_operand" "%rm,r")

+  (match_operand:QI 2 "x86_64_general_operand"
"rn,m"]

+  "TARGET_APX_NDD && 

+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"

+  "@

+  and{b}\t{%2, %1, %b0|%b0, %1, %2}

+  and{b}\t{%2, %1, %b0|%b0, %1, %2}"

+  [(set_attr "type" "alu")

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "QI")])

+

+(define_insn "*andhi_1_zext"

+  [(set (match_operand:SWI48x 0 "register_operand" "=r,r")

+ (zero_extend:SWI48x

+   (and:HI (match_operand:HI 1 "nonimmediate_operand" "%rm,r")

+  (match_operand:HI 2 "x86_64_general_operand"
"rn,m"]

+  "TARGET_APX_NDD && 

+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"

+  "@

+  and{w}\t{%2, %1, %w0|%w0, %1, %2}

+  and{w}\t{%2, %1, %w0|%w0, %1, %2}"

+  [(set_attr "type" "alu")

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "HI")])

+

 ;; See comment for addsi_1_zext why we do use nonimmediate_operand
(define_insn "*andsi_1_zext"

   [(set (match_operand:DI 0 "register_operand" "=r,r,r,r") @@ -13548,6
+13576,34 @@

   operands[5] = gen_reg_rtx (mode);

 })



+(define_insn "*qi_1_zext"

+  [(set (match_operand:SWI248x 0 "register_operand" "=r,r")

+ (zero_extend:SWI248x

+ (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%rm,r")

+(match_operand:QI 2
"x86_64_general_operand" "rn,m"]

+  "TARGET_APX_NDD && 

+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"

+  "@

+  {b}\t{%2, %1, %b0|%b0, %1, %2}

+  {b}\t{%2, %1, %b0|%b0, %1, %2}"

+  [(set_attr "type" "alu")

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "QI")])

+

+(define_insn "*hi_1_zext"

+  [(set (match_operand:SWI48x 0 "register_operand" "=r,r")

+ (zero_extend:SWI48x

+ (any_or:HI (match_operand:HI 1 "nonimmediate_operand" "%rm,r")

+(match_operand:HI 2
"x86_64_general_operand" "rn,m"]

+  "TARGET_APX_NDD && 

+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"

+  "@

+  {w}\t{%2, %1, %w0|%w0, %1, %2}

+  {w}\t{%2, %1, %w0|%w0, %1, %2}"

+  [(set_attr "type" "alu")

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "HI")])

+

 ;; See comment for addsi_1_zext why we do use nonimmediate_operand
(define_insn "*si_1_zext"

   [(set (match_operand:DI 0 "register_operand" "=r,r,r,r") @@ -14166,6
+14222,26 @@

(set_attr "has_nf" "1")

(set_attr "mode" "")])



+(define_insn "*negqi_1_zext"

+  [(set (match_operand:SWI248x 0 "register_operand" "=r")

+ (zero_extend:SWI248x

+   (neg:QI (match_operand:QI 1 "nonimmediate_operand" "rm"]

+  "TARGET_APX_NDD && "

+  "neg{b}\t{%b1, %b0|%b0, %b1}"

+  [(set_attr "type" "negnot")

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "QI")])

+

+(define_insn "*neghi_1_zext"

+  [(set (match_operand:SWI48x 0 "register_operand" "=r")

+ (zero_extend:SWI48x

+   (neg:HI (match_operand:HI 1 "nonimmediate_operand" "rm"]

+  "TARGET_APX_NDD && "

+  "neg{w}\t{%w1, %w0|%w0, %w1}"

+  [(set_attr "type" "negnot")

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "HI")])

+

 (define_insn "*negsi_1_zext"

   [(set (match_operand:DI 0 "register_operand" "=r,r")

   (zero_extend:DI

@@ -14788,6 +14864,24 @@

(set_attr "type" "negnot,negnot,msklog")

(set_attr "mode" "")])



+(define_insn "*one_cmplqi2_1_zext"

+  [(set (match_operand:SWI248x 0 "register_operand" "=r")

+ (zero_extend:SWI248x

+   (not:QI (match_operand:QI 1 "nonimmediate_operand" "rm"]

+  "TARGET_APX_NDD"

+  "not{b}\t{%1, %b0|%b0, %1}"

+  [(set_attr "type" "negnot")

+   (set_attr "mode" "QI")])

+

+(define_insn "*one_cmplhi2_1_zext"

+  [(set (match_operand:SWI48x 0 "register_operand" "=r")

+ (zero_extend:SWI48x

+ 

[PATCH 4/4] i386: Optimization for APX NDD is always zero-uppered for shift

2024-08-12 Thread kong lingling
gcc/ChangeLog:


PR target/113729

   * config/i386/i386.md (*ashlqi3_1_zext):

   New define_insn.

   (*ashlhi3_1_zext): Ditto.

   (*qi3_1_zext): Ditto.

   (*hi3_1_zext): Ditto.

   (*qi3_1_zext): Ditto.

   (*hi3_1_zext): Ditto.



gcc/testsuite/ChangeLog:



   * gcc.target/i386/pr113729.c: Add testcase for shift and
rotate.

---

 gcc/config/i386/i386.md  | 66 

 gcc/testsuite/gcc.target/i386/pr113729.c | 62 ++

 2 files changed, 128 insertions(+)



diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index
944ec55a61d..260482a7305 100644

--- a/gcc/config/i386/i386.md

+++ b/gcc/config/i386/i386.md

@@ -15928,6 +15928,28 @@

   [(set_attr "type" "ishiftx")

(set_attr "mode" "SI")])



+(define_insn "*ashlqi3_1_zext"

+  [(set (match_operand:SWI248x 0 "register_operand" "=r")

+ (zero_extend:SWI248x

+   (ashift:QI (match_operand:QI 1 "nonimmediate_operand" "rm")

+ (match_operand:QI 2 "nonmemory_operand"
"cI"]

+  "TARGET_APX_NDD && "

+  "sal{b}\t{%2, %1, %b0|%b0, %1, %2}"

+  [(set_attr "type" "ishiftx")

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "QI")])

+

+(define_insn "*ashlhi3_1_zext"

+  [(set (match_operand:SWI48x 0 "register_operand" "=r")

+ (zero_extend:SWI48x

+   (ashift:HI (match_operand:HI 1 "nonimmediate_operand" "rm")

+ (match_operand:QI 2 "nonmemory_operand"
"cI"]

+  "TARGET_APX_NDD && "

+  "sal{w}\t{%2, %1, %w0|%w0, %1, %2}"

+  [(set_attr "type" "ishiftx")

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "HI")])

+

 (define_insn "*ashlsi3_1_zext"

   [(set (match_operand:DI 0 "register_operand" "=r,r,r,r")

   (zero_extend:DI

@@ -17412,6 +17434,28 @@

   [(set_attr "type" "ishiftx")

(set_attr "mode" "SI")])



+(define_insn "*qi3_1_zext"

+  [(set (match_operand:SWI248x 0 "register_operand" "=r")

+ (zero_extend:SWI248x

+   (any_shiftrt:QI (match_operand:QI 1 "nonimmediate_operand"
"rm")

+ (match_operand:QI 2
"nonmemory_operand" "cI"]

+  "TARGET_APX_NDD && "

+  "{b}\t{%2, %1, %b0|%b0, %1, %2}"

+  [(set_attr "type" "ishift")

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "QI")])

+

+(define_insn "*hi3_1_zext"

+  [(set (match_operand:SWI48x 0 "register_operand" "=r")

+ (zero_extend:SWI48x

+   (any_shiftrt:HI (match_operand:HI 1 "nonimmediate_operand"
"rm")

+ (match_operand:QI 2
"nonmemory_operand" "cI"]

+  "TARGET_APX_NDD && "

+  "{w}\t{%2, %1, %w0|%w0, %1, %2}"

+  [(set_attr "type" "ishift")

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "HI")])

+

 (define_insn "*si3_1_zext"

   [(set (match_operand:DI 0 "register_operand" "=r,r,r,?k")

   (zero_extend:DI

@@ -18170,6 +18214,28 @@

   [(set_attr "type" "rotatex")

(set_attr "mode" "SI")])



+(define_insn "*qi3_1_zext"

+  [(set (match_operand:SWI248x 0 "register_operand" "=r")

+ (zero_extend:SWI248x

+   (any_rotate:QI (match_operand:QI 1 "nonimmediate_operand"
"rm")

+   (match_operand:QI 2
"nonmemory_operand" "cI"]

+  "TARGET_APX_NDD && "

+  "{b}\t{%2, %1, %b0|%b0, %1, %2}"

+  [(set_attr "type" "rotate")

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "QI")])

+

+(define_insn "*hi3_1_zext"

+  [(set (match_operand:SWI48x 0 "register_operand" "=r")

+ (zero_extend:SWI48x

+   (any_rotate:HI (match_operand:HI 1 "nonimmediate_operand"
"rm")

+   (match_operand:QI 2
"nonmemory_operand" "cI"]

+  "TARGET_APX_NDD && "

+  "{w}\t{%2, %1, %w0|%w0, %1, %2}"

+  [(set_attr "type" "rotate")

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "HI")])

+

 (define_insn "*si3_1_zext"

   [(set (match_operand:DI 0 "register_operand" "=r,r,r")

   (zero_extend:DI

diff --git a/gcc/testsuite/gcc.target/i386/pr113729.c
b/gcc/testsuite/gcc.target/i386/pr113729.c

index a49adcd7d6b..fa5def5ebf6 100644

--- a/gcc/testsuite/gcc.target/i386/pr113729.c

+++ b/gcc/testsuite/gcc.target/i386/pr113729.c

@@ -71,3 +71,65 @@ F (int, char, xor, ^)  F (int64_t, char, xor, ^)  F
(int, short, xor, ^)  F (int64_t, short, xor, ^)

+

+#define F2(TYPE1,TYPE2, OP_NAME, OP, IMM)  \

+TYPE1 \

+__attribute__ ((noipa))  \

+f2_##OP_NAME##_##TYPE1##_##TYPE2 (TYPE2 a) \

+{
\

+  unsigned TYPE2 b = a OP IMM;\

+  return b; \

+}

+

+/* ashlqi3_1_zext */

+F2 (short, char, shl, <<, 7)

+F2 (int, char, shl, <<, 6)

+F2 (int6

RE: [PATCH 1/4] i386: Optimization for APX NDD is always zero-uppered for ADD

2024-08-13 Thread Kong, Lingling
Hi,

Gently ping.

Thanks,
Lingling
From: kong lingling 
Sent: Monday, August 12, 2024 3:10 PM
To: gcc-patches@gcc.gnu.org
Cc: H. J. Lu ; Kong, Lingling ; 
Liu, Hongtao 
Subject: [PATCH 1/4] i386: Optimization for APX NDD is always zero-uppered for 
ADD


For APX instruction with an NDD, the destination GPR will get the instruction’s 
result in bits [OSIZE-1:0] and, if OSIZE < 64b, have its upper bits [63:OSIZE] 
zeroed.



Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.

Ok for trunk?



gcc/ChangeLog:



   PR target/113729

   * config/i386/i386.md (*addqi_1_zext): New

   define.

   (*addhi_1_zext): Ditto.



gcc/testsuite/ChangeLog:



   * gcc.target/i386/pr113729.c: New test.

---

 gcc/config/i386/i386.md  | 80 

 gcc/testsuite/gcc.target/i386/pr113729.c | 27 

 2 files changed, 107 insertions(+)

 create mode 100644 gcc/testsuite/gcc.target/i386/pr113729.c



diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md

index 6207036a2a0..b1cf0868efe 100644

--- a/gcc/config/i386/i386.md

+++ b/gcc/config/i386/i386.md

@@ -6571,6 +6571,86 @@

(set_attr "has_nf" "1")

(set_attr "mode" "")])



+;; For an APX instruction with an NDD, the destination GPR will get the

+;; instruction’s result in bits [OSIZE-1:0] and, if OSIZE < 64b, have

+;; its upper bits [63:OSIZE] zeroed.

+

+(define_insn "*addqi_1_zext"

+  [(set (match_operand:SWI248x 0 "register_operand" "=r,r")

+ (zero_extend:SWI248x

+   (plus:QI (match_operand:QI 1 "nonimmediate_operand" "%rm,r")

+   (match_operand:QI 2 "general_operand" 
"rn,m"]

+  "TARGET_APX_NDD && 

+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"

+{

+  switch (get_attr_type (insn))

+{

+case TYPE_INCDEC:

+  if (operands[2] == const1_rtx)

+ return "inc{b}\t{%1, %b0|%b0, %1}";

+  else

+ {

+   gcc_assert (operands[2] == constm1_rtx);

+   return "dec{b}\t{%1, %b0|%b0, %1}";

+ }

+

+default:

+  if (x86_maybe_negate_const_int (&operands[2], QImode))

+ return "sub{b}\t{%2, %1, %b0|%b0, %1, %2}";

+  return "add{b}\t{%2, %1, %b0|%b0, %1, %2}";

+}

+}

+  [(set (attr "type")

+ (cond [(match_operand:QI 2 "incdec_operand")

+   (const_string "incdec")

+]

+(const_string "alu")))

+   (set (attr "length_immediate")

+  (if_then_else

+ (and (eq_attr "type" "alu") (match_operand 2 "const128_operand"))

+ (const_string "1")

+ (const_string "*")))

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "QI")])

+

+(define_insn "*addhi_1_zext"

+  [(set (match_operand:SWI48x 0 "register_operand" "=r,r")

+ (zero_extend:SWI48x

+   (plus:HI (match_operand:HI 1 "nonimmediate_operand" "%rm,r")

+   (match_operand:HI 2 "general_operand" 
"rn,m"]

+  "TARGET_APX_NDD && 

+   && !(MEM_P (operands[1]) && MEM_P (operands[2]))"

+{

+  switch (get_attr_type (insn))

+{

+case TYPE_INCDEC:

+  if (operands[2] == const1_rtx)

+ return "inc{w}\t{%1, %w0|%w0, %1}";

+  else

+ {

+   gcc_assert (operands[2] == constm1_rtx);

+   return "dec{w}\t{%1, %w0|%w0, %1}";

+ }

+

+default:

+  if (x86_maybe_negate_const_int (&operands[2], HImode))

+ return "sub{w}\t{%2, %1, %w0|%w0, %1, %2}";

+  return "add{w}\t{%2, %1, %w0|%w0, %1, %2}";

+}

+}

+  [(set (attr "type")

+ (cond [(match_operand:QI 2 "incdec_operand")

+   (const_string "incdec")

+]

+(const_string "alu")))

+   (set (attr "length_immediate")

+  (if_then_else

+ (and (eq_attr "type" "alu") (match_operand 2 "const128_operand"))

+ (const_string "1")

+ (const_string "*")))

+   (set_attr "has_nf" "1")

+   (set_attr "mode" "HI")])

+

 ;; It may seem that nonimmediate operand is proper one for operand 1.

 ;; The addsi_1 pattern allows nonimmediate operand at that place and

 ;; we take care in ix86_binary_operator_ok to not allow two mem

[PATCH] i386: Fix some vex insns that prohibit egpr

2024-08-13 Thread Kong, Lingling
Although these VEX insns have EVEX counterparts, when the explicit {vex}
prefix is used they should not support APX EGPR.
Like TARGET_AVXVNNI, TARGET_IFMA and TARGET_AVXNECONVERT.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/sse.md (vpmadd52):
Prohibit egpr for vex version.
(vcvtneps2bf16_v8sf): Ditto.
(vcvtneps2bf16_v8sf): Ditto.
(vpdpwssds_): Ditto.
(vpdpwssd_): Ditto.
(vpdpbusds_): Ditto.
(vpdpbusd_): Ditto.
---
 gcc/config/i386/sse.md | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 
d1010bc5682..7b9f619e112 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -29886,7 +29886,7 @@
(unspec:VI8_AVX2
  [(match_operand:VI8_AVX2 1 "register_operand" "0,0")
   (match_operand:VI8_AVX2 2 "register_operand" "x,v")
-  (match_operand:VI8_AVX2 3 "nonimmediate_operand" "xm,vm")]
+  (match_operand:VI8_AVX2 3 "nonimmediate_operand" "xjm,vm")]
  VPMADD52))]
   "TARGET_AVXIFMA || (TARGET_AVX512IFMA && TARGET_AVX512VL)"
   "@
@@ -30253,7 +30253,7 @@
(unspec:VI4_AVX2
  [(match_operand:VI4_AVX2 1 "register_operand" "0,0")
   (match_operand:VI4_AVX2 2 "register_operand" "x,v")
-  (match_operand:VI4_AVX2 3 "nonimmediate_operand" "xm,vm")]
+  (match_operand:VI4_AVX2 3 "nonimmediate_operand" "xjm,vm")]
  UNSPEC_VPDPBUSD))]
   "TARGET_AVXVNNI || (TARGET_AVX512VNNI && TARGET_AVX512VL)"
   "@
@@ -30321,7 +30321,7 @@
(unspec:VI4_AVX2
  [(match_operand:VI4_AVX2 1 "register_operand" "0,0")
   (match_operand:VI4_AVX2 2 "register_operand" "x,v")
-  (match_operand:VI4_AVX2 3 "nonimmediate_operand" "xm,vm")]
+  (match_operand:VI4_AVX2 3 "nonimmediate_operand" "xjm,vm")]
  UNSPEC_VPDPBUSDS))]
   "TARGET_AVXVNNI || (TARGET_AVX512VNNI && TARGET_AVX512VL)"
   "@
@@ -30389,7 +30389,7 @@
(unspec:VI4_AVX2
  [(match_operand:VI4_AVX2 1 "register_operand" "0,0")
   (match_operand:VI4_AVX2 2 "register_operand" "x,v")
-  (match_operand:VI4_AVX2 3 "nonimmediate_operand" "xm,vm")]
+  (match_operand:VI4_AVX2 3 "nonimmediate_operand" "xjm,vm")]
  UNSPEC_VPDPWSSD))]
   "TARGET_AVXVNNI || (TARGET_AVX512VNNI && TARGET_AVX512VL)"
   "@
@@ -30457,7 +30457,7 @@
(unspec:VI4_AVX2
  [(match_operand:VI4_AVX2 1 "register_operand" "0,0")
   (match_operand:VI4_AVX2 2 "register_operand" "x,v")
-  (match_operand:VI4_AVX2 3 "nonimmediate_operand" "xm,vm")]
+  (match_operand:VI4_AVX2 3 "nonimmediate_operand" "xjm,vm")]
  UNSPEC_VPDPWSSDS))]
   "TARGET_AVXVNNI || (TARGET_AVX512VNNI && TARGET_AVX512VL)"
   "@
@@ -30681,7 +30681,7 @@
   [(set (match_operand:V8BF 0 "register_operand" "=x,v")
(vec_concat:V8BF
  (float_truncate:V4BF
-   (match_operand:V4SF 1 "nonimmediate_operand" "xm,vm"))
+   (match_operand:V4SF 1 "nonimmediate_operand" "xjm,vm"))
  (match_operand:V4BF 2 "const0_operand")))]
   "TARGET_AVXNECONVERT || (TARGET_AVX512BF16 && TARGET_AVX512VL)"
   "@
@@ -30745,7 +30745,7 @@
 (define_insn "vcvtneps2bf16_v8sf"
   [(set (match_operand:V8BF 0 "register_operand" "=x,v")
(float_truncate:V8BF
- (match_operand:V8SF 1 "nonimmediate_operand" "xm,vm")))]
+ (match_operand:V8SF 1 "nonimmediate_operand" "xjm,vm")))]
   "TARGET_AVXNECONVERT || (TARGET_AVX512BF16 && TARGET_AVX512VL)"
   "@
   %{vex%} vcvtneps2bf16{y}\t{%1, %0|%0, %1} @@ -31216,7 +31216,7 @@
   [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
(vec_duplicate:VF1_128_256
  (float_extend:SF
-   (match_operand:BF 1 "memory_operand" "m"]
+   (match_operand:BF 1 "memory_operand" "jm"]
   "TARGET_AVXNECONVERT"
   "vbcstnebf162ps\t{%1, %0|%0, %1}"
   [(set_attr "prefix" "vex")
@@ -31226,7 +31226,7 @@
   [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
(vec_duplicate:VF1_128_256
  (float_extend:SF
-   (match_operand:HF 1 "memory_operand" "m"]
+   (match_operand:HF 1 "memory_operand" "jm"]
   "TARGET_AVXNECONVERT"
   "vbcstnesh2ps\t{%1, %0|%0, %1}"
   [(set_attr "prefix" "vex")
@@ -31240,7 +31240,7 @@
   [(set (match_operand:V4SF 0 "register_operand" "=x")
(float_extend:V4SF
  (vec_select:
-   (match_operand:VHFBF_128 1 "memory_operand" "m")
+   (match_operand:VHFBF_128 1 "memory_operand" "jm")
(parallel [(const_int 0) (const_int 2)
   (const_int 4) (const_int 6)]]
   "TARGET_AVXNECONVERT"
@@ -31252,7 +31252,7 @@
   [(set (match_operand:V8SF 0 "register_operand" "=x")
(float_extend:V8SF
  (vec_select:
-   (match_operand:VHFBF_256 1 "memory_operand" "m")
+   (match_operand:VHFBF_256 1 "memor

[PATCH v2] i386: Fix some vex insns that prohibit egpr

2024-08-14 Thread Kong, Lingling



-Original Message-
From: Kong, Lingling  
Sent: Wednesday, August 14, 2024 4:20 PM
To: Kong, Lingling 
Subject: [PATCH v2] i386: Fix some vex insns that prohibit egpr

Although these VEX insns have EVEX counterparts, when the explicit
VEX prefix is used they should not support APX EGPR.
This applies to e.g. TARGET_AVXVNNI, TARGET_AVXIFMA and TARGET_AVXNECONVERT.
TARGET_AVXVNNIINT8 and TARGET_AVXVNNIINT16 are also VEX insns that should not
support EGPR.

gcc/ChangeLog:

* config/i386/sse.md (vpmadd52):
Prohibit egpr for vex version.
(vpdpbusd_): Ditto.
(vpdpbusds_): Ditto.
(vpdpwssd_): Ditto.
(vpdpwssds_): Ditto.
(*vcvtneps2bf16_v4sf): Ditto.
(vcvtneps2bf16_v8sf): Ditto.
(vpdp_): Ditto.
(vbcstnebf162ps_): Ditto.
(vbcstnesh2ps_): Ditto.
(vcvtnee2ps_): Ditto.
(vcvtneo2ps_): Ditto.
(vpdp_): Ditto.
---
 gcc/config/i386/sse.md | 49 +++---
 1 file changed, 32 insertions(+), 17 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 
d1010bc5682..f0d94bba4e7 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -29886,7 +29886,7 @@
(unspec:VI8_AVX2
  [(match_operand:VI8_AVX2 1 "register_operand" "0,0")
   (match_operand:VI8_AVX2 2 "register_operand" "x,v")
-  (match_operand:VI8_AVX2 3 "nonimmediate_operand" "xm,vm")]
+  (match_operand:VI8_AVX2 3 "nonimmediate_operand" "xjm,vm")]
  VPMADD52))]
   "TARGET_AVXIFMA || (TARGET_AVX512IFMA && TARGET_AVX512VL)"
   "@
@@ -29894,6 +29894,7 @@
   vpmadd52\t{%3, %2, %0|%0, %2, %3}"
   [(set_attr "isa" "avxifma,avx512ifmavl")
(set_attr "type" "ssemuladd")
+   (set_attr "addr" "gpr16,*")
(set_attr "prefix" "vex,evex")
(set_attr "mode" "")])
 
@@ -30253,13 +30254,14 @@
(unspec:VI4_AVX2
  [(match_operand:VI4_AVX2 1 "register_operand" "0,0")
   (match_operand:VI4_AVX2 2 "register_operand" "x,v")
-  (match_operand:VI4_AVX2 3 "nonimmediate_operand" "xm,vm")]
+  (match_operand:VI4_AVX2 3 "nonimmediate_operand" "xjm,vm")]
  UNSPEC_VPDPBUSD))]
   "TARGET_AVXVNNI || (TARGET_AVX512VNNI && TARGET_AVX512VL)"
   "@
   %{vex%} vpdpbusd\t{%3, %2, %0|%0, %2, %3}
   vpdpbusd\t{%3, %2, %0|%0, %2, %3}"
   [(set_attr ("prefix") ("vex,evex"))
+   (set_attr "addr" "gpr16,*")
(set_attr ("isa") ("avxvnni,avx512vnnivl"))])
 
 (define_insn "vpdpbusd__mask"
@@ -30321,13 +30323,14 @@
(unspec:VI4_AVX2
  [(match_operand:VI4_AVX2 1 "register_operand" "0,0")
   (match_operand:VI4_AVX2 2 "register_operand" "x,v")
-  (match_operand:VI4_AVX2 3 "nonimmediate_operand" "xm,vm")]
+  (match_operand:VI4_AVX2 3 "nonimmediate_operand" "xjm,vm")]
  UNSPEC_VPDPBUSDS))]
   "TARGET_AVXVNNI || (TARGET_AVX512VNNI && TARGET_AVX512VL)"
   "@
%{vex%} vpdpbusds\t{%3, %2, %0|%0, %2, %3}
vpdpbusds\t{%3, %2, %0|%0, %2, %3}"
   [(set_attr ("prefix") ("vex,evex"))
+   (set_attr "addr" "gpr16,*")
(set_attr ("isa") ("avxvnni,avx512vnnivl"))])
 
 (define_insn "vpdpbusds__mask"
@@ -30389,13 +30392,14 @@
(unspec:VI4_AVX2
  [(match_operand:VI4_AVX2 1 "register_operand" "0,0")
   (match_operand:VI4_AVX2 2 "register_operand" "x,v")
-  (match_operand:VI4_AVX2 3 "nonimmediate_operand" "xm,vm")]
+  (match_operand:VI4_AVX2 3 "nonimmediate_operand" "xjm,vm")]
  UNSPEC_VPDPWSSD))]
   "TARGET_AVXVNNI || (TARGET_AVX512VNNI && TARGET_AVX512VL)"
   "@
   %{vex%} vpdpwssd\t{%3, %2, %0|%0, %2, %3}
   vpdpwssd\t{%3, %2, %0|%0, %2, %3}"
   [(set_attr ("prefix") ("vex,evex"))
+   (set_attr "addr" "gpr16,*")
(set_attr ("isa") ("avxvnni,avx512vnnivl"))])
 
 (define_insn "vpdpwssd__mask"
@@ -30457,13 +30461,14 @@
(unspec:VI4_AVX2
  [(match_operand:VI4_AVX2 1 "register_operand" "0,0")
   (match_operand:VI4_AVX2 2 "register_operand" "x,v")
-  (match_operand:VI4_AVX2 3 "nonimmediate_operand" "xm,vm")]
+  (match_operand:VI4_AVX2 3 "nonimmediate_operand" "xjm,vm")]
  UNSPEC_VPDPWSSDS))]
   "TARGET_AVXVNNI || (TARG

[PATCH] [APX ZU] Support APX zero-upper

2024-06-06 Thread Kong, Lingling
Enable ZU for IMUL (opcodes 0x69 and 0x6B) and SETcc.

gcc/ChangeLog:

* config/i386/i386-opts.h (enum apx_features): Add apx_zu.
* config/i386/i386.h (TARGET_APX_ZU): Define.
* config/i386/i386.md (*imulhizu): New define_insn.
(*setcc__zu): Ditto.
* config/i386/i386.opt: Add enum value for zu.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-zu-1.c: New test.
* gcc.target/i386/apx-zu-2.c: Ditto.

Bootstrapped & regtested on x86-64-pc-linux-gnu with binutils 2.42 branch.
OK for trunk?

---
 gcc/config/i386/i386-opts.h  |  3 +-
 gcc/config/i386/i386.h   |  1 +
 gcc/config/i386/i386.md  | 25 ++--
 gcc/config/i386/i386.opt |  3 ++
 gcc/testsuite/gcc.target/i386/apx-zu-1.c | 38   
gcc/testsuite/gcc.target/i386/apx-zu-2.c | 19 
 6 files changed, 86 insertions(+), 3 deletions(-)  create mode 100644 
gcc/testsuite/gcc.target/i386/apx-zu-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-zu-2.c

diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h index 
5fcc4927978..c7ec0d9fd39 100644
--- a/gcc/config/i386/i386-opts.h
+++ b/gcc/config/i386/i386-opts.h
@@ -142,8 +142,9 @@ enum apx_features {
   apx_ppx = 1 << 3,
   apx_nf = 1 << 4,
   apx_ccmp = 1 << 5,
+  apx_zu = 1 << 6,
   apx_all = apx_egpr | apx_push2pop2 | apx_ndd
-   | apx_ppx | apx_nf | apx_ccmp,
+   | apx_ppx | apx_nf | apx_ccmp | apx_zu,
 };
 
 #endif
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 
7051c6c13e4..dc1a1f44320 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -57,6 +57,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see  #define TARGET_APX_PPX (ix86_apx_features & apx_ppx)  #define 
TARGET_APX_NF (ix86_apx_features & apx_nf) 
#define TARGET_APX_CCMP (ix86_apx_features & apx_ccmp)
+#define TARGET_APX_ZU (ix86_apx_features & apx_zu)
 
 #include "config/vxworks-dummy.h"
 
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
ffcf63e1cba..a2765f65754 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -9967,6 +9967,19 @@
(const_string "direct")))
(set_attr "mode" "")])
 
+(define_insn "*imulhizu"
+  [(set (match_operand:SWI48x 0 "register_operand" "=r,r")
+   (zero_extend:SWI48x
+ (mult:HI (match_operand:HI 1 "nonimmediate_operand" "%rm,rm")
+  (match_operand:HI 2 "immediate_operand" "K,n"
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_APX_ZU"
+  "@
+   imulzu{w}\t{%2, %1, %w0|%w0, %1, %2}
+   imulzu{w}\t{%2, %1, %w0|%w0, %1, %2}"
+  [(set_attr "type" "imul")
+   (set_attr "mode" "HI")])
+
 (define_insn "*mulsi3_1_zext"
   [(set (match_operand:DI 0 "register_operand" "=r,r,r")
(zero_extend:DI
@@ -18354,11 +18367,19 @@
 ;; For all sCOND expanders, also expand the compare or test insn that  ;; 
generates cc0.  Generate an equality comparison if `seq' or `sne'.
 
+(define_insn "*setcc__zu"
+  [(set (match_operand:SWI248 0 "register_operand" "=r")
+   (match_operator:SWI248 1 "ix86_comparison_operator"
+ [(reg FLAGS_REG) (const_int 0)]))]
+  "TARGET_APX_ZU"
+  "setzu%C1\t%b0"
+  [(set_attr "type" "setcc")])
+
 (define_insn_and_split "*setcc_di_1"
   [(set (match_operand:DI 0 "register_operand" "=q")
(match_operator:DI 1 "ix86_comparison_operator"
  [(reg FLAGS_REG) (const_int 0)]))]
-  "TARGET_64BIT && !TARGET_PARTIAL_REG_STALL"
+  "!TARGET_APX_ZU && TARGET_64BIT && !TARGET_PARTIAL_REG_STALL"
   "#"
   "&& reload_completed"
   [(set (match_dup 2) (match_dup 1))
@@ -18391,7 +18412,7 @@
   [(set (match_operand:SWI24 0 "register_operand" "=q")
(match_operator:SWI24 1 "ix86_comparison_operator"
  [(reg FLAGS_REG) (const_int 0)]))]
-  "!TARGET_PARTIAL_REG_STALL
+  "!TARGET_APX_ZU && !TARGET_PARTIAL_REG_STALL
&& (!TARGET_ZERO_EXTEND_WITH_AND || optimize_function_for_size_p (cfun))"
   "#"
   "&& reload_completed"
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 
7017cc87cec..353fffb2343 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1342,6 +1342,9 @@ Enum(apx_features) String(nf) Value(apx_nf) Set(6)  
EnumValue
 Enum(apx_features) String(ccmp) Value(apx_ccmp) Set(7)
 
+EnumValue
+Enum(apx_features) String(zu) Value(apx_zu) Set(8)
+
 EnumValue
 Enum(apx_features) String(all) Value(apx_all) Set(1)
 
diff --git a/gcc/testsuite/gcc.target/i386/apx-zu-1.c 
b/gcc/testsuite/gcc.target/i386/apx-zu-1.c
new file mode 100644
index 000..927a87673a7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/apx-zu-1.c
@@ -0,0 +1,38 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mapxf -march=x86-64 -O2" } */
+/* { dg-final { scan-assembler-not "setle"} } */
+/* { dg-final { scan-assembler-not "setge"} } */
+/* { dg-final { scan-assembler-not "sete"} } */
+/* { dg-final { scan-assembler-not "xor"} } */
+/* 

[PATCH 1/2] Add a new target hook: TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP

2024-06-13 Thread Kong, Lingling
From: konglin1 

gcc/ChangeLog:

* doc/tm.texi: Regenerated.
* doc/tm.texi.in: Add TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP
* target.def (bool,): New hook.
* targhooks.cc (default_have_conditional_move_mem_notrap): New
function to hook TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP.
* targhooks.h (default_have_conditional_move_mem_notrap): New
target hook declaration.
---
 gcc/doc/tm.texi|  6 ++
 gcc/doc/tm.texi.in |  2 ++
 gcc/target.def | 11 +++
 gcc/targhooks.cc   |  8 
 gcc/targhooks.h|  1 +
 5 files changed, 28 insertions(+)

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 8a7aa70d605..f8faf44ab73 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -7311,6 +7311,12 @@ candidate as a replacement for the if-convertible 
sequence described in
 @code{if_info}.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP (rtx 
@var{x})
+This hook returns true if the target supports condition move instructions
+  that enables fault suppression of memory operands when the condition code
+  evaluates to false.
+@end deftypefn
+
 @deftypefn {Target Hook} bool TARGET_NEW_ADDRESS_PROFITABLE_P (rtx 
@var{memref}, rtx_insn * @var{insn}, rtx @var{new_addr})
 Return @code{true} if it is profitable to replace the address in
 @var{memref} with @var{new_addr}.  This allows targets to prevent the
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 9e0830758ae..17c122aea43 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4748,6 +4748,8 @@ Define this macro if a non-short-circuit operation 
produced by
 
 @hook TARGET_NOCE_CONVERSION_PROFITABLE_P
 
+@hook TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP
+
 @hook TARGET_NEW_ADDRESS_PROFITABLE_P
 
 @hook TARGET_NO_SPECULATION_IN_DELAY_SLOTS_P
diff --git a/gcc/target.def b/gcc/target.def
index 70070caebc7..aa77737e006 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3993,6 +3993,17 @@ candidate as a replacement for the if-convertible 
sequence described in\n\
 bool, (rtx_insn *seq, struct noce_if_info *if_info),
 default_noce_conversion_profitable_p)
 
+/* Return true if the target support condition move instructions that enables
+   fault suppression of memory operands when the condition code evaluates to
+   false.  */
+DEFHOOK
+(have_conditional_move_mem_notrap,
+ "This hook returns true if the target supports condition move instructions\n\
+  that enables fault suppression of memory operands when the condition code\n\
+  evaluates to false.",
+bool, (rtx x),
+default_have_conditional_move_mem_notrap)
+
 /* Return true if new_addr should be preferred over the existing address used 
by
memref in insn.  */
 DEFHOOK
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index fb339bf75dd..a616371b204 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -2816,4 +2816,12 @@ default_memtag_untagged_pointer (rtx tagged_pointer, rtx 
target)
   return untagged_base;
 }
 
+/* The default implementation of
+   TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP.  */
+bool
+default_have_conditional_move_mem_notrap (rtx x ATTRIBUTE_UNUSED)
+{
+  return false;
+}
+
 #include "gt-targhooks.h"
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 85f3817c176..f8ea2fde53d 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -305,5 +305,6 @@ extern rtx default_memtag_add_tag (rtx, poly_int64, 
uint8_t);
 extern rtx default_memtag_set_tag (rtx, rtx, rtx);
 extern rtx default_memtag_extract_tag (rtx, rtx);
 extern rtx default_memtag_untagged_pointer (rtx, rtx);
+extern bool default_have_conditional_move_mem_notrap (rtx x);
 
 #endif /* GCC_TARGHOOKS_H */
-- 
2.31.1



[PATCH 2/2] [APX CFCMOV] Support APX CFCMOV

2024-06-13 Thread Kong, Lingling
From: konglin1 mailto:lingling.k...@intel.com>>



The APX CFCMOV feature implements conditional faulting, which means that all

memory faults are suppressed when the condition code evaluates to false and

a memory operand is loaded or stored. Now we can conditionally move a memory

operand that may trap or fault.



To enable CFCMOV, we add a target HOOK TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP

in if-conversion pass to allow convert to cmov.



Bootstrapped & regtested on x86-64-pc-linux-gnu with binutils 2.42 branch.

OK for trunk?



gcc/ChangeLog:



   * config/i386/i386-expand.cc (ix86_can_cfcmov_p): New function 
that

   test if the cfcmov can be generated.

   (ix86_expand_int_movcc): Expand to cfcmov pattern if 
ix86_can_cfcmov_p

returns true.

   * config/i386/i386-opts.h (enum apx_features): Add apx_cfcmov.

   * config/i386/i386.cc (ix86_have_conditional_move_mem_notrap): 
New

   function to hook TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP

   (TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP): Target hook define.

   (ix86_rtx_costs): Add UNSPEC_APX_CFCMOV cost;

   * config/i386/i386.h (TARGET_APX_CFCMOV): Define.

   * config/i386/i386.md (*cfcmov_1): New define_insn to 
support

   cfcmov.

   (*cfcmov_2): Ditto.

   (UNSPEC_APX_CFCMOV): New unspec for cfcmov.

   * config/i386/i386.opt: Add enum value for cfcmov.

   * ifcvt.cc (noce_try_cmove_load_mem_notrap): Use target hook to 
allow

   convert to cfcmov for conditional load.

   (noce_try_cmove_store_mem_notrap): Convert to conditional store.

   (noce_process_if_block): Ditto.



gcc/testsuite/ChangeLog:



   * gcc.target/i386/apx-cfcmov-1.c: New test.

   * gcc.target/i386/apx-cfcmov-2.c: Ditto.

---

gcc/config/i386/i386-expand.cc   |  63 +

gcc/config/i386/i386-opts.h  |   4 +-

gcc/config/i386/i386.cc  |  33 ++-

gcc/config/i386/i386.h   |   1 +

gcc/config/i386/i386.md  |  53 +++-

gcc/config/i386/i386.opt |   3 +

gcc/config/i386/predicates.md|   7 +

gcc/ifcvt.cc | 247 ++-

gcc/testsuite/gcc.target/i386/apx-cfcmov-1.c |  73 ++

gcc/testsuite/gcc.target/i386/apx-cfcmov-2.c |  40 +++

10 files changed, 511 insertions(+), 13 deletions(-)

create mode 100644 gcc/testsuite/gcc.target/i386/apx-cfcmov-1.c

create mode 100644 gcc/testsuite/gcc.target/i386/apx-cfcmov-2.c



diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc

index 312329e550b..c02a4bcbec3 100644

--- a/gcc/config/i386/i386-expand.cc

+++ b/gcc/config/i386/i386-expand.cc

@@ -3336,6 +3336,30 @@ ix86_expand_int_addcc (rtx operands[])

   return true;

}



+/* Return TRUE if we could convert "if (test) x = a; else x = b;" to cfcmov,

+   especially when load a or b or x store may cause memory faults.  */

+bool

+ix86_can_cfcmov_p (rtx x, rtx a, rtx b)

+{

+  machine_mode mode = GET_MODE (x);

+  if (TARGET_APX_CFCMOV

+  && (mode == DImode || mode == SImode || mode == HImode))

+{

+  /* C load (r m r), (r m C), (r r m). For r m m could use

+ two cfcmov. */

+  if (register_operand (x, mode)

+   && ((MEM_P (a) && register_operand (b, mode))

+   || (MEM_P (a) && b == const0_rtx)

+   || (register_operand (a, mode) && MEM_P (b))

+   || (MEM_P (a) && MEM_P (b

+ return true;

+  /* C store  (m r 0).  */

+  else if (MEM_P (x) && x == b && register_operand (a, mode))

+ return true;

+}

+  return false;

+}

+

bool

ix86_expand_int_movcc (rtx operands[])

{

@@ -3366,6 +3390,45 @@ ix86_expand_int_movcc (rtx operands[])



   compare_code = GET_CODE (compare_op);



+  if (MEM_P (operands[0])

+  && !ix86_can_cfcmov_p (operands[0], op2, op3))

+return false;

+

+  if (may_trap_or_fault_p (op2) || may_trap_or_fault_p (op3))

+  {

+ if (ix86_can_cfcmov_p (operands[0], op2, op3))

+   {

+ if (may_trap_or_fault_p (op2))

+   op2 = gen_rtx_UNSPEC (mode, gen_rtvec (1, operands[2]),

+  
UNSPEC_APX_CFCMOV);

+ if (may_trap_or_fault_p (op3))

+   op3 = gen_rtx_UNSPEC (mode, gen_rtvec (1, operands[3]),

+  
UNSPEC_APX_CFCMOV);

+ emit_insn (compare_seq);

+

+ if (may_trap_or_fault_p (op2) && may_trap_or_fault_p (op3))

+   {

+emit_insn (gen_rtx_SET (operands[0],

+ 

[PATCH 0/3] [APX CFCMOV] Support APX CFCMOV

2024-06-13 Thread Kong, Lingling
The APX CFCMOV[1] feature implements conditional faulting, which means that all 
memory faults are suppressed
when the condition code evaluates to false and a memory operand is loaded or 
stored. Now we can conditionally
move a memory operand that may trap or fault.

In the middle end, we currently don't support a conditional move if we know 
that a load from A or B could trap or fault.

To enable CFCMOV, we add a target HOOK TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP
in if-conversion pass to allow convert to cmov.

All the changes passed bootstrap & regtest x86-64-pc-linux-gnu.
We also tested spec with SDE and passed the runtime test.

Ok for trunk?

[1].https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html

Lingling Kong (3):
  [APX CFCMOV] Add a new target hook: TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP
  [APX CFCMOV] Support APX CFCMOV in if_convert pass
  [APX CFCMOV] Support APX CFCMOV in backend

 gcc/config/i386/i386-expand.cc   |  63 +
 gcc/config/i386/i386-opts.h  |   4 +-
 gcc/config/i386/i386.cc  |  33 ++-
 gcc/config/i386/i386.h   |   1 +
 gcc/config/i386/i386.md  |  53 +++-
 gcc/config/i386/i386.opt |   3 +
 gcc/config/i386/predicates.md|   7 +
 gcc/doc/tm.texi  |   6 +
 gcc/doc/tm.texi.in   |   2 +
 gcc/ifcvt.cc | 247 ++-
 gcc/target.def   |  11 +
 gcc/targhooks.cc |   8 +
 gcc/targhooks.h  |   1 +
 gcc/testsuite/gcc.target/i386/apx-cfcmov-1.c |  73 ++
 gcc/testsuite/gcc.target/i386/apx-cfcmov-2.c |  40 +++
 15 files changed, 539 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-cfcmov-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-cfcmov-2.c

-- 
2.31.1



[PATCH 1/3] [APX CFCMOV] Add a new target hook: TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP

2024-06-13 Thread Kong, Lingling
From: konglin1 

The APX CFCMOV feature implements conditional faulting, which means that all
memory faults are suppressed when the condition code evaluates to false and
a memory operand is loaded or stored. Now we can conditionally move a memory
operand that may trap or fault.

In the middle end, we currently don't support a conditional move if we know
that a load from A or B could trap or fault.

To enable CFCMOV, we add a target HOOK TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP
in if-conversion pass to allow convert to cmov.

gcc/ChangeLog:

* doc/tm.texi: Regenerated.
* doc/tm.texi.in: Add TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP
* target.def (bool,): New hook.
* targhooks.cc (default_have_conditional_move_mem_notrap): New
function to hook TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP.
* targhooks.h (default_have_conditional_move_mem_notrap): New
target hook declaration.
---
 gcc/doc/tm.texi|  6 ++
 gcc/doc/tm.texi.in |  2 ++
 gcc/target.def | 11 +++
 gcc/targhooks.cc   |  8 
 gcc/targhooks.h|  1 +
 5 files changed, 28 insertions(+)

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 8a7aa70d605..f8faf44ab73 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -7311,6 +7311,12 @@ candidate as a replacement for the if-convertible 
sequence described in
 @code{if_info}.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP (rtx 
@var{x})
+This hook returns true if the target supports condition move instructions
+  that enables fault suppression of memory operands when the condition code
+  evaluates to false.
+@end deftypefn
+
 @deftypefn {Target Hook} bool TARGET_NEW_ADDRESS_PROFITABLE_P (rtx 
@var{memref}, rtx_insn * @var{insn}, rtx @var{new_addr})
 Return @code{true} if it is profitable to replace the address in
 @var{memref} with @var{new_addr}.  This allows targets to prevent the
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 9e0830758ae..17c122aea43 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4748,6 +4748,8 @@ Define this macro if a non-short-circuit operation 
produced by
 
 @hook TARGET_NOCE_CONVERSION_PROFITABLE_P
 
+@hook TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP
+
 @hook TARGET_NEW_ADDRESS_PROFITABLE_P
 
 @hook TARGET_NO_SPECULATION_IN_DELAY_SLOTS_P
diff --git a/gcc/target.def b/gcc/target.def
index 70070caebc7..aa77737e006 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3993,6 +3993,17 @@ candidate as a replacement for the if-convertible 
sequence described in\n\
 bool, (rtx_insn *seq, struct noce_if_info *if_info),
 default_noce_conversion_profitable_p)
 
+/* Return true if the target support condition move instructions that enables
+   fault suppression of memory operands when the condition code evaluates to
+   false.  */
+DEFHOOK
+(have_conditional_move_mem_notrap,
+ "This hook returns true if the target supports condition move instructions\n\
+  that enables fault suppression of memory operands when the condition code\n\
+  evaluates to false.",
+bool, (rtx x),
+default_have_conditional_move_mem_notrap)
+
 /* Return true if new_addr should be preferred over the existing address used 
by
memref in insn.  */
 DEFHOOK
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index fb339bf75dd..a616371b204 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -2816,4 +2816,12 @@ default_memtag_untagged_pointer (rtx tagged_pointer, rtx 
target)
   return untagged_base;
 }
 
+/* The default implementation of
+   TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP.  */
+bool
+default_have_conditional_move_mem_notrap (rtx x ATTRIBUTE_UNUSED)
+{
+  return false;
+}
+
 #include "gt-targhooks.h"
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 85f3817c176..f8ea2fde53d 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -305,5 +305,6 @@ extern rtx default_memtag_add_tag (rtx, poly_int64, 
uint8_t);
 extern rtx default_memtag_set_tag (rtx, rtx, rtx);
 extern rtx default_memtag_extract_tag (rtx, rtx);
 extern rtx default_memtag_untagged_pointer (rtx, rtx);
+extern bool default_have_conditional_move_mem_notrap (rtx x);
 
 #endif /* GCC_TARGHOOKS_H */
-- 
2.31.1



[PATCH 2/3] [APX CFCMOV] Support APX CFCMOV in if_convert pass

2024-06-13 Thread Kong, Lingling
From: Lingling Kong 

After added target HOOK TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP,
we could support a conditional move that load or store mem may trap
or fault in if convert pass.

A fault-suppressing conditional move for a conditional memory store does not
move any arithmetic calculations. For a conditional memory load, we currently
only support the case of one trapping memory operand and one non-trapping,
non-memory operand.

gcc/ChangeLog:

* ifcvt.cc (noce_try_cmove_load_mem_notrap): Use target hook
to allow convert to cfcmov for conditional load.
(noce_try_cmove_store_mem_notrap): Convert to conditional store.
(noce_process_if_block): Ditto.
---
 gcc/ifcvt.cc | 247 ++-
 1 file changed, 246 insertions(+), 1 deletion(-)

diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
index 58ed42673e5..6e3e48af810 100644
--- a/gcc/ifcvt.cc
+++ b/gcc/ifcvt.cc
@@ -783,6 +783,8 @@ static rtx noce_emit_cmove (struct noce_if_info *, rtx, 
enum rtx_code, rtx,
rtx, rtx, rtx, rtx = NULL, rtx = NULL);
 static bool noce_try_cmove (struct noce_if_info *);
 static bool noce_try_cmove_arith (struct noce_if_info *);
+static bool noce_try_cmove_load_mem_notrap (struct noce_if_info *);
+static bool noce_try_cmove_store_mem_notrap (struct noce_if_info *, rtx *, 
rtx);
 static rtx noce_get_alt_condition (struct noce_if_info *, rtx, rtx_insn **);
 static bool noce_try_minmax (struct noce_if_info *);
 static bool noce_try_abs (struct noce_if_info *);
@@ -2401,6 +2403,237 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
   return false;
 }
 
+/* When target support suppress memory fault, try more complex cases involving
+   conditional_move's source or dest may trap or fault.  */
+
+static bool
+noce_try_cmove_load_mem_notrap (struct noce_if_info *if_info)
+{
+  rtx a = if_info->a;
+  rtx b = if_info->b;
+  rtx x = if_info->x;
+
+  if (MEM_P (x))
+return false;
+  /* Just handle a conditional move from one trap MEM + other non_trap,
+ non mem cases.  */
+  if (!(MEM_P (a) ^ MEM_P (b)))
+  return false;
+  bool a_trap = may_trap_or_fault_p (a);
+  bool b_trap = may_trap_or_fault_p (b);
+
+  if (!(a_trap ^ b_trap))
+return false;
+  if (a_trap && (!MEM_P (a) || !targetm.have_conditional_move_mem_notrap (a)))
+return false;
+  if (b_trap && (!MEM_P (b) || !targetm.have_conditional_move_mem_notrap (b)))
+return false;
+
+  rtx orig_b;
+  rtx_insn *insn_a, *insn_b;
+  bool a_simple = if_info->then_simple;
+  bool b_simple = if_info->else_simple;
+  basic_block then_bb = if_info->then_bb;
+  basic_block else_bb = if_info->else_bb;
+  rtx target;
+  enum rtx_code code;
+  rtx cond = if_info->cond;
+  rtx_insn *ifcvt_seq;
+
+  /* if (test) x = *a; else x = c - d;
+ => x = c - d;
+   if (test)
+ x = *a;
+  */
+
+  code = GET_CODE (cond);
+  insn_a = if_info->insn_a;
+  insn_b = if_info->insn_b;
+
+  machine_mode x_mode = GET_MODE (x);
+
+  if (!can_conditionally_move_p (x_mode))
+return false;
+
+  /* Because we only handle one trap MEM + other non_trap, non mem cases,
+ just move one trap MEM always in then_bb.  */
+  if (noce_reversed_cond_code (if_info) != UNKNOWN)
+{
+  bool reversep = false;
+  if (b_trap)
+   reversep = true;
+
+  if (reversep)
+   {
+ if (if_info->rev_cond)
+   {
+ cond = if_info->rev_cond;
+ code = GET_CODE (cond);
+   }
+ else
+   code = reversed_comparison_code (cond, if_info->jump);
+ std::swap (a, b);
+ std::swap (insn_a, insn_b);
+ std::swap (a_simple, b_simple);
+ std::swap (then_bb, else_bb);
+   }
+}
+
+  if (then_bb && else_bb
+  && (!bbs_ok_for_cmove_arith (then_bb, else_bb,  if_info->orig_x)
+ || !bbs_ok_for_cmove_arith (else_bb, then_bb,  if_info->orig_x)))
+return false;
+
+  start_sequence ();
+
+  /* If one of the blocks is empty then the corresponding B or A value
+ came from the test block.  The non-empty complex block that we will
+ emit might clobber the register used by B or A, so move it to a pseudo
+ first.  */
+
+  rtx tmp_b = NULL_RTX;
+
+  /* Don't move trap mem to a pseudo. */
+  if (!may_trap_or_fault_p (b) && (b_simple || !else_bb))
+tmp_b = gen_reg_rtx (x_mode);
+
+  orig_b = b;
+
+  rtx emit_a = NULL_RTX;
+  rtx emit_b = NULL_RTX;
+  rtx_insn *tmp_insn = NULL;
+  bool modified_in_a = false;
+  bool modified_in_b = false;
+  /* If either operand is complex, load it into a register first.
+ The best way to do this is to copy the original insn.  In this
+ way we preserve any clobbers etc that the insn may have had.
+ This is of course not possible in the IS_MEM case.  */
+
+  if (! general_operand (b, GET_MODE (b)) || tmp_b)
+{
+ if (insn_b)
+   {
+ b = tmp_b ? tmp_b : gen_reg_rtx (GET_MODE (b));
+ rtx_insn *copy_of_b = as_a  (copy_rtx (insn_b));
+ rtx set 

[PATCH 3/3] [APX CFCMOV] Support APX CFCMOV in backend

2024-06-13 Thread Kong, Lingling
From: Lingling Kong 


Handle target hook TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP and support
CFCMOV in backend.

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_can_cfcmov_p): New function that
test if the cfcmov can be generated.
(ix86_expand_int_movcc): Expand to cfcmov pattern if ix86_can_cfcmov_p
returns true.
* config/i386/i386-opts.h (enum apx_features): Add apx_cfcmov.
* config/i386/i386.cc (ix86_have_conditional_move_mem_notrap): New
function to hook TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP
(TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP): Target hook define.
(ix86_rtx_costs): Add UNSPEC_APX_CFCMOV cost;
* config/i386/i386.h (TARGET_APX_CFCMOV): Define.
* config/i386/i386.md (*cfcmov_1): New define_insn to support
cfcmov.
(*cfcmov_2): Ditto.
(UNSPEC_APX_CFCMOV): New unspec for cfcmov.
* config/i386/i386.opt: Add enum value for cfcmov.
* config/i386/predicates.md (register_or_cfc_mem_operand): New
define_predicate.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-cfcmov-1.c: New test.
* gcc.target/i386/apx-cfcmov-2.c: Ditto.
---
 gcc/config/i386/i386-expand.cc   | 63 +
 gcc/config/i386/i386-opts.h  |  4 +-
 gcc/config/i386/i386.cc  | 33 +++--
 gcc/config/i386/i386.h   |  1 +
 gcc/config/i386/i386.md  | 53 --
 gcc/config/i386/i386.opt |  3 +
 gcc/config/i386/predicates.md|  7 ++
 gcc/testsuite/gcc.target/i386/apx-cfcmov-1.c | 73 
 gcc/testsuite/gcc.target/i386/apx-cfcmov-2.c | 40 +++
 9 files changed, 265 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-cfcmov-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-cfcmov-2.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 312329e550b..c02a4bcbec3 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -3336,6 +3336,30 @@ ix86_expand_int_addcc (rtx operands[])
   return true;
 }
 
+/* Return TRUE if we could convert "if (test) x = a; else x = b;" to cfcmov,
+   especially when a load from A or B, or a store to X, may cause memory faults.  */
+bool
+ix86_can_cfcmov_p (rtx x, rtx a, rtx b)
+{
+  machine_mode mode = GET_MODE (x);
+  if (TARGET_APX_CFCMOV
+  && (mode == DImode || mode == SImode || mode == HImode))
+{
+  /* C load (r m r), (r m C), (r r m). For r m m could use
+two cfcmov. */
+  if (register_operand (x, mode)
+ && ((MEM_P (a) && register_operand (b, mode))
+ || (MEM_P (a) && b == const0_rtx)
+ || (register_operand (a, mode) && MEM_P (b))
+ || (MEM_P (a) && MEM_P (b
+   return true;
+  /* C store  (m r 0).  */
+  else if (MEM_P (x) && x == b && register_operand (a, mode))
+   return true;
+}
+  return false;
+}
+
 bool
 ix86_expand_int_movcc (rtx operands[])
 {
@@ -3366,6 +3390,45 @@ ix86_expand_int_movcc (rtx operands[])
 
   compare_code = GET_CODE (compare_op);
 
+  if (MEM_P (operands[0])
+  && !ix86_can_cfcmov_p (operands[0], op2, op3))
+return false;
+
+  if (may_trap_or_fault_p (op2) || may_trap_or_fault_p (op3))
+  {
+   if (ix86_can_cfcmov_p (operands[0], op2, op3))
+ {
+   if (may_trap_or_fault_p (op2))
+ op2 = gen_rtx_UNSPEC (mode, gen_rtvec (1, operands[2]),
+   UNSPEC_APX_CFCMOV);
+   if (may_trap_or_fault_p (op3))
+ op3 = gen_rtx_UNSPEC (mode, gen_rtvec (1, operands[3]),
+   UNSPEC_APX_CFCMOV);
+   emit_insn (compare_seq);
+
+   if (may_trap_or_fault_p (op2) && may_trap_or_fault_p (op3))
+ {
+   emit_insn (gen_rtx_SET (operands[0],
+   gen_rtx_IF_THEN_ELSE (mode,
+ compare_op,
+ op2,
+ operands[0])));
+   emit_insn (gen_rtx_SET (operands[0],
+   gen_rtx_IF_THEN_ELSE (mode,
+ compare_op,
+ operands[0],
+ op3)));
+ }
+   else
+ emit_insn (gen_rtx_SET (operands[0],
+ gen_rtx_IF_THEN_ELSE (mode,
+   compare_op,
+   op2, op3)));
+   return true;
+ }
+   return false;
+  }
+
   if ((op1 == const0_rtx && (code == GE || code == LT)

[PATCH Committed][APX ZU] Fix test for target-support check

2024-06-17 Thread Kong, Lingling
Fix test for APX ZU. Add attribute for noinline and target APX, and 
target-support check.



Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.

Committed as an obvious patch.



gcc/testsuite/ChangeLog:



   * gcc.target/i386/apx-zu-1.c: Add attribute for noinline,

   and target apx.

   * gcc.target/i386/apx-zu-2.c: Add target-support check.

---

gcc/testsuite/gcc.target/i386/apx-zu-1.c | 6 ++

 gcc/testsuite/gcc.target/i386/apx-zu-2.c | 3 +++

2 files changed, 9 insertions(+)



diff --git a/gcc/testsuite/gcc.target/i386/apx-zu-1.c 
b/gcc/testsuite/gcc.target/i386/apx-zu-1.c

index 927a87673a7..bc0e7fbb4dd 100644

--- a/gcc/testsuite/gcc.target/i386/apx-zu-1.c

+++ b/gcc/testsuite/gcc.target/i386/apx-zu-1.c

@@ -9,26 +9,32 @@

/* { dg-final { scan-assembler-times "setzue" 1} } */

/* { dg-final { scan-assembler-times "setzuge" 1} } */

/* { dg-final { scan-assembler "imulzu"} } */

+

+__attribute__((noinline, noclone, target("apxf")))

long long foo0 (int a)

{

   return a == 0 ? 0 : 1;

}

+__attribute__((noinline, noclone, target("apxf")))

long foo1 (int a, int b)

{

   return a > b ? 0 : 1;

}

+__attribute__((noinline, noclone, target("apxf")))

int foo2 (int a, int b)

{

   return a != b ? 0 : 1;

}

+__attribute__((noinline, noclone, target("apxf")))

short foo3 (int a, int b)

{

   return a < b ? 0 : 1;

}

+__attribute__((noinline, noclone, target("apxf")))

unsigned long

f1(unsigned short x)

{

diff --git a/gcc/testsuite/gcc.target/i386/apx-zu-2.c 
b/gcc/testsuite/gcc.target/i386/apx-zu-2.c

index 3ee04495d98..7585492bd7c 100644

--- a/gcc/testsuite/gcc.target/i386/apx-zu-2.c

+++ b/gcc/testsuite/gcc.target/i386/apx-zu-2.c

@@ -5,6 +5,9 @@

 int main(void)

{

+  if (!__builtin_cpu_supports ("apxf"))

+return 0;

+

   if (foo0 (0))

 __builtin_abort ();

   if (foo1 (3, 2))

--

2.31.1



[PATCH v2 1/2] [APX CFCMOV] Support APX CFCMOV in if_convert pass

2024-06-18 Thread Kong, Lingling
APX CFCMOV feature implements conditionally faulting which means

that all memory faults are suppressed when the condition code

evaluates to false and load or store a memory operand. Now we

could load or store a memory operand that may trap or fault for a

conditional move.



In middle-end, now we don't support a conditional move if we knew

that a load from A or B could trap or fault. To enable CFCMOV, we

added a new optab.



Conditional move suppress_fault for condition mem store would not

move any arithmetic calculations. For condition mem load now just

support a conditional move one trap mem and one no trap and no mem

cases.



gcc/ChangeLog:



   * ifcvt.cc (noce_try_cmove_load_mem_notrap): Allow convert

   to cfcmov for conditional load.

   (noce_try_cmove_store_mem_notrap): Convert to conditional store.

   (noce_process_if_block): Ditto.

   * optabs.def (OPTAB_D): New optab.

---

gcc/ifcvt.cc   | 246 -

gcc/optabs.def |   1 +

2 files changed, 246 insertions(+), 1 deletion(-)



diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc

index 58ed42673e5..65c069b8cc6 100644

--- a/gcc/ifcvt.cc

+++ b/gcc/ifcvt.cc

@@ -783,6 +783,8 @@ static rtx noce_emit_cmove (struct noce_if_info *, rtx, 
enum rtx_code, rtx,

 rtx, rtx, rtx, rtx = NULL, rtx 
= NULL);

static bool noce_try_cmove (struct noce_if_info *);

static bool noce_try_cmove_arith (struct noce_if_info *);

+static bool noce_try_cmove_load_mem_notrap (struct noce_if_info *);

+static bool noce_try_cmove_store_mem_notrap (struct noce_if_info *, rtx *, 
rtx);

static rtx noce_get_alt_condition (struct noce_if_info *, rtx, rtx_insn **);

static bool noce_try_minmax (struct noce_if_info *);

static bool noce_try_abs (struct noce_if_info *);

@@ -2401,6 +2403,233 @@ noce_try_cmove_arith (struct noce_if_info *if_info)

   return false;

}



+/* When the target supports suppressing memory faults, try more complex cases

+   where the conditional move's source or dest may trap or fault.  */

+

+static bool

+noce_try_cmove_load_mem_notrap (struct noce_if_info *if_info)

+{

+  rtx a = if_info->a;

+  rtx b = if_info->b;

+  rtx x = if_info->x;

+

+  if (MEM_P (x))

+return false;

+  /* Just handle a conditional move from one trap MEM + other non_trap,

+ non mem cases.  */

+  if (!(MEM_P (a) ^ MEM_P (b)))

+  return false;

+  bool a_trap = may_trap_or_fault_p (a);

+  bool b_trap = may_trap_or_fault_p (b);

+

+  if (!(a_trap ^ b_trap))

+return false;

+  if (a_trap && !MEM_P (a))

+return false;

+  if (b_trap && !MEM_P (b))

+return false;

+

+  rtx orig_b;

+  rtx_insn *insn_a, *insn_b;

+  bool a_simple = if_info->then_simple;

+  bool b_simple = if_info->else_simple;

+  basic_block then_bb = if_info->then_bb;

+  basic_block else_bb = if_info->else_bb;

+  rtx target;

+  enum rtx_code code;

+  rtx cond = if_info->cond;

+  rtx_insn *ifcvt_seq;

+

+  /* if (test) x = *a; else x = c - d;

+ => x = c - d;

+ if (test)

+   x = *a;

+  */

+

+  code = GET_CODE (cond);

+  insn_a = if_info->insn_a;

+  insn_b = if_info->insn_b;

+  machine_mode x_mode = GET_MODE (x);

+

+  /* Because we only handle one trap MEM + other non_trap, non mem cases,

+ just move one trap MEM always in then_bb.  */

+  if (noce_reversed_cond_code (if_info) != UNKNOWN)

+{

+  bool reversep = false;

+  if (b_trap)

+ reversep = true;

+

+  if (reversep)

+ {

+   if (if_info->rev_cond)

+ {

+   cond = if_info->rev_cond;

+   code = GET_CODE (cond);

+ }

+   else

+ code = reversed_comparison_code (cond, if_info->jump);

+   std::swap (a, b);

+   std::swap (insn_a, insn_b);

+   std::swap (a_simple, b_simple);

+   std::swap (then_bb, else_bb);

+ }

+}

+

+  if (then_bb && else_bb

+  && (!bbs_ok_for_cmove_arith (then_bb, else_bb,  if_info->orig_x)

+   || !bbs_ok_for_cmove_arith (else_bb, then_bb,  
if_info->orig_x)))

+return false;

+

+  start_sequence ();

+

+  /* If one of the blocks is empty then the corresponding B or A value

+ came from the test block.  The non-empty complex block that we will

+ emit might clobber the register used by B or A, so move it to a pseudo

+ first.  */

+

+  rtx tmp_b = NULL_RTX;

+

+  /* Don't move trap mem to a pseudo. */

+  if (!may_trap_or_fault_p (b) && (b_simple || !else_bb))

+tmp_b = gen_reg_rtx (x_mode);

+

+  orig_b = b;

+

+  rtx emit_a = NULL_RTX;

+  rtx emit_b = NULL_RTX;

+  rtx_insn *tmp_insn = NULL;

+  bool modified_in_a = false;

+  bool modified_in_b = false;

+  /* If either operand is complex, load it into a register first.

+ The best way to do this is to cop

[PATCH v2 0/2] [APX CFCMOV] Support APX CFCMOV

2024-06-18 Thread Kong, Lingling
Hi,

Thank you for reviewing v1!

Changes in v2:
Removed the target hook and added a new optab for cfcmov.

Lingling Kong (2):
  [APX CFCMOV] Support APX CFCMOV in if_convert pass
  [APX CFCMOV] Support APX CFCMOV in backend

 gcc/config/i386/i386-expand.cc   |  63 +
 gcc/config/i386/i386-opts.h  |   4 +-
 gcc/config/i386/i386.cc  |  16 +-
 gcc/config/i386/i386.h   |   1 +
 gcc/config/i386/i386.md  |  53 +++-
 gcc/config/i386/i386.opt |   3 +
 gcc/config/i386/predicates.md|   7 +
 gcc/ifcvt.cc | 246 ++-
 gcc/optabs.def   |   1 +
 gcc/testsuite/gcc.target/i386/apx-cfcmov-1.c |  73 ++ 
 gcc/testsuite/gcc.target/i386/apx-cfcmov-2.c |  40 +++
 11 files changed, 494 insertions(+), 13 deletions(-) 
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-cfcmov-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-cfcmov-2.c

--

> -Original Message-
> From: Hongtao Liu 
> Sent: Monday, June 17, 2024 11:05 AM
> To: Jeff Law 
> Cc: Alexander Monakov ; Kong, Lingling
> ; gcc-patches@gcc.gnu.org; Liu, Hongtao
> ; Uros Bizjak 
> Subject: Re: [PATCH 0/3] [APX CFCMOV] Support APX CFCMOV
> 
> On Sat, Jun 15, 2024 at 1:22 AM Jeff Law  wrote:
> >
> >
> >
> > On 6/14/24 11:10 AM, Alexander Monakov wrote:
> > >
> > > On Fri, 14 Jun 2024, Kong, Lingling wrote:
> > >
> > >> APX CFCMOV[1] feature implements conditionally faulting which means
> > >> that all memory faults are suppressed when the condition code
> > >> evaluates to false and load or store a memory operand. Now we could load
> or store a memory operand may trap or fault for conditional move.
> > >>
> > >> In middle-end, now we don't support a conditional move if we knew
> > >> that a load from A or B could trap or fault.
> > >
> > > Predicated loads&stores on Itanium don't trap either. They are
> > > modeled via COND_EXEC on RTL. The late if-conversion pass (the
> > > instance that runs after
> > > reload) is capable of introducing them.
> > >
> > >> To enable CFCMOV, we add a target HOOK
> > >> TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP
> > >> in if-conversion pass to allow convert to cmov.
> > >
> > > Considering the above, is the new hook really necessary? Can you
> > > model the new instructions via (cond_exec () (set ...)) instead of (set
> (if_then_else ...)) ?
> > Note that turning on cond_exec will turn off some of the cmove support.
> Yes, cfcmov looks more like a cmov than a cond_exec.
> >
> > But the general suggesting of trying to avoid a hook for this is a
> > good one.  In fact, my first reaction to this thread was "do we really
> > need a hook for this".
> Maybe a new optab, .i.e cfmovmodecc, and it differs from movcc for Conditional
> Fault?
> >
> > jeff
> 
> 
> 
> --
> BR,
> Hongtao


[PATCH v2 2/2] [APX CFCMOV] Support APX CFCMOV in backend

2024-06-18 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_can_cfcmov_p): New function that
test if the cfcmov can be generated.
(ix86_expand_int_movcc): Expand to cfcmov pattern if ix86_can_cfcmov_p
return true.
* config/i386/i386-opts.h (enum apx_features): Add apx_cfcmov.
* config/i386/i386.cc (ix86_have_conditional_move_mem_notrap): New
function to hook TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP
(TARGET_HAVE_CONDITIONAL_MOVE_MEM_NOTRAP): Target hook define.
(ix86_rtx_costs): Add UNSPEC_APX_CFCMOV cost;
* config/i386/i386.h (TARGET_APX_CFCMOV): Define.
* config/i386/i386.md (cfmovcc): New define_insn to support
cfcmov.
(cfmovcc_2): Ditto.
(UNSPEC_APX_CFCMOV): New unspec for cfcmov.
* config/i386/i386.opt: Add enum value for cfcmov.
* config/i386/predicates.md (register_or_cfc_mem_operand): New
define_predicate.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-cfcmov-1.c: New test.
* gcc.target/i386/apx-cfcmov-2.c: Ditto.
---
 gcc/config/i386/i386-expand.cc   | 63 +
 gcc/config/i386/i386-opts.h  |  4 +-
 gcc/config/i386/i386.cc  | 16 +++--
 gcc/config/i386/i386.h   |  1 +
 gcc/config/i386/i386.md  | 53 --
 gcc/config/i386/i386.opt |  3 +
 gcc/config/i386/predicates.md|  7 ++
 gcc/testsuite/gcc.target/i386/apx-cfcmov-1.c | 73 
 gcc/testsuite/gcc.target/i386/apx-cfcmov-2.c | 40 +++
 9 files changed, 248 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-cfcmov-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-cfcmov-2.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 312329e550b..c02a4bcbec3 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -3336,6 +3336,30 @@ ix86_expand_int_addcc (rtx operands[])
   return true;
 }
 
+/* Return TRUE if we could convert "if (test) x = a; else x = b;" to cfcmov,
+   especially when a load from A or B, or a store to X, may cause memory faults.  */
+bool
+ix86_can_cfcmov_p (rtx x, rtx a, rtx b)
+{
+  machine_mode mode = GET_MODE (x);
+  if (TARGET_APX_CFCMOV
+  && (mode == DImode || mode == SImode || mode == HImode))
+{
+  /* C load (r m r), (r m C), (r r m). For r m m could use
+two cfcmov. */
+  if (register_operand (x, mode)
+ && ((MEM_P (a) && register_operand (b, mode))
+ || (MEM_P (a) && b == const0_rtx)
+ || (register_operand (a, mode) && MEM_P (b))
+ || (MEM_P (a) && MEM_P (b
+   return true;
+  /* C store  (m r 0).  */
+  else if (MEM_P (x) && x == b && register_operand (a, mode))
+   return true;
+}
+  return false;
+}
+
 bool
 ix86_expand_int_movcc (rtx operands[])
 {
@@ -3366,6 +3390,45 @@ ix86_expand_int_movcc (rtx operands[])
 
   compare_code = GET_CODE (compare_op);
 
+  if (MEM_P (operands[0])
+  && !ix86_can_cfcmov_p (operands[0], op2, op3))
+return false;
+
+  if (may_trap_or_fault_p (op2) || may_trap_or_fault_p (op3))
+  {
+   if (ix86_can_cfcmov_p (operands[0], op2, op3))
+ {
+   if (may_trap_or_fault_p (op2))
+ op2 = gen_rtx_UNSPEC (mode, gen_rtvec (1, operands[2]),
+   UNSPEC_APX_CFCMOV);
+   if (may_trap_or_fault_p (op3))
+ op3 = gen_rtx_UNSPEC (mode, gen_rtvec (1, operands[3]),
+   UNSPEC_APX_CFCMOV);
+   emit_insn (compare_seq);
+
+   if (may_trap_or_fault_p (op2) && may_trap_or_fault_p (op3))
+ {
+   emit_insn (gen_rtx_SET (operands[0],
+   gen_rtx_IF_THEN_ELSE (mode,
+ compare_op,
+ op2,
+ operands[0])));
+   emit_insn (gen_rtx_SET (operands[0],
+   gen_rtx_IF_THEN_ELSE (mode,
+ compare_op,
+ operands[0],
+ op3)));
+ }
+   else
+ emit_insn (gen_rtx_SET (operands[0],
+ gen_rtx_IF_THEN_ELSE (mode,
+   compare_op,
+   op2, op3)));
+   return true;
+ }
+   return false;
+  }
+
   if ((op1 == const0_rtx && (code == GE || code == LT))
   || (op1 == constm1_rtx && (code == GT || code == LE)))
 sign_bit_compare_p = true;
diff --git a/gcc/config/

RE: [PATCH v2 1/2] [APX CFCMOV] Support APX CFCMOV in if_convert pass

2024-06-24 Thread Kong, Lingling
Hi,


Gently ping for this.

This version has removed the target hook and added a new optab for cfcmov.

Thanks,
Lingling

From: Kong, Lingling 
Sent: Tuesday, June 18, 2024 3:41 PM
To: gcc-patches@gcc.gnu.org
Cc: Alexander Monakov ; Uros Bizjak ; 
lingling.ko...@gmail.com; Hongtao Liu ; Jeff Law 
; Richard Biener 
Subject: [PATCH v2 1/2] [APX CFCMOV] Support APX CFCMOV in if_convert pass


APX CFCMOV feature implements conditionally faulting which means

that all memory faults are suppressed when the condition code

evaluates to false and load or store a memory operand. Now we

could load or store a memory operand that may trap or fault for a

conditional move.



In middle-end, now we don't support a conditional move if we knew

that a load from A or B could trap or fault. To enable CFCMOV, we

added a new optab.



Conditional move suppress_fault for condition mem store would not

move any arithmetic calculations. For condition mem load now just

support a conditional move one trap mem and one no trap and no mem

cases.



gcc/ChangeLog:



   * ifcvt.cc (noce_try_cmove_load_mem_notrap): Allow convert

   to cfcmov for conditional load.

   (noce_try_cmove_store_mem_notrap): Convert to conditional store.

   (noce_process_if_block): Ditto.

   * optabs.def (OPTAB_D): New optab.

---

gcc/ifcvt.cc   | 246 -

gcc/optabs.def |   1 +

2 files changed, 246 insertions(+), 1 deletion(-)



diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc

index 58ed42673e5..65c069b8cc6 100644

--- a/gcc/ifcvt.cc

+++ b/gcc/ifcvt.cc

@@ -783,6 +783,8 @@ static rtx noce_emit_cmove (struct noce_if_info *, rtx, 
enum rtx_code, rtx,

 rtx, rtx, rtx, rtx = NULL, rtx 
= NULL);

static bool noce_try_cmove (struct noce_if_info *);

static bool noce_try_cmove_arith (struct noce_if_info *);

+static bool noce_try_cmove_load_mem_notrap (struct noce_if_info *);

+static bool noce_try_cmove_store_mem_notrap (struct noce_if_info *, rtx *, 
rtx);

static rtx noce_get_alt_condition (struct noce_if_info *, rtx, rtx_insn **);

static bool noce_try_minmax (struct noce_if_info *);

static bool noce_try_abs (struct noce_if_info *);

@@ -2401,6 +2403,233 @@ noce_try_cmove_arith (struct noce_if_info *if_info)

   return false;

}



+/* When the target supports suppressing memory faults, try more complex cases

+   where the conditional move's source or dest may trap or fault.  */

+

+static bool

+noce_try_cmove_load_mem_notrap (struct noce_if_info *if_info)

+{

+  rtx a = if_info->a;

+  rtx b = if_info->b;

+  rtx x = if_info->x;

+

+  if (MEM_P (x))

+return false;

+  /* Just handle a conditional move from one trap MEM + other non_trap,

+ non mem cases.  */

+  if (!(MEM_P (a) ^ MEM_P (b)))

+  return false;

+  bool a_trap = may_trap_or_fault_p (a);

+  bool b_trap = may_trap_or_fault_p (b);

+

+  if (!(a_trap ^ b_trap))

+return false;

+  if (a_trap && !MEM_P (a))

+return false;

+  if (b_trap && !MEM_P (b))

+return false;

+

+  rtx orig_b;

+  rtx_insn *insn_a, *insn_b;

+  bool a_simple = if_info->then_simple;

+  bool b_simple = if_info->else_simple;

+  basic_block then_bb = if_info->then_bb;

+  basic_block else_bb = if_info->else_bb;

+  rtx target;

+  enum rtx_code code;

+  rtx cond = if_info->cond;

+  rtx_insn *ifcvt_seq;

+

+  /* if (test) x = *a; else x = c - d;

+ => x = c - d;

+ if (test)

+   x = *a;

+  */

+

+  code = GET_CODE (cond);

+  insn_a = if_info->insn_a;

+  insn_b = if_info->insn_b;

+  machine_mode x_mode = GET_MODE (x);

+

+  /* Because we only handle one trap MEM + other non_trap, non mem cases,

+ just move one trap MEM always in then_bb.  */

+  if (noce_reversed_cond_code (if_info) != UNKNOWN)

+{

+  bool reversep = false;

+  if (b_trap)

+ reversep = true;

+

+  if (reversep)

+ {

+   if (if_info->rev_cond)

+ {

+   cond = if_info->rev_cond;

+   code = GET_CODE (cond);

+ }

+   else

+ code = reversed_comparison_code (cond, if_info->jump);

+   std::swap (a, b);

+   std::swap (insn_a, insn_b);

+   std::swap (a_simple, b_simple);

+   std::swap (then_bb, else_bb);

+ }

+}

+

+  if (then_bb && else_bb

+  && (!bbs_ok_for_cmove_arith (then_bb, else_bb,  if_info->orig_x)

+   || !bbs_ok_for_cmove_arith (else_bb, then_bb,  
if_info->orig_x)))

+return false;

+

+  start_sequence ();

+

+  /* If one of the blocks is empty then the corresponding B or A value

+ came from the test block.  The non-empty complex block that we will

+ emit might clobber the register used by B or A, so mo

RE: [PATCH] i386: Change prefetchi output template

2024-07-22 Thread Kong, Lingling



> -Original Message-
> From: Haochen Jiang 
> Sent: Monday, July 22, 2024 2:41 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Liu, Hongtao ; ubiz...@gmail.com
> Subject: [PATCH] i386: Change prefetchi output template
> 
> Hi all,
> 
> For prefetchi instructions, RIP-relative address is explicitly mentioned for 
> operand
> and assembler obeys that rule strictly. This makes instruction like:
> 
>   prefetchit0 bar
> 
> got illegal for assembler, which should be a broad usage for prefetchi.
> 
> Explicitly add (%rip) after function label to make it legal in assembler so 
> that it
> could pass to linker to get the real address.
> 
> Ok for trunk and backport to GCC14 and GCC13 since prefetchi instructions are
> introduced in GCC13?
> 
> Thx,
> Haochen
> 
> gcc/ChangeLog:
> 
>   * config/i386/i386.md (prefetchi): Add explicit (%rip) after
>   function label.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/i386/prefetchi-1.c: Check (%rip).
> ---
>  gcc/config/i386/i386.md | 2 +-
>  gcc/testsuite/gcc.target/i386/prefetchi-1.c | 4 ++--
>  2 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index
> 90d3aa450f0..3ec51bad6fe 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -28004,7 +28004,7 @@
>"TARGET_PREFETCHI && TARGET_64BIT"
>  {
>static const char * const patterns[2] = {
> -"prefetchit1\t%0", "prefetchit0\t%0"
> +"prefetchit1\t{%p0(%%rip)|%p0[rip]}", 
> "prefetchit0\t{%p0(%%rip)|%p0[rip]}"
>};

"prefetchit1\t%a0", "prefetchit0\t%a0" maybe better.

Thanks,
Lingling

>int locality = INTVAL (operands[1]);
> diff --git a/gcc/testsuite/gcc.target/i386/prefetchi-1.c
> b/gcc/testsuite/gcc.target/i386/prefetchi-1.c
> index 80f25e70e8e..03dfdc55e86 100644
> --- a/gcc/testsuite/gcc.target/i386/prefetchi-1.c
> +++ b/gcc/testsuite/gcc.target/i386/prefetchi-1.c
> @@ -1,7 +1,7 @@
>  /* { dg-do compile { target { ! ia32 } } } */
>  /* { dg-options "-mprefetchi -O2" } */
> -/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit0\[ \\t\]+" 2 } } */
> -/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit1\[ \\t\]+" 2 } } */
> +/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit0\[
> +\\t\]+bar\\(%rip\\)" 2 } } */
> +/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit1\[
> +\\t\]+bar\\(%rip\\)" 2 } } */
> 
>  #include 
> 
> --
> 2.31.1



[PATCH] i386: Adjust rtx cost for imulq and imulw [PR115749]

2024-07-24 Thread Kong, Lingling
Tested spec2017 performance in Sierra Forest, Icelake, CascadeLake, at least 
there is no obvious regression.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.

OK for trunk?

gcc/ChangeLog:

* config/i386/x86-tune-costs.h (struct processor_costs):
Adjust rtx_cost of imulq and imulw for COST_N_INSNS (4)
to COST_N_INSNS (3).

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115749.c: New test.
---
 gcc/config/i386/x86-tune-costs.h | 16 
 gcc/testsuite/gcc.target/i386/pr115749.c | 16 
 2 files changed, 24 insertions(+), 8 deletions(-)  create mode 100644 
gcc/testsuite/gcc.target/i386/pr115749.c

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 769f334e531..2bfaee554d5 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2182,7 +2182,7 @@ struct processor_costs skylake_cost = {
   COSTS_N_INSNS (1),   /* variable shift costs */
   COSTS_N_INSNS (1),   /* constant shift costs */
   {COSTS_N_INSNS (3),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),  /*   HI */
+   COSTS_N_INSNS (3),  /*   HI */
COSTS_N_INSNS (3),  /*   SI */
COSTS_N_INSNS (3),  /*   DI */
COSTS_N_INSNS (3)}, /*other */
@@ -2310,7 +2310,7 @@ struct processor_costs icelake_cost = {
   COSTS_N_INSNS (1),   /* variable shift costs */
   COSTS_N_INSNS (1),   /* constant shift costs */
   {COSTS_N_INSNS (3),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),  /*   HI */
+   COSTS_N_INSNS (3),  /*   HI */
COSTS_N_INSNS (3),  /*   SI */
COSTS_N_INSNS (3),  /*   DI */
COSTS_N_INSNS (3)}, /*other */
@@ -2434,9 +2434,9 @@ struct processor_costs alderlake_cost = {
   COSTS_N_INSNS (1),   /* variable shift costs */
   COSTS_N_INSNS (1),   /* constant shift costs */
   {COSTS_N_INSNS (3),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),  /*   HI */
+   COSTS_N_INSNS (3),  /*   HI */
COSTS_N_INSNS (3),  /*   SI */
-   COSTS_N_INSNS (4),  /*   DI */
+   COSTS_N_INSNS (3),  /*   DI */
COSTS_N_INSNS (4)}, /*other */
   0,   /* cost of multiply per each bit set */
   {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
@@ -3234,9 +3234,9 @@ struct processor_costs tremont_cost = {
   COSTS_N_INSNS (1),   /* variable shift costs */
   COSTS_N_INSNS (1),   /* constant shift costs */
   {COSTS_N_INSNS (3),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),  /*   HI */
+   COSTS_N_INSNS (3),  /*   HI */
COSTS_N_INSNS (3),  /*   SI */
-   COSTS_N_INSNS (4),  /*   DI */
+   COSTS_N_INSNS (3),  /*   DI */
COSTS_N_INSNS (4)}, /*other */
   0,   /* cost of multiply per each bit set */
   {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
@@ -3816,9 +3816,9 @@ struct processor_costs generic_cost = {
   COSTS_N_INSNS (1),   /* variable shift costs */
   COSTS_N_INSNS (1),   /* constant shift costs */
   {COSTS_N_INSNS (3),  /* cost of starting multiply for QI */
-   COSTS_N_INSNS (4),  /*   HI */
+   COSTS_N_INSNS (3),  /*   HI */
COSTS_N_INSNS (3),  /*   SI */
-   COSTS_N_INSNS (4),  /*   DI */
+   COSTS_N_INSNS (3),  /*   DI */
COSTS_N_INSNS (4)}, /*other */
   0,   /* cost of multiply per each bit set */
   {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
diff --git a/gcc/testsuite/gcc.target/i386/pr115749.c 
b/gcc/testsuite/

[PATCH] i386: Remove ndd support for *add_4 [PR113744]

2024-07-30 Thread Kong, Lingling
*add_4 and *adddi_4 are for shorter opcode from cmp to inc/dec or add 
$128.
But NDD code is longer than the cmp code, so there is no need to support NDD.


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.

Ok for trunk?

gcc/ChangeLog:

   PR target/113744
   * config/i386/i386.md (*add_4): Remove NDD support.
   (*adddi_4): Ditto.

Co-Authored-By: Hu, Lin1 lin1...@intel.com
---
gcc/config/i386/i386.md | 40 +++-
1 file changed, 15 insertions(+), 25 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index fb10fdc9f96..3c293c14656 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -7146,35 +7146,31 @@
(define_insn "*adddi_4"
   [(set (reg FLAGS_REG)
  (compare
-(match_operand:DI 1 "nonimmediate_operand" "0,rm")
-(match_operand:DI 2 "x86_64_immediate_operand" "e,e")))
-   (clobber (match_scratch:DI 0 "=r,r"))]
+   (match_operand:DI 1 "nonimmediate_operand" "0")
+   (match_operand:DI 2 "x86_64_immediate_operand" "e")))
+   (clobber (match_scratch:DI 0 "=r"))]
   "TARGET_64BIT
&& ix86_match_ccmode (insn, CCGCmode)"
{
-  bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
 {
 case TYPE_INCDEC:
   if (operands[2] == constm1_rtx)
-return use_ndd ? "inc{q}\t{%1, %0|%0, %1}" : "inc{q}\t%0";
+ return "inc{q}\t%0";
   else
 {
gcc_assert (operands[2] == const1_rtx);
-return use_ndd ? "dec{q}\t{%1, %0|%0, %1}" : "dec{q}\t%0";
+   return "dec{q}\t%0";
  }
 default:
   if (x86_maybe_negate_const_int (&operands[2], DImode))
-  return use_ndd ? "add{q}\t{%2, %1, %0|%0, %1, %2}"
-: "add{q}\t{%2, %0|%0, %2}";
+ return "add{q}\t{%2, %0|%0, %2}";
-  return use_ndd ? "sub{q}\t{%2, %1, %0|%0, %1, %2}"
-  : "sub{q}\t{%2, %0|%0, %2}";
+  return "sub{q}\t{%2, %0|%0, %2}";
 }
}
-  [(set_attr "isa" "*,apx_ndd")
-   (set (attr "type")
+  [(set (attr "type")
  (if_then_else (match_operand:DI 2 "incdec_operand")
  (const_string "incdec")
  (const_string "alu")))
@@ -7195,36 +7191,30 @@
(define_insn "*add_4"
   [(set (reg FLAGS_REG)
  (compare
-(match_operand:SWI124 1 "nonimmediate_operand" "0,rm")
+   (match_operand:SWI124 1 "nonimmediate_operand" "0")
(match_operand:SWI124 2 "const_int_operand")))
-   (clobber (match_scratch:SWI124 0 "=,r"))]
+   (clobber (match_scratch:SWI124 0 "="))]
   "ix86_match_ccmode (insn, CCGCmode)"
{
-  bool use_ndd = get_attr_isa (insn) == ISA_APX_NDD;
   switch (get_attr_type (insn))
 {
 case TYPE_INCDEC:
   if (operands[2] == constm1_rtx)
-return use_ndd ? "inc{}\t{%1, %0|%0, %1}"
-: "inc{}\t%0";
+return "inc{}\t%0";
   else
 {
gcc_assert (operands[2] == const1_rtx);
-return use_ndd ? "dec{}\t{%1, %0|%0, %1}"
-: "dec{}\t%0";
+   return "dec{}\t%0";
  }
 default:
   if (x86_maybe_negate_const_int (&operands[2], mode))
-  return use_ndd ? "add{}\t{%2, %1, %0|%0, %1, %2}"
-: "add{}\t{%2, %0|%0, %2}";
+ return "add{}\t{%2, %0|%0, %2}";
-  return use_ndd ? "sub{}\t{%2, %1, %0|%0, %1, %2}"
-  : "sub{}\t{%2, %0|%0, %2}";
+  return "sub{}\t{%2, %0|%0, %2}";
 }
}
-  [(set_attr "isa" "*,apx_ndd")
-   (set (attr "type")
+  [(set (attr "type")
  (if_then_else (match_operand: 2 "incdec_operand")
  (const_string "incdec")
  (const_string "alu")))
--
2.31.1


[PATCH] i386: Fix memory constraint for APX NF

2024-07-31 Thread Kong, Lingling
The je constraint should be used for APX NDD ADD with register source
operand. The jM is for APX NDD patterns with immediate operand.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.md (nf_mem_constraint): Fixed the constraint
for the define_subst_attr.
(nf_mem_constraint): Added new define_subst_attr.
(*add_1): Fixed the constraint.
---
 gcc/config/i386/i386.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index fb10fdc9f96..aa7220ee17c 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -6500,7 +6500,8 @@
 (define_subst_attr "nf_name" "nf_subst" "_nf" "")
 (define_subst_attr "nf_prefix" "nf_subst" "%{nf%} " "")
 (define_subst_attr "nf_condition" "nf_subst" "TARGET_APX_NF" "true")
-(define_subst_attr "nf_mem_constraint" "nf_subst" "je" "m")
+(define_subst_attr "nf_add_mem_constraint" "nf_subst" "je" "m")
+(define_subst_attr "nf_mem_constraint" "nf_subst" "jM" "m")
 (define_subst_attr "nf_applied" "nf_subst" "true" "false")
 (define_subst_attr "nf_nonf_attr" "nf_subst"  "noapx_nf" "*")
 (define_subst_attr "nf_nonf_x64_attr" "nf_subst" "noapx_nf" "x64")
@@ -6514,7 +6515,7 @@
(clobber (reg:CC FLAGS_REG))])

 (define_insn "*add_1"
-  [(set (match_operand:SWI48 0 "nonimmediate_operand" 
"=rm,r,r,r,r,r,r,r")
+  [(set (match_operand:SWI48 0 "nonimmediate_operand" 
"=r,r,r,r,r,r,r,r")
(plus:SWI48
  (match_operand:SWI48 1 "nonimmediate_operand" "%0,0,0,r,r,rje,jM,r")
  (match_operand:SWI48 2 "x86_64_general_operand" 
"r,e,BM,0,le,r,e,BM")))]
--
2.31.1


RE: [PATCH] i386: Fix memory constraint for APX NF

2024-07-31 Thread Kong, Lingling



> -Original Message-
> From: Liu, Hongtao 
> Sent: Thursday, August 1, 2024 9:35 AM
> To: Kong, Lingling ; gcc-patches@gcc.gnu.org
> Cc: Wang, Hongyu 
> Subject: RE: [PATCH] i386: Fix memory constraint for APX NF
> 
> 
> 
> > -Original Message-
> > From: Kong, Lingling 
> > Sent: Thursday, August 1, 2024 9:30 AM
> > To: gcc-patches@gcc.gnu.org
> > Cc: Liu, Hongtao ; Wang, Hongyu
> > 
> > Subject: [PATCH] i386: Fix memory constraint for APX NF
> >
> > The je constraint should be used for APX NDD ADD with register source
> > operand. The jM is for APX NDD patterns with immediate operand.
> But these 2 alternatives is for Non-NDD.  
The jM constraint is for the size limit of 15 bytes with a non-default address 
space.
It also works for APX NF. The je is for TLS code with an EVEX prefix for ADD, and 
APX NF
also has the EVEX prefix.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> > * config/i386/i386.md (nf_mem_constraint): Fixed the constraint
> > for the define_subst_attr.
> > (nf_mem_constraint): Added new define_subst_attr.
> > (*add_1): Fixed the constraint.
> > ---
> >  gcc/config/i386/i386.md | 5 +++--
> >  1 file changed, 3 insertions(+), 2 deletions(-)
> >
> > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index
> > fb10fdc9f96..aa7220ee17c 100644
> > --- a/gcc/config/i386/i386.md
> > +++ b/gcc/config/i386/i386.md
> > @@ -6500,7 +6500,8 @@
> >  (define_subst_attr "nf_name" "nf_subst" "_nf" "")  (define_subst_attr
> > "nf_prefix" "nf_subst" "%{nf%} " "")  (define_subst_attr "nf_condition"
> > "nf_subst" "TARGET_APX_NF" "true") -(define_subst_attr
> > "nf_mem_constraint" "nf_subst" "je" "m")
> > +(define_subst_attr "nf_add_mem_constraint" "nf_subst" "je" "m")
> > +(define_subst_attr "nf_mem_constraint" "nf_subst" "jM" "m")
> >  (define_subst_attr "nf_applied" "nf_subst" "true" "false")
> > (define_subst_attr "nf_nonf_attr" "nf_subst"  "noapx_nf" "*")
> > (define_subst_attr "nf_nonf_x64_attr" "nf_subst" "noapx_nf" "x64") @@ -
> 6514,7 +6515,7 @@
> > (clobber (reg:CC FLAGS_REG))])
> >
> >  (define_insn "*add_1"
> > -  [(set (match_operand:SWI48 0 "nonimmediate_operand"
> > "=rm,r,r,r,r,r,r,r")
> > +  [(set (match_operand:SWI48 0 "nonimmediate_operand"
> > + "=r,r,r,r,r,r,r,r")
> > (plus:SWI48
> >   (match_operand:SWI48 1 "nonimmediate_operand"
> > "%0,0,0,r,r,rje,jM,r")
> >   (match_operand:SWI48 2 "x86_64_general_operand"
> > "r,e,BM,0,le,r,e,BM")))]
> > --
> > 2.31.1


[PATCH] i386: Fix comment/naming for APX NDD constraints

2024-08-01 Thread Kong, Lingling
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/constraints.md: Fixed the comment/naming
for je/jM/jO.
* config/i386/predicates.md (apx_ndd_memory_operand):
Renamed and fixed the comment.
(apx_evex_memory_operand): New name.
(apx_ndd_add_memory_operand): Ditto.
(apx_evex_add_memory_operand): Ditto.
---
 gcc/config/i386/constraints.md | 12 ++--
 gcc/config/i386/predicates.md  | 21 +++--
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index 18389c47800..e03d0e1b45b 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -463,14 +463,14 @@
  "TARGET_APX_EGPR && !TARGET_AVX ? GENERAL_GPR16 : GENERAL_REGS")

 (define_memory_constraint "je"
-  "@internal Memory operand for APX NDD ADD."
-  (match_operand 0 "apx_ndd_add_memory_operand"))
+  "@internal Memory operand for APX EVEX ADD for NDD or NF."
+  (match_operand 0 "apx_evex_add_memory_operand"))

 (define_memory_constraint "jM"
-  "@internal Memory operand, with APX NDD check."
-  (match_operand 0 "apx_ndd_memory_operand"))
+  "@internal Memory operand, with APX EVEX check for NDD or NF."
+  (match_operand 0 "apx_evex_memory_operand"))

 (define_memory_constraint "jO"
-  "@internal Offsettable memory operand, with APX NDD check."
-  (and (match_operand 0 "apx_ndd_memory_operand")
+  "@internal Offsettable memory operand, with APX EVEX check for NDD or NF."
+  (and (match_operand 0 "apx_evex_memory_operand")
   (match_test "offsettable_nonstrict_memref_p (op)")))
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 680594871de..8cab10550e8 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -2264,11 +2264,11 @@
   return true;
 })

-;; Return true if OP is a memory operand that can be also used in APX
-;; NDD patterns with immediate operand.  With non-default address space,
-;; segment register or address size prefix, APX NDD instruction length
+;; Return true if OP is a memory operand that can be also used in APX EVEX for
+;; NDD or NF patterns with immediate operand.  With non-default address space,
+;; segment register or address size prefix, APX EVEX instruction length
 ;; can exceed the 15 byte size limit.
-(define_predicate "apx_ndd_memory_operand"
+(define_predicate "apx_evex_memory_operand"
   (match_operand 0 "memory_operand")
 {
   /* OK if immediate operand size < 4 bytes.  */
@@ -2312,19 +2312,20 @@
   return true;
 })

-;; Return true if OP is a memory operand which can be used in APX NDD
-;; ADD with register source operand.  UNSPEC_GOTNTPOFF memory operand
-;; is allowed with APX NDD ADD only if R_X86_64_CODE_6_GOTTPOFF works.
-(define_predicate "apx_ndd_add_memory_operand"
+;; Return true if OP is a memory operand which can be used in APX EVEX ADD for
+;; NDD or NF with register source operand.  UNSPEC_GOTNTPOFF memory operand is
+;; allowed with APX EVEX ADD only if R_X86_64_CODE_6_GOTTPOFF works.
+(define_predicate "apx_evex_add_memory_operand"
   (match_operand 0 "memory_operand")
 {
-  /* OK if "add %reg1, name@gottpoff(%rip), %reg2" is supported.  */
+  /* OK if "add %reg1, name@gottpoff(%rip), %reg2" or
+   "{nf} add name@gottpoff(%rip), %reg1" are supported.  */
   if (HAVE_AS_R_X86_64_CODE_6_GOTTPOFF)
 return true;

   op = XEXP (op, 0);

-  /* Disallow APX NDD ADD with UNSPEC_GOTNTPOFF.  */
+  /* Disallow APX EVEX ADD with UNSPEC_GOTNTPOFF.  */
   if (GET_CODE (op) == CONST
   && GET_CODE (XEXP (op, 0)) == UNSPEC
   && XINT (XEXP (op, 0), 1) == UNSPEC_GOTNTPOFF)
--
2.31.1


[PATCH] x86: Fix cmov cost model issue [PR109549]

2024-05-05 Thread Kong, Lingling
Hi,
(if_then_else:SI (eq (reg:CCZ 17 flags)
(const_int 0 [0]))
(reg/v:SI 101 [ e ])
(reg:SI 102))
The cost is 8 for the rtx; the cost for
(eq (reg:CCZ 17 flags) (const_int 0 [0])) is 4, but this is just an operator and
we do not need to compute its cost in cmov.

Bootstrapped and regtested on x86_64-pc-linux-gnu.
OK for trunk?

gcc/ChangeLog:

PR target/109549
* config/i386/i386.cc (ix86_rtx_costs): The XEXP (x, 0) for cmov
is an operator do not need to compute cost.

gcc/testsuite/ChangeLog:

* gcc.target/i386/cmov6.c: Fixed.
---
 gcc/config/i386/i386.cc   | 2 +-
 gcc/testsuite/gcc.target/i386/cmov6.c | 5 +
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 
4d6b2b98761..59b4ce3bfbf 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22237,7 +22237,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
{
  /* cmov.  */
  *total = COSTS_N_INSNS (1);
- if (!REG_P (XEXP (x, 0)))
+ if (!COMPARISON_P (XEXP (x, 0)) && !REG_P (XEXP (x, 0)))
*total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
  if (!REG_P (XEXP (x, 1)))
*total += rtx_cost (XEXP (x, 1), mode, code, 1, speed); diff --git 
a/gcc/testsuite/gcc.target/i386/cmov6.c b/gcc/testsuite/gcc.target/i386/cmov6.c
index 5111c8a9099..535326e4c2a 100644
--- a/gcc/testsuite/gcc.target/i386/cmov6.c
+++ b/gcc/testsuite/gcc.target/i386/cmov6.c
@@ -1,9 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -march=k8" } */
-/* if-converting this sequence would require two cmov
-   instructions and seems to always cost more independent
-   of the TUNE_ONE_IF_CONV setting.  */
-/* { dg-final { scan-assembler-not "cmov\[^6\]" } } */
+/* { dg-final { scan-assembler "cmov\[^6\]" } } */
 
 /* Verify that blocks are converted to conditional moves.  */  extern int bar 
(int, int);
--
2.31.1



[PATCH] i386: fix ix86_hardreg_mov_ok with lra_in_progress

2024-05-06 Thread Kong, Lingling
Hi,
Originally eliminate_regs_in_insn will transform 
(parallel [
  (set (reg:QI 130)
(plus:QI (subreg:QI (reg:DI 19 frame) 0)
  (const_int 96)))
  (clobber (reg:CC 17 flag))]) {*addqi_1} 
to 
(set (reg:QI 130) 
  (subreg:QI (reg:DI 19 frame) 0)) {*movqi_internal}
when verify_changes.

But with No Flags add, it transforms
(set (reg:QI 5 di)
  (plus:QI (subreg:QI (reg:DI 19 frame) 0)
   (const_int 96))) {*addqi_1_nf}
to
(set (reg:QI 5 di)
 (subreg:QI (reg:DI 19 frame) 0)) {*addqi_1_nf}.
there are no extra clobbers at the end, and its dest reg is just a hard reg.
ix86_hardreg_mov_ok returns false, so it fails to update the insn and causes
an ICE when transforming to movqi_internal.

But actually it is ok and safe for ix86_hardreg_mov_ok when lra_in_progress.

And tested the spec2017, the performance was not affected.
Bootstrapped and regtested on x86_64-pc-linux-gnu. OK for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_hardreg_mov_ok): Relax
hard reg mov restriction when lra in progress.
---
 gcc/config/i386/i386.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 
4d6b2b98761..ca4348a18bf 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20357,7 +20357,8 @@ ix86_hardreg_mov_ok (rtx dst, rtx src)
   ? standard_sse_constant_p (src, GET_MODE (dst))
   : x86_64_immediate_operand (src, GET_MODE (dst)))
   && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst)))
-  && !reload_completed)
+  && !reload_completed
+  && !lra_in_progress)
 return false;
   return true;
 }
--
2.31.1



[PATCH v3 1/2] [APX CFCMOV] Support APX CFCMOV in if_convert pass

2024-09-05 Thread Kong, Lingling
Hi,

This version has added a new optab named 'cfmovcc'. The new optab is used
in the middle end to expand to cfcmov. And simplified my patch by trying to
generate the conditional faulting movcc in noce_try_cmove_arith function.

All the changes passed bootstrap & regtest x86-64-pc-linux-gnu.
We also tested spec with SDE and passed the runtime test.

Ok for trunk?


The APX CFCMOV[1] feature implements conditional faulting, which means that
if the comparison is false, all memory faults are suppressed when loading
or storing a memory operand. With it, a conditional move can load or store
a memory operand that may trap or fault.

In the middle end, we currently don't support a conditional move if we know
that a load from A or B could trap or fault. To enable CFCMOV, we
added a new optab named cfmovcc.

A conditional move with fault suppression for a conditional mem store would
not move any arithmetic calculations. For a conditional mem load, we currently
only support a conditional move between one mem that may trap and one operand
that neither traps nor is a mem.


[1].https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html

gcc/ChangeLog:

   * doc/md.texi: Add cfmovcc insn pattern explanation.
   * ifcvt.cc (can_use_cmove_load_mem_notrap): New func
   for conditional faulting movcc for load.
   (can_use_cmove_store_mem_notrap): New func for conditional
   faulting movcc for store.
   (can_use_cfmovcc):  New func for conditional faulting.
   (noce_try_cmove_arith): Try to convert to conditional faulting
   movcc.
   (noce_process_if_block): Ditto.
   * optabs.cc (emit_conditional_move): Handle cfmovcc.
   (emit_conditional_move_1): Ditto.
   * optabs.def (OPTAB_D): New optab.
---
gcc/doc/md.texi |  10 
gcc/ifcvt.cc| 119 
gcc/optabs.cc   |  14 +-
gcc/optabs.def  |   1 +
4 files changed, 132 insertions(+), 12 deletions(-)

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index a9259112251..5f563787c49 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -8591,6 +8591,16 @@ Return 1 if operand 1 is a normal floating point number 
and 0
otherwise.  @var{m} is a scalar floating point mode.  Operand 0
has mode @code{SImode}, and operand 1 has mode @var{m}.
+@cindex @code{cfmov@var{mode}cc} instruction pattern
+@item @samp{cfmov@var{mode}cc}
+Similar to @samp{mov@var{mode}cc} but for conditional faulting,
+If the comparison is false, all memory faults are suppressed
+when load or store a memory operand.
+
+Conditionally move operand 2 or operand 3 into operand 0 according
+to the comparison in operand 1.  If the comparison is true, operand 2
+is moved into operand 0, otherwise operand 3 is moved.
+
@end table
 @end ifset
diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
index 6487574c514..59845390607 100644
--- a/gcc/ifcvt.cc
+++ b/gcc/ifcvt.cc
@@ -778,6 +778,9 @@ static bool noce_try_store_flag_mask (struct noce_if_info 
*);
static rtx noce_emit_cmove (struct noce_if_info *, rtx, enum rtx_code, rtx,
rtx, rtx, rtx, rtx = NULL, rtx 
= NULL);
static bool noce_try_cmove (struct noce_if_info *);
+static bool can_use_cmove_load_mem_notrap (rtx, rtx);
+static bool can_use_cmove_store_mem_notrap (rtx, rtx, rtx, bool);
+static bool can_use_cfmovcc (struct noce_if_info *);
static bool noce_try_cmove_arith (struct noce_if_info *);
static rtx noce_get_alt_condition (struct noce_if_info *, rtx, rtx_insn **);
static bool noce_try_minmax (struct noce_if_info *);
@@ -2132,6 +2135,69 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple)
   return true;
}
+/* Return TRUE if we could convert "if (test) x = *a; else x = b;"
+   or "if (test) x = a; else x = *b;" to conditional faulting movcc,
+   i.e. x86 cfcmov, especially when load a or b may cause memmory faults.  */
+
+static bool
+can_use_cmove_load_mem_notrap (rtx a, rtx b)
+{
+  /* Just handle a conditional move from one trap MEM + other non_trap,
+ non mem cases.  */
+  if (!(MEM_P (a) ^ MEM_P (b)))
+  return false;
+  bool a_trap = may_trap_or_fault_p (a);
+  bool b_trap = may_trap_or_fault_p (b);
+
+  if (!(a_trap ^ b_trap))
+return false;
+  if (a_trap && !MEM_P (a))
+return false;
+  if (b_trap && !MEM_P (b))
+return false;
+
+  return true;
+}
+
+/* Return TRUE if we could convert "if (test) *x = a; else skip" to
+   conditional faulting movcc, i.e. x86 cfcmov, especially when store
+   x may cause memmory faults and in else_bb x == b.  */
+
+static bool
+can_use_cmove_store_mem_notrap (rtx x, rtx a, rtx b, bool a_simple)
+{
+  gcc_assert (MEM_P (x));
+
+  machine_mode x_mode = GET_MODE (x);
+
+  if (!rtx_equal_p (x, b) || !may_trap_or_fault_p (x))
+return false;
+  if (!a_simple || !register_operand (a, x_mode))
+return false;
+
+  return true;
+}
+
+/* Return TRUE if backend supports cfmovcc_optab, which suppressed memory
+   f

[PATCH v3 2/2] [APX CFCMOV] Support APX CFCMOV in backend

2024-09-05 Thread Kong, Lingling
gcc/ChangeLog:

   * config/i386/i386-expand.cc (ix86_can_cfcmov_p): New func
   that test if the cfcmov can be generated.
   (ix86_expand_int_cfmovcc):  Expand to cfcmov pattern.
   * config/i386/i386-opts.h (enum apx_features): New.
   * config/i386/i386-protos.h (ix86_expand_int_cfmovcc): Define.
   * config/i386/i386.cc (ix86_rtx_costs): Add UNSPEC_APX_CFCMOV
   cost.
   * config/i386/i386.h (TARGET_APX_CFCMOV): Define.
   * config/i386/i386.md (cfmovcc): New define_expand.
   (*cfmovcc): New define_insn.
   (*cfmovcc_2): Ditto.
   (*cfmovccz): Ditto.
   (UNSPEC_APX_CFCMOV): New unspec for cfcmov.
   * config/i386/i386.opt: Add enum value for cfcmov.

gcc/testsuite/ChangeLog:

   * gcc.target/i386/apx-cfcmov-1.c: New test.
   * gcc.target/i386/apx-cfcmov-2.c: Ditto.
---
gcc/config/i386/i386-expand.cc   | 67 ++
gcc/config/i386/i386-opts.h  |  4 +-
gcc/config/i386/i386-protos.h|  1 +
gcc/config/i386/i386.cc  | 16 +++--
gcc/config/i386/i386.h   |  1 +
gcc/config/i386/i386.md  | 60 +++-
gcc/config/i386/i386.opt |  3 +
gcc/testsuite/gcc.target/i386/apx-cfcmov-1.c | 73 
gcc/testsuite/gcc.target/i386/apx-cfcmov-2.c | 40 +++
9 files changed, 259 insertions(+), 6 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/apx-cfcmov-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/apx-cfcmov-2.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 124cb976ec8..7ba445a189b 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -3368,6 +3368,73 @@ ix86_expand_int_addcc (rtx operands[])
   return true;
}
+/* Return TRUE if we could convert "if (test) x = a; else x = b;" to cfcmov,
+   especially when load a or b or x store may cause memmory faults.  */
+
+bool
+ix86_can_cfcmov_p (rtx x, rtx a, rtx b)
+{
+  machine_mode mode = GET_MODE (x);
+  /* Conditional load for cfcmov.  */
+  if (register_operand (x, mode)
+  /* "if (test) x = *a; else x = b;". */
+  && ((MEM_P (a) && register_operand (b, mode))
+   /* "if (test) x = *a; else x = 0;". */
+   || (MEM_P (a) && b == const0_rtx)
+   /* "if (test) x = a; else x = *b;". */
+   || (register_operand (a, mode) && MEM_P (b
+return true;
+  /* Conditional store "if (test) *x = a; else skip;".  */
+  else if (MEM_P (x) && x == b && register_operand (a, mode))
+return true;
+  return false;
+}
+
+bool
+ix86_expand_int_cfmovcc (rtx operands[])
+{
+  machine_mode mode = GET_MODE (operands[0]);
+  if ((mode != DImode && mode != SImode && mode != HImode)
+  || !TARGET_APX_CFCMOV)
+return false;
+  enum rtx_code code = GET_CODE (operands[1]);
+  rtx_insn *compare_seq;
+  rtx compare_op;
+  rtx op0 = XEXP (operands[1], 0);
+  rtx op1 = XEXP (operands[1], 1);
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+
+  start_sequence ();
+  compare_op = ix86_expand_compare (code, op0, op1);
+  compare_seq = get_insns ();
+  end_sequence ();
+
+  /* Just handle a conditional move from one trap MEM + other non_trap,
+ non mem cases.  */
+
+  if (may_trap_or_fault_p (op2) ^ may_trap_or_fault_p (op3))
+  {
+ if (ix86_can_cfcmov_p (operands[0], op2, op3))
+   {
+ if (may_trap_or_fault_p (op2))
+   op2 = gen_rtx_UNSPEC (mode, gen_rtvec (1, operands[2]),
+  
UNSPEC_APX_CFCMOV);
+ if (may_trap_or_fault_p (op3))
+   op3 = gen_rtx_UNSPEC (mode, gen_rtvec (1, operands[3]),
+  
UNSPEC_APX_CFCMOV);
+ emit_insn (compare_seq);
+ emit_insn (gen_rtx_SET (operands[0],
+  
gen_rtx_IF_THEN_ELSE (mode,
+   
  compare_op,
+   
  op2, op3)));
+ return true;
+   }
+ return false;
+  }
+  return false;
+}
+
bool
ix86_expand_int_movcc (rtx operands[])
{
diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h
index c7ec0d9fd39..711519ffb53 100644
--- a/gcc/config/i386/i386-opts.h
+++ b/gcc/config/i386/i386-opts.h
@@ -143,8 +143,10 @@ enum apx_features {
   apx_nf = 1 << 4,
   apx_ccmp = 1 << 5,
   apx_zu = 1 << 6,
+  apx_cfcmov = 1 << 7,
   apx_all = apx_egpr | apx_push2pop2 | apx_ndd
-  | apx_ppx | apx_nf | apx_ccmp | apx_zu,
+ 

RE: [PATCH v3 1/2] [APX CFCMOV] Support APX CFCMOV in if_convert pass

2024-09-11 Thread Kong, Lingling



> -Original Message-
> From: Richard Sandiford 
> Sent: Friday, September 6, 2024 5:19 PM
> To: Kong, Lingling 
> Cc: gcc-patches@gcc.gnu.org; Jeff Law ; Richard Biener
> ; Uros Bizjak ; Hongtao Liu
> ; Jakub Jelinek 
> Subject: Re: [PATCH v3 1/2] [APX CFCMOV] Support APX CFCMOV in if_convert
> pass
> 
> "Kong, Lingling"  writes:
> > Hi,
> >
> > This version has added a new optab named 'cfmovcc'. The new optab is
> > used in the middle end to expand to cfcmov. And simplified my patch by
> > trying to generate the conditional faulting movcc in noce_try_cmove_arith
> function.
> >
> > All the changes passed bootstrap & regtest x86-64-pc-linux-gnu.
> > We also tested spec with SDE and passed the runtime test.
> >
> > Ok for trunk?
> >
> >
> > APX CFCMOV[1] feature implements conditionally faulting which means If
> > the comparison is false, all memory faults are suppressed when load or
> > store a memory operand. Now we could load or store a memory operand
> > may trap or fault for conditional move.
> >
> > In middle-end, now we don't support a conditional move if we knew that
> > a load from A or B could trap or fault. To enable CFCMOV, we added a
> > new optab named cfmovcc.
> >
> > Conditional move suppress fault for condition mem store would not move
> > any arithmetic calculations. For condition mem load now just support a
> > conditional move one trap mem and one no trap and no mem cases.
> 
> Sorry if this is going over old ground (I haven't read the earlier versions 
> yet), but:
> instead of adding a new optab, could we treat CFCMOV as a scalar instance of
> maskload_optab?  Robin is working on adding an "else" value for when the
> condition/mask is false.  After that, it would seem to be a pretty close 
> match to
> CFCMOV.
> 
> One reason for preferring maskload is that it makes the load an explicit part 
> of
> the interface.  We could then potentially use it in gimple too, not just 
> expand.
> 

Yes, a conditional load is like a scalar instance of maskload_optab with an
else operand.
I could try to use maskload_optab to generate cfcmov in the rtl ifcvt pass, but
that is still after expand.
We don't have an if-convert pass for scalars in gimple now; do we have a plan
to add one?

Thanks,
Lingling

> Thanks,
> Richard
> 
> >
> >
> > [1].https://www.intel.com/content/www/us/en/developer/articles/technic
> > al/advanced-performance-extensions-apx.html
> >
> > gcc/ChangeLog:
> >
> >* doc/md.texi: Add cfmovcc insn pattern explanation.
> >* ifcvt.cc (can_use_cmove_load_mem_notrap): New func
> >for conditional faulting movcc for load.
> >(can_use_cmove_store_mem_notrap): New func for conditional
> >faulting movcc for store.
> >(can_use_cfmovcc):  New func for conditional faulting.
> >(noce_try_cmove_arith): Try to convert to conditional 
> > faulting
> >movcc.
> >(noce_process_if_block): Ditto.
> >* optabs.cc (emit_conditional_move): Handle cfmovcc.
> >(emit_conditional_move_1): Ditto.
> >* optabs.def (OPTAB_D): New optab.
> > ---
> > gcc/doc/md.texi |  10 
> > gcc/ifcvt.cc| 119 
> > gcc/optabs.cc   |  14 +-
> > gcc/optabs.def  |   1 +
> > 4 files changed, 132 insertions(+), 12 deletions(-)
> >
> > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index
> > a9259112251..5f563787c49 100644
> > --- a/gcc/doc/md.texi
> > +++ b/gcc/doc/md.texi
> > @@ -8591,6 +8591,16 @@ Return 1 if operand 1 is a normal floating
> > point number and 0 otherwise.  @var{m} is a scalar floating point
> > mode.  Operand 0 has mode @code{SImode}, and operand 1 has mode
> @var{m}.
> > +@cindex @code{cfmov@var{mode}cc} instruction pattern @item
> > +@samp{cfmov@var{mode}cc} Similar to @samp{mov@var{mode}cc} but for
> > +conditional faulting, If the comparison is false, all memory faults
> > +are suppressed when load or store a memory operand.
> > +
> > +Conditionally move operand 2 or operand 3 into operand 0 according to
> > +the comparison in operand 1.  If the comparison is true, operand 2 is
> > +moved into operand 0, otherwise operand 3 is moved.
> > +
> > @end table
> >  @end ifset
> > diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc index
> > 6487574c514..59845390607 100644
> > --- a/gcc/ifcvt.cc
> > +++ b/gcc/ifcvt.cc

[PATCH] i386: Fix scalar VCOMSBF16 which only compares low word

2024-10-09 Thread Kong, Lingling
Hi,

Fixed scalar VCOMSBF16 misused in AVX10.2.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m64}.

Ok for trunk?

gcc/ChangeLog:

* config/i386/sse.md (avx10_2_comsbf16_v8bf): Fixed scalar
operands.
---
 gcc/config/i386/sse.md | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 
d6e2135423d..a529849898e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -32332,8 +32332,12 @@
 (define_insn "avx10_2_comsbf16_v8bf"
   [(set (reg:CCFP FLAGS_REG)
(unspec:CCFP
- [(match_operand:V8BF 0 "register_operand" "v")
-  (match_operand:V8BF 1 "nonimmediate_operand" "vm")]
+ [(vec_select:BF
+(match_operand:V8BF 0 "register_operand" "v")
+(parallel [(const_int 0)]))
+  (vec_select:BF
+(match_operand:V8BF 1 "nonimmediate_operand" "vm")
+(parallel [(const_int 0)]))]
 UNSPEC_VCOMSBF16))]
   "TARGET_AVX10_2_256"
   "vcomsbf16\t{%1, %0|%0, %1}"
--
2.31.1



RE: [PATCH v3 1/2] [APX CFCMOV] Support APX CFCMOV in if_convert pass

2024-09-19 Thread Kong, Lingling
> > "Kong, Lingling"  writes:
> > > Hi,
> > >
> > > This version has added a new optab named 'cfmovcc'. The new optab is
> > > used in the middle end to expand to cfcmov. And simplified my patch
> > > by trying to generate the conditional faulting movcc in
> > > noce_try_cmove_arith
> > function.
> > >
> > > All the changes passed bootstrap & regtest x86-64-pc-linux-gnu.
> > > We also tested spec with SDE and passed the runtime test.
> > >
> > > Ok for trunk?
> > >
> > >
> > > APX CFCMOV[1] feature implements conditionally faulting which means
> > > If the comparison is false, all memory faults are suppressed when
> > > load or store a memory operand. Now we could load or store a memory
> > > operand may trap or fault for conditional move.
> > >
> > > In middle-end, now we don't support a conditional move if we knew
> > > that a load from A or B could trap or fault. To enable CFCMOV, we
> > > added a new optab named cfmovcc.
> > >
> > > Conditional move suppress fault for condition mem store would not
> > > move any arithmetic calculations. For condition mem load now just
> > > support a conditional move one trap mem and one no trap and no mem cases.
> >
> > Sorry if this is going over old ground (I haven't read the earlier versions 
> > yet), but:
> > instead of adding a new optab, could we treat CFCMOV as a scalar
> > instance of maskload_optab?  Robin is working on adding an "else"
> > value for when the condition/mask is false.  After that, it would seem
> > to be a pretty close match to CFCMOV.
> >
> > One reason for preferring maskload is that it makes the load an
> > explicit part of the interface.  We could then potentially use it in gimple 
> > too, not
> just expand.
> >
> 
> Yes, for conditional load is like a scalar instance of  maskload_optab  with 
> else
> operand.
> I could try to use maskload_optab to generate cfcmov in rtl ifcvt pass. But 
> it still
> after expand.
> Now we don't have if-covert pass for scalar in gimple, do we have plan to do
> that ?
> 

Hi,

I have tried to use maskload/maskstore to generate CFCMOV in the ifcvt pass.
Unlike movcc, maskload/maskstore are not allowed to FAIL, but I need
restrictions for CFCMOV at backend expand time.  Since maskload/maskstore
expansion cannot fail, I can only place the restrictions in ifcvt and in
emit_conditional_move (in optabs.cc).

I'm not sure if this is the right approach, do you have any suggestions?

Thanks,
Lingling

> > Thanks,
> > Richard
> >
> > >
> > >
> > > [1].https://www.intel.com/content/www/us/en/developer/articles/techn
> > > ic al/advanced-performance-extensions-apx.html
> > >
> > > gcc/ChangeLog:
> > >
> > >* doc/md.texi: Add cfmovcc insn pattern explanation.
> > >* ifcvt.cc (can_use_cmove_load_mem_notrap): New func
> > >for conditional faulting movcc for load.
> > >(can_use_cmove_store_mem_notrap): New func for conditional
> > >faulting movcc for store.
> > >(can_use_cfmovcc):  New func for conditional faulting.
> > >(noce_try_cmove_arith): Try to convert to conditional 
> > > faulting
> > >movcc.
> > >(noce_process_if_block): Ditto.
> > >* optabs.cc (emit_conditional_move): Handle cfmovcc.
> > >(emit_conditional_move_1): Ditto.
> > >* optabs.def (OPTAB_D): New optab.
> > > ---
> > > gcc/doc/md.texi |  10 
> > > gcc/ifcvt.cc| 119 
> > > gcc/optabs.cc   |  14 +-
> > > gcc/optabs.def  |   1 +
> > > 4 files changed, 132 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index
> > > a9259112251..5f563787c49 100644
> > > --- a/gcc/doc/md.texi
> > > +++ b/gcc/doc/md.texi
> > > @@ -8591,6 +8591,16 @@ Return 1 if operand 1 is a normal floating
> > > point number and 0 otherwise.  @var{m} is a scalar floating point
> > > mode.  Operand 0 has mode @code{SImode}, and operand 1 has mode
> > @var{m}.
> > > +@cindex @code{cfmov@var{mode}cc} instruction pattern @item
> > > +@samp{cfmov@var{mode}cc} Similar to @samp{mov@var{mode}cc} but for
> > > +conditional faulting, If the comparison is false, all memory faults
&g

[PATCH] i386: Update the comment for mapxf option

2024-09-18 Thread Kong, Lingling
Hi,

After the APX NF, CCMP and ZU features were supported, the comment for the
APX option also needs an update.

Ok for trunk?
 
gcc/ChangeLog:

* config/i386/i386.opt: Update the features included in apxf.
---
 gcc/config/i386/i386.opt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 
fe16e44a4ea..64c295d344c 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1313,7 +1313,7 @@ Enable vectorization for scatter instruction.
 mapxf
 Target Mask(ISA2_APX_F) Var(ix86_isa_flags2) Save  Support code generation for 
APX features, including EGPR, PUSH2POP2, 
-NDD and PPX.
+NDD, PPX, NF, CCMP and ZU.
 
 mapx-features=
 Target Undocumented Joined Enum(apx_features) EnumSet Var(ix86_apx_features) 
Init(apx_none) Save
--
2.31.1



[PATCH v4 1/2] [APX CFCMOV] Support APX CFCMOV in if_convert pass

2024-11-13 Thread Kong, Lingling
Hi,

Many thanks to Richard for the suggestion that conditional load is like a 
scalar instance of maskload_optab . So this version has use maskload and 
maskstore optab to expand and generate cfcmov in ifcvt pass.

All the changes passed bootstrap & regtest x86-64-pc-linux-gnu.
We also tested spec with SDE and passed the runtime test.

Ok for trunk?

The APX CFCMOV[1] feature implements conditional faulting, which means that all
memory faults are suppressed when the condition code evaluates to false while
loading or storing a memory operand. With it, a conditional move can load or
store a memory operand that may trap or fault.

In the middle end, we currently don't support a conditional move if we know
that a load from A or B could trap or fault. To enable CFCMOV, use the maskload
and maskstore optabs to expand.

A conditional move with fault suppression for a conditional mem store would not
move any arithmetic calculations. For a conditional mem load, we currently only
support a conditional move between one mem that may trap and one operand that
neither traps nor is a mem.

[1].https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html

gcc/ChangeLog:

* ifcvt.cc (can_use_scalar_mask_store): New func for conditional
faulting movcc for store.
(can_use_scalar_mask_load_store):  New func for conditional faulting.
(noce_try_cmove_arith): Try to convert to conditional faulting
movcc.
(noce_process_if_block): Ditto.
* optabs.cc (emit_conditional_move): Handle cfmovcc.
(emit_conditional_move_1): Ditto.
---
 gcc/ifcvt.cc  | 105 +-
 gcc/optabs.cc |  20 ++
 2 files changed, 115 insertions(+), 10 deletions(-)

diff --git a/gcc/ifcvt.cc b/gcc/ifcvt.cc
index 74f13a637b2..b3adee35ff5 100644
--- a/gcc/ifcvt.cc
+++ b/gcc/ifcvt.cc
@@ -778,6 +778,8 @@ static bool noce_try_store_flag_mask (struct noce_if_info 
*);
 static rtx noce_emit_cmove (struct noce_if_info *, rtx, enum rtx_code, rtx,
rtx, rtx, rtx, rtx = NULL, rtx = NULL);
 static bool noce_try_cmove (struct noce_if_info *);
+static bool can_use_scalar_mask_store (rtx, rtx, rtx, bool);
+static bool can_use_scalar_mask_load_store (struct noce_if_info *);
 static bool noce_try_cmove_arith (struct noce_if_info *);
 static rtx noce_get_alt_condition (struct noce_if_info *, rtx, rtx_insn **);
 static bool noce_try_minmax (struct noce_if_info *);
@@ -2132,6 +2134,54 @@ noce_emit_bb (rtx last_insn, basic_block bb, bool simple)
   return true;
 }
 
+/* Return TRUE if we could convert "if (test) *x = a; else skip" to
+   scalar mask store and could do conditional faulting movcc, i.e.
+   x86 cfcmov, especially when store x may cause memmory faults and
+   in else_bb x == b.  */
+
+static bool
+can_use_scalar_mask_store (rtx x, rtx a, rtx b, bool a_simple)
+{
+  gcc_assert (MEM_P (x));
+
+  machine_mode x_mode = GET_MODE (x);
+  if (convert_optab_handler (maskstore_optab, x_mode,
+x_mode) == CODE_FOR_nothing)
+return false;
+
+  if (!rtx_equal_p (x, b) || !may_trap_or_fault_p (x))
+return false;
+  if (!a_simple || !register_operand (a, x_mode))
+return false;
+
+  return true;
+}
+
+/* Return TRUE if backend supports scalar maskload_optab/maskstore_optab,
+   which suppressed memory faults when load or store a memory operand
+   and the condition code evaluates to false.  */
+
+static bool
+can_use_scalar_mask_load_store (struct noce_if_info *if_info)
+{
+  rtx a = if_info->a;
+  rtx b = if_info->b;
+  rtx x = if_info->x;
+
+  if (!MEM_P (a) && !MEM_P (b))
+return false;
+
+  if (MEM_P (x))
+return can_use_scalar_mask_store (x, a, b, if_info->then_simple);
+  else
+/* Return TRUE if backend supports scalar maskload_optab, we could convert
+   "if (test) x = *a; else x = b;" or "if (test) x = a; else x = *b;"
+   to conditional faulting movcc, i.e. x86 cfcmov, especially when load a
+   or b may cause memmory faults.  */
+return convert_optab_handler (maskstore_optab, GET_MODE (a),
+ GET_MODE (a)) != CODE_FOR_nothing;
+}
+
 /* Try more complex cases involving conditional_move.  */
 
 static bool
@@ -2171,7 +2221,17 @@ noce_try_cmove_arith (struct noce_if_info *if_info)
   /* ??? We could handle this if we knew that a load from A or B could
  not trap or fault.  This is also true if we've already loaded
  from the address along the path from ENTRY.  */
-  else if (may_trap_or_fault_p (a) || may_trap_or_fault_p (b))
+  /* Just wait cse_not_expected, then convert to conditional mov on their
+ addresses followed by a load.  */
+  else if (may_trap_or_fault_p (a) && may_trap_or_fault_p (b))
+return false;
+  /* Scalar maskload_optab/maskstore_optab implements conditionally faulting
+ which means that if the condition code evaluates to false, all memory
+ faults are suppressed when load or store a memory operand.  Now we could
+ load or store a 

[PATCH v4 2/2] [APX CFCMOV] Support APX CFCMOV in backend

2024-11-13 Thread Kong, Lingling
gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_int_cfmovcc):  Expand
to cfcmov pattern.
* config/i386/i386-opts.h (enum apx_features): New.
* config/i386/i386-protos.h (ix86_expand_int_cfmovcc): Define.
* config/i386/i386.cc (ix86_rtx_costs): Add UNSPEC_APX_CFCMOV
cost.
* config/i386/i386.h (TARGET_APX_CFCMOV): Define.
* config/i386/i386.md (maskload): New define_expand.
(maskstore): Ditto.
(*cfmovcc): New define_insn.
(*cfmovcc_2): Ditto.
(*cfmovccz): Ditto.
(UNSPEC_APX_CFCMOV): New unspec for cfcmov.
* config/i386/i386.opt: Add enum value for cfcmov.

gcc/testsuite/ChangeLog:

* gcc.target/i386/apx-cfcmov-1.c: New test.
* gcc.target/i386/apx-cfcmov-2.c: Ditto.
---
 gcc/config/i386/i386-expand.cc   | 47 +
 gcc/config/i386/i386-opts.h  |  4 +-
 gcc/config/i386/i386-protos.h|  1 +
 gcc/config/i386/i386.cc  | 16 +++--
 gcc/config/i386/i386.h   |  1 +
 gcc/config/i386/i386.md  | 74 +++-
 gcc/config/i386/i386.opt |  3 +
 gcc/testsuite/gcc.target/i386/apx-cfcmov-1.c | 73 +++
 gcc/testsuite/gcc.target/i386/apx-cfcmov-2.c | 40 +++
 9 files changed, 253 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-cfcmov-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-cfcmov-2.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 36011cc6b35..c956bd96edb 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -3529,6 +3529,53 @@ ix86_expand_int_addcc (rtx operands[])
   return true;
 }
 
+void
+ix86_expand_int_cfmovcc (rtx operands[])
+{
+  machine_mode mode = GET_MODE (operands[0]);
+  enum rtx_code code = GET_CODE (operands[1]);
+  rtx_insn *compare_seq;
+  rtx compare_op;
+  rtx op0 = XEXP (operands[1], 0);
+  rtx op1 = XEXP (operands[1], 1);
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+
+  gcc_assert (may_trap_or_fault_p (op2) || may_trap_or_fault_p (op3));
+  /* For Conditional store only handle "if (test) *x = a; else skip;".  */
+  if (MEM_P (operands[0]))
+gcc_assert (operands[0] == op3);
+
+  start_sequence ();
+  compare_op = ix86_expand_compare (code, op0, op1);
+  compare_seq = get_insns ();
+  end_sequence ();
+
+  if (may_trap_or_fault_p (op2))
+op2 = gen_rtx_UNSPEC (mode, gen_rtvec (1, operands[2]),
+ UNSPEC_APX_CFCMOV);
+  if (may_trap_or_fault_p (op3))
+op3 = gen_rtx_UNSPEC (mode, gen_rtvec (1, operands[3]),
+ UNSPEC_APX_CFCMOV);
+  emit_insn (compare_seq);
+  /* For "if (test) x = *a; else x = *b",generate 2 cfcmov.  */
+  if (may_trap_or_fault_p (op2) && may_trap_or_fault_p (op3))
+{
+  emit_insn (gen_rtx_SET (operands[0],
+ gen_rtx_IF_THEN_ELSE (mode, compare_op,
+   op2, operands[0])));
+  emit_insn (gen_rtx_SET (operands[0],
+ gen_rtx_IF_THEN_ELSE (mode, compare_op,
+   operands[0], op3)));
+}
+  /* For conditional load one mem, like "if (test) x = *a; else x = b/0."
+ and "if (test) x = b/0; else x = *b".  */
+  else
+emit_insn (gen_rtx_SET (operands[0],
+   gen_rtx_IF_THEN_ELSE (mode, compare_op,
+ op2, op3)));
+}
+
 bool
 ix86_expand_int_movcc (rtx operands[])
 {
diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h
index 35542b28936..a11c800448b 100644
--- a/gcc/config/i386/i386-opts.h
+++ b/gcc/config/i386/i386-opts.h
@@ -143,8 +143,10 @@ enum apx_features {
   apx_nf = 1 << 4,
   apx_ccmp = 1 << 5,
   apx_zu = 1 << 6,
+  apx_cfcmov = 1 << 7,
   apx_all = apx_egpr | apx_push2pop2 | apx_ndd
-   | apx_ppx | apx_nf | apx_ccmp | apx_zu,
+   | apx_ppx | apx_nf | apx_ccmp | apx_zu
+   | apx_cfcmov,
 };
 
 #endif
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index c1f9147769c..eacd38b5bc5 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -153,6 +153,7 @@ extern bool ix86_match_ccmode (rtx, machine_mode);
 extern bool ix86_match_ptest_ccmode (rtx);
 extern void ix86_expand_branch (enum rtx_code, rtx, rtx, rtx);
 extern void ix86_expand_setcc (rtx, enum rtx_code, rtx, rtx);
+extern void ix86_expand_int_cfmovcc (rtx[]);
 extern bool ix86_expand_int_movcc (rtx[]);
 extern bool ix86_expand_fp_movcc (rtx[]);
 extern bool ix86_expand_fp_vcond (rtx[]);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 473e4cbf10e..5ec5d81bf10 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22480,10 +22480,18 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_

RE: Patch ping - [PATCH] [APX EGPR] Fix indirect call prefix

2024-11-24 Thread Kong, Lingling
Hi,

LGTM.
Now Hongyu and Hongtao are working on APX.

Thanks,
Lingling

> -Original Message-
> From: Gregory Kanter 
> Sent: Saturday, November 23, 2024 8:16 AM
> To: gcc-patches@gcc.gnu.org
> Cc: Kong, Lingling ; Gregory Kanter
> 
> Subject: Patch ping - [PATCH] [APX EGPR] Fix indirect call prefix
> 
> Hello,
> I would like to ping the patch
> https://gcc.gnu.org/pipermail/gcc-patches/2024-November/668105.html
> please.
> 
> Also CC'ing someone who is working on APX, sorry if this is frowned upon.
> 
> Thanks.


[PATCH] i386: Fix _mm_[u]comixx_{ss,sd} codegen and add PF result. [PR106113]

2022-07-13 Thread Kong, Lingling via Gcc-patches
Hi,

The patch is to fix _mm_[u]comixx_{ss,sd} codegen and add PF result.  These 
intrinsics have changed over time, like `_mm_comieq_ss ` old operation is 
`RETURN ( a[31:0] == b[31:0] ) ? 1 : 0`, and new operation update is `RETURN ( 
a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] == b[31:0] ) ? 1 : 0`.

OK for master?

gcc/ChangeLog:

PR target/106113
* config/i386/i386-builtin.def (BDESC): Fix [u]comi{ss,sd}
comparison due to intrinsics changed over time.
* config/i386/i386-expand.cc (ix86_ssecom_setcc):
Add unordered check and mode for sse comi codegen.
(ix86_expand_sse_comi): Add unordered check and check a different
CCmode.
(ix86_expand_sse_comi_round): Extract unordered check and mode part
in ix86_ssecom_setcc.

gcc/testsuite/ChangeLog:

PR target/106113
* gcc.target/i386/avx-vcomisd-pr106113-2.c: New test.
* gcc.target/i386/avx-vcomiss-pr106113-2.c: Ditto.
* gcc.target/i386/avx-vucomisd-pr106113-2.c: Ditto.
* gcc.target/i386/avx-vucomiss-pr106113-2.c: Ditto.
* gcc.target/i386/sse-comiss-pr106113-1.c: Ditto.
* gcc.target/i386/sse-comiss-pr106113-2.c: Ditto.
* gcc.target/i386/sse-ucomiss-pr106113-1.c: Ditto.
* gcc.target/i386/sse-ucomiss-pr106113-2.c: Ditto.
* gcc.target/i386/sse2-comisd-pr106113-1.c: Ditto.
* gcc.target/i386/sse2-comisd-pr106113-2.c: Ditto.
* gcc.target/i386/sse2-ucomisd-pr106113-1.c: Ditto.
* gcc.target/i386/sse2-ucomisd-pr106113-2.c: Ditto.
---
 gcc/config/i386/i386-builtin.def  |  32 ++--
 gcc/config/i386/i386-expand.cc| 140 +++---
 .../gcc.target/i386/avx-vcomisd-pr106113-2.c  |   8 +
 .../gcc.target/i386/avx-vcomiss-pr106113-2.c  |   8 +
 .../gcc.target/i386/avx-vucomisd-pr106113-2.c |   8 +
 .../gcc.target/i386/avx-vucomiss-pr106113-2.c |   8 +
 .../gcc.target/i386/sse-comiss-pr106113-1.c   |  19 +++
 .../gcc.target/i386/sse-comiss-pr106113-2.c   |  59 
 .../gcc.target/i386/sse-ucomiss-pr106113-1.c  |  19 +++
 .../gcc.target/i386/sse-ucomiss-pr106113-2.c  |  59 
 .../gcc.target/i386/sse2-comisd-pr106113-1.c  |  19 +++
 .../gcc.target/i386/sse2-comisd-pr106113-2.c  |  59 
 .../gcc.target/i386/sse2-ucomisd-pr106113-1.c |  19 +++
 .../gcc.target/i386/sse2-ucomisd-pr106113-2.c |  59 
 14 files changed, 450 insertions(+), 66 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-vcomisd-pr106113-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-vcomiss-pr106113-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-vucomisd-pr106113-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-vucomiss-pr106113-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-comiss-pr106113-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-comiss-pr106113-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-ucomiss-pr106113-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-ucomiss-pr106113-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-comisd-pr106113-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-comisd-pr106113-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-ucomisd-pr106113-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-ucomisd-pr106113-2.c

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index fd160935e67..acb7e8ca64b 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -35,30 +35,30 @@
 IX86_BUILTIN__BDESC_##NEXT_KIND##_FIRST - 1.  */
 
 BDESC_FIRST (comi, COMI,
-   OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comieq", 
IX86_BUILTIN_COMIEQSS, UNEQ, 0)
-BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comilt", 
IX86_BUILTIN_COMILTSS, UNLT, 0)
-BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comile", 
IX86_BUILTIN_COMILESS, UNLE, 0)
+   OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comieq", 
IX86_BUILTIN_COMIEQSS, EQ, 0)
+BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comilt", 
IX86_BUILTIN_COMILTSS, LT, 0)
+BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comile", 
IX86_BUILTIN_COMILESS, LE, 0)
 BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comigt", 
IX86_BUILTIN_COMIGTSS, GT, 0)
 BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comige", 
IX86_BUILTIN_COMIGESS, GE, 0)
-BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_comi, "__builtin_ia32_comineq", 
IX86_BUILTIN_COMINEQSS, LTGT, 0)
-BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", 
IX86_BUILTIN_UCOMIEQSS, UNEQ, 0)
-BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", 
IX86_BUILTIN_UCOMILTSS, UNLT, 0)
-BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", 
IX86_BUILTIN_UCOMILESS, UNLE, 0)
+BDESC (OPTION_MASK_ISA_SSE, 0, CODE

[PATCH] x86: Enable __bf16 type for TARGET_SSE2 and above

2022-07-25 Thread Kong, Lingling via Gcc-patches
Hi,

The patch is to enable the __bf16 scalar type for target sse2 and above according to
psABI(https://gitlab.com/x86-psABIs/x86-64-ABI/-/merge_requests/35/diffs).
The __bf16 type is a storage type like arm.

OK for master?

gcc/ChangeLog:

* config/i386/i386-builtin-types.def (BFLOAT16): New primitive type.
* config/i386/i386-builtins.cc: Support __bf16 type for i386 backend.
(ix86_register_bf16_builtin_type): New function.
(ix86_bf16_type_node): New.
(ix86_bf16_ptr_type_node): Ditto.
(ix86_init_builtin_types): Add ix86_register_bf16_builtin_type function 
call.
* config/i386/i386-modes.def (FLOAT_MODE): Add BFmode.
(ADJUST_FLOAT_FORMAT): Ditto.
* config/i386/i386.cc (merge_classes): Handle BFmode.
(classify_argument): Ditto.
(examine_argument): Ditto.
(construct_container): Ditto.
(function_value_32): Return __bf16 by %xmm0.
(function_value_64): Return __bf16 by SSE register.
(ix86_print_operand): Handle CONST_DOUBLE BFmode.
(ix86_secondary_reload): Require gpr as intermediate register
to store __bf16 from sse register when sse4 is not available.
(ix86_scalar_mode_supported_p): Enable __bf16 under sse2.
(ix86_mangle_type): Add mangling for __bf16 type.
(ix86_invalid_conversion): New function for target hook.
(ix86_invalid_unary_op): Ditto.
(ix86_invalid_binary_op): Ditto.
(TARGET_INVALID_CONVERSION): New define for target hook.
(TARGET_INVALID_UNARY_OP): Ditto.
(TARGET_INVALID_BINARY_OP): Ditto.
* config/i386/i386.h (host_detect_local_cpu): Add BFmode.
* config/i386/i386.md (*pushhf_rex64): Change for BFmode.
(*push_rex64): Ditto.
(*pushhf): Ditto.
(*push): Ditto.
(*movhf_internal): Ditto.
(*mov_internal): Ditto.

gcc/testsuite/ChangeLog:

* g++.target/i386/bfloat_cpp_typecheck.C: New test.
* gcc.target/i386/bfloat16-1.c: Ditto.
* gcc.target/i386/sse2-bfloat16-1.c: Ditto.
* gcc.target/i386/sse2-bfloat16-2.c: Ditto.
* gcc.target/i386/sse2-bfloat16-scalar-typecheck.c: Ditto.
---
 gcc/config/i386/i386-builtin-types.def|   1 +
 gcc/config/i386/i386-builtins.cc  |  21 ++
 gcc/config/i386/i386-modes.def|   2 +
 gcc/config/i386/i386.cc   |  75 +-
 gcc/config/i386/i386.h|   4 +-
 gcc/config/i386/i386.md   |  32 +--
 .../g++.target/i386/bfloat_cpp_typecheck.C|  10 +
 gcc/testsuite/gcc.target/i386/bfloat16-1.c|  12 +
 .../gcc.target/i386/sse2-bfloat16-1.c |   8 +
 .../gcc.target/i386/sse2-bfloat16-2.c |  17 ++
 .../i386/sse2-bfloat16-scalar-typecheck.c | 215 ++
 11 files changed, 375 insertions(+), 22 deletions(-)  create mode 100644 
gcc/testsuite/g++.target/i386/bfloat_cpp_typecheck.C
 create mode 100644 gcc/testsuite/gcc.target/i386/bfloat16-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-bfloat16-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-bfloat16-2.c
 create mode 100644 
gcc/testsuite/gcc.target/i386/sse2-bfloat16-scalar-typecheck.c

diff --git a/gcc/config/i386/i386-builtin-types.def 
b/gcc/config/i386/i386-builtin-types.def
index 7a2da1db0b0..63a360b0f8b 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -69,6 +69,7 @@ DEF_PRIMITIVE_TYPE (UINT16, short_unsigned_type_node)  
DEF_PRIMITIVE_TYPE (INT64, long_long_integer_type_node)  DEF_PRIMITIVE_TYPE 
(UINT64, long_long_unsigned_type_node)  DEF_PRIMITIVE_TYPE (FLOAT16, 
ix86_float16_type_node)
+DEF_PRIMITIVE_TYPE (BFLOAT16, ix86_bf16_type_node)
 DEF_PRIMITIVE_TYPE (FLOAT, float_type_node)  DEF_PRIMITIVE_TYPE (DOUBLE, 
double_type_node)  DEF_PRIMITIVE_TYPE (FLOAT80, float80_type_node) diff --git 
a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc
index fe7243c3837..6a04fb57e65 100644
--- a/gcc/config/i386/i386-builtins.cc
+++ b/gcc/config/i386/i386-builtins.cc
@@ -126,6 +126,9 @@ BDESC_VERIFYS (IX86_BUILTIN_MAX,  static GTY(()) tree 
ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
 
 tree ix86_float16_type_node = NULL_TREE;
+tree ix86_bf16_type_node = NULL_TREE;
+tree ix86_bf16_ptr_type_node = NULL_TREE;
+
 /* Retrieve an element from the above table, building some of
the types lazily.  */
 
@@ -1366,6 +1369,22 @@ ix86_register_float16_builtin_type (void)
"_Float16");
 }
 
+static void
+ix86_register_bf16_builtin_type (void)
+{
+  ix86_bf16_type_node = make_node (REAL_TYPE);
+  TYPE_PRECISION (ix86_bf16_type_node) = 16;
+  SET_TYPE_MODE (ix86_bf16_type_node, BFmode);
+  layout_type (ix86_bf16_type_node);
+
+  if (!maybe_get_identifier ("__bf16") && TARGET_SSE2)
+{
+  lang_hooks.types.register_builtin_type (ix86_bf16_type_node,
+   "__bf1

RE: [PATCH] x86: Enable __bf16 type for TARGET_SSE2 and above

2022-08-03 Thread Kong, Lingling via Gcc-patches
Hi,

The old patch had a mistake in `*movbf_internal`; the new patch disables BFmode
constant double moves in `*movbf_internal`.

Thanks,
Lingling

> -Original Message-
> From: Kong, Lingling 
> Sent: Tuesday, July 26, 2022 9:31 AM
> To: Liu, Hongtao ; gcc-patches@gcc.gnu.org
> Cc: Kong, Lingling 
> Subject: [PATCH] x86: Enable __bf16 type for TARGET_SSE2 and above
> 
> Hi,
> 
> The patch is enable __bf16 scalar type for target sse2 and above according to
> psABI(https://gitlab.com/x86-psABIs/x86-64-ABI/-/merge_requests/35/diffs).
> The __bf16 type is a storage type like arm.
> 
> OK for master?
> 
> gcc/ChangeLog:
> 
>   * config/i386/i386-builtin-types.def (BFLOAT16): New primitive type.
>   * config/i386/i386-builtins.cc : Support __bf16 type for i386 backend.
>   (ix86_register_bf16_builtin_type): New function.
>   (ix86_bf16_type_node): New.
>   (ix86_bf16_ptr_type_node): Ditto.
>   (ix86_init_builtin_types): Add ix86_register_bf16_builtin_type function
> call.
>   * config/i386/i386-modes.def (FLOAT_MODE): Add BFmode.
>   (ADJUST_FLOAT_FORMAT): Ditto.
>   * config/i386/i386.cc (merge_classes): Handle BFmode.
>   (classify_argument): Ditto.
>   (examine_argument): Ditto.
>   (construct_container): Ditto.
>   (function_value_32): Return __bf16 by %xmm0.
>   (function_value_64): Return __bf16 by SSE register.
>   (ix86_print_operand): Handle CONST_DOUBLE BFmode.
>   (ix86_secondary_reload): Require gpr as intermediate register
>   to store __bf16 from sse register when sse4 is not available.
>   (ix86_scalar_mode_supported_p): Enable __bf16 under sse2.
>   (ix86_mangle_type): Add mangling for __bf16 type.
>   (ix86_invalid_conversion): New function for target hook.
>   (ix86_invalid_unary_op): Ditto.
>   (ix86_invalid_binary_op): Ditto.
>   (TARGET_INVALID_CONVERSION): New define for target hook.
>   (TARGET_INVALID_UNARY_OP): Ditto.
>   (TARGET_INVALID_BINARY_OP): Ditto.
>   * config/i386/i386.h (host_detect_local_cpu): Add BFmode.
>   * config/i386/i386.md (*pushhf_rex64): Change for BFmode.
>   (*push_rex64): Ditto.
>   (*pushhf): Ditto.
>   (*push): Ditto.
>   (*movhf_internal): Ditto.
>   (*mov_internal): Ditto.
> 
> gcc/testsuite/ChangeLog:
> 
>   * g++.target/i386/bfloat_cpp_typecheck.C: New test.
>   * gcc.target/i386/bfloat16-1.c: Ditto.
>   * gcc.target/i386/sse2-bfloat16-1.c: Ditto.
>   * gcc.target/i386/sse2-bfloat16-2.c: Ditto.
>   * gcc.target/i386/sse2-bfloat16-scalar-typecheck.c: Ditto.
> ---
>  gcc/config/i386/i386-builtin-types.def|   1 +
>  gcc/config/i386/i386-builtins.cc  |  21 ++
>  gcc/config/i386/i386-modes.def|   2 +
>  gcc/config/i386/i386.cc   |  75 +-
>  gcc/config/i386/i386.h|   4 +-
>  gcc/config/i386/i386.md   |  32 +--
>  .../g++.target/i386/bfloat_cpp_typecheck.C|  10 +
>  gcc/testsuite/gcc.target/i386/bfloat16-1.c|  12 +
>  .../gcc.target/i386/sse2-bfloat16-1.c |   8 +
>  .../gcc.target/i386/sse2-bfloat16-2.c |  17 ++
>  .../i386/sse2-bfloat16-scalar-typecheck.c | 215 ++
>  11 files changed, 375 insertions(+), 22 deletions(-)  create mode 100644
> gcc/testsuite/g++.target/i386/bfloat_cpp_typecheck.C
>  create mode 100644 gcc/testsuite/gcc.target/i386/bfloat16-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-bfloat16-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-bfloat16-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-bfloat16-scalar-
> typecheck.c
> 
> diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-
> builtin-types.def
> index 7a2da1db0b0..63a360b0f8b 100644
> --- a/gcc/config/i386/i386-builtin-types.def
> +++ b/gcc/config/i386/i386-builtin-types.def
> @@ -69,6 +69,7 @@ DEF_PRIMITIVE_TYPE (UINT16,
> short_unsigned_type_node)  DEF_PRIMITIVE_TYPE (INT64,
> long_long_integer_type_node)  DEF_PRIMITIVE_TYPE (UINT64,
> long_long_unsigned_type_node)  DEF_PRIMITIVE_TYPE (FLOAT16,
> ix86_float16_type_node)
> +DEF_PRIMITIVE_TYPE (BFLOAT16, ix86_bf16_type_node)
>  DEF_PRIMITIVE_TYPE (FLOAT, float_type_node)  DEF_PRIMITIVE_TYPE
> (DOUBLE, double_type_node)  DEF_PRIMITIVE_TYPE (FLOAT80,
> float80_type_node) diff --git a/gcc/config/i386/i386-builtins.cc
> b/gcc/config/i386/i386-builtins.cc
> index fe7243c3837..6a04fb57e65 100644
> --- a/gcc/config/i386/i386-builtins.cc
> +++ b/gcc/config/i386/i386-builtins.cc
> @@ -126,6 +126,9 @@ BDESC_VERIFYS (IX86_BUILTIN_MAX,  static GTY(()) tree
> ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1]

[PATCH] i386: Fix _mm512_fpclass_ps_mask in O0 [PR 101471]

2021-08-24 Thread Kong, Lingling via Gcc-patches
Hi,

For _mm512_fpclass_ps_mask at -O0, the mask should be (__mmask16)-1 instead of
(__mmask8)-1.

Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
Ok for master?

gcc/ChangeLog:

* config/i386/avx512dqintrin.h: Fix the _mm512_fpclass_ps_mask define at -O0.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512f-pr101471.c: New test.


0001-i386-Fix-_mm512_fpclass_ps_mask-in-O0-PR-101471.patch
Description: 0001-i386-Fix-_mm512_fpclass_ps_mask-in-O0-PR-101471.patch


[PATCH] i386: Fix wrong optimization for consecutive masked scatters [PR 101472]

2021-08-24 Thread Kong, Lingling via Gcc-patches
Hi,

For avx512f_scattersi, the mask operand only affects the set src; we
need to refine the pattern to let gcc know the mask register also affects the dest.
So we put mask operand into UNSPEC_VSIBADDR.

Bootstrapped and regression tested on x86_64-linux-gnu{-m32,-m64}.
Ok for master?

gcc/ChangeLog:

*config/i386/sse.md (scattersi): Add mask operand to
UNSPEC_VSIBADDR.
(scattersi): Likewise.
(*avx512f_scattersi): Merge mask operand
to set_dest.
(*avx512f_scatterdi): Likewise

gcc/testsuite/ChangeLog:

*gcc.target/i386/avx512f-pr101472.c: New test.
*gcc.target/i386/avx512vl-pr101472.c: Ditto.


0001-i386-Fix-wrong-optimization-for-consecutive-masked-s.patch
Description: 0001-i386-Fix-wrong-optimization-for-consecutive-masked-s.patch


[PATCH] i386: Fix wrong optimization for consecutive masked scatters [PR 101472]

2021-08-26 Thread Kong, Lingling via Gcc-patches
Hi,

For avx512f_scattersi, the mask operand only affects the set src; we need to
refine the pattern to let gcc know the mask register also affects the dest.
So we put mask operand into UNSPEC_VSIBADDR.

Bootstrapped and regression tested on x86_64-linux-gnu{-m32,-m64}.
Ok for master?

gcc/ChangeLog:

PR target/101472
* config/i386/sse.md: (scattersi): Add mask operand to
UNSPEC_VSIBADDR.
(scattersi): Likewise.
(*avx512f_scattersi): Merge mask operand to set_dest.
(*avx512f_scatterdi): Likewise

gcc/testsuite/ChangeLog:

PR target/101472
* gcc.target/i386/avx512f-pr101472.c: New test.
* gcc.target/i386/avx512vl-pr101472.c: New test.
---
 gcc/config/i386/sse.md| 20 +++--
 .../gcc.target/i386/avx512f-pr101472.c| 49 
 .../gcc.target/i386/avx512vl-pr101472.c   | 79 +++
 3 files changed, 140 insertions(+), 8 deletions(-)  create mode 100644 
gcc/testsuite/gcc.target/i386/avx512f-pr101472.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-pr101472.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 
03fc2df1fb0..a3055dbd316 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -24205,8 +24205,9 @@
   "TARGET_AVX512F"
 {
   operands[5]
-= gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[0], operands[2],
-   operands[4]), UNSPEC_VSIBADDR);
+= gen_rtx_UNSPEC (Pmode, gen_rtvec (4, operands[0], operands[2],
+   operands[4], operands[1]), 
+   UNSPEC_VSIBADDR);
 })
 
 (define_insn "*avx512f_scattersi"
@@ -24214,10 +24215,11 @@
  [(unspec:P
 [(match_operand:P 0 "vsib_address_operand" "Tv")
  (match_operand: 2 "register_operand" "v")
- (match_operand:SI 4 "const1248_operand" "n")]
+ (match_operand:SI 4 "const1248_operand" "n")
+ (match_operand: 6 "register_operand" "1")]
 UNSPEC_VSIBADDR)])
(unspec:VI48F
- [(match_operand: 6 "register_operand" "1")
+ [(match_dup 6)
   (match_operand:VI48F 3 "register_operand" "v")]
  UNSPEC_SCATTER))
(clobber (match_scratch: 1 "=&Yk"))] @@ -24243,8 +24245,9 
@@
   "TARGET_AVX512F"
 {
   operands[5]
-= gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[0], operands[2],
-   operands[4]), UNSPEC_VSIBADDR);
+= gen_rtx_UNSPEC (Pmode, gen_rtvec (4, operands[0], operands[2],
+   operands[4], operands[1]), 
+   UNSPEC_VSIBADDR);
 })
 
 (define_insn "*avx512f_scatterdi"
@@ -24252,10 +24255,11 @@
  [(unspec:P
 [(match_operand:P 0 "vsib_address_operand" "Tv")
  (match_operand: 2 "register_operand" "v")
- (match_operand:SI 4 "const1248_operand" "n")]
+ (match_operand:SI 4 "const1248_operand" "n")
+ (match_operand:QI 6 "register_operand" "1")]
 UNSPEC_VSIBADDR)])
(unspec:VI48F
- [(match_operand:QI 6 "register_operand" "1")
+ [(match_dup 6)
   (match_operand: 3 "register_operand" "v")]
  UNSPEC_SCATTER))
(clobber (match_scratch:QI 1 "=&Yk"))] diff --git 
a/gcc/testsuite/gcc.target/i386/avx512f-pr101472.c 
b/gcc/testsuite/gcc.target/i386/avx512f-pr101472.c
new file mode 100644
index 000..89c6603c2ff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr101472.c
@@ -0,0 +1,49 @@
+/* PR target/101472 */
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2" } */
+/* { dg-final { scan-assembler-times "vpscatterqd\[ 
+\\t\]+\[^\{\n\]*ymm\[0-9\]\[^\n\]*zmm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ 
+\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vpscatterdd\[ 
+\\t\]+\[^\{\n\]*zmm\[0-9\]\[^\n\]*zmm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ 
+\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vpscatterqq\[ 
+\\t\]+\[^\{\n\]*zmm\[0-9\]\[^\n\]*zmm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ 
+\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vpscatterdq\[ 
+\\t\]+\[^\{\n\]*zmm\[0-9\]\[^\n\]*ymm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ 
+\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vscatterqps\[ 
+\\t\]+\[^\{\n\]*ymm\[0-9\]\[^\n\]*zmm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ 
+\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vscatterdps\[ 
+\\t\]+\[^\{\n\]*zmm\[0-9\]\[^\n\]*zmm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ 
+\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vscatterqpd\[ 
+\\t\]+\[^\{\n\]*zmm\[0-9\]\[^\n\]*zmm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ 
+\\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vscatterdpd\[ 
+\\t\]+\[^\{\n\]*zmm\[0-9\]\[^\n\]*ymm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ 
+\\t\]+#)" 2 } } */
+
+#include 
+
+void two_scatters_epi32(void* addr, __mmask8 k1, __mmask8 k2, __m512i vindex, 
+__m256i a, __m512i b)

[PATCH] i386: Fixed vec_init_dup_v16bf [PR106887]

2022-09-14 Thread Kong, Lingling via Gcc-patches
Hi

The patch is to fix vec_init_dup_v16bf, add correct handle for v16bf mode in 
ix86_expand_vector_init_duplicate.
Add testcase with sse2 without avx2.

OK for master? 

gcc/ChangeLog:

PR target/106887
* config/i386/i386-expand.cc (ix86_expand_vector_init_duplicate):
Fixed V16BF mode case.

gcc/testsuite/ChangeLog:

PR target/106887
* gcc.target/i386/vect-bfloat16-2c.c: New test.
---
 gcc/config/i386/i386-expand.cc|  1 +
 .../gcc.target/i386/vect-bfloat16-2c.c| 76 +++
 2 files changed, 77 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-bfloat16-2c.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc 
index d7b49c99dc8..9451c561489 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -15111,6 +15111,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, 
machine_mode mode,
{
  machine_mode hvmode = (mode == V16HImode ? V8HImode
 : mode == V16HFmode ? V8HFmode
+: mode == V16BFmode ? V8BFmode
 : V16QImode);
  rtx x = gen_reg_rtx (hvmode);
 
diff --git a/gcc/testsuite/gcc.target/i386/vect-bfloat16-2c.c 
b/gcc/testsuite/gcc.target/i386/vect-bfloat16-2c.c
new file mode 100644
index 000..bead94e46a1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-bfloat16-2c.c
@@ -0,0 +1,76 @@
+/* { dg-do compile } */
+/* { dg-options "-mf16c -msse2 -mno-avx2 -O2" } */
+
+typedef __bf16 v8bf __attribute__ ((__vector_size__ (16))); typedef 
+__bf16 v16bf __attribute__ ((__vector_size__ (32)));
+
+#define VEC_EXTRACT(V,S,IDX)   \
+  S\
+  __attribute__((noipa))   \
+  vec_extract_##V##_##IDX (V v)\
+  {\
+return v[IDX]; \
+  }
+
+#define VEC_SET(V,S,IDX)   \
+  V\
+  __attribute__((noipa))   \
+  vec_set_##V##_##IDX (V v, S s)   \
+  {\
+v[IDX] = s;\
+return v;  \
+  }
+
+v8bf
+vec_init_v8bf (__bf16 a1, __bf16 a2, __bf16 a3, __bf16 a4,
+  __bf16 a5,  __bf16 a6, __bf16 a7, __bf16 a8) {
+return __extension__ (v8bf) {a1, a2, a3, a4, a5, a6, a7, a8}; }
+
+v16bf
+vec_init_v16bf (__bf16 a1, __bf16 a2, __bf16 a3, __bf16 a4,
+  __bf16 a5,  __bf16 a6, __bf16 a7, __bf16 a8,
+  __bf16 a9,  __bf16 a10, __bf16 a11, __bf16 a12,
+  __bf16 a13,  __bf16 a14, __bf16 a15, __bf16 a16) {
+return __extension__ (v16bf) {a1, a2, a3, a4, a5, a6, a7, a8,
+ a9, a10, a11, a12, a13, a14, a15, a16}; }
+
+v8bf
+vec_init_dup_v8bf (__bf16 a1)
+{
+return __extension__ (v8bf) {a1, a1, a1, a1, a1, a1, a1, a1}; }
+
+v16bf
+vec_init_dup_v16bf (__bf16 a1)
+{
+return __extension__ (v16bf) {a1, a1, a1, a1, a1, a1, a1, a1,
+ a1, a1, a1, a1, a1, a1, a1, a1};
+}
+
+/* { dg-final { scan-assembler-times "vpunpcklwd" 12 } } */
+/* { dg-final { scan-assembler-times "vpunpckldq" 6 } } */
+/* { dg-final { scan-assembler-times "vpunpcklqdq" 3 } } */
+
+VEC_EXTRACT (v8bf, __bf16, 0);
+VEC_EXTRACT (v8bf, __bf16, 4);
+VEC_EXTRACT (v16bf, __bf16, 0);
+VEC_EXTRACT (v16bf, __bf16, 3);
+VEC_EXTRACT (v16bf, __bf16, 8);
+VEC_EXTRACT (v16bf, __bf16, 15);
+/* { dg-final { scan-assembler-times "vpsrldq\[\t ]*\\\$8" 1 } } */
+/* { dg-final { scan-assembler-times "vpsrldq\[\t ]*\\\$6" 1 } } */
+/* { dg-final { scan-assembler-times "vpsrldq\[\t ]*\\\$14" 1 } } */
+/* { dg-final { scan-assembler-times "vextract" 4 } } */
+
+VEC_SET (v8bf, __bf16, 4);
+VEC_SET (v16bf, __bf16, 3);
+VEC_SET (v16bf, __bf16, 8);
+VEC_SET (v16bf, __bf16, 15);
+/* { dg-final { scan-assembler-times "vpblendw" 3 { target { ! ia32 } } 
+} } */
+
+/* { dg-final { scan-assembler-times "vpinsrw" 30 { target ia32 } } } 
+*/
+
--
2.18.2



RE: [PATCH] Enhance final_value_replacement_loop to handle bitop with an invariant induction.[PR105735]

2022-09-15 Thread Kong, Lingling via Gcc-patches
Hi Richard,

Thanks again for your reviewing.

> Yes, use else if for the bitwise induction.  Can you also make the new case
> conditional on 'def'
> (the compute_overall_effect_of_inner_loop) being chrec_dont_know?  If that
> call produced something useful it will not be of either of the two special 
> forms.
> Thus like
> 
>   if (def != chrec_dont_know)
> /* Already OK.  */
> ;
>  else if ((bitinv_def = ...)
> ..
>  else if (tree_fits_uhwi_p (niter)
>  ... bitwise induction case...)
> ...
>
Yes, I fixed it in new patch. Thanks.
Ok for master ?

Thanks,
Lingling

> -Original Message-
> From: Richard Biener 
> Sent: Wednesday, September 14, 2022 4:16 PM
> To: Kong, Lingling 
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao 
> Subject: Re: [PATCH] Enhance final_value_replacement_loop to handle bitop
> with an invariant induction.[PR105735]
> 
> On Tue, Sep 13, 2022 at 9:54 AM Kong, Lingling 
> wrote:
> >
> > Hi Richard,
> >
> > Thanks you so much for reviewing this patch.  I really appreciate it. For 
> > these
> review comments, I have made some changes.
> >
> > > That's a single-stmt match, you shouldn't use match.pd matching for this.
> > > Instead just do
> > >
> > >   if (is_gimple_assign (stmt)
> > >   && ((code = gimple_assign_rhs_code (stmt)), true)
> > >   && (code == BIT_AND_EXPR || code == BIT_IOR_EXPR || code ==
> > > BIT_XOR_EXPR))
> >
> > Yes, I fixed it and dropped modification for match.pd.
> >
> > > and pick gimple_assign_rhs{1,2} (stmt) as the operands.  The :c in
> > > bit_op:c is redundant btw. - while the name suggests "with
> > > invariant" you don't actually check for that.  But again, given
> > > canonicalization rules the invariant will be rhs2 so above add
> > >
> > > && TREE_CODE (gimple_assign_rhs2 (stmt)) == INTEGER_CST
> >
> > For " with invariant", this needed op1 is invariant, and I used
> `expr_invariant_in_loop_p (loop, match_op[0])` for check.
> > And op2 just be PHI is ok. If op2 is INTEGER_CST, existing gcc can be 
> > directly
> optimized and do not need modification.
> >
> > > you probably need dg-require-effective-target longlong, but is it
> > > necessary to use long long for the testcases in the first place?
> > > The IV seems to be unused, if it should match the variables bit size
> > > use sizeof
> > > (type) * 8
> >
> > Yes, It is not necessary to use long long for the testcases. I changed type 
> > to
> unsigned int.
> >
> > > > +  inv = PHI_ARG_DEF_FROM_EDGE (header_phi, loop_preheader_edge
> > > > + (loop));  return fold_build2 (code1, type, inv, match_op[0]); }
> > >
> > > The } goes to the next line.
> >
> > Sorry, It might be something wrong with my use of gcc send-email format.
> >
> > > > +  tree bitinv_def;
> > > > +  if ((bitinv_def
> > >
> > > please use else if here
> >
> > Sorry, If use the else if here, there is no corresponding above if. I'm not 
> > sure if
> you mean change bitwise induction expression if to else if.
> 
> Yes, use else if for the bitwise induction.  Can you also make the new case
> conditional on 'def'
> (the compute_overall_effect_of_inner_loop) being chrec_dont_know?  If that
> call produced something useful it will not be of either of the two special 
> forms.
> Thus like
> 
>   if (def != chrec_dont_know)
> /* Already OK.  */
> ;
>  else if ((bitinv_def = ...)
> ..
>  else if (tree_fits_uhwi_p (niter)
>  ... bitwise induction case...)
> ...
> 
> ?
> 
> Otherwise looks OK now.
> 
> Thanks,
> Richard.
> 
> > Do you agree with these changes?  Thanks again for taking a look.
> >
> > Thanks,
> > Lingling
> >
> > > -Original Message-
> > > From: Richard Biener 
> > > Sent: Tuesday, August 23, 2022 3:27 PM
> > > To: Kong, Lingling 
> > > Cc: Liu, Hongtao ; gcc-patches@gcc.gnu.org
> > > Subject: Re: [PATCH] Enhance final_value_replacement_loop to handle
> > > bitop with an invariant induction.[PR105735]
> > >
> > > On Thu, Aug 18, 2022 at 8:48 AM Kong, Lingling via Gcc-patches  > > patc...@gcc.gnu.org> wrote:
> > > >
> > > > Hi,
> > > >
> > > > This patch is for pr105735/pr101991. It will enable below optimization:
> > > > {
> &g

RE: [PATCH] i386: Fixed vec_init_dup_v16bf [PR106887]

2022-09-16 Thread Kong, Lingling via Gcc-patches
Hi,
 
> >   machine_mode hvmode = (mode == V16HImode ? V8HImode
> >  : mode == V16HFmode ? V8HFmode
> > +: mode == V16BFmode ? V8BFmode
> Can it be written as switch case?
Sure, I fixed it in new patch. Thanks again for take a look.
OK for master ?

Thanks,
Lingling

> -Original Message-
> From: Hongtao Liu 
> Sent: Thursday, September 15, 2022 11:46 AM
> To: Kong, Lingling 
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao 
> Subject: Re: [PATCH] i386: Fixed vec_init_dup_v16bf [PR106887]
> 
> On Thu, Sep 15, 2022 at 11:36 AM Kong, Lingling via Gcc-patches  patc...@gcc.gnu.org> wrote:
> >
> > Hi
> >
> > The patch is to fix vec_init_dup_v16bf, add correct handle for v16bf mode in
> ix86_expand_vector_init_duplicate.
> > Add testcase with sse2 without avx2.
> >
> > OK for master?
> >
> > gcc/ChangeLog:
> >
> > PR target/106887
> > * config/i386/i386-expand.cc (ix86_expand_vector_init_duplicate):
> > Fixed V16BF mode case.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR target/106887
> > * gcc.target/i386/vect-bfloat16-2c.c: New test.
> > ---
> >  gcc/config/i386/i386-expand.cc|  1 +
> >  .../gcc.target/i386/vect-bfloat16-2c.c| 76 +++
> >  2 files changed, 77 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/vect-bfloat16-2c.c
> >
> > diff --git a/gcc/config/i386/i386-expand.cc
> > b/gcc/config/i386/i386-expand.cc index d7b49c99dc8..9451c561489 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -15111,6 +15111,7 @@ ix86_expand_vector_init_duplicate (bool
> mmx_ok, machine_mode mode,
> > {
> >   machine_mode hvmode = (mode == V16HImode ? V8HImode
> >  : mode == V16HFmode ? V8HFmode
> > +: mode == V16BFmode ? V8BFmode
> Can it be written as switch case?
> >  : V16QImode);
> >   rtx x = gen_reg_rtx (hvmode);
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/vect-bfloat16-2c.c
> > b/gcc/testsuite/gcc.target/i386/vect-bfloat16-2c.c
> > new file mode 100644
> > index 000..bead94e46a1
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/vect-bfloat16-2c.c
> > @@ -0,0 +1,76 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mf16c -msse2 -mno-avx2 -O2" } */
> > +
> > +typedef __bf16 v8bf __attribute__ ((__vector_size__ (16))); typedef
> > +__bf16 v16bf __attribute__ ((__vector_size__ (32)));
> > +
> > +#define VEC_EXTRACT(V,S,IDX)   \
> > +  S\
> > +  __attribute__((noipa))   \
> > +  vec_extract_##V##_##IDX (V v)\
> > +  {\
> > +return v[IDX]; \
> > +  }
> > +
> > +#define VEC_SET(V,S,IDX)   \
> > +  V\
> > +  __attribute__((noipa))   \
> > +  vec_set_##V##_##IDX (V v, S s)   \
> > +  {\
> > +v[IDX] = s;\
> > +return v;  \
> > +  }
> > +
> > +v8bf
> > +vec_init_v8bf (__bf16 a1, __bf16 a2, __bf16 a3, __bf16 a4,
> > +  __bf16 a5,  __bf16 a6, __bf16 a7, __bf16 a8) {
> > +return __extension__ (v8bf) {a1, a2, a3, a4, a5, a6, a7, a8}; }
> > +
> > +v16bf
> > +vec_init_v16bf (__bf16 a1, __bf16 a2, __bf16 a3, __bf16 a4,
> > +  __bf16 a5,  __bf16 a6, __bf16 a7, __bf16 a8,
> > +  __bf16 a9,  __bf16 a10, __bf16 a11, __bf16 a12,
> > +  __bf16 a13,  __bf16 a14, __bf16 a15, __bf16 a16) {
> > +return __extension__ (v16bf) {a1, a2, a3, a4, a5, a6, a7, a8,
> > + a9, a10, a11, a12, a13, a14, a15,
> > +a16}; }
> > +
> > +v8bf
> > +vec_init_dup_v8bf (__bf16 a1)
> > +{
> > +return __extension__ (v8bf) {a1, a1, a1, a1, a1, a1, a1, a1}; }
> > +
> > +v16bf
> > +vec_init_dup_v16bf (__bf16 a1)
> > +{
> > +return __extension__ (v16bf) {a1, a1, a1, a1, a1, a1, a1, a1,
> > + a1, a1, a1, a1, a1, a1, a1, a1}; }
> > +
> > +/* { dg-final { scan-assembler-times "vpunpcklwd"

RE: [PATCH] Enhance final_value_replacement_loop to handle bitop with an invariant induction.[PR105735]

2022-09-19 Thread Kong, Lingling via Gcc-patches
Thanks a lot, pushed to trunk.

> Hi Richard,
> 
> Thanks again for your reviewing.
> 
> > Yes, use else if for the bitwise induction.  Can you also make the new
> > case conditional on 'def'
> > (the compute_overall_effect_of_inner_loop) being chrec_dont_know?  If
> > that call produced something useful it will not be of either of the two 
> > special
> forms.
> > Thus like
> >
> >   if (def != chrec_dont_know)
> > /* Already OK.  */
> > ;
> >  else if ((bitinv_def = ...)
> > ..
> >  else if (tree_fits_uhwi_p (niter)
> >  ... bitwise induction case...)
> > ...
> >
> Yes, I fixed it in new patch. Thanks.
> Ok for master ?
> 
> Thanks,
> Lingling
> 
> > -Original Message-
> > From: Richard Biener 
> > Sent: Wednesday, September 14, 2022 4:16 PM
> > To: Kong, Lingling 
> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao 
> > Subject: Re: [PATCH] Enhance final_value_replacement_loop to handle
> > bitop with an invariant induction.[PR105735]
> >
> > On Tue, Sep 13, 2022 at 9:54 AM Kong, Lingling
> > 
> > wrote:
> > >
> > > Hi Richard,
> > >
> > > Thanks you so much for reviewing this patch.  I really appreciate
> > > it. For these
> > review comments, I have made some changes.
> > >
> > > > That's a single-stmt match, you shouldn't use match.pd matching for 
> > > > this.
> > > > Instead just do
> > > >
> > > >   if (is_gimple_assign (stmt)
> > > >   && ((code = gimple_assign_rhs_code (stmt)), true)
> > > >   && (code == BIT_AND_EXPR || code == BIT_IOR_EXPR || code ==
> > > > BIT_XOR_EXPR))
> > >
> > > Yes, I fixed it and dropped modification for match.pd.
> > >
> > > > and pick gimple_assign_rhs{1,2} (stmt) as the operands.  The :c in
> > > > bit_op:c is redundant btw. - while the name suggests "with
> > > > invariant" you don't actually check for that.  But again, given
> > > > canonicalization rules the invariant will be rhs2 so above add
> > > >
> > > > && TREE_CODE (gimple_assign_rhs2 (stmt)) == INTEGER_CST
> > >
> > > For " with invariant", this needed op1 is invariant, and I used
> > `expr_invariant_in_loop_p (loop, match_op[0])` for check.
> > > And op2 just be PHI is ok. If op2 is INTEGER_CST, existing gcc can
> > > be directly
> > optimized and do not need modification.
> > >
> > > > you probably need dg-require-effective-target longlong, but is it
> > > > necessary to use long long for the testcases in the first place?
> > > > The IV seems to be unused, if it should match the variables bit
> > > > size use sizeof
> > > > (type) * 8
> > >
> > > Yes, It is not necessary to use long long for the testcases. I
> > > changed type to
> > unsigned int.
> > >
> > > > > +  inv = PHI_ARG_DEF_FROM_EDGE (header_phi, loop_preheader_edge
> > > > > + (loop));  return fold_build2 (code1, type, inv, match_op[0]);
> > > > > + }
> > > >
> > > > The } goes to the next line.
> > >
> > > Sorry, It might be something wrong with my use of gcc send-email format.
> > >
> > > > > +  tree bitinv_def;
> > > > > +  if ((bitinv_def
> > > >
> > > > please use else if here
> > >
> > > Sorry, If use the else if here, there is no corresponding above if.
> > > I'm not sure if
> > you mean change bitwise induction expression if to else if.
> >
> > Yes, use else if for the bitwise induction.  Can you also make the new
> > case conditional on 'def'
> > (the compute_overall_effect_of_inner_loop) being chrec_dont_know?  If
> > that call produced something useful it will not be of either of the two 
> > special
> forms.
> > Thus like
> >
> >   if (def != chrec_dont_know)
> > /* Already OK.  */
> > ;
> >  else if ((bitinv_def = ...)
> > ..
> >  else if (tree_fits_uhwi_p (niter)
> >  ... bitwise induction case...)
> > ...
> >
> > ?
> >
> > Otherwise looks OK now.
> >
> > Thanks,
> > Richard.
> >
> > > Do you agree with these changes?  Thanks again for taking a look.
> > >
> > > Thanks,
> > > Lingling
> > >

RE: [PATCH] i386: vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with -mf16c [PR 102811]

2021-11-23 Thread Kong, Lingling via Gcc-patches
Hi,

vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with 
-mf16c. So added define_insn extendhfsf2 and truncsfhf2 for target_f16c.
And cleared before conversion, updated  movhi_internal and 
ix86_can_change_mode_class.

OK for master?

gcc/ChangeLog:

PR target/102811
* config/i386/i386.c (ix86_can_change_mode_class): SSE2 can load 16bit 
data
to sse register via pinsrw.
* config/i386/i386.md (extendhfsf2): Add extenndhfsf2 for f16c.
(extendhfdf2): Split extendhf2 into separate extendhfsf2, 
extendhfdf2.
extendhfdf only for target_avx512fp16.
(*extendhf2):rename extendhf2.
(truncsfhf2): Likewise.
(truncdfhf2): Likewise.
(*trunc2): Likewise.

gcc/testsuite/ChangeLog:

PR target/102811
* gcc.target/i386/pr90773-21.c: Optimized movhi_internal,
optimize vmovd + movw to vpextrw.
* gcc.target/i386/pr90773-23.c: Ditto.
* gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c: New test.
---
 gcc/config/i386/i386.c|  5 +-
 gcc/config/i386/i386.md   | 74 +--
 .../i386/avx512vl-vcvtps2ph-pr102811.c| 11 +++
 gcc/testsuite/gcc.target/i386/pr90773-21.c|  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-23.c|  2 +-
 5 files changed, 83 insertions(+), 11 deletions(-)  create mode 100644 
gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 
e94efdf39fb..4b813533961 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19485,9 +19485,8 @@ ix86_can_change_mode_class (machine_mode from, 
machine_mode to,
 disallow a change to these modes, reload will assume it's ok to
 drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects
 the vec_dupv4hi pattern.
-NB: AVX512FP16 supports vmovw which can load 16bit data to sse
-register.  */
-  int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_AVX512FP16 ? 2 : 4;
+NB: SSE2 can load 16bit data to sse register via pinsrw.  */
+  int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_SSE2 ? 2 : 
+4;
   if (GET_MODE_SIZE (from) < mov_size)
return false;
 }
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
6eb9de81921..6ee264f1151 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2525,6 +2525,16 @@
 case TYPE_SSEMOV:
   return ix86_output_ssemov (insn, operands);
 
+case TYPE_SSELOG:
+  if (SSE_REG_P (operands[0]))
+   return MEM_P (operands[1])
+ ? "pinsrw\t{$0, %1, %0|%0, %1, 0}"
+ : "pinsrw\t{$0, %k1, %0|%0, %k1, 0}";
+  else
+   return MEM_P (operands[1])
+ ? "pextrw\t{$0, %1, %0|%0, %1, 0}"
+ : "pextrw\t{$0, %1, %k0|%k0, %k1, 0}";
+
 case TYPE_MSKLOG:
   if (operands[1] == const0_rtx)
return "kxorw\t%0, %0, %0";
@@ -2540,13 +2550,17 @@
 }
 }
   [(set (attr "isa")
-   (cond [(eq_attr "alternative" "9,10,11,12,13")
- (const_string "avx512fp16")
+   (cond [(eq_attr "alternative" "9,10,11,12")
+ (const_string "sse2")
+  (eq_attr "alternative" "13")
+ (const_string "sse4")
   ]
   (const_string "*")))
(set (attr "type")
  (cond [(eq_attr "alternative" "9,10,11,12,13")
- (const_string "ssemov")
+ (if_then_else (match_test "TARGET_AVX512FP16")
+   (const_string "ssemov")
+   (const_string "sselog"))
(eq_attr "alternative" "4,5,6,7")
  (const_string "mskmov")
(eq_attr "alternative" "8")
@@ -4574,8 +4588,32 @@
   emit_move_insn (operands[0], CONST0_RTX (V2DFmode));
 })
 
-(define_insn "extendhf2"
-  [(set (match_operand:MODEF 0 "nonimm_ssenomem_operand" "=v")
+(define_expand "extendhfsf2"
+  [(set (match_operand:SF 0 "register_operand")
+   (float_extend:SF
+ (match_operand:HF 1 "nonimmediate_operand")))]
+  "TARGET_AVX512FP16 || TARGET_F16C || TARGET_AVX512VL"
+{
+  if (!TARGET_AVX512FP16)
+{
+  rtx res = gen_reg_rtx (V4SFmode);
+  rtx tmp = force_reg (V8HFmode, CONST0_RTX (V8HFmode));
+
+  ix86_expand_vector_set (false, tmp, operands[1], 0);
+  emit_insn (gen_vcvtph2ps (res, gen_lowpart (V8HImode, tmp)));
+  emit_move_insn (operands[0], gen_lowpart (SFmode, res));
+  DONE;
+}
+})
+
+(define_expand "extendhfdf2"
+  [(set (match_operand:DF 0 "register_operand")
+   (float_extend:DF
+ (match_operand:HF 1 "nonimmediate_operand")))]
+  "TARGET_AVX512FP16")
+
+(define_insn "*extendhf2"
+  [(set (match_operand:MODEF 0 "register_operand" "=v")
 (float_extend:MODEF
  (match_operand:HF 1 "nonimmediate_operand" "vm")))]
   "TARGET_AVX512FP16"
@@ -4766,7 +4804,31 @@
 
 ;; Conversion from {SF,DF}mode to HFmode.
 
-(define_insn "trunchf2"
+(define_expand "truncsfhf2"
+  [

RE: [PATCH] i386: vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with -mf16c [PR 102811]

2021-11-24 Thread Kong, Lingling via Gcc-patches
Hi  Uros,

> BTW: When playing with my patch, I introduced (define_insn "*vec_set_0" 
> ...) to optimize scalar load to a vector. Does ix86_expand_vector_set work OK 
> without this pattern?

Yes, ix86_expand_vector_set could work ok with (define_insn 
"_pinsr"), this insn can optimize scalar load to a 
vector.

Thanks,
Lingling

-Original Message-
From: Uros Bizjak  
Sent: Wednesday, November 24, 2021 3:57 PM
To: Kong, Lingling 
Cc: Liu, Hongtao ; gcc-patches@gcc.gnu.org
Subject: Re: [PATCH] i386: vcvtph2ps and vcvtps2ph should be used to convert 
_Float16 to SFmode with -mf16c [PR 102811]

On Wed, Nov 24, 2021 at 7:25 AM Kong, Lingling via Gcc-patches 
 wrote:
>
> Hi,
>
> vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with 
> -mf16c. So added define_insn extendhfsf2 and truncsfhf2 for target_f16c.
> And cleared before conversion, updated  movhi_internal and 
> ix86_can_change_mode_class.

Please fix the above commit message.

>
> OK for master?
>
> gcc/ChangeLog:
>
> PR target/102811
> * config/i386/i386.c (ix86_can_change_mode_class): SSE2 can load 
> 16bit data
> to sse register via pinsrw.

Allow 16bit data in XMM register for SSE2 targets.

> * config/i386/i386.md (extendhfsf2): Add extenndhfsf2 for f16c.

... for TARGET_F16C.

> (extendhfdf2): Split extendhf2 into separate extendhfsf2, 
> extendhfdf2.
> extendhfdf only for target_avx512fp16.

Restrict extendhfdf for TARGET_AVX512FP16 only.

> (*extendhf2):rename extendhf2.

Rename from extendhf2.

> (truncsfhf2): Likewise.
> (truncdfhf2): Likewise.
> (*trunc2): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> PR target/102811
> * gcc.target/i386/pr90773-21.c: Optimized movhi_internal,
> optimize vmovd + movw to vpextrw.

Also allow pextrw.

> * gcc.target/i386/pr90773-23.c: Ditto.
> * gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c: New test.

Otherwise LGTM.

BTW: When playing with my patch, I introduced (define_insn "*vec_set_0" 
...) to optimize scalar load to a vector. Does ix86_expand_vector_set work OK 
without this pattern?

Thanks,
Uros.

> ---
>  gcc/config/i386/i386.c|  5 +-
>  gcc/config/i386/i386.md   | 74 +--
>  .../i386/avx512vl-vcvtps2ph-pr102811.c| 11 +++
>  gcc/testsuite/gcc.target/i386/pr90773-21.c|  2 +-
>  gcc/testsuite/gcc.target/i386/pr90773-23.c|  2 +-
>  5 files changed, 83 insertions(+), 11 deletions(-)  create mode 
> 100644 gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 
> e94efdf39fb..4b813533961 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -19485,9 +19485,8 @@ ix86_can_change_mode_class (machine_mode from, 
> machine_mode to,
>  disallow a change to these modes, reload will assume it's ok to
>  drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects
>  the vec_dupv4hi pattern.
> -NB: AVX512FP16 supports vmovw which can load 16bit data to sse
> -register.  */
> -  int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_AVX512FP16 ? 2 : 
> 4;
> +NB: SSE2 can load 16bit data to sse register via pinsrw.  */
> +  int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_SSE2 ? 2 :
> +4;
>if (GET_MODE_SIZE (from) < mov_size)
> return false;
>  }
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
> 6eb9de81921..6ee264f1151 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -2525,6 +2525,16 @@
>  case TYPE_SSEMOV:
>return ix86_output_ssemov (insn, operands);
>
> +case TYPE_SSELOG:
> +  if (SSE_REG_P (operands[0]))
> +   return MEM_P (operands[1])
> + ? "pinsrw\t{$0, %1, %0|%0, %1, 0}"
> + : "pinsrw\t{$0, %k1, %0|%0, %k1, 0}";
> +  else
> +   return MEM_P (operands[1])
> + ? "pextrw\t{$0, %1, %0|%0, %1, 0}"
> + : "pextrw\t{$0, %1, %k0|%k0, %k1, 0}";
> +
>  case TYPE_MSKLOG:
>if (operands[1] == const0_rtx)
> return "kxorw\t%0, %0, %0";
> @@ -2540,13 +2550,17 @@
>  }
>  }
>[(set (attr "isa")
> -   (cond [(eq_attr "alternative" "9,10,11,12,13")
> - (const_string "avx512fp16")
> +   (cond [(eq_attr "alternative" "9,10,11,12")
> + (const_string "sse2")
> +  (eq_attr "alternative&q

[PATCH] i386: vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with -mf16c [PR 102811]

2021-11-24 Thread Kong, Lingling via Gcc-patches
Hi,

vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with 
-mf16c. So added define_insn extendhfsf2 and truncsfhf2 for target_f16c.
Cleared before conversion, updated  movhi_internal and 
ix86_can_change_mode_class. And fixed some commit message.

OK for master?

gcc/ChangeLog:

PR target/102811
* config/i386/i386.c (ix86_can_change_mode_class): Allow 16 bit data in 
XMM register
for TARGET_SSE2.
* config/i386/i386.md (extendhfsf2): Add extenndhfsf2 for TARGET_F16C.
(extendhfdf2): Restrict extendhfdf for TARGET_AVX512FP16 only.
(*extendhf2): Rename from extendhf2.
(truncsfhf2): Likewise.
(truncdfhf2): Likewise.
(*trunc2): Likewise.

gcc/testsuite/ChangeLog:

PR target/102811
* gcc.target/i386/pr90773-21.c: Optimize movhi_internal,
also allow pextrw replace vmovd + movw.
* gcc.target/i386/pr90773-23.c: Ditto.
* gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c: New test.
---
 gcc/config/i386/i386.c|  5 +-
 gcc/config/i386/i386.md   | 74 +--
 .../i386/avx512vl-vcvtps2ph-pr102811.c| 11 +++
 gcc/testsuite/gcc.target/i386/pr90773-21.c|  2 +-
 gcc/testsuite/gcc.target/i386/pr90773-23.c|  2 +-
 5 files changed, 83 insertions(+), 11 deletions(-)  create mode 100644 
gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 
e94efdf39fb..4b813533961 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19485,9 +19485,8 @@ ix86_can_change_mode_class (machine_mode from, 
machine_mode to,
 disallow a change to these modes, reload will assume it's ok to
 drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects
 the vec_dupv4hi pattern.
-NB: AVX512FP16 supports vmovw which can load 16bit data to sse
-register.  */
-  int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_AVX512FP16 ? 2 : 4;
+NB: SSE2 can load 16bit data to sse register via pinsrw.  */
+  int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_SSE2 ? 2 : 
+4;
   if (GET_MODE_SIZE (from) < mov_size)
return false;
 }
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
6eb9de81921..6ee264f1151 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2525,6 +2525,16 @@
 case TYPE_SSEMOV:
   return ix86_output_ssemov (insn, operands);
 
+case TYPE_SSELOG:
+  if (SSE_REG_P (operands[0]))
+   return MEM_P (operands[1])
+ ? "pinsrw\t{$0, %1, %0|%0, %1, 0}"
+ : "pinsrw\t{$0, %k1, %0|%0, %k1, 0}";
+  else
+   return MEM_P (operands[1])
+ ? "pextrw\t{$0, %1, %0|%0, %1, 0}"
+ : "pextrw\t{$0, %1, %k0|%k0, %k1, 0}";
+
 case TYPE_MSKLOG:
   if (operands[1] == const0_rtx)
return "kxorw\t%0, %0, %0";
@@ -2540,13 +2550,17 @@
 }
 }
   [(set (attr "isa")
-   (cond [(eq_attr "alternative" "9,10,11,12,13")
- (const_string "avx512fp16")
+   (cond [(eq_attr "alternative" "9,10,11,12")
+ (const_string "sse2")
+  (eq_attr "alternative" "13")
+ (const_string "sse4")
   ]
   (const_string "*")))
(set (attr "type")
  (cond [(eq_attr "alternative" "9,10,11,12,13")
- (const_string "ssemov")
+ (if_then_else (match_test "TARGET_AVX512FP16")
+   (const_string "ssemov")
+   (const_string "sselog"))
(eq_attr "alternative" "4,5,6,7")
  (const_string "mskmov")
(eq_attr "alternative" "8")
@@ -4574,8 +4588,32 @@
   emit_move_insn (operands[0], CONST0_RTX (V2DFmode));
 })
 
-(define_insn "extendhf2"
-  [(set (match_operand:MODEF 0 "nonimm_ssenomem_operand" "=v")
+(define_expand "extendhfsf2"
+  [(set (match_operand:SF 0 "register_operand")
+   (float_extend:SF
+ (match_operand:HF 1 "nonimmediate_operand")))]
+  "TARGET_AVX512FP16 || TARGET_F16C || TARGET_AVX512VL"
+{
+  if (!TARGET_AVX512FP16)
+{
+  rtx res = gen_reg_rtx (V4SFmode);
+  rtx tmp = force_reg (V8HFmode, CONST0_RTX (V8HFmode));
+
+  ix86_expand_vector_set (false, tmp, operands[1], 0);
+  emit_insn (gen_vcvtph2ps (res, gen_lowpart (V8HImode, tmp)));
+  emit_move_insn (operands[0], gen_lowpart (SFmode, res));
+  DONE;
+}
+})
+
+(define_expand "extendhfdf2"
+  [(set (match_operand:DF 0 "register_operand")
+   (float_extend:DF
+ (match_operand:HF 1 "nonimmediate_operand")))]
+  "TARGET_AVX512FP16")
+
+(define_insn "*extendhf2"
+  [(set (match_operand:MODEF 0 "register_operand" "=v")
 (float_extend:MODEF
  (match_operand:HF 1 "nonimmediate_operand" "vm")))]
   "TARGET_AVX512FP16"
@@ -4766,7 +4804,31 @@
 
 ;; Conversion from {SF,DF}mode to HFmode.
 
-(define_insn "trunchf2"
+(define_expand "truncsfhf2"
+  [(set (match_o

RE: [PATCH] i386: vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with -mf16c [PR 102811]

2021-11-24 Thread Kong, Lingling via Gcc-patches
OK. This is the patch I am preparing to check in.

-Original Message-
From: Uros Bizjak  
Sent: Wednesday, November 24, 2021 4:49 PM
To: Kong, Lingling 
Cc: Liu, Hongtao ; gcc-patches@gcc.gnu.org
Subject: Re: [PATCH] i386: vcvtph2ps and vcvtps2ph should be used to convert 
_Float16 to SFmode with -mf16c [PR 102811]

On Wed, Nov 24, 2021 at 9:44 AM Kong, Lingling  wrote:
>
> Hi,
>
> vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with 
> -mf16c. So added define_insn extendhfsf2 and truncsfhf2 for target_f16c.
> Cleared before conversion, updated  movhi_internal and 
> ix86_can_change_mode_class. And fixed some commit message.
>
> OK for master?

OK, with a small adjustment to ChangeLog.

Thanks,
Uros.

> gcc/ChangeLog:
>
> PR target/102811
> * config/i386/i386.c (ix86_can_change_mode_class): Allow 16 bit data 
> in XMM register
> for TARGET_SSE2.
> * config/i386/i386.md (extendhfsf2): Add extenndhfsf2 for TARGET_F16C.
> (extendhfdf2): Restrict extendhfdf for TARGET_AVX512FP16 only.
> (*extendhf2): Rename from extendhf2.
> (truncsfhf2): Likewise.
> (truncdfhf2): Likewise.
> (*trunc2): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> PR target/102811
> * gcc.target/i386/pr90773-21.c: Optimize movhi_internal,
> also allow pextrw replace vmovd + movw.

Just write:

* gcc.target/i386/pr90773-21.c: Allow pextrw instead of movw.

> * gcc.target/i386/pr90773-23.c: Ditto.
> * gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c: New test.
> ---
>  gcc/config/i386/i386.c|  5 +-
>  gcc/config/i386/i386.md   | 74 +--
>  .../i386/avx512vl-vcvtps2ph-pr102811.c| 11 +++
>  gcc/testsuite/gcc.target/i386/pr90773-21.c|  2 +-
>  gcc/testsuite/gcc.target/i386/pr90773-23.c|  2 +-
>  5 files changed, 83 insertions(+), 11 deletions(-)  create mode 
> 100644 gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 
> e94efdf39fb..4b813533961 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -19485,9 +19485,8 @@ ix86_can_change_mode_class (machine_mode from, 
> machine_mode to,
>  disallow a change to these modes, reload will assume it's ok to
>  drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects
>  the vec_dupv4hi pattern.
> -NB: AVX512FP16 supports vmovw which can load 16bit data to sse
> -register.  */
> -  int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_AVX512FP16 ? 2 : 
> 4;
> +NB: SSE2 can load 16bit data to sse register via pinsrw.  */
> +  int mov_size = MAYBE_SSE_CLASS_P (regclass) && TARGET_SSE2 ? 2 :
> +4;
>if (GET_MODE_SIZE (from) < mov_size)
> return false;
>  }
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
> 6eb9de81921..6ee264f1151 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -2525,6 +2525,16 @@
>  case TYPE_SSEMOV:
>return ix86_output_ssemov (insn, operands);
>
> +case TYPE_SSELOG:
> +  if (SSE_REG_P (operands[0]))
> +   return MEM_P (operands[1])
> + ? "pinsrw\t{$0, %1, %0|%0, %1, 0}"
> + : "pinsrw\t{$0, %k1, %0|%0, %k1, 0}";
> +  else
> +   return MEM_P (operands[1])
> + ? "pextrw\t{$0, %1, %0|%0, %1, 0}"
> + : "pextrw\t{$0, %1, %k0|%k0, %k1, 0}";
> +
>  case TYPE_MSKLOG:
>if (operands[1] == const0_rtx)
> return "kxorw\t%0, %0, %0";
> @@ -2540,13 +2550,17 @@
>  }
>  }
>[(set (attr "isa")
> -   (cond [(eq_attr "alternative" "9,10,11,12,13")
> - (const_string "avx512fp16")
> +   (cond [(eq_attr "alternative" "9,10,11,12")
> + (const_string "sse2")
> +  (eq_attr "alternative" "13")
> + (const_string "sse4")
>]
>(const_string "*")))
> (set (attr "type")
>   (cond [(eq_attr "alternative" "9,10,11,12,13")
> - (const_string "ssemov")
> + (if_then_else (match_test "TARGET_AVX512FP16")
> +   (const_string "ssemov")
> +   (const_string "sselog"))
> (eq_attr "alternative" "4,5,6,7")
>   (const_string "mskmov")
> (eq_

RE: [PATCH 4/6] Support Intel AVX-NE-CONVERT

2022-10-23 Thread Kong, Lingling via Gcc-patches
> From: Gcc-patches 
> On Behalf Of Hongtao Liu via Gcc-patches
> Sent: Monday, October 17, 2022 1:47 PM
> To: Jiang, Haochen 
> Cc: Liu, Hongtao ; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH 4/6] Support Intel AVX-NE-CONVERT
>
> On Fri, Oct 14, 2022 at 3:58 PM Haochen Jiang via Gcc-patches
>  wrote:
> >
> > From: Kong Lingling 
> > +(define_insn "vbcstne2ps_"
> > +  [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
> > +(vec_duplicate:VF1_128_256
> > + (unspec:SF
> > +  [(match_operand:HI 1 "memory_operand" "m")]
> > +  VBCSTNE)))]
> > +  "TARGET_AVXNECONVERT"
> > +  "vbcstne2ps\t{%1, %0|%0, %1}"
> > +  [(set_attr "prefix" "vex")
> > +  (set_attr "mode" "")])
> Since jakub has support bf16 software emulation, can we rewrite it
> with general rtl ir without unspec?
> Like (float_extend:SF (match_operand:BF "memory_operand" "m")
> > +
> > +(define_int_iterator VCVTNEBF16
> > +  [UNSPEC_VCVTNEEBF16SF
> > +   UNSPEC_VCVTNEOBF16SF])
> > +
> > +(define_int_attr vcvtnebf16type
> > +  [(UNSPEC_VCVTNEEBF16SF "ebf16")
> > +   (UNSPEC_VCVTNEOBF16SF "obf16")])
> > +(define_insn "vcvtne2ps_"
> > +  [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
> > +(unspec:VF1_128_256
> > +  [(match_operand: 1 "memory_operand" "m")]
> > + VCVTNEBF16))]
> > +  "TARGET_AVXNECONVERT"
> > +  "vcvtne2ps\t{%1, %0|%0, %1}"
> > +  [(set_attr "prefix" "vex")
> > +   (set_attr "mode" "")])
> Similar for this one and all those patterns below.

That's great! Thanks for the review! 
Now rewrite it without unspec and use float_extend for new define_insn.

Thanks
Lingling




0001-Support-Intel-AVX-NE-CONVERT.patch
Description: 0001-Support-Intel-AVX-NE-CONVERT.patch


[PATCH] i386: using __bf16 for AVX512BF16 intrinsics

2022-10-27 Thread Kong, Lingling via Gcc-patches
Hi,

Previously we used unsigned short to represent bf16. That was not a good representation, 
and at the time the front end did not support the bf16 type.
Now we introduced __bf16 to X86 psABI. So we can switch intrinsics to the new 
type.

Ok for trunk ?

Thanks,
Lingling

gcc/ChangeLog:

* config/i386/avx512bf16intrin.h (__attribute__): Change short to bf16.
(_mm_cvtsbh_ss): Ditto.
(_mm512_cvtne2ps_pbh): Ditto.
(_mm512_mask_cvtne2ps_pbh): Ditto.
(_mm512_maskz_cvtne2ps_pbh): Ditto.
* config/i386/avx512bf16vlintrin.h (__attribute__): Ditto.
(_mm256_cvtne2ps_pbh): Ditto.
(_mm256_mask_cvtne2ps_pbh): Ditto.
(_mm256_maskz_cvtne2ps_pbh): Ditto.
(_mm_cvtne2ps_pbh): Ditto.
(_mm_mask_cvtne2ps_pbh): Ditto.
(_mm_maskz_cvtne2ps_pbh): Ditto.
(_mm_cvtness_sbh): Ditto.
* config/i386/i386-builtin-types.def (V8BF): Add new
DEF_VECTOR_TYPE for BFmode.
(V16BF): Ditto.
(V32BF): Ditto.
* config/i386/i386-builtin.def (BDESC): Fixed builtins.
* config/i386/i386-expand.cc (ix86_expand_args_builtin): Changed
avx512bf16 ix86_builtin_func_type included HI to BF.
* config/i386/immintrin.h: Add SSE2 depend for avx512bf16.
* config/i386/sse.md (TARGET_AVX512VL): Changed HI vector to BF
vector.
(avx512f_cvtneps2bf16_v4sf): New define_expand.
(*avx512f_cvtneps2bf16_v4sf): New define_insn.
(avx512f_cvtneps2bf16_v4sf_maskz):Ditto.
(avx512f_cvtneps2bf16_v4sf_mask): Ditto.
(avx512f_cvtneps2bf16_v4sf_mask_1): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bf16-cvtsbh2ss-1.c: Add fpmath option.
* gcc.target/i386/avx512bf16-vdpbf16ps-2.c: Fixed
scan-assembler.
* gcc.target/i386/avx512bf16vl-cvtness2sbh-1.c: Add x/y suffix
for vcvtneps2bf16.
* gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c: Ditto.
---
 gcc/config/i386/avx512bf16intrin.h|  12 +--
 gcc/config/i386/avx512bf16vlintrin.h  |  29 ++---
 gcc/config/i386/i386-builtin-types.def|  51 -
 gcc/config/i386/i386-builtin.def  |  54 +-
 gcc/config/i386/i386-expand.cc|  48 -
 gcc/config/i386/immintrin.h   |   2 +
 gcc/config/i386/sse.md| 101 ++
 .../gcc.target/i386/avx512bf16-cvtsbh2ss-1.c  |   2 +-
 .../gcc.target/i386/avx512bf16-vdpbf16ps-2.c  |   2 +-
 .../i386/avx512bf16vl-cvtness2sbh-1.c |   2 +-
 .../i386/avx512bf16vl-vcvtneps2bf16-1.c   |  12 +--
 11 files changed, 189 insertions(+), 126 deletions(-)

diff --git a/gcc/config/i386/avx512bf16intrin.h 
b/gcc/config/i386/avx512bf16intrin.h
index b6e9ddad157..ea1d0125b3f 100644
--- a/gcc/config/i386/avx512bf16intrin.h
+++ b/gcc/config/i386/avx512bf16intrin.h
@@ -35,16 +35,16 @@
 #endif /* __AVX512BF16__ */
 
 /* Internal data types for implementing the intrinsics.  */
-typedef short __v32bh __attribute__ ((__vector_size__ (64)));
+typedef __bf16 __v32bf __attribute__ ((__vector_size__ (64)));
 
 /* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components.  */
-typedef short __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef __bf16 __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
 
 /* Convert One BF16 Data to One Single Float Data.  */
 extern __inline float
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsbh_ss (__bfloat16 __A)
+_mm_cvtsbh_ss (__bf16 __A)
 {
   union{ float a; unsigned int b;} __tmp;
   __tmp.b = ((unsigned int)(__A)) << 16;
@@ -57,21 +57,21 @@ extern __inline __m512bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_cvtne2ps_pbh (__m512 __A, __m512 __B)
 {
-  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi(__A, __B);
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf(__A, __B);
 }
 
 extern __inline __m512bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D)
 {
-  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_mask(__C, __D, __A, __B);
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_mask(__C, __D, __A, __B);
 }
 
 extern __inline __m512bh
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C)
 {
-  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_maskz(__B, __C, __A);
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_maskz(__B, __C, __A);
 }
 
 /* vcvtneps2bf16 */
diff --git a/gcc/config/i386/avx512bf16vlintrin.h 
b/gcc/config/i386/avx512bf16vlintrin.h
index 969335ff358..56c28f14cf6 100644
--- a/gcc/config/i386/avx512bf16vlintrin.h
+++ b/gcc/config/i386/avx512bf16vlintrin.h
@@ -35,57 +35,58 @@
 #endif /* __AVX512BF16__ */
 
 /* Internal data types for implementing

RE: [PATCH 4/6] Support Intel AVX-NE-CONVERT

2022-10-28 Thread Kong, Lingling via Gcc-patches
Hi,

Because we switched the intrinsics for avx512bf16 to the new type __bf16, we
can now use __m128bh/__m256bh for the vector bf16 type instead of
__m128bf16/__m256bf16, and we unified the builtins for avx512bf16/avxneconvert.

Thanks,
Lingling

> -Original Message-
> From: Hongtao Liu 
> Sent: Tuesday, October 25, 2022 1:23 PM
> To: Kong, Lingling 
> Cc: Liu, Hongtao ; gcc-patches@gcc.gnu.org; Jiang,
> Haochen 
> Subject: Re: [PATCH 4/6] Support Intel AVX-NE-CONVERT
> 
> On Mon, Oct 24, 2022 at 2:20 PM Kong, Lingling 
> wrote:
> >
> > > From: Gcc-patches
> > > 
> > > On Behalf Of Hongtao Liu via Gcc-patches
> > > Sent: Monday, October 17, 2022 1:47 PM
> > > To: Jiang, Haochen 
> > > Cc: Liu, Hongtao ; gcc-patches@gcc.gnu.org
> > > Subject: Re: [PATCH 4/6] Support Intel AVX-NE-CONVERT
> > >
> > > On Fri, Oct 14, 2022 at 3:58 PM Haochen Jiang via Gcc-patches
> > >  wrote:
> > > >
> > > > From: Kong Lingling 
> > > > +(define_insn "vbcstne2ps_"
> > > > +  [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
> > > > +(vec_duplicate:VF1_128_256
> > > > + (unspec:SF
> > > > +  [(match_operand:HI 1 "memory_operand" "m")]
> > > > +  VBCSTNE)))]
> > > > +  "TARGET_AVXNECONVERT"
> > > > +  "vbcstne2ps\t{%1, %0|%0, %1}"
> > > > +  [(set_attr "prefix" "vex")
> > > > +  (set_attr "mode" "")])
> > > Since jakub has support bf16 software emulation, can we rewrite it
> > > with general rtl ir without unspec?
> > > Like (float_extend:SF (match_operand:BF "memory_operand" "m")
> > > > +
> > > > +(define_int_iterator VCVTNEBF16
> > > > +  [UNSPEC_VCVTNEEBF16SF
> > > > +   UNSPEC_VCVTNEOBF16SF])
> > > > +
> > > > +(define_int_attr vcvtnebf16type
> > > > +  [(UNSPEC_VCVTNEEBF16SF "ebf16")
> > > > +   (UNSPEC_VCVTNEOBF16SF "obf16")]) (define_insn
> > > > +"vcvtne2ps_"
> > > > +  [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
> > > > +(unspec:VF1_128_256
> > > > +  [(match_operand: 1 "memory_operand" "m")]
> > > > + VCVTNEBF16))]
> > > > +  "TARGET_AVXNECONVERT"
> > > > +  "vcvtne2ps\t{%1, %0|%0, %1}"
> > > > +  [(set_attr "prefix" "vex")
> > > > +   (set_attr "mode" "")])
> > > Similar for this one and all those patterns below.
> >
> > That's great! Thanks for the review!
> > Now rewrite it without unspec and use float_extend for new define_insn.
> Ok.
> >
> > Thanks
> > Lingling
> >
> >
> 
> 
> --
> BR,
> Hongtao


0001-Support-Intel-AVX-NE-CONVERT.patch
Description: 0001-Support-Intel-AVX-NE-CONVERT.patch


[wwwdocs] [GCC13] Mention Intel __bf16 support in AVX512BF16 intrinsics.

2022-10-31 Thread Kong, Lingling via Gcc-patches
Hi

The patch is for mention Intel __bf16 support in AVX512BF16 intrinsics.
Ok for master ?

Thanks,
Lingling

---
 htdocs/gcc-13/changes.html | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html index 
7c6bfa6e..cd0282f1 100644
--- a/htdocs/gcc-13/changes.html
+++ b/htdocs/gcc-13/changes.html
@@ -230,6 +230,8 @@ a work-in-progress.
   For both C and C++ the __bf16 type is supported on
   x86 systems with SSE2 and above enabled.
   
+  Use __bf16 type for AVX512BF16 intrinsics.
+  
 
 
 
--
2.18.2



RE: [wwwdocs] [GCC13] Mention Intel __bf16 support in AVX512BF16 intrinsics.

2022-11-01 Thread Kong, Lingling via Gcc-patches
> > diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html
> > index 7c6bfa6e..cd0282f1 100644
> > --- a/htdocs/gcc-13/changes.html
> > +++ b/htdocs/gcc-13/changes.html
> > @@ -230,6 +230,8 @@ a work-in-progress.
> >For both C and C++ the __bf16 type is supported on
> >x86 systems with SSE2 and above enabled.
> >
> > +  Use __bf16 type for AVX512BF16 intrinsics.
> Could you add more explanations. Like originally it's ..., now it's ..., and 
> what's
> the difference when users compile the same source code(which contains
> avx512bf16 intrinsics) with gcc12(and before) and GCC13.
> > +  
> >  
> >
> >  
> > --
> > 2.18.2
> >
Yes,  changed it. Thanks a lot!

Subject: [PATCH] Mention Intel __bf16 support in AVX512BF16 intrinsics.

---
 htdocs/gcc-13/changes.html | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html
index 7c6bfa6e..a35f4fab 100644
--- a/htdocs/gcc-13/changes.html
+++ b/htdocs/gcc-13/changes.html
@@ -230,6 +230,12 @@ a work-in-progress.
   For both C and C++ the __bf16 type is supported on
   x86 systems with SSE2 and above enabled.
   
+  Use the __bf16 type for AVX512BF16 intrinsics. Previously we used
+  short to represent bf16. Now __bf16 has been introduced into the x86 psABI,
+  so we switched the AVX512BF16 intrinsics to the new type __bf16.
+  When users compile source code that contains AVX512BF16 intrinsics with
+  GCC 13, SSE2 must be enabled, which differs from GCC 12 (and before).
+  
 

 
--
2.18.2

BRs,
Lingling


RE: [wwwdocs] [GCC13] Mention Intel __bf16 support in AVX512BF16 intrinsics.

2022-11-02 Thread Kong, Lingling via Gcc-patches
> > > diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html
> > > index 7c6bfa6e..cd0282f1 100644
> > > --- a/htdocs/gcc-13/changes.html
> > > +++ b/htdocs/gcc-13/changes.html
> > > @@ -230,6 +230,8 @@ a work-in-progress.
> > >For both C and C++ the __bf16 type is supported on
> > >x86 systems with SSE2 and above enabled.
> > >
> > > +  Use __bf16 type for AVX512BF16 intrinsics.
> > Could you add more explanations. Like originally it's ..., now it's
> > ..., and what's the difference when users compile the same source
> > code(which contains
> > avx512bf16 intrinsics) with gcc12(and before) and GCC13.
> > > +  
> > >  
> > >
> > >  
> > > --
> > > 2.18.2
> > >
> Yes,  changed it. Thanks a lot!
> 
> Subject: [PATCH] Mention Intel __bf16 support in AVX512BF16 intrinsics.
> 
> ---
>  htdocs/gcc-13/changes.html | 6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html index
> 7c6bfa6e..a35f4fab 100644
> --- a/htdocs/gcc-13/changes.html
> +++ b/htdocs/gcc-13/changes.html
> @@ -230,6 +230,12 @@ a work-in-progress.
>For both C and C++ the __bf16 type is supported on
>x86 systems with SSE2 and above enabled.
>
> +  Use __bf16 type for AVX512BF16 intrinsics.
> + Previously we use  short to represent bf16. Now we introduced
> __bf16 to x86 psABI.
> +  So we switch intrinsics in AVX512BF16 to the new type __bf16.
> +  When users compile the same source code contains AVX512BF16
> + intrinsics with
> +  GCC13 need to support SSE2, which is different to GCC12 (and before).
> +  
>  
> 
>  
> --
> 2.18.2
> 
> BRs,
> Lingling

Sorry, modified again. New patch is as below.

htdocs/gcc-13/changes.html | 5 +
 1 file changed, 5 insertions(+)

diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html index 
7c6bfa6e..7a5d2ab6 100644
--- a/htdocs/gcc-13/changes.html
+++ b/htdocs/gcc-13/changes.html
@@ -230,6 +230,11 @@ a work-in-progress.
   For both C and C++ the __bf16 type is supported on
   x86 systems with SSE2 and above enabled.
   
+  Use the real __bf16 type for AVX512BF16 intrinsics. 
+ Previously __bfloat16, a typedef of short, was used. Now the real 
+ __bf16 type has been introduced into the x86 psABI. Users need to 
+ adjust their AVX512BF16-related source code when upgrading from GCC 12 to GCC 13.
+  
 
 
 
--
2.18.2

BRs,
Lingling


[PATCH] x86: Support vector __bf16 type.

2022-08-16 Thread Kong, Lingling via Gcc-patches
Hi,

The patch is support vector init/broadcast/set/extract for __bf16 type.
The __bf16 type is a storage type.

OK for master?

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_sse_movcc): Handle vector
BFmode.
(ix86_expand_vector_init_duplicate): Support vector BFmode.
(ix86_expand_vector_init_one_nonzero): Ditto.
(ix86_expand_vector_init_one_var): Ditto.
(ix86_expand_vector_init_concat): Ditto.
(ix86_expand_vector_init_interleave): Ditto.
(ix86_expand_vector_init_general): Ditto.
(ix86_expand_vector_init): Ditto.
(ix86_expand_vector_set_var): Ditto.
(ix86_expand_vector_set): Ditto.
(ix86_expand_vector_extract): Ditto.
* config/i386/i386.cc (classify_argument): Add BF vector modes.
(function_arg_64): Ditto.
(ix86_gimplify_va_arg): Ditto.
(ix86_get_ssemov): Ditto.
* config/i386/i386.h (VALID_AVX256_REG_MODE): Add BF vector modes.
(VALID_AVX512F_REG_MODE): Ditto.
(host_detect_local_cpu): Ditto.
(VALID_SSE2_REG_MODE): Ditto.
* config/i386/i386.md: Add BF vector modes.
(MODE_SIZE): Ditto.
(ssemodesuffix): Add bf suffix for BF vector modes.
(ssevecmode): Ditto.
* config/i386/sse.md (VMOVE): Adjust for BF vector modes.
(VI12HFBF_AVX512VL): Ditto.
(V_256_512): Ditto.
(VF_AVX512HFBF16): Ditto.
(VF_AVX512BWHFBF16): Ditto.
(VIHFBF): Ditto.
(avx512): Ditto.
(VIHFBF_256): Ditto.
(VIHFBF_AVX512BW): Ditto.
(VI2F_256_512):Ditto.
(V8_128):Ditto.
(V16_256): Ditto.
(V32_512): Ditto.
(sseinsnmode): Ditto.
(sseconstm1): Ditto.
(sseintmodesuffix): New mode_attr.
(avx512fmaskmode): Ditto.
(avx512fmaskmodelower): Ditto.
(ssedoublevecmode): Ditto.
(ssehalfvecmode): Ditto.
(ssehalfvecmodelower): Ditto.
(ssescalarmode): Add vector BFmode mapping.
(ssescalarmodelower): Ditto.
(ssexmmmode): Ditto.
(ternlogsuffix): Ditto.
(ssescalarsize): Ditto.
(sseintprefix): Ditto.
(i128): Ditto.
(xtg_mode): Ditto.
(bcstscalarsuff): Ditto.
(_blendm): New define_insn for BFmode.
(_store_mask): Ditto.
(vcond_mask_): Ditto.
(vec_set_0): New define_insn for BF vector set.
(V8BFH_128): New mode_iterator for BFmode.
(avx512fp16_mov): Ditto.
(vec_set): New define_insn for BF vector set.
(@vec_extract_hi_): Ditto.
(@vec_extract_lo_): Ditto.
(vec_set_hi_): Ditto.
(vec_set_lo_): Ditto.
(*vec_extract_0): New define_insn_and_split for BF
vector extract.
(*vec_extract): New define_insn.
(VEC_EXTRACT_MODE): Add BF vector modes.
(PINSR_MODE): Add V8BF.
(sse2p4_1): Ditto.
(pinsr_evex_isa): Ditto.
(_pinsr): Adjust to support
insert for V8BFmode.
(pbroadcast_evex_isa): Add BF vector modes.
(AVX2_VEC_DUP_MODE): Ditto.
(VEC_INIT_MODE): Ditto.
(VEC_INIT_HALF_MODE): Ditto.
(avx2_pbroadcast): Adjust to support BF vector mode
broadcast.
(avx2_pbroadcast_1): Ditto.
(_vec_dup_1): Ditto.
(_vec_dup_gpr):
Ditto.

gcc/testsuite/ChangeLog:

* g++.target/i386/vect-bfloat16-1.C: New test.
* gcc.target/i386/vect-bfloat16-1.c: New test.
* gcc.target/i386/vect-bfloat16-2a.c: New test.
* gcc.target/i386/vect-bfloat16-2b.c: New test.
* gcc.target/i386/vect-bfloat16-typecheck_1.c: New test.
* gcc.target/i386/vect-bfloat16-typecheck_2.c: New test.
---
 gcc/config/i386/i386-expand.cc| 129 +++--
 gcc/config/i386/i386.cc   |  16 +-
 gcc/config/i386/i386.h|  12 +-
 gcc/config/i386/i386.md   |   9 +-
 gcc/config/i386/sse.md| 211 --
 .../g++.target/i386/vect-bfloat16-1.C |  13 +
 .../gcc.target/i386/vect-bfloat16-1.c |  30 ++
 .../gcc.target/i386/vect-bfloat16-2a.c| 121 
 .../gcc.target/i386/vect-bfloat16-2b.c|  22 ++
 .../i386/vect-bfloat16-typecheck_1.c  | 258 ++
 .../i386/vect-bfloat16-typecheck_2.c  | 248 +
 11 files changed, 950 insertions(+), 119 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/vect-bfloat16-1.C
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-bfloat16-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-bfloat16-2a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-bfloat16-2b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-bfloat16-typecheck_1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-bfloat16-typecheck_2.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 66d

[PATCH] Enhance final_value_replacement_loop to handle bitop with an invariant induction.[PR105735]

2022-08-17 Thread Kong, Lingling via Gcc-patches
Hi,

This patch is for pr105735/pr101991. It will enable below optimization:
{
-  long unsigned int bit;
-
-   [local count: 32534376]:
-
-   [local count: 1041207449]:
-  # tmp_10 = PHI 
-  # bit_12 = PHI 
-  tmp_7 = bit2_6(D) & tmp_10;
-  bit_8 = bit_12 + 1;
-  if (bit_8 != 32)
-goto ; [96.97%]
-  else
-goto ; [3.03%]
-
-   [local count: 1009658865]:
-  goto ; [100.00%]
-
-   [local count: 32534376]:
-  # tmp_11 = PHI 
-  return tmp_11;
+  tmp_11 = tmp_4(D) & bit2_6(D);
+  return tmp_11;

}

Ok for master ?

gcc/ChangeLog:

PR middle-end/105735
* match.pd (bitop_with_inv_p): New match.
* tree-scalar-evolution.cc (gimple_bitop_with_inv_p): Declare.
(analyze_and_compute_bitop_with_inv_effect): New function.
(final_value_replacement_loop): Enhanced to handle bitop
with inv induction.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr105735-1.c: New test.
* gcc.target/i386/pr105735-2.c: New test.
---
 gcc/match.pd   |  4 +
 gcc/testsuite/gcc.target/i386/pr105735-1.c | 88 ++  
gcc/testsuite/gcc.target/i386/pr105735-2.c | 28 +++
 gcc/tree-scalar-evolution.cc   | 59 +++
 4 files changed, 179 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr105735-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr105735-2.c

diff --git a/gcc/match.pd b/gcc/match.pd index 562138a8034..cfe593ebb02 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8050,6 +8050,10 @@ and,
  (bit_not
   (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3
 
+(for bit_op (bit_and bit_ior bit_xor)
+ (match (bitop_with_inv_p @0 @1)
+  (bit_op:c @0 @1)))
+
 /* n - (((n > C1) ? n : C1) & -C2) ->  n & C1 for unsigned case.
n - (((n > C1) ? n : C1) & -C2) ->  (n <= C1) ? n : (n & C1) for signed 
case.  */  (simplify diff --git a/gcc/testsuite/gcc.target/i386/pr105735-1.c 
b/gcc/testsuite/gcc.target/i386/pr105735-1.c
new file mode 100644
index 000..8d2123ed351
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105735-1.c
@@ -0,0 +1,88 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-sccp-details" } */
+/* { dg-final { scan-tree-dump-times {final value replacement} 8 "sccp" 
+} } */
+
+unsigned long long
+__attribute__((noipa))
+foo (unsigned long long tmp, unsigned long long bit2) {
+  for (int bit = 0; bit < 64; bit++)
+tmp &= bit2;
+  return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo1 (unsigned long long tmp, unsigned long long bit2) {
+  for (int bit = 63; bit >= 0; bit -=3)
+tmp &= bit2;
+  return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo2 (unsigned long long tmp, unsigned long long bit2) {
+  for (int bit = 0; bit < 64; bit++)
+tmp |= bit2;
+  return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo3 (unsigned long long tmp, unsigned long long bit2) {
+  for (int bit = 63; bit >= 0; bit -=3)
+tmp |= bit2;
+  return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo4 (unsigned long long tmp, unsigned long long bit2) {
+  for (int bit = 0; bit < 64; bit++)
+tmp ^= bit2;
+  return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo5 (unsigned long long tmp, unsigned long long bit2) {
+  for (int bit = 0; bit < 63; bit++)
+tmp ^= bit2;
+  return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+f (unsigned long long tmp, long long bit, unsigned long long bit2) {
+  unsigned long long res = tmp;
+  for (long long i = 0; i < bit; i++)
+res &= bit2;
+  return res;
+}
+
+unsigned long long
+__attribute__((noipa))
+f1 (unsigned long long tmp, long long bit, unsigned long long bit2) {
+  unsigned long long res = tmp;
+  for (long long i = 0; i < bit; i++)
+res |= bit2;
+  return res;
+}
+
+unsigned long long
+__attribute__((noipa))
+f2 (unsigned long long tmp, long long bit, unsigned long long bit2) {
+  unsigned long long res = tmp;
+  for (long long i = 0; i < bit; i++)
+res ^= bit2;
+  return res;
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr105735-2.c 
b/gcc/testsuite/gcc.target/i386/pr105735-2.c
new file mode 100644
index 000..79c1d300b1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105735-2.c
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+
+#include "pr105735-1.c"
+
+int main()
+{
+  unsigned long long tmp = 0x1101101ULL;
+  unsigned long long bit2 = 0x11100111ULL;
+  if (foo (tmp, bit2) != 0x1100101ULL)
+__builtin_abort ();
+  if (foo1 (tmp, bit2) != 0x1100101ULL)
+__builtin_abort ();
+  if (foo2 (tmp, bit2) != 0x1110ULL)
+__builtin_abort ();
+  if (foo3 (tmp, bit2) != 0x1110ULL)
+__builtin_abort ();
+  if (foo4 (tmp, bit2) != 0x1101101ULL)
+__builtin_abort ();
+  if (foo5 (tmp, bit2) != 0x111010011010ULL)
+__builtin_abort ();
+  if (f (tmp, 64, bit2) != 0x1100101ULL)
+__builtin_abort ();
+  if (f1 (tmp, 64, bit2) != 0x1110ULL)
+__bui

[wwwdocs] [GCC13] Mention Intel __bf16 support.

2022-08-18 Thread Kong, Lingling via Gcc-patches
Hi

The patch is for mention Intel __bf16 support in gcc13.
Ok for master ?

Thanks,
Lingling

htdocs/gcc-13/changes.html | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html index 
57bd8724..7d98329c 100644
--- a/htdocs/gcc-13/changes.html
+++ b/htdocs/gcc-13/changes.html
@@ -122,7 +122,12 @@ a work-in-progress.
 
 
 
-
+IA-32/x86-64
+
+  For both C and C++ the __bf16 type is supported on
+  x86 systems with SSE2 and above enabled.
+  
+
 
 
 
--
2.18.2



RE: [PATCH] Enhance final_value_replacement_loop to handle bitop with an invariant induction.[PR105735]

2022-08-22 Thread Kong, Lingling via Gcc-patches
Hi  Richard,  could you help to have a look for the patch ?
 
> Hi,
> 
> This patch is for pr105735/pr101991. It will enable below optimization:
> {
> -  long unsigned int bit;
> -
> -   [local count: 32534376]:
> -
> -   [local count: 1041207449]:
> -  # tmp_10 = PHI 
> -  # bit_12 = PHI 
> -  tmp_7 = bit2_6(D) & tmp_10;
> -  bit_8 = bit_12 + 1;
> -  if (bit_8 != 32)
> -goto ; [96.97%]
> -  else
> -goto ; [3.03%]
> -
> -   [local count: 1009658865]:
> -  goto ; [100.00%]
> -
> -   [local count: 32534376]:
> -  # tmp_11 = PHI 
> -  return tmp_11;
> +  tmp_11 = tmp_4(D) & bit2_6(D);
> +  return tmp_11;
> 
> }
> 
> Ok for master ?
> 
> gcc/ChangeLog:
> 
>   PR middle-end/105735
>   * match.pd (bitop_with_inv_p): New match.
>   * tree-scalar-evolution.cc (gimple_bitop_with_inv_p): Declare.
>   (analyze_and_compute_bitop_with_inv_effect): New function.
>   (final_value_replacement_loop): Enhanced to handle bitop
>   with inv induction.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/i386/pr105735-1.c: New test.
>   * gcc.target/i386/pr105735-2.c: New test.
> ---
>  gcc/match.pd   |  4 +
>  gcc/testsuite/gcc.target/i386/pr105735-1.c | 88 ++
> gcc/testsuite/gcc.target/i386/pr105735-2.c | 28 +++
>  gcc/tree-scalar-evolution.cc   | 59 +++
>  4 files changed, 179 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr105735-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr105735-2.c
> 
> diff --git a/gcc/match.pd b/gcc/match.pd index 562138a8034..cfe593ebb02
> 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8050,6 +8050,10 @@ and,
>   (bit_not
>(nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3
> 
> +(for bit_op (bit_and bit_ior bit_xor)
> + (match (bitop_with_inv_p @0 @1)
> +  (bit_op:c @0 @1)))
> +
>  /* n - (((n > C1) ? n : C1) & -C2) ->  n & C1 for unsigned case.
> n - (((n > C1) ? n : C1) & -C2) ->  (n <= C1) ? n : (n & C1) for signed 
> case.  */
> (simplify diff --git a/gcc/testsuite/gcc.target/i386/pr105735-1.c
> b/gcc/testsuite/gcc.target/i386/pr105735-1.c
> new file mode 100644
> index 000..8d2123ed351
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr105735-1.c
> @@ -0,0 +1,88 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> +/* { dg-final { scan-tree-dump-times {final value replacement} 8 "sccp"
> +} } */
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo (unsigned long long tmp, unsigned long long bit2) {
> +  for (int bit = 0; bit < 64; bit++)
> +tmp &= bit2;
> +  return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo1 (unsigned long long tmp, unsigned long long bit2) {
> +  for (int bit = 63; bit >= 0; bit -=3)
> +tmp &= bit2;
> +  return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo2 (unsigned long long tmp, unsigned long long bit2) {
> +  for (int bit = 0; bit < 64; bit++)
> +tmp |= bit2;
> +  return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo3 (unsigned long long tmp, unsigned long long bit2) {
> +  for (int bit = 63; bit >= 0; bit -=3)
> +tmp |= bit2;
> +  return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo4 (unsigned long long tmp, unsigned long long bit2) {
> +  for (int bit = 0; bit < 64; bit++)
> +tmp ^= bit2;
> +  return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo5 (unsigned long long tmp, unsigned long long bit2) {
> +  for (int bit = 0; bit < 63; bit++)
> +tmp ^= bit2;
> +  return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +f (unsigned long long tmp, long long bit, unsigned long long bit2) {
> +  unsigned long long res = tmp;
> +  for (long long i = 0; i < bit; i++)
> +res &= bit2;
> +  return res;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +f1 (unsigned long long tmp, long long bit, unsigned long long bit2) {
> +  unsigned long long res = tmp;
> +  for (long long i = 0; i < bit; i++)
> +res |= bit2;
> +  return res;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +f2 (unsigned long long tmp, long long bit, unsigned long long bit2) {
> +  unsigned long long res = tmp;
> +  for (long long i = 0; i < bit; i++)
> +res ^= bit2;
> +  return res;
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/i386/pr105735-2.c
> b/gcc/testsuite/gcc.target/i386/pr105735-2.c
> new file mode 100644
> index 000..79c1d300b1b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr105735-2.c
> @@ -0,0 +1,28 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1" } */
> +
> +#include "pr105735-1.c"
> +
> +int main()
> +{
> +  unsigned long long tmp = 0x1101101ULL;
> +  unsigned long long bit2 = 0x11100111ULL;
> +  if (foo (tmp, bit2) != 0x1100101ULL)
> +__builtin_abort ();
> +  if (foo1 (tmp, bit2) != 0x1100101ULL)
> +__builtin_abort ();
> +  if (foo2 (tmp, bit2) != 0x

[PATCH] middle-end: Add MULT_EXPR recognition for cond scalar reduction

2022-08-25 Thread Kong, Lingling via Gcc-patches
Hi,

The conditional mult reduction cannot be recognized with current GCC. The 
following loop cannot be vectorized.
Now add MULT_EXPR recognition for conditional scalar reduction.

float summa(int n, float *arg1, float *arg2)
{  
int i; 
float res1 = 1.0;
for(i = 0; i < n; i++) {
  if(arg2[i]) 
res1 *= arg1[i];
}  
return res1;   
}

gcc/ChangeLog:

* tree-if-conv.cc (is_cond_scalar_reduction): Add MULT_EXPR
recognition.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/gen-vect-34.c: New test.
* gcc.dg/vect/vect-ifcvt-18.c: New test.
---
 gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c | 16 +
 gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c   | 38 +
 gcc/tree-if-conv.cc |  1 +
 3 files changed, 55 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c 
b/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c
new file mode 100644
index 000..8d2d36401fe
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-vect-details" } */
+/* { dg-additional-options "-mavx2" { target { x86_64-*-* i?86-*-* } } 
+} */
+
+float summul(int n, float *arg1, float *arg2)
+{  
+int i; 
+float res1 = 1.0;
+for(i = 0; i < n; i++) {
+  if(arg2[i]) 
+res1 *= arg1[i];
+}  
+return res1;   
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { 
+target { ! { avr-*-* pru-*-* } } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c 
b/gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c
new file mode 100644
index 000..c1d3c27d819
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c
@@ -0,0 +1,38 @@
+/* { dg-require-effective-target vect_condition } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-additional-options "-Ofast -mavx" { target avx_runtime } } */
+
+
+int A0[4] = {36,39,42,45};
+int B0[4] = {42,42,0,42};
+float A1[8] = {36,39,42,45,43,32,21,12}; float B1[8] = 
+{42,42,0,42,42,42,0,42}; double A2[16] = 
+{36,39,42,45,43,32,21,12,23,34,45,56,42,78,89,11};
+double B2[16] = {42,42,0,42,42,42,42,42,42,42,42,42,0,42,42,42};
+
+int main ()
+{
+  int i, j;
+  int res0 = 1;
+  float res1 = 1.0;
+  double res2 = 1.0;
+
+  for (i = 0; i < 4; i++)
+if (B0[i])
+  res0 *= A0[i];
+
+  for (i = 0; i < 8; i++)
+if (B1[i])
+  res1 *= A1[i];
+  
+  for (i = 0; i < 16; i++)
+if (B2[i])
+  res2 *= A2[i];
+  /* check results:  */
+  if (res0 != 63180 || res1 != 1043228160.00
+  ||res2 != 3296728515318523101184.00)
+  __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized 3 loops" "vect" { target 
+i?86-*-* x86_64-*-* } } } */
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc index 
1c8e1a45234..bac29fb5574 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -1739,6 +1739,7 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, 
tree arg_0, tree arg_1,
 
   if (reduction_op != PLUS_EXPR
   && reduction_op != MINUS_EXPR
+  && reduction_op != MULT_EXPR
   && reduction_op != BIT_IOR_EXPR
   && reduction_op != BIT_XOR_EXPR
   && reduction_op != BIT_AND_EXPR)
--
2.18.2



[PATCH] x86: Handle V8BF in expand_vec_perm_broadcast_1

2022-08-30 Thread Kong, Lingling via Gcc-patches
Hi,

Handle E_V8BFmode in expand_vec_perm_broadcast_1 and 
ix86_expand_vector_init_duplicate.
Ok for trunk?

gcc/ChangeLog:

PR target/106742
* config/i386/i386-expand.cc (ix86_expand_vector_init_duplicate):
Handle V8BF mode.
(expand_vec_perm_broadcast_1): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr106742.c: New test.
---
 gcc/config/i386/i386-expand.cc   | 17 -
 gcc/testsuite/gcc.target/i386/pr106742.c | 10 ++
 2 files changed, 22 insertions(+), 5 deletions(-)  create mode 100644 
gcc/testsuite/gcc.target/i386/pr106742.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc 
index 4b216308a18..a08222fe1b6 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -15030,11 +15030,15 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, 
machine_mode mode,
  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
  dperm.one_operand_p = true;
 
- if (mode == V8HFmode)
+ if (mode == V8HFmode || mode == V8BFmode)
{
- tmp1 = force_reg (HFmode, val);
+ rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
+ tmp1 = mode == V8HFmode ? force_reg (HFmode, val)
+ : force_reg (BFmode, val);
  tmp2 = gen_reg_rtx (mode);
- emit_insn (gen_vec_setv8hf_0 (tmp2, CONST0_RTX (mode), tmp1));
+ gen_vec_set_0 = mode == V8HFmode ? gen_vec_setv8hf_0
+  : gen_vec_setv8bf_0;
+ emit_insn (gen_vec_set_0 (tmp2, CONST0_RTX (mode), tmp1));
  tmp1 = gen_lowpart (mode, tmp2);
}
  else
@@ -21822,17 +21826,20 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d 
*d)
   return true;
 
 case E_V8HFmode:
+case E_V8BFmode:
   /* This can be implemented via interleave and pshufd.  */
   if (d->testing_p)
return true;
 
   if (elt >= nelt2)
{
- gen = gen_vec_interleave_highv8hf;
+ gen = vmode == V8HFmode ? gen_vec_interleave_highv8hf
+ : gen_vec_interleave_highv8bf;
  elt -= nelt2;
}
   else
-   gen = gen_vec_interleave_lowv8hf;
+   gen = vmode == V8HFmode ? gen_vec_interleave_lowv8hf
+   : gen_vec_interleave_lowv8bf;
   nelt2 /= 2;
 
   dest = gen_reg_rtx (vmode);
diff --git a/gcc/testsuite/gcc.target/i386/pr106742.c 
b/gcc/testsuite/gcc.target/i386/pr106742.c
new file mode 100644
index 000..4a53cd49902
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106742.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options " -msse2 -mno-avx2 -O1" } */
+typedef __bf16 v8bf __attribute__ ((__vector_size__ (16)));
+
+v8bf
+vec_init_dup_v8bf (__bf16 a1)
+{
+  return __extension__ (v8bf) { a1, a1, a1, a1, a1, a1, a1, a1 }; }
+/* { dg-final { scan-assembler-times "punpcklwd" 1} } */
--
2.18.2



RE: [PATCH] middle-end: Add MULT_EXPR recognition for cond scalar reduction

2022-08-31 Thread Kong, Lingling via Gcc-patches
Hi  Richard,  could you help to have a look for the patch ?

Ok for master ?

> Hi,
> 
> The conditional mult reduction cannot be recognized with current GCC. The
> following loop cannot be vectorized.
> Now add MULT_EXPR recognition for conditional scalar reduction.
> 
> float summa(int n, float *arg1, float *arg2)
> {
> int i;
> float res1 = 1.0;
> for(i = 0; i < n; i++) {
>   if(arg2[i])
> res1 *= arg1[i];
> }
> return res1;
> }
> 
> gcc/ChangeLog:
> 
>   * tree-if-conv.cc (is_cond_scalar_reduction): Add MULT_EXPR
>   recognition.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.dg/tree-ssa/gen-vect-34.c: New test.
>   * gcc.dg/vect/vect-ifcvt-18.c: New test.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c | 16 +
>  gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c   | 38 +
>  gcc/tree-if-conv.cc |  1 +
>  3 files changed, 55 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c
> 
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c
> b/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c
> new file mode 100644
> index 000..8d2d36401fe
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-34.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -fdump-tree-vect-details" } */
> +/* { dg-additional-options "-mavx2" { target { x86_64-*-* i?86-*-* } }
> +} */
> +
> +float summul(int n, float *arg1, float *arg2)
> +{
> +int i;
> +float res1 = 1.0;
> +for(i = 0; i < n; i++) {
> +  if(arg2[i])
> +res1 *= arg1[i];
> +}
> +return res1;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {
> +target { ! { avr-*-* pru-*-* } } } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c
> b/gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c
> new file mode 100644
> index 000..c1d3c27d819
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-ifcvt-18.c
> @@ -0,0 +1,38 @@
> +/* { dg-require-effective-target vect_condition } */
> +/* { dg-require-effective-target vect_float } */
> +/* { dg-additional-options "-Ofast -mavx" { target avx_runtime } } */
> +
> +
> +int A0[4] = {36,39,42,45};
> +int B0[4] = {42,42,0,42};
> +float A1[8] = {36,39,42,45,43,32,21,12}; float B1[8] =
> +{42,42,0,42,42,42,0,42}; double A2[16] =
> +{36,39,42,45,43,32,21,12,23,34,45,56,42,78,89,11};
> +double B2[16] = {42,42,0,42,42,42,42,42,42,42,42,42,0,42,42,42};
> +
> +int main ()
> +{
> +  int i, j;
> +  int res0 = 1;
> +  float res1 = 1.0;
> +  double res2 = 1.0;
> +
> +  for (i = 0; i < 4; i++)
> +if (B0[i])
> +  res0 *= A0[i];
> +
> +  for (i = 0; i < 8; i++)
> +if (B1[i])
> +  res1 *= A1[i];
> +
> +  for (i = 0; i < 16; i++)
> +if (B2[i])
> +  res2 *= A2[i];
> +  /* check results:  */
> +  if (res0 != 63180 || res1 != 1043228160.00
> +  ||res2 != 3296728515318523101184.00)
> +  __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "vectorized 3 loops" "vect" { target
> +i?86-*-* x86_64-*-* } } } */
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc index
> 1c8e1a45234..bac29fb5574 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1739,6 +1739,7 @@ is_cond_scalar_reduction (gimple *phi, gimple
> **reduc, tree arg_0, tree arg_1,
> 
>if (reduction_op != PLUS_EXPR
>&& reduction_op != MINUS_EXPR
> +  && reduction_op != MULT_EXPR
>&& reduction_op != BIT_IOR_EXPR
>&& reduction_op != BIT_XOR_EXPR
>&& reduction_op != BIT_AND_EXPR)
> --
> 2.18.2



RE: [PATCH] x86: Handle V8BF in expand_vec_perm_broadcast_1

2022-09-02 Thread Kong, Lingling via Gcc-patches
Hi,

I fixed it in a new patch.  And added BF vector mode in SUBST_V and 
avx512fmaskhalfmode for @vec_interleave_high.
Ok for trunk ?

> > Hi,
> >
> > Handle E_V8BFmode in expand_vec_perm_broadcast_1 and
> ix86_expand_vector_init_duplicate.
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> > PR target/106742
> > * config/i386/i386-expand.cc (ix86_expand_vector_init_duplicate):
> > Handle V8BF mode.
> > (expand_vec_perm_broadcast_1): Ditto.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/pr106742.c: New test.
> > ---
> >  gcc/config/i386/i386-expand.cc   | 17 -
> >  gcc/testsuite/gcc.target/i386/pr106742.c | 10 ++
> >  2 files changed, 22 insertions(+), 5 deletions(-)  create mode 100644
> > gcc/testsuite/gcc.target/i386/pr106742.c
> >
> > diff --git a/gcc/config/i386/i386-expand.cc
> > b/gcc/config/i386/i386-expand.cc index 4b216308a18..a08222fe1b6 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -15030,11 +15030,15 @@ ix86_expand_vector_init_duplicate (bool
> mmx_ok, machine_mode mode,
> >   dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
> >   dperm.one_operand_p = true;
> >
> > - if (mode == V8HFmode)
> > + if (mode == V8HFmode || mode == V8BFmode)
> > {
> > - tmp1 = force_reg (HFmode, val);
> > + rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
> > + tmp1 = mode == V8HFmode ? force_reg (HFmode, val)
> > + : force_reg (BFmode, val);
> tmp1 = force_reg (GET_MODE_INNER (mode), val);
> >   tmp2 = gen_reg_rtx (mode);
> > - emit_insn (gen_vec_setv8hf_0 (tmp2, CONST0_RTX (mode), tmp1));
> > + gen_vec_set_0 = mode == V8HFmode ? gen_vec_setv8hf_0
> > +  : gen_vec_setv8bf_0;
> add @ to vec_set_0 as (define_insn "@vec_set_0" and pass
> mode to vec_set_0 as emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX
> (mode), tmp1));
> > + emit_insn (gen_vec_set_0 (tmp2, CONST0_RTX (mode),
> > + tmp1));
> 
> >   tmp1 = gen_lowpart (mode, tmp2);
> > }
> >   else
> > @@ -21822,17 +21826,20 @@ expand_vec_perm_broadcast_1 (struct
> expand_vec_perm_d *d)
> >return true;
> >
> >  case E_V8HFmode:
> > +case E_V8BFmode:
> >/* This can be implemented via interleave and pshufd.  */
> >if (d->testing_p)
> > return true;
> >
> >if (elt >= nelt2)
> > {
> > - gen = gen_vec_interleave_highv8hf;
> > + gen = vmode == V8HFmode ? gen_vec_interleave_highv8hf
> > + : gen_vec_interleave_highv8bf;
> Similar, add @ to define_insn and pass gen_vec_interleave.
> >   elt -= nelt2;
> > }
> >else
> > -   gen = gen_vec_interleave_lowv8hf;
> > +   gen = vmode == V8HFmode ? gen_vec_interleave_lowv8hf
> > +   : gen_vec_interleave_lowv8bf;
> >nelt2 /= 2;
> >
> >dest = gen_reg_rtx (vmode);
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106742.c
> > b/gcc/testsuite/gcc.target/i386/pr106742.c
> > new file mode 100644
> > index 000..4a53cd49902
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106742.c
> > @@ -0,0 +1,10 @@
> > +/* { dg-do compile } */
> > +/* { dg-options " -msse2 -mno-avx2 -O1" } */ typedef __bf16 v8bf
> > +__attribute__ ((__vector_size__ (16)));
> > +
> > +v8bf
> > +vec_init_dup_v8bf (__bf16 a1)
> > +{
> > +  return __extension__ (v8bf) { a1, a1, a1, a1, a1, a1, a1, a1 }; }
> > +/* { dg-final { scan-assembler-times "punpcklwd" 1} } */
> > --
> > 2.18.2
> >
> 
> 
> --
> BR,
> Hongtao


0001-x86-Handle-V8BF-in-expand_vec_perm_broadcast_1.patch
Description: 0001-x86-Handle-V8BF-in-expand_vec_perm_broadcast_1.patch


RE: [PATCH] Enhance final_value_replacement_loop to handle bitop with an invariant induction.[PR105735]

2022-09-13 Thread Kong, Lingling via Gcc-patches
Hi Richard,

Thank you so much for reviewing this patch.  I really appreciate it.  For these 
review comments, I have made some changes.

> That's a single-stmt match, you shouldn't use match.pd matching for this.
> Instead just do
> 
>   if (is_gimple_assign (stmt)
>   && ((code = gimple_assign_rhs_code (stmt)), true)
>   && (code == BIT_AND_EXPR || code == BIT_IOR_EXPR || code ==
> BIT_XOR_EXPR))

Yes, I fixed it and dropped modification for match.pd.

> and pick gimple_assign_rhs{1,2} (stmt) as the operands.  The :c in bit_op:c is
> redundant btw. - while the name suggests "with invariant" you don't actually
> check for that.  But again, given canonicalization rules the invariant will 
> be rhs2
> so above add
> 
> && TREE_CODE (gimple_assign_rhs2 (stmt)) == INTEGER_CST

For " with invariant", this needed op1 is invariant, and I used 
`expr_invariant_in_loop_p (loop, match_op[0])` for check.
And op2 just be PHI is ok. If op2 is INTEGER_CST, existing gcc can be directly 
optimized and do not need modification.

> you probably need dg-require-effective-target longlong, but is it necessary to
> use long long for the testcases in the first place?
> The IV seems to be unused, if it should match the variables bit size use 
> sizeof
> (type) * 8

Yes, it is not necessary to use long long for the testcases.  I changed the 
type to unsigned int.

> > +  inv = PHI_ARG_DEF_FROM_EDGE (header_phi, loop_preheader_edge
> > + (loop));  return fold_build2 (code1, type, inv, match_op[0]); }
> 
> The } goes to the next line.

Sorry, there might be something wrong with my use of the git send-email format.

> > +  tree bitinv_def;
> > +  if ((bitinv_def
> 
> please use else if here

Sorry, if I use `else if` here, there is no corresponding `if` above.  I'm not 
sure whether you mean changing the bitwise induction expression `if` to `else if`.

Do you agree with these changes?  Thanks again for taking a look.

Thanks,
Lingling

> -Original Message-
> From: Richard Biener 
> Sent: Tuesday, August 23, 2022 3:27 PM
> To: Kong, Lingling 
> Cc: Liu, Hongtao ; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH] Enhance final_value_replacement_loop to handle bitop
> with an invariant induction.[PR105735]
> 
> On Thu, Aug 18, 2022 at 8:48 AM Kong, Lingling via Gcc-patches  patc...@gcc.gnu.org> wrote:
> >
> > Hi,
> >
> > This patch is for pr105735/pr101991. It will enable below optimization:
> > {
> > -  long unsigned int bit;
> > -
> > -   [local count: 32534376]:
> > -
> > -   [local count: 1041207449]:
> > -  # tmp_10 = PHI 
> > -  # bit_12 = PHI 
> > -  tmp_7 = bit2_6(D) & tmp_10;
> > -  bit_8 = bit_12 + 1;
> > -  if (bit_8 != 32)
> > -goto ; [96.97%]
> > -  else
> > -goto ; [3.03%]
> > -
> > -   [local count: 1009658865]:
> > -  goto ; [100.00%]
> > -
> > -   [local count: 32534376]:
> > -  # tmp_11 = PHI 
> > -  return tmp_11;
> > +  tmp_11 = tmp_4(D) & bit2_6(D);
> > +  return tmp_11;
> >
> > }
> >
> > Ok for master ?
> >
> > gcc/ChangeLog:
> >
> > PR middle-end/105735
> > * match.pd (bitop_with_inv_p): New match.
> > * tree-scalar-evolution.cc (gimple_bitop_with_inv_p): Declare.
> > (analyze_and_compute_bitop_with_inv_effect): New function.
> > (final_value_replacement_loop): Enhanced to handle bitop
> > with inv induction.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/pr105735-1.c: New test.
> > * gcc.target/i386/pr105735-2.c: New test.
> > ---
> >  gcc/match.pd   |  4 +
> >  gcc/testsuite/gcc.target/i386/pr105735-1.c | 88 ++
> gcc/testsuite/gcc.target/i386/pr105735-2.c | 28 +++
> >  gcc/tree-scalar-evolution.cc   | 59 +++
> >  4 files changed, 179 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr105735-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr105735-2.c
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd index
> > 562138a8034..cfe593ebb02 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -8050,6 +8050,10 @@ and,
> >   (bit_not
> >(nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2))
> > @3
> >
> > +(for bit_op (bit_and bit_ior bit_xor)  (match (bitop_with_inv_p @0
> > +@1)
> > +  (bit_op:c @0 @1)))
> > +
> 
> That's a single-stmt match, you shouldn't use match.pd matching for this.
> Instead just d

[PATCH] i386: Support complex fma/conj_fma for _Float16.

2021-11-05 Thread Kong, Lingling via Gcc-patches
Hi,

This patch is to support cmla_optab, cmul_optab, cmla_conj_optab, 
cmul_conj_optab for vector _Float16.
Ok for master?

gcc/ChangeLog:

* config/i386/sse.md (cmul3): add new define_expand.
(cmla4): Likewise

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16-vector-complex-float.c: New test.
---
 gcc/config/i386/sse.md| 23 +++
 .../i386/avx512fp16-vector-complex-float.c| 40 +++
 2 files changed, 63 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/i386/avx512fp16-vector-complex-float.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 
0a7f5b178f9..8d3fef0a31a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5922,6 +5922,12 @@
 (UNSPEC_COMPLEX_FMUL "fmulc")
 (UNSPEC_COMPLEX_FCMUL "fcmulc")])
 
+(define_int_attr conj_op
+   [(UNSPEC_COMPLEX_FMA "")
+(UNSPEC_COMPLEX_FCMA "_conj")
+(UNSPEC_COMPLEX_FMUL "")
+(UNSPEC_COMPLEX_FCMUL "_conj")])
+
 (define_mode_attr complexmove
   [(V32HF "avx512f_loadv16sf")
(V16HF "avx512vl_loadv8sf")
@@ -6003,6 +6009,15 @@
   DONE;
 })
 
+(define_expand "cmla4"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+   (unspec:VF_AVX512FP16VL
+   [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+(match_operand:VF_AVX512FP16VL 2 "vector_operand")
+(match_operand:VF_AVX512FP16VL 3 "vector_operand")]
+UNSPEC_COMPLEX_F_C_MA))]
+  "TARGET_AVX512FP16")
+
 (define_insn "fma__"
   [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
(unspec:VF_AVX512FP16VL
@@ -6084,6 +6099,14 @@
   [(set_attr "type" "ssemuladd")
(set_attr "mode" "")])
 
+(define_expand "cmul3"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+   (unspec:VF_AVX512FP16VL
+ [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+  (match_operand:VF_AVX512FP16VL 2 "vector_operand")]
+  UNSPEC_COMPLEX_F_C_MUL))]
+  "TARGET_AVX512FP16")
+
 (define_insn "__"
   [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
  (unspec:VF_AVX512FP16VL
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vector-complex-float.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16-vector-complex-float.c
new file mode 100644
index 000..bcb957f0de0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vector-complex-float.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+/* { dg-final { scan-assembler-times "vfmaddcph\[ \\t\]" 1 } } */
+/* { dg-final { scan-assembler-not "vfmadd\[123]*ph\[ \\t\]"} } */
+/* { dg-final { scan-assembler-not "vfmadd\[123]*sh\[ \\t\]"} } */
+/* { dg-final { scan-assembler-times "vfcmaddcph\[ \\t\]" 1 } } */
+/* { dg-final { scan-assembler-times "vfmulcph\[ \\t\]" 1 } } */
+/* { dg-final { scan-assembler-times "vfcmulcph\[ \\t\]" 1 } } */
+
+#include
+#define TYPE _Float16
+#define N 16
+
+void fma0 (_Complex TYPE *a, _Complex TYPE *b,
+   _Complex TYPE *c)
+{
+  for (int i = 0; i < N; i++)
+c[i] += a[i] * b[i];
+}
+
+void fmaconj (_Complex TYPE a[restrict N], _Complex TYPE b[restrict N],
+ _Complex TYPE c[restrict N])
+{
+  for (int i = 0; i < N; i++)
+c[i] += a[i] * ~b[i];
+}
+
+void fmul (_Complex TYPE a[restrict N], _Complex TYPE b[restrict N],
+  _Complex TYPE c[restrict N])
+{
+  for (int i = 0; i < N; i++)
+c[i] = a[i] * b[i];
+}
+
+void fmulconj (_Complex TYPE a[restrict N], _Complex TYPE b[restrict N],
+  _Complex TYPE c[restrict N])
+{
+  for (int i = 0; i < N; i++)
+c[i] = a[i] * ~b[i];
+}
--
2.18.1



[PATCH] i386: Optimization for mm512_set1_pch.

2021-11-05 Thread Kong, Lingling via Gcc-patches
Hi,

This patch folds _mm512_fmadd_pch (a, _mm512_set1_pch(*(b)), c) into a single 
instruction: vfmaddcph (%rsp){1to16}, %zmm1, %zmm2.
OK for master?

gcc/ChangeLog:

* config/i386/sse.md (fma___pair):
Add new define_insn.
(fma__fmaddc_bcst): Add new define_insn_and_split.
(fma__fcmaddc_bcst): Likewise

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16vl-complex-broadcast-1.c: New test.
---
 gcc/config/i386/sse.md| 62 +++
 .../i386/avx512fp16vl-complex-broadcast-1.c   | 25 
 2 files changed, 87 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 
0a7f5b178f9..eba8e77515f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -193,7 +193,9 @@
 
   ;; For AVX512FP16 suppport
   UNSPEC_COMPLEX_FMA
+  UNSPEC_COMPLEX_FMA_PAIR
   UNSPEC_COMPLEX_FCMA
+  UNSPEC_COMPLEX_FCMA_PAIR
   UNSPEC_COMPLEX_FMUL
   UNSPEC_COMPLEX_FCMUL
   UNSPEC_COMPLEX_MASK
@@ -5913,6 +5915,9 @@
 (define_int_iterator UNSPEC_COMPLEX_F_C_MA
[UNSPEC_COMPLEX_FMA UNSPEC_COMPLEX_FCMA])
 
+(define_int_iterator UNSPEC_COMPLEX_F_C_MA_PAIR
+   [UNSPEC_COMPLEX_FMA_PAIR UNSPEC_COMPLEX_FCMA_PAIR])
+
 (define_int_iterator UNSPEC_COMPLEX_F_C_MUL
[UNSPEC_COMPLEX_FMUL UNSPEC_COMPLEX_FCMUL])
 
@@ -5922,6 +5927,10 @@
 (UNSPEC_COMPLEX_FMUL "fmulc")
 (UNSPEC_COMPLEX_FCMUL "fcmulc")])
 
+(define_int_attr complexpairopname
+   [(UNSPEC_COMPLEX_FMA_PAIR "fmaddc")
+(UNSPEC_COMPLEX_FCMA_PAIR "fcmaddc")])
+
 (define_mode_attr complexmove
   [(V32HF "avx512f_loadv16sf")
(V16HF "avx512vl_loadv8sf")
@@ -6067,6 +6076,59 @@
  [(match_dup 1) (match_dup 2) (match_dup 4)]
   UNSPEC_COMPLEX_F_C_MA))])
 
+(define_insn "fma___pair"
+ [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=&v")
+   (unspec:VF1_AVX512VL
+[(match_operand:VF1_AVX512VL 1 "vector_operand" "%v")
+ (match_operand:VF1_AVX512VL 2 "bcst_vector_operand" "vmBr")
+ (match_operand:VF1_AVX512VL 3 "vector_operand" "0")]
+ UNSPEC_COMPLEX_F_C_MA_PAIR))]
+ "TARGET_AVX512FP16"
+ "vph\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "type" "ssemuladd")])
+
+(define_insn_and_split "fma__fmaddc_bcst"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+   (unspec:VF_AVX512FP16VL
+ [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+  (subreg:VF_AVX512FP16VL
+(match_operand: 2 "bcst_vector_operand") 0)
+  (match_operand:VF_AVX512FP16VL 3 "vector_operand")]
+  UNSPEC_COMPLEX_FMA))]
+  "TARGET_AVX512FP16"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (unspec:
+ [(match_dup 1) (match_dup 2) (match_dup 3)]
+  UNSPEC_COMPLEX_FMA_PAIR))]
+  {
+operands[0] = lowpart_subreg (mode, operands[0], mode);
+operands[1] = lowpart_subreg (mode, operands[1], mode);
+operands[3] = lowpart_subreg (mode, operands[3], 
+mode);
+  })
+
+(define_insn_and_split "fma__fcmaddc_bcst"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+   (unspec:VF_AVX512FP16VL
+ [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+  (subreg:VF_AVX512FP16VL
+(match_operand: 2 "bcst_vector_operand") 0)
+  (match_operand:VF_AVX512FP16VL 3 "vector_operand")]
+  UNSPEC_COMPLEX_FCMA))]
+  "TARGET_AVX512FP16"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (unspec:
+ [(match_dup 1) (match_dup 2) (match_dup 3)]
+  UNSPEC_COMPLEX_FCMA_PAIR))]
+  {
+operands[0] = lowpart_subreg (mode, operands[0], mode);
+operands[1] = lowpart_subreg (mode, operands[1], mode);
+operands[3] = lowpart_subreg (mode, operands[3], 
+mode);
+  })
+
 (define_insn "___mask"
   [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
(vec_merge:VF_AVX512FP16VL
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c 
b/gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c
new file mode 100644
index 000..3c8e84230f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 2 } }  */
+
+#include 
+
+volatile __m512h res0, a0, c0;
+volatile __m256h res1, a1, c1;
+volatile __m128h res2, a2, c2;
+volatile _Float16 *b;
+
+void extern
+avx_test(void)
+{
+  res0 = _mm512_fmadd_pch (a0, _mm512_set1_pch(*(b + 2 * 6)), c0);
+  res0 = _mm512_fcmadd_pch (a0, _mm512_set1_pch(*(b + 2 * 6)), c0);
+
+  res1 = _mm256_fmadd_pch (a1, _mm256_set1_pch(*(b + 2 * 6)), c1);
+  res1 = _mm256_fcmadd_pch (a1, _mm256_set1_pch(*(b + 2 * 6)), c1);
+
+  res2 =  _mm_f

[PATCH] i386: vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with -mf16c [PR 102811]

2021-11-16 Thread Kong, Lingling via Gcc-patches
Hi,

vcvtph2ps and vcvtps2ph should be used to convert _Float16 to SFmode with 
-mf16c, so I added define_insn patterns extendhfsf2 and truncsfhf2 for TARGET_F16C.

OK for master?

gcc/ChangeLog:

PR target/102811
* config/i386/i386.md (extendhfsf2): Add extendhfsf2 for f16c.
(extendhfdf2): Split extendhf2 into separate extendhfsf2, 
extendhfdf2.
(truncsfhf2): Likewise.
(truncdfhf2): Likewise.

gcc/testsuite/ChangeLog:

PR target/102811
* gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c: New test.
---
 gcc/config/i386/i386.md   | 48 +++
 .../i386/avx512vl-vcvtps2ph-pr102811.c| 10 
 2 files changed, 49 insertions(+), 9 deletions(-)  create mode 100644 
gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 
6eb9de81921..c5415475342 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4574,15 +4574,30 @@
   emit_move_insn (operands[0], CONST0_RTX (V2DFmode));
 })
 
-(define_insn "extendhf2"
-  [(set (match_operand:MODEF 0 "nonimm_ssenomem_operand" "=v")
-(float_extend:MODEF
+(define_insn "extendhfsf2"
+  [(set (match_operand:SF 0 "register_operand" "=v")
+   (float_extend:SF
+ (match_operand:HF 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512FP16 || TARGET_F16C || TARGET_AVX512VL"
+{
+  if (TARGET_AVX512FP16)
+return "vcvtsh2ss\t{%1, %0, %0|%0, %0, %1}";
+  else
+return "vcvtph2ps\t{%1, %0|%0, %1}"; }
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "maybe_evex")
+   (set_attr "mode" "SF")])
+
+(define_insn "extendhfdf2"
+  [(set (match_operand:DF 0 "nonimm_ssenomem_operand" "=v")
+   (float_extend:DF
  (match_operand:HF 1 "nonimmediate_operand" "vm")))]
   "TARGET_AVX512FP16"
-  "vcvtsh2\t{%1, %0, %0|%0, %0, %1}"
+  "vcvtsh2sd\t{%1, %0, %0|%0, %0, %1}"
   [(set_attr "type" "ssecvt")
(set_attr "prefix" "evex")
-   (set_attr "mode" "")])
+   (set_attr "mode" "DF")])
 
 
 (define_expand "extendxf2"
@@ -4766,12 +4781,27 @@
 
 ;; Conversion from {SF,DF}mode to HFmode.
 
-(define_insn "trunchf2"
+(define_insn "truncsfhf2"
+  [(set (match_operand:HF 0 "register_operand" "=v")
+   (float_truncate:HF
+ (match_operand:SF 1 "nonimmediate_operand" "vm")))]
+  "TARGET_AVX512FP16 || TARGET_F16C || TARGET_AVX512VL"
+  {
+if (TARGET_AVX512FP16)
+  return "vcvtss2sh\t{%1, %d0|%d0, %1}";
+else
+  return "vcvtps2ph\t{0, %1, %0|%0, %1, 0}";
+  }
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "HF")])
+
+(define_insn "truncdfhf2"
   [(set (match_operand:HF 0 "register_operand" "=v")
-   (float_truncate:HF
- (match_operand:MODEF 1 "nonimmediate_operand" "vm")))]
+   (float_truncate:HF
+ (match_operand:DF 1 "nonimmediate_operand" "vm")))]
   "TARGET_AVX512FP16"
-  "vcvt2sh\t{%1, %d0|%d0, %1}"
+  "vcvtsd2sh\t{%1, %d0|%d0, %1}"
   [(set_attr "type" "ssecvt")
(set_attr "prefix" "evex")
(set_attr "mode" "HF")])
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c 
b/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c
new file mode 100644
index 000..ab44a304a03
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vcvtps2ph-pr102811.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mf16c -mno-avx512fp16" } */
+/* { dg-final { scan-assembler-times "vcvtph2ps\[ \\t\]" 2 } } */
+/* { dg-final { scan-assembler-times "vcvtps2ph\[ \\t\]" 1 } } */
+/* { dg-final { scan-assembler-not "__truncsfhf2\[ \\t\]"} } */
+/* { dg-final { scan-assembler-not "__extendhfsf2\[ \\t\]"} } */
+_Float16 test (_Float16 a, _Float16 b)
+{
+  return a + b;
+}
--
2.18.1



[PATCH] i386: add alias for f*mul_*ch intrinsics

2021-11-16 Thread Kong, Lingling via Gcc-patches
Hi,

This patch adds aliases for the f*mul_*ch intrinsics.

Ok for master?

gcc/ChangeLog:

* config/i386/avx512fp16intrin.h (_mm512_mul_pch): Add alias for 
_mm512_fmul_pch.
(_mm512_mask_mul_pch): Likewise.
(_mm512_maskz_mul_pch): Likewise.
(_mm512_mul_round_pch): Likewise.
(_mm512_mask_mul_round_pch): Likewise.
(_mm512_maskz_mul_round_pch): Likewise.
(_mm512_cmul_pch): Likewise.
(_mm512_mask_cmul_pch): Likewise.
(_mm512_maskz_cmul_pch): Likewise.
(_mm512_cmul_round_pch): Likewise.
(_mm512_mask_cmul_round_pch): Likewise.
(_mm512_maskz_cmul_round_pch): Likewise.
(_mm_mul_sch): Likewise.
(_mm_mask_mul_sch): Likewise.
(_mm_maskz_mul_sch): Likewise.
(_mm_mul_round_sch): Likewise.
(_mm_mask_mul_round_sch): Likewise.
(_mm_maskz_mul_round_sch): Likewise.
(_mm_cmul_sch): Likewise.
(_mm_mask_cmul_sch): Likewise.
(_mm_maskz_cmul_sch): Likewise.
(_mm_cmul_round_sch): Likewise.
(_mm_mask_cmul_round_sch): Likewise.
(_mm_maskz_cmul_round_sch): Likewise.
* config/i386/avx512fp16vlintrin.h (_mm_mul_pch): Likewise.
(_mm_mask_mul_pch): Likewise.
(_mm_maskz_mul_pch): Likewise.
(_mm256_mul_pch): Likewise.
(_mm256_mask_mul_pch): Likewise.
(_mm256_maskz_mul_pch): Likewise.
(_mm_cmul_pch): Likewise.
(_mm_mask_cmul_pch): Likewise.
(_mm_maskz_cmul_pch): Likewise.
(_mm256_cmul_pch): Likewise.
(_mm256_mask_cmul_pch): Likewise.
(_mm256_maskz_cmul_pch): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16-vfcmulcph-1a.c: Add new test for alias.
* gcc.target/i386/avx512fp16-vfcmulcsh-1a.c: Likewise.
* gcc.target/i386/avx512fp16-vfmulcph-1a.c: Likewise.
* gcc.target/i386/avx512fp16-vfmulcsh-1a.c: Likewise.
* gcc.target/i386/avx512fp16vl-vfcmulcph-1a.c: Likewise.
* gcc.target/i386/avx512fp16vl-vfmulcph-1a.c: Likewise.
---
 gcc/config/i386/avx512fp16intrin.h| 39 +++
 gcc/config/i386/avx512fp16vlintrin.h  | 17 
 .../gcc.target/i386/avx512fp16-vfcmulcph-1a.c | 19 ++---  
.../gcc.target/i386/avx512fp16-vfcmulcsh-1a.c | 19 ++---  
.../gcc.target/i386/avx512fp16-vfmulcph-1a.c  | 19 ++---  
.../gcc.target/i386/avx512fp16-vfmulcsh-1a.c  | 19 ++---
 .../i386/avx512fp16vl-vfcmulcph-1a.c  | 20 +++---
 .../i386/avx512fp16vl-vfmulcph-1a.c   | 20 +++---
 8 files changed, 136 insertions(+), 36 deletions(-)

diff --git a/gcc/config/i386/avx512fp16intrin.h 
b/gcc/config/i386/avx512fp16intrin.h
index 44c5e24f234..fe73e693897 100644
--- a/gcc/config/i386/avx512fp16intrin.h
+++ b/gcc/config/i386/avx512fp16intrin.h
@@ -7162,6 +7162,45 @@ _mm512_set1_pch (_Float16 _Complex __A)
   return (__m512h) _mm512_set1_ps (u.b);  }
 
+// intrinsics below are alias for f*mul_*ch #define _mm512_mul_pch(A, 
+B) _mm512_fmul_pch ((A), (B))
+#define _mm512_mask_mul_pch(W, U, A, B)  \
+  _mm512_mask_fmul_pch ((W), (U), (A), (B)) #define 
+_mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch ((U), (A), (B)) 
+#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch ((A), (B), (R))
+#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
+  _mm512_mask_fmul_round_pch ((W), (U), (A), (B), (R))
+#define _mm512_maskz_mul_round_pch(U, A, B, R)   \
+  _mm512_maskz_fmul_round_pch ((U), (A), (B), (R))
+
+#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch ((A), (B))
+#define _mm512_mask_cmul_pch(W, U, A, B) \
+  _mm512_mask_fcmul_pch ((W), (U), (A), (B)) #define 
+_mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch ((U), (A), (B)) 
+#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch ((A), (B), (R))
+#define _mm512_mask_cmul_round_pch(W, U, A, B, R)\
+  _mm512_mask_fcmul_round_pch ((W), (U), (A), (B), (R))
+#define _mm512_maskz_cmul_round_pch(U, A, B, R)  \
+  _mm512_maskz_fcmul_round_pch ((U), (A), (B), (R))
+
+#define _mm_mul_sch(A, B) _mm_fmul_sch ((A), (B)) #define 
+_mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch ((W), (U), (A), (B)) 
+#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch ((U), (A), (B)) 
+#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch ((A), (B), (R))
+#define _mm_mask_mul_round_sch(W, U, A, B, R)\
+  _mm_mask_fmul_round_sch ((W), (U), (A), (B), (R))
+#define _mm_maskz_mul_round_sch(U, A, B, R)  \
+  _mm_maskz_fmul_round_sch ((U), (A), (B), (R))
+
+#define _mm_cmul_sch(A, B) _mm_fcmul_sch ((A), (B)) #define 
+_mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch ((W), (U), (A), (B)) 
+#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch ((U), (A), (B)) 
+#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch ((A), (B), (R))
+#define

  1   2   >