On Mon, Apr 21, 2025 at 9:38 PM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> On Mon, Apr 21, 2025 at 6:34 PM Jan Hubicka <hubi...@ucw.cz> wrote:
> ...
> > We originally put CLEAR_RATIO < MOVE_RATIO based on observation that
> >   mov $0, mem
> > is longer in encoding than
> >   mov mem, mem
> > and there was a plan to implement optimization to avoid long immediates
> > in moves, but it did not materialize (yet).  With SSE this problem
> > disappears since SSE stores does not have immediates anyway.
>
> Here is a patch to implement it with UNSPEC_STORE_BY_PIECES.
> How does it look?
>

Here is the v2 patch with tests.

-- 
H.J.
From a71175839703460210aa4fb4f036469a7cf66b7b Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Apr 2025 21:12:35 +0800
Subject: [PATCH v2] RFC: Add TARGET_STORE_BY_PIECES_ICODE

Add a target hook to control the instruction to move the memory used by
the store by_pieces infrastructure so that a target can choose a specific
instruction for shorter encoding.

Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
---
 gcc/config/i386/i386.cc                       | 23 +++++++++++++++++++
 gcc/config/i386/i386.md                       | 23 +++++++++++++++++++
 gcc/config/i386/x86-tune.def                  |  6 +++++
 gcc/doc/tm.texi                               |  5 ++++
 gcc/doc/tm.texi.in                            |  2 ++
 gcc/expr.cc                                   |  2 +-
 gcc/target.def                                |  7 ++++++
 gcc/targhooks.cc                              |  9 ++++++++
 gcc/targhooks.h                               |  2 ++
 .../gcc.target/i386/memset-strategy-10.c      |  9 ++++++++
 .../gcc.target/i386/memset-strategy-11.c      | 10 ++++++++
 .../gcc.target/i386/memset-strategy-12.c      |  9 ++++++++
 .../gcc.target/i386/memset-strategy-13.c      | 10 ++++++++
 .../gcc.target/i386/memset-strategy-14.c      | 10 ++++++++
 .../gcc.target/i386/memset-strategy-15.c      | 10 ++++++++
 .../gcc.target/i386/memset-strategy-16.c      | 10 ++++++++
 16 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-16.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d15f91ddd2c..830f156ce28 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -26542,6 +26542,27 @@ ix86_redzone_clobber ()
   return NULL_RTX;
 }
 
+/* Implement TARGET_STORE_BY_PIECES_ICODE.  */
+
+static insn_code
+ix86_store_by_pieces_icode (machine_mode mode,
+			    unsigned HOST_WIDE_INT length)
+{
+  /* Avoid word moves with 32-bit immediate if it will be used more
+     than twice for shorter encoding.  If the remaining bytes are
+     greater than the half word, the previous word can be reused.  */
+  if (mode == word_mode
+      && ((length / UNITS_PER_WORD) > 3
+	  || (length > (UNITS_PER_WORD * 2)
+	      && (length % UNITS_PER_WORD) > UNITS_PER_WORD / 2))
+      && ix86_tune_features [X86_TUNE_USE_REGISTER_STORE_BY_PIECES])
+    return (word_mode == DImode
+	    ? CODE_FOR_store_by_pieces_movdi
+	    : CODE_FOR_store_by_pieces_movsi);
+
+  return default_store_by_pieces_icode (mode, length);
+}
+
 /* Target-specific selftests.  */
 
 #if CHECKING_P
@@ -26987,6 +27008,8 @@ static const scoped_attribute_specs *const ix86_attribute_table[] =
 
 #undef TARGET_OVERLAP_OP_BY_PIECES_P
 #define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true
+#undef TARGET_STORE_BY_PIECES_ICODE
+#define TARGET_STORE_BY_PIECES_ICODE ix86_store_by_pieces_icode
 
 #undef TARGET_FLAGS_REGNUM
 #define TARGET_FLAGS_REGNUM FLAGS_REG
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index d6b2f2959b2..f99e155ec3b 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -118,6 +118,7 @@ (define_c_enum "unspec" [
   UNSPEC_POPFL
   UNSPEC_OPTCOMX
   UNSPEC_SETCC_SI_SLP
+  UNSPEC_STORE_BY_PIECES
 
   ;; For SSE/MMX support:
   UNSPEC_FIX_NOTRUNC
@@ -2417,6 +2418,28 @@ (define_expand "mov<mode>"
   ""
   "ix86_expand_move (<MODE>mode, operands); DONE;")
 
+;; SI/DI mode register stores used by store by_pieces for shorter
+;; encoding.
+(define_expand "store_by_pieces_mov<mode>"
+  [(set (match_operand:W 0 "memory_operand")
+        (match_operand:W 1 "general_operand"))]
+  ""
+{
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+  emit_insn (gen_store_by_pieces_mov<mode>_1 (operands[0],
+                                              operands[1]));
+  DONE;
+})
+
+(define_insn "store_by_pieces_mov<mode>_1"
+  [(set (match_operand:W 0 "memory_operand" "=m")
+        (unspec:W [(match_operand:W 1 "register_operand" "r")]
+                  UNSPEC_STORE_BY_PIECES))]
+  ""
+  "mov<W:imodesuffix>\t{%1, %0|%0, %1}"
+  [(set_attr "type" "imov")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn "*mov<mode>_xor"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
 	(match_operand:SWI48 1 "const0_operand"))
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index c857e769b60..6ffeab3bb15 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -636,6 +636,12 @@ DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
 DEF_TUNE (X86_TUNE_AVX512_TWO_EPILOGUES, "avx512_two_epilogues",
 	  m_ZNVER4 | m_ZNVER5)
 
+/* X86_TUNE_USE_REGISTER_STORE_BY_PIECES: Generate store_by_pieces with
+   word register store for shorter encoding.  */
+DEF_TUNE (X86_TUNE_USE_REGISTER_STORE_BY_PIECES,
+	  "use_register_store_by_pieces",
+	  HOST_WIDE_INT_M1U)
+
 /*****************************************************************************/
 /*****************************************************************************/
 /* Historical relics: tuning flags that helps a specific old CPU designs     */
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index a96700c0d38..0061b2e2dd0 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -7186,6 +7186,11 @@ particular mode from being used for block comparisons by returning a
 negative number from this hook.
 @end deftypefn
 
+@deftypefn {Target Hook} insn_code TARGET_STORE_BY_PIECES_ICODE (machine_mode @var{mode}, unsigned HOST_WIDE_INT @var{length})
+This target hook returns insn_code to move up to @var{length} bytes in
+@var{mode} used by the store @code{by_pieces} infrastructure.
+@end deftypefn
+
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
 a load or store used to copy memory is.  Defaults to @code{MOVE_MAX}.
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index eccc4d88493..e8cd831ad32 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4639,6 +4639,8 @@ If you don't define this, a reasonable default is used.
 
 @hook TARGET_COMPARE_BY_PIECES_BRANCH_RATIO
 
+@hook TARGET_STORE_BY_PIECES_ICODE
+
 @defmac MOVE_MAX_PIECES
 A C expression used by @code{move_by_pieces} to determine the largest unit
 a load or store used to copy memory is.  Defaults to @code{MOVE_MAX}.
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 3815c565e2d..2b56b1bf983 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -1714,7 +1714,7 @@ class store_by_pieces_d : public op_by_pieces_d
 bool
 store_by_pieces_d::prepare_mode (machine_mode mode, unsigned int align)
 {
-  insn_code icode = optab_handler (mov_optab, mode);
+  insn_code icode = targetm.store_by_pieces_icode (mode, m_len);
   m_gen_fun = GEN_FCN (icode);
   return icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode);
 }
diff --git a/gcc/target.def b/gcc/target.def
index 6c7cdc8126b..bfe33c4a614 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -3909,6 +3909,13 @@ negative number from this hook.",
  int, (machine_mode mode),
  default_compare_by_pieces_branch_ratio)
 
+DEFHOOK
+(store_by_pieces_icode,
+ "This target hook returns insn_code to move up to @var{length} bytes in\n\
+@var{mode} used by the store @code{by_pieces} infrastructure.",
+ insn_code, (machine_mode mode, unsigned HOST_WIDE_INT length),
+ default_store_by_pieces_icode)
+
 DEFHOOK
 (slow_unaligned_access,
  "This hook returns true if memory accesses described by the\n\
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index c79458e374e..918e7b564a0 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -2196,6 +2196,15 @@ default_compare_by_pieces_branch_ratio (machine_mode)
   return 1;
 }
 
+/* This target hook returns insn_code to move the MODE memory used by the
+   store by_pieces infrastructure.  */
+
+insn_code
+default_store_by_pieces_icode (machine_mode mode, unsigned HOST_WIDE_INT)
+{
+  return optab_handler (mov_optab, mode);
+}
+
 /* Write PATCH_AREA_SIZE NOPs into the asm outfile FILE around a function
    entry.  If RECORD_P is true and the target supports named sections,
    the location of the NOPs will be recorded in a special object section
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index f16b58798c2..c04a47c5897 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -249,6 +249,8 @@ extern bool default_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT,
 						    enum by_pieces_operation,
 						    bool);
 extern int default_compare_by_pieces_branch_ratio (machine_mode);
+extern insn_code default_store_by_pieces_icode (machine_mode,
+						unsigned HOST_WIDE_INT);
 
 extern void default_print_patchable_function_entry (FILE *,
 						    unsigned HOST_WIDE_INT,
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-10.c b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c
new file mode 100644
index 00000000000..1ac39df097b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=tigerlake -mno-sse" } */
+/* { dg-final { scan-assembler-not "mov\[lq\]?\[\\t \]*\\$\[0\]," } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 23);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-11.c b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c
new file mode 100644
index 00000000000..67080e22fde
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=tigerlake -mno-sse" } */
+/* { dg-final { scan-assembler-not "mov\[lq\]?\[\\t \]*\\$\[0\]," { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]*\\$\[0\]," 2 { target { ! ia32 } } } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 17);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-12.c b/gcc/testsuite/gcc.target/i386/memset-strategy-12.c
new file mode 100644
index 00000000000..bdd06aa685b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-12.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=tigerlake -mno-sse" } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]*\\$\[0\]," 2 } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 7);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-13.c b/gcc/testsuite/gcc.target/i386/memset-strategy-13.c
new file mode 100644
index 00000000000..3ba6218e2c7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-13.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=tigerlake -mno-sse" } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]*%e" 5 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]*%r" 3 { target { ! ia32 } } } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 21);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-14.c b/gcc/testsuite/gcc.target/i386/memset-strategy-14.c
new file mode 100644
index 00000000000..26e38023cc5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-14.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=tigerlake -mno-sse" } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]*%e" 5 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]*\\$\[0\]," 2 { target { ! ia32 } } } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 20);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-15.c b/gcc/testsuite/gcc.target/i386/memset-strategy-15.c
new file mode 100644
index 00000000000..adb23cdcb5f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-15.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=tigerlake -mno-sse" } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]*\\$\[0\]," 2 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]*\\$\[0\]," 1 { target { ! ia32 } } } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 10);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-16.c b/gcc/testsuite/gcc.target/i386/memset-strategy-16.c
new file mode 100644
index 00000000000..afedfac2b25
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-16.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=tigerlake -mno-sse" } */
+/* { dg-final { scan-assembler-times "movl\[\\t \]*\\%e" 3 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movq\[\\t \]*\\$\[0\]," 1 { target { ! ia32 } } } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 11);
+}
-- 
2.49.0

Reply via email to