On Mon, Apr 21, 2025 at 9:38 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > On Mon, Apr 21, 2025 at 6:34 PM Jan Hubicka <hubi...@ucw.cz> wrote: > ... > > We originally put CLEAR_RATIO < MOVE_RATIO based on observation that > > mov $0, mem > > is longer in encoding than > > mov mem, mem > > and there was a plan to implement optimization to avoid long immediates > > in moves, but it did not materialize (yet). With SSE this problem > > disappears since SSE stores does not have immediates anyway. > > Here is a patch to implement it with UNSPEC_STORE_BY_PIECES. > How does it look? >
Here is the v2 patch with tests. -- H.J.
From a71175839703460210aa4fb4f036469a7cf66b7b Mon Sep 17 00:00:00 2001 From: "H.J. Lu" <hjl.tools@gmail.com> Date: Mon, 21 Apr 2025 21:12:35 +0800 Subject: [PATCH v2] RFC: Add TARGET_STORE_BY_PIECES_ICODE Add a target hook to control the instruction to move the memory used by the store by_pieces infrastructure so that a target can choose a specific instruction for shorter encoding. Signed-off-by: H.J. Lu <hjl.tools@gmail.com> --- gcc/config/i386/i386.cc | 23 +++++++++++++++++++ gcc/config/i386/i386.md | 23 +++++++++++++++++++ gcc/config/i386/x86-tune.def | 6 +++++ gcc/doc/tm.texi | 5 ++++ gcc/doc/tm.texi.in | 2 ++ gcc/expr.cc | 2 +- gcc/target.def | 7 ++++++ gcc/targhooks.cc | 9 ++++++++ gcc/targhooks.h | 2 ++ .../gcc.target/i386/memset-strategy-10.c | 9 ++++++++ .../gcc.target/i386/memset-strategy-11.c | 10 ++++++++ .../gcc.target/i386/memset-strategy-12.c | 9 ++++++++ .../gcc.target/i386/memset-strategy-13.c | 10 ++++++++ .../gcc.target/i386/memset-strategy-14.c | 10 ++++++++ .../gcc.target/i386/memset-strategy-15.c | 10 ++++++++ .../gcc.target/i386/memset-strategy-16.c | 10 ++++++++ 16 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-10.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-11.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-12.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-13.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-14.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-15.c create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-16.c diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index d15f91ddd2c..830f156ce28 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -26542,6 +26542,27 @@ ix86_redzone_clobber () return NULL_RTX; } +/* Implement TARGET_STORE_BY_PIECES_ICODE. */ + +static insn_code +ix86_store_by_pieces_icode (machine_mode mode, + unsigned HOST_WIDE_INT length) +{ + /* Avoid word moves with 32-bit immediate if it will be used more + than twice for shorter encoding. If the remaining bytes are + greater than the half word, the previous word can be reused. */ + if (mode == word_mode + && ((length / UNITS_PER_WORD) > 3 + || (length > (UNITS_PER_WORD * 2) + && (length % UNITS_PER_WORD) > UNITS_PER_WORD / 2)) + && ix86_tune_features [X86_TUNE_USE_REGISTER_STORE_BY_PIECES]) + return (word_mode == DImode + ? CODE_FOR_store_by_pieces_movdi + : CODE_FOR_store_by_pieces_movsi); + + return default_store_by_pieces_icode (mode, length); +} + /* Target-specific selftests. */ #if CHECKING_P @@ -26987,6 +27008,8 @@ static const scoped_attribute_specs *const ix86_attribute_table[] = #undef TARGET_OVERLAP_OP_BY_PIECES_P #define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true +#undef TARGET_STORE_BY_PIECES_ICODE +#define TARGET_STORE_BY_PIECES_ICODE ix86_store_by_pieces_icode #undef TARGET_FLAGS_REGNUM #define TARGET_FLAGS_REGNUM FLAGS_REG diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index d6b2f2959b2..f99e155ec3b 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -118,6 +118,7 @@ (define_c_enum "unspec" [ UNSPEC_POPFL UNSPEC_OPTCOMX UNSPEC_SETCC_SI_SLP + UNSPEC_STORE_BY_PIECES ;; For SSE/MMX support: UNSPEC_FIX_NOTRUNC @@ -2417,6 +2418,28 @@ (define_expand "mov<mode>" "" "ix86_expand_move (<MODE>mode, operands); DONE;") +;; SI/DI mode register stores used by store by_pieces for shorter +;; encoding. +(define_expand "store_by_pieces_mov<mode>" + [(set (match_operand:W 0 "memory_operand") + (match_operand:W 1 "general_operand"))] + "" +{ + operands[1] = force_reg (<MODE>mode, operands[1]); + emit_insn (gen_store_by_pieces_mov<mode>_1 (operands[0], + operands[1])); + DONE; +}) + +(define_insn "store_by_pieces_mov<mode>_1" + [(set (match_operand:W 0 "memory_operand" "=m") + (unspec:W [(match_operand:W 1 "register_operand" "r")] + UNSPEC_STORE_BY_PIECES))] + "" + "mov<W:imodesuffix>\t{%1, %0|%0, %1}" + [(set_attr "type" "imov") + (set_attr "mode" "<MODE>")]) + (define_insn "*mov<mode>_xor" [(set (match_operand:SWI48 0 "register_operand" "=r") (match_operand:SWI48 1 "const0_operand")) diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index c857e769b60..6ffeab3bb15 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -636,6 +636,12 @@ DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces", DEF_TUNE (X86_TUNE_AVX512_TWO_EPILOGUES, "avx512_two_epilogues", m_ZNVER4 | m_ZNVER5) +/* X86_TUNE_USE_REGISTER_STORE_BY_PIECES: Generate store_by_pieces with + word register store for shorter encoding. */ +DEF_TUNE (X86_TUNE_USE_REGISTER_STORE_BY_PIECES, + "use_register_store_by_pieces", + HOST_WIDE_INT_M1U) + /*****************************************************************************/ /*****************************************************************************/ /* Historical relics: tuning flags that helps a specific old CPU designs */ diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index a96700c0d38..0061b2e2dd0 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -7186,6 +7186,11 @@ particular mode from being used for block comparisons by returning a negative number from this hook. @end deftypefn +@deftypefn {Target Hook} insn_code TARGET_STORE_BY_PIECES_ICODE (machine_mode @var{mode}, unsigned HOST_WIDE_INT @var{length}) +This target hook returns insn_code to move up to @var{length} bytes in +@var{mode} used by the store @code{by_pieces} infrastructure. +@end deftypefn + @defmac MOVE_MAX_PIECES A C expression used by @code{move_by_pieces} to determine the largest unit a load or store used to copy memory is. Defaults to @code{MOVE_MAX}. diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index eccc4d88493..e8cd831ad32 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4639,6 +4639,8 @@ If you don't define this, a reasonable default is used. @hook TARGET_COMPARE_BY_PIECES_BRANCH_RATIO +@hook TARGET_STORE_BY_PIECES_ICODE + @defmac MOVE_MAX_PIECES A C expression used by @code{move_by_pieces} to determine the largest unit a load or store used to copy memory is. Defaults to @code{MOVE_MAX}. diff --git a/gcc/expr.cc b/gcc/expr.cc index 3815c565e2d..2b56b1bf983 100644 --- a/gcc/expr.cc +++ b/gcc/expr.cc @@ -1714,7 +1714,7 @@ class store_by_pieces_d : public op_by_pieces_d bool store_by_pieces_d::prepare_mode (machine_mode mode, unsigned int align) { - insn_code icode = optab_handler (mov_optab, mode); + insn_code icode = targetm.store_by_pieces_icode (mode, m_len); m_gen_fun = GEN_FCN (icode); return icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode); } diff --git a/gcc/target.def b/gcc/target.def index 6c7cdc8126b..bfe33c4a614 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -3909,6 +3909,13 @@ negative number from this hook.", int, (machine_mode mode), default_compare_by_pieces_branch_ratio) +DEFHOOK +(store_by_pieces_icode, + "This target hook returns insn_code to move up to @var{length} bytes in\n\ +@var{mode} used by the store @code{by_pieces} infrastructure.", + insn_code, (machine_mode mode, unsigned HOST_WIDE_INT length), + default_store_by_pieces_icode) + DEFHOOK (slow_unaligned_access, "This hook returns true if memory accesses described by the\n\ diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc index c79458e374e..918e7b564a0 100644 --- a/gcc/targhooks.cc +++ b/gcc/targhooks.cc @@ -2196,6 +2196,15 @@ default_compare_by_pieces_branch_ratio (machine_mode) return 1; } +/* This target hook returns insn_code to move the MODE memory used by the + store by_pieces infrastructure. */ + +insn_code +default_store_by_pieces_icode (machine_mode mode, unsigned HOST_WIDE_INT) +{ + return optab_handler (mov_optab, mode); +} + /* Write PATCH_AREA_SIZE NOPs into the asm outfile FILE around a function entry. If RECORD_P is true and the target supports named sections, the location of the NOPs will be recorded in a special object section diff --git a/gcc/targhooks.h b/gcc/targhooks.h index f16b58798c2..c04a47c5897 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -249,6 +249,8 @@ extern bool default_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT, enum by_pieces_operation, bool); extern int default_compare_by_pieces_branch_ratio (machine_mode); +extern insn_code default_store_by_pieces_icode (machine_mode, + unsigned HOST_WIDE_INT); extern void default_print_patchable_function_entry (FILE *, unsigned HOST_WIDE_INT, diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-10.c b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c new file mode 100644 index 00000000000..1ac39df097b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=tigerlake -mno-sse" } */ +/* { dg-final { scan-assembler-not "mov\[lq\]?\[\\t \]*\\$\[0\]," } } */ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 23); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-11.c b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c new file mode 100644 index 00000000000..67080e22fde --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=tigerlake -mno-sse" } */ +/* { dg-final { scan-assembler-not "mov\[lq\]?\[\\t \]*\\$\[0\]," { target ia32 } } } */ +/* { dg-final { scan-assembler-times "movq\[\\t \]*\\$\[0\]," 2 { target { ! ia32 } } } } */ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 17); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-12.c b/gcc/testsuite/gcc.target/i386/memset-strategy-12.c new file mode 100644 index 00000000000..bdd06aa685b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-12.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=tigerlake -mno-sse" } */ +/* { dg-final { scan-assembler-times "movl\[\\t \]*\\$\[0\]," 2 } } */ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 7); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-13.c b/gcc/testsuite/gcc.target/i386/memset-strategy-13.c new file mode 100644 index 00000000000..3ba6218e2c7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-13.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=tigerlake -mno-sse" } */ +/* { dg-final { scan-assembler-times "movl\[\\t \]*%e" 5 { target ia32 } } } */ +/* { dg-final { scan-assembler-times "movq\[\\t \]*%r" 3 { target { ! ia32 } } } } */ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 21); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-14.c b/gcc/testsuite/gcc.target/i386/memset-strategy-14.c new file mode 100644 index 00000000000..26e38023cc5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-14.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=tigerlake -mno-sse" } */ +/* { dg-final { scan-assembler-times "movl\[\\t \]*%e" 5 { target ia32 } } } */ +/* { dg-final { scan-assembler-times "movq\[\\t \]*\\$\[0\]," 2 { target { ! ia32 } } } } */ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 20); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-15.c b/gcc/testsuite/gcc.target/i386/memset-strategy-15.c new file mode 100644 index 00000000000..adb23cdcb5f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-15.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=tigerlake -mno-sse" } */ +/* { dg-final { scan-assembler-times "movl\[\\t \]*\\$\[0\]," 2 { target ia32 } } } */ +/* { dg-final { scan-assembler-times "movq\[\\t \]*\\$\[0\]," 1 { target { ! ia32 } } } } */ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 10); +} diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-16.c b/gcc/testsuite/gcc.target/i386/memset-strategy-16.c new file mode 100644 index 00000000000..afedfac2b25 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/memset-strategy-16.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=tigerlake -mno-sse" } */ +/* { dg-final { scan-assembler-times "movl\[\\t \]*\\%e" 3 { target ia32 } } } */ +/* { dg-final { scan-assembler-times "movq\[\\t \]*\\$\[0\]," 1 { target { ! ia32 } } } } */ + +void +foo (char *dest) +{ + __builtin_memset (dest, 0, 11); +} -- 2.49.0