https://gcc.gnu.org/g:7392e9e480afe3143e72a99f7b5ac99b2f49c284

commit r13-9284-g7392e9e480afe3143e72a99f7b5ac99b2f49c284
Author: Jan Hubicka <j...@suse.cz>
Date:   Tue Sep 3 16:26:16 2024 +0200

    Zen5 tuning part 3: scheduler tweaks
    
    This patch adds support for the new fusion in znver5 documented in the
    optimization manual:
    
       The Zen5 microarchitecture adds support to fuse reg-reg MOV Instructions
       with certain ALU instructions. The following conditions need to be met
       for fusion to happen:
         - The MOV should be reg-reg mov with Opcode 0x89 or 0x8B
         - The MOV is followed by an ALU instruction where the MOV and ALU
           destination register match.
         - The ALU instruction may source only registers or immediate data.
           There cannot be any memory source.
         - The ALU instruction sources either the source or dest of MOV
           instruction.
         - If ALU instruction has 2 reg sources, they should be different.
         - The following ALU instructions can fuse with an older qualified MOV
           instruction:
           ADD ADC AND XOR OP SUB SBB INC DEC NOT SAL / SHL SHR SAR
           (I assume OP is OR)
    
    I also increased issue rate from 4 to 6.  Theoretically znver5 can do more,
    but with our model we can't really use it.
    Increasing issue rate to 8 leads to infinite loop in scheduler.
    
    Finally, I also enabled fuse_alu_and_branch since it is supported by
    znver5 (I think by earlier zens too).
    
    New fusion pattern moves quite a few instructions around in common code:
    @@ -2210,13 +2210,13 @@
            .cfi_offset 3, -32
            leaq    63(%rsi), %rbx
            movq    %rbx, %rbp
    +       shrq    $6, %rbp
    +       salq    $3, %rbp
            subq    $16, %rsp
            .cfi_def_cfa_offset 48
            movq    %rdi, %r12
    -       shrq    $6, %rbp
    -       movq    %rsi, 8(%rsp)
    -       salq    $3, %rbp
            movq    %rbp, %rdi
    +       movq    %rsi, 8(%rsp)
            call    _Znwm
            movq    8(%rsp), %rsi
            movl    $0, 8(%r12)
    @@ -2224,8 +2224,8 @@
            movq    %rax, (%r12)
            movq    %rbp, 32(%r12)
            testq   %rsi, %rsi
    -       movq    %rsi, %rdx
            cmovns  %rsi, %rbx
    +       movq    %rsi, %rdx
            sarq    $63, %rdx
            shrq    $58, %rdx
            sarq    $6, %rbx
    which should help decoder bandwidth and perhaps also cache, though I was not
    able to measure off-noise effect on SPEC.
    
    gcc/ChangeLog:
    
            * config/i386/i386.h (TARGET_FUSE_MOV_AND_ALU): New tune.
            * config/i386/x86-tune-sched.cc (ix86_issue_rate): Update for znver5.
            (ix86_adjust_cost): Add TODO about znver5 memory latency.
            (ix86_fuse_mov_alu_p): New.
            (ix86_macro_fusion_pair_p): Use it.
            * config/i386/x86-tune.def (X86_TUNE_FUSE_ALU_AND_BRANCH): Add
            ZNVER5.
            (X86_TUNE_FUSE_MOV_AND_ALU): New tune.
    
    (cherry picked from commit e2125a600552bc6e0329e3f1224eea14804db8d3)

Diff:
---
 gcc/config/i386/i386.h            |  2 ++
 gcc/config/i386/x86-tune-sched.cc | 59 +++++++++++++++++++++++++++++++++++++++
 gcc/config/i386/x86-tune.def      |  6 +++-
 3 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 08309367c18b..25c6540fb2c9 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -418,6 +418,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
        ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
 #define TARGET_FUSE_ALU_AND_BRANCH \
        ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
+#define TARGET_FUSE_MOV_AND_ALU \
+       ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU]
 #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
 #define TARGET_AVOID_LEA_FOR_ADDR \
        ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR]
diff --git a/gcc/config/i386/x86-tune-sched.cc 
b/gcc/config/i386/x86-tune-sched.cc
index cbaba5f9e3c3..28b9ed84d03b 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -435,6 +435,8 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn 
*dep_insn, int cost,
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost;
 
+         /* TODO: On znver5 complex addressing modes have
+            greater latency.  */
          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 4;
          else
@@ -540,6 +542,60 @@ ix86_macro_fusion_p ()
   return TARGET_FUSE_CMP_AND_BRANCH;
 }
 
+/* Return true if MOV is a reg-reg move and ALU is an arithmetic insn that
+   reads MOV's destination together with at most one other register or
+   immediate, i.e. the pair qualifies for mov+alu fusion.  */
+static bool
+ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
+{
+  /* Validate mov:
+      - It should be reg-reg move with opcode 0x89 or 0x8B.  */
+  rtx set1 = PATTERN (mov);
+  if (GET_CODE (set1) != SET
+      || !GENERAL_REG_P (SET_SRC (set1))
+      || !GENERAL_REG_P (SET_DEST (set1)))
+    return false;
+  rtx reg = SET_DEST (set1);
+  /*  - it should have 0x89 or 0x8B opcode.  */
+  if (!INTEGRAL_MODE_P (GET_MODE (reg))
+      || GET_MODE_SIZE (GET_MODE (reg)) < 2
+      || GET_MODE_SIZE (GET_MODE (reg)) > 8)
+    return false;
+  /* Validate ALU.  */
+  if (GET_CODE (PATTERN (alu)) != PARALLEL)
+    return false;
+  rtx set2 = XVECEXP (PATTERN (alu), 0, 0);
+  if (GET_CODE (set2) != SET)
+    return false;
+  /* Match one of ADD ADC AND XOR OR SUB SBB INC DEC NOT SAL SHL SHR SAR.
+     We may also add an insn attribute to handle the sporadic cases
+     where we output those with different RTX expressions.  */
+  if (GET_CODE (SET_SRC (set2)) != PLUS
+      && GET_CODE (SET_SRC (set2)) != MINUS
+      && GET_CODE (SET_SRC (set2)) != XOR
+      && GET_CODE (SET_SRC (set2)) != AND
+      && GET_CODE (SET_SRC (set2)) != IOR
+      && GET_CODE (SET_SRC (set2)) != NOT
+      && GET_CODE (SET_SRC (set2)) != ASHIFT
+      && GET_CODE (SET_SRC (set2)) != ASHIFTRT
+      && GET_CODE (SET_SRC (set2)) != LSHIFTRT)
+    return false;
+  rtx op0 = XEXP (SET_SRC (set2), 0);
+  rtx op1 = GET_CODE (SET_SRC (set2)) != NOT ? XEXP (SET_SRC (set2), 1) : NULL;
+  /* Make OP0 the operand that is the move destination, if either is.  */
+  if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg)))
+    std::swap (op0, op1);
+  /* Test OP0 here, not OP1: OP1 is NULL for NOT and may be an immediate.  */
+  if (!REG_P (op0) || REGNO (op0) != REGNO (reg))
+    return false;
+  if (op1 && !REG_P (op1) && !x86_64_immediate_operand (op1, VOIDmode))
+    return false;
+  /* Only one of two parameters must be move destination.  */
+  if (op1 && REG_P (op1) && REGNO (op1) == REGNO (reg))
+    return false;
+  return true;
+}
+
 /* Check whether current microarchitecture support macro fusion
    for insn pair "CONDGEN + CONDJMP". Refer to
    "Intel Architectures Optimization Reference Manual". */
@@ -547,6 +603,9 @@ ix86_macro_fusion_p ()
 bool
 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
 {
+  if (TARGET_FUSE_MOV_AND_ALU
+      && ix86_fuse_mov_alu_p (condgen, condjmp))
+    return true;
   rtx src, dest;
   enum rtx_code ccode;
   rtx compare_set = NULL_RTX, test_if, cond;
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 773a4ea4ccf6..d5fcf5ce971d 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -145,8 +145,12 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, 
"fuse_cmp_and_branch_soflags",
    jump instruction when the alu instruction produces the CCFLAG consumed by
    the conditional jump instruction. */
 DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
-                 m_SANDYBRIDGE | m_CORE_AVX2 | m_LUJIAZUI | m_GENERIC)
+                 m_SANDYBRIDGE | m_CORE_AVX2 | m_LUJIAZUI | m_GENERIC | 
m_ZNVER5)
 
+/* X86_TUNE_FUSE_MOV_AND_ALU: Fuse mov and alu in case mov is a reg-reg mov
+   and its destination is used by the alu.  The alu must be one of
+   ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR.  */
+DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu", m_ZNVER5)
 
 /*****************************************************************************/
 /* Function prologue, epilogue and function calling sequences.               */

Reply via email to