On Fri, Nov 17, 2023 at 3:26 PM Hongyu Wang <hongyu.w...@intel.com> wrote: > > Intel APX PPX feature has been released in [1]. > > PPX stands for Push-Pop Acceleration. PUSH/PUSH2 and its corresponding POP > can be marked with a 1-bit hint to indicate that the POP reads the > value written by the PUSH from the stack. The processor tracks these marked > instructions internally and fast-forwards register data between > matching PUSH and POP instructions, without going through memory or > through the training loop of the Fast Store Forwarding Predictor (FSFP). > This feature can also be adopted to PUSH2/POP2. > > For GCC, we emit explicit suffix 'p' (paired) to indicate the push/pop > pair are marked with PPX hint. To separate form original push/pop, we > use UNSPEC to restrict the PPX related patterns. So for pushp/popp, the > cfi is manually adjusted for the UNSPEC PPX insns. > > In the first implementation we only emit them under prologue/epilogue > when saving/restoring callee-saved registers to make sure push/pop are > paired. So an extra flag was added to check if PPX insns can be emitted > for those register save/restore interfaces. > > The PPX hint is purely a performance hint. If the 'p' suffix is not > emitted for paired push/pop, the PPX optimization will be disabled, > while program sematic will not be affected at all. > > Bootstrapped/regtest on x86-64-pc-linux-gnu{-m32,}. > > Ok for master? > > [1].https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.ht > > gcc/ChangeLog: > > * config/i386/i386-opts.h (enum apx_features): Add apx_ppx, add > it to apx_all. > * config/i386/i386.cc (ix86_emit_restore_reg_using_pop): Add > ppx_p parameter for function declaration. > (gen_push2): Add ppx_p parameter, emit push2p if ppx_p is true. > (ix86_emit_restore_reg_using_pop2): Likewise for pop2p. > (gen_pushp): New function to emit pushp and adjust cfi. > (ix86_emit_save_regs): Emit pushp/push2p under TARGET_APX_PPX. > (ix86_emit_restore_reg_using_pop): Add ppx_p, emit popp insn > and adjust cfi when ppx_p is ture. > (ix86_emit_restore_reg_using_pop2): Add ppx_p and parse to its > callee. > (ix86_emit_restore_regs_using_pop2): Likewise. > (ix86_expand_epilogue): Parse TARGET_APX_PPX to > ix86_emit_restore_reg_using_pop. > * config/i386/i386.h (TARGET_APX_PPX): New. > * config/i386/i386.md (UNSPEC_APXPUSHP): New unspec. > (UNSPEC_APXPOPP): Likewise. > (UNSPEC_APXPUSH2P): Likewise. > (UNSPEC_APXPOP2P_LOW): Likewise. > (UNSPEC_APXPOP2P_HIGH): Likewise. > (pushp_di): New define_insn. > (popp_di): Likewise. > (push2p_di): Likewise. > (pop2p_di): Likewise. > * config/i386/i386.opt: Add apx_ppx enum. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/apx-interrupt-1.c: Adjust option to restrict them > under certain subfeatures. > * gcc.target/i386/apx-push2pop2-1.c: Likewise. > * gcc.target/i386/apx-push2pop2_force_drap-1.c: Likewise. > * gcc.target/i386/apx-push2pop2_interrupt-1.c: Likewise. > * gcc.target/i386/apx-ppx-1.c: New test. > --- > gcc/config/i386/i386-opts.h | 3 +- > gcc/config/i386/i386.cc | 113 ++++++++++++++---- > gcc/config/i386/i386.h | 1 + > gcc/config/i386/i386.md | 47 +++++++- > gcc/config/i386/i386.opt | 3 + > .../gcc.target/i386/apx-interrupt-1.c | 2 +- > gcc/testsuite/gcc.target/i386/apx-ppx-1.c | 9 ++ > .../gcc.target/i386/apx-push2pop2-1.c | 2 +- > .../i386/apx-push2pop2_force_drap-1.c | 2 +- > .../i386/apx-push2pop2_interrupt-1.c | 2 +- > 10 files changed, 158 insertions(+), 26 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/apx-ppx-1.c > > diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h > index 2ec76a16bce..4d293edb399 100644 > --- a/gcc/config/i386/i386-opts.h > +++ b/gcc/config/i386/i386-opts.h > @@ -139,7 +139,8 @@ enum apx_features { > apx_egpr = 1 << 0, > apx_push2pop2 = 1 << 1, > apx_ndd = 1 << 2, > - apx_all = apx_egpr | apx_push2pop2 | apx_ndd, > + apx_ppx = 1 << 3, > + apx_all = apx_egpr | apx_push2pop2 | apx_ndd | apx_ppx, > }; > > #endif > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index 683ac643bc8..df2fc236c0a 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -105,7 +105,7 @@ along with GCC; see the file COPYING3. If not see > static rtx legitimize_dllimport_symbol (rtx, bool); > static rtx legitimize_pe_coff_extern_decl (rtx, bool); > static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool); > -static void ix86_emit_restore_reg_using_pop (rtx); > +static void ix86_emit_restore_reg_using_pop (rtx, bool = false); > > > #ifndef CHECK_STACK_LIMIT > @@ -6512,7 +6512,7 @@ gen_popfl (void) > > /* Generate a "push2" pattern for input ARG. */ > rtx > -gen_push2 (rtx mem, rtx reg1, rtx reg2) > +gen_push2 (rtx mem, rtx reg1, rtx reg2, bool ppx_p = false) > { > struct machine_function *m = cfun->machine; > const int offset = UNITS_PER_WORD * 2; > @@ -6527,9 +6527,49 @@ gen_push2 (rtx mem, rtx reg1, rtx reg2) > if (REG_P (reg2) && GET_MODE (reg2) != word_mode) > reg2 = gen_rtx_REG (word_mode, REGNO (reg2)); > > - return gen_push2_di (mem, reg1, reg2); > + return ppx_p ? gen_push2p_di (mem, reg1, reg2): > + gen_push2_di (mem, reg1, reg2); > } > > +/* Generate a "pushp" insn for input ARG, and set reg notes. */ > +rtx_insn * > +gen_pushp (rtx arg) > +{ Can we just overload gen_push with an new arg bool ppx just like we did for push2/pop2. > + struct machine_function *m = cfun->machine; > + > + if (m->fs.cfa_reg == stack_pointer_rtx) > + m->fs.cfa_offset += UNITS_PER_WORD; > + m->fs.sp_offset += UNITS_PER_WORD; > + > + if (REG_P (arg) && GET_MODE (arg) != word_mode) > + arg = gen_rtx_REG (word_mode, REGNO (arg)); > + > + rtx pat = gen_rtx_MEM (word_mode, > + gen_rtx_PRE_DEC (Pmode, > + stack_pointer_rtx)); > + rtx_insn *insn = emit_insn (gen_pushp_di (pat, arg)); > + Why do we need .. > + rtx dwarf = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2)); > + > + rtx dwarf_reg = gen_rtx_REG (word_mode, > + REGNO (arg)); > + rtx tmp = gen_rtx_SET (gen_frame_mem (DImode, > + stack_pointer_rtx), > + dwarf_reg); > + RTX_FRAME_RELATED_P (tmp) = 1; > + XVECEXP (dwarf, 0, 1) = tmp; > + rtx sp_tmp = gen_rtx_SET (stack_pointer_rtx, > + plus_constant (Pmode, > + stack_pointer_rtx, > + -UNITS_PER_WORD)); > + RTX_FRAME_RELATED_P (sp_tmp) = 1; > + XVECEXP (dwarf, 0, 0) = sp_tmp; > + add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf); this. I assume pushp should be different from push only by ppx hint, not dwarf. > + > + return insn; > +} > + > + > /* Return >= 0 if there is an unused call-clobbered register available > for the entire function. */ > > @@ -7369,7 +7409,10 @@ ix86_emit_save_regs (void) > for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; regno--) > if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) > { > - insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno))); > + if (TARGET_APX_PPX) > + insn = gen_pushp (gen_rtx_REG (word_mode, regno)); > + else > + insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno))); > RTX_FRAME_RELATED_P (insn) = 1; > } > } > @@ -7399,7 +7442,8 @@ ix86_emit_save_regs (void) > gen_rtx_REG (word_mode, > regno_list[0]), > gen_rtx_REG (word_mode, > - > regno_list[1]))); > + regno_list[1]), > + TARGET_APX_PPX)); > RTX_FRAME_RELATED_P (insn) = 1; > rtx dwarf = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (3)); > > @@ -7431,15 +7475,21 @@ ix86_emit_save_regs (void) > } > else > { > - insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno))); > + if (TARGET_APX_PPX) > + insn = gen_pushp (gen_rtx_REG (word_mode, regno)); > + else > + insn = emit_insn (gen_push (gen_rtx_REG (word_mode, > regno))); > RTX_FRAME_RELATED_P (insn) = 1; > aligned = true; > } > } > if (loaded_regnum == 1) > { > - insn = emit_insn (gen_push (gen_rtx_REG (word_mode, > - regno_list[0]))); > + if (TARGET_APX_PPX) > + insn = gen_pushp (gen_rtx_REG (word_mode, regno_list[0])); > + else > + insn = emit_insn (gen_push (gen_rtx_REG (word_mode, > + regno_list[0]))); > RTX_FRAME_RELATED_P (insn) = 1; > } > } > @@ -9268,15 +9318,30 @@ ix86_expand_prologue (void) > emit_insn (gen_prologue_use (stack_pointer_rtx)); > } > > -/* Emit code to restore REG using a POP insn. */ > +/* Emit code to restore REG using a POP or POPP insn. */ > > static void > -ix86_emit_restore_reg_using_pop (rtx reg) > +ix86_emit_restore_reg_using_pop (rtx reg, bool ppx_p) > { > struct machine_function *m = cfun->machine; > - rtx_insn *insn = emit_insn (gen_pop (reg)); > + rtx_insn *insn; > > - ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset); > + if (ppx_p) > + { > + rtx stack = gen_rtx_MEM (word_mode, > + gen_rtx_POST_INC (Pmode, > + stack_pointer_rtx)); > + insn = emit_insn (gen_popp_di (reg, stack)); Diddo for .. > + RTX_FRAME_RELATED_P (insn) = 1; > + rtx dwarf = NULL_RTX; > + dwarf = alloc_reg_note (REG_CFA_RESTORE, reg, dwarf); > + REG_NOTES (insn) = dwarf; .. this, why it's different from pop regarding cfa? > + } > + else > + { > + insn = emit_insn (gen_pop (reg)); > + ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset); > + } > m->fs.sp_offset -= UNITS_PER_WORD; > > if (m->fs.cfa_reg == crtl->drap_reg > @@ -9328,14 +9393,19 @@ ix86_emit_restore_reg_using_pop (rtx reg) > > /* Emit code to restore REG using a POP2 insn. */ > static void > -ix86_emit_restore_reg_using_pop2 (rtx reg1, rtx reg2) > +ix86_emit_restore_reg_using_pop2 (rtx reg1, rtx reg2, bool ppx_p = false) > { > struct machine_function *m = cfun->machine; > const int offset = UNITS_PER_WORD * 2; > + rtx_insn *insn; > > rtx mem = gen_rtx_MEM (TImode, gen_rtx_POST_INC (Pmode, > stack_pointer_rtx)); > - rtx_insn *insn = emit_insn (gen_pop2_di (reg1, mem, reg2)); > + > + if (ppx_p) > + insn = emit_insn (gen_pop2p_di (reg1, mem, reg2)); > + else > + insn = emit_insn (gen_pop2_di (reg1, mem, reg2)); > > RTX_FRAME_RELATED_P (insn) = 1; > > @@ -9397,13 +9467,13 @@ ix86_emit_restore_reg_using_pop2 (rtx reg1, rtx reg2) > /* Emit code to restore saved registers using POP insns. */ > > static void > -ix86_emit_restore_regs_using_pop (void) > +ix86_emit_restore_regs_using_pop (bool ppx_p) > { > unsigned int regno; > > for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) > if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true)) > - ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno)); > + ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno), > ppx_p); > } > > /* Emit code to restore saved registers using POP2 insns. */ > @@ -9432,20 +9502,23 @@ ix86_emit_restore_regs_using_pop2 (void) > ix86_emit_restore_reg_using_pop2 (gen_rtx_REG (word_mode, > regno_list[0]), > gen_rtx_REG (word_mode, > - > regno_list[1])); > + regno_list[1]), > + TARGET_APX_PPX); > loaded_regnum = 0; > regno_list[0] = regno_list[1] = -1; > } > } > else > { > - ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno)); > + ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno), > + TARGET_APX_PPX); > aligned = true; > } > } > > if (loaded_regnum == 1) > - ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno_list[0])); > + ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno_list[0]), > + TARGET_APX_PPX); > } > > /* Emit code and notes for the LEAVE instruction. If insn is non-null, > @@ -9990,7 +10063,7 @@ ix86_expand_epilogue (int style) > if (TARGET_APX_PUSH2POP2 && m->func_type == TYPE_NORMAL) > ix86_emit_restore_regs_using_pop2 (); > else > - ix86_emit_restore_regs_using_pop (); > + ix86_emit_restore_regs_using_pop (TARGET_APX_PPX); > } > > /* If we used a stack pointer and haven't already got rid of it, > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > index 037074758ea..227217a8712 100644 > --- a/gcc/config/i386/i386.h > +++ b/gcc/config/i386/i386.h > @@ -54,6 +54,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. > If not, see > #define TARGET_APX_EGPR (ix86_apx_features & apx_egpr) > #define TARGET_APX_PUSH2POP2 (ix86_apx_features & apx_push2pop2) > #define TARGET_APX_NDD (ix86_apx_features & apx_ndd) > +#define TARGET_APX_PPX (ix86_apx_features & apx_ppx) > > #include "config/vxworks-dummy.h" > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index 1b5a794b9e5..8cce4c11e35 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -210,10 +210,17 @@ (define_c_enum "unspec" [ > ;; For insn_callee_abi: > UNSPEC_CALLEE_ABI > > - ;; For PUSH2/POP2 support > + ;; For APX PUSH2/POP2 support > UNSPEC_APXPUSH2 > UNSPEC_APXPOP2_LOW > UNSPEC_APXPOP2_HIGH > + > + ;; For APX PPX support > + UNSPEC_APXPUSHP > + UNSPEC_APXPOPP > + UNSPEC_APXPUSH2P > + UNSPEC_APXPOP2P_LOW > + UNSPEC_APXPOP2P_HIGH > ]) > > (define_c_enum "unspecv" [ > @@ -3785,6 +3792,44 @@ (define_insn "pop2_di" > [(set_attr "mode" "TI") > (set_attr "prefix" "evex")]) > > +(define_insn "pushp_di" > + [(set (match_operand:DI 0 "push_operand" "=<") > + (unspec: DI [(match_operand:DI 1 "register_operand" "r")] > + UNSPEC_APXPUSHP))] > + "TARGET_64BIT" > + "pushp\t%1" > + [(set_attr "mode" "DI")]) I think you just need to add an extra (unspec [(const_int 0)] UNSPEC_PPX) to origin push_di pattern, then no need for particular handling of dwarf and cfa stuffs. Ditto for popp and push2p and pop2p. > + > +(define_insn "popp_di" > + [(set (match_operand:DI 0 "nonimmediate_operand" "=r") > + (unspec:DI [(match_operand:DI 1 "pop_operand" ">")] > + UNSPEC_APXPOPP))] > + "TARGET_APX_PPX" > + "popp\t%0" > + [(set_attr "mode" "DI")]) > + > +(define_insn "push2p_di" > + [(set (match_operand:TI 0 "push_operand" "=<") > + (unspec:TI [(match_operand:DI 1 "register_operand" "r") > + (match_operand:DI 2 "register_operand" "r")] > + UNSPEC_APXPUSH2P))] > + "TARGET_APX_PUSH2POP2 && TARGET_APX_PPX" > + "push2p\t%1, %2" > + [(set_attr "mode" "TI") > + (set_attr "type" "multi") > + (set_attr "prefix" "evex")]) > + > +(define_insn "pop2p_di" > + [(parallel [(set (match_operand:DI 0 "register_operand" "=r") > + (unspec:DI [(match_operand:TI 1 "pop_operand" ">")] > + UNSPEC_APXPOP2P_LOW)) > + (set (match_operand:DI 2 "register_operand" "=r") > + (unspec:DI [(const_int 0)] UNSPEC_APXPOP2P_HIGH))])] > + "TARGET_APX_PUSH2POP2 && TARGET_APX_PPX" > + "pop2p\t%0, %2" > + [(set_attr "mode" "TI") > + (set_attr "prefix" "evex")]) > + > (define_insn "*pushsf_rex64" > [(set (match_operand:SF 0 "push_operand" "=X,X,X") > (match_operand:SF 1 "nonmemory_no_elim_operand" "f,rF,v"))] > diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt > index 0c3b8f4b621..ee8898e3abb 100644 > --- a/gcc/config/i386/i386.opt > +++ b/gcc/config/i386/i386.opt > @@ -1333,6 +1333,9 @@ Enum(apx_features) String(push2pop2) > Value(apx_push2pop2) Set(3) > EnumValue > Enum(apx_features) String(ndd) Value(apx_ndd) Set(4) > > +EnumValue > +Enum(apx_features) String(ppx) Value(apx_ppx) Set(5) > + > EnumValue > Enum(apx_features) String(all) Value(apx_all) Set(1) > > diff --git a/gcc/testsuite/gcc.target/i386/apx-interrupt-1.c > b/gcc/testsuite/gcc.target/i386/apx-interrupt-1.c > index 5f732d3e316..ffcb8fce71c 100644 > --- a/gcc/testsuite/gcc.target/i386/apx-interrupt-1.c > +++ b/gcc/testsuite/gcc.target/i386/apx-interrupt-1.c > @@ -1,5 +1,5 @@ > /* { dg-do compile { target { ! ia32 } } } */ > -/* { dg-options "-mapxf -m64 -O2 -mgeneral-regs-only -mno-cld -mno-push-args > -maccumulate-outgoing-args" } */ > +/* { dg-options "-mapx-features=egpr -m64 -O2 -mgeneral-regs-only -mno-cld > -mno-push-args -maccumulate-outgoing-args" } */ > /* { dg-skip-if "does not emit .cfi_xxx" "*-*-darwin*" } */ > > extern void foo (void *) __attribute__ ((interrupt)); > diff --git a/gcc/testsuite/gcc.target/i386/apx-ppx-1.c > b/gcc/testsuite/gcc.target/i386/apx-ppx-1.c > new file mode 100644 > index 00000000000..e9a595381e3 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/apx-ppx-1.c > @@ -0,0 +1,9 @@ > +/* { dg-do compile { target { ! ia32 } } } */ > +/* { dg-options "-O2 -mapx-features=egpr,push2pop2,ppx" } */ > + > +#include "apx-push2pop2-1.c" > + > +/* { dg-final { scan-assembler "pushp" } } */ > +/* { dg-final { scan-assembler "popp" } } */ > +/* { dg-final { scan-assembler "push2p" } } */ > +/* { dg-final { scan-assembler "pop2p" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c > b/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c > index 089941d3726..c53112758a5 100644 > --- a/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c > +++ b/gcc/testsuite/gcc.target/i386/apx-push2pop2-1.c > @@ -1,5 +1,5 @@ > /* { dg-do compile { target { ! ia32 } } } */ > -/* { dg-options "-O2 -mapxf" } */ > +/* { dg-options "-O2 -mapx-features=push2pop2" } */ > /* { dg-skip-if "does not emit .cfi_xxx" "*-*-darwin*" } */ > > extern int bar (int); > diff --git a/gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c > b/gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c > index 656ca91391a..022113bb1e2 100644 > --- a/gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c > +++ b/gcc/testsuite/gcc.target/i386/apx-push2pop2_force_drap-1.c > @@ -1,5 +1,5 @@ > /* { dg-do compile { target { ! ia32 } } } */ > -/* { dg-options "-O2 -mapxf -mforce-drap" } */ > +/* { dg-options "-O2 -mapx-features=push2pop2 -mforce-drap" } */ > /* { dg-skip-if "does not emit .cfi_xxx" "*-*-darwin*" } */ > > #include "apx-push2pop2-1.c" > diff --git a/gcc/testsuite/gcc.target/i386/apx-push2pop2_interrupt-1.c > b/gcc/testsuite/gcc.target/i386/apx-push2pop2_interrupt-1.c > index 747f7aaf191..a5b46893208 100644 > --- a/gcc/testsuite/gcc.target/i386/apx-push2pop2_interrupt-1.c > +++ b/gcc/testsuite/gcc.target/i386/apx-push2pop2_interrupt-1.c > @@ -1,5 +1,5 @@ > /* { dg-do compile { target { ! ia32 } } } */ > -/* { dg-options "-O2 -mapxf -mgeneral-regs-only -mno-cld -mno-push-args > -maccumulate-outgoing-args" } */ > +/* { dg-options "-O2 -mapx-features=egpr,push2pop2 -mgeneral-regs-only > -mno-cld -mno-push-args -maccumulate-outgoing-args" } */ > > extern void foo (void *) __attribute__ ((interrupt)); > > -- > 2.31.1 >
-- BR, Hongtao