[PATCH 2/3] Power10: Add PCREL_OPT store support.

This patch adds support for optimizing power10 stores to an external variable
to eliminate loading the address of the variable, and then doing a subsequent
store using that address.

I have built compilers with and without this set of 3 patches, doing a
bootstrap build and make check.  There were no regressions, and the new tests
passed.  Can I check these patches into the master branch for GCC?  Because
this is new functionality, I do not intend to back port these patches to GCC 10
at this time.

gcc/
2020-08-18  Michael Meissner  <meiss...@linux.ibm.com>

        * config/rs6000/pcrel-opt.c (counters): Add fields to count number
        of PCREL_OPT stores that were processed.
        (do_pcrel_opt_store): New function to do PCREL_OPT stores.
        (do_pcrel_opt_addr): Add support to optimize PCREL_OPT stores.
        (do_pcrel_opt_pass): Print out statistics for PCREL_OPT stores.
        * config/rs6000/pcrel-opt.md (UNSPEC_PCREL_OPT_ST_ADDR): New
        unspec.
        (UNSPEC_PCREL_OPT_ST_RELOC): New unspec.
        (pcrel_opt_st_addr<mode>): New insns for PCREL_OPT stores.
        (pcrel_opt_st<mode>): New insns for QI/HI/SI PCREL_OPT stores.
        (pcrel_opt_stdi): New insn to optimize DI PCREL_OPT stores.
        (pcrel_opt_stsf): New insn to optimize SF PCREL_OPT stores.
        (pcrel_opt_stdf): New insn to optimize DF PCREL_OPT stores.
        (pcrel_opt_st<mode>): New insns to optimize vector PCREL_OPT
        stores.
        * config/rs6000/rs6000.c (rs6000_delegitimize_address): Add
        support to de-legitimize PCREL_OPT stores.
---
 gcc/config/rs6000/pcrel-opt.c  | 259 +++++++++++++++++++++++++++++++++++++++--
 gcc/config/rs6000/pcrel-opt.md | 115 +++++++++++++++++-
 gcc/config/rs6000/rs6000.c     |   3 +-
 3 files changed, 367 insertions(+), 10 deletions(-)

diff --git a/gcc/config/rs6000/pcrel-opt.c b/gcc/config/rs6000/pcrel-opt.c
index 10b4bc4..61dce67 100644
--- a/gcc/config/rs6000/pcrel-opt.c
+++ b/gcc/config/rs6000/pcrel-opt.c
@@ -53,6 +53,43 @@
 
    We only look for a single usage in the basic block where the external
    address is loaded.  Multiple uses or references in another basic block will
+   force us to not use the PCREL_OPT relocation.
+
+   We also optimize stores to the address of an external variable using the
+   PCREL_GOT relocation and a single store that uses that external address.  If
+   that is found we create the PCREL_OPT relocation to possibly convert:
+
+       pld addr_reg,var@pcrel@got(0),1
+
+       <possibly other insns that do not use 'addr_reg' or 'data_reg'>
+
+       stw data_reg,0(addr_reg)
+
+   into:
+
+       pstw data_reg,var@pcrel(0),1
+
+       <possibly other insns that do not use 'addr_reg' or 'data_reg'>
+
+       nop
+
+   If the variable is not defined in the main program or the code using it is
+   not in the main program, the linker puts the address in the .got section and
+   does:
+
+               .section .got
+       .Lvar_got:
+               .dword var
+
+               .section .text
+               pld addr_reg,.Lvar_got@pcrel(0),1
+
+               <possibly other insns that do not use 'addr_reg' or 'data_reg'>
+
+               stw data_reg,0(addr_reg)
+
+   We only look for a single usage in the basic block where the external
+   address is loaded.  Multiple uses or references in another basic block will
    force us to not use the PCREL_OPT relocation.  */
 
 #define IN_TARGET_CODE 1
@@ -82,11 +119,11 @@
 #include "insn-codes.h"
 
 
-// Maximum number of insns to scan between the load address and the load that
-// uses that address.  This can be bumped up if desired.  If the insns are far
-// enough away, the PCREL_OPT optimization probably does not help, since the
-// load of the external address has probably completed by the time we do the
-// load of the variable at that address.
+// Maximum number of insns to scan between the load address and the load or
+// store that uses that address.  This can be bumped up if desired.  If the
+// insns are far enough away, the PCREL_OPT optimization probably does not
+// help, since the load of the external address has probably completed by the
+// time we do the load or store of the variable at that address.
 const int MAX_PCREL_OPT_INSNS  = 10;
 
 /* Next PCREL_OPT label number.  */
@@ -97,6 +134,8 @@ static struct {
   unsigned long extern_addrs;
   unsigned long loads;
   unsigned long load_separation[MAX_PCREL_OPT_INSNS+1];
+  unsigned long stores;
+  unsigned long store_separation[MAX_PCREL_OPT_INSNS+1];
 } counters;
 
 
@@ -306,6 +345,156 @@ do_pcrel_opt_load (rtx_insn *addr_insn,           // insn 
loading address
 }
 
 
+// Optimize a PC-relative load address to be used in a store.
+
+// If the sequence of insns is safe to use the PCREL_OPT optimization (i.e. no
+// additional references to the address register, the address register dies at
+// the store, and the stored register is not used or set in between), convert:
+//
+//     (set (reg:DI addr)
+//          (symbol_ref:DI "ext_symbol"))
+//
+//     ...
+//
+//     (set (mem:<MODE> (reg:DI addr))
+//          (reg:<MODE> value))
+//
+// into:
+//
+//     (parallel [(set (reg:DI addr)
+//                      (unspec:DI [(symbol_ref:DI "ext_symbol")
+//                                  (const_int label_num)]
+//                                 UNSPEC_PCREL_OPT_ST_ADDR))
+//                 (use (reg:<MODE> value))])
+//
+//     ...
+//
+//     (parallel [(set (mem:<MODE> (reg:DI addr))
+//                      (unspec:<MODE> [(reg:<MODE> value)
+//                                      (const_int label_num)]
+//                                     UNSPEC_PCREL_OPT_ST_RELOC))
+//                 (clobber (reg:DI addr))])
+//
+//
+// The UNSPEC_PCREL_OPT_ST_ADDR insn will generate the load address plus
+// a definition of a label (.Lpcrel<n>), while the UNSPEC_PCREL_OPT_ST_RELOC
+// insn will generate the .reloc to tell the linker to tie the load address and
+// store using that address together.
+//
+//     pld b,ext_symbol@got@pcrel(0),1
+// .Lpcrel1:
+//
+//     ...
+//
+//     .reloc .Lpcrel1-8,R_PPC64_PCREL_OPT,.-(.Lpcrel1-8)
+//     stw r,0(b)
+//
+// If ext_symbol is defined in another object file in the main program and we
+// are linking the main program, the linker will convert the above instructions
+// to:
+//
+//     pstw r,ext_symbol@pcrel(0),1
+//
+//     ...
+//
+//     nop
+//
+// Both insns are re-recognized after being rewritten.  If either the new load
+// address insn or the new store insn is not recognized, the original patterns
+// are restored, so a failed attempt leaves both insns as they were (the label
+// number that was allocated for the attempt is simply not used).
+// ADDR_INSN is the insn loading the address; STORE_INSN is the store using it.
+//
+// Return true if the PCREL_OPT store optimization succeeded.
+
+static bool
+do_pcrel_opt_store (rtx_insn *addr_insn,       // insn loading address
+                   rtx_insn *store_insn)       // insn using address
+{
+  rtx addr_set = PATTERN (addr_insn);
+  rtx addr_reg = SET_DEST (addr_set);
+  rtx addr_symbol = SET_SRC (addr_set);
+  rtx store_set = single_set (store_insn);
+  rtx mem = SET_DEST (store_set);
+  rtx reg = SET_SRC (store_set);
+  machine_mode mem_mode = GET_MODE (mem);
+
+  // If the register/mode pair only has an indexed (X-form) non-prefixed store
+  // (the store analogue of LFIWAX for loads), we can't do the optimization.
+  enum non_prefixed_form non_prefixed = reg_to_non_prefixed (reg, mem_mode);
+  if (non_prefixed == NON_PREFIXED_X)
+    return false;
+
+  // The optimization will only work on non-prefixed offsettable stores.
+  rtx addr = XEXP (mem, 0);
+  enum insn_form iform = address_to_insn_form (addr, mem_mode, non_prefixed);
+  if (iform != INSN_FORM_BASE_REG
+      && iform != INSN_FORM_D
+      && iform != INSN_FORM_DS
+      && iform != INSN_FORM_DQ)
+    return false;
+
+  // Allocate a new PC-relative label, and update the load address insn.
+
+  ++pcrel_opt_next_num;
+  rtx label_num = GEN_INT (pcrel_opt_next_num);
+  rtvec v_addr = gen_rtvec (2, addr_symbol, label_num);
+  rtx addr_unspec = gen_rtx_UNSPEC (Pmode, v_addr,
+                                  UNSPEC_PCREL_OPT_ST_ADDR);
+  rtx addr_new_set = gen_rtx_SET (addr_reg, addr_unspec);
+  rtx addr_use = gen_rtx_USE (VOIDmode, reg);
+
+  PATTERN (addr_insn)
+    = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, addr_new_set, addr_use));
+
+  // Revalidate the insn, backing out of the optimization if the insn is not
+  // supported.
+  INSN_CODE (addr_insn) = recog (PATTERN (addr_insn), addr_insn, 0);
+  if (INSN_CODE (addr_insn) < 0)
+    {
+      PATTERN (addr_insn) = addr_set;
+      INSN_CODE (addr_insn) = recog (PATTERN (addr_insn), addr_insn, 0);
+      return false;
+    }
+
+  // Update the store insn.  Add an explicit clobber of the external address
+  // register just in case something runs after this pass.
+  //
+  // (parallel [(set (mem (addr_reg))
+  //                 (unspec:<MODE> [(reg)
+  //                                 (const_int label_num)]
+  //                                UNSPEC_PCREL_OPT_ST_RELOC))
+  //            (clobber (reg:DI addr_reg))])
+
+  rtvec v_store = gen_rtvec (2, reg, label_num);
+  rtx new_store = gen_rtx_UNSPEC (mem_mode, v_store,
+                                 UNSPEC_PCREL_OPT_ST_RELOC);
+
+  rtx old_store_set = PATTERN (store_insn);
+  rtx new_store_set = gen_rtx_SET (mem, new_store);
+  rtx store_clobber = gen_rtx_CLOBBER (VOIDmode, addr_reg);
+
+  PATTERN (store_insn)
+    = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, new_store_set, store_clobber));
+
+  // Revalidate the insn, backing out of the optimization if the insn is not
+  // supported.
+
+  INSN_CODE (store_insn) = recog (PATTERN (store_insn), store_insn, 0);
+  if (INSN_CODE (store_insn) < 0)
+    {
+      PATTERN (addr_insn) = addr_set;
+      INSN_CODE (addr_insn) = recog (PATTERN (addr_insn), addr_insn, 0);
+
+      PATTERN (store_insn) = old_store_set;
+      INSN_CODE (store_insn) = recog (PATTERN (store_insn), store_insn, 0);
+      return false;
+    }
+
+  return true;
+}
+
+
 /* Given an insn, find the next insn in the basic block.  Stop if we find a the
    end of a basic block, such as a label, call or jump, and return NULL.  */
 
@@ -340,8 +529,8 @@ next_active_insn_in_basic_block (rtx_insn *insn)
 }
 
 
-// Validate that a load is actually a single instruction that can be optimized
-// with the PCREL_OPT optimization.
+// Validate that a load or store is actually a single instruction that can be
+// optimized with the PCREL_OPT optimization.
 
 static bool
 is_single_instruction (rtx_insn *insn, rtx reg)
@@ -522,6 +711,36 @@ do_pcrel_opt_addr (rtx_insn *addr_insn)
        }
     }
 
+  // Optimize stores
+  else if (is_store)
+    {
+      // If there were any loads in the insns between loading the external
+      // address and doing the store, turn off the optimization.
+      if (had_load)
+       return;
+
+      rtx reg = SET_SRC (set);
+      rtx mem = SET_DEST (set);
+      if (!is_single_instruction (insn, reg))
+       return;
+
+      if (!MEM_P (mem))
+       return;
+
+      // If the register being loaded or stored was used or set between the
+      // load of the external address and the load or store using the address,
+      // we can't do the optimization.
+      if (reg_used_between_p (reg, addr_insn, insn)
+         || reg_set_between_p (reg, addr_insn, insn))
+       return;
+
+      if (do_pcrel_opt_store (addr_insn, insn))
+       {
+         counters.stores++;
+         counters.store_separation[num_insns-1]++;
+       }
+    }
+
   return;
 }
 
@@ -544,7 +763,7 @@ do_pcrel_opt_pass (function *fun)
   df_set_flags (DF_DEFER_INSN_RESCAN | DF_LR_RUN_DCE);
 
   // Look at each basic block to see if there is a load of an external
-  // variable's external address, and a single load using that external
+  // variable's external address, and a single load/store using that external
   // address.
   FOR_ALL_BB_FN (bb, fun)
     {
@@ -598,6 +817,30 @@ do_pcrel_opt_pass (function *fun)
                             counters.load_separation[i]);
                }
            }
+
+         if (!counters.stores)
+           fprintf (dump_file,
+                    "No PCREL_OPT store optimizations were done\n");
+
+         else
+           {
+             fprintf (dump_file, "# of PCREL_OPT stores = %lu\n",
+                      counters.stores);
+
+             fprintf (dump_file, "# of adjacent PCREL_OPT stores = %lu\n",
+                      counters.store_separation[0]);
+
+             for (int i = 1; i < MAX_PCREL_OPT_INSNS; i++)
+               {
+                 if (counters.store_separation[i])
+                   fprintf (dump_file,
+                            "# of PCREL_OPT stores separated by "
+                            "%d insn%s = %lu\n",
+                            i, (i == 1) ? "" : "s",
+                            counters.store_separation[i]);
+               }
+           }
+
        }
 
       fprintf (dump_file, "\n");
diff --git a/gcc/config/rs6000/pcrel-opt.md b/gcc/config/rs6000/pcrel-opt.md
index 00a3bc4..d98c6ce 100644
--- a/gcc/config/rs6000/pcrel-opt.md
+++ b/gcc/config/rs6000/pcrel-opt.md
@@ -84,7 +84,9 @@
 (define_c_enum "unspec"
   [UNSPEC_PCREL_OPT_LD_ADDR
    UNSPEC_PCREL_OPT_LD_ADDR_SAME_REG
-   UNSPEC_PCREL_OPT_LD_RELOC])
+   UNSPEC_PCREL_OPT_LD_RELOC
+   UNSPEC_PCREL_OPT_ST_ADDR
+   UNSPEC_PCREL_OPT_ST_RELOC])
 
 ;; Modes that are supported for PCREL_OPT
 (define_mode_iterator PO [QI HI SI DI TI SF DF KF
@@ -246,3 +248,114 @@ (define_insn "*pcrel_opt_ld<mode>"
   "%r3lxv %x0,%1"
   [(set_attr "type" "vecload")
    (set_attr "isa" "pcrel_opt")])
+
+
+;; PCREL_OPT optimization for stores.  We need to put the label after the PLD
+;; instruction, because the assembler might insert a NOP before the PLD for
+;; alignment.
+;;
+;; If we are optimizing a single write, normally the code would look like:
+;;
+;;     (set (reg:DI <ptr>)
+;;          (symbol_ref:DI "<extern_addr>"))   # <data> must be live here
+;;
+;;         ...              # insns do not need to be adjacent
+;;
+;;     (set (mem:SI (reg:DI <ptr>))
+;;          (reg:SI <data>))                   # <ptr> dies with this insn
+;;
+;; We optimize this to be:
+;;
+;;     (parallel [(set (reg:DI <ptr>)
+;;                     (unspec:DI [(symbol_ref:DI "<extern_addr>")
+;;                                 (const_int <marker>)]
+;;                                UNSPEC_PCREL_OPT_ST_ADDR))
+;;                (use (reg:<MODE> <data>))])
+;;
+;;         ...              # insns do not need to be adjacent
+;;
+;;     (parallel [(set (mem:<MODE> (reg:DI <ptr>))
+;;                     (unspec:<MODE> [(reg:<MODE> <data>)
+;;                                     (const_int <marker>)]
+;;                                    UNSPEC_PCREL_OPT_ST_RELOC))
+;;                (clobber (reg:DI <ptr>))])
+
+(define_insn "*pcrel_opt_st_addr<mode>"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=b")
+       (unspec:DI [(match_operand:DI 1 "pcrel_external_address")
+                   (match_operand 2 "const_int_operand" "n")]
+               UNSPEC_PCREL_OPT_ST_ADDR))
+   (use (match_operand:PO 3 "gpc_reg_operand" "rwa"))]
+  "TARGET_PCREL_OPT"
+  "ld %0,%a1\n.Lpcrel%2:"
+  [(set_attr "prefixed" "yes")
+   (set_attr "type" "load")
+   (set_attr "isa" "pcrel_opt")
+   (set_attr "loads_extern_addr" "yes")])
+
+;; Alternate form of the stores that include a marker to identify whether we
+;; can do the PCREL_OPT optimization.
+(define_insn "*pcrel_opt_st<mode>"
+  [(set (match_operand:QHSI 0 "d_form_memory" "=o")
+       (unspec:QHSI [(match_operand:QHSI 1 "gpc_reg_operand" "r")
+                     (match_operand 2 "const_int_operand" "n")]
+                    UNSPEC_PCREL_OPT_ST_RELOC))
+   (clobber (match_operand:DI 3 "base_reg_operand" "=b"))]
+  "TARGET_PCREL_OPT"
+  "%r2st<wd> %1,%0"
+  [(set_attr "type" "store")
+   (set_attr "isa" "pcrel_opt")])
+
+(define_insn "*pcrel_opt_stdi"
+  [(set (match_operand:DI 0 "d_form_memory" "=o,o,o")
+       (unspec:DI [(match_operand:DI 1 "gpc_reg_operand" "r,d,v")
+                   (match_operand 2 "const_int_operand" "n,n,n")]
+                  UNSPEC_PCREL_OPT_ST_RELOC))
+   (clobber (match_operand:DI 3 "base_reg_operand" "=b,b,b"))]
+  "TARGET_PCREL_OPT && TARGET_POWERPC64"
+  "@
+   %r2std %1,%0
+   %r2stfd %1,%0
+   %r2stxsd %1,%0"
+  [(set_attr "type" "store,fpstore,fpstore")
+   (set_attr "isa" "pcrel_opt")])
+
+(define_insn "*pcrel_opt_stsf"
+  [(set (match_operand:SF 0 "d_form_memory" "=o,o,o")
+       (unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "d,v,r")
+                   (match_operand 2 "const_int_operand" "n,n,n")]
+                  UNSPEC_PCREL_OPT_ST_RELOC))
+   (clobber (match_operand:DI 3 "base_reg_operand" "=b,b,b"))]
+  "TARGET_PCREL_OPT"
+  "@
+   %r2stfs %1,%0
+   %r2stxssp %1,%0
+   %r2stw %1,%0"
+  [(set_attr "type" "fpstore,fpstore,store")
+   (set_attr "isa" "pcrel_opt")])
+
+(define_insn "*pcrel_opt_stdf"
+  [(set (match_operand:DF 0 "d_form_memory" "=o,o,o")
+       (unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "d,v,r")
+                   (match_operand 2 "const_int_operand" "n,n,n")]
+                  UNSPEC_PCREL_OPT_ST_RELOC))
+   (clobber (match_operand:DI 3 "base_reg_operand" "=b,b,b"))]
+  "TARGET_PCREL_OPT
+   && (TARGET_POWERPC64 || vsx_register_operand (operands[1], DFmode))"
+  "@
+   %r2stfd %1,%0
+   %r2stxsd %1,%0
+   %r2std %1,%0"
+  [(set_attr "type" "fpstore,fpstore,store")
+   (set_attr "isa" "pcrel_opt")])
+
+(define_insn "*pcrel_opt_st<mode>"
+  [(set (match_operand:PO_VECT 0 "d_form_memory" "=o")
+       (unspec:PO_VECT [(match_operand:PO_VECT 1 "gpc_reg_operand" "wa")
+                    (match_operand 2 "const_int_operand" "n")]
+                   UNSPEC_PCREL_OPT_ST_RELOC))
+   (clobber (match_operand:DI 3 "base_reg_operand" "=b"))]
+  "TARGET_PCREL_OPT"
+  "%r2stxv %x1,%0"
+  [(set_attr "type" "vecstore")
+   (set_attr "isa" "pcrel_opt")])
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 6877de5..9ec346c 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -8525,7 +8525,8 @@ rs6000_delegitimize_address (rtx orig_x)
   if (GET_CODE (orig_x) == UNSPEC
       && (XINT (orig_x, 1) == UNSPEC_FUSION_GPR
          || XINT (orig_x, 1) == UNSPEC_PCREL_OPT_LD_ADDR
-         || XINT (orig_x, 1) == UNSPEC_PCREL_OPT_LD_ADDR_SAME_REG))
+         || XINT (orig_x, 1) == UNSPEC_PCREL_OPT_LD_ADDR_SAME_REG
+         || XINT (orig_x, 1) == UNSPEC_PCREL_OPT_ST_ADDR))
     orig_x = XVECEXP (orig_x, 0, 0);
 
   orig_x = delegitimize_mem_from_attrs (orig_x);
-- 
1.8.3.1


-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meiss...@linux.ibm.com, phone: +1 (978) 899-4797

Reply via email to