[PATCH 7/7] sparc: M8 DFA scheduler

Jose E. Marchesi Thu, 29 Jun 2017 04:45:45 -0700

This patch adds a DFA scheduler modelling the core S5 in the SPARC M8
processors.


gcc/ChangeLog:

        * config/sparc/m8.md: New file.
        * config/sparc/sparc.md: Include m8.md.
---
 gcc/ChangeLog             |   5 +
 gcc/config/sparc/m8.md    | 242 ++++++++++++++++++++++++++++++++++++++++++++++
 gcc/config/sparc/sparc.md |   1 +
 3 files changed, 248 insertions(+)
 create mode 100644 gcc/config/sparc/m8.md

diff --git a/gcc/config/sparc/m8.md b/gcc/config/sparc/m8.md
new file mode 100644
index 0000000..f0fe1b2
--- /dev/null
+++ b/gcc/config/sparc/m8.md
@@ -0,0 +1,242 @@
+;; Scheduling description for the SPARC M8.
+;;   Copyright (C) 2017 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; Things to improve:
+;;
+;; - Store instructions are implemented by micro-ops, one of which
+;;   generates the store address and is executed in the store address
+;;   generation unit in the slot0.  We need to model that.
+;;
+;; - There are two V3 pipes connected to different slots.  The current
+;;   implementation assumes that all the instructions executing in a
+;;   V3 pipe are issued to the unit in slot3.
+;;
+;; - Single-issue ALU operations incur an additional cycle of latency to
+;;   slot 0 and slot 1 instructions.  This is not currently reflected
+;;   in the DFA.
+
+(define_automaton "m8_0")
+
+;; The S5 core has two dual-issue queues, PQLS and PQEX.  Each queue
+;; is divided into two slots: PQLS corresponds to slots 0 and 1, and
+;; PQEX corresponds to slots 2 and 3.  The core can issue 4
+;; instructions per-cycle, and up to 4 instructions are committed each
+;; cycle.
+;;
+;;                            
+;;                   m8_slot0  - Load Unit.
+;;                             - Store address gen. Unit.
+;;                                                       
+;;                            
+;;   === PQLS ==>    m8_slot1  - Store data unit.
+;;                             - Branch unit.
+;;                                            
+;;                             
+;;   === PQEX ==>    m8_slot2  - Integer Unit (EXU2).                     
+;;                             - 3-cycles Crypto Unit (SPU2).
+;;                                                     
+;;                   m8_slot3  - Integer Unit (EXU3).
+;;                             - 3-cycles Crypto Unit (SPU3).
+;;                             - Floating-point and graphics unit (FPG).
+;;                             - Long-latency Crypto Unit.
+;;                             - Oracle Numbers Unit (ONU).
+
+(define_cpu_unit "m8_slot0,m8_slot1,m8_slot2,m8_slot3" "m8_0")
+
+;; Some instructions stall the pipeline and prevent any other
+;; instruction from being issued in the same cycle.  We assume the
+;; same for multi-instruction insns.
+
+(define_reservation "m8_single_issue" "m8_slot0 + m8_slot1 + m8_slot2 + m8_slot3")
+
+(define_insn_reservation "m8_single" 1
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "multi,savew,flushw,trap,bmask"))
+  "m8_single_issue")
+
+;; Most of the instructions executing in the integer units have a
+;; latency of 1.  bmask is not listed here: m8_single already claims it.
+
+(define_insn_reservation "m8_integer" 1
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "ialu,ialuX,shift,cmove,compare"))
+  "(m8_slot2 | m8_slot3)")
+
+;; Flushing the instruction memory takes 27 cycles.
+
+
+(define_insn_reservation "m8_iflush" 27
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "iflush"))
+  "(m8_slot2 | m8_slot3), nothing*26")
+
+;; The integer multiplication instructions have a latency of 10 cycles
+;; and execute in the integer units (slot2 or slot3).  The reservation
+;; occupies a slot for one cycle, followed by latency - 1 idle cycles.
+;; Likewise for array*, edge* and pdistn instructions.
+;;
+;; However, the latency is only 9 cycles if the consumer of the
+;; operation is also capable of 9 cycles latency.  We model this with
+;; a bypass.
+
+(define_insn_reservation "m8_imul" 10
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "imul,array,edge,edgen,pdistn"))
+  "(m8_slot2 | m8_slot3), nothing*9")
+
+(define_bypass 9 "m8_imul" "m8_imul")
+
+;; The integer division instructions `sdiv' and `udivx' have a latency
+;; of 30 cycles and execute in integer units.
+
+(define_insn_reservation "m8_idiv" 30
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "idiv"))
+  "(m8_slot2 | m8_slot3), nothing*29")
+
+;; Both integer and floating-point load instructions have a latency of
+;; only 3 cycles, and execute in slot0.
+;;
+;; Misaligned load instructions feature a latency of 11 cycles.
+;;
+;; The prefetch instruction also executes in the load unit, but its
+;; latency is only 1 cycle.
+
+(define_insn_reservation "m8_load" 3
+  (and (eq_attr "cpu" "m8")
+       (ior (eq_attr "type" "fpload,sload")
+            (and (eq_attr "type" "load")
+                 (eq_attr "subtype" "regular"))))
+  "m8_slot0, nothing*2")
+
+;; (define_insn_reservation "m8_load_misalign" 11
+;;  (and (eq_attr "cpu" "m8")
+;;       (eq_attr "type" "load_mis,fpload_mis"))
+;;  "m8_slot0, nothing*10")
+
+(define_insn_reservation "m8_prefetch" 1
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "load")
+       (eq_attr "subtype" "prefetch"))
+  "m8_slot0")
+
+;; Both integer and floating-point store instructions have a latency
+;; of 1 cycle, and execute in the store data unit in slot1.
+;;
+;; However, misaligned store instructions feature a latency of 3
+;; cycles.
+
+(define_insn_reservation "m8_store" 1
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "store,fpstore"))
+  "m8_slot1")
+
+;; (define_insn_reservation "m8_store_misalign" 3
+;;   (and (eq_attr "cpu" "m8")
+;;        (eq_attr "type" "store_mis,fpstore_mis"))
+;;   "m8_slot1, nothing*2")
+
+;; Control-transfer instructions execute in the Branch Unit in
+;; slot1.
+
+(define_insn_reservation "m8_cti" 1
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "cbcond,uncond_cbcond,branch,call,sibcall,call_no_delay_slot,uncond_branch,return"))
+  "m8_slot1")
+
+;; Many instructions executing in the Floating-point and Graphics Unit
+;; (FGU) serving slot3 feature a default latency of 9 cycles.
+
+(define_insn_reservation "m8_fp" 9
+  (and (eq_attr "cpu" "m8")
+       (ior (eq_attr "type" "fpmove,fpcmove,fpcrmove,fp,fpcmp,fpmul,fgm_pack,fgm_mul,pdist")
+            (and (eq_attr "type" "fga")
+                 (eq_attr "subtype" "fpu"))))
+  "m8_slot3, nothing*8")
+
+;; Floating-point division and floating-point square-root instructions
+;; have high latencies.  They execute in the FGU.
+
+(define_insn_reservation "m8_fpdivs" 26
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "fpdivs"))
+  "m8_slot3, nothing*25")
+
+(define_insn_reservation "m8_fpsqrts" 33
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "fpsqrts"))
+  "m8_slot3, nothing*32")
+
+(define_insn_reservation "m8_fpdivd" 30
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "fpdivd"))
+  "m8_slot3, nothing*29")
+
+(define_insn_reservation "m8_fpsqrtd" 41
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "fpsqrtd"))
+  "m8_slot3, nothing*40")
+
+;; SIMD VIS instructions executing in the Floating-point and graphics
+;; unit (FPG) in slot3 usually have a latency of 5 cycles.
+;;
+;; However, the latency for many instructions is only 3 cycles if the
+;; consumer can also be executed in 3 cycles.  We model this with a
+;; bypass.  In these cases the instructions are executed in one of the
+;; two 3-cycle crypto units (SPU, also known as "v3-pipes") in slots 2
+;; and 3.
+
+(define_insn_reservation "m8_vis" 5
+  (and (eq_attr "cpu" "m8")
+       (ior (eq_attr "type" "viscmp,lzd")
+            (and (eq_attr "type" "fga")
+                 (eq_attr "subtype" "maxmin,cmask,other"))
+            (and (eq_attr "type" "vismv")
+                 (eq_attr "subtype" "single,movstouw"))
+            (and (eq_attr "type" "visl")
+                 (eq_attr "subtype" "single"))))
+  "m8_slot3, nothing*4")
+
+(define_bypass 3 "m8_vis" "m8_vis")
+
+(define_insn_reservation "m8_gsr" 5
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "gsr")
+       (eq_attr "subtype" "alignaddr"))
+  "m8_slot3, nothing*4")
+
+;; A few VIS instructions have a latency of 1.
+
+(define_insn_reservation "m8_vis_1cycle" 1
+  (and (eq_attr "cpu" "m8")
+       (ior (and (eq_attr "type" "vismv")
+                 (eq_attr "subtype" "double,movxtod,movdtox"))
+            (and (eq_attr "type" "visl")
+                 (eq_attr "subtype" "double"))
+            (and (eq_attr "type" "fga")
+                 (eq_attr "subtype" "addsub64"))))
+  "m8_slot3")
+
+;; Reading and writing to the gsr register takes more than 70 cycles.
+
+(define_insn_reservation "m8_gsr_reg" 70
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "gsr")
+       (eq_attr "subtype" "reg"))
+  "m8_slot3, nothing*69")
diff --git a/gcc/config/sparc/sparc.md b/gcc/config/sparc/sparc.md
index 407544b..cac1bd9 100644
--- a/gcc/config/sparc/sparc.md
+++ b/gcc/config/sparc/sparc.md
@@ -613,6 +613,7 @@
 (include "niagara2.md")
 (include "niagara4.md")
 (include "niagara7.md")
+(include "m8.md")
 
 
 ;; Operand and operator predicates and constraints
-- 
2.3.4

[PATCH 7/7] sparc: M8 DFA scheduler

Reply via email to