This patch adds a DFA scheduler modelling the core S5 in the SPARC M8 processors.
gcc/ChangeLog: * config/sparc/m8.md: New file. * config/sparc/sparc.md: Include m8.md. --- gcc/ChangeLog | 5 + gcc/config/sparc/m8.md | 242 ++++++++++++++++++++++++++++++++++++++++++++++ gcc/config/sparc/sparc.md | 1 + 3 files changed, 248 insertions(+) create mode 100644 gcc/config/sparc/m8.md diff --git a/gcc/config/sparc/m8.md b/gcc/config/sparc/m8.md new file mode 100644 index 0000000..f0fe1b2 --- /dev/null +++ b/gcc/config/sparc/m8.md @@ -0,0 +1,242 @@ +;; Scheduling description for the SPARC M8. +;; Copyright (C) 2017 Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. + +;; Things to improve: +;; +;; - Store instructions are implemented by micro-ops, one of which +;; generates the store address and is executed in the store address +;; generation unit in the slot0. We need to model that. +;; +;; - There are two V3 pipes connected to different slots. The current +;; implementation assumes that all the instructions executing in a +;; V3 pipe are issued to the unit in slot3. +;; +;; - Single-issue ALU operations incur an additional cycle of latency to +;; slot 0 and slot 1 instructions. This is not currently reflected +;; in the DFA. + +(define_automaton "m8_0") + +;; The S5 core has two dual-issue queues, PQLS and PQEX. 
Each queue
+;; is divided into two slots: PQLS corresponds to slots 0 and 1, and
+;; PQEX corresponds to slots 2 and 3. The core can issue 4
+;; instructions per-cycle, and up to 4 instructions are committed each
+;; cycle.
+;;
+;;
+;;                   m8_slot0  - Load Unit.
+;;                             - Store address gen. Unit.
+;;
+;;
+;;   === PQLS ==>    m8_slot1  - Store data unit.
+;;                             - Branch unit.
+;;
+;;
+;;   === PQEX ==>    m8_slot2  - Integer Unit (EXU2).
+;;                             - 3-cycles Crypto Unit (SPU2).
+;;
+;;                   m8_slot3  - Integer Unit (EXU3).
+;;                             - 3-cycles Crypto Unit (SPU3).
+;;                             - Floating-point and graphics unit (FPG).
+;;                             - Long-latency Crypto Unit.
+;;                             - Oracle Numbers Unit (ONU).
+
+(define_cpu_unit "m8_slot0,m8_slot1,m8_slot2,m8_slot3" "m8_0")
+
+;; Some instructions stall the pipeline and prevent any other
+;; instruction from being issued in the same cycle. We assume the
+;; same for multi-instruction insns.
+
+(define_reservation "m8_single_issue" "m8_slot0 + m8_slot1 + m8_slot2 + m8_slot3")
+
+(define_insn_reservation "m8_single" 1
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "multi,savew,flushw,trap,bmask"))
+  "m8_single_issue")
+
+;; Most of the instructions executing in the integer units have a
+;; latency of 1. bmask is handled by m8_single above.
+
+(define_insn_reservation "m8_integer" 1
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "ialu,ialuX,shift,cmove,compare"))
+  "(m8_slot2 | m8_slot3)")
+
+;; Flushing the instruction memory takes 27 cycles.
+
+
+(define_insn_reservation "m8_iflush" 27
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "iflush"))
+  "(m8_slot2 | m8_slot3), nothing*26")
+
+;; The integer multiplication instructions have a latency of 10 cycles
+;; and execute in integer units.
+;;
+;; Likewise for array*, edge* and pdistn instructions.
+;;
+;; However, the latency is only 9 cycles if the consumer of the
+;; operation is also capable of 9 cycles latency. We model this with
+;; a bypass.
+
+(define_insn_reservation "m8_imul" 10
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "imul,array,edge,edgen,pdistn"))
+  "(m8_slot2 | m8_slot3), nothing*9")
+
+(define_bypass 9 "m8_imul" "m8_imul")
+
+;; The integer division instructions `sdiv' and `udivx' have a latency
+;; of 30 cycles and execute in integer units.
+
+(define_insn_reservation "m8_idiv" 30
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "idiv"))
+  "(m8_slot2 | m8_slot3), nothing*29")
+
+;; Both integer and floating-point load instructions have a latency of
+;; only 3 cycles, and execute in the slot0.
+;;
+;; Misaligned load instructions feature a latency of 11 cycles.
+;;
+;; The prefetch instruction also executes in the load unit, but its
+;; latency is only 1 cycle.
+
+(define_insn_reservation "m8_load" 3
+  (and (eq_attr "cpu" "m8")
+       (ior (eq_attr "type" "fpload,sload")
+            (and (eq_attr "type" "load")
+                 (eq_attr "subtype" "regular"))))
+  "m8_slot0, nothing*2")
+
+;; (define_insn_reservation "m8_load_misalign" 11
+;;  (and (eq_attr "cpu" "m8")
+;;       (eq_attr "type" "load_mis,fpload_mis"))
+;;  "m8_slot0, nothing*10")
+
+(define_insn_reservation "m8_prefetch" 1
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "load")
+       (eq_attr "subtype" "prefetch"))
+  "m8_slot0")
+
+;; Both integer and floating-point store instructions have a latency
+;; of 1 cycle, and execute in the store data unit in slot1.
+;;
+;; However, misaligned store instructions feature a latency of 3
+;; cycles.
+
+(define_insn_reservation "m8_store" 1
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "store,fpstore"))
+  "m8_slot1")
+
+;; (define_insn_reservation "m8_store_misalign" 3
+;;  (and (eq_attr "cpu" "m8")
+;;       (eq_attr "type" "store_mis,fpstore_mis"))
+;;  "m8_slot1, nothing*2")
+
+;; Control-transfer instructions execute in the Branch Unit in the
+;; slot1.
+
+(define_insn_reservation "m8_cti" 1
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "cbcond,uncond_cbcond,branch,call,sibcall,call_no_delay_slot,uncond_branch,return"))
+  "m8_slot1")
+
+;; Many instructions executing in the Floating-point and Graphics Unit
+;; (FGU) serving slot3 feature a default latency of 9 cycles.
+
+(define_insn_reservation "m8_fp" 9
+  (and (eq_attr "cpu" "m8")
+       (ior (eq_attr "type" "fpmove,fpcmove,fpcrmove,fp,fpcmp,fpmul,fgm_pack,fgm_mul,pdist")
+            (and (eq_attr "type" "fga")
+                 (eq_attr "subtype" "fpu"))))
+  "m8_slot3, nothing*8")
+
+;; Floating-point division and floating-point square-root instructions
+;; have high latencies. They execute in the FGU.
+
+(define_insn_reservation "m8_fpdivs" 26
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "fpdivs"))
+  "m8_slot3, nothing*25")
+
+(define_insn_reservation "m8_fpsqrts" 33
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "fpsqrts"))
+  "m8_slot3, nothing*32")
+
+(define_insn_reservation "m8_fpdivd" 30
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "fpdivd"))
+  "m8_slot3, nothing*29")
+
+(define_insn_reservation "m8_fpsqrtd" 41
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "fpsqrtd"))
+  "m8_slot3, nothing*40")
+
+;; SIMD VIS instructions executing in the Floating-point and graphics
+;; unit (FPG) in slot3 usually have a latency of 5 cycles.
+;;
+;; However, the latency for many instructions is only 3 cycles if the
+;; consumer can also be executed in 3 cycles. We model this with a
+;; bypass. In these cases the instructions are executed in one of the
+;; two 3-cycle crypto units (SPU, also known as "v3-pipes") in slots 2
+;; and 3.
+
+(define_insn_reservation "m8_vis" 5
+  (and (eq_attr "cpu" "m8")
+       (ior (eq_attr "type" "viscmp,lzd")
+            (and (eq_attr "type" "fga")
+                 (eq_attr "subtype" "maxmin,cmask,other"))
+            (and (eq_attr "type" "vismv")
+                 (eq_attr "subtype" "single,movstouw"))
+            (and (eq_attr "type" "visl")
+                 (eq_attr "subtype" "single"))))
+  "m8_slot3, nothing*4")
+
+(define_bypass 3 "m8_vis" "m8_vis")
+
+(define_insn_reservation "m8_gsr" 5
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "gsr")
+       (eq_attr "subtype" "alignaddr"))
+  "m8_slot3, nothing*4")
+
+;; A few VIS instructions have a latency of 1.
+
+(define_insn_reservation "m8_vis_1cycle" 1
+  (and (eq_attr "cpu" "m8")
+       (ior (and (eq_attr "type" "vismv")
+                 (eq_attr "subtype" "double,movxtod,movdtox"))
+            (and (eq_attr "type" "visl")
+                 (eq_attr "subtype" "double"))
+            (and (eq_attr "type" "fga")
+                 (eq_attr "subtype" "addsub64"))))
+  "m8_slot3")
+
+;; Reading and writing to the gsr register takes more than 70 cycles.
+
+(define_insn_reservation "m8_gsr_reg" 70
+  (and (eq_attr "cpu" "m8")
+       (eq_attr "type" "gsr")
+       (eq_attr "subtype" "reg"))
+  "m8_slot3, nothing*69")
diff --git a/gcc/config/sparc/sparc.md b/gcc/config/sparc/sparc.md
index 407544b..cac1bd9 100644
--- a/gcc/config/sparc/sparc.md
+++ b/gcc/config/sparc/sparc.md
@@ -613,6 +613,7 @@
 (include "niagara2.md")
 (include "niagara4.md")
 (include "niagara7.md")
+(include "m8.md")
 
 
 ;; Operand and operator predicates and constraints
-- 
2.3.4