Add Power10 scheduling description. This patch adds the Power10 scheduling description. Since power10.md was pretty much a complete rewrite (the existing version of power10.md is mostly just a copy of power9.md), I diffed power10.md against /dev/null so that the full contents of the file are shown rather than a diff. This should make it easier to read, but for that reason this patch will not apply to current trunk. Bootstrapped and regression-tested on powerpc64le (Power8/Power10) with no new regressions. Ok for trunk?
-Pat 2020-11-13 Pat Haugen <pthau...@linux.ibm.com> gcc/ * config/rs6000/rs6000.c (power10_cost): New. (rs6000_option_override_internal): Set Power10 costs. (rs6000_issue_rate): Set Power10 issue rate. * config/rs6000/power10.md: Rewrite for Power10.
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 4d528a39a37..85bb42d6dce 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -1080,6 +1080,26 @@ struct processor_costs power9_cost = { COSTS_N_INSNS (3), /* SF->DF convert */ }; +/* Instruction costs on POWER10 processors. */ +static const +struct processor_costs power10_cost = { + COSTS_N_INSNS (1), /* mulsi */ + COSTS_N_INSNS (1), /* mulsi_const */ + COSTS_N_INSNS (1), /* mulsi_const9 */ + COSTS_N_INSNS (1), /* muldi */ + COSTS_N_INSNS (4), /* divsi */ + COSTS_N_INSNS (4), /* divdi */ + COSTS_N_INSNS (2), /* fp */ + COSTS_N_INSNS (2), /* dmul */ + COSTS_N_INSNS (7), /* sdiv */ + COSTS_N_INSNS (9), /* ddiv */ + 128, /* cache line size */ + 32, /* l1 cache */ + 512, /* l2 cache */ + 16, /* prefetch streams */ + COSTS_N_INSNS (2), /* SF->DF convert */ +}; + /* Instruction costs on POWER A2 processors. */ static const struct processor_costs ppca2_cost = { @@ -4734,10 +4754,13 @@ rs6000_option_override_internal (bool global_init_p) break; case PROCESSOR_POWER9: - case PROCESSOR_POWER10: rs6000_cost = &power9_cost; break; + case PROCESSOR_POWER10: + rs6000_cost = &power10_cost; + break; + case PROCESSOR_PPCA2: rs6000_cost = &ppca2_cost; break; @@ -18001,8 +18024,9 @@ rs6000_issue_rate (void) case PROCESSOR_POWER8: return 7; case PROCESSOR_POWER9: - case PROCESSOR_POWER10: return 6; + case PROCESSOR_POWER10: + return 8; default: return 1; } diff --git a/gcc/config/rs6000/power10.md b/gcc/config/rs6000/power10.md new file mode 100644 index 00000000000..f9ca4cbf10e --- /dev/null +++ b/gcc/config/rs6000/power10.md @@ -0,0 +1,553 @@ +;; Scheduling description for the IBM POWER10 processor. +;; Copyright (C) 2020-2020 Free Software Foundation, Inc. +;; +;; Contributed by Pat Haugen (pthau...@us.ibm.com). + +;; This file is part of GCC. 
+;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published +;; by the Free Software Foundation; either version 3, or (at your +;; option) any later version. +;; +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. + +; For Power10 we model (and try to pack) the in-order decode/dispatch groups +; which consist of 8 instructions max. We do not try to model the details of +; the out-of-order issue queues and how insns flow to the various execution +; units except for the simple representation of the issue limitation of at +; most 4 insns to the execution units/2 insns to the load units/2 insns to +; the store units. +(define_automaton "power10dsp,power10issue,power10div") + +; Decode/dispatch slots +(define_cpu_unit "du0_power10,du1_power10,du2_power10,du3_power10, + du4_power10,du5_power10,du6_power10,du7_power10" "power10dsp") + +; Four execution units +(define_cpu_unit "exu0_power10,exu1_power10,exu2_power10,exu3_power10" + "power10issue") +; Two load units and two store units +(define_cpu_unit "lu0_power10,lu1_power10" "power10issue") +(define_cpu_unit "stu0_power10,stu1_power10" "power10issue") +; Create false units for use by non-pipelined div/sqrt +(define_cpu_unit "fx_div0_power10,fx_div1_power10" "power10div") +(define_cpu_unit "fp_div0_power10,fp_div1_power10,fp_div2_power10, + fp_div3_power10" "power10div") + + +; Dispatch slots are allocated in order conforming to program order. 
+(absence_set "du0_power10" "du1_power10,du2_power10,du3_power10,du4_power10,\ + du5_power10,du6_power10,du7_power10") +(absence_set "du1_power10" "du2_power10,du3_power10,du4_power10,du5_power10,\ + du6_power10,du7_power10") +(absence_set "du2_power10" "du3_power10,du4_power10,du5_power10,du6_power10,\ + du7_power10") +(absence_set "du3_power10" "du4_power10,du5_power10,du6_power10,du7_power10") +(absence_set "du4_power10" "du5_power10,du6_power10,du7_power10") +(absence_set "du5_power10" "du6_power10,du7_power10") +(absence_set "du6_power10" "du7_power10") + + +; Dispatch port reservations +; +; Power10 can dispatch a maximum of 8 iops per cycle. With a maximum of +; 4 VSU/2 Load/2 Store per cycle. + +; Any dispatch slot +(define_reservation "DU_any_power10" + "du0_power10|du1_power10|du2_power10|du3_power10| + du4_power10|du5_power10|du6_power10|du7_power10") + +; Even slot, actually takes even/odd slots +(define_reservation "DU_even_power10" + "du0_power10+du1_power10|du2_power10+du3_power10| + du4_power10+du5_power10|du6_power10+du7_power10") + +; 4-way cracked (consumes whole decode/dispatch cycle) +(define_reservation "DU_all_power10" + "du0_power10+du1_power10+du2_power10+du3_power10+ + du4_power10+du5_power10+du6_power10+du7_power10") + + +; Execution unit reservations +(define_reservation "LU_power10" + "lu0_power10|lu1_power10") + +(define_reservation "STU_power10" + "stu0_power10|stu1_power10") + +; Certain simple fixed-point insns can execute in the Store-agen pipe +(define_reservation "SXU_power10" + "stu0_power10|stu1_power10") + +(define_reservation "EXU_power10" + "exu0_power10|exu1_power10|exu2_power10|exu3_power10") + +(define_reservation "EXU_super_power10" + "exu0_power10+exu1_power10|exu2_power10+exu3_power10") + +; Define the reservations to be used by div/sqrt which allows other insns +; to be issued to the VSU, but blocks other div/sqrt for a number of cycles. 
+(define_reservation "FX_DIV_power10" + "fx_div0_power10*8|fx_div1_power10*8") +(define_reservation "FP_DIVS_power10" + "fp_div0_power10*5|fp_div1_power10*5|fp_div2_power10*5| + fp_div3_power10*5") +(define_reservation "FP_DIV_power10" + "fp_div0_power10*7|fp_div1_power10*7|fp_div2_power10*7| + fp_div3_power10*7") + + +; Load Unit +(define_insn_reservation "power10-load" 4 + (and (eq_attr "type" "load") + (eq_attr "update" "no") + (eq_attr "size" "!128") + (eq_attr "prefixed" "no") + (eq_attr "cpu" "power10")) + "DU_any_power10,LU_power10") + +(define_insn_reservation "power10-prefixed-load" 4 + (and (eq_attr "type" "load") + (eq_attr "update" "no") + (eq_attr "size" "!128") + (eq_attr "prefixed" "!no") + (eq_attr "cpu" "power10")) + "DU_even_power10,LU_power10") + +(define_insn_reservation "power10-load-update" 4 + (and (eq_attr "type" "load") + (eq_attr "update" "yes") + (eq_attr "cpu" "power10")) + "DU_even_power10,LU_power10+SXU_power10") + +(define_insn_reservation "power10-fpload-double" 4 + (and (eq_attr "type" "fpload") + (eq_attr "update" "no") + (eq_attr "size" "64") + (eq_attr "prefixed" "no") + (eq_attr "cpu" "power10")) + "DU_any_power10,LU_power10") + +(define_insn_reservation "power10-prefixed-fpload-double" 4 + (and (eq_attr "type" "fpload") + (eq_attr "update" "no") + (eq_attr "size" "64") + (eq_attr "prefixed" "!no") + (eq_attr "cpu" "power10")) + "DU_even_power10,LU_power10") + +(define_insn_reservation "power10-fpload-update-double" 4 + (and (eq_attr "type" "fpload") + (eq_attr "update" "yes") + (eq_attr "size" "64") + (eq_attr "cpu" "power10")) + "DU_even_power10,LU_power10+SXU_power10") + +; SFmode loads are cracked and have additional 3 cycles over DFmode +; Prefixed forms behave the same +(define_insn_reservation "power10-fpload-single" 7 + (and (eq_attr "type" "fpload") + (eq_attr "update" "no") + (eq_attr "size" "32") + (eq_attr "cpu" "power10")) + "DU_even_power10,LU_power10") + +(define_insn_reservation "power10-fpload-update-single" 7 + 
(and (eq_attr "type" "fpload") + (eq_attr "update" "yes") + (eq_attr "size" "32") + (eq_attr "cpu" "power10")) + "DU_even_power10,LU_power10+SXU_power10") + +(define_insn_reservation "power10-vecload" 4 + (and (eq_attr "type" "vecload") + (eq_attr "size" "!256") + (eq_attr "cpu" "power10")) + "DU_any_power10,LU_power10") + +; lxvp +(define_insn_reservation "power10-vecload-pair" 4 + (and (eq_attr "type" "vecload") + (eq_attr "size" "256") + (eq_attr "cpu" "power10")) + "DU_even_power10,LU_power10+SXU_power10") + +; Store Unit +(define_insn_reservation "power10-store" 0 + (and (eq_attr "type" "store,fpstore,vecstore") + (eq_attr "update" "no") + (eq_attr "prefixed" "no") + (eq_attr "size" "!128") + (eq_attr "size" "!256") + (eq_attr "cpu" "power10")) + "DU_any_power10,STU_power10") + +(define_insn_reservation "power10-prefixed-store" 0 + (and (eq_attr "type" "store,fpstore,vecstore") + (eq_attr "prefixed" "!no") + (eq_attr "size" "!128") + (eq_attr "size" "!256") + (eq_attr "cpu" "power10")) + "DU_even_power10,STU_power10") + +; Update forms have 2 cycle latency for updated addr reg +(define_insn_reservation "power10-store-update" 2 + (and (eq_attr "type" "store,fpstore") + (eq_attr "update" "yes") + (eq_attr "cpu" "power10")) + "DU_any_power10,STU_power10") + +; stxvp +(define_insn_reservation "power10-vecstore-pair" 0 + (and (eq_attr "type" "vecstore") + (eq_attr "size" "256") + (eq_attr "cpu" "power10")) + "DU_even_power10,stu0_power10+stu1_power10") + +(define_insn_reservation "power10-larx" 4 + (and (eq_attr "type" "load_l") + (eq_attr "size" "!128") + (eq_attr "cpu" "power10")) + "DU_any_power10,LU_power10") + +; All load quad forms +(define_insn_reservation "power10-lq" 4 + (and (eq_attr "type" "load,load_l") + (eq_attr "size" "128") + (eq_attr "cpu" "power10")) + "DU_even_power10,LU_power10+SXU_power10") + +(define_insn_reservation "power10-stcx" 0 + (and (eq_attr "type" "store_c") + (eq_attr "size" "!128") + (eq_attr "cpu" "power10")) + 
"DU_any_power10,STU_power10") + +; All store quad forms +(define_insn_reservation "power10-stq" 0 + (and (eq_attr "type" "store,store_c") + (eq_attr "size" "128") + (eq_attr "cpu" "power10")) + "DU_even_power10,stu0_power10+stu1_power10") + +(define_insn_reservation "power10-sync" 1 + (and (eq_attr "type" "sync,isync") + (eq_attr "cpu" "power10")) + "DU_even_power10,STU_power10") + + +; VSU Execution Unit + +; Fixed point ops + +; Most ALU insns are simple 2 cycle, including record form +(define_insn_reservation "power10-alu" 2 + (and (eq_attr "type" "add,exts,integer,logical,isel") + (eq_attr "prefixed" "no") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") +; 4 cycle CR latency +(define_bypass 4 "power10-alu" + "power10-crlogical,power10-mfcr,power10-mfcrf") + +; paddi +(define_insn_reservation "power10-paddi" 2 + (and (eq_attr "type" "add") + (eq_attr "prefixed" "!no") + (eq_attr "cpu" "power10")) + "DU_even_power10,EXU_power10") + +; Rotate/shift (non-record form) +(define_insn_reservation "power10-rot" 2 + (and (eq_attr "type" "insert,shift") + (eq_attr "dot" "no") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +; Record form rotate/shift +(define_insn_reservation "power10-rot-compare" 3 + (and (eq_attr "type" "insert,shift") + (eq_attr "dot" "yes") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") +; 5 cycle CR latency +(define_bypass 5 "power10-rot-compare" + "power10-crlogical,power10-mfcr,power10-mfcrf") + +(define_insn_reservation "power10-alu2" 3 + (and (eq_attr "type" "cntlz,popcnt,trap") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") +; 5 cycle CR latency +(define_bypass 5 "power10-alu2" + "power10-crlogical,power10-mfcr,power10-mfcrf") + +(define_insn_reservation "power10-cmp" 2 + (and (eq_attr "type" "cmp") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +; Treat 'two' and 'three' types as 2 or 3 way cracked +(define_insn_reservation "power10-two" 4 + (and (eq_attr "type" "two") + 
(eq_attr "cpu" "power10")) + "DU_even_power10,EXU_power10") + +(define_insn_reservation "power10-three" 6 + (and (eq_attr "type" "three") + (eq_attr "cpu" "power10")) + "DU_all_power10,EXU_power10") + +(define_insn_reservation "power10-mul" 5 + (and (eq_attr "type" "mul") + (eq_attr "dot" "no") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") +; 4 cycle MUL->MUL latency +(define_bypass 4 "power10-mul" + "power10-mul,power10-mul-compare") + +(define_insn_reservation "power10-mul-compare" 5 + (and (eq_attr "type" "mul") + (eq_attr "dot" "yes") + (eq_attr "cpu" "power10")) + "DU_even_power10,EXU_power10") +; 4 cycle MUL->MUL latency +(define_bypass 4 "power10-mul-compare" + "power10-mul,power10-mul-compare") +; 7 cycle CR latency +(define_bypass 7 "power10-mul-compare" + "power10-crlogical,power10-mfcr,power10-mfcrf") + +(define_insn_reservation "power10-div" 12 + (and (eq_attr "type" "div") + (eq_attr "dot" "no") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10,FX_DIV_power10") + +(define_insn_reservation "power10-div-compare" 12 + (and (eq_attr "type" "div") + (eq_attr "dot" "yes") + (eq_attr "cpu" "power10")) + "DU_even_power10,EXU_power10,FX_DIV_power10") +; 14 cycle CR latency +(define_bypass 14 "power10-div-compare" + "power10-crlogical,power10-mfcr,power10-mfcrf") + +(define_insn_reservation "power10-crlogical" 2 + (and (eq_attr "type" "cr_logical") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-mfcrf" 2 + (and (eq_attr "type" "mfcrf") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-mfcr" 3 + (and (eq_attr "type" "mfcr") + (eq_attr "cpu" "power10")) + "DU_even_power10,EXU_power10") + +; Should differentiate between 1 cr field and > 1 since target of > 1 cr +; is cracked +(define_insn_reservation "power10-mtcr" 3 + (and (eq_attr "type" "mtcr") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation 
"power10-mtjmpr" 3 + (and (eq_attr "type" "mtjmpr") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-mfjmpr" 2 + (and (eq_attr "type" "mfjmpr") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + + +; Floating point/Vector ops + +(define_insn_reservation "power10-fpsimple" 3 + (and (eq_attr "type" "fpsimple") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-fp" 5 + (and (eq_attr "type" "fp,dmul") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-fpcompare" 3 + (and (eq_attr "type" "fpcompare") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-sdiv" 22 + (and (eq_attr "type" "sdiv") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10,FP_DIVS_power10") + +(define_insn_reservation "power10-ddiv" 27 + (and (eq_attr "type" "ddiv") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10,FP_DIV_power10") + +(define_insn_reservation "power10-sqrt" 26 + (and (eq_attr "type" "ssqrt") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10,FP_DIVS_power10") + +(define_insn_reservation "power10-dsqrt" 36 + (and (eq_attr "type" "dsqrt") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10,FP_DIV_power10") + +(define_insn_reservation "power10-vec-2cyc" 2 + (and (eq_attr "type" "vecmove,veclogical,vecexts,veccmpfx") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-veccmp" 3 + (and (eq_attr "type" "veccmp") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-vecsimple" 2 + (and (eq_attr "type" "vecsimple") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-vecnormal" 5 + (and (eq_attr "type" "vecfloat,vecdouble") + (eq_attr "size" "!128") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + 
+(define_insn_reservation "power10-qp" 12 + (and (eq_attr "type" "vecfloat,vecdouble") + (eq_attr "size" "128") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-vecperm" 3 + (and (eq_attr "type" "vecperm") + (eq_attr "prefixed" "no") + (eq_attr "dot" "no") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-vecperm-compare" 3 + (and (eq_attr "type" "vecperm") + (eq_attr "dot" "yes") + (eq_attr "cpu" "power10")) + "DU_even_power10,EXU_power10") + +(define_insn_reservation "power10-prefixed-vecperm" 3 + (and (eq_attr "type" "vecperm") + (eq_attr "prefixed" "!no") + (eq_attr "cpu" "power10")) + "DU_even_power10,EXU_power10") + +(define_insn_reservation "power10-veccomplex" 6 + (and (eq_attr "type" "veccomplex") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-vecfdiv" 24 + (and (eq_attr "type" "vecfdiv") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10,FP_DIVS_power10") + +(define_insn_reservation "power10-vecdiv" 27 + (and (eq_attr "type" "vecdiv") + (eq_attr "size" "!128") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10,FP_DIV_power10") + +(define_insn_reservation "power10-qpdiv" 56 + (and (eq_attr "type" "vecdiv") + (eq_attr "size" "128") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10,FP_DIV_power10") + +(define_insn_reservation "power10-qpmul" 24 + (and (eq_attr "type" "qmul") + (eq_attr "size" "128") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-mtvsr" 2 + (and (eq_attr "type" "mtvsr") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-mfvsr" 2 + (and (eq_attr "type" "mfvsr") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + + +; Branch +; Branch is 2 cycles, grouped with STU for issue +(define_insn_reservation "power10-branch" 2 + (and (eq_attr "type" "jmpreg,branch") + (eq_attr 
"cpu" "power10")) + "DU_any_power10,STU_power10") + + +; Crypto +(define_insn_reservation "power10-crypto" 4 + (and (eq_attr "type" "crypto") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + + +; HTM +(define_insn_reservation "power10-htm" 2 + (and (eq_attr "type" "htmsimple,htm") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + + +; DFP +; Use the minimum 12 cycle latency for all insns, even though some are more +(define_insn_reservation "power10-dfp" 12 + (and (eq_attr "type" "dfp") + (eq_attr "size" "!128") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_power10") + +(define_insn_reservation "power10-dfpq" 12 + (and (eq_attr "type" "dfp") + (eq_attr "size" "128") + (eq_attr "cpu" "power10")) + "DU_even_power10,EXU_power10") + +; MMA +(define_insn_reservation "power10-mma" 9 + (and (eq_attr "type" "mma") + (eq_attr "prefixed" "no") + (eq_attr "cpu" "power10")) + "DU_any_power10,EXU_super_power10") + +(define_insn_reservation "power10-prefixed-mma" 9 + (and (eq_attr "type" "mma") + (eq_attr "prefixed" "!no") + (eq_attr "cpu" "power10")) + "DU_even_power10,EXU_super_power10") +; 4 cycle MMA->MMA latency +(define_bypass 4 "power10-mma,power10-prefixed-mma" + "power10-mma,power10-prefixed-mma") + +