Hi Kai,
On 11/10/18 16:25, Kai Tietz wrote:
Hi,
I reworked the patch to use a tuning flag instead of checking explicitly
for the CPU flavor. I will soon send an update which won't use the
static variable anymore and instead uses the SCHED API.
I would like first to get some comments on current version.
Regards,
Kai
Index: trunk/gcc/config/aarch64/aarch64.c
===================================================================
--- trunk.orig/gcc/config/aarch64/aarch64.c
+++ trunk/gcc/config/aarch64/aarch64.c
@@ -955,7 +955,7 @@ static const struct tune_params qdf24xx_
&generic_branch_cost,
&generic_approx_modes,
4, /* memmov_cost */
- 4, /* issue_rate */
+ 8, /* issue_rate */
(AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
"16", /* function_align. */
@@ -968,7 +968,7 @@ static const struct tune_params qdf24xx_
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
+ AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS | AARCH64_EXTRA_TUNE_SCHED_MICRO_OPS, /* tune_flags. */
Thanks for this, this looks much cleaner and more extensible.
&qdf24xx_prefetch_tune
};
@@ -18037,6 +18037,109 @@ aarch64_run_selftests (void)
#endif /* #if CHECKING_P */
+/* The number of micro ops left over after issuing the last instruction in a
+ cycle. This is subtracted from the next cycle before we start issuing
+ insns. This is initialized to 0 at the start of every basic block. */
+static int leftover_uops = 0;
+
+/* Implement TARGET_SCHED_REORDER. */
+
+static int
+aarch64_sched_reorder (FILE *file, int verbose,
+ rtx_insn **ready ATTRIBUTE_UNUSED,
+ int *n_readyp ATTRIBUTE_UNUSED,
+ int clock)
+{
+ int can_issue_more = aarch64_sched_issue_rate ();
+
+ if ((aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_SCHED_MICRO_OPS) != 0)
+ {
+ /* The start of a basic block. */
+ if (clock == 0)
+ {
+ if (leftover_uops && file && (verbose > 3))
+ fprintf (file, ";;\tLeftover uops ignored at bb start.\n");
+
+ leftover_uops = 0;
+ }
+
+ /* Account for issue slots left over from previous cycle. This value
+ can be larger than the number of issue slots per cycle, so we need
+ to check it here before scheduling any instructions. */
+ else if (leftover_uops)
+ {
+ can_issue_more -= leftover_uops;
+
+ if (file && (verbose > 3))
+ {
+ fprintf (file, ";;\tUse %d issue slots for leftover uops.\n",
+ leftover_uops);
+ fprintf (file, ";;\t%d issue slots left.\n", can_issue_more);
+ }
+
+ leftover_uops = 0;
+
+ if (can_issue_more < 0)
+ {
+ leftover_uops = 0 - can_issue_more;
+ can_issue_more = 0;
+
+ if (file && (verbose > 3))
+ {
+ fprintf (file, ";;skipping issue cycle.\n");
+ fprintf (file, ";;\t%d uops left over.\n", leftover_uops);
+ }
+ }
+ }
+ }
+
+ return can_issue_more;
+}
+
+/* Implement TARGET_SCHED_VARIABLE_ISSUE. */
+
+static int
+aarch64_variable_issue (FILE *file, int verbose,
+ rtx_insn *insn, int more)
+{
+ if (GET_CODE (PATTERN (insn)) != USE
+ && GET_CODE (PATTERN (insn)) != CLOBBER)
+ {
+ if ((aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_SCHED_MICRO_OPS) == 0)
+ more -= 1;
+ else
+ {
+ /* For now only the Falkor target supports scheduling of micro
+ operations, so we don't need to check here. */
+ int issue_slots = get_attr_falkor_variable_issue (insn);
Having this falkor-specific path is still my concern.
I think we'll want to go with my suggestion at
https://gcc.gnu.org/ml/gcc-patches/2018-08/msg00815.html
which is to create a generic attribute, called something like
"number_micro_ops" and have it populated for each core if they decide to use it.
Your patch would provide the falkor numbers initially. Then this part of aarch64.c
wouldn't be falkor-specific and could use something like
"get_attr_number_micro_ops".
That is really the biggest concern I had.
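Concretely, I'd expect this spot to then look something like the
following (just a sketch on my side; the attribute name and default are
illustrative, not final):

  /* Assumes aarch64.md gains something like
       (define_attr "number_micro_ops" "" (const_int 1))
     with per-core overrides (your falkor.md table would provide the
     Falkor values), so no per-core check is needed here.  */
  int issue_slots = get_attr_number_micro_ops (insn);
  more -= issue_slots;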
Thanks,
Kyrill
+ more -= issue_slots;
+
+ if (file && (verbose > 3))
+ {
+ fprintf (file, ";;\tInsn takes %d issue slots.\n", issue_slots);
+ fprintf (file, ";;\t%d issue slots left.\n", more);
+ }
+
+ /* We schedule an instruction first, and then subtract issue slots,
+ which means the result can be negative. We carry the extra over
+ to the next cycle. */
+
+ if (more < 0)
+ {
+ leftover_uops = 0 - more;
+ more = 0;
+
+ if (file && (verbose > 3))
+ fprintf (file, ";;\t%d uops left over.\n", leftover_uops);
+ }
+ }
+ }
+
+ return more;
+}
+
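Just to check that I follow the bookkeeping between the two hooks, here
is a minimal standalone sketch (my own, not part of the patch; the two
helpers mirror the hooks above, the driver and the uop counts are made
up, and the clock == 0 basic-block reset is omitted):

  #include <stdio.h>

  static int leftover_uops = 0;

  /* Mirrors aarch64_sched_reorder: slots available at the start of a
     cycle, after paying for uops carried over from the previous one.  */
  static int
  cycle_begin (int issue_rate)
  {
    int can_issue_more = issue_rate - leftover_uops;
    leftover_uops = 0;
    if (can_issue_more < 0)
      {
        /* Carry-over exceeded a full cycle: skip this one entirely.  */
        leftover_uops = -can_issue_more;
        can_issue_more = 0;
      }
    return can_issue_more;
  }

  /* Mirrors aarch64_variable_issue: charge one insn's uops against the
     remaining slots; anything over-committed is carried forward.  */
  static int
  issue_insn (int more, int insn_uops)
  {
    more -= insn_uops;
    if (more < 0)
      {
        leftover_uops = -more;
        more = 0;
      }
    return more;
  }

  int
  main (void)
  {
    int slots = cycle_begin (8);    /* 8 slots, nothing carried over */
    slots = issue_insn (slots, 6);  /* 2 slots left */
    slots = issue_insn (slots, 8);  /* over-commits by 6 */
    slots = cycle_begin (8);        /* next cycle starts with 8 - 6 slots */
    printf ("%d\n", slots);         /* prints 2 */
    return 0;
  }

So an insn that decodes to more uops than are left in the current cycle
simply eats into the next one, which looks right to me.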
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost
@@ -18265,6 +18368,12 @@ aarch64_libgcc_floating_mode_supported_p
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
+#undef TARGET_SCHED_REORDER
+#define TARGET_SCHED_REORDER aarch64_sched_reorder
+
+#undef TARGET_SCHED_VARIABLE_ISSUE
+#define TARGET_SCHED_VARIABLE_ISSUE aarch64_variable_issue
+
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
aarch64_sched_first_cycle_multipass_dfa_lookahead
Index: trunk/gcc/config/aarch64/falkor.md
===================================================================
--- trunk.orig/gcc/config/aarch64/falkor.md
+++ trunk/gcc/config/aarch64/falkor.md
@@ -685,3 +685,24 @@
(define_bypass 3
"falkor_afp_5_vxvy_mul,falkor_afp_5_vxvy_mla,falkor_afp_5_vxvy_vxvy_mul,falkor_afp_5_vxvy_vxvy_mla,falkor_afp_6_vxvy_mul,falkor_afp_6_vxvy_mla,falkor_afp_6_vxvy_vxvy_mul,falkor_afp_6_vxvy_vxvy_mla,falkor_fpdt_5_vxvy_mul,falkor_fpdt_5_vxvy_mla,falkor_fpdt_6_vxvy_mul,falkor_fpdt_6_vxvy_mla"
"falkor_afp_5_vxvy_mla,falkor_afp_5_vxvy_vxvy_mla,falkor_afp_6_vxvy_mla,falkor_afp_6_vxvy_vxvy_mla,falkor_fpdt_5_vxvy_mla,falkor_fpdt_6_vxvy_mla")
+
+
+(define_attr "falkor_variable_issue" ""
+ (cond [
+;; A64 Instructions
+ (eq_attr "type"
"neon_fp_neg_s_q,neon_fp_neg_d_q,neon_fp_abs_s_q,neon_fp_abs_d_q,neon_fp_minmax_s_q,neon_fp_minmax_d_q,neon_fp_compare_s_q,neon_fp_compare_d_q,neon_fp_round_s_q,neon_fp_round_d_q,neon_fp_abd_s_q,neon_fp_abd_d_q,neon_fp_addsub_s_q,neon_fp_addsub_d_q,neon_fp_reduc_add_s_q,neon_fp_reduc_add_d_q,neon_fp_to_int_s_q,neon_fp_to_int_d_q,neon_int_to_fp_s_q,neon_int_to_fp_d_q,neon_fp_mla_d_q,neon_fp_mla_d_scalar_q,neon_fp_div_s,neon_fp_div_d,neon_fp_sqrt_s,neon_fp_sqrt_d,neon_shift_imm_long,neon_add_q,neon_reduc_add_q,neon_logic_q,neon_neg_q,neon_sub_q,neon_add_halve_q,neon_sub_halve_q,neon_shift_imm_q,neon_shift_reg_q,neon_minmax_q,neon_abs_q,neon_compare_q,neon_compare_zero_q,neon_tst_q,neon_reduc_add_long,neon_shift_acc_q,neon_reduc_add_acc_q,neon_abd_q,neon_abd_long,neon_qadd_q,neon_qsub_q,neon_qabs_q,neon_qneg_q,neon_sat_shift_imm_q,neon_sat_shift_reg_q,neon_mul_b_q,neon_mul_h_q,neon_mul_s_q,neon_mul_h_scalar_q,neon_mul_s_scalar_q,neon_sat_mul_b_q,neon_sat_mul_h_q,neon_sat_mul_s_q,neon_mul_b_long,neon_mul_h_long,neon_mul_s_long,neon_mul_d_long,neon_mul_h_scalar_long,neon_mul_s_scalar_long,neon_sat_mul_b_long,neon_sat_mul_h_long,neon_sat_mul_s_long,neon_sat_mul_h_scalar_q,neon_sat_mul_s_scalar_q,neon_sat_mul_h_scalar_long,neon_sat_mul_s_scalar_long,neon_mla_b_q,neon_mla_h_q,neon_mla_s_q,neon_mla_h_scalar_q,neon_mla_s_scalar_q,neon_mla_b_long,neon_mla_h_long,neon_mla_s_long,neon_mla_h_scalar_long,neon_mla_s_scalar_long,neon_sat_mla_b_long,neon_sat_mla_h_long,neon_sat_mla_s_long,neon_sat_mla_h_scalar_long,neon_sat_mla_s_scalar_long,neon_add_halve_narrow_q,neon_sub_halve_narrow_q,neon_arith_acc,neon_load1_2reg,neon_load2_2reg,neon_load2_all_lanes,neon_load1_2reg_q,neon_load2_2reg_q,neon_load2_all_lanes_q,neon_load3_one_lane,neon_load4_one_lane,neon_ldp,neon_ldp_q,neon_from_gp_q,neon_bsl_q,neon_dup_q,neon_ext_q,neon_move_q,neon_rev_q,neon_tbl1_q,neon_permute_q,neon_cls_q,neon_cnt_q,neon_rbit_q,neon_tbl2,neon_fp_recpe_s_q,neon_fp_recpe_d_q,neon_fp_rsqrte_s_q,neon_fp_rsqrte_d_q,neon_fp_recps_s_q,neon_fp_recps_d_q,neon_fp_rsqrts_d_q,neon_store1_1reg,neon_store1_1reg_q,neon_store1_one_lane,neon_store1_one_lane_q,neon_store1_2reg,neon_store2_2reg,neon_store2_one_lane,neon_store2_one_lane_q,neon_stp,crypto_sha1_xor,crypto_sha256_fast,crypto_sha1_slow,crypto_sha256_slow,crypto_aese,f_stores,f_stored,fdivs,fdivd,sdiv,udiv")
+ (const_int 2)
+ (eq_attr "type"
"neon_fp_cvt_narrow_s_q,neon_fp_cvt_narrow_d_q,neon_load1_3reg,neon_load3_3reg,neon_load3_all_lanes,neon_load1_3reg_q,neon_load3_3reg_q,neon_load3_all_lanes_q,neon_tbl2_q,neon_tbl3")
+ (const_int 3)
+ (eq_attr "type"
"neon_fp_div_s_q,neon_fp_div_d_q,neon_fp_sqrt_s_q,neon_fp_sqrt_d_q,neon_add_widen,neon_sub_widen,neon_arith_acc_q,neon_load1_4reg,neon_load4_4reg,neon_load1_4reg_q,neon_load4_4reg_q,neon_load4_all_lanes,neon_load4_all_lanes_q,neon_tbl3_q,neon_tbl4,neon_store1_2reg_q,neon_store1_3reg,neon_store1_4reg,neon_store2_2reg_q,neon_store3_3reg,neon_store4_4reg,neon_store3_one_lane,neon_store3_one_lane_q,neon_store4_one_lane,neon_store4_one_lane_q,neon_stp_q")
+ (const_int 4)
+ (eq_attr "type" "neon_tbl4_q")
+ (const_int 5)
+ (eq_attr "type" "neon_store1_3reg_q,neon_store3_3reg_q")
+ (const_int 6)
+ (eq_attr "type" "neon_store1_4reg_q,neon_store4_4reg_q")
+ (const_int 8)
+ (eq_attr "type" "multiple")
+ (const_int 2)
+ ]
+ (const_int 1)))
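(For what it's worth the table reads sensibly to me; e.g. neon_store1_4reg_q
counts as 8 uops, which together with the issue_rate bump from 4 to 8
above means such a store fills an issue cycle on its own, so I take it
the issue_rate change and this attribute are meant to be read together.)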
Index: trunk/gcc/config/aarch64/aarch64-tuning-flags.def
===================================================================
--- trunk.orig/gcc/config/aarch64/aarch64-tuning-flags.def
+++ trunk/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -46,4 +46,7 @@ AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp
AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
+/* Enable scheduling based on micro-ops instead of the default
+   per-instruction scheduling.  */
+AARCH64_EXTRA_TUNING_OPTION ("sched_microops", SCHED_MICRO_OPS)
+
#undef AARCH64_EXTRA_TUNING_OPTION
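One nice side effect of doing this through a tuning flag: if I'm reading
the -moverride handling correctly, it should be possible to flip this on
and off for experiments with something like
-mcpu=falkor -moverride=tune=sched_microops.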