Hi Kai,
On 11/10/18 16:25, Kai Tietz wrote:
Hi,
I reworked the patch to use a tuning flag instead of checking explicitly
for the CPU flavor. I will soon send an update which won't use the
static variable anymore and instead uses the SCHED API.
I would like first to get some comments on current version.
Regards,
Kai
Index: trunk/gcc/config/aarch64/aarch64.c
===================================================================
--- trunk.orig/gcc/config/aarch64/aarch64.c
+++ trunk/gcc/config/aarch64/aarch64.c
@@ -955,7 +955,7 @@ static const struct tune_params qdf24xx_
&generic_branch_cost,
&generic_approx_modes,
4, /* memmov_cost */
- 4, /* issue_rate */
+ 8, /* issue_rate */
(AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
| AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
"16", /* function_align. */
@@ -968,7 +968,7 @@ static const struct tune_params qdf24xx_
2, /* min_div_recip_mul_df. */
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
+ AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS | AARCH64_EXTRA_TUNE_SCHED_MICRO_OPS, /* tune_flags. */
Thanks for this, this looks much cleaner and more extensible.
&qdf24xx_prefetch_tune
};
@@ -18037,6 +18037,109 @@ aarch64_run_selftests (void)
#endif /* #if CHECKING_P */
+/* The number of micro ops left over after issuing the last instruction in a
+ cycle. This is subtracted from the next cycle before we start issuing
+ insns. This is initialized to 0 at the start of every basic block. */
+static int leftover_uops = 0;
+
+/* Implement TARGET_SCHED_REORDER. */
+
+static int
+aarch64_sched_reorder (FILE *file, int verbose,
+ rtx_insn **ready ATTRIBUTE_UNUSED,
+ int *n_readyp ATTRIBUTE_UNUSED,
+ int clock)
+{
+ int can_issue_more = aarch64_sched_issue_rate ();
+
+ if ((aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_SCHED_MICRO_OPS) != 0)
+ {
+ /* The start of a basic block. */
+ if (clock == 0)
+ {
+ if (leftover_uops && file && (verbose > 3))
+ fprintf (file, ";;\tLeftover uops ignored at bb start.\n");
+
+ leftover_uops = 0;
+ }
+
+ /* Account for issue slots left over from previous cycle. This value
+ can be larger than the number of issue slots per cycle, so we need
+ to check it here before scheduling any instructions. */
+ else if (leftover_uops)
+ {
+ can_issue_more -= leftover_uops;
+
+ if (file && (verbose > 3))
+ {
+ fprintf (file, ";;\tUse %d issue slots for leftover uops.\n",
+ leftover_uops);
+ fprintf (file, ";;\t%d issue slots left.\n", can_issue_more);
+ }
+
+ leftover_uops = 0;
+
+ if (can_issue_more < 0)
+ {
+ leftover_uops = 0 - can_issue_more;
+ can_issue_more = 0;
+
+ if (file && (verbose > 3))
+ {
+ fprintf (file, ";;skipping issue cycle.\n");
+ fprintf (file, ";;\t%d uops left over.\n", leftover_uops);
+ }
+ }
+ }
+ }
+
+ return can_issue_more;
+}
+
+/* Implement TARGET_SCHED_VARIABLE_ISSUE. */
+
+static int
+aarch64_variable_issue (FILE *file, int verbose,
+ rtx_insn *insn, int more)
+{
+ if (GET_CODE (PATTERN (insn)) != USE
+ && GET_CODE (PATTERN (insn)) != CLOBBER)
+ {
+ if ((aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_SCHED_MICRO_OPS) == 0)
+ more -= 1;
+ else
+ {
+ /* For now only the Falkor target supports scheduling of micro
+ operations, so we don't need to check here. */
+ int issue_slots = get_attr_falkor_variable_issue (insn);
Having this falkor-specific path is still my concern.
I think we'll want to go with my suggestion at
https://gcc.gnu.org/ml/gcc-patches/2018-08/msg00815.html
which is to create a generic attribute, called something like
"number_micro_ops" and have it populated for each core if they decide to use it.
Your patch would provide the falkor numbers initially. Then this part of aarch64.c
wouldn't be falkor-specific and could use something like
"get_attr_number_micro_ops".
That is really the biggest concern I had.
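Concretely, I'd expect this spot to then look something like the
following (just a sketch on my side; the attribute name and default are
illustrative, not final):

  /* Assumes aarch64.md gains something like
       (define_attr "number_micro_ops" "" (const_int 1))
     with per-core overrides (your falkor.md table would provide the
     Falkor values), so no per-core check is needed here.  */
  int issue_slots = get_attr_number_micro_ops (insn);
  more -= issue_slots;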
Thanks,
Kyrill
+ more -= issue_slots;
+
+ if (file && (verbose > 3))
+ {
+ fprintf (file, ";;\tInsn takes %d issue slots.\n", issue_slots);
+ fprintf (file, ";;\t%d issue slots left.\n", more);
+ }
+
+ /* We schedule an instruction first, and then subtract issue slots,
+ which means the result can be negative. We carry the extra over
+ to the next cycle. */
+
+ if (more < 0)
+ {
+ leftover_uops = 0 - more;
+ more = 0;
+
+ if (file && (verbose > 3))
+ fprintf (file, ";;\t%d uops left over.\n", leftover_uops);
+ }
+ }
+ }
+
+ return more;
+}
+
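Just to check that I follow the bookkeeping between the two hooks, here
is a minimal standalone sketch (my own, not part of the patch; the two
helpers mirror the hooks above, the driver and the uop counts are made
up, and the clock == 0 basic-block reset is omitted):

  #include <stdio.h>

  static int leftover_uops = 0;

  /* Mirrors aarch64_sched_reorder: slots available at the start of a
     cycle, after paying for uops carried over from the previous one.  */
  static int
  cycle_begin (int issue_rate)
  {
    int can_issue_more = issue_rate - leftover_uops;
    leftover_uops = 0;
    if (can_issue_more < 0)
      {
        /* Carry-over exceeded a full cycle: skip this one entirely.  */
        leftover_uops = -can_issue_more;
        can_issue_more = 0;
      }
    return can_issue_more;
  }

  /* Mirrors aarch64_variable_issue: charge one insn's uops against the
     remaining slots; anything over-committed is carried forward.  */
  static int
  issue_insn (int more, int insn_uops)
  {
    more -= insn_uops;
    if (more < 0)
      {
        leftover_uops = -more;
        more = 0;
      }
    return more;
  }

  int
  main (void)
  {
    int slots = cycle_begin (8);    /* 8 slots, nothing carried over */
    slots = issue_insn (slots, 6);  /* 2 slots left */
    slots = issue_insn (slots, 8);  /* over-commits by 6 */
    slots = cycle_begin (8);        /* next cycle starts with 8 - 6 slots */
    printf ("%d\n", slots);         /* prints 2 */
    return 0;
  }

So an insn that decodes to more uops than are left in the current cycle
simply eats into the next one, which looks right to me.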
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost
@@ -18265,6 +18368,12 @@ aarch64_libgcc_floating_mode_supported_p
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
+#undef TARGET_SCHED_REORDER
+#define TARGET_SCHED_REORDER aarch64_sched_reorder
+
+#undef TARGET_SCHED_VARIABLE_ISSUE
+#define TARGET_SCHED_VARIABLE_ISSUE aarch64_variable_issue
+
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
aarch64_sched_first_cycle_multipass_dfa_lookahead
Index: trunk/gcc/config/aarch64/falkor.md
===================================================================
--- trunk.orig/gcc/config/aarch64/falkor.md
+++ trunk/gcc/config/aarch64/falkor.md
@@ -685,3 +685,24 @@
(define_bypass 3
"falkor_afp_5_vxvy_mul,falkor_afp_5_vxvy_mla,falkor_afp_5_vxvy_vxvy_mul,falkor_afp_5_vxvy_vxvy_mla,falkor_afp_6_vxvy_mul,falkor_afp_6_vxvy_mla,falkor_afp_6_vxvy_vxvy_mul,falkor_afp_6_vxvy_vxvy_mla,falkor_fpdt_5_vxvy_mul,falkor_fpdt_5_vxvy_mla,falkor_fpdt_6_vxvy_mul,falkor_fpdt_6_vxvy_mla"
"falkor_afp_5_vxvy_mla,falkor_afp_5_vxvy_vxvy_mla,falkor_afp_6_vxvy_mla,falkor_afp_6_vxvy_vxvy_mla,falkor_fpdt_5_vxvy_mla,falkor_fpdt_6_vxvy_mla")
+
+
+(define_attr "falkor_variable_issue" ""
+ (cond [
+;; A64 Instructions
+ (eq_attr "type"
"neon_fp_neg_s_q,neon_fp_neg_d_q,neon_fp_abs_s_q,neon_fp_abs_d_q,neon_fp_minmax_s_q,neon_fp_minmax_d_q,neon_fp_compare_s_q,neon_fp_compare_d_q,neon_fp_round_s_q,neon_fp_round_d_q,neon_fp_abd_s_q,neon_fp_abd_d_q,neon_fp_addsub_s_q,neon_fp_addsub_d_q,neon_fp_reduc_add_s_q,neon_fp_reduc_add_d_q,neon_fp_to_int_s_q,neon_fp_to_int_d_q,neon_int_to_fp_s_q,neon_int_to_fp_d_q,neon_fp_mla_d_q,neon_fp_mla_d_scalar_q,neon_fp_div_s,neon_fp_div_d,neon_fp_sqrt_s,neon_fp_sqrt_d,neon_shift_imm_long,neon_add_q,neon_reduc_add_q,neon_logic_q,neon_neg_q,neon_sub_q,neon_add_halve_q,neon_sub_halve_q,neon_shift_imm_q,neon_shift_reg_q,neon_minmax_q,neon_abs_q,neon_compare_q,neon_compare_zero_q,neon_tst_q,neon_reduc_add_long,neon_shift_acc_q,neon_reduc_add_acc_q,neon_abd_q,neon_abd_long,neon_qadd_q,neon_qsub_q,neon_qabs_q,neon_qneg_q,neon_sat_shift_imm_q,neon_sat_shift_reg_q,neon_mul_b_q,neon_mul_h_q,neon_mul_s_q,neon_mul_h_scalar_q,neon_mul_s_scalar_q,neon_sat_mul_b_q,neon_sat_mul_h_q,neon_sat_mul_s_q,neon_mul_b_long,neon_mul_h_long,neon_mul_s_long,neon_mul_d_long,neon_mul_h_scalar_long,neon_mul_s_scalar_long,neon_sat_mul_b_long,neon_sat_mul_h_long,neon_sat_mul_s_long,neon_sat_mul_h_scalar_q,neon_sat_mul_s_scalar_q,neon_sat_mul_h_scalar_long,neon_sat_mul_s_scalar_long,neon_mla_b_q,neon_mla_h_q,neon_mla_s_q,neon_mla_h_scalar_q,neon_mla_s_scalar_q,neon_mla_b_long,neon_mla_h_long,neon_mla_s_long,neon_mla_h_scalar_long,neon_mla_s_scalar_long,neon_sat_mla_b_long,neon_sat_mla_h_long,neon_sat_mla_s_long,neon_sat_mla_h_scalar_long,neon_sat_mla_s_scalar_long,neon_add_halve_narrow_q,neon_sub_halve_narrow_q,neon_arith_acc,neon_load1_2reg,neon_load2_2reg,neon_load2_all_lanes,neon_load1_2reg_q,neon_load2_2reg_q,neon_load2_all_lanes_q,neon_load3_one_lane,neon_load4_one_lane,neon_ldp,neon_ldp_q,neon_from_gp_q,neon_bsl_q,neon_dup_q,neon_ext_q,neon_move_q,neon_rev_q,neon_tbl1_q,neon_permute_q,neon_cls_q,neon_cnt_q,neon_rbit_q,neon_tbl2,neon_fp_recpe_s_q,neon_fp_recpe_d_q,neon_fp_rsqrte_s_q,neon_fp_rsqrte_d_q,neon_fp_recps_s_q,neon_fp_recps_d_q,neon_fp_rsqrts_d_q,neon_store1_1reg,neon_store1_1reg_q,neon_store1_one_lane,neon_store1_one_lane_q,neon_store1_2reg,neon_store2_2reg,neon_store2_one_lane,neon_store2_one_lane_q,neon_stp,crypto_sha1_xor,crypto_sha256_fast,crypto_sha1_slow,crypto_sha256_slow,crypto_aese,f_stores,f_stored,fdivs,fdivd,sdiv,udiv")
+ (const_int 2)
+ (eq_attr "type"
"neon_fp_cvt_narrow_s_q,neon_fp_cvt_narrow_d_q,neon_load1_3reg,neon_load3_3reg,neon_load3_all_lanes,neon_load1_3reg_q,neon_load3_3reg_q,neon_load3_all_lanes_q,neon_tbl2_q,neon_tbl3")
+ (const_int 3)
+ (eq_attr "type"
"neon_fp_div_s_q,neon_fp_div_d_q,neon_fp_sqrt_s_q,neon_fp_sqrt_d_q,neon_add_widen,neon_sub_widen,neon_arith_acc_q,neon_load1_4reg,neon_load4_4reg,neon_load1_4reg_q,neon_load4_4reg_q,neon_load4_all_lanes,neon_load4_all_lanes_q,neon_tbl3_q,neon_tbl4,neon_store1_2reg_q,neon_store1_3reg,neon_store1_4reg,neon_store2_2reg_q,neon_store3_3reg,neon_store4_4reg,neon_store3_one_lane,neon_store3_one_lane_q,neon_store4_one_lane,neon_store4_one_lane_q,neon_stp_q")
+ (const_int 4)
+ (eq_attr "type" "neon_tbl4_q")
+ (const_int 5)
+ (eq_attr "type" "neon_store1_3reg_q,neon_store3_3reg_q")
+ (const_int 6)
+ (eq_attr "type" "neon_store1_4reg_q,neon_store4_4reg_q")
+ (const_int 8)
+ (eq_attr "type" "multiple")
+ (const_int 2)
+ ]
+ (const_int 1)))
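(For what it's worth the table reads sensibly to me; e.g. neon_store1_4reg_q
counts as 8 uops, which together with the issue_rate bump from 4 to 8
above means such a store fills an issue cycle on its own, so I take it
the issue_rate change and this attribute are meant to be read together.)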
Index: trunk/gcc/config/aarch64/aarch64-tuning-flags.def
===================================================================
--- trunk.orig/gcc/config/aarch64/aarch64-tuning-flags.def
+++ trunk/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -46,4 +46,7 @@ AARCH64_EXTRA_TUNING_OPTION ("no_ldp_stp
AARCH64_EXTRA_TUNING_OPTION ("rename_load_regs", RENAME_LOAD_REGS)
+/* Enable scheduling based on micro-ops instead of the default
+   per-instruction scheduling.  */
+AARCH64_EXTRA_TUNING_OPTION ("sched_microops", SCHED_MICRO_OPS)
+
#undef AARCH64_EXTRA_TUNING_OPTION
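One nice side effect of doing this through a tuning flag: if I'm reading
the -moverride handling correctly, it should be possible to flip this on
and off for experiments with something like
-mcpu=falkor -moverride=tune=sched_microops.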