On Tue, Dec 17, 2024 at 11:53:24AM +0000, Andrew Carlotti wrote: > This pass is used to optimise assignments to the FPMR register in > aarch64. I chose to implement this as a middle-end pass because it > mostly reuses the existing RTL PRE code within gcse.cc. > > Compared to RTL PRE, the key difference in this new pass is that we > insert new writes directly to the destination hardreg, instead of > writing to a new pseudo-register and copying the result later. This > requires changes to the analysis portion of the pass, because sets > cannot be moved before existing instructions that set, use or clobber > the hardreg, and the value becomes unavailable after any uses or > clobbers of the hardreg. > > Any uses of the hardreg in debug insns will be deleted. We could do > better than this, but for the aarch64 fpmr I don't think we emit useful > debuginfo for deleted fp8 instructions anyway (and I don't even know if > it's possible to have a debug fpmr use when entering hardreg PRE). > > > Compared to the first version, I've now fixed the broken debug uses, and > simplified a lot of the analysis (it turns out DF analysis already provides > cleaner versions of the checks I need). I also fixed a couple of other minor > bugs (including one that broke the build on every target except aarch64). > > The new tests pass; I haven't rerun a bootstrap or full regression test yet, > but this should be NFC except for aarch64 code that uses the fpmr register. > > Is this ok for master?
I believe all the outstanding questions and gaps on the v1 patch thread have been addressed, so is this ok for master? > gcc/ChangeLog: > > * config/aarch64/aarch64.h (HARDREG_PRE_REGNOS): New macro. > * gcse.cc (doing_hardreg_pre_p): New global variable. > (do_load_motion): New boolean check. > (current_hardreg_regno): New global variable. > (compute_local_properties): Unset transp for hardreg clobbers. > (prune_hardreg_uses): New function. > (want_to_gcse_p): Use different checks for hardreg PRE. > (oprs_unchanged_p): Disable load motion for hardreg PRE pass. > (hash_scan_set): For hardreg PRE, skip non-hardreg sets and > check for hardreg clobbers. > (record_last_mem_set_info): Skip for hardreg PRE. > (compute_pre_data): Prune hardreg uses from transp bitmap. > (pre_expr_reaches_here_p_work): Add sentence to comment. > (insert_insn_start_basic_block): New functions. > (pre_edge_insert): Don't add hardreg sets to predecessor block. > (pre_delete): Use hardreg for the reaching reg. > (reset_hardreg_debug_uses): New function. > (pre_gcse): For hardreg PRE, reset debug uses and don't insert > copies. > (one_pre_gcse_pass): Disable load motion for hardreg PRE. > (execute_hardreg_pre): New. > (class pass_hardreg_pre): New. > (pass_hardreg_pre::gate): New. > (make_pass_hardreg_pre): New. > * passes.def (pass_hardreg_pre): New pass. > * tree-pass.h (make_pass_hardreg_pre): New. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/acle/fpmr-1.c: New test. > * gcc.target/aarch64/acle/fpmr-2.c: New test. > * gcc.target/aarch64/acle/fpmr-3.c: New test. > * gcc.target/aarch64/acle/fpmr-4.c: New test. 
> > > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > index > f1251f67c74e8da8420bad2d07a11a98a7de37ff..61837a4a98744225b9d15cfbc37cc914ac48421b > 100644 > --- a/gcc/config/aarch64/aarch64.h > +++ b/gcc/config/aarch64/aarch64.h > @@ -1652,6 +1652,10 @@ enum class aarch64_tristate_mode : int { NO, YES, > MAYBE }; > { int (aarch64_tristate_mode::MAYBE), \ > int (aarch64_local_sme_state::ANY) } > > +/* Zero terminated list of regnos for which hardreg PRE should be > + applied. */ > +#define HARDREG_PRE_REGNOS { FPM_REGNUM, 0 } > + > #endif > > #endif /* GCC_AARCH64_H */ > diff --git a/gcc/gcse.cc b/gcc/gcse.cc > index > 31b92f30fa1ba6c519429d4b7bc55547b2d71c01..f33de3747b896950568154acbfac1817519fe748 > 100644 > --- a/gcc/gcse.cc > +++ b/gcc/gcse.cc > @@ -415,6 +415,17 @@ static int gcse_create_count; > > /* Doing code hoisting. */ > static bool doing_code_hoisting_p = false; > + > +/* Doing hardreg_pre. */ > +static bool doing_hardreg_pre_p = false; > + > +inline bool > +do_load_motion () > +{ > + return flag_gcse_lm && !doing_hardreg_pre_p; > +} > + > +static unsigned int current_hardreg_regno; > > /* For available exprs */ > static sbitmap *ae_kill; > @@ -689,14 +700,32 @@ compute_local_properties (sbitmap *transp, sbitmap > *comp, sbitmap *antloc, > int indx = expr->bitmap_index; > struct gcse_occr *occr; > > - /* The expression is transparent in this block if it is not killed. > - We start by assuming all are transparent [none are killed], and > - then reset the bits for those that are. */ > + /* In most cases, the expression is transparent in the block if it is > + not killed. The exception to this is during hardreg PRE, in which > + uses of the hardreg prevent transparency but do not kill the > + expression. > + > + We start by assuming all expressions are transparent [none are > + killed], and then reset the bits for those that are. 
*/ > if (transp) > - compute_transp (expr->expr, indx, transp, > - blocks_with_calls, > - modify_mem_list_set, > - canon_modify_mem_list); > + { > + compute_transp (expr->expr, indx, transp, > + blocks_with_calls, > + modify_mem_list_set, > + canon_modify_mem_list); > + > + if (doing_hardreg_pre_p) > + { > + /* We also need to check whether the destination hardreg is > + set or call-clobbered in each BB. We'll check for hardreg > + uses later. */ > + df_ref def; > + for (def = DF_REG_DEF_CHAIN (current_hardreg_regno); > + def; > + def = DF_REF_NEXT_REG (def)) > + bitmap_clear_bit (transp[DF_REF_BB (def)->index], indx); > + } > + } > > /* The occurrences recorded in antic_occr are exactly those that > we want to set to nonzero in ANTLOC. */ > @@ -728,6 +757,37 @@ compute_local_properties (sbitmap *transp, sbitmap > *comp, sbitmap *antloc, > } > } > } > + > +/* A hardreg set is not transparent in a block if there are any uses of that > + hardreg. This filters the results of compute_local_properties, after the > + result of that function has been used to define the kills bitmap. > + > + TRANSP is the destination sbitmap to be updated. > + > + TABLE controls which hash table to look at. */ > + > +static void > +prune_hardreg_uses (sbitmap *transp, struct gcse_hash_table_d *table) > +{ > + unsigned int i; > + gcc_assert (doing_hardreg_pre_p); > + > + for (i = 0; i < table->size; i++) > + { > + struct gcse_expr *expr; > + > + for (expr = table->table[i]; expr != NULL; expr = expr->next_same_hash) > + { > + int indx = expr->bitmap_index; > + df_ref def; > + > + for (def = DF_REG_USE_CHAIN (current_hardreg_regno); > + def; > + def = DF_REF_NEXT_REG (def)) > + bitmap_clear_bit (transp[DF_REF_BB (def)->index], indx); > + } > + } > +} > > /* Hash table support. */ > > @@ -771,17 +831,24 @@ want_to_gcse_p (rtx x, machine_mode mode, HOST_WIDE_INT > *max_distance_ptr) > pressure, i.e., a pseudo register with REG_EQUAL to constant > is set only once. 
Failing to do so will result in IRA/reload > spilling such constants under high register pressure instead of > - rematerializing them. */ > + rematerializing them. > + > + For hardreg PRE, register pressure is not a concern, and we also want to > + apply GCSE to simple moves. */ > > switch (GET_CODE (x)) > { > case REG: > case SUBREG: > + return doing_hardreg_pre_p; > + > case CALL: > return false; > > CASE_CONST_ANY: > - if (!doing_code_hoisting_p) > + if (doing_hardreg_pre_p) > + return true; > + else if (!doing_code_hoisting_p) > /* Do not PRE constants. */ > return false; > > @@ -911,7 +978,7 @@ oprs_unchanged_p (const_rtx x, const rtx_insn *insn, bool > avail_p) > } > > case MEM: > - if (! flag_gcse_lm > + if (! do_load_motion () > || load_killed_in_block_p (current_bb, DF_INSN_LUID (insn), > x, avail_p)) > return false; > @@ -1258,8 +1325,10 @@ hash_scan_set (rtx set, rtx_insn *insn, struct > gcse_hash_table_d *table) > && want_to_gcse_p (XEXP (note, 0), GET_MODE (dest), NULL)) > src = XEXP (note, 0), set = gen_rtx_SET (dest, src); > > - /* Only record sets of pseudo-regs in the hash table. */ > - if (regno >= FIRST_PSEUDO_REGISTER > + /* Only record sets of pseudo-regs in the hash table, unless we're > + currently doing hardreg switching. */ > + if ((doing_hardreg_pre_p ? regno == current_hardreg_regno > + : regno >= FIRST_PSEUDO_REGISTER) > /* Don't GCSE something if we can't do a reg/reg copy. */ > && can_copy_p (GET_MODE (dest)) > /* GCSE commonly inserts instruction after the insn. We can't > @@ -1286,12 +1355,33 @@ hash_scan_set (rtx set, rtx_insn *insn, struct > gcse_hash_table_d *table) > able to handle code motion of insns with multiple sets. */ > bool antic_p = (oprs_anticipatable_p (src, insn) > && !multiple_sets (insn)); > + if (doing_hardreg_pre_p) > + { > + /* An hardreg assignment is anticipatable only if the hardreg is > + neither set nor used prior to this assignment. 
*/ > + auto info = reg_avail_info[current_hardreg_regno]; > + if ((info.last_bb == current_bb > + && info.first_set < DF_INSN_LUID (insn)) > + || bitmap_bit_p (DF_LR_IN (current_bb), > + current_hardreg_regno)) > + antic_p = false; > + } > + > /* An expression is not available if its operands are > subsequently modified, including this insn. It's also not > available if this is a branch, because we can't insert > a set after the branch. */ > bool avail_p = (oprs_available_p (src, insn) > && ! JUMP_P (insn)); > + if (doing_hardreg_pre_p) > + { > + /* An hardreg assignment is only available if the hardreg is > + not set later in the BB. Uses of the hardreg are allowed. */ > + auto info = reg_avail_info[current_hardreg_regno]; > + if (info.last_bb == current_bb > + && info.last_set > DF_INSN_LUID (insn)) > + avail_p = false; > + } > > insert_expr_in_table (src, GET_MODE (dest), insn, antic_p, avail_p, > max_distance, table); > @@ -1300,7 +1390,10 @@ hash_scan_set (rtx set, rtx_insn *insn, struct > gcse_hash_table_d *table) > /* In case of store we want to consider the memory value as available in > the REG stored in that memory. This makes it possible to remove > redundant loads from due to stores to the same location. */ > - else if (flag_gcse_las && REG_P (src) && MEM_P (dest)) > + else if (flag_gcse_las > + && !doing_hardreg_pre_p > + && REG_P (src) > + && MEM_P (dest)) > { > unsigned int regno = REGNO (src); > HOST_WIDE_INT max_distance = 0; > @@ -1460,7 +1553,7 @@ record_last_reg_set_info (rtx_insn *insn, int regno) > static void > record_last_mem_set_info (rtx_insn *insn) > { > - if (! flag_gcse_lm) > + if (! 
do_load_motion ()) > return; > > record_last_mem_set_info_common (insn, modify_mem_list, > canon_modify_mem_list, > modify_mem_list_set, > blocks_with_calls); > @@ -1884,6 +1977,9 @@ compute_pre_data (void) > bitmap_not (ae_kill[bb->index], ae_kill[bb->index]); > } > > + if (doing_hardreg_pre_p) > + prune_hardreg_uses (transp, &expr_hash_table); > + > edge_list = pre_edge_lcm (expr_hash_table.n_elems, transp, comp, antloc, > ae_kill, &pre_insert_map, &pre_delete_map); > sbitmap_vector_free (antloc); > @@ -1938,7 +2034,10 @@ pre_expr_reaches_here_p_work (basic_block occr_bb, > struct gcse_expr *expr, > > visited[pred_bb->index] = 1; > } > - /* Ignore this predecessor if it kills the expression. */ > + /* Ignore this predecessor if it kills the expression. > + > + If this were used for hardreg pre, then it would need to use the kills > + bitmap. */ > else if (! bitmap_bit_p (transp[pred_bb->index], expr->bitmap_index)) > visited[pred_bb->index] = 1; > > @@ -2109,6 +2208,59 @@ insert_insn_end_basic_block (struct gcse_expr *expr, > basic_block bb) > } > } > > +/* Return the INSN which is added at the start of the block BB with > + the same instruction pattern as PAT. */ > + > +rtx_insn * > +insert_insn_start_basic_block (rtx_insn *pat, basic_block bb) > +{ > + rtx_insn *insn = BB_HEAD (bb); > + rtx_insn *next_insn; > + > + gcc_assert (pat && INSN_P (pat)); > + > + /* Insert after the last initial CODE_LABEL or NOTE_INSN_BASIC_BLOCK, > before > + any other instructions. */ > + while ((next_insn = NEXT_INSN (insn)) > + && (LABEL_P (next_insn) || NOTE_INSN_BASIC_BLOCK_P (next_insn))) > + insn = next_insn; > + > + rtx_insn *new_insn = emit_insn_after_noloc (pat, insn, bb); > + > + while (pat != NULL_RTX) > + { > + if (INSN_P (pat)) > + add_label_notes (PATTERN (pat), new_insn); > + pat = NEXT_INSN (pat); > + } > + > + return new_insn; > +} > + > +/* Add EXPR to the start of basic block BB. > + > + This is used by hardreg PRE.
*/ > + > +static void > +insert_insn_start_basic_block (struct gcse_expr *expr, basic_block bb) > +{ > + rtx reg = expr->reaching_reg; > + int regno = REGNO (reg); > + > + rtx_insn *insn = process_insert_insn (expr); > + rtx_insn *new_insn = insert_insn_start_basic_block (insn, bb); > + > + gcse_create_count++; > + > + if (dump_file) > + { > + fprintf (dump_file, "hardreg PRE: start of bb %d, insn %d, ", > + bb->index, INSN_UID (new_insn)); > + fprintf (dump_file, "copying expression %d to reg %d\n", > + expr->bitmap_index, regno); > + } > +} > + > /* Insert partially redundant expressions on edges in the CFG to make > the expressions fully redundant. */ > > @@ -2130,7 +2282,8 @@ pre_edge_insert (struct edge_list *edge_list, struct > gcse_expr **index_map) > for (e = 0; e < num_edges; e++) > { > int indx; > - basic_block bb = INDEX_EDGE_PRED_BB (edge_list, e); > + basic_block pred_bb = INDEX_EDGE_PRED_BB (edge_list, e); > + basic_block succ_bb = INDEX_EDGE_SUCC_BB (edge_list, e); > > for (i = indx = 0; i < set_size; i++, indx += SBITMAP_ELT_BITS) > { > @@ -2159,13 +2312,24 @@ pre_edge_insert (struct edge_list *edge_list, struct > gcse_expr **index_map) > > /* We can't insert anything on an abnormal and > critical edge, so we insert the insn at the end of > - the previous block. There are several alternatives > + the previous block. There are several alternatives > detailed in Morgans book P277 (sec 10.5) for > handling this situation. This one is easiest for > - now. */ > + now. > > + For hardreg PRE this would add an unwanted clobber > + of the hardreg, so we instead insert in the > + successor block. This may be partially redundant, > + but it is at least correct. 
*/ > if (eg->flags & EDGE_ABNORMAL) > - insert_insn_end_basic_block (index_map[j], bb); > + { > + if (doing_hardreg_pre_p) > + insert_insn_start_basic_block (index_map[j], > + succ_bb); > + else > + insert_insn_end_basic_block (index_map[j], > + pred_bb); > + } > else > { > insn = process_insert_insn (index_map[j]); > @@ -2175,8 +2339,8 @@ pre_edge_insert (struct edge_list *edge_list, struct > gcse_expr **index_map) > if (dump_file) > { > fprintf (dump_file, "PRE: edge (%d,%d), ", > - bb->index, > - INDEX_EDGE_SUCC_BB (edge_list, e)->index); > + pred_bb->index, > + succ_bb->index); > fprintf (dump_file, "copy expression %d\n", > expr->bitmap_index); > } > @@ -2491,13 +2655,25 @@ pre_delete (void) > && (set = single_set (insn)) != 0 > && dbg_cnt (pre_insn)) > { > - /* Create a pseudo-reg to store the result of reaching > - expressions into. Get the mode for the new pseudo from > - the mode of the original destination pseudo. */ > + rtx dest = SET_DEST (set); > if (expr->reaching_reg == NULL) > - expr->reaching_reg = gen_reg_rtx_and_attrs (SET_DEST (set)); > + { > + if (doing_hardreg_pre_p) > + /* Use the hardreg as the reaching register. The > + deleted sets will be replaced with noop moves. > + > + This may change the value of the hardreg in some debug > + instructions, so we will need to reset any debug uses > + of the hardreg. */ > + expr->reaching_reg = dest; > + else > + /* Create a pseudo-reg to store the result of reaching > + expressions into. Get the mode for the new pseudo from > + the mode of the original destination pseudo. 
*/ > + expr->reaching_reg = gen_reg_rtx_and_attrs (SET_DEST > (set)); > + } > > - gcse_emit_move_after (SET_DEST (set), expr->reaching_reg, insn); > + gcse_emit_move_after (dest, expr->reaching_reg, insn); > delete_insn (insn); > occr->deleted_p = 1; > changed = true; > @@ -2518,6 +2694,25 @@ pre_delete (void) > return changed; > } > > +/* Since hardreg PRE reuses the hardreg as the reaching register, we need to > + eliminate any existing uses in debug insns. This is overly conservative, > + but there's currently no benefit to preserving the debug insns, so there's > + no point doing the work to retain them. */ > + > +static void > +reset_hardreg_debug_uses () > +{ > + df_ref def; > + for (def = DF_REG_USE_CHAIN (current_hardreg_regno); > + def; > + def = DF_REF_NEXT_REG (def)) > + { > + rtx_insn *insn = DF_REF_INSN (def); > + if (DEBUG_INSN_P (insn)) > + delete_insn (insn); > + } > +} > + > /* Perform GCSE optimizations using PRE. > This is called by one_pre_gcse_pass after all the dataflow analysis > has been done. > @@ -2561,12 +2756,16 @@ pre_gcse (struct edge_list *edge_list) > > changed = pre_delete (); > did_insert = pre_edge_insert (edge_list, index_map); > - > /* In other places with reaching expressions, copy the expression to the > - specially allocated pseudo-reg that reaches the redundant expr. */ > - pre_insert_copies (); > + specially allocated pseudo-reg that reaches the redundant expr. This > + isn't needed for hardreg PRE. 
*/ > + if (!doing_hardreg_pre_p) > + pre_insert_copies (); > + > if (did_insert) > { > + if (doing_hardreg_pre_p) > + reset_hardreg_debug_uses (); > commit_edge_insertions (); > changed = true; > } > @@ -2601,11 +2800,11 @@ one_pre_gcse_pass (void) > > alloc_hash_table (&expr_hash_table); > add_noreturn_fake_exit_edges (); > - if (flag_gcse_lm) > + if (do_load_motion ()) > compute_ld_motion_mems (); > > compute_hash_table (&expr_hash_table); > - if (flag_gcse_lm) > + if (do_load_motion ()) > trim_ld_motion_mems (); > if (dump_file) > dump_hash_table (dump_file, "Expression", &expr_hash_table); > @@ -2621,7 +2820,7 @@ one_pre_gcse_pass (void) > free_pre_mem (); > } > > - if (flag_gcse_lm) > + if (do_load_motion ()) > free_ld_motion_mems (); > remove_fake_exit_edges (); > free_hash_table (&expr_hash_table); > @@ -4028,6 +4227,32 @@ execute_rtl_pre (void) > return 0; > } > > +static unsigned int > +execute_hardreg_pre (void) > +{ > +#ifdef HARDREG_PRE_REGNOS > + doing_hardreg_pre_p = true; > + unsigned int regnos[] = HARDREG_PRE_REGNOS; > + /* It's possible to avoid this loop, but it isn't worth doing so until > + hardreg PRE is used for multiple hardregs. 
*/ > + for (int i = 0; regnos[i] != 0; i++) > + { > + int changed; > + current_hardreg_regno = regnos[i]; > + if (dump_file) > + fprintf (dump_file, "Entering hardreg PRE for regno %d\n", > + current_hardreg_regno); > + delete_unreachable_blocks (); > + df_analyze (); > + changed = one_pre_gcse_pass (); > + if (changed) > + cleanup_cfg (0); > + } > + doing_hardreg_pre_p = false; > +#endif > + return 0; > +} > + > static unsigned int > execute_rtl_hoist (void) > { > @@ -4096,6 +4321,56 @@ make_pass_rtl_pre (gcc::context *ctxt) > > namespace { > > +const pass_data pass_data_hardreg_pre = > +{ > + RTL_PASS, /* type */ > + "hardreg_pre", /* name */ > + OPTGROUP_NONE, /* optinfo_flags */ > + TV_PRE, /* tv_id */ > + PROP_cfglayout, /* properties_required */ > + 0, /* properties_provided */ > + 0, /* properties_destroyed */ > + 0, /* todo_flags_start */ > + TODO_df_finish, /* todo_flags_finish */ > +}; > + > +class pass_hardreg_pre : public rtl_opt_pass > +{ > +public: > + pass_hardreg_pre (gcc::context *ctxt) > + : rtl_opt_pass (pass_data_hardreg_pre, ctxt) > + {} > + > + /* opt_pass methods: */ > + bool gate (function *) final override; > + unsigned int execute (function *) final override > + { > + return execute_hardreg_pre (); > + } > + > +}; // class pass_hardreg_pre > + > +bool > +pass_hardreg_pre::gate (function *fun) > +{ > +#ifdef HARDREG_PRE_REGNOS > + return optimize > 0 > + && !fun->calls_setjmp; > +#else > + return false; > +#endif > +} > + > +} // anon namespace > + > +rtl_opt_pass * > +make_pass_hardreg_pre (gcc::context *ctxt) > +{ > + return new pass_hardreg_pre (ctxt); > +} > + > +namespace { > + > const pass_data pass_data_rtl_hoist = > { > RTL_PASS, /* type */ > diff --git a/gcc/passes.def b/gcc/passes.def > index > ae85ae72dff734a8698f606254970437e2bf93a5..95d72b22761eec3668a4d5bbcaa8e41fcc4d830a > 100644 > --- a/gcc/passes.def > +++ b/gcc/passes.def > @@ -463,6 +463,7 @@ along with GCC; see the file COPYING3.
If not see > NEXT_PASS (pass_rtl_cprop); > NEXT_PASS (pass_rtl_pre); > NEXT_PASS (pass_rtl_hoist); > + NEXT_PASS (pass_hardreg_pre); > NEXT_PASS (pass_rtl_cprop); > NEXT_PASS (pass_rtl_store_motion); > NEXT_PASS (pass_cse_after_global_opts); > diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-1.c > b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-1.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..f7a47f81c5ea4639827d4c902f316932120f44af > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-1.c > @@ -0,0 +1,58 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O1 -march=armv8-a+fp8fma" } */ > + > +#include <arm_neon.h> > + > +void foo(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c, int br) > +{ > + float16x8_t a; > + a = vld1q_f16(ap); > + a = vmlalbq_f16_mf8_fpm(a, b, c, 13); > + vst1q_f16(ap, a); > + if (br) > + { > + a = vld1q_f16(ap + 8); > + a = vmlalbq_f16_mf8_fpm(a, b, c, 13); > + vst1q_f16(ap + 8, a); > + a = vld1q_f16(ap + 16); > + a = vmlalbq_f16_mf8_fpm(a, b, c, 13); > + vst1q_f16(ap + 16, a); > + } > + else > + { > + a = vld1q_f16(ap + 24); > + a = vmlalbq_f16_mf8_fpm(a, b, c, 13); > + vst1q_f16(ap + 24, a); > + } > + a = vld1q_f16(ap + 32); > + a = vmlalbq_f16_mf8_fpm(a, b, c, 13); > + vst1q_f16(ap + 32, a); > +} > + > +void bar(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c, fpm_t mode, > int br) > +{ > + float16x8_t a; > + a = vld1q_f16(ap); > + a = vmlalbq_f16_mf8_fpm(a, b, c, mode); > + vst1q_f16(ap, a); > + if (br) > + { > + a = vld1q_f16(ap + 8); > + a = vmlalbq_f16_mf8_fpm(a, b, c, mode); > + vst1q_f16(ap + 8, a); > + a = vld1q_f16(ap + 16); > + a = vmlalbq_f16_mf8_fpm(a, b, c, mode); > + vst1q_f16(ap + 16, a); > + } > + else > + { > + a = vld1q_f16(ap + 24); > + a = vmlalbq_f16_mf8_fpm(a, b, c, mode); > + vst1q_f16(ap + 24, a); > + } > + a = vld1q_f16(ap + 32); > + a = vmlalbq_f16_mf8_fpm(a, b, c, mode); > + vst1q_f16(ap + 32, a); > +} > + > +/* { dg-final { scan-assembler-times "msr\tfpmr" 2 
} } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c > b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..c5b255b0a9a8ea9161217b22f19adaf58c899dbb > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c > @@ -0,0 +1,15 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O1 -march=armv8-a+fp8fma" } */ > + > +#include <arm_neon.h> > + > +void foo(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c) > +{ > + for (int i = 0; i < 103; i++) > + { > + float16x8_t a = vld1q_f16(ap + 8*i); > + a = vmlalbq_f16_mf8_fpm(a, b, c, 13); > + vst1q_f16(ap + 8*i, a); > + } > +} > +/* { dg-final { scan-assembler "msr\tfpmr.*\n\.L2" } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-3.c > b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-3.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..73a79ad4b44e2b950cf7ea3e914254b5fdc05b69 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-3.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O1 -march=armv8-a+fp8fma" } */ > + > +#include <arm_neon.h> > + > +void foo(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c, fpm_t mode) > +{ > + float16x8_t x = vld1q_f16(ap + 1); > + x = vmlalbq_f16_mf8_fpm(x, b, c, mode); > + vst1q_f16(ap + 1, x); > + for (int i = 0; i < 103; i++) > + { > + float16x8_t a = vld1q_f16(ap + 8*i); > + a = vmlalbq_f16_mf8_fpm(a, b, c, mode); > + vst1q_f16(ap + 8*i, a); > + } > +} > +/* { dg-final { scan-assembler-times "msr\tfpmr" 1 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-4.c > b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-4.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..18c1def752f557e98868250cd73442fb9f556e18 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-4.c > @@ -0,0 +1,23 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O1 -march=armv8-a+fp8fma" } */ > + > +#include 
<arm_neon.h> > + > +void baz(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c) > +{ > + float16x8_t x = vld1q_f16(ap + 1); > + x = vmlalbq_f16_mf8_fpm(x, b, c, 13); > + vst1q_f16(ap + 1, x); > + for (int i = 0; i < 10; i++) > + { > + float16x8_t a = vld1q_f16(ap + 16*i); > + a = vmlalbq_f16_mf8_fpm(a, b, c, 13); > + vst1q_f16(ap + 16*i, a); > + a = vld1q_f16(ap + 16*i + 8); > + a = vmlalbq_f16_mf8_fpm(a, b, c, 865); > + vst1q_f16(ap + 16*i+8, a); > + } > +} > + > +/* { dg-final { scan-assembler-times "msr\tfpmr" 3 } } */ > +/* { dg-final { scan-assembler "msr\tfpmr.*\n\tb\t" } } */ > diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h > index > ce463629194a7298b70da6463706caea0b28dabd..797d719b2c45ffa2d71c7e94687bf1d5ac19c69f > 100644 > --- a/gcc/tree-pass.h > +++ b/gcc/tree-pass.h > @@ -573,6 +573,7 @@ extern rtl_opt_pass *make_pass_rtl_dse3 (gcc::context > *ctxt); > extern rtl_opt_pass *make_pass_rtl_cprop (gcc::context *ctxt); > extern rtl_opt_pass *make_pass_rtl_pre (gcc::context *ctxt); > extern rtl_opt_pass *make_pass_rtl_hoist (gcc::context *ctxt); > +extern rtl_opt_pass *make_pass_hardreg_pre (gcc::context *ctxt); > extern rtl_opt_pass *make_pass_rtl_avoid_store_forwarding (gcc::context > *ctxt); > extern rtl_opt_pass *make_pass_rtl_store_motion (gcc::context *ctxt); > extern rtl_opt_pass *make_pass_cse_after_global_opts (gcc::context *ctxt);