On Tue, Dec 17, 2024 at 11:53:24AM +0000, Andrew Carlotti wrote:
> This pass is used to optimise assignments to the FPMR register in
> aarch64.  I chose to implement this as a middle-end pass because it
> mostly reuses the existing RTL PRE code within gcse.cc.
> 
> Compared to RTL PRE, the key difference in this new pass is that we
> insert new writes directly to the destination hardreg, instead of
> writing to a new pseudo-register and copying the result later.  This
> requires changes to the analysis portion of the pass, because sets
> cannot be moved before existing instructions that set, use or clobber
> the hardreg, and the value becomes unavailable after any uses or
> clobbers of the hardreg.
> 
> Any uses of the hardreg in debug insns will be deleted.  We could do
> better than this, but for the aarch64 fpmr I don't think we emit useful
> debuginfo for deleted fp8 instructions anyway (and I don't even know if
> it's possible to have a debug fpmr use when entering hardreg PRE).
> 
> 
> Compared to the first version, I've now fixed the broken debug uses, and
> simplified a lot of the analysis (it turns out DF analysis already provides
> cleaner versions of the checks I need).  I also fixed a couple of other minor
> bugs (including one that broke the build on every target except aarch64).
> 
> The new tests pass; I haven't rerun a bootstrap or full regression test yet,
> but this should be NFC except for aarch64 code that uses the fpmr register.
> 
> Is this ok for master?

I believe all the outstanding questions and gaps on the v1 patch thread have
been addressed, so is this ok for master?

> gcc/ChangeLog:
> 
>       * config/aarch64/aarch64.h (HARDREG_PRE_REGNOS): New macro.
>       * gcse.cc (doing_hardreg_pre_p): New global variable.
>       (do_load_motion): New boolean check.
>       (current_hardreg_regno): New global variable.
>       (compute_local_properties): Unset transp for hardreg clobbers.
>       (prune_hardreg_uses): New function.
>       (want_to_gcse_p): Use different checks for hardreg PRE.
>       (oprs_unchanged_p): Disable load motion for hardreg PRE pass.
>       (hash_scan_set): For hardreg PRE, skip non-hardreg sets and
>       check for hardreg clobbers.
>       (record_last_mem_set_info): Skip for hardreg PRE.
>       (compute_pre_data): Prune hardreg uses from transp bitmap.
>       (pre_expr_reaches_here_p_work): Add sentence to comment.
>       (insert_insn_start_basic_block): New functions.
>       (pre_edge_insert): Don't add hardreg sets to predecessor block.
>       (pre_delete): Use hardreg for the reaching reg.
>       (reset_hardreg_debug_uses): New function.
>       (pre_gcse): For hardreg PRE, reset debug uses and don't insert
>       copies.
>       (one_pre_gcse_pass): Disable load motion for hardreg PRE.
>       (execute_hardreg_pre): New.
>       (class pass_hardreg_pre): New.
>       (pass_hardreg_pre::gate): New.
>       (make_pass_hardreg_pre): New.
>       * passes.def (pass_hardreg_pre): New pass.
>       * tree-pass.h (make_pass_hardreg_pre): New.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.target/aarch64/acle/fpmr-1.c: New test.
>       * gcc.target/aarch64/acle/fpmr-2.c: New test.
>       * gcc.target/aarch64/acle/fpmr-3.c: New test.
>       * gcc.target/aarch64/acle/fpmr-4.c: New test.
> 
> 
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index 
> f1251f67c74e8da8420bad2d07a11a98a7de37ff..61837a4a98744225b9d15cfbc37cc914ac48421b
>  100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -1652,6 +1652,10 @@ enum class aarch64_tristate_mode : int { NO, YES, 
> MAYBE };
>    { int (aarch64_tristate_mode::MAYBE), \
>      int (aarch64_local_sme_state::ANY) }
>  
> +/* Zero terminated list of regnos for which hardreg PRE should be
> +   applied.  */
> +#define HARDREG_PRE_REGNOS { FPM_REGNUM, 0 }
> +
>  #endif
>  
>  #endif /* GCC_AARCH64_H */
> diff --git a/gcc/gcse.cc b/gcc/gcse.cc
> index 
> 31b92f30fa1ba6c519429d4b7bc55547b2d71c01..f33de3747b896950568154acbfac1817519fe748
>  100644
> --- a/gcc/gcse.cc
> +++ b/gcc/gcse.cc
> @@ -415,6 +415,17 @@ static int gcse_create_count;
>  
>  /* Doing code hoisting.  */
>  static bool doing_code_hoisting_p = false;
> +
> +/* Doing hardreg_pre.  */
> +static bool doing_hardreg_pre_p = false;
> +
> +inline bool
> +do_load_motion ()
> +{
> +  return flag_gcse_lm && !doing_hardreg_pre_p;
> +}
> +
> +static unsigned int current_hardreg_regno;
>  
>  /* For available exprs */
>  static sbitmap *ae_kill;
> @@ -689,14 +700,32 @@ compute_local_properties (sbitmap *transp, sbitmap 
> *comp, sbitmap *antloc,
>         int indx = expr->bitmap_index;
>         struct gcse_occr *occr;
>  
> -       /* The expression is transparent in this block if it is not killed.
> -          We start by assuming all are transparent [none are killed], and
> -          then reset the bits for those that are.  */
> +       /* In most cases, the expression is transparent in the block if it is
> +          not killed.  The exception to this is during hardreg PRE, in which
> +          uses of the hardreg prevent transparency but do not kill the
> +          expression.
> +
> +          We start by assuming all expressions are transparent [none are
> +          killed], and then reset the bits for those that are.  */
>         if (transp)
> -         compute_transp (expr->expr, indx, transp,
> -                         blocks_with_calls,
> -                         modify_mem_list_set,
> -                         canon_modify_mem_list);
> +         {
> +           compute_transp (expr->expr, indx, transp,
> +                           blocks_with_calls,
> +                           modify_mem_list_set,
> +                           canon_modify_mem_list);
> +
> +           if (doing_hardreg_pre_p)
> +             {
> +               /* We also need to check whether the destination hardreg is
> +                  set or call-clobbered in each BB.  We'll check for hardreg
> +                  uses later.  */
> +               df_ref def;
> +               for (def = DF_REG_DEF_CHAIN (current_hardreg_regno);
> +                    def;
> +                    def = DF_REF_NEXT_REG (def))
> +                 bitmap_clear_bit (transp[DF_REF_BB (def)->index], indx);
> +             }
> +         }
>  
>         /* The occurrences recorded in antic_occr are exactly those that
>            we want to set to nonzero in ANTLOC.  */
> @@ -728,6 +757,37 @@ compute_local_properties (sbitmap *transp, sbitmap 
> *comp, sbitmap *antloc,
>       }
>      }
>  }
> +
> +/* A hardreg set is not transparent in a block if there are any uses of that
> +   hardreg.  This filters the results of compute_local_properties, after the
> +   result of that function has been used to define the kills bitmap.
> +
> +   TRANSP is the destination sbitmap to be updated.
> +
> +   TABLE controls which hash table to look at.  */
> +
> +static void
> +prune_hardreg_uses (sbitmap *transp, struct gcse_hash_table_d *table)
> +{
> +  unsigned int i;
> +  gcc_assert (doing_hardreg_pre_p);
> +
> +  for (i = 0; i < table->size; i++)
> +    {
> +      struct gcse_expr *expr;
> +
> +      for (expr = table->table[i]; expr != NULL; expr = expr->next_same_hash)
> +     {
> +       int indx = expr->bitmap_index;
> +       df_ref def;
> +
> +       for (def = DF_REG_USE_CHAIN (current_hardreg_regno);
> +            def;
> +            def = DF_REF_NEXT_REG (def))
> +         bitmap_clear_bit (transp[DF_REF_BB (def)->index], indx);
> +     }
> +    }
> +}
>  
>  /* Hash table support.  */
>  
> @@ -771,17 +831,24 @@ want_to_gcse_p (rtx x, machine_mode mode, HOST_WIDE_INT 
> *max_distance_ptr)
>       pressure, i.e., a pseudo register with REG_EQUAL to constant
>       is set only once.  Failing to do so will result in IRA/reload
>       spilling such constants under high register pressure instead of
> -     rematerializing them.  */
> +     rematerializing them.
> +
> +     For hardreg PRE, register pressure is not a concern, and we also want to
> +     apply GCSE to simple moves.  */
>  
>    switch (GET_CODE (x))
>      {
>      case REG:
>      case SUBREG:
> +      return doing_hardreg_pre_p;
> +
>      case CALL:
>        return false;
>  
>      CASE_CONST_ANY:
> -      if (!doing_code_hoisting_p)
> +      if (doing_hardreg_pre_p)
> +     return true;
> +      else if (!doing_code_hoisting_p)
>       /* Do not PRE constants.  */
>       return false;
>  
> @@ -911,7 +978,7 @@ oprs_unchanged_p (const_rtx x, const rtx_insn *insn, bool 
> avail_p)
>        }
>  
>      case MEM:
> -      if (! flag_gcse_lm
> +      if (! do_load_motion ()
>         || load_killed_in_block_p (current_bb, DF_INSN_LUID (insn),
>                                    x, avail_p))
>       return false;
> @@ -1258,8 +1325,10 @@ hash_scan_set (rtx set, rtx_insn *insn, struct 
> gcse_hash_table_d *table)
>         && want_to_gcse_p (XEXP (note, 0), GET_MODE (dest), NULL))
>       src = XEXP (note, 0), set = gen_rtx_SET (dest, src);
>  
> -      /* Only record sets of pseudo-regs in the hash table.  */
> -      if (regno >= FIRST_PSEUDO_REGISTER
> +      /* Only record sets of pseudo-regs in the hash table, unless we're
> +      currently doing hardreg PRE.  */
> +      if ((doing_hardreg_pre_p ? regno == current_hardreg_regno
> +                                  : regno >= FIRST_PSEUDO_REGISTER)
>         /* Don't GCSE something if we can't do a reg/reg copy.  */
>         && can_copy_p (GET_MODE (dest))
>         /* GCSE commonly inserts instruction after the insn.  We can't
> @@ -1286,12 +1355,33 @@ hash_scan_set (rtx set, rtx_insn *insn, struct 
> gcse_hash_table_d *table)
>            able to handle code motion of insns with multiple sets.  */
>         bool antic_p = (oprs_anticipatable_p (src, insn)
>                         && !multiple_sets (insn));
> +       if (doing_hardreg_pre_p)
> +         {
> +           /* A hardreg assignment is anticipatable only if the hardreg is
> +              neither set nor used prior to this assignment.  */
> +           auto info = reg_avail_info[current_hardreg_regno];
> +           if ((info.last_bb == current_bb
> +                && info.first_set < DF_INSN_LUID (insn))
> +               || bitmap_bit_p (DF_LR_IN (current_bb),
> +                                current_hardreg_regno))
> +             antic_p = false;
> +         }
> +
>         /* An expression is not available if its operands are
>            subsequently modified, including this insn.  It's also not
>            available if this is a branch, because we can't insert
>            a set after the branch.  */
>         bool avail_p = (oprs_available_p (src, insn)
>                         && ! JUMP_P (insn));
> +       if (doing_hardreg_pre_p)
> +         {
> +           /* A hardreg assignment is only available if the hardreg is
> +              not set later in the BB.  Uses of the hardreg are allowed. */
> +           auto info = reg_avail_info[current_hardreg_regno];
> +           if (info.last_bb == current_bb
> +               && info.last_set > DF_INSN_LUID (insn))
> +             avail_p = false;
> +         }
>  
>         insert_expr_in_table (src, GET_MODE (dest), insn, antic_p, avail_p,
>                               max_distance, table);
> @@ -1300,7 +1390,10 @@ hash_scan_set (rtx set, rtx_insn *insn, struct 
> gcse_hash_table_d *table)
>    /* In case of store we want to consider the memory value as available in
>       the REG stored in that memory. This makes it possible to remove
>       redundant loads from due to stores to the same location.  */
> -  else if (flag_gcse_las && REG_P (src) && MEM_P (dest))
> +  else if (flag_gcse_las
> +        && !doing_hardreg_pre_p
> +        && REG_P (src)
> +        && MEM_P (dest))
>      {
>        unsigned int regno = REGNO (src);
>        HOST_WIDE_INT max_distance = 0;
> @@ -1460,7 +1553,7 @@ record_last_reg_set_info (rtx_insn *insn, int regno)
>  static void
>  record_last_mem_set_info (rtx_insn *insn)
>  {
> -  if (! flag_gcse_lm)
> +  if (! do_load_motion ())
>      return;
>  
>    record_last_mem_set_info_common (insn, modify_mem_list,
> @@ -1884,6 +1977,9 @@ compute_pre_data (void)
>        bitmap_not (ae_kill[bb->index], ae_kill[bb->index]);
>      }
>  
> +  if (doing_hardreg_pre_p)
> +    prune_hardreg_uses (transp, &expr_hash_table);
> +
>    edge_list = pre_edge_lcm (expr_hash_table.n_elems, transp, comp, antloc,
>                           ae_kill, &pre_insert_map, &pre_delete_map);
>    sbitmap_vector_free (antloc);
> @@ -1938,7 +2034,10 @@ pre_expr_reaches_here_p_work (basic_block occr_bb, 
> struct gcse_expr *expr,
>  
>         visited[pred_bb->index] = 1;
>       }
> -      /* Ignore this predecessor if it kills the expression.  */
> +      /* Ignore this predecessor if it kills the expression.
> +
> +      If this were used for hardreg pre, then it would need to use the kills
> +      bitmap.  */
>        else if (! bitmap_bit_p (transp[pred_bb->index], expr->bitmap_index))
>       visited[pred_bb->index] = 1;
>  
> @@ -2109,6 +2208,59 @@ insert_insn_end_basic_block (struct gcse_expr *expr, 
> basic_block bb)
>      }
>  }
>  
> +/* Insert PAT at the start of basic block BB, and return the insn that
> +   was added.  */
> +
> +rtx_insn *
> +insert_insn_start_basic_block (rtx_insn *pat, basic_block bb)
> +{
> +  rtx_insn *insn = BB_HEAD (bb);
> +  rtx_insn *next_insn;
> +
> +  gcc_assert (pat && INSN_P (pat));
> +
> +  /* Insert after the last initial CODE_LABEL or NOTE_INSN_BASIC_BLOCK, 
> before
> +     any other instructions.  */
> +  while ((next_insn = NEXT_INSN (insn))
> +      && (LABEL_P (next_insn) || NOTE_INSN_BASIC_BLOCK_P (insn)))
> +    insn = next_insn;
> +
> +  rtx_insn *new_insn = emit_insn_after_noloc (pat, insn, bb);
> +
> +  while (pat != NULL_RTX)
> +    {
> +      if (INSN_P (pat))
> +     add_label_notes (PATTERN (pat), new_insn);
> +      pat = NEXT_INSN (pat);
> +    }
> +
> +  return new_insn;
> +}
> +
> +/* Add EXPR to the start of basic block BB.
> +
> +   This is used by hardreg PRE.  */
> +
> +static void
> +insert_insn_start_basic_block (struct gcse_expr *expr, basic_block bb)
> +{
> +  rtx reg = expr->reaching_reg;
> +  int regno = REGNO (reg);
> +
> +  rtx_insn *insn = process_insert_insn (expr);
> +  rtx_insn *new_insn = insert_insn_start_basic_block (insn, bb);
> +
> +  gcse_create_count++;
> +
> +  if (dump_file)
> +    {
> +      fprintf (dump_file, "hardreg PRE: start of bb %d, insn %d, ",
> +            bb->index, INSN_UID (new_insn));
> +      fprintf (dump_file, "copying expression %d to reg %d\n",
> +            expr->bitmap_index, regno);
> +    }
> +}
> +
>  /* Insert partially redundant expressions on edges in the CFG to make
>     the expressions fully redundant.  */
>  
> @@ -2130,7 +2282,8 @@ pre_edge_insert (struct edge_list *edge_list, struct 
> gcse_expr **index_map)
>    for (e = 0; e < num_edges; e++)
>      {
>        int indx;
> -      basic_block bb = INDEX_EDGE_PRED_BB (edge_list, e);
> +      basic_block pred_bb = INDEX_EDGE_PRED_BB (edge_list, e);
> +      basic_block succ_bb = INDEX_EDGE_SUCC_BB (edge_list, e);
>  
>        for (i = indx = 0; i < set_size; i++, indx += SBITMAP_ELT_BITS)
>       {
> @@ -2159,13 +2312,24 @@ pre_edge_insert (struct edge_list *edge_list, struct 
> gcse_expr **index_map)
>  
>                       /* We can't insert anything on an abnormal and
>                          critical edge, so we insert the insn at the end of
> -                        the previous block. There are several alternatives
> +                        the previous block.  There are several alternatives
>                          detailed in Morgans book P277 (sec 10.5) for
>                          handling this situation.  This one is easiest for
> -                        now.  */
> +                        now.
>  
> +                        For hardreg PRE, this would add an unwanted clobber
> +                        of the hardreg, so we instead insert in the
> +                        successor block.  This may be partially redundant,
> +                        but it is at least correct.  */
>                       if (eg->flags & EDGE_ABNORMAL)
> -                       insert_insn_end_basic_block (index_map[j], bb);
> +                       {
> +                         if (doing_hardreg_pre_p)
> +                           insert_insn_start_basic_block (index_map[j],
> +                                                          succ_bb);
> +                         else
> +                           insert_insn_end_basic_block (index_map[j],
> +                                                        pred_bb);
> +                       }
>                       else
>                         {
>                           insn = process_insert_insn (index_map[j]);
> @@ -2175,8 +2339,8 @@ pre_edge_insert (struct edge_list *edge_list, struct 
> gcse_expr **index_map)
>                       if (dump_file)
>                         {
>                           fprintf (dump_file, "PRE: edge (%d,%d), ",
> -                                  bb->index,
> -                                  INDEX_EDGE_SUCC_BB (edge_list, e)->index);
> +                                  pred_bb->index,
> +                                  succ_bb->index);
>                           fprintf (dump_file, "copy expression %d\n",
>                                    expr->bitmap_index);
>                         }
> @@ -2491,13 +2655,25 @@ pre_delete (void)
>               && (set = single_set (insn)) != 0
>                  && dbg_cnt (pre_insn))
>             {
> -             /* Create a pseudo-reg to store the result of reaching
> -                expressions into.  Get the mode for the new pseudo from
> -                the mode of the original destination pseudo.  */
> +             rtx dest = SET_DEST (set);
>               if (expr->reaching_reg == NULL)
> -               expr->reaching_reg = gen_reg_rtx_and_attrs (SET_DEST (set));
> +               {
> +                 if (doing_hardreg_pre_p)
> +                   /* Use the hardreg as the reaching register.  The
> +                      deleted sets will be replaced with noop moves.
> +
> +                      This may change the value of the hardreg in some debug
> +                      instructions, so we will need to reset any debug uses
> +                      of the hardreg.  */
> +                   expr->reaching_reg = dest;
> +                 else
> +                   /* Create a pseudo-reg to store the result of reaching
> +                      expressions into.  Get the mode for the new pseudo from
> +                      the mode of the original destination pseudo.  */
> +                   expr->reaching_reg = gen_reg_rtx_and_attrs (SET_DEST 
> (set));
> +               }
>  
> -             gcse_emit_move_after (SET_DEST (set), expr->reaching_reg, insn);
> +             gcse_emit_move_after (dest, expr->reaching_reg, insn);
>               delete_insn (insn);
>               occr->deleted_p = 1;
>               changed = true;
> @@ -2518,6 +2694,25 @@ pre_delete (void)
>    return changed;
>  }
>  
> +/* Since hardreg PRE reuses the hardreg as the reaching register, we need to
> +   eliminate any existing uses in debug insns.  This is overly conservative,
> +   but there's currently no benefit to preserving the debug insns, so there's
> +   no point doing the work to retain them.  */
> +
> +static void
> +reset_hardreg_debug_uses ()
> +{
> +  df_ref def;
> +  for (def = DF_REG_USE_CHAIN (current_hardreg_regno);
> +       def;
> +       def = DF_REF_NEXT_REG (def))
> +    {
> +      rtx_insn *insn = DF_REF_INSN (def);
> +      if (DEBUG_INSN_P (insn))
> +     delete_insn (insn);
> +    }
> +}
> +
>  /* Perform GCSE optimizations using PRE.
>     This is called by one_pre_gcse_pass after all the dataflow analysis
>     has been done.
> @@ -2561,12 +2756,16 @@ pre_gcse (struct edge_list *edge_list)
>  
>    changed = pre_delete ();
>    did_insert = pre_edge_insert (edge_list, index_map);
> -
>    /* In other places with reaching expressions, copy the expression to the
> -     specially allocated pseudo-reg that reaches the redundant expr.  */
> -  pre_insert_copies ();
> +     specially allocated pseudo-reg that reaches the redundant expr.  This
> +     isn't needed for hardreg PRE.  */
> +  if (!doing_hardreg_pre_p)
> +    pre_insert_copies ();
> +
>    if (did_insert)
>      {
> +      if (doing_hardreg_pre_p)
> +     reset_hardreg_debug_uses ();
>        commit_edge_insertions ();
>        changed = true;
>      }
> @@ -2601,11 +2800,11 @@ one_pre_gcse_pass (void)
>  
>    alloc_hash_table (&expr_hash_table);
>    add_noreturn_fake_exit_edges ();
> -  if (flag_gcse_lm)
> +  if (do_load_motion ())
>      compute_ld_motion_mems ();
>  
>    compute_hash_table (&expr_hash_table);
> -  if (flag_gcse_lm)
> +  if (do_load_motion ())
>      trim_ld_motion_mems ();
>    if (dump_file)
>      dump_hash_table (dump_file, "Expression", &expr_hash_table);
> @@ -2621,7 +2820,7 @@ one_pre_gcse_pass (void)
>        free_pre_mem ();
>      }
>  
> -  if (flag_gcse_lm)
> +  if (do_load_motion ())
>      free_ld_motion_mems ();
>    remove_fake_exit_edges ();
>    free_hash_table (&expr_hash_table);
> @@ -4028,6 +4227,32 @@ execute_rtl_pre (void)
>    return 0;
>  }
>  
> +static unsigned int
> +execute_hardreg_pre (void)
> +{
> +#ifdef HARDREG_PRE_REGNOS
> +  doing_hardreg_pre_p = true;
> +  unsigned int regnos[] = HARDREG_PRE_REGNOS;
> +  /* It's possible to avoid this loop, but it isn't worth doing so until
> +     hardreg PRE is used for multiple hardregs.  */
> +  for (int i = 0; regnos[i] != 0; i++)
> +    {
> +      int changed;
> +      current_hardreg_regno = regnos[i];
> +      if (dump_file)
> +     fprintf(dump_file, "Entering hardreg PRE for regno %d\n",
> +             current_hardreg_regno);
> +      delete_unreachable_blocks ();
> +      df_analyze ();
> +      changed = one_pre_gcse_pass ();
> +      if (changed)
> +     cleanup_cfg (0);
> +    }
> +  doing_hardreg_pre_p = false;
> +#endif
> +  return 0;
> +}
> +
>  static unsigned int
>  execute_rtl_hoist (void)
>  {
> @@ -4096,6 +4321,56 @@ make_pass_rtl_pre (gcc::context *ctxt)
>  
>  namespace {
>  
> +const pass_data pass_data_hardreg_pre =
> +{
> +  RTL_PASS, /* type */
> +  "hardreg_pre", /* name */
> +  OPTGROUP_NONE, /* optinfo_flags */
> +  TV_PRE, /* tv_id */
> +  PROP_cfglayout, /* properties_required */
> +  0, /* properties_provided */
> +  0, /* properties_destroyed */
> +  0, /* todo_flags_start */
> +  TODO_df_finish, /* todo_flags_finish */
> +};
> +
> +class pass_hardreg_pre : public rtl_opt_pass
> +{
> +public:
> +  pass_hardreg_pre (gcc::context *ctxt)
> +    : rtl_opt_pass (pass_data_hardreg_pre, ctxt)
> +  {}
> +
> +  /* opt_pass methods: */
> +  bool gate (function *) final override;
> +  unsigned int execute (function *)  final override
> +  {
> +    return execute_hardreg_pre ();
> +  }
> +
> +}; // class pass_hardreg_pre
> +
> +bool
> +pass_hardreg_pre::gate (function *fun)
> +{
> +#ifdef HARDREG_PRE_REGNOS
> +  return optimize > 0
> +    && !fun->calls_setjmp;
> +#else
> +  return false;
> +#endif
> +}
> +
> +} // anon namespace
> +
> +rtl_opt_pass *
> +make_pass_hardreg_pre (gcc::context *ctxt)
> +{
> +  return new pass_hardreg_pre (ctxt);
> +}
> +
> +namespace {
> +
>  const pass_data pass_data_rtl_hoist =
>  {
>    RTL_PASS, /* type */
> diff --git a/gcc/passes.def b/gcc/passes.def
> index 
> ae85ae72dff734a8698f606254970437e2bf93a5..95d72b22761eec3668a4d5bbcaa8e41fcc4d830a
>  100644
> --- a/gcc/passes.def
> +++ b/gcc/passes.def
> @@ -463,6 +463,7 @@ along with GCC; see the file COPYING3.  If not see
>        NEXT_PASS (pass_rtl_cprop);
>        NEXT_PASS (pass_rtl_pre);
>        NEXT_PASS (pass_rtl_hoist);
> +      NEXT_PASS (pass_hardreg_pre);
>        NEXT_PASS (pass_rtl_cprop);
>        NEXT_PASS (pass_rtl_store_motion);
>        NEXT_PASS (pass_cse_after_global_opts);
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-1.c 
> b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-1.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..f7a47f81c5ea4639827d4c902f316932120f44af
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-1.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -march=armv8-a+fp8fma" } */
> +
> +#include <arm_neon.h>
> +
> +void foo(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c, int br)
> +{
> +  float16x8_t a;
> +  a = vld1q_f16(ap);
> +  a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
> +  vst1q_f16(ap, a);
> +  if (br)
> +    {
> +      a = vld1q_f16(ap + 8);
> +      a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
> +      vst1q_f16(ap + 8, a);
> +      a = vld1q_f16(ap + 16);
> +      a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
> +      vst1q_f16(ap + 16, a);
> +    }
> +  else
> +    {
> +      a = vld1q_f16(ap + 24);
> +      a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
> +      vst1q_f16(ap + 24, a);
> +    }
> +  a = vld1q_f16(ap + 32);
> +  a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
> +  vst1q_f16(ap + 32, a);
> +}
> +
> +void bar(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c, fpm_t mode, 
> int br)
> +{
> +  float16x8_t a;
> +  a = vld1q_f16(ap);
> +  a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
> +  vst1q_f16(ap, a);
> +  if (br)
> +    {
> +      a = vld1q_f16(ap + 8);
> +      a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
> +      vst1q_f16(ap + 8, a);
> +      a = vld1q_f16(ap + 16);
> +      a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
> +      vst1q_f16(ap + 16, a);
> +    }
> +  else
> +    {
> +      a = vld1q_f16(ap + 24);
> +      a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
> +      vst1q_f16(ap + 24, a);
> +    }
> +  a = vld1q_f16(ap + 32);
> +  a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
> +  vst1q_f16(ap + 32, a);
> +}
> +
> +/* { dg-final { scan-assembler-times "msr\tfpmr" 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c 
> b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..c5b255b0a9a8ea9161217b22f19adaf58c899dbb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -march=armv8-a+fp8fma" } */
> +
> +#include <arm_neon.h>
> +
> +void foo(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c)
> +{
> +  for (int i = 0; i < 103; i++)
> +    {
> +      float16x8_t a = vld1q_f16(ap + 8*i);
> +      a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
> +      vst1q_f16(ap + 8*i, a);
> +    }
> +}
> +/* { dg-final { scan-assembler "msr\tfpmr.*\n\.L2" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-3.c 
> b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-3.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..73a79ad4b44e2b950cf7ea3e914254b5fdc05b69
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-3.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -march=armv8-a+fp8fma" } */
> +
> +#include <arm_neon.h>
> +
> +void foo(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c, fpm_t mode)
> +{
> +  float16x8_t x = vld1q_f16(ap + 1);
> +  x = vmlalbq_f16_mf8_fpm(x, b, c, mode);
> +  vst1q_f16(ap + 1, x);
> +  for (int i = 0; i < 103; i++)
> +    {
> +      float16x8_t a = vld1q_f16(ap + 8*i);
> +      a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
> +      vst1q_f16(ap + 8*i, a);
> +    }
> +}
> +/* { dg-final { scan-assembler-times "msr\tfpmr" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-4.c 
> b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-4.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..18c1def752f557e98868250cd73442fb9f556e18
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-4.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -march=armv8-a+fp8fma" } */
> +
> +#include <arm_neon.h>
> +
> +void baz(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c)
> +{
> +  float16x8_t x = vld1q_f16(ap + 1);
> +  x = vmlalbq_f16_mf8_fpm(x, b, c, 13);
> +  vst1q_f16(ap + 1, x);
> +  for (int i = 0; i < 10; i++)
> +    {
> +      float16x8_t a = vld1q_f16(ap + 16*i);
> +      a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
> +      vst1q_f16(ap + 16*i, a);
> +      a = vld1q_f16(ap + 16*i + 8);
> +      a = vmlalbq_f16_mf8_fpm(a, b, c, 865);
> +      vst1q_f16(ap + 16*i+8, a);
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times "msr\tfpmr" 3 } } */
> +/* { dg-final { scan-assembler "msr\tfpmr.*\n\tb\t" } } */
> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
> index 
> ce463629194a7298b70da6463706caea0b28dabd..797d719b2c45ffa2d71c7e94687bf1d5ac19c69f
>  100644
> --- a/gcc/tree-pass.h
> +++ b/gcc/tree-pass.h
> @@ -573,6 +573,7 @@ extern rtl_opt_pass *make_pass_rtl_dse3 (gcc::context 
> *ctxt);
>  extern rtl_opt_pass *make_pass_rtl_cprop (gcc::context *ctxt);
>  extern rtl_opt_pass *make_pass_rtl_pre (gcc::context *ctxt);
>  extern rtl_opt_pass *make_pass_rtl_hoist (gcc::context *ctxt);
> +extern rtl_opt_pass *make_pass_hardreg_pre (gcc::context *ctxt);
>  extern rtl_opt_pass *make_pass_rtl_avoid_store_forwarding (gcc::context 
> *ctxt);
>  extern rtl_opt_pass *make_pass_rtl_store_motion (gcc::context *ctxt);
>  extern rtl_opt_pass *make_pass_cse_after_global_opts (gcc::context *ctxt);

Reply via email to