On 11/11/2022 17:40, Stam Markianos-Wright via Gcc-patches wrote:
Hi all,

This is the 2/2 patch that contains the functional changes needed
for MVE Tail Predicated Low Overhead Loops.  See my previous email
for a general introduction to MVE LOLs.

This support is added through the already existing loop-doloop
mechanisms that are used for non-MVE dls/le looping.

Changes are:

1) Relax the loop-doloop mechanism in the mid-end to allow for
   decrement numbers other than -1 and for `count` to be an
   rtx containing the number of elements to be processed, rather
   than an expression for calculating the number of iterations
   (see the sketch after this list).
2) Add an `allow_elementwise_doloop` target hook. This allows the
   target backend to manipulate the iteration count as it needs:
   in our case to change it from a pre-calculation of the number
   of iterations to the number of elements to be processed.
3) The doloop_end target-insn now has an additional parameter:
   the `count` (note: this is before it gets modified to just be
   the number of elements), so that the decrement value is
   extracted from that parameter.

And many changes in the backend to implement the above optimisation
(an example of the kind of loop targeted follows this list):

4)  Appropriate changes to the define_expand of doloop_end and new
    patterns for dlstp and letp.
5) `arm_attempt_dlstp_transform`: (called from the define_expand of
    doloop_end) this function checks for the loop's suitability for
    dlstp/letp transformation and then implements it, if possible.
6) `arm_mve_get_loop_unique_vctp`: A function that scans the loop
    contents and returns the VPR-generating vctp operation within
    the loop, provided there is exactly one vctp in the loop.
7) A couple of utility functions: `arm_mve_get_vctp_lanes` to map
   from vctp unspecs to number of lanes, and `arm_get_required_vpr_reg`
   to check an insn to see if it requires the VPR or not.
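
For illustration (this example is mine and not part of the patch,
though I would expect the new dlstp-int32x4.c test to be of a similar
shape), the kind of tail-predicated intrinsics loop the transform
applies to:

  #include <arm_mve.h>

  void
  add_int32 (int32_t *a, int32_t *b, int32_t *c, int n)
  {
    while (n > 0)
      {
        mve_pred16_t p = vctp32q (n);        /* The unique VPR-generating vctp.  */
        int32x4_t va = vldrwq_z_s32 (a, p);  /* VPT-predicated loads.  */
        int32x4_t vb = vldrwq_z_s32 (b, p);
        /* Predicated add and store.  */
        vstrwq_p_s32 (c, vaddq_x_s32 (va, vb, p), p);
        a += 4;
        b += 4;
        c += 4;
        n -= 4;  /* Counter decremented by the number of lanes.  */
      }
  }

With dlstp/letp the vctp disappears: the loop is controlled by the
element count and the predicated insns are replaced by their
unpredicated equivalents, with the hardware masking the tail.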

No regressions on arm-none-eabi with various targets and on
aarch64-none-elf. Thoughts on getting this into trunk?

Thank you,
Stam Markianos-Wright

gcc/ChangeLog:

        * config/aarch64/aarch64.md: Add extra doloop_end arg.
        * config/arm/arm-protos.h (arm_attempt_dlstp_transform): New.
        * config/arm/arm.cc (TARGET_ALLOW_ELEMENTWISE_DOLOOP): New.
        (arm_mve_get_vctp_lanes): New.
        (arm_get_required_vpr_reg): New.
        (arm_mve_get_loop_unique_vctp): New.
        (arm_attempt_dlstp_transform): New.
        (arm_allow_elementwise_doloop): New.
        * config/arm/iterators.md:
        * config/arm/mve.md (*predicated_doloop_end_internal): New.
        (dlstp<mode1>_insn): New.
        * config/arm/thumb2.md (doloop_end): Update for MVE LOLs.
        * config/arm/unspecs.md: New unspecs.
        * config/ia64/ia64.md: Add extra doloop_end arg.
        * config/pru/pru.md: Add extra doloop_end arg.
        * config/rs6000/rs6000.md: Add extra doloop_end arg.
        * config/s390/s390.md: Add extra doloop_end arg.
        * config/v850/v850.md: Add extra doloop_end arg.
        * doc/tm.texi: Document new hook.
        * doc/tm.texi.in: Likewise.
        * loop-doloop.cc (doloop_condition_get): Relax conditions.
        (doloop_optimize): Add support for elementwise LoLs.
        * target-insns.def (doloop_end): Add extra arg.
        * target.def (allow_elementwise_doloop): New hook.
        * targhooks.cc (default_allow_elementwise_doloop): New.
        * targhooks.h (default_allow_elementwise_doloop): New.

gcc/testsuite/ChangeLog:

        * gcc.target/arm/lob.h: Update framework.
        * gcc.target/arm/lob1.c: Likewise.
        * gcc.target/arm/lob6.c: Likewise.
        * gcc.target/arm/dlstp-int16x8.c: New test.
        * gcc.target/arm/dlstp-int32x4.c: New test.
        * gcc.target/arm/dlstp-int64x2.c: New test.
        * gcc.target/arm/dlstp-int8x16.c: New test.


### Inline copy of patch ###

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index f2e3d905dbbeb2949f2947f5cfd68208c94c9272..7a6d24a80060b4a704a481ccd1a32d96e7b0f369 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -7366,7 +7366,8 @@
 ;; knows what to generate.
 (define_expand "doloop_end"
   [(use (match_operand 0 "" ""))      ; loop pseudo
-   (use (match_operand 1 "" ""))]     ; label
+   (use (match_operand 1 "" ""))      ; label
+   (use (match_operand 2 "" ""))]     ; decrement constant
   "optimize > 0 && flag_modulo_sched"
 {
   rtx s0;
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 550272facd12e60a49bf8a3b20f811cc13765b3a..7684620f0f4d161dd9e9ad2d70308021ec3d3d34 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -63,7 +63,7 @@ extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *);
 extern bool arm_q_bit_access (void);
 extern bool arm_ge_bits_access (void);
 extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern rtx arm_attempt_dlstp_transform (rtx, rtx);
 #ifdef RTX_CODE
 enum reg_class
 arm_mode_base_reg_class (machine_mode);
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index ee8f1babf8a1319e77e0db0fa55851c038048804..99e144d52c26597c64b982b3d4ae9a62a114cf18 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -470,6 +470,9 @@ static const struct attribute_spec arm_attribute_table[] =
 #undef TARGET_SCHED_REORDER
 #define TARGET_SCHED_REORDER arm_sched_reorder

+#undef TARGET_ALLOW_ELEMENTWISE_DOLOOP
+#define TARGET_ALLOW_ELEMENTWISE_DOLOOP arm_allow_elementwise_doloop
+
Just a nit, but maybe keep the same naming scheme as the existing hook:
TARGET_CAN_USE_ELEMENTWISE_DOLOOP_P?
+
+static rtx
+arm_get_required_vpr_reg (rtx_insn *insn)
+{
+  bool requires_vpr;
+
+  extract_constrain_insn (insn);
+  int n_operands = recog_data.n_operands;
+  if (recog_data.n_alternatives == 0)
+    return NULL_RTX;
+
+  /* Fill in recog_op_alt with information about the constraints of
+     this insn.  */
+  preprocess_constraints (insn);
+
+  for (int use = 0; use < n_operands; use++)
+    {
+      requires_vpr = true;
+      /* Iterate through alternatives of operand "use" in recog_op_alt and
+       * identify if the operand is required to be the VPR.  */
Remove the * at the start of the new line.
+      for (int alt1 = 0; alt1 < recog_data.n_alternatives; alt1++)
+    {
+      const operand_alternative *op_alt1
+          = &recog_op_alt[alt1 * n_operands];
+      /* Fetch the reg_class for each entry and check it against the
+       * VPR_REG reg_class.  */
+      if (alternative_class (op_alt1, use) != VPR_REG)
+        requires_vpr = false;
+    }
+      /* If all alternatives of the insn require the VPR reg for this operand,
+     it means that either this is VPR-generating instruction, like a vctp,
+     vcmp, etc., or it is a VPT-predicated insruction.  Return the subrtx
+     of the VPR reg operand.  */
+      if (requires_vpr)
+    return recog_data.operand[use];
+    }
+  return NULL_RTX;
+}
+
+/* Scan the basic block of a loop body for a vctp instruction. If there is
+   exactly one unique vctp instruction, return its rtx_insn *. */
+
+static rtx_insn *
+arm_mve_get_loop_unique_vctp (basic_block bb)
+{
+  rtx_insn *insn = BB_HEAD (bb);
+  rtx_insn *vctp_op = NULL;
+
+  /* Now scan through all the instruction patterns and
+     pick out any MVE instructions.  */
+  FOR_BB_INSNS (bb, insn)
+    {
+      if (INSN_P (insn))
+    {
+      /* First check if this is a vctp instruction.  There needs to be
+         exactly one vctp instruction within the loop.  */
+      if (arm_mve_get_vctp_lanes (PATTERN (insn)) != 0)
+        {
+          /* If we already found one vctp instruction, then the
+         loop is not consistent internally.  */
+          if (vctp_op)
+        return NULL;
+
+          vctp_op = insn;
+        }
+    }
+    }
+  return vctp_op;
+}
+
+rtx
+arm_attempt_dlstp_transform (rtx label, rtx count)
+{
+  int decrementnum;
+  basic_block body = BLOCK_FOR_INSN (label)->prev_bb;
+  rtx initial_compare;
+  /* Doloop can only be done "elementwise" with predicated dlstp/letp
+     when the iteration counter gets deprecated by the number of MVE
s/deprecated/decreased/ ? I think
+     lanes.  This can be exxtracted from the `count`, which is the expression
s/exxtracted/extracted/
+     used to calculate the number of iterations that the loop would execute
+     for a standard dls/le loop.  Since we only support cases where this is a
+     power of 2, we can assume that this expression arrives here as:
+       (lshiftrt: (A) (const_int y))
+     Then we can extract the decrementnum from y.  */
+  if (GET_CODE (count) == LSHIFTRT && ARITHMETIC_P (XEXP (count, 0))
+      && (decrementnum = (1 << (INTVAL (XEXP (count, 1)))))
Why are you calculating decrementnum inside the condition?
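It would read better hoisted out with an early exit, something like
(untested, and note the extra CONST_INT_P check before the INTVAL):

  if (GET_CODE (count) != LSHIFTRT
      || !ARITHMETIC_P (XEXP (count, 0))
      || !CONST_INT_P (XEXP (count, 1)))
    return GEN_INT (1);
  int decrementnum = 1 << INTVAL (XEXP (count, 1));
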
+      /* There is one final condition that needs to be met for the loop to be
+     transformable: dlstp/letp will continue looping until there are
+     elements still to process.  This can only work if the looping ends
+     when the element counter reaches zero and not some other value
+     (e.g. n > 0 works, not n > 1), or we can incorrectly end up running
+     one additional iteration.  To by-pass any hoisting that the compiler
+     may have done with the `A` in `count` above, we can instead look up
+     to the bb before the loop preheader: this should end with a cmp+jump
+     pair, where the cmp needs to be with (const_int 0).  */

I'm wondering whether it would be possible to subtract a non-zero const from count. But that might be dangerous...

Do you have an example/test case where you saw this happen?

+      && loop_preheader_edge (body->loop_father)->src->prev_bb
+      && BB_END (loop_preheader_edge (body->loop_father)->src->prev_bb)
+      && PREV_INSN (BB_END (loop_preheader_edge (body->loop_father)
+                ->src->prev_bb))
+      && INSN_P (PREV_INSN (BB_END (loop_preheader_edge (body->loop_father)
+                    ->src->prev_bb)))
+      && (initial_compare
+      = PATTERN (PREV_INSN (BB_END (loop_preheader_edge (body->loop_father)
+                        ->src->prev_bb))))
+      && GET_CODE (initial_compare) == SET
+      && cc_register (XEXP (initial_compare, 0), VOIDmode)
+      && GET_CODE (XEXP (initial_compare, 1)) == COMPARE
+      && CONST_INT_P (XEXP (XEXP (initial_compare, 1), 1))
+      && INTVAL (XEXP (XEXP (initial_compare, 1), 1)) == 0)
+    {
+      /* Find the vctp predicate generation inside the loop body BB.  */
+      rtx_insn *vctp_insn = arm_mve_get_loop_unique_vctp (body);
+
+      /* If we have successfully found one exactly vctp predicate-generating
+     instruction within the loop and the number by which we deprecate the
+     loop counter in each iteration matches the number of lanes of the
+     vctp instruction, we can attempt to turn this into a dlstp/letp loop.
+     */
+      if (vctp_insn
+      && decrementnum == arm_mve_get_vctp_lanes (PATTERN (vctp_insn)))
+    {
I would exit early here so you don't need to indent the rest of the code; by that I mean something like:

if (!vctp_insn
     || decrementnum != ...)
  return GEN_INT (1);

.. rest of the code.

+      rtx_insn *insn = 0;
+      rtx_insn *cur_insn = 0;
+      rtx_insn *seq;
+      rtx vctp_vpr_generated = NULL_RTX;
+      rtx insn_vpr_reg_operand = NULL_RTX;
+      bool transform_worked = true;
Won't need transform_worked.
+      int new_icode;
+
+      /* Scan through the insns in the loop bb and emit the transformed bb
+         insns to a sequence.  */
+      start_sequence ();
+      FOR_BB_INSNS (body, insn)
+        {
+          if (INSN_P (insn))
This also captures DEBUG_INSNs, which means passing -g disables this feature.
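Something like (untested):

  if (!NONDEBUG_INSN_P (insn))
    continue;

would avoid -g changing code-gen here.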
+        {
+          /* When we find the vctp instruction: This may be followed by
+             a sign-extend insn to SImode.  If it is, then save the
+             sign-extended REG into vctp_vpr_generated.  If there is no
+             sign-extend, then store the raw output of the vctp.
+             For any VPT-predicated instructions we need to ensure that
+             the VPR they use is the same as the one given here and
+             they often consume the output of a subreg of the SImode
+             sign-extended VPR-reg.  As a result, comparing against the
+             output of the sign-extend is more likely to succeed.
+             This code also guarantees to us that the vctp comes before
+             any instructions that use the VPR within the loop, for the
+             dlstp/letp transform to succeed.  */
+          if (insn == vctp_insn)
+            {
+              if (GET_CODE (XEXP (PATTERN (NEXT_INSN (insn)), 1))
+                  == SIGN_EXTEND
+              && GET_CODE (XEXP (
+                 PATTERN (NEXT_INSN (NEXT_INSN (insn))), 1))
+                 == SUBREG)
+            vctp_vpr_generated
+                = XEXP (PATTERN (NEXT_INSN (NEXT_INSN (insn))), 0);
+              else
+            vctp_vpr_generated = XEXP (PATTERN (insn), 0);
+              /* Also emit a USE of the source register of the vctp.
+             This holds the number of elements being processed
+             by the loop.  This later gets stored into `count`.
+             */
+              emit_use (XVECEXP (XEXP (PATTERN (insn), 1), 0, 0));
What if we get here but don't end up creating a predicated do-loop? Will this USE break something?
+              continue;
+            }
+          /* If the insn pattern requires the use of the VPR, then it
Missing an is.
+             a VPT-predicated instruction, so it will need to be
+             transformed into the non-predicated version of the
+             instruction.  */
But this comment seems misplaced here.
+          else if ((insn_vpr_reg_operand
+                = arm_get_required_vpr_reg (insn))
+               != NULL_RTX)
+            {
+              /* If the VPR value is different to the one generated by
+             the vctp, then fail the conversion.  */
+              if (!rtx_equal_p (vctp_vpr_generated,
+                    insn_vpr_reg_operand))
+            {
+              transform_worked = false;
+              break;
return GEN_INT (1);
+            }
+              /* Also ensure that it's a valid recog-ed instruction with
+             the mve_unpredicated_insn atrribute.  */
+              else if (recog_memoized (insn) >= 0
+                   && (new_icode
+                   = get_attr_mve_unpredicated_insn (insn)))
+            {
+              extract_insn (insn);
+              rtx arr[8];
+              int j = 0;
+
+              /* When transforming a VPT-predicated instruction
+                 into its unpredicated equivalent we need to drop
+                 the VPR operand and we may need to also drop a
+                 merge "vuninit" input operand, depending on the
+                 instruction pattern.  Here ensure that we have at
+                 most a two-operand difference between the two
+                 instrunctions.  */
+              int n_operands_diff
+                  = recog_data.n_operands
+                - insn_data[new_icode].n_operands;
+              gcc_assert (n_operands_diff > 0
+                      && n_operands_diff <= 2);
+
+              /* Then, loop through the operands of the predicated
+                 instruction, and retain the ones that map to the
+                 unpredicated instruction.  */
+              for (int i = 0; i < recog_data.n_operands; i++)
+                {
+                  /* Ignore the VPR and, if needed, the vuninit
+                 operand.  */
+                  if (insn_vpr_reg_operand == recog_data.operand[i]
+                  || (n_operands_diff == 2
+                      && !strcmp (recog_data.constraints[i],
+                          "0")))
+                continue;
+                  else
+                {
+                  arr[j] = recog_data.operand[i];
+                  j++;
+                }
+                }
+
+              /* Finally, emit the upredicated instruction.  */
+              switch (j)
+                {
+                  case 2:
+                emit_insn (GEN_FCN (new_icode) (arr[0],
+                                arr[1]));
+                break;
+                  case 3:
+                emit_insn (GEN_FCN (new_icode) (arr[0], arr[1],
+                                arr[2]));
+                break;
+                  default:
+                gcc_unreachable ();
+                }
+            }
+              /* If we can't identify the INSN as either being either
+             for deletion or to re-map, then we don't know how to
+             handle it, so fail the whole conversion.  */
+              else
+            {
+              transform_worked = false;
+              break;
use
return GEN_INT (1);
+            }
+            }
+          /* Instructions that dont's require the VPR can be carried
+             over as-is.  */
+          else
+            emit_insn (PATTERN (insn));
+        }
+        }
+      seq = get_insns ();
+      end_sequence ();
+
+      if (transform_worked)
+        {
no need to check this, you can only get here if it worked.
+          /* Re-write the entire BB contents with the transformed
+         sequence.  */
+          FOR_BB_INSNS_SAFE (body, insn, cur_insn)
+        if (INSN_P (insn))
+          delete_insn (insn);
This will also delete DEBUG_INSNs! You'd probably want to delete only NONDEBUG_INSN_P (insn). I'm not an expert in how DEBUG_INSNs work, but I suspect their order relative to the non-debug insns is likely to be important, so really you'd want to change how you 'transform' the BB and do inline insn replacement.
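As a minimal fix, something like (sketch only):

  FOR_BB_INSNS_SAFE (body, insn, cur_insn)
    if (NONDEBUG_INSN_P (insn))
      delete_insn (insn);

but as said, in-place replacement would side-step the ordering problem entirely.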
+          for (insn = seq; NEXT_INSN (insn); insn = NEXT_INSN (insn))
+        emit_insn_after (PATTERN (insn), BB_END (body));
+          emit_jump_insn_after (PATTERN (insn), BB_END (body));
+          return GEN_INT (decrementnum);
+        }
+    }
+    }
+  /* Bail out: we can't use dlstp/letp, so return 1 to allow loop-doloop to try
+     the standard dls/le pair.  */
+  return GEN_INT (1);
+}

Only reviewed up to here; will look at the rest later.
