On 06/11/2023 17:29, Stamatis Markianos-Wright wrote:
On 06/11/2023 11:24, Richard Sandiford wrote:
Stamatis Markianos-Wright <stam.markianos-wri...@arm.com> writes:
One of the main reasons for reading the arm bits was to try to answer
the question: if we switch to a downcounting loop with a GE
condition,
how do we make sure that the start value is not a large unsigned
number that is interpreted as negative by GE? E.g. if the loop
originally counted up in steps of N and used an LTU condition,
it could stop at a value in the range [INT_MAX + 1, UINT_MAX].
But the loop might never iterate if we start counting down from
most values in that range.
Does the patch handle that?
So AFAICT this is actually handled in the generic code in
`doloop_valid_p`:
This kind of loops fail because of they are "desc->infinite", then no
loop-doloop conversion is attempted at all (even for standard
dls/le loops)
Thanks to that check I haven't been able to trigger anything like the
behaviour you describe, do you think the doloop_valid_p checks are
robust enough?
The loops I was thinking of are provably not infinite though. E.g.:
for (unsigned int i = 0; i < UINT_MAX - 100; ++i)
...
is known to terminate. And doloop conversion is safe with the normal
count-down-by-1 approach, so I don't think current code would need
to reject it. I.e. a conversion to:
unsigned int i = UINT_MAX - 101;
do
...
while (--i != ~0U);
would be safe, but a conversion to:
int i = UINT_MAX - 101;
do
...
while ((i -= step, i > 0));
wouldn't, because the loop body would only be executed once.
I'm only going off the name "infinite" though :) It's possible that
it has more connotations than that.
Thanks,
Richard
Ack, yep, I see what you mean now, and yep, that kind of loop does
indeed pass through doloop_valid_p
Interestingly , in the v8-M Arm ARM this is done with:
```
boolean IsLastLowOverheadLoop(INSTR_EXEC_STATE_Type state)
// This does not check whether a loop is currently active.
// If the PE were in a loop, would this be the last one?
return UInt(state.LoopCount) <= (1 << (4 - LTPSIZE));
```
So architecturally the asm we output would be ok (except maybe the
"branch too far subs;bgt;lctp" fallback at
`predicated_doloop_end_internal` (maybe that should be `bhi`))... But
now GE: isn't looking like an accurate representation of this
operation in the compiler.
I'm wondering if I should try to make
`predicated_doloop_end_internal` contain a comparison along the lines
of:
(gtu: (plus: (LR) (const_int -num_lanes)) (const_int num_lanes_minus_1))
I'll give that a try :)
The only reason I'd chosen to go with GE earlier, tbh, was because of
the existing handling of GE in loop-doloop.cc
Let me know if any other ideas come to your mind!
Cheers,
Stam
It looks like I've had success with the below (diff to previous patch),
trimmed a bit to only the functionally interesting things::
diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md
index 368d5138ca1..54dd4ee564b 100644
--- a/gcc/config/arm/thumb2.md
+++ b/gcc/config/arm/thumb2.md
@@ -1649,16 +1649,28 @@
&& (decrement_num = arm_attempt_dlstp_transform (operands[1]))
&& (INTVAL (decrement_num) != 1))
{
- insn = emit_insn
- (gen_thumb2_addsi3_compare0
- (s0, s0, GEN_INT ((-1) * (INTVAL (decrement_num)))));
- cmp = XVECEXP (PATTERN (insn), 0, 0);
- cc_reg = SET_DEST (cmp);
- bcomp = gen_rtx_GE (VOIDmode, cc_reg, const0_rtx);
loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]);
- emit_jump_insn (gen_rtx_SET (pc_rtx,
- gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
- loc_ref, pc_rtx)));
+ switch (INTVAL (decrement_num))
+ {
+ case 2:
+ insn = emit_jump_insn (gen_predicated_doloop_end_internal2
+ (s0, loc_ref));
+ break;
+ case 4:
+ insn = emit_jump_insn (gen_predicated_doloop_end_internal4
+ (s0, loc_ref));
+ break;
+ case 8:
+ insn = emit_jump_insn (gen_predicated_doloop_end_internal8
+ (s0, loc_ref));
+ break;
+ case 16:
+ insn = emit_jump_insn (gen_predicated_doloop_end_internal16
+ (s0, loc_ref));
+ break;
+ default:
+ gcc_unreachable ();
+ }
DONE;
}
}
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 93905583b18..c083f965fa9 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -6922,23 +6922,24 @@
;; Originally expanded by 'predicated_doloop_end'.
;; In the rare situation where the branch is too far, we do also need to
;; revert FPSCR.LTPSIZE back to 0x100 after the last iteration.
-(define_insn "*predicated_doloop_end_internal"
+(define_insn "predicated_doloop_end_internal<letp_num_lanes>"
[(set (pc)
(if_then_else
- (ge (plus:SI (reg:SI LR_REGNUM)
- (match_operand:SI 0 "const_int_operand" ""))
- (const_int 0))
- (label_ref (match_operand 1 "" ""))
+ (gtu (unspec:SI [(plus:SI (match_operand:SI 0
"s_register_operand" "=r")
+ (const_int <letp_num_lanes_neg>))]
+ LETP)
+ (const_int <letp_num_lanes_minus_1>))
+ (match_operand 1 "" "")
(pc)))
- (set (reg:SI LR_REGNUM)
- (plus:SI (reg:SI LR_REGNUM) (match_dup 0)))
+ (set (match_dup 0)
+ (plus:SI (match_dup 0) (const_int <letp_num_lanes_neg>)))
(clobber (reg:CC CC_REGNUM))]
"TARGET_HAVE_MVE"
{
if (get_attr_length (insn) == 4)
return "letp\t%|lr, %l1";
else
- return "subs\t%|lr, #%n0\n\tbgt\t%l1\n\tlctp";
+ return "subs\t%|lr, #<letp_num_lanes>\n\tbhi\t%l1\n\tlctp";
}
[(set (attr "length")
(if_then_else
@@ -6947,11 +6948,11 @@
(const_int 6)))
(set_attr "type" "branch")])
-(define_insn "dlstp<mode1>_insn"
+(define_insn "dlstp<dlstp_elemsize>_insn"
[
(set (reg:SI LR_REGNUM)
(unspec:SI [(match_operand:SI 0 "s_register_operand" "r")]
DLSTP))
]
"TARGET_HAVE_MVE"
- "dlstp.<mode1>\t%|lr, %0")
+ "dlstp.<dlstp_elemsize>\t%|lr, %0")
diff --git a/gcc/loop-doloop.cc b/gcc/loop-doloop.cc
index 6a72700a127..47fdef989b4 100644
--- a/gcc/loop-doloop.cc
+++ b/gcc/loop-doloop.cc
@@ -185,6 +185,7 @@ doloop_condition_get (rtx_insn *doloop_pat)
|| XEXP (inc_src, 0) != reg
|| !CONST_INT_P (XEXP (inc_src, 1)))
return 0;
+ int dec_num = abs (INTVAL (XEXP (inc_src, 1)));
/* Check for (set (pc) (if_then_else (condition)
(label_ref (label))
@@ -199,21 +200,32 @@ doloop_condition_get (rtx_insn *doloop_pat)
/* Extract loop termination condition. */
condition = XEXP (SET_SRC (cmp), 0);
- /* We expect a GE or NE comparison with 0 or 1. */
- if ((GET_CODE (condition) != GE
- && GET_CODE (condition) != NE)
- || (XEXP (condition, 1) != const0_rtx
- && XEXP (condition, 1) != const1_rtx))
+ /* We expect a GE or NE comparison with 0 or 1, or a GTU comparison
with
+ dec_num - 1. */
+ if (!((GET_CODE (condition) == GE
+ || GET_CODE (condition) == NE)
+ && (XEXP (condition, 1) == const0_rtx
+ || XEXP (condition, 1) == const1_rtx ))
+ &&!(GET_CODE (condition) == GTU
+ && ((INTVAL (XEXP (condition, 1))) == (dec_num - 1))))
return 0;
- if ((XEXP (condition, 0) == reg)
+ /* For the ARM special case of having a GTU: re-form the condition
without
+ the unspec for the benefit of the middle-end. */
+ if (GET_CODE (condition) == GTU)
+ {
+ condition = gen_rtx_fmt_ee (GTU, VOIDmode, inc_src, GEN_INT
(dec_num - 1));
+ return condition;
+ }
+ else if ((XEXP (condition, 0) == reg)
/* For the third case: */
|| ((cc_reg != NULL_RTX)
&& (XEXP (condition, 0) == cc_reg)
&& (reg_orig == reg))
@@ -245,20 +257,11 @@ doloop_condition_get (rtx_insn *doloop_pat)
(label_ref (label))
(pc))))])
- So we return the second form instead for the two cases when n == 1.
-
- For n > 1, the final value may be exceeded, so use GE instead of NE.
+ So we return the second form instead for the two cases.
*/
- if (GET_CODE (pattern) != PARALLEL)
- {
- if (INTVAL (XEXP (inc_src, 1)) != -1)
- condition = gen_rtx_fmt_ee (GE, VOIDmode, inc_src, const0_rtx);
- else
- condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);;
- }
-
+ condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
return condition;
- }
+ }
/* ??? If a machine uses a funny comparison, we could return a
canonicalized form here. */
@@ -501,7 +504,8 @@ doloop_modify (class loop *loop, class niter_desc
*desc,
case GE:
/* Currently only GE tests against zero are supported. */
gcc_assert (XEXP (condition, 1) == const0_rtx);
-
+ /* FALLTHRU */
+ case GTU:
noloop = constm1_rtx;
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index a6a7ff507a5..9398702cddd 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -2673,8 +2673,16 @@
(define_int_attr mrrc [(VUNSPEC_MRRC "mrrc") (VUNSPEC_MRRC2 "mrrc2")])
(define_int_attr MRRC [(VUNSPEC_MRRC "MRRC") (VUNSPEC_MRRC2 "MRRC2")])
-(define_int_attr mode1 [(DLSTP8 "8") (DLSTP16 "16") (DLSTP32 "32")
- (DLSTP64 "64")])
+(define_int_attr dlstp_elemsize [(DLSTP8 "8") (DLSTP16 "16") (DLSTP32
"32")
+ (DLSTP64 "64")])
+
+(define_int_attr letp_num_lanes [(LETP8 "16") (LETP16 "8") (LETP32 "4")
+ (LETP64 "2")])
+(define_int_attr letp_num_lanes_neg [(LETP8 "-16") (LETP16 "-8")
(LETP32 "-4")
+ (LETP64 "-2")])
+
+(define_int_attr letp_num_lanes_minus_1 [(LETP8 "15") (LETP16 "7")
(LETP32 "3")
+ (LETP64 "1")])
(define_int_attr opsuffix [(UNSPEC_DOT_S "s8")
(UNSPEC_DOT_U "u8")
@@ -2921,6 +2929,8 @@
(define_int_iterator VQSHLUQ_N [VQSHLUQ_N_S])
(define_int_iterator DLSTP [DLSTP8 DLSTP16 DLSTP32
DLSTP64])
+(define_int_iterator LETP [LETP8 LETP16 LETP32
+ LETP64])
;; Define iterators for VCMLA operations
(define_int_iterator VCMLA_OP [UNSPEC_VCMLA
/* The iteration count does not need incrementing for a GE
test. */
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 12ae4c4f820..2d6f27c14f4 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -587,6 +587,10 @@
DLSTP16
DLSTP32
DLSTP64
+ LETP8
+ LETP16
+ LETP32
+ LETP64
VPNOT
VCREATEQ_F
VCVTQ_N_TO_F_S
I've attached the whole [2/2] patch diff with this change and
the required comment changes in doloop_condition_get.
WDYT?
Thanks,
Stam