Tamar Christina <tamar.christ...@arm.com> writes: > Hi All, > > The following loop does a conditional reduction using an add: > > #include <stdint.h> > > int32_t f (int32_t *restrict array, int len, int min) > { > int32_t iSum = 0; > > for (int i=0; i<len; i++) { > if (array[i] >= min) > iSum += array[i]; > } > return iSum; > } > > For this we currently generate: > > mov z1.b, #0 > mov z2.s, w2 > mov z3.d, z1.d > ptrue p2.b, all > ld1w z0.s, p0/z, [x0, x3, lsl 2] > cmpge p1.s, p2/z, z0.s, z2.s > add x3, x3, x4 > sel z0.s, p1, z0.s, z3.s > add z1.s, p0/m, z1.s, z0.s > whilelo p0.s, w3, w1 > > where the SEL is unneeded as it's selecting between 0 and a value. This can be > optimized to just do the conditional add on p1 instead of p0. After this > patch we generate: > > mov z2.s, w2 > mov z0.b, #0 > ptrue p1.b, all > ld1w z1.s, p0/z, [x0, x3, lsl 2] > cmpge p0.s, p0/z, z1.s, z2.s > add x3, x3, x4 > add z0.s, p0/m, z0.s, z1.s > whilelo p0.s, w3, w1 > > and so we drop the SEL and the 0 move. > > Bootstrapped and regtested on aarch64-none-linux-gnu with no issues. > > Ok for master?
OK, thanks. Richard > Thanks, > Tamar > > gcc/ChangeLog: > > * match.pd: New rule. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/sve/pred-cond-reduc.c: New test. > > --- inline copy of patch -- > diff --git a/gcc/match.pd b/gcc/match.pd > index > 19cbad7592787a568d4a7cfd62746d5844c0be5f..ec98a302ac773647413f776fba15930ad247c747 > 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -6978,6 +6978,18 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > && element_precision (type) == element_precision (op_type)) > (view_convert (cond_op @2 @3 @4 @5 (view_convert:op_type @1))))))) > > +/* Detect simplification for a conditional reduction where > + > + a = mask1 ? b : 0 > + c = mask2 ? d + a : d > + > + is turned into > + > + c = mask1 && mask2 ? d + b : d. */ > +(simplify > + (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1) > + (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1)) > + > /* For pointers @0 and @2 and nonnegative constant offset @1, look for > expressions like: > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-cond-reduc.c > b/gcc/testsuite/gcc.target/aarch64/sve/pred-cond-reduc.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..bd53025d3f17224004244dadc88e0c68ded23f12 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-cond-reduc.c > @@ -0,0 +1,18 @@ > +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ > +/* { dg-options "-O3 --save-temps" } */ > + > +#include <stdint.h> > + > +int32_t f (int32_t *restrict array, int len, int min) > +{ > + int32_t iSum = 0; > + > + for (int i=0; i<len; i++) { > + if (array[i] >= min) > + iSum += array[i]; > + } > + return iSum; > +} > + > + > +/* { dg-final { scan-assembler-not {\tsel\tz[0-9]+\.s, p1, z[0-9]+\.s, > z[0-9]+\.s} } } */