Jonathan Wright <jonathan.wri...@arm.com> writes: > Hi, > > V2 of this patch uses the same approach as that just implemented > for the multiply high-half cost patch. > > Regression tested and bootstrapped on aarch64-none-linux-gnu - no > issues. > > Ok for master? > > Thanks, > Jonathan > > --- > > gcc/ChangeLog: > > 2021-07-28 Jonathan Wright <jonathan.wri...@arm.com> > > * config/aarch64/aarch64.c: Traverse RTL tree to prevent cost > of vec_select high-half from being added into Neon add cost. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/vaddX_high_cost.c: New test.
OK, thanks. Richard > > From: Jonathan Wright > Sent: 29 July 2021 10:22 > To: gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org> > Cc: Richard Sandiford <richard.sandif...@arm.com>; Kyrylo Tkachov > <kyrylo.tkac...@arm.com> > Subject: [PATCH] aarch64: Don't include vec_select high-half in SIMD add cost > > Hi, > > The Neon add-long/add-widen instructions can select the top or bottom > half of the operand registers. This selection does not change the > cost of the underlying instruction and this should be reflected by > the RTL cost function. > > This patch adds RTL tree traversal in the Neon add cost function to > match vec_select high-half of its operands. This traversal prevents > the cost of the vec_select from being added into the cost of the > add - meaning that these instructions can now be emitted in the > combine pass as they are no longer deemed prohibitively expensive. > > Regression tested and bootstrapped on aarch64-none-linux-gnu - no > issues. > > Ok for master? > > Thanks, > Jonathan > > --- > > gcc/ChangeLog: > > 2021-07-28 Jonathan Wright <jonathan.wri...@arm.com> > > * config/aarch64/aarch64.c: Traverse RTL tree to prevent cost > of vec_select high-half from being added into Neon add cost. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/vaddX_high_cost.c: New test. > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index > 10a436ad7e6fa6c5de706ee5abbdc6fb3d268076..cc92cc9c208e63f262c22c7fe8e6915825884775 > 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -13161,6 +13161,21 @@ cost_minus: > op1 = XEXP (x, 1); > > cost_plus: > + if (VECTOR_MODE_P (mode)) > + { > + /* ADDL2 and ADDW2. 
*/ > + unsigned int vec_flags = aarch64_classify_vector_mode (mode); > + if (vec_flags & VEC_ADVSIMD) > + { > + /* The select-operand-high-half versions of the add instruction > + have the same cost as the regular three vector version - > + don't add the costs of the select into the costs of the add. > + */ > + op0 = aarch64_strip_extend_vec_half (op0); > + op1 = aarch64_strip_extend_vec_half (op1); > + } > + } > + > if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE > || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE) > { > diff --git a/gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c > b/gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..43f28d597a94d8aceac87ef2240a50cc56c07240 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c > @@ -0,0 +1,38 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3" } */ > + > +#include <arm_neon.h> > + > +#define TEST_ADDL(rettype, intype, ts, rs) \ > + rettype test_vaddl_ ## ts (intype a, intype b, intype c) \ > + { \ > + rettype t0 = vaddl_ ## ts (vget_high_ ## ts (a), \ > + vget_high_ ## ts (c)); \ > + rettype t1 = vaddl_ ## ts (vget_high_ ## ts (b), \ > + vget_high_ ## ts (c)); \ > + return vaddq ## _ ## rs (t0, t1); \ > + } > + > +TEST_ADDL (int16x8_t, int8x16_t, s8, s16) > +TEST_ADDL (uint16x8_t, uint8x16_t, u8, u16) > +TEST_ADDL (int32x4_t, int16x8_t, s16, s32) > +TEST_ADDL (uint32x4_t, uint16x8_t, u16, u32) > +TEST_ADDL (int64x2_t, int32x4_t, s32, s64) > +TEST_ADDL (uint64x2_t, uint32x4_t, u32, u64) > + > +#define TEST_ADDW(rettype, intype, intypel, ts, rs) \ > + rettype test_vaddw_ ## ts (intype a, intype b, intypel c) \ > + { \ > + rettype t0 = vaddw_ ## ts (a, vget_high_ ## ts (c)); \ > + rettype t1 = vaddw_ ## ts (b, vget_high_ ## ts (c)); \ > + return vaddq ## _ ## rs (t0, t1); \ > + } > + > +TEST_ADDW (int16x8_t, int16x8_t, int8x16_t, s8, s16) > +TEST_ADDW (uint16x8_t, uint16x8_t, uint8x16_t, u8, u16) > +TEST_ADDW 
(int32x4_t, int32x4_t, int16x8_t, s16, s32) > +TEST_ADDW (uint32x4_t, uint32x4_t, uint16x8_t, u16, u32) > +TEST_ADDW (int64x2_t, int64x2_t, int32x4_t, s32, s64) > +TEST_ADDW (uint64x2_t, uint64x2_t, uint32x4_t, u32, u64) > + > +/* { dg-final { scan-assembler-not "dup\\t" } } */