Jonathan Wright <jonathan.wri...@arm.com> writes:
> Hi,
>
> V2 of this patch uses the same approach as the one just implemented
> in the multiply high-half cost patch.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> Ok for master?
>
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-07-28  Jonathan Wright  <jonathan.wri...@arm.com>
>
>         * config/aarch64/aarch64.c: Traverse RTL tree to prevent cost
>         of vec_select high-half from being added into Neon add cost.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/vaddX_high_cost.c: New test.

OK, thanks.

Richard

>
> From: Jonathan Wright
> Sent: 29 July 2021 10:22
> To: gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org>
> Cc: Richard Sandiford <richard.sandif...@arm.com>; Kyrylo Tkachov <kyrylo.tkac...@arm.com>
> Subject: [PATCH] aarch64: Don't include vec_select high-half in SIMD add cost
>
> Hi,
>
> The Neon add-long/add-widen instructions can select the top or bottom
> half of the operand registers. This selection does not change the
> cost of the underlying instruction and this should be reflected by
> the RTL cost function.
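>
> For illustration (this example is not part of the patch), a function
> like the following should now compile to a single SADDL2, with the
> high-half selection folded into the widening add rather than being
> materialized separately:
>
>   #include <arm_neon.h>
>
>   int16x8_t
>   widen_add_high (int8x16_t a, int8x16_t b)
>   {
>     /* vget_high_s8 becomes a vec_select of the high half in RTL;
>        combine can now fold it into the widening add (SADDL2).  */
>     return vaddl_s8 (vget_high_s8 (a), vget_high_s8 (b));
>   }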
>
> This patch adds RTL tree traversal in the Neon add cost function to
> match a vec_select of the high half of either operand. This
> traversal prevents the cost of the vec_select from being added into
> the cost of the add - meaning that these instructions can now be
> emitted in the combine pass as they are no longer deemed
> prohibitively expensive.
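>
> The traversal reuses the helper introduced by the multiply high-half
> cost patch. Roughly - this is a sketch of that helper shown for
> context, not a hunk of this patch - it looks through an extend of a
> vec_select of the high half and returns the underlying vector:
>
>   /* If X is an extend of a vec_select of the high half of a vector,
>      return the inner vector so the select contributes no cost.  */
>   static rtx
>   aarch64_strip_extend_vec_half (rtx x)
>   {
>     if (GET_CODE (x) == SIGN_EXTEND || GET_CODE (x) == ZERO_EXTEND)
>       {
>         x = XEXP (x, 0);
>         if (GET_CODE (x) == VEC_SELECT
>             && vec_series_highpart_p (GET_MODE (x),
>                                       GET_MODE (XEXP (x, 0)),
>                                       XEXP (x, 1)))
>           x = XEXP (x, 0);
>       }
>     return x;
>   }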
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> Ok for master?
>
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-07-28  Jonathan Wright  <jonathan.wri...@arm.com>
>
>         * config/aarch64/aarch64.c: Traverse RTL tree to prevent cost
>         of vec_select high-half from being added into Neon add cost.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/vaddX_high_cost.c: New test.
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 10a436ad7e6fa6c5de706ee5abbdc6fb3d268076..cc92cc9c208e63f262c22c7fe8e6915825884775 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -13161,6 +13161,21 @@ cost_minus:
>       op1 = XEXP (x, 1);
>  
>  cost_plus:
> +     if (VECTOR_MODE_P (mode))
> +       {
> +         /* ADDL2 and ADDW2.  */
> +         unsigned int vec_flags = aarch64_classify_vector_mode (mode);
> +         if (vec_flags & VEC_ADVSIMD)
> +           {
> +             /* The select-operand-high-half versions of the add instruction
> +                have the same cost as the regular three vector version -
> +                don't add the costs of the select into the costs of the add.
> +                */
> +             op0 = aarch64_strip_extend_vec_half (op0);
> +             op1 = aarch64_strip_extend_vec_half (op1);
> +           }
> +       }
> +
>       if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
>           || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
>         {
> diff --git a/gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..43f28d597a94d8aceac87ef2240a50cc56c07240
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vaddX_high_cost.c
> @@ -0,0 +1,38 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +#include <arm_neon.h>
> +
> +#define TEST_ADDL(rettype, intype, ts, rs) \
> +  rettype test_vaddl_ ## ts (intype a, intype b, intype c) \
> +     { \
> +             rettype t0 = vaddl_ ## ts (vget_high_ ## ts (a), \
> +                                        vget_high_ ## ts (c)); \
> +             rettype t1 = vaddl_ ## ts (vget_high_ ## ts (b), \
> +                                        vget_high_ ## ts (c)); \
> +             return vaddq ## _ ## rs (t0, t1); \
> +     }
> +
> +TEST_ADDL (int16x8_t, int8x16_t, s8, s16)
> +TEST_ADDL (uint16x8_t, uint8x16_t, u8, u16)
> +TEST_ADDL (int32x4_t, int16x8_t, s16, s32)
> +TEST_ADDL (uint32x4_t, uint16x8_t, u16, u32)
> +TEST_ADDL (int64x2_t, int32x4_t, s32, s64)
> +TEST_ADDL (uint64x2_t, uint32x4_t, u32, u64)
> +
> +#define TEST_ADDW(rettype, intype, intypel, ts, rs) \
> +  rettype test_vaddw_ ## ts (intype a, intype b, intypel c) \
> +     { \
> +             rettype t0 = vaddw_ ## ts (a, vget_high_ ## ts (c)); \
> +             rettype t1 = vaddw_ ## ts (b, vget_high_ ## ts (c)); \
> +             return vaddq ## _ ## rs (t0, t1); \
> +     }
> +
> +TEST_ADDW (int16x8_t, int16x8_t, int8x16_t, s8, s16)
> +TEST_ADDW (uint16x8_t, uint16x8_t, uint8x16_t, u8, u16)
> +TEST_ADDW (int32x4_t, int32x4_t, int16x8_t, s16, s32)
> +TEST_ADDW (uint32x4_t, uint32x4_t, uint16x8_t, u16, u32)
> +TEST_ADDW (int64x2_t, int64x2_t, int32x4_t, s32, s64)
> +TEST_ADDW (uint64x2_t, uint64x2_t, uint32x4_t, u32, u64)
> +
> +/* { dg-final { scan-assembler-not "dup\\t" } } */
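>
> For reference (expansion shown for illustration), the first
> TEST_ADDL instantiation above expands to:
>
>   int16x8_t
>   test_vaddl_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
>   {
>     int16x8_t t0 = vaddl_s8 (vget_high_s8 (a), vget_high_s8 (c));
>     int16x8_t t1 = vaddl_s8 (vget_high_s8 (b), vget_high_s8 (c));
>     return vaddq_s16 (t0, t1);
>   }
>
> The scan-assembler-not check then verifies that no DUP is emitted to
> extract the high halves - each widening add should be a single
> SADDL2/UADDL2 or SADDW2/UADDW2 instruction.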
