On Tue, 18 Apr 2023, Jakub Jelinek wrote:

> Hi!
>
> match.pd has an optimization, mostly for AArch64, in which it optimizes
> certain forms of __builtin_shuffle of x + y and x - y vectors into an
> fneg using a twice as wide element type, so that every other sign is
> changed, followed by an fadd.
>
> The following patch extends that optimization so that it can handle
> other forms as well, using the same fneg but an fsub instead of an fadd.
>
> As plus is commutative and minus is not, and I want to handle vec_perm
> with plus/minus and minus/plus operand order, preferably in one pattern,
> I had to do the matching operand checks by hand.
>
> Bootstrapped/regtested on aarch64-linux, x86_64-linux and i686-linux,
> ok for trunk?

OK.

Thanks,
Richard.
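To make the trick concrete, here is a minimal standalone C sketch (not part of the patch; it assumes a little-endian target with IEEE binary32/binary64, matching the pattern's !FLOAT_WORDS_BIG_ENDIAN guard) of the bit-level identity the optimization relies on:

  #include <stdio.h>
  #include <string.h>

  int main (void)
  {
    float pair[2] = { 1.5f, 2.5f };
    double wide;

    /* View the two floats as one double (the view_convert in the pattern).  */
    memcpy (&wide, pair, sizeof wide);
    /* One fneg on the wider element flips bit 63, which on a little-endian
       target is the sign bit of pair[1] and of nothing else.  */
    wide = -wide;
    memcpy (pair, &wide, sizeof wide);

    printf ("%g %g\n", pair[0], pair[1]);  /* prints 1.5 -2.5 */
    return 0;
  }

Because one negation in the wider type flips only every other sign, a single fadd of such a sign-flipped b with a yields alternating sums and differences; the new fsub form covers the mirrored difference/sum order.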
> 2023-04-18  Jakub Jelinek  <ja...@redhat.com>
>
> 	PR tree-optimization/109240
> 	* match.pd (fneg/fadd): Rewrite such that it handles both plus as
> 	first vec_perm operand and minus as second using fneg/fadd and
> 	minus as first vec_perm operand and plus as second using fneg/fsub.
>
> 	* gcc.target/aarch64/simd/addsub_2.c: New test.
> 	* gcc.target/aarch64/sve/addsub_2.c: New test.
>
> --- gcc/match.pd.jj	2023-03-21 19:59:40.209634256 +0100
> +++ gcc/match.pd	2023-03-22 10:17:25.344772636 +0100
> @@ -8074,63 +8074,76 @@ and,
>     under IEEE 754 the fneg of the wider type will negate every even entry
>     and when doing an add we get a sub of the even and add of every odd
>     elements.  */
> -(simplify
> - (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)
> - (if (!VECTOR_INTEGER_TYPE_P (type)
> -      && !FLOAT_WORDS_BIG_ENDIAN)
> -  (with
> -   {
> -     /* Build a vector of integers from the tree mask.  */
> -     vec_perm_builder builder;
> -   }
> -   (if (tree_to_vec_perm_builder (&builder, @2))
> -    (with
> -     {
> -       /* Create a vec_perm_indices for the integer vector.  */
> -       poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
> -       vec_perm_indices sel (builder, 2, nelts);
> -       machine_mode vec_mode = TYPE_MODE (type);
> -       machine_mode wide_mode;
> -       scalar_mode wide_elt_mode;
> -       poly_uint64 wide_nunits;
> -       scalar_mode inner_mode = GET_MODE_INNER (vec_mode);
> -     }
> -     (if (sel.series_p (0, 2, 0, 2)
> -	  && sel.series_p (1, 2, nelts + 1, 2)
> -	  && GET_MODE_2XWIDER_MODE (inner_mode).exists (&wide_elt_mode)
> -	  && multiple_p (GET_MODE_NUNITS (vec_mode), 2, &wide_nunits)
> -	  && related_vector_mode (vec_mode, wide_elt_mode,
> -				  wide_nunits).exists (&wide_mode))
> -      (with
> -       {
> -	 tree stype
> -	   = lang_hooks.types.type_for_mode (GET_MODE_INNER (wide_mode),
> -					     TYPE_UNSIGNED (type));
> -	 tree ntype = build_vector_type_for_mode (stype, wide_mode);
> +(for plusminus (plus minus)
> +     minusplus (minus plus)
> + (simplify
> +  (vec_perm (plusminus @0 @1) (minusplus @2 @3) VECTOR_CST@4)
> +  (if (!VECTOR_INTEGER_TYPE_P (type)
> +       && !FLOAT_WORDS_BIG_ENDIAN
> +       /* plus is commutative, while minus is not, so :c can't be used.
> +	  Do equality comparisons by hand and at the end pick the operands
> +	  from the minus.  */
> +       && (operand_equal_p (@0, @2, 0)
> +	   ? operand_equal_p (@1, @3, 0)
> +	   : operand_equal_p (@0, @3, 0) && operand_equal_p (@1, @2, 0)))
> +   (with
> +    {
> +      /* Build a vector of integers from the tree mask.  */
> +      vec_perm_builder builder;
> +    }
> +    (if (tree_to_vec_perm_builder (&builder, @4))
> +     (with
> +      {
> +	/* Create a vec_perm_indices for the integer vector.  */
> +	poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
> +	vec_perm_indices sel (builder, 2, nelts);
> +	machine_mode vec_mode = TYPE_MODE (type);
> +	machine_mode wide_mode;
> +	scalar_mode wide_elt_mode;
> +	poly_uint64 wide_nunits;
> +	scalar_mode inner_mode = GET_MODE_INNER (vec_mode);
> +      }
> +      (if (sel.series_p (0, 2, 0, 2)
> +	   && sel.series_p (1, 2, nelts + 1, 2)
> +	   && GET_MODE_2XWIDER_MODE (inner_mode).exists (&wide_elt_mode)
> +	   && multiple_p (GET_MODE_NUNITS (vec_mode), 2, &wide_nunits)
> +	   && related_vector_mode (vec_mode, wide_elt_mode,
> +				   wide_nunits).exists (&wide_mode))
> +       (with
> +	{
> +	  tree stype
> +	    = lang_hooks.types.type_for_mode (GET_MODE_INNER (wide_mode),
> +					      TYPE_UNSIGNED (type));
> +	  tree ntype = build_vector_type_for_mode (stype, wide_mode);
>
> -	 /* The format has to be a non-extended ieee format.  */
> -	 const struct real_format *fmt_old = FLOAT_MODE_FORMAT (vec_mode);
> -	 const struct real_format *fmt_new = FLOAT_MODE_FORMAT (wide_mode);
> -       }
> -       (if (TYPE_MODE (stype) != BLKmode
> -	    && VECTOR_TYPE_P (ntype)
> -	    && fmt_old != NULL
> -	    && fmt_new != NULL)
> -	(with
> -	 {
> -	   /* If the target doesn't support v1xx vectors, try using
> -	      scalar mode xx instead.  */
> +	  /* The format has to be a non-extended ieee format.  */
> +	  const struct real_format *fmt_old = FLOAT_MODE_FORMAT (vec_mode);
> +	  const struct real_format *fmt_new = FLOAT_MODE_FORMAT (wide_mode);
> +	}
> +	(if (TYPE_MODE (stype) != BLKmode
> +	     && VECTOR_TYPE_P (ntype)
> +	     && fmt_old != NULL
> +	     && fmt_new != NULL)
> +	 (with
> +	  {
> +	    /* If the target doesn't support v1xx vectors, try using
> +	       scalar mode xx instead.  */
> 	    if (known_eq (GET_MODE_NUNITS (wide_mode), 1)
> 		&& !target_supports_op_p (ntype, NEGATE_EXPR, optab_vector))
> 	      ntype = stype;
> -	 }
> -	 (if (fmt_new->signbit_rw
> -	      == fmt_old->signbit_rw + GET_MODE_UNIT_BITSIZE (vec_mode)
> -	      && fmt_new->signbit_rw == fmt_new->signbit_ro
> -	      && targetm.can_change_mode_class (TYPE_MODE (ntype), TYPE_MODE (type), ALL_REGS)
> -	      && ((optimize_vectors_before_lowering_p () && VECTOR_TYPE_P (ntype))
> -		  || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)))
> -	  (plus (view_convert:type (negate (view_convert:ntype @1))) @0)))))))))))
> +	  }
> +	  (if (fmt_new->signbit_rw
> +	       == fmt_old->signbit_rw + GET_MODE_UNIT_BITSIZE (vec_mode)
> +	       && fmt_new->signbit_rw == fmt_new->signbit_ro
> +	       && targetm.can_change_mode_class (TYPE_MODE (ntype),
> +						 TYPE_MODE (type), ALL_REGS)
> +	       && ((optimize_vectors_before_lowering_p ()
> +		    && VECTOR_TYPE_P (ntype))
> +		   || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)))
> +	   (if (plusminus == PLUS_EXPR)
> +	    (plus (view_convert:type (negate (view_convert:ntype @3))) @2)
> +	    (minus @0 (view_convert:type
> +			(negate (view_convert:ntype @1))))))))))))))))
>
>  (simplify
>   (vec_perm @0 @1 VECTOR_CST@2)
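To illustrate the new minusplus arm, here is a sketch in GCC vector-extension C (not from the patch; the function names and the V4SF shapes are illustrative, little-endian assumed) of what the pattern matches and what it produces:

  typedef float v4sf __attribute__ ((vector_size (16)));
  typedef int v4si __attribute__ ((vector_size (16)));
  typedef double v2df __attribute__ ((vector_size (16)));

  /* What the pattern matches: even lanes taken from a - b, odd lanes from
     a + b, i.e. the { 0, nelts + 1, 2, nelts + 3, ... } mask that the two
     sel.series_p checks verify.  */
  v4sf before (v4sf a, v4sf b)
  {
    return __builtin_shuffle (a - b, a + b, (v4si) { 0, 5, 2, 7 });
  }

  /* What it is rewritten to: the same-size vector casts reinterpret bits
     (the view_converts), the v2df negation flips only the odd float lanes'
     signs, and a single subtraction replaces the add, sub and permute.  */
  v4sf after (v4sf a, v4sf b)
  {
    return a - (v4sf) -(v2df) b;
  }

Both functions compute { a0-b0, a1+b1, a2-b2, a3+b3 }; the second form is what lets AArch64 emit the fneg v.2d / fsub v.4s pair that the new tests below look for.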
> --- gcc/testsuite/gcc.target/aarch64/simd/addsub_2.c.jj	2023-03-22 10:22:57.324017790 +0100
> +++ gcc/testsuite/gcc.target/aarch64/simd/addsub_2.c	2023-03-22 10:23:54.482199126 +0100
> @@ -0,0 +1,56 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
> +/* { dg-options "-Ofast" } */
> +/* { dg-add-options arm_v8_2a_fp16_neon } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#pragma GCC target "+nosve"
> +
> +/*
> +** f1:
> +** ...
> +**	fneg	v[0-9]+.2d, v[0-9]+.2d
> +**	fsub	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +** ...
> +*/
> +void f1 (float *restrict a, float *restrict b, float *res, int n)
> +{
> +  for (int i = 0; i < (n & -4); i+=2)
> +    {
> +      res[i+0] = a[i+0] - b[i+0];
> +      res[i+1] = a[i+1] + b[i+1];
> +    }
> +}
> +
> +/*
> +** d1:
> +** ...
> +**	fneg	v[0-9]+.4s, v[0-9]+.4s
> +**	fsub	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
> +** ...
> +*/
> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
> +{
> +  for (int i = 0; i < (n & -8); i+=2)
> +    {
> +      res[i+0] = a[i+0] - b[i+0];
> +      res[i+1] = a[i+1] + b[i+1];
> +    }
> +}
> +
> +/*
> +** e1:
> +** ...
> +**	fsub	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +**	fadd	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +**	ins	v[0-9]+.d\[1\], v[0-9]+.d\[1\]
> +** ...
> +*/
> +void e1 (double *restrict a, double *restrict b, double *res, int n)
> +{
> +  for (int i = 0; i < (n & -4); i+=2)
> +    {
> +      res[i+0] = a[i+0] - b[i+0];
> +      res[i+1] = a[i+1] + b[i+1];
> +    }
> +}
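The e1 functions exercise the negative case: for double elements there is presumably no usable twice-as-wide vector mode on AArch64 for the wider fneg, so the pattern must not fire and the separate fsub/fadd plus a lane move have to survive, which is what the asm pattern above checks. A vector-extension sketch (not from the patch; e1_kernel is a made-up name) of the computation that stays unfolded:

  typedef double v2df __attribute__ ((vector_size (16)));
  typedef long long v2di __attribute__ ((vector_size (16)));

  /* Even lane from the subtraction, odd lane from the addition; with no
     wider IEEE element type to negate through, this permute cannot be
     folded away, hence the expected fsub, fadd and ins.  */
  v2df e1_kernel (v2df a, v2df b)
  {
    return __builtin_shuffle (a - b, a + b, (v2di) { 0, 3 });
  }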
> --- gcc/testsuite/gcc.target/aarch64/sve/addsub_2.c.jj	2023-03-22 10:24:14.169917153 +0100
> +++ gcc/testsuite/gcc.target/aarch64/sve/addsub_2.c	2023-03-22 10:25:05.414183194 +0100
> @@ -0,0 +1,52 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +/*
> +** f1:
> +** ...
> +**	fneg	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> +**	fsub	z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
> +** ...
> +*/
> +void f1 (float *restrict a, float *restrict b, float *res, int n)
> +{
> +  for (int i = 0; i < (n & -4); i+=2)
> +    {
> +      res[i+0] = a[i+0] - b[i+0];
> +      res[i+1] = a[i+1] + b[i+1];
> +    }
> +}
> +
> +/*
> +** d1:
> +** ...
> +**	fneg	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
> +**	fsub	z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
> +** ...
> +*/
> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
> +{
> +  for (int i = 0; i < (n & -8); i+=2)
> +    {
> +      res[i+0] = a[i+0] - b[i+0];
> +      res[i+1] = a[i+1] + b[i+1];
> +    }
> +}
> +
> +/*
> +** e1:
> +** ...
> +**	fadd	z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
> +**	movprfx	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> +**	fsub	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> +** ...
> +*/
> +void e1 (double *restrict a, double *restrict b, double *res, int n)
> +{
> +  for (int i = 0; i < (n & -4); i+=2)
> +    {
> +      res[i+0] = a[i+0] - b[i+0];
> +      res[i+1] = a[i+1] + b[i+1];
> +    }
> +}
>
> 	Jakub

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)