On Tue, 18 Apr 2023, Jakub Jelinek wrote:

> Hi!
>
> match.pd has an optimization, mostly for AArch64, in which it optimizes
> certain forms of __builtin_shuffle of x + y and x - y vectors into an
> fneg using a twice as wide element type, so that every other sign is
> changed, followed by an fadd.
>
> The following patch extends that optimization so that it can handle
> other forms as well, using the same fneg but an fsub instead of an fadd.
>
> As plus is commutative and minus is not, and I want to handle vec_perm
> with plus/minus and minus/plus operand order, preferably in one pattern,
> I had to do the matching operand checks by hand.
>
> Bootstrapped/regtested on aarch64-linux, x86_64-linux and i686-linux,
> ok for trunk?

OK.

Thanks,
Richard.
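To make the trick concrete, here is a minimal standalone C sketch (not part of the patch; it assumes a little-endian target with IEEE binary32/binary64, matching the pattern's !FLOAT_WORDS_BIG_ENDIAN guard) of the bit-level identity the optimization relies on:

  #include <stdio.h>
  #include <string.h>

  int main (void)
  {
    float pair[2] = { 1.5f, 2.5f };
    double wide;

    /* View the two floats as one double (the view_convert in the pattern).  */
    memcpy (&wide, pair, sizeof wide);
    /* One fneg on the wider element flips bit 63, which on a little-endian
       target is the sign bit of pair[1] and of nothing else.  */
    wide = -wide;
    memcpy (pair, &wide, sizeof wide);

    printf ("%g %g\n", pair[0], pair[1]);  /* prints 1.5 -2.5 */
    return 0;
  }

Because one negation in the wider type flips only every other sign, a single fadd of such a sign-flipped b with a yields alternating sums and differences; the new fsub form covers the mirrored difference/sum order.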
> 2023-04-18  Jakub Jelinek  <ja...@redhat.com>
>
> 	PR tree-optimization/109240
> 	* match.pd (fneg/fadd): Rewrite such that it handles both plus as
> 	first vec_perm operand and minus as second using fneg/fadd and
> 	minus as first vec_perm operand and plus as second using fneg/fsub.
>
> 	* gcc.target/aarch64/simd/addsub_2.c: New test.
> 	* gcc.target/aarch64/sve/addsub_2.c: New test.
>
> --- gcc/match.pd.jj	2023-03-21 19:59:40.209634256 +0100
> +++ gcc/match.pd	2023-03-22 10:17:25.344772636 +0100
> @@ -8074,63 +8074,76 @@ and,
>     under IEEE 754 the fneg of the wider type will negate every even entry
>     and when doing an add we get a sub of the even and add of every odd
>     elements.  */
> -(simplify
> - (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)
> - (if (!VECTOR_INTEGER_TYPE_P (type)
> -      && !FLOAT_WORDS_BIG_ENDIAN)
> -  (with
> -   {
> -     /* Build a vector of integers from the tree mask.  */
> -     vec_perm_builder builder;
> -   }
> -   (if (tree_to_vec_perm_builder (&builder, @2))
> -    (with
> -     {
> -       /* Create a vec_perm_indices for the integer vector.  */
> -       poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
> -       vec_perm_indices sel (builder, 2, nelts);
> -       machine_mode vec_mode = TYPE_MODE (type);
> -       machine_mode wide_mode;
> -       scalar_mode wide_elt_mode;
> -       poly_uint64 wide_nunits;
> -       scalar_mode inner_mode = GET_MODE_INNER (vec_mode);
> -     }
> -     (if (sel.series_p (0, 2, 0, 2)
> -	  && sel.series_p (1, 2, nelts + 1, 2)
> -	  && GET_MODE_2XWIDER_MODE (inner_mode).exists (&wide_elt_mode)
> -	  && multiple_p (GET_MODE_NUNITS (vec_mode), 2, &wide_nunits)
> -	  && related_vector_mode (vec_mode, wide_elt_mode,
> -				  wide_nunits).exists (&wide_mode))
> -      (with
> -       {
> -	 tree stype
> -	   = lang_hooks.types.type_for_mode (GET_MODE_INNER (wide_mode),
> -					     TYPE_UNSIGNED (type));
> -	 tree ntype = build_vector_type_for_mode (stype, wide_mode);
> +(for plusminus (plus minus)
> +     minusplus (minus plus)
> + (simplify
> +  (vec_perm (plusminus @0 @1) (minusplus @2 @3) VECTOR_CST@4)
> +  (if (!VECTOR_INTEGER_TYPE_P (type)
> +       && !FLOAT_WORDS_BIG_ENDIAN
> +       /* plus is commutative, while minus is not, so :c can't be used.
> +	  Do equality comparisons by hand and at the end pick the operands
> +	  from the minus.  */
> +       && (operand_equal_p (@0, @2, 0)
> +	   ? operand_equal_p (@1, @3, 0)
> +	   : operand_equal_p (@0, @3, 0) && operand_equal_p (@1, @2, 0)))
> +   (with
> +    {
> +      /* Build a vector of integers from the tree mask.  */
> +      vec_perm_builder builder;
> +    }
> +    (if (tree_to_vec_perm_builder (&builder, @4))
> +     (with
> +      {
> +	/* Create a vec_perm_indices for the integer vector.  */
> +	poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
> +	vec_perm_indices sel (builder, 2, nelts);
> +	machine_mode vec_mode = TYPE_MODE (type);
> +	machine_mode wide_mode;
> +	scalar_mode wide_elt_mode;
> +	poly_uint64 wide_nunits;
> +	scalar_mode inner_mode = GET_MODE_INNER (vec_mode);
> +      }
> +      (if (sel.series_p (0, 2, 0, 2)
> +	   && sel.series_p (1, 2, nelts + 1, 2)
> +	   && GET_MODE_2XWIDER_MODE (inner_mode).exists (&wide_elt_mode)
> +	   && multiple_p (GET_MODE_NUNITS (vec_mode), 2, &wide_nunits)
> +	   && related_vector_mode (vec_mode, wide_elt_mode,
> +				   wide_nunits).exists (&wide_mode))
> +       (with
> +	{
> +	  tree stype
> +	    = lang_hooks.types.type_for_mode (GET_MODE_INNER (wide_mode),
> +					      TYPE_UNSIGNED (type));
> +	  tree ntype = build_vector_type_for_mode (stype, wide_mode);
>
> -	 /* The format has to be a non-extended ieee format.  */
> -	 const struct real_format *fmt_old = FLOAT_MODE_FORMAT (vec_mode);
> -	 const struct real_format *fmt_new = FLOAT_MODE_FORMAT (wide_mode);
> -       }
> -       (if (TYPE_MODE (stype) != BLKmode
> -	    && VECTOR_TYPE_P (ntype)
> -	    && fmt_old != NULL
> -	    && fmt_new != NULL)
> -	(with
> -	 {
> -	   /* If the target doesn't support v1xx vectors, try using
> -	      scalar mode xx instead.  */
> +	  /* The format has to be a non-extended ieee format.  */
> +	  const struct real_format *fmt_old = FLOAT_MODE_FORMAT (vec_mode);
> +	  const struct real_format *fmt_new = FLOAT_MODE_FORMAT (wide_mode);
> +	}
> +	(if (TYPE_MODE (stype) != BLKmode
> +	     && VECTOR_TYPE_P (ntype)
> +	     && fmt_old != NULL
> +	     && fmt_new != NULL)
> +	 (with
> +	  {
> +	    /* If the target doesn't support v1xx vectors, try using
> +	       scalar mode xx instead.  */
> 	    if (known_eq (GET_MODE_NUNITS (wide_mode), 1)
> 		&& !target_supports_op_p (ntype, NEGATE_EXPR, optab_vector))
> 	      ntype = stype;
> -	 }
> -	 (if (fmt_new->signbit_rw
> -	      == fmt_old->signbit_rw + GET_MODE_UNIT_BITSIZE (vec_mode)
> -	      && fmt_new->signbit_rw == fmt_new->signbit_ro
> -	      && targetm.can_change_mode_class (TYPE_MODE (ntype), TYPE_MODE (type), ALL_REGS)
> -	      && ((optimize_vectors_before_lowering_p () && VECTOR_TYPE_P (ntype))
> -		  || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)))
> -	  (plus (view_convert:type (negate (view_convert:ntype @1))) @0)))))))))))
> +	  }
> +	  (if (fmt_new->signbit_rw
> +	       == fmt_old->signbit_rw + GET_MODE_UNIT_BITSIZE (vec_mode)
> +	       && fmt_new->signbit_rw == fmt_new->signbit_ro
> +	       && targetm.can_change_mode_class (TYPE_MODE (ntype),
> +						 TYPE_MODE (type), ALL_REGS)
> +	       && ((optimize_vectors_before_lowering_p ()
> +		    && VECTOR_TYPE_P (ntype))
> +		   || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)))
> +	   (if (plusminus == PLUS_EXPR)
> +	    (plus (view_convert:type (negate (view_convert:ntype @3))) @2)
> +	    (minus @0 (view_convert:type
> +			(negate (view_convert:ntype @1))))))))))))))))
>
>  (simplify
>   (vec_perm @0 @1 VECTOR_CST@2)
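To illustrate the new minusplus arm, here is a sketch in GCC vector-extension C (not from the patch; the function names and the V4SF shapes are illustrative, little-endian assumed) of what the pattern matches and what it produces:

  typedef float v4sf __attribute__ ((vector_size (16)));
  typedef int v4si __attribute__ ((vector_size (16)));
  typedef double v2df __attribute__ ((vector_size (16)));

  /* What the pattern matches: even lanes taken from a - b, odd lanes from
     a + b, i.e. the { 0, nelts + 1, 2, nelts + 3, ... } mask that the two
     sel.series_p checks verify.  */
  v4sf before (v4sf a, v4sf b)
  {
    return __builtin_shuffle (a - b, a + b, (v4si) { 0, 5, 2, 7 });
  }

  /* What it is rewritten to: the same-size vector casts reinterpret bits
     (the view_converts), the v2df negation flips only the odd float lanes'
     signs, and a single subtraction replaces the add, sub and permute.  */
  v4sf after (v4sf a, v4sf b)
  {
    return a - (v4sf) -(v2df) b;
  }

Both functions compute { a0-b0, a1+b1, a2-b2, a3+b3 }; the second form is what lets AArch64 emit the fneg v.2d / fsub v.4s pair that the new tests below look for.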
> --- gcc/testsuite/gcc.target/aarch64/simd/addsub_2.c.jj	2023-03-22 10:22:57.324017790 +0100
> +++ gcc/testsuite/gcc.target/aarch64/simd/addsub_2.c	2023-03-22 10:23:54.482199126 +0100
> @@ -0,0 +1,56 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
> +/* { dg-options "-Ofast" } */
> +/* { dg-add-options arm_v8_2a_fp16_neon } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#pragma GCC target "+nosve"
> +
> +/*
> +** f1:
> +** ...
> +**	fneg	v[0-9]+.2d, v[0-9]+.2d
> +**	fsub	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +** ...
> +*/
> +void f1 (float *restrict a, float *restrict b, float *res, int n)
> +{
> +  for (int i = 0; i < (n & -4); i+=2)
> +    {
> +      res[i+0] = a[i+0] - b[i+0];
> +      res[i+1] = a[i+1] + b[i+1];
> +    }
> +}
> +
> +/*
> +** d1:
> +** ...
> +**	fneg	v[0-9]+.4s, v[0-9]+.4s
> +**	fsub	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
> +** ...
> +*/
> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
> +{
> +  for (int i = 0; i < (n & -8); i+=2)
> +    {
> +      res[i+0] = a[i+0] - b[i+0];
> +      res[i+1] = a[i+1] + b[i+1];
> +    }
> +}
> +
> +/*
> +** e1:
> +** ...
> +**	fsub	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +**	fadd	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +**	ins	v[0-9]+.d\[1\], v[0-9]+.d\[1\]
> +** ...
> +*/
> +void e1 (double *restrict a, double *restrict b, double *res, int n)
> +{
> +  for (int i = 0; i < (n & -4); i+=2)
> +    {
> +      res[i+0] = a[i+0] - b[i+0];
> +      res[i+1] = a[i+1] + b[i+1];
> +    }
> +}
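The e1 functions exercise the negative case: for double elements there is presumably no usable twice-as-wide vector mode on AArch64 for the wider fneg, so the pattern must not fire and the separate fsub/fadd plus a lane move have to survive, which is what the asm pattern above checks. A vector-extension sketch (not from the patch; e1_kernel is a made-up name) of the computation that stays unfolded:

  typedef double v2df __attribute__ ((vector_size (16)));
  typedef long long v2di __attribute__ ((vector_size (16)));

  /* Even lane from the subtraction, odd lane from the addition; with no
     wider IEEE element type to negate through, this permute cannot be
     folded away, hence the expected fsub, fadd and ins.  */
  v2df e1_kernel (v2df a, v2df b)
  {
    return __builtin_shuffle (a - b, a + b, (v2di) { 0, 3 });
  }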
> --- gcc/testsuite/gcc.target/aarch64/sve/addsub_2.c.jj	2023-03-22 10:24:14.169917153 +0100
> +++ gcc/testsuite/gcc.target/aarch64/sve/addsub_2.c	2023-03-22 10:25:05.414183194 +0100
> @@ -0,0 +1,52 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +/*
> +** f1:
> +** ...
> +**	fneg	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> +**	fsub	z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
> +** ...
> +*/
> +void f1 (float *restrict a, float *restrict b, float *res, int n)
> +{
> +  for (int i = 0; i < (n & -4); i+=2)
> +    {
> +      res[i+0] = a[i+0] - b[i+0];
> +      res[i+1] = a[i+1] + b[i+1];
> +    }
> +}
> +
> +/*
> +** d1:
> +** ...
> +**	fneg	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
> +**	fsub	z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
> +** ...
> +*/
> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
> +{
> +  for (int i = 0; i < (n & -8); i+=2)
> +    {
> +      res[i+0] = a[i+0] - b[i+0];
> +      res[i+1] = a[i+1] + b[i+1];
> +    }
> +}
> +
> +/*
> +** e1:
> +** ...
> +**	fadd	z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
> +**	movprfx	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
> +**	fsub	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> +** ...
> +*/
> +void e1 (double *restrict a, double *restrict b, double *res, int n)
> +{
> +  for (int i = 0; i < (n & -4); i+=2)
> +    {
> +      res[i+0] = a[i+0] - b[i+0];
> +      res[i+1] = a[i+1] + b[i+1];
> +    }
> +}
>
> 	Jakub

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)