Hi All,

This is a respin of an older patch which was never reviewed upstream by a maintainer. It has been updated to match the current GCC codegen.
This patch adds a pattern to support the (F)ADDP (scalar) instruction.

Before the patch, the C code

typedef float v4sf __attribute__((vector_size (16)));

float
foo1 (v4sf x)
{
  return x[0] + x[1];
}

generated:

foo1:
        dup     s1, v0.s[1]
        fadd    s0, s1, s0
        ret

After the patch:

foo1:
        faddp   s0, v0.2s
        ret

The double case is now handled by SLP, but the remaining cases still need help from combine. I have kept the integer and floating-point patterns separate because the integer one only supports V2DI; sharing it with the floating-point one would have required defining a few new iterators for just a single use.

The case where both elements are subregs is handled as a separate pattern, since there is no way to tell reload through constraints alone that the two registers must be equal.
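As an illustration of the difference (a hand-written RTL sketch, not a compiler dump; the pseudo-register numbers are invented), the simple pattern matches a shape along the lines of

  (set (reg:SF 98)
       (plus:SF
         (vec_select:SF (reg:V4SF 97) (parallel [(const_int 1)]))
         (subreg:SF (reg:V4SF 97) 0)))

where only one arm of the addition is a lowpart subreg, so a matching constraint can tie it to the vec_select input. In the two-subreg form the vec_select itself is applied through a lowpart subreg, so both uses of the input are subregs of the same register, and only a match_dup can express that they must be identical.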
+(define_insn "*aarch64_addp_scalar2v2di" + [(set (match_operand:DI 0 "register_operand" "=w") + (plus:DI + (vec_select:DI + (match_operator:DI 1 "subreg_lowpart_operator" + [(match_operand:V2DI 2 "register_operand" "w")]) + (parallel [(match_operand 3 "const_int_operand" "n")])) + (match_dup:DI 2)))] + "TARGET_SIMD + && ENDIAN_LANE_N (2, INTVAL (operands[3])) == 1" + "addp\t%d0, %2.2d" + [(set_attr "type" "neon_reduc_add_long")] +) + +(define_insn "*aarch64_addp_scalarv2di" + [(set (match_operand:DI 0 "register_operand" "=w") + (plus:DI + (vec_select:DI + (match_operand:V2DI 1 "register_operand" "w") + (parallel [(match_operand 2 "const_int_operand" "n")])) + (match_operand:DI 3 "register_operand" "1")))] + "TARGET_SIMD + && ENDIAN_LANE_N (2, INTVAL (operands[2])) == 1 + && SUBREG_P (operands[3]) && !SUBREG_P (operands[1]) + && subreg_lowpart_p (operands[3])" + "addp\t%d0, %1.2d" + [(set_attr "type" "neon_reduc_add_long")] +) + (define_insn "aarch64_reduc_plus_internal<mode>" [(set (match_operand:VDQV 0 "register_operand" "=w") (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")] diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c new file mode 100644 index 0000000000000000000000000000000000000000..ab904ca6a6392a3a068615f68e6b76c0716344ae --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c @@ -0,0 +1,11 @@ +/* { dg-do assemble } */ +/* { dg-additional-options "-save-temps -O1 -std=c99" } */ + +typedef long long v2di __attribute__((vector_size (16))); + +long long +foo (v2di x) +{ + return x[1] + x[0]; +} +/* { dg-final { scan-assembler-times {addp\td[0-9]+, v[0-9]+.2d} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c new file mode 100644 index 0000000000000000000000000000000000000000..2c8a05b46d8b4f7a1634bc04cc61426ba7b9ef91 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c @@ -0,0 +1,44 @@ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */ +/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-additional-options "-save-temps -O1" } */ +/* { dg-final { scan-assembler-times "dup" 4 } } */ + + +typedef double v2df __attribute__((vector_size (16))); +typedef float v4sf __attribute__((vector_size (16))); +typedef __fp16 v8hf __attribute__((vector_size (16))); + +double +foo (v2df x) +{ + return x[1] + x[0]; +} +/* { dg-final { scan-assembler-times {faddp\td[0-9]+, v[0-9]+.2d} 1 } } */ + +float +foo1 (v4sf x) +{ + return x[0] + x[1]; +} +/* { dg-final { scan-assembler-times {faddp\ts[0-9]+, v[0-9]+.2s} 1 } } */ + +__fp16 +foo2 (v8hf x) +{ + return x[0] + x[1]; +} +/* { dg-final { scan-assembler-times {faddp\th[0-9]+, v[0-9]+.2h} 1 } } */ + +float +foo3 (v4sf x) +{ + return x[2] + x[3]; +} + +__fp16 +foo4 (v8hf x) +{ + return x[6] + x[7]; +} + diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c new file mode 100644 index 0000000000000000000000000000000000000000..b24484da50cd972fe79fca6ecefdc0dbccb16bd5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c @@ -0,0 +1,14 @@ +/* { dg-do assemble } */ +/* { dg-additional-options "-save-temps -O1 -w" } */ + +typedef __m128i __attribute__((__vector_size__(2 * sizeof(long)))); +double a[]; +*b; +fn1() { + __m128i c; + *(__m128i *)a = c; + *b = a[0] + a[1]; +} + +/* { dg-final { scan-assembler-times "faddp" 1 } } */ + --