This patch adds support for auto-vectorization of average value computation using vhadd or vrhadd, for both MVE and Neon.
The patch adds the needed [u]avg<mode>3_[floor|ceil] patterns to vec-common.md, I'm not sure how to factorize them without introducing an unspec iterator? It also adds tests for 'floor' and for 'ceil', each for MVE and Neon. Vectorization works with 8-bit and 16 bit input/output vectors, but not with 32-bit ones because the vectorizer expects wider types availability for the intermediate values, but int32_t + int32_t does not involve wider types in the IR. As noted in neon-vhadd-1.c, I couldn't write a test able to use Neon vectorization with 64-bit vectors: we default to -mvectorize-with-neon-quad, and attempts to use -mvectorize-with-neon-double resulted in much worse code, which this patch does not aim at improving. 2021-05-31 Christophe Lyon <christophe.l...@linaro.org> gcc/ * gcc/config/arm/mve.md (mve_vhaddq_<supf><mode>): Prefix with '@'. (@mve_vrhaddq_<supf><mode): Likewise. * gcc/config/arm/neon.md (neon_v<r>hadd<sup><mode>): Likewise. * config/arm/vec-common.md (avg<mode>3_floor, uavg<mode>3_floor) (avg<mode>3_ceil", uavg<mode>3_ceil): New patterns. gcc/testsuite/ * gcc.target/arm/simd/mve-vhadd-1.c: New test. * gcc.target/arm/simd/mve-vhadd-2.c: New test. * gcc.target/arm/simd/neon-vhadd-1.c: New test. * gcc.target/arm/simd/neon-vhadd-2.c: New test. --- gcc/config/arm/mve.md | 4 +- gcc/config/arm/neon.md | 2 +- gcc/config/arm/vec-common.md | 60 ++++++++++++ .../gcc.target/arm/simd/mve-vhadd-1.c | 31 +++++++ .../gcc.target/arm/simd/mve-vhadd-2.c | 31 +++++++ .../gcc.target/arm/simd/neon-vhadd-1.c | 93 +++++++++++++++++++ .../gcc.target/arm/simd/neon-vhadd-2.c | 33 +++++++ 7 files changed, 251 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vhadd-1.c create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vhadd-2.c create mode 100644 gcc/testsuite/gcc.target/arm/simd/neon-vhadd-1.c create mode 100644 gcc/testsuite/gcc.target/arm/simd/neon-vhadd-2.c diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 0bfa6a91d55..04aa612331a 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -1030,7 +1030,7 @@ (define_insn "mve_vhaddq_n_<supf><mode>" ;; ;; [vhaddq_s, vhaddq_u]) ;; -(define_insn "mve_vhaddq_<supf><mode>" +(define_insn "@mve_vhaddq_<supf><mode>" [ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w") @@ -1652,7 +1652,7 @@ (define_insn "mve_vqsubq_<supf><mode>" ;; ;; [vrhaddq_s, vrhaddq_u]) ;; -(define_insn "mve_vrhaddq_<supf><mode>" +(define_insn "@mve_vrhaddq_<supf><mode>" [ (set (match_operand:MVE_2 0 "s_register_operand" "=w") (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w") diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 077c62ffd20..18571d819eb 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -1488,7 +1488,7 @@ (define_insn "neon_vaddw<sup><mode>" ; vhadd and vrhadd. -(define_insn "neon_v<r>hadd<sup><mode>" +(define_insn "@neon_v<r>hadd<sup><mode>" [(set (match_operand:VDQIW 0 "s_register_operand" "=w") (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w") (match_operand:VDQIW 2 "s_register_operand" "w")] diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 80b273229f5..2779c1a8aaa 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -565,3 +565,63 @@ (define_expand "reduc_plus_scal_<mode>" DONE; }) + +(define_expand "avg<mode>3_floor" + [(match_operand:MVE_2 0 "s_register_operand") + (match_operand:MVE_2 1 "s_register_operand") + (match_operand:MVE_2 2 "s_register_operand")] + "ARM_HAVE_<MODE>_ARITH" +{ + if (TARGET_HAVE_MVE) + emit_insn (gen_mve_vhaddq (VHADDQ_S, <MODE>mode, + operands[0], operands[1], operands[2])); + else + emit_insn (gen_neon_vhadd (UNSPEC_VHADD_S, UNSPEC_VHADD_S, <MODE>mode, + operands[0], operands[1], operands[2])); + DONE; +}) + +(define_expand "uavg<mode>3_floor" + [(match_operand:MVE_2 0 "s_register_operand") + (match_operand:MVE_2 1 "s_register_operand") + (match_operand:MVE_2 2 "s_register_operand")] + "ARM_HAVE_<MODE>_ARITH" +{ + if (TARGET_HAVE_MVE) + emit_insn (gen_mve_vhaddq (VHADDQ_U, <MODE>mode, + operands[0], operands[1], operands[2])); + else + emit_insn (gen_neon_vhadd (UNSPEC_VHADD_U, UNSPEC_VHADD_U, <MODE>mode, + operands[0], operands[1], operands[2])); + DONE; +}) + +(define_expand "avg<mode>3_ceil" + [(match_operand:MVE_2 0 "s_register_operand") + (match_operand:MVE_2 1 "s_register_operand") + (match_operand:MVE_2 2 "s_register_operand")] + "ARM_HAVE_<MODE>_ARITH" +{ + if (TARGET_HAVE_MVE) + emit_insn (gen_mve_vrhaddq (VRHADDQ_S, <MODE>mode, + operands[0], operands[1], operands[2])); + else + emit_insn (gen_neon_vhadd (UNSPEC_VRHADD_S, UNSPEC_VRHADD_S, <MODE>mode, + operands[0], operands[1], operands[2])); + DONE; +}) + +(define_expand "uavg<mode>3_ceil" + [(match_operand:MVE_2 0 "s_register_operand") + (match_operand:MVE_2 1 "s_register_operand") + (match_operand:MVE_2 2 "s_register_operand")] + "ARM_HAVE_<MODE>_ARITH" +{ + if (TARGET_HAVE_MVE) + emit_insn (gen_mve_vrhaddq (VRHADDQ_U, <MODE>mode, + operands[0], operands[1], operands[2])); + else + emit_insn (gen_neon_vhadd (UNSPEC_VRHADD_U, UNSPEC_VRHADD_U, <MODE>mode, + operands[0], operands[1], operands[2])); + DONE; +}) diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vhadd-1.c b/gcc/testsuite/gcc.target/arm/simd/mve-vhadd-1.c new file mode 100644 index 00000000000..40489ecc67d --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vhadd-1.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-add-options arm_v8_1m_mve } */ +/* { dg-additional-options "-O3" } */ + +#include <stdint.h> + +#define FUNC(SIGN, TYPE, BITS, OP, NAME) \ + void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##BITS##_t * __restrict__ dest, \ + TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \ + int i; \ + for (i=0; i < (128 / BITS); i++) { \ + dest[i] = (a[i] OP b[i]) >> 1; \ + } \ +} + +FUNC(s, int, 32, +, vhadd) +FUNC(u, uint, 32, +, vhadd) +FUNC(s, int, 16, +, vhadd) +FUNC(u, uint, 16, +, vhadd) +FUNC(s, int, 8, +, vhadd) +FUNC(u, uint, 8, +, vhadd) + +/* We cannot make use of the 32 bits version because the vectorizer needs an + extended precision for the intermediate computations. */ +/* { dg-final { scan-assembler-times {vhadd\.s32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {vhadd\.u32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {vhadd\.s16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vhadd\.u16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vhadd\.s8\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vhadd\.u8\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vhadd-2.c b/gcc/testsuite/gcc.target/arm/simd/mve-vhadd-2.c new file mode 100644 index 00000000000..9bf145f9457 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vhadd-2.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-add-options arm_v8_1m_mve } */ +/* { dg-additional-options "-O3" } */ + +#include <stdint.h> + +#define FUNC(SIGN, TYPE, BITS, OP, NAME) \ + void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##BITS##_t * __restrict__ dest, \ + TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \ + int i; \ + for (i=0; i < (128 / BITS); i++) { \ + dest[i] = (a[i] OP b[i] + 1) >> 1; \ + } \ +} + +FUNC(s, int, 32, +, vrhadd) +FUNC(u, uint, 32, +, vrhadd) +FUNC(s, int, 16, +, vrhadd) +FUNC(u, uint, 16, +, vrhadd) +FUNC(s, int, 8, +, vrhadd) +FUNC(u, uint, 8, +, vrhadd) + +/* We cannot make use of the 32 bits version because the vectorizer needs an + extended precision for the intermediate computations. */ +/* { dg-final { scan-assembler-times {vrhadd\.s32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {vrhadd\.u32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {vrhadd\.s16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vrhadd\.u16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vrhadd\.s8\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vrhadd\.u8\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ diff --git a/gcc/testsuite/gcc.target/arm/simd/neon-vhadd-1.c b/gcc/testsuite/gcc.target/arm/simd/neon-vhadd-1.c new file mode 100644 index 00000000000..e141c2895a8 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/neon-vhadd-1.c @@ -0,0 +1,93 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_neon_ok } */ +/* { dg-add-options arm_neon } */ +/* { dg-additional-options "-O3" } */ + +#include <stdint.h> + +/* Since we default to -mvectorize-with-neon-quad, use enough iterations so that + we can vectorize using 128-bit vectors. */ +/* For instance with int16_t: + 8 iterations and -mvectorize-with-neon-quad, we generate: +test_vhadd_s16: + vld1.16 {q8}, [r1] + vld1.16 {q9}, [r2] + vhadd.s16 q8, q8, q9 + vst1.16 {q8}, [r0] + bx lr + + 8 iterations and -mvectorize-with-neon-double, we generate: +test_vhadd_s16: + vld1.16 {d24}, [r1]! + vld1.16 {d26}, [r2]! + vld1.16 {d22}, [r1] + vmovl.s16 q12, d24 + vld1.16 {d20}, [r2] + vmovl.s16 q13, d26 + vmovl.s16 q11, d22 + vmovl.s16 q10, d20 + vadd.i32 d28, d26, d24 + vadd.i32 d24, d27, d25 + vadd.i32 d25, d22, d20 + vadd.i32 d20, d23, d21 + vshr.s32 d18, d28, #1 + vshr.s32 d19, d24, #1 + vshr.s32 d16, d25, #1 + vshr.s32 d17, d20, #1 + vmovn.i32 d18, q9 + vmovn.i32 d16, q8 + vst1.16 {d18}, [r0]! + vst1.16 {d16}, [r0] + bx lr + + Adding a cast to avoid integer promotion: + dest[i] = (int16_t)(a[i] + b[i]) >> 1 + + 8 iterations and -mvectorize-with-neon-quad, we generate: +test_vhadd_s16: + vld1.16 {q8}, [r2] + vld1.16 {q9}, [r1] + vadd.i16 q8, q8, q9 + vshr.s16 q8, q8, #1 + vst1.16 {q8}, [r0] + bx lr + + 8 iterations and -mvectorize-with-neon-double, we generate: +test_vhadd_s16: + vld1.16 {d17}, [r1]! + vld1.16 {d19}, [r2]! + vld1.16 {d18}, [r1] + vld1.16 {d16}, [r2] + vadd.i16 d17, d17, d19 + vadd.i16 d16, d16, d18 + vshr.s16 d17, d17, #1 + vshr.s16 d16, d16, #1 + vst1.16 {d17}, [r0]! + vst1.16 {d16}, [r0] + bx lr + + */ +#define FUNC(SIGN, TYPE, BITS, OP, NAME) \ + void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##BITS##_t * __restrict__ dest, \ + TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \ + int i; \ + for (i=0; i < (128 / BITS); i++) { \ + dest[i] = (a[i] OP b[i]) >> 1; \ + } \ +} + +FUNC(s, int, 32, +, vhadd) +FUNC(u, uint, 32, +, vhadd) +FUNC(s, int, 16, +, vhadd) +FUNC(u, uint, 16, +, vhadd) +FUNC(s, int, 8, +, vhadd) +FUNC(u, uint, 8, +, vhadd) + +/* We cannot make use of the 32 bits version because the vectorizer needs an + extended precision for the intermediate computations. */ +/* { dg-final { scan-assembler-times {vhadd\.s32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {vhadd\.u32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {vhadd\.s16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vhadd\.u16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vhadd\.s8\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vhadd\.u8\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ diff --git a/gcc/testsuite/gcc.target/arm/simd/neon-vhadd-2.c b/gcc/testsuite/gcc.target/arm/simd/neon-vhadd-2.c new file mode 100644 index 00000000000..7938d89a77c --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/neon-vhadd-2.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_neon_ok } */ +/* { dg-add-options arm_neon } */ +/* { dg-additional-options "-O3" } */ + +#include <stdint.h> + +/* Since we default to -mvectorize-with-neon-quad, use enough iterations so that + we can vectorize using 128-bit vectors. */ +#define FUNC(SIGN, TYPE, BITS, OP, NAME) \ + void test_ ## NAME ##_ ## SIGN ## BITS (TYPE##BITS##_t * __restrict__ dest, \ + TYPE##BITS##_t *a, TYPE##BITS##_t *b) { \ + int i; \ + for (i=0; i < (128 / BITS); i++) { \ + dest[i] = (a[i] OP b[i] + 1) >> 1; \ + } \ +} + +FUNC(s, int, 32, +, vrhadd) +FUNC(u, uint, 32, +, vrhadd) +FUNC(s, int, 16, +, vrhadd) +FUNC(u, uint, 16, +, vrhadd) +FUNC(s, int, 8, +, vrhadd) +FUNC(u, uint, 8, +, vrhadd) + +/* We cannot make use of the 32 bits version because the vectorizer needs an + extended precision for the intermediate computations. */ +/* { dg-final { scan-assembler-times {vrhadd\.s32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {vrhadd\.u32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-times {vrhadd\.s16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vrhadd\.u16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vrhadd\.s8\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vrhadd\.u8\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ -- 2.25.1