Hi all, This patch enables MVE vmul instructions for auto-vectorization. It includes MVE in expander mul<mode>3 to enable vectorization for MVE and modifies related vmul insns to support the expander by using 'mult' instead of unspec. The mul<mode>3 for vectorization in vec-common.md uses mode iterator VDQWH instead of VALLW to cover all supported modes. The macros ARM_HAVE_<MODE>_ARITH are used to select supported modes for different targets. The redundant mul<mode>3 in neon.md is removed.
Regression tested on arm-none-eabi and bootstraped on arm-none-linux-gnueabihf. Is it OK for trunk please? Thanks Dennis gcc/ChangeLog: 2020-10-02 Dennis Zhang <dennis.zh...@arm.com> * config/arm/mve.md (mve_vmulq<mode>): New entry for vmul instruction using expression 'mult'. (mve_vmulq_f<mode>): Use mult instead of VMULQ_F. * config/arm/neon.md (mul<mode>3): Removed. * config/arm/vec-common.md (mul<mode>3): Use the new mode macros ARM_HAVE_<MODE>_ARITH. Use mode iterator VDQWH instead of VALLW. gcc/testsuite/ChangeLog: 2020-10-02 Dennis Zhang <dennis.zh...@arm.com> * gcc.target/arm/simd/mve-vmul_1.c: New test.
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 3a57901bd5b..5b2b609174c 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -2199,6 +2199,17 @@ [(set_attr "type" "mve_move") ]) +(define_insn "mve_vmulq<mode>" + [ + (set (match_operand:MVE_2 0 "s_register_operand" "=w") + (mult:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w") + (match_operand:MVE_2 2 "s_register_operand" "w"))) + ] + "TARGET_HAVE_MVE" + "vmul.i%#<V_sz_elem>\t%q0, %q1, %q2" + [(set_attr "type" "mve_move") +]) + ;; ;; [vornq_u, vornq_s]) ;; @@ -3210,9 +3221,8 @@ (define_insn "mve_vmulq_f<mode>" [ (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VMULQ_F)) + (mult:MVE_0 (match_operand:MVE_0 1 "s_register_operand" "w") + (match_operand:MVE_0 2 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" "vmul.f%#<V_sz_elem> %q0, %q1, %q2" diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 96bf277f501..f6632f1a25a 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -1899,17 +1899,6 @@ (const_string "neon_mul_<V_elem_ch><q>")))] ) -(define_insn "mul<mode>3" - [(set - (match_operand:VH 0 "s_register_operand" "=w") - (mult:VH - (match_operand:VH 1 "s_register_operand" "w") - (match_operand:VH 2 "s_register_operand" "w")))] - "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations" - "vmul.f16\t%<V_reg>0, %<V_reg>1, %<V_reg>2" - [(set_attr "type" "neon_mul_<VH_elem_ch><q>")] -) - (define_insn "neon_vmulf<mode>" [(set (match_operand:VH 0 "s_register_operand" "=w") diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index c3c86c46355..45db60e7411 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -101,14 +101,11 @@ }) (define_expand "mul<mode>3" - [(set (match_operand:VALLW 0 "s_register_operand") - (mult:VALLW (match_operand:VALLW 1 "s_register_operand") - (match_operand:VALLW 2 "s_register_operand")))] - "(TARGET_NEON && ((<MODE>mode != V2SFmode && <MODE>mode != V4SFmode) - || flag_unsafe_math_optimizations)) - || (<MODE>mode == V4HImode && TARGET_REALLY_IWMMXT)" -{ -}) + [(set (match_operand:VDQWH 0 "s_register_operand") + (mult:VDQWH (match_operand:VDQWH 1 "s_register_operand") + (match_operand:VDQWH 2 "s_register_operand")))] + "ARM_HAVE_<MODE>_ARITH" +) (define_expand "smin<mode>3" [(set (match_operand:VALLW 0 "s_register_operand") diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vmul_1.c b/gcc/testsuite/gcc.target/arm/simd/mve-vmul_1.c new file mode 100644 index 00000000000..514f292c15e --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vmul_1.c @@ -0,0 +1,64 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ +/* { dg-add-options arm_v8_1m_mve_fp } */ +/* { dg-additional-options "-O3" } */ + +#include <stdint.h> + +void test_vmul_i32 (int32_t * dest, int32_t * a, int32_t * b) { + int i; + for (i=0; i<4; i++) { + dest[i] = a[i] * b[i]; + } +} + +void test_vmul_i32_u (uint32_t * dest, uint32_t * a, uint32_t * b) { + int i; + for (i=0; i<4; i++) { + dest[i] = a[i] * b[i]; + } +} + +/* { dg-final { scan-assembler-times {vmul\.i32\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */ + +void test_vmul_i16 (int16_t * dest, int16_t * a, int16_t * b) { + int i; + for (i=0; i<8; i++) { + dest[i] = a[i] * b[i]; + } +} + +void test_vmul_i16_u (uint16_t * dest, uint16_t * a, uint16_t * b) { + int i; + for (i=0; i<8; i++) { + dest[i] = a[i] * b[i]; + } +} + +/* { dg-final { scan-assembler-times {vmul\.i16\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */ + +void test_vmul_i8 (int8_t * dest, int8_t * a, int8_t * b) { + int i; + for (i=0; i<16; i++) { + dest[i] = a[i] * b[i]; + } +} + +void test_vmul_i8_u (uint8_t * dest, uint8_t * a, uint8_t * b) { + int i; + for (i=0; i<16; i++) { + dest[i] = a[i] * b[i]; + } +} + +/* { dg-final { scan-assembler-times {vmul\.i8\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */ + +void test_vmul_f32 (float * dest, float * a, float * b) { + int i; + for (i=0; i<4; i++) { + dest[i] = a[i] * b[i]; + } +} + +/* { dg-final { scan-assembler-times {vmul\.f32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ +