The ARMv8.1 architecture extension adds two Adv.SIMD instructions, sqrdmlah and sqrdmlsh (signed saturating rounding doubling multiply accumulate/subtract, returning the high half). This patch adds the NEON intrinsics vqrdmlah and vqrdmlsh for these instructions. The new intrinsics are of the form vqrdml{as}h[q]_<type>.
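For reference, a minimal use of one of the new intrinsics might look like the following (an illustrative example, not part of the patch; the function name is invented, and it assumes a compiler carrying this patch with -march=armv8.1-a):

  #include <arm_neon.h>

  /* Per lane: acc = saturate (round (acc + 2 * b * c / 2^16)), performed
     in one fused rounding/saturation step; with this patch it should
     compile to a single sqrdmlah instruction.  */
  int16x4_t
  accumulate_high (int16x4_t acc, int16x4_t b, int16x4_t c)
  {
    return vqrdmlah_s16 (acc, b, c);
  }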
Tested the series for aarch64-none-linux-gnu with native bootstrap and make check on an ARMv8 architecture. Also tested aarch64-none-elf with cross-compiled check-gcc on an ARMv8.1 emulator.

Ok for trunk?
Matthew

gcc/
2015-10-23  Matthew Wahab  <matthew.wa...@arm.com>

	* config/aarch64/arm_neon.h (vqrdmlah_s16, vqrdmlah_s32): New.
	(vqrdmlahq_s16, vqrdmlahq_s32): New.
	(vqrdmlsh_s16, vqrdmlsh_s32): New.
	(vqrdmlshq_s16, vqrdmlshq_s32): New.

gcc/testsuite/
2015-10-23  Matthew Wahab  <matthew.wa...@arm.com>

	* gcc.target/aarch64/advsimd-intrinsics/vqrdmlXh.inc: New file,
	support code for vqrdml{as}h tests.
	* gcc.target/aarch64/advsimd-intrinsics/vqrdmlah.c: New.
	* gcc.target/aarch64/advsimd-intrinsics/vqrdmlsh.c: New.
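Before the patch itself, a sketch of the lane semantics being implemented (my reading of the ARMv8.1 SQRDMLAH pseudocode; this helper is illustrative and not code from the patch). The accumulate is fused with the doubling multiply before the single rounding shift and saturation, unlike the pre-ARMv8.1 vqrdmulh-then-vqadd sequence, which rounds and saturates twice:

  #include <stdint.h>

  /* Rough scalar model of one 16-bit lane of sqrdmlah.  An arithmetic
     right shift is assumed for the rounding step.  */
  static int16_t
  sqrdmlah_lane (int16_t a, int16_t b, int16_t c)
  {
    int64_t t = ((int64_t) a << 16) + 2 * (int64_t) b * c + (1 << 15);
    t >>= 16;              /* rounding shift */
    if (t > INT16_MAX)     /* on saturation, hardware also sets QC */
      return INT16_MAX;
    if (t < INT16_MIN)
      return INT16_MIN;
    return (int16_t) t;
  }

For sqrdmlsh, the 2*b*c term is subtracted instead of added.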
>From 611e1232a59dfe42f2cd9666680407d67abcfea5 Mon Sep 17 00:00:00 2001
From: Matthew Wahab <matthew.wa...@arm.com>
Date: Thu, 27 Aug 2015 13:22:41 +0100
Subject: [PATCH 6/7] Add neon intrinsics: vqrdmlah, vqrdmlsh.

Change-Id: I5c7f8d36ee980d280c1d50f6f212b286084c5acf
---
 gcc/config/aarch64/arm_neon.h                   |  53 ++++++++
 .../aarch64/advsimd-intrinsics/vqrdmlXh.inc     | 138 +++++++++++++++++++++
 .../aarch64/advsimd-intrinsics/vqrdmlah.c       |  57 +++++++++
 .../aarch64/advsimd-intrinsics/vqrdmlsh.c       |  61 +++++++++
 4 files changed, 309 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlXh.inc
 create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlah.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlsh.c

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index e186348..9e73809 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -2649,6 +2649,59 @@ vqrdmulhq_s32 (int32x4_t __a, int32x4_t __b)
   return (int32x4_t) __builtin_aarch64_sqrdmulhv4si (__a, __b);
 }
 
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.1-a")
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqrdmlah_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+  return (int16x4_t) __builtin_aarch64_sqrdmlahv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqrdmlah_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+  return (int32x2_t) __builtin_aarch64_sqrdmlahv2si (__a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqrdmlahq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
+{
+  return (int16x8_t) __builtin_aarch64_sqrdmlahv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqrdmlahq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
+{
+  return (int32x4_t) __builtin_aarch64_sqrdmlahv4si (__a, __b, __c);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqrdmlsh_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+  return (int16x4_t) __builtin_aarch64_sqrdmlshv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqrdmlsh_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+  return (int32x2_t) __builtin_aarch64_sqrdmlshv2si (__a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqrdmlshq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
+{
+  return (int16x8_t) __builtin_aarch64_sqrdmlshv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqrdmlshq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
+{
+  return (int32x4_t) __builtin_aarch64_sqrdmlshv4si (__a, __b, __c);
+}
+
+#pragma GCC pop_options
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vcreate_s8 (uint64_t __a)
 {
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlXh.inc b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlXh.inc
new file mode 100644
index 0000000..a504ca6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlXh.inc
@@ -0,0 +1,138 @@
+#define FNNAME1(NAME) exec_ ## NAME
+#define FNNAME(NAME) FNNAME1 (NAME)
+
+void FNNAME (INSN) (void)
+{
+  /* vector_res = vqrdmlah (vector, vector2, vector3),
+     then store the result.  */
+#define TEST_VQRDMLAH2(INSN, Q, T1, T2, W, N, EXPECTED_CUMULATIVE_SAT, CMT) \
+  Set_Neon_Cumulative_Sat (0, VECT_VAR (vector_res, T1, W, N));         \
+  VECT_VAR (vector_res, T1, W, N) =                                     \
+    INSN##Q##_##T2##W (VECT_VAR (vector, T1, W, N),                     \
+                       VECT_VAR (vector2, T1, W, N),                    \
+                       VECT_VAR (vector3, T1, W, N));                   \
+  vst1##Q##_##T2##W (VECT_VAR (result, T1, W, N),                       \
+                     VECT_VAR (vector_res, T1, W, N));                  \
+  CHECK_CUMULATIVE_SAT (TEST_MSG, T1, W, N,                             \
+                        EXPECTED_CUMULATIVE_SAT, CMT)
+
+  /* Two auxiliary macros are necessary to expand INSN.  */
+#define TEST_VQRDMLAH1(INSN, Q, T1, T2, W, N, EXPECTED_CUMULATIVE_SAT, CMT) \
+  TEST_VQRDMLAH2 (INSN, Q, T1, T2, W, N, EXPECTED_CUMULATIVE_SAT, CMT)
+
+#define TEST_VQRDMLAH(Q, T1, T2, W, N, EXPECTED_CUMULATIVE_SAT, CMT)    \
+  TEST_VQRDMLAH1 (INSN, Q, T1, T2, W, N, EXPECTED_CUMULATIVE_SAT, CMT)
+
+  DECL_VARIABLE (vector, int, 16, 4);
+  DECL_VARIABLE (vector, int, 32, 2);
+  DECL_VARIABLE (vector, int, 16, 8);
+  DECL_VARIABLE (vector, int, 32, 4);
+
+  DECL_VARIABLE (vector_res, int, 16, 4);
+  DECL_VARIABLE (vector_res, int, 32, 2);
+  DECL_VARIABLE (vector_res, int, 16, 8);
+  DECL_VARIABLE (vector_res, int, 32, 4);
+
+  DECL_VARIABLE (vector2, int, 16, 4);
+  DECL_VARIABLE (vector2, int, 32, 2);
+  DECL_VARIABLE (vector2, int, 16, 8);
+  DECL_VARIABLE (vector2, int, 32, 4);
+
+  DECL_VARIABLE (vector3, int, 16, 4);
+  DECL_VARIABLE (vector3, int, 32, 2);
+  DECL_VARIABLE (vector3, int, 16, 8);
+  DECL_VARIABLE (vector3, int, 32, 4);
+
+  clean_results ();
+
+  VLOAD (vector, buffer, , int, s, 16, 4);
+  VLOAD (vector, buffer, , int, s, 32, 2);
+  VLOAD (vector, buffer, q, int, s, 16, 8);
+  VLOAD (vector, buffer, q, int, s, 32, 4);
+
+  /* Initialize vector2.  */
+  VDUP (vector2, , int, s, 16, 4, 0x5555);
+  VDUP (vector2, , int, s, 32, 2, 0xBB);
+  VDUP (vector2, q, int, s, 16, 8, 0xBB);
+  VDUP (vector2, q, int, s, 32, 4, 0x22);
+
+  /* Initialize vector3.  */
+  VDUP (vector3, , int, s, 16, 4, 0x5555);
+  VDUP (vector3, , int, s, 32, 2, 0xBB);
+  VDUP (vector3, q, int, s, 16, 8, 0x33);
+  VDUP (vector3, q, int, s, 32, 4, 0x22);
+
+#define CMT ""
+  TEST_VQRDMLAH ( , int, s, 16, 4, expected_cumulative_sat, CMT);
+  TEST_VQRDMLAH ( , int, s, 32, 2, expected_cumulative_sat, CMT);
+  TEST_VQRDMLAH (q, int, s, 16, 8, expected_cumulative_sat, CMT);
+  TEST_VQRDMLAH (q, int, s, 32, 4, expected_cumulative_sat, CMT);
+
+  CHECK (TEST_MSG, int, 16, 4, PRIx16, expected, CMT);
+  CHECK (TEST_MSG, int, 32, 2, PRIx32, expected, CMT);
+  CHECK (TEST_MSG, int, 16, 8, PRIx16, expected, CMT);
+  CHECK (TEST_MSG, int, 32, 4, PRIx32, expected, CMT);
+
+  /* Now use input values such that the multiplication causes
+     saturation.  */
+#define TEST_MSG_MUL " (check mul cumulative saturation)"
+  VDUP (vector, , int, s, 16, 4, 0x8000);
+  VDUP (vector, , int, s, 32, 2, 0x80000000);
+  VDUP (vector, q, int, s, 16, 8, 0x8000);
+  VDUP (vector, q, int, s, 32, 4, 0x80000000);
+  VDUP (vector2, , int, s, 16, 4, 0x8000);
+  VDUP (vector2, , int, s, 32, 2, 0x80000000);
+  VDUP (vector2, q, int, s, 16, 8, 0x8000);
+  VDUP (vector2, q, int, s, 32, 4, 0x80000000);
+  VDUP (vector3, , int, s, 16, 4, 0x8000);
+  VDUP (vector3, , int, s, 32, 2, 0x80000000);
+  VDUP (vector3, q, int, s, 16, 8, 0x8000);
+  VDUP (vector3, q, int, s, 32, 4, 0x80000000);
+
+  TEST_VQRDMLAH ( , int, s, 16, 4, expected_cumulative_sat_mul, TEST_MSG_MUL);
+  TEST_VQRDMLAH ( , int, s, 32, 2, expected_cumulative_sat_mul, TEST_MSG_MUL);
+  TEST_VQRDMLAH (q, int, s, 16, 8, expected_cumulative_sat_mul, TEST_MSG_MUL);
+  TEST_VQRDMLAH (q, int, s, 32, 4, expected_cumulative_sat_mul, TEST_MSG_MUL);
+
+  CHECK (TEST_MSG, int, 16, 4, PRIx16, expected_mul, TEST_MSG_MUL);
+  CHECK (TEST_MSG, int, 32, 2, PRIx32, expected_mul, TEST_MSG_MUL);
+  CHECK (TEST_MSG, int, 16, 8, PRIx16, expected_mul, TEST_MSG_MUL);
+  CHECK (TEST_MSG, int, 32, 4, PRIx32, expected_mul, TEST_MSG_MUL);
+
+  /* Use input values where rounding produces a result equal to the
+     saturation value, but does not set the saturation flag.  */
+#define TEST_MSG_ROUND " (check rounding)"
+  VDUP (vector, , int, s, 16, 4, 0x8000);
+  VDUP (vector, , int, s, 32, 2, 0x80000000);
+  VDUP (vector, q, int, s, 16, 8, 0x8000);
+  VDUP (vector, q, int, s, 32, 4, 0x80000000);
+  VDUP (vector2, , int, s, 16, 4, 0x8001);
+  VDUP (vector2, , int, s, 32, 2, 0x80000001);
+  VDUP (vector2, q, int, s, 16, 8, 0x8001);
+  VDUP (vector2, q, int, s, 32, 4, 0x80000001);
+  VDUP (vector3, , int, s, 16, 4, 0x8001);
+  VDUP (vector3, , int, s, 32, 2, 0x80000001);
+  VDUP (vector3, q, int, s, 16, 8, 0x8001);
+  VDUP (vector3, q, int, s, 32, 4, 0x80000001);
+
+  TEST_VQRDMLAH ( , int, s, 16, 4, expected_cumulative_sat_round, \
+                  TEST_MSG_ROUND);
+  TEST_VQRDMLAH ( , int, s, 32, 2, expected_cumulative_sat_round, \
+                  TEST_MSG_ROUND);
+  TEST_VQRDMLAH (q, int, s, 16, 8, expected_cumulative_sat_round, \
+                  TEST_MSG_ROUND);
+  TEST_VQRDMLAH (q, int, s, 32, 4, expected_cumulative_sat_round, \
+                  TEST_MSG_ROUND);
+
+  CHECK (TEST_MSG, int, 16, 4, PRIx16, expected_round, TEST_MSG_ROUND);
+  CHECK (TEST_MSG, int, 32, 2, PRIx32, expected_round, TEST_MSG_ROUND);
+  CHECK (TEST_MSG, int, 16, 8, PRIx16, expected_round, TEST_MSG_ROUND);
+  CHECK (TEST_MSG, int, 32, 4, PRIx32, expected_round, TEST_MSG_ROUND);
+}
+
+int
+main (void)
+{
+  FNNAME (INSN) ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlah.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlah.c
new file mode 100644
index 0000000..148d94c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlah.c
@@ -0,0 +1,57 @@
+/* { dg-require-effective-target arm_v8_1a_neon_hw } */
+/* { dg-add-options arm_v8_1a_neon } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected values of cumulative_saturation flag.  */
+int VECT_VAR (expected_cumulative_sat, int, 16, 4) = 0;
+int VECT_VAR (expected_cumulative_sat, int, 32, 2) = 0;
+int VECT_VAR (expected_cumulative_sat, int, 16, 8) = 0;
+int VECT_VAR (expected_cumulative_sat, int, 32, 4) = 0;
+
+/* Expected results.  */
+VECT_VAR_DECL (expected, int, 16, 4) [] = { 0x38d3, 0x38d4, 0x38d5, 0x38d6 };
+VECT_VAR_DECL (expected, int, 32, 2) [] = { 0xfffffff0, 0xfffffff1 };
+VECT_VAR_DECL (expected, int, 16, 8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
+                                            0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL (expected, int, 32, 4) [] = { 0xfffffff0, 0xfffffff1,
+                                            0xfffffff2, 0xfffffff3 };
+
+/* Expected values of cumulative_saturation flag when multiplication
+   saturates.  */
+int VECT_VAR (expected_cumulative_sat_mul, int, 16, 4) = 0;
+int VECT_VAR (expected_cumulative_sat_mul, int, 32, 2) = 0;
+int VECT_VAR (expected_cumulative_sat_mul, int, 16, 8) = 0;
+int VECT_VAR (expected_cumulative_sat_mul, int, 32, 4) = 0;
+
+/* Expected results when multiplication saturates.  */
+VECT_VAR_DECL (expected_mul, int, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mul, int, 32, 2) [] = { 0x0, 0x0 };
+VECT_VAR_DECL (expected_mul, int, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
+                                                0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mul, int, 32, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+
+/* Expected values of cumulative_saturation flag when rounding
+   should not cause saturation.  */
+int VECT_VAR (expected_cumulative_sat_round, int, 16, 4) = 0;
+int VECT_VAR (expected_cumulative_sat_round, int, 32, 2) = 0;
+int VECT_VAR (expected_cumulative_sat_round, int, 16, 8) = 0;
+int VECT_VAR (expected_cumulative_sat_round, int, 32, 4) = 0;
+
+/* Expected results when rounding should not cause saturation.  */
+VECT_VAR_DECL (expected_round, int, 16, 4) [] = { 0xfffe, 0xfffe,
+                                                  0xfffe, 0xfffe };
+VECT_VAR_DECL (expected_round, int, 32, 2) [] = { 0xfffffffe, 0xfffffffe };
+VECT_VAR_DECL (expected_round, int, 16, 8) [] = { 0xfffe, 0xfffe,
+                                                  0xfffe, 0xfffe,
+                                                  0xfffe, 0xfffe,
+                                                  0xfffe, 0xfffe };
+VECT_VAR_DECL (expected_round, int, 32, 4) [] = { 0xfffffffe, 0xfffffffe,
+                                                  0xfffffffe, 0xfffffffe };
+
+#define INSN vqrdmlah
+#define TEST_MSG "VQRDMLAH"
+
+#include "vqrdmlXh.inc"
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlsh.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlsh.c
new file mode 100644
index 0000000..91c3b34
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlsh.c
@@ -0,0 +1,61 @@
+/* { dg-require-effective-target arm_v8_1a_neon_hw } */
+/* { dg-add-options arm_v8_1a_neon } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected values of cumulative_saturation flag.  */
+int VECT_VAR (expected_cumulative_sat, int, 16, 4) = 0;
+int VECT_VAR (expected_cumulative_sat, int, 32, 2) = 0;
+int VECT_VAR (expected_cumulative_sat, int, 16, 8) = 0;
+int VECT_VAR (expected_cumulative_sat, int, 32, 4) = 0;
+
+/* Expected results.  */
+VECT_VAR_DECL (expected, int, 16, 4) [] = { 0xc70d, 0xc70e, 0xc70f, 0xc710 };
+VECT_VAR_DECL (expected, int, 32, 2) [] = { 0xfffffff0, 0xfffffff1 };
+VECT_VAR_DECL (expected, int, 16, 8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
+                                            0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL (expected, int, 32, 4) [] = { 0xfffffff0, 0xfffffff1,
+                                            0xfffffff2, 0xfffffff3 };
+
+/* Expected values of cumulative_saturation flag when multiplication
+   saturates.  */
+int VECT_VAR (expected_cumulative_sat_mul, int, 16, 4) = 1;
+int VECT_VAR (expected_cumulative_sat_mul, int, 32, 2) = 1;
+int VECT_VAR (expected_cumulative_sat_mul, int, 16, 8) = 1;
+int VECT_VAR (expected_cumulative_sat_mul, int, 32, 4) = 1;
+
+/* Expected results when multiplication saturates.  */
+VECT_VAR_DECL (expected_mul, int, 16, 4) [] = { 0x8000, 0x8000,
+                                                0x8000, 0x8000 };
+VECT_VAR_DECL (expected_mul, int, 32, 2) [] = { 0x80000000, 0x80000000 };
+VECT_VAR_DECL (expected_mul, int, 16, 8) [] = { 0x8000, 0x8000,
+                                                0x8000, 0x8000,
+                                                0x8000, 0x8000,
+                                                0x8000, 0x8000 };
+VECT_VAR_DECL (expected_mul, int, 32, 4) [] = { 0x80000000, 0x80000000,
+                                                0x80000000, 0x80000000 };
+
+/* Expected values of cumulative_saturation flag for the rounding
+   inputs, which do cause saturation for vqrdmlsh.  */
+int VECT_VAR (expected_cumulative_sat_round, int, 16, 4) = 1;
+int VECT_VAR (expected_cumulative_sat_round, int, 32, 2) = 1;
+int VECT_VAR (expected_cumulative_sat_round, int, 16, 8) = 1;
+int VECT_VAR (expected_cumulative_sat_round, int, 32, 4) = 1;
+
+/* Expected results for the rounding inputs.  */
+VECT_VAR_DECL (expected_round, int, 16, 4) [] = { 0x8000, 0x8000,
+                                                  0x8000, 0x8000 };
+VECT_VAR_DECL (expected_round, int, 32, 2) [] = { 0x80000000, 0x80000000 };
+VECT_VAR_DECL (expected_round, int, 16, 8) [] = { 0x8000, 0x8000,
+                                                  0x8000, 0x8000,
+                                                  0x8000, 0x8000,
+                                                  0x8000, 0x8000 };
+VECT_VAR_DECL (expected_round, int, 32, 4) [] = { 0x80000000, 0x80000000,
+                                                  0x80000000, 0x80000000 };
+
+#define INSN vqrdmlsh
+#define TEST_MSG "VQRDMLSH"
+
+#include "vqrdmlXh.inc"
-- 
2.1.4
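For anyone checking the rounding expectations by hand: under the scalar model sketched before the patch, the 16-bit lane arithmetic for the test's rounding inputs (a = 0x8000, b = c = 0x8001) works out as below. This is my own calculation, not text from the patch:

  #include <stdint.h>
  #include <stdio.h>

  static int16_t
  sat16 (int64_t t)
  {
    return t > INT16_MAX ? INT16_MAX : t < INT16_MIN ? INT16_MIN : (int16_t) t;
  }

  int
  main (void)
  {
    int64_t a = (int16_t) 0x8000, b = (int16_t) 0x8001;

    /* vqrdmlah: (-2147483648 + 2147352578 + 32768) >> 16 = -2; in range,
       so the lane is 0xfffe with no saturation (expected_round, flag 0).  */
    printf ("mlah: %#x\n",
            (uint16_t) sat16 (((a << 16) + 2 * b * b + (1 << 15)) >> 16));

    /* vqrdmlsh: the intermediate is about -2^32, far below INT16_MIN, so
       the lane clamps to 0x8000 and QC is set (expected_round, flag 1).  */
    printf ("mlsh: %#x\n",
            (uint16_t) sat16 (((a << 16) - 2 * b * b + (1 << 15)) >> 16));
    return 0;
  }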