Hi Christophe Lyon,

These testcases are not covered by the glorious testsuite. If these cases are on your todo list, I will exclude them.
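For reference, here is a minimal standalone example of what the new vmull_high tests exercise. This is only an illustration with arbitrary input values, not code from the patch or the testsuite:

#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  int16x8_t a = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int16x8_t b = vdupq_n_s16 (1000);

  /* vmull_high_s16 multiplies the high halves (lanes 4..7) of its
     operands and widens each product to 32 bits, so the result here
     is { 5000, 6000, 7000, 8000 }.  */
  int32x4_t r = vmull_high_s16 (a, b);

  printf ("%d %d %d %d\n",
          vgetq_lane_s32 (r, 0), vgetq_lane_s32 (r, 1),
          vgetq_lane_s32 (r, 2), vgetq_lane_s32 (r, 3));
  return 0;
}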
Thanks.

-----Original Message-----
From: Christophe Lyon [mailto:christophe.l...@linaro.org]
Sent: 9 December 2014 21:43
To: Jiangjiji
Cc: gcc-patches@gcc.gnu.org; Richard Earnshaw; Yangfei (Felix); Marcus Shawcroft
Subject: Re: [AArch64, NEON] Improve vmulX intrinsics

On 9 December 2014 at 13:52, Jiangjiji <jiangj...@huawei.com> wrote:
> Hi,
> This patch converts more intrinsics to use builtin functions instead of
> the previous inline assembly syntax.
> Passed the glorious testsuite of Christophe Lyon.
>
> Three testcases are added for the testing of intrinsics which are not
> covered by the testsuite:
> gcc.target/aarch64/vmull_high.c
> gcc.target/aarch64/vmull_high_lane.c
> gcc.target/aarch64/vmull_high_n.c

As I said here: https://gcc.gnu.org/ml/gcc-patches/2014-10/msg01934.html
I am in the process of converting my existing testsuite to GCC/DejaGnu.
Please do not duplicate work.

> Regtested with aarch64-linux-gnu on QEMU.
> This patch also shows no regressions for the aarch64_be-linux-gnu
> big-endian target.
> OK for the trunk?
>
>
> Index: gcc/ChangeLog
> ===================================================================
> --- gcc/ChangeLog (revision 218464)
> +++ gcc/ChangeLog (working copy)
> @@ -1,3 +1,38 @@
> +2014-12-09  Felix Yang  <felix.y...@huawei.com>
> +            Jiji Jiang  <jiangj...@huawei.com>
> +
> + * config/aarch64/aarch64-simd.md (aarch64_mul_n<mode>,
> + aarch64_<su>mull_n<mode>, aarch64_<su>mull<mode>,
> + aarch64_simd_<su>mull2_n<mode>, aarch64_<su>mull2_n<mode>,
> + aarch64_<su>mull_lane<mode>, aarch64_<su>mull2_lane<mode>_internal,
> + aarch64_<su>mull_laneq<mode>, aarch64_<su>mull2_laneq<mode>_internal,
> + aarch64_smull2_lane<mode>, aarch64_umull2_lane<mode>,
> + aarch64_smull2_laneq<mode>, aarch64_umull2_laneq<mode>,
> + aarch64_fmulx<mode>, aarch64_fmulx<mode>, aarch64_fmulx_lane<mode>,
> + aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
> + * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
> + vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
> + umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
> + umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
> + smull2_lane): New builtins.
> + * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
> + vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
> + vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
> + vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
> + vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
> + vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
> + vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
> + vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
> + vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
> + vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
> + vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
> + vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
> + vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
> + vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
> + using builtin functions.
> + * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
> + VDQF_Q): New unspecs and int iterator.
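For reference, the conversion described above has the following before/after shape, distilled from the vmull_high_s16 hunks in the arm_neon.h diff further down (the builtin is declared via aarch64-simd-builtins.def):

/* Before: the intrinsic wraps inline assembly, which the compiler
   cannot schedule, combine, or otherwise optimize through.  */
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmull_high_s16 (int16x8_t a, int16x8_t b)
{
  int32x4_t result;
  __asm__ ("smull2 %0.4s,%1.8h,%2.8h"
           : "=w"(result)
           : "w"(a), "w"(b)
           : /* No clobbers */);
  return result;
}

/* After: the same intrinsic expands through a builtin backed by a new
   aarch64-simd.md pattern, so the mid-end can see through it.  */
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmull_high_s16 (int16x8_t __a, int16x8_t __b)
{
  return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);
}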
> + > 2014-12-07 Felix Yang <felix.y...@huawei.com> > Shanyao Chen <chenshan...@huawei.com> > Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > =================================================================== > --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > (revision 0) > +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > (revision 0) > @@ -0,0 +1,111 @@ > +#include <arm_neon.h> > +#include "arm-neon-ref.h" > +#include "compute-ref-data.h" > + > + > +/* Expected results. */ > +VECT_VAR_DECL(expected,int,16,8) [] = { 0xfc48, 0xfcbf, 0xfd36, 0xfdad, > + 0xfe24, 0xfe9b, 0xff12, 0xff89 }; > +VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffff9a0, 0xfffffa28, > + 0xfffffab0, 0xfffffb38 }; > +VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffff7a2, > + 0xfffffffffffff83b }; > +VECT_VAR_DECL(expected,uint,16,8) [] = { 0xa4b0, 0xa55a, 0xa604, 0xa6ae, > + 0xa758, 0xa802, 0xa8ac, 0xa956 }; > +VECT_VAR_DECL(expected,uint,32,4) [] = { 0xbaf73c, 0xbaf7f7, > + 0xbaf8b2, 0xbaf96d }; > +VECT_VAR_DECL(expected,uint,64,2) [] = { 0xcbfffff4d8, > + 0xcbfffff5a4}; > +VECT_VAR_DECL(expected,poly,16,8) [] = { 0x6530, 0x659a, 0x6464, 0x64ce, > + 0x6798, 0x6732, 0x66cc, 0x6666 }; > + > +#ifndef INSN_NAME > +#define INSN_NAME vmull_high > +#define TEST_MSG "VMUL_HIGH" > +#endif > + > +#define FNNAME1(NAME) exec_ ## NAME > +#define FNNAME(NAME) FNNAME1(NAME) > + > +void FNNAME (INSN_NAME) (void) > +{ > +#define DECL_VMUL(T, W, N) \ > + DECL_VARIABLE(vector1, T, W, N); \ > + DECL_VARIABLE(vector2, T, W, N); > + > + /* vector_res = OP(vector1, vector2), then store the result. */ > +#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1) \ > + VECT_VAR(vector_res, T1, W1, N1) = \ > + INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \ > + VECT_VAR(vector2, T1, W, N)); \ > + vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1), \ > + VECT_VAR(vector_res, T1, W1, N1)) > + > +#define TEST_VMULL_HIGH(INSN, Q, T1, T2, W, N, W1, N1) \ > + TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1) > + > +#define CHECK_VMULL_HIGH_RESULTS(test_name,comment) \ > + { \ > + CHECK(test_name, int, 16, 8, PRIx16, expected, comment); \ > + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, uint, 16, 8, PRIx16, expected, comment); \ > + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, poly, 16, 8, PRIx16, expected, comment); \ > + } > + > + DECL_VMUL(int, 8, 16); > + DECL_VMUL(int, 16, 8); > + DECL_VMUL(int, 32, 4); > + DECL_VMUL(uint, 8, 16); > + DECL_VMUL(uint, 16, 8); > + DECL_VMUL(uint, 32, 4); > + DECL_VMUL(poly, 8, 16); > + > + DECL_VARIABLE(vector_res, int, 16, 8); > + DECL_VARIABLE(vector_res, int, 32, 4); > + DECL_VARIABLE(vector_res, int, 64, 2); > + DECL_VARIABLE(vector_res, uint, 16, 8); > + DECL_VARIABLE(vector_res, uint, 32, 4); > + DECL_VARIABLE(vector_res, uint, 64, 2); > + DECL_VARIABLE(vector_res, poly, 16, 8); > + > + clean_results (); > + > + /* Initialize input "vector1" from "buffer". */ > + VLOAD(vector1, buffer, q, int, s, 8, 16); > + VLOAD(vector1, buffer, q, int, s, 16, 8); > + VLOAD(vector1, buffer, q, int, s, 32, 4); > + VLOAD(vector1, buffer, q, uint, u, 8, 16); > + VLOAD(vector1, buffer, q, uint, u, 16, 8); > + VLOAD(vector1, buffer, q, uint, u, 32, 4); > + VLOAD(vector1, buffer, q, poly, p, 8, 16); > + > + /* Choose init value arbitrarily. 
*/ > + VDUP(vector2, q, int, s, 8, 16, 0x77); > + VDUP(vector2, q, int, s, 16, 8, 0x88); > + VDUP(vector2, q, int, s, 32, 4, 0x99); > + VDUP(vector2, q, uint, u, 8, 16, 0xAA); > + VDUP(vector2, q, uint, u, 16, 8, 0xBB); > + VDUP(vector2, q, uint, u, 32, 4, 0xCC); > + VDUP(vector2, q, poly, p, 8, 16, 0xAA); > + > + /* Execute the tests. */ > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 8, 16, 16, 8); > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 16, 8, 32, 4); > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 32, 4, 64, 2); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 8, 16, 16, 8); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 16, 8, 32, 4); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 32, 4, 64, 2); > + TEST_VMULL_HIGH(INSN_NAME, , poly, p, 8, 16, 16, 8); > + > + CHECK_VMULL_HIGH_RESULTS (TEST_MSG, ""); > +} > + > +int main (void) > +{ > + FNNAME (INSN_NAME) (); > + > + return 0; > +} > > Property changes on: > gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > ___________________________________________________________________ > Added: svn:executable > + * > > Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c > =================================================================== > --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c > (revision 0) > +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c > (revision 0) > @@ -0,0 +1,135 @@ > +#include <arm_neon.h> > +#include "arm-neon-ref.h" > +#include "compute-ref-data.h" > + > +VECT_VAR_DECL(expected, int, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 > }; > +VECT_VAR_DECL(expected, int, 64, 2) [] = { 0x2000, 0x2000}; > +VECT_VAR_DECL(expected, uint, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 > }; > +VECT_VAR_DECL(expected, uint, 64, 2) [] = { 0x2000, 0x2000 }; > + > +#define TEST_MSG "VMULL_HIGH_LANE/VMULL_HIGH_LANEQ" > +void exec_vmull_high_lane (void) > +{ > + /* vector_res = vmull_lane(vector,vector2,lane), then store the result. 
> */ > +#define TEST_VMULL_HIGH_LANE(T1, T2, W, W2, N1, N2, L) \ > + VECT_VAR(vector_res, T1, W2, N2) = \ > + vmull##_high_lane_##T2##W(VECT_VAR(vector, T1, W, N1 ), \ > + VECT_VAR(vector2, T1, W, N2), \ > + L); \ > + vst1q_##T2##W2(VECT_VAR(result, T1, W2, N2), VECT_VAR(vector_res, T1, W2, > N2)) > + > +#define CHECK_VMULL_HIGH_LANE_RESULTS(test_name,comment) \ > + { \ > + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ > + } > + > + > + /* With ARM RVCT, we need to declare variables before any executable > + statement */ > + DECL_VARIABLE(vector, int, 16, 8); > + DECL_VARIABLE(vector, int, 32, 4); > + DECL_VARIABLE(vector, uint, 16, 8); > + DECL_VARIABLE(vector, uint, 32, 4); > + DECL_VARIABLE(vector2, int, 16, 4); > + DECL_VARIABLE(vector2, int, 32, 2); > + DECL_VARIABLE(vector2, uint, 16, 4); > + DECL_VARIABLE(vector2, uint, 32, 2); > + > + DECL_VARIABLE(vector_res, int, 32, 4); > + DECL_VARIABLE(vector_res, int, 64, 2); > + DECL_VARIABLE(vector_res, uint, 32, 4); > + DECL_VARIABLE(vector_res, uint, 64, 2); > + > + clean_results (); > + > + /* Initialize vector */ > + VDUP(vector2, , int, s, 16, 4, 0x1000); > + VDUP(vector2, , int, s, 32, 2, 0x1000); > + VDUP(vector2, , uint, u, 16, 4, 0x1000); > + VDUP(vector2, , uint, u, 32, 2, 0x1000); > + > + /* Initialize vector2 */ > + VDUP(vector, q, int, s, 16, 8, 0x4); > + VDUP(vector, q, int, s, 32, 4, 0x2); > + VDUP(vector, q, uint, u, 16, 8, 0x4); > + VDUP(vector, q, uint, u, 32, 4, 0x2); > + > + /* Choose lane arbitrarily */ > + TEST_VMULL_HIGH_LANE(int, s, 16, 32, 8, 4, 2); > + TEST_VMULL_HIGH_LANE(int, s, 32, 64, 4, 2, 1); > + TEST_VMULL_HIGH_LANE(uint, u, 16, 32, 8, 4, 2); > + TEST_VMULL_HIGH_LANE(uint, u, 32, 64, 4, 2, 1); > + > + CHECK_VMULL_HIGH_LANE_RESULTS (TEST_MSG, ""); > +} > + > + > +void exec_vmull_high_laneq (void) > +{ > + /* vector_res = vmull_lane(vector,vector2,lane), then store the result. 
> */ > +#define TEST_VMULL_HIGH_LANEQ(T1, T2, W, W2, N2, N1, L) \ > + VECT_VAR(vector_res, T1, W2, N1) = \ > + vmull##_high_laneq_##T2##W(VECT_VAR(vector, T1, W, N2 ), \ > + VECT_VAR(vector2, T1, W, N2), \ > + L); \ > + vst1q_##T2##W2(VECT_VAR(result, T1, W2, N1), VECT_VAR(vector_res, T1, W2, > N1)) > + > +#define CHECK_VMULL_HIGH_LANEQ_RESULTS(test_name,comment) \ > + { \ > + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ > + } > + > + > + /* With ARM RVCT, we need to declare variables before any executable > + statement */ > + DECL_VARIABLE(vector, int, 16, 8); > + DECL_VARIABLE(vector, int, 32, 4); > + DECL_VARIABLE(vector, uint, 16, 8); > + DECL_VARIABLE(vector, uint, 32, 4); > + DECL_VARIABLE(vector2, int, 16, 8); > + DECL_VARIABLE(vector2, int, 32, 4); > + DECL_VARIABLE(vector2, uint, 16, 8); > + DECL_VARIABLE(vector2, uint, 32, 4); > + > + DECL_VARIABLE(vector_res, int, 32, 4); > + DECL_VARIABLE(vector_res, int, 64, 2); > + DECL_VARIABLE(vector_res, uint, 32, 4); > + DECL_VARIABLE(vector_res, uint, 64, 2); > + > + clean_results (); > + > + /* Initialize vector */ > + VDUP(vector2, q, int, s, 16, 8, 0x1000); > + VDUP(vector2, q, int, s, 32, 4, 0x1000); > + VDUP(vector2, q, uint, u, 16, 8, 0x1000); > + VDUP(vector2, q, uint, u, 32, 4, 0x1000); > + > + /* Initialize vector2 */ > + VDUP(vector, q, int, s, 16, 8, 0x4); > + VDUP(vector, q, int, s, 32, 4, 0x2); > + VDUP(vector, q, uint, u, 16, 8, 0x4); > + VDUP(vector, q, uint, u, 32, 4, 0x2); > + > + /* Choose lane arbitrarily */ > + TEST_VMULL_HIGH_LANEQ(int, s, 16, 32, 8, 4, 2); > + TEST_VMULL_HIGH_LANEQ(int, s, 32, 64, 4, 2, 1); > + TEST_VMULL_HIGH_LANEQ(uint, u, 16, 32, 8, 4, 2); > + TEST_VMULL_HIGH_LANEQ(uint, u, 32, 64, 4, 2, 1); > + > + CHECK_VMULL_HIGH_LANEQ_RESULTS (TEST_MSG, ""); > +} > + > + > + > + > +int main (void) > +{ > + exec_vmull_high_lane(); > + return 0; > +} > > Property changes on: > gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c > ___________________________________________________________________ > Added: svn:executable > + * > > Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c > =================================================================== > --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c > (revision 0) > +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c > (revision 0) > @@ -0,0 +1,81 @@ > +#include <arm_neon.h> > +#include "arm-neon-ref.h" > +#include "compute-ref-data.h" > + > + > +/* Expected results. */ > +VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffff73c, 0xfffff7f7, > + 0xfffff8b2, 0xfffff96d }; > +VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffff4d8, > + 0xfffffffffffff5a4 }; > +VECT_VAR_DECL(expected,uint,32,4) [] = { 0xedf4d8, 0xedf5c6, > + 0xedf6b4, 0xedf7a2 }; > +VECT_VAR_DECL(expected,uint,64,2) [] = { 0xfefffff20e, > + 0xfefffff30d}; > + > +#ifndef INSN_NAME > +#define INSN_NAME vmull_high_n > +#define TEST_MSG "VMULL_HIGH_N" > +#endif > + > +#define FNNAME1(NAME) exec_ ## NAME > +#define FNNAME(NAME) FNNAME1(NAME) > + > +void FNNAME (INSN_NAME) (void) > +{ > +#define DECL_VMUL(T, W, N) \ > + DECL_VARIABLE(vector1, T, W, N); \ > + > + /* vector_res = OP(vector1, vector2), then store the result. 
*/ > +#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1, C) \ > + VECT_VAR(vector_res, T1, W1, N1) = \ > + INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \ > + C); \ > + vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1), \ > + VECT_VAR(vector_res, T1, W1, N1)) > + > +#define TEST_VMULL_HIGH(INSN, Q, T1, T2, W, N, W1, N1, C) \ > + TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1, C) > + > +#define CHECK_VMULL_HIGH_N_RESULTS(test_name,comment) \ > + { \ > + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ > + } > + > + DECL_VMUL(int, 16, 8); > + DECL_VMUL(int, 32, 4); > + DECL_VMUL(uint, 16, 8); > + DECL_VMUL(uint, 32, 4); > + > + DECL_VARIABLE(vector_res, int, 32, 4); > + DECL_VARIABLE(vector_res, int, 64, 2); > + DECL_VARIABLE(vector_res, uint, 32, 4); > + DECL_VARIABLE(vector_res, uint, 64, 2); > + > + clean_results (); > + > + /* Initialize input "vector1" from "buffer". */ > + VLOAD(vector1, buffer, q, int, s, 16, 8); > + VLOAD(vector1, buffer, q, int, s, 32, 4); > + VLOAD(vector1, buffer, q, uint, u, 16, 8); > + VLOAD(vector1, buffer, q, uint, u, 32, 4); > + > + > + /* Execute the tests. */ > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 16, 8, 32, 4, 0xBB); > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 32, 4, 64, 2, 0xCC); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 16, 8, 32, 4, 0xEE); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 32, 4, 64, 2, 0xFF); > + > + CHECK_VMULL_HIGH_N_RESULTS (TEST_MSG, ""); > +} > + > +int main (void) > +{ > + FNNAME (INSN_NAME) (); > + > + return 0; > +} > > Property changes on: > gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c > ___________________________________________________________________ > Added: svn:executable > + * > > Index: gcc/testsuite/ChangeLog > =================================================================== > --- gcc/testsuite/ChangeLog (revision 218464) > +++ gcc/testsuite/ChangeLog (working copy) > @@ -1,3 +1,13 @@ > +2014-12-09 Felix Yang <felix.y...@huawei.com> > + Jiji Jiang <jiangj...@huawei.com> > + > + * testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c: New > + test. > + * > testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c: > + New test. > + * testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c: > New > + test. 
> + > 2014-12-07 Christophe Lyon <christophe.l...@linaro.org> > * gcc.target/aarch64/advsimd-intrinsics/vaddhn.c: Actually execute > Index: gcc/config/aarch64/arm_neon.h > =================================================================== > --- gcc/config/aarch64/arm_neon.h (revision 218464) > +++ gcc/config/aarch64/arm_neon.h (working copy) > @@ -7627,671 +7627,6 @@ vmovn_u64 (uint64x2_t a) > return result; > } > -__extension__ static __inline float32x2_t __attribute__ > ((__always_inline__)) > -vmul_n_f32 (float32x2_t a, float32_t b) > -{ > - float32x2_t result; > - __asm__ ("fmul %0.2s,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) > -vmul_n_s16 (int16x4_t a, int16_t b) > -{ > - int16x4_t result; > - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) > -vmul_n_s32 (int32x2_t a, int32_t b) > -{ > - int32x2_t result; > - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint16x4_t __attribute__ > ((__always_inline__)) > -vmul_n_u16 (uint16x4_t a, uint16_t b) > -{ > - uint16x4_t result; > - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x2_t __attribute__ > ((__always_inline__)) > -vmul_n_u32 (uint32x2_t a, uint32_t b) > -{ > - uint32x2_t result; > - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -#define vmull_high_lane_s16(a, b, c) \ > - __extension__ \ > - ({ \ > - int16x4_t b_ = (b); \ > - int16x8_t a_ = (a); \ > - int32x4_t result; \ > - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_lane_s32(a, b, c) \ > - __extension__ \ > - ({ \ > - int32x2_t b_ = (b); \ > - int32x4_t a_ = (a); \ > - int64x2_t result; \ > - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_lane_u16(a, b, c) \ > - __extension__ \ > - ({ \ > - uint16x4_t b_ = (b); \ > - uint16x8_t a_ = (a); \ > - uint32x4_t result; \ > - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_lane_u32(a, b, c) \ > - __extension__ \ > - ({ \ > - uint32x2_t b_ = (b); \ > - uint32x4_t a_ = (a); \ > - uint64x2_t result; \ > - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_laneq_s16(a, b, c) \ > - __extension__ \ > - ({ \ > - int16x8_t b_ = (b); \ > - int16x8_t a_ = (a); \ > - int32x4_t result; \ > - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_laneq_s32(a, b, c) \ > - __extension__ \ > - ({ \ > - int32x4_t b_ = (b); \ > - int32x4_t a_ = (a); \ > - int64x2_t result; \ > - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ > - : 
"=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_laneq_u16(a, b, c) \ > - __extension__ \ > - ({ \ > - uint16x8_t b_ = (b); \ > - uint16x8_t a_ = (a); \ > - uint32x4_t result; \ > - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_laneq_u32(a, b, c) \ > - __extension__ \ > - ({ \ > - uint32x4_t b_ = (b); \ > - uint32x4_t a_ = (a); \ > - uint64x2_t result; \ > - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmull_high_n_s16 (int16x8_t a, int16_t b) > -{ > - int32x4_t result; > - __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > -vmull_high_n_s32 (int32x4_t a, int32_t b) > -{ > - int64x2_t result; > - __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmull_high_n_u16 (uint16x8_t a, uint16_t b) > -{ > - uint32x4_t result; > - __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > -vmull_high_n_u32 (uint32x4_t a, uint32_t b) > -{ > - uint64x2_t result; > - __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline poly16x8_t __attribute__ > ((__always_inline__)) > -vmull_high_p8 (poly8x16_t a, poly8x16_t b) > -{ > - poly16x8_t result; > - __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > -vmull_high_s8 (int8x16_t a, int8x16_t b) > -{ > - int16x8_t result; > - __asm__ ("smull2 %0.8h,%1.16b,%2.16b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmull_high_s16 (int16x8_t a, int16x8_t b) > -{ > - int32x4_t result; > - __asm__ ("smull2 %0.4s,%1.8h,%2.8h" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > -vmull_high_s32 (int32x4_t a, int32x4_t b) > -{ > - int64x2_t result; > - __asm__ ("smull2 %0.2d,%1.4s,%2.4s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint16x8_t __attribute__ > ((__always_inline__)) > -vmull_high_u8 (uint8x16_t a, uint8x16_t b) > -{ > - uint16x8_t result; > - __asm__ ("umull2 %0.8h,%1.16b,%2.16b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmull_high_u16 (uint16x8_t a, uint16x8_t b) > -{ > - uint32x4_t result; > - __asm__ ("umull2 %0.4s,%1.8h,%2.8h" > - : "=w"(result) > - : "w"(a), "w"(b) > - : 
/* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > -vmull_high_u32 (uint32x4_t a, uint32x4_t b) > -{ > - uint64x2_t result; > - __asm__ ("umull2 %0.2d,%1.4s,%2.4s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -#define vmull_lane_s16(a, b, c) \ > - __extension__ \ > - ({ \ > - int16x4_t b_ = (b); \ > - int16x4_t a_ = (a); \ > - int32x4_t result; \ > - __asm__ ("smull %0.4s,%1.4h,%2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_lane_s32(a, b, c) \ > - __extension__ \ > - ({ \ > - int32x2_t b_ = (b); \ > - int32x2_t a_ = (a); \ > - int64x2_t result; \ > - __asm__ ("smull %0.2d,%1.2s,%2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_lane_u16(a, b, c) \ > - __extension__ \ > - ({ \ > - uint16x4_t b_ = (b); \ > - uint16x4_t a_ = (a); \ > - uint32x4_t result; \ > - __asm__ ("umull %0.4s,%1.4h,%2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_lane_u32(a, b, c) \ > - __extension__ \ > - ({ \ > - uint32x2_t b_ = (b); \ > - uint32x2_t a_ = (a); \ > - uint64x2_t result; \ > - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_laneq_s16(a, b, c) \ > - __extension__ \ > - ({ \ > - int16x8_t b_ = (b); \ > - int16x4_t a_ = (a); \ > - int32x4_t result; \ > - __asm__ ("smull %0.4s, %1.4h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_laneq_s32(a, b, c) \ > - __extension__ \ > - ({ \ > - int32x4_t b_ = (b); \ > - int32x2_t a_ = (a); \ > - int64x2_t result; \ > - __asm__ ("smull %0.2d, %1.2s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_laneq_u16(a, b, c) \ > - __extension__ \ > - ({ \ > - uint16x8_t b_ = (b); \ > - uint16x4_t a_ = (a); \ > - uint32x4_t result; \ > - __asm__ ("umull %0.4s, %1.4h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_laneq_u32(a, b, c) \ > - __extension__ \ > - ({ \ > - uint32x4_t b_ = (b); \ > - uint32x2_t a_ = (a); \ > - uint64x2_t result; \ > - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmull_n_s16 (int16x4_t a, int16_t b) > -{ > - int32x4_t result; > - __asm__ ("smull %0.4s,%1.4h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > -vmull_n_s32 (int32x2_t a, int32_t b) > -{ > - int64x2_t result; > - __asm__ ("smull %0.2d,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmull_n_u16 (uint16x4_t a, uint16_t b) > -{ > - uint32x4_t result; > - __asm__ ("umull %0.4s,%1.4h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} 
> - > -__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > -vmull_n_u32 (uint32x2_t a, uint32_t b) > -{ > - uint64x2_t result; > - __asm__ ("umull %0.2d,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline poly16x8_t __attribute__ > ((__always_inline__)) > -vmull_p8 (poly8x8_t a, poly8x8_t b) > -{ > - poly16x8_t result; > - __asm__ ("pmull %0.8h, %1.8b, %2.8b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > -vmull_s8 (int8x8_t a, int8x8_t b) > -{ > - int16x8_t result; > - __asm__ ("smull %0.8h, %1.8b, %2.8b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmull_s16 (int16x4_t a, int16x4_t b) > -{ > - int32x4_t result; > - __asm__ ("smull %0.4s, %1.4h, %2.4h" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > -vmull_s32 (int32x2_t a, int32x2_t b) > -{ > - int64x2_t result; > - __asm__ ("smull %0.2d, %1.2s, %2.2s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint16x8_t __attribute__ > ((__always_inline__)) > -vmull_u8 (uint8x8_t a, uint8x8_t b) > -{ > - uint16x8_t result; > - __asm__ ("umull %0.8h, %1.8b, %2.8b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmull_u16 (uint16x4_t a, uint16x4_t b) > -{ > - uint32x4_t result; > - __asm__ ("umull %0.4s, %1.4h, %2.4h" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > -vmull_u32 (uint32x2_t a, uint32x2_t b) > -{ > - uint64x2_t result; > - __asm__ ("umull %0.2d, %1.2s, %2.2s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float32x4_t __attribute__ > ((__always_inline__)) > -vmulq_n_f32 (float32x4_t a, float32_t b) > -{ > - float32x4_t result; > - __asm__ ("fmul %0.4s,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float64x2_t __attribute__ > ((__always_inline__)) > -vmulq_n_f64 (float64x2_t a, float64_t b) > -{ > - float64x2_t result; > - __asm__ ("fmul %0.2d,%1.2d,%2.d[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > -vmulq_n_s16 (int16x8_t a, int16_t b) > -{ > - int16x8_t result; > - __asm__ ("mul %0.8h,%1.8h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmulq_n_s32 (int32x4_t a, int32_t b) > -{ > - int32x4_t result; > - __asm__ ("mul %0.4s,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint16x8_t __attribute__ > ((__always_inline__)) > -vmulq_n_u16 (uint16x8_t a, uint16_t b) > -{ > - uint16x8_t 
result; > - __asm__ ("mul %0.8h,%1.8h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmulq_n_u32 (uint32x4_t a, uint32_t b) > -{ > - uint32x4_t result; > - __asm__ ("mul %0.4s,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float32x2_t __attribute__ > ((__always_inline__)) > -vmulx_f32 (float32x2_t a, float32x2_t b) > -{ > - float32x2_t result; > - __asm__ ("fmulx %0.2s,%1.2s,%2.2s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -#define vmulx_lane_f32(a, b, c) \ > - __extension__ \ > - ({ \ > - float32x4_t b_ = (b); \ > - float32x2_t a_ = (a); \ > - float32x2_t result; \ > - __asm__ ("fmulx %0.2s,%1.2s,%2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) > -vmulxd_f64 (float64_t a, float64_t b) > -{ > - float64_t result; > - __asm__ ("fmulx %d0, %d1, %d2" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float32x4_t __attribute__ > ((__always_inline__)) > -vmulxq_f32 (float32x4_t a, float32x4_t b) > -{ > - float32x4_t result; > - __asm__ ("fmulx %0.4s,%1.4s,%2.4s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float64x2_t __attribute__ > ((__always_inline__)) > -vmulxq_f64 (float64x2_t a, float64x2_t b) > -{ > - float64x2_t result; > - __asm__ ("fmulx %0.2d,%1.2d,%2.2d" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -#define vmulxq_lane_f32(a, b, c) \ > - __extension__ \ > - ({ \ > - float32x4_t b_ = (b); \ > - float32x4_t a_ = (a); \ > - float32x4_t result; \ > - __asm__ ("fmulx %0.4s,%1.4s,%2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmulxq_lane_f64(a, b, c) \ > - __extension__ \ > - ({ \ > - float64x2_t b_ = (b); \ > - float64x2_t a_ = (a); \ > - float64x2_t result; \ > - __asm__ ("fmulx %0.2d,%1.2d,%2.d[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) > -vmulxs_f32 (float32_t a, float32_t b) > -{ > - float32_t result; > - __asm__ ("fmulx %s0, %s1, %s2" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) > vmvn_p8 (poly8x8_t a) > { > @@ -19172,6 +18507,78 @@ vmul_n_f64 (float64x1_t __a, float64_t __b) > return (float64x1_t) { vget_lane_f64 (__a, 0) * __b }; > } > +__extension__ static __inline float32x2_t __attribute__ > ((__always_inline__)) > +vmul_n_f32 (float32x2_t __a, float32_t __b) > +{ > + return __builtin_aarch64_mul_nv2sf (__a, __b); > +} > + > +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) > +vmul_n_s16 (int16x4_t __a, int16_t __b) > +{ > + return __builtin_aarch64_mul_nv4hi (__a, __b); > +} > + > +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) > +vmul_n_s32 (int32x2_t __a, int32_t __b) > +{ > + return __builtin_aarch64_mul_nv2si (__a, __b); > +} > + > 
+__extension__ static __inline uint16x4_t __attribute__ > ((__always_inline__)) > +vmul_n_u16 (uint16x4_t __a, uint16_t __b) > +{ > + return (uint16x4_t) __builtin_aarch64_mul_nv4hi ((int16x4_t)__a, > + (int16_t)__b); > +} > + > +__extension__ static __inline uint32x2_t __attribute__ > ((__always_inline__)) > +vmul_n_u32 (uint32x2_t __a, uint32_t __b) > +{ > + return (uint32x2_t) __builtin_aarch64_mul_nv2si ((int32x2_t)__a, > + (int32_t)__b); > +} > + > +/* vmulq_n */ > + > +__extension__ static __inline float32x4_t __attribute__ > ((__always_inline__)) > +vmulq_n_f32 (float32x4_t __a, float32_t __b) > +{ > + return __builtin_aarch64_mul_nv4sf (__a, __b); > +} > + > +__extension__ static __inline float64x2_t __attribute__ > ((__always_inline__)) > +vmulq_n_f64 (float64x2_t __a, float64_t __b) > +{ > + return __builtin_aarch64_mul_nv2df (__a, __b); > +} > + > +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > +vmulq_n_s16 (int16x8_t __a, int16_t __b) > +{ > + return __builtin_aarch64_mul_nv8hi (__a, __b); > +} > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmulq_n_s32 (int32x4_t __a, int32_t __b) > +{ > + return __builtin_aarch64_mul_nv4si (__a, __b); > +} > + > +__extension__ static __inline uint16x8_t __attribute__ > ((__always_inline__)) > +vmulq_n_u16 (uint16x8_t __a, uint16_t __b) > +{ > + return (uint16x8_t) __builtin_aarch64_mul_nv8hi ((int16x8_t)__a, > + (int16_t)__b); > +} > + > +__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > +vmulq_n_u32 (uint32x4_t __a, uint32_t __b) > +{ > + return (uint32x4_t) __builtin_aarch64_mul_nv4si ((int32x4_t)__a, > + (int32_t)__b); > +} > + > /* vmulq_lane */ > __extension__ static __inline float32x4_t __attribute__ > ((__always_inline__)) > @@ -19249,6 +18656,308 @@ vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, c > return __a * __aarch64_vgetq_lane_u32 (__b, __lane); > } > +/* vmull_high_lane */ > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) > +{ > + return __builtin_aarch64_smull2_lanev8hi (__a, __b, __c); > +} > + > +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > +vmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) > +{ > + return __builtin_aarch64_smull2_lanev4si (__a, __b, __c); > +} > + > +__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > +vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c) > +{ > + return (uint32x4_t) __builtin_aarch64_umull2_lanev8hi ((int16x8_t) __a, > + (int16x4_t) __b, > + __c); > +} > + > +__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > +vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c) > +{ > + return (uint64x2_t) __builtin_aarch64_umull2_lanev4si ((int32x4_t) __a, > + (int32x2_t) __b, > + __c); > +} > + > +/* vmull_high_laneq */ > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) > +{ > + return __builtin_aarch64_smull2_laneqv8hi (__a, __b, __c); > +} > + > +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > +vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) > +{ > + return __builtin_aarch64_smull2_laneqv4si (__a, __b, __c); > +} > + > +__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > 
+vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) > +{ > + return (uint32x4_t) __builtin_aarch64_umull2_laneqv8hi ((int16x8_t)__a, > + (int16x8_t)__b, > + __c); > +} > + > +__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > +vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) > +{ > + return (uint64x2_t) __builtin_aarch64_umull2_laneqv4si ((int32x4_t) __a, > + (int32x4_t) __b, > + __c); > +} > + > +/* vmull_high_n */ > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmull_high_n_s16 (int16x8_t __a, int16_t __b) > +{ > + return __builtin_aarch64_smull2_nv8hi (__a, __b); > +} > + > +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > +vmull_high_n_s32 (int32x4_t __a, int32_t __b) > +{ > + return __builtin_aarch64_smull2_nv4si (__a, __b); > +} > + > +__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > +vmull_high_n_u16 (uint16x8_t __a, uint16_t __b) > +{ > + return __builtin_aarch64_umull2_nv8hi_uuu (__a, __b); > +} > + > +__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > +vmull_high_n_u32 (uint32x4_t __a, uint32_t __b) > +{ > + return __builtin_aarch64_umull2_nv4si_uuu (__a, __b); > +} > + > +/* vmull_high */ > + > +__extension__ static __inline poly16x8_t __attribute__ > ((__always_inline__)) > +vmull_high_p8 (poly8x16_t __a, poly8x16_t __b) > +{ > + return __builtin_aarch64_pmull2v16qi_ppp (__a, __b); > +} > + > +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > +vmull_high_s8 (int8x16_t __a, int8x16_t __b) > +{ > + return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b); > +} > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmull_high_s16 (int16x8_t __a, int16x8_t __b) > +{ > + return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);