Hi Christophe Lyon,

These testcases are not covered by the glorious testsuite. If these cases are on your todo list, I will exclude them.
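For reference, here is a minimal standalone example of what the new vmull_high tests exercise. This is only an illustration with arbitrary input values, not code from the patch or the testsuite:

#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  int16x8_t a = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int16x8_t b = vdupq_n_s16 (1000);

  /* vmull_high_s16 multiplies the high halves (lanes 4..7) of its
     operands and widens each product to 32 bits, so the result here
     is { 5000, 6000, 7000, 8000 }.  */
  int32x4_t r = vmull_high_s16 (a, b);

  printf ("%d %d %d %d\n",
          vgetq_lane_s32 (r, 0), vgetq_lane_s32 (r, 1),
          vgetq_lane_s32 (r, 2), vgetq_lane_s32 (r, 3));
  return 0;
}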
Thanks.

-----Original Message-----
From: Christophe Lyon [mailto:christophe.l...@linaro.org]
Sent: 9 December 2014 21:43
To: Jiangjiji
Cc: gcc-patches@gcc.gnu.org; Richard Earnshaw; Yangfei (Felix); Marcus Shawcroft
Subject: Re: [AArch64, NEON] Improve vmulX intrinsics

On 9 December 2014 at 13:52, Jiangjiji <jiangj...@huawei.com> wrote:
> Hi,
> This patch converts more intrinsics to use builtin functions instead of
> the previous inline assembly syntax.
> Passed the glorious testsuite of Christophe Lyon.
>
> Three testcases are added for the testing of intrinsics which are not
> covered by the testsuite:
> gcc.target/aarch64/vmull_high.c
> gcc.target/aarch64/vmull_high_lane.c
> gcc.target/aarch64/vmull_high_n.c

As I said here: https://gcc.gnu.org/ml/gcc-patches/2014-10/msg01934.html
I am in the process of converting my existing testsuite to GCC/DejaGnu.
Please do not duplicate work.

> Regtested with aarch64-linux-gnu on QEMU.
> This patch also shows no regressions for the aarch64_be-linux-gnu
> big-endian target.
> OK for the trunk?
>
>
> Index: gcc/ChangeLog
> ===================================================================
> --- gcc/ChangeLog (revision 218464)
> +++ gcc/ChangeLog (working copy)
> @@ -1,3 +1,38 @@
> +2014-12-09  Felix Yang  <felix.y...@huawei.com>
> +            Jiji Jiang  <jiangj...@huawei.com>
> +
> + * config/aarch64/aarch64-simd.md (aarch64_mul_n<mode>,
> + aarch64_<su>mull_n<mode>, aarch64_<su>mull<mode>,
> + aarch64_simd_<su>mull2_n<mode>, aarch64_<su>mull2_n<mode>,
> + aarch64_<su>mull_lane<mode>, aarch64_<su>mull2_lane<mode>_internal,
> + aarch64_<su>mull_laneq<mode>, aarch64_<su>mull2_laneq<mode>_internal,
> + aarch64_smull2_lane<mode>, aarch64_umull2_lane<mode>,
> + aarch64_smull2_laneq<mode>, aarch64_umull2_laneq<mode>,
> + aarch64_fmulx<mode>, aarch64_fmulx<mode>, aarch64_fmulx_lane<mode>,
> + aarch64_pmull2v16qi, aarch64_pmullv8qi): New patterns.
> + * config/aarch64/aarch64-simd-builtins.def (vec_widen_smult_hi_,
> + vec_widen_umult_hi_, umull, smull, smull_n, umull_n, mul_n, smull2_n,
> + umull2_n, smull_lane, umull_lane, smull_laneq, umull_laneq, pmull,
> + umull2_lane, smull2_laneq, umull2_laneq, fmulx, fmulx_lane, pmull2,
> + smull2_lane): New builtins.
> + * config/aarch64/arm_neon.h (vmul_n_f32, vmul_n_s16, vmul_n_s32,
> + vmul_n_u16, vmul_n_u32, vmulq_n_f32, vmulq_n_f64, vmulq_n_s16,
> + vmulq_n_s32, vmulq_n_u16, vmulq_n_u32, vmull_high_lane_s16,
> + vmull_high_lane_s32, vmull_high_lane_u16, vmull_high_lane_u32,
> + vmull_high_laneq_s16, vmull_high_laneq_s32, vmull_high_laneq_u16,
> + vmull_high_laneq_u32, vmull_high_n_s16, vmull_high_n_s32,
> + vmull_high_n_u16, vmull_high_n_u32, vmull_high_p8, vmull_high_s8,
> + vmull_high_s16, vmull_high_s32, vmull_high_u8, vmull_high_u16,
> + vmull_high_u32, vmull_lane_s16, vmull_lane_s32, vmull_lane_u16,
> + vmull_lane_u32, vmull_laneq_s16, vmull_laneq_s32, vmull_laneq_u16,
> + vmull_laneq_u32, vmull_n_s16, vmull_n_s32, vmull_n_u16, vmull_n_u32,
> + vmull_p8, vmull_s8, vmull_s16, vmull_s32, vmull_u8, vmull_u16,
> + vmull_u32, vmulx_f32, vmulx_lane_f32, vmulxd_f64, vmulxq_f32,
> + vmulxq_f64, vmulxq_lane_f32, vmulxq_lane_f64, vmulxs_f32): Rewrite
> + using builtin functions.
> + * config/aarch64/iterators.md (UNSPEC_FMULX, UNSPEC_FMULX_LANE,
> + VDQF_Q): New unspecs and int iterator.
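For reference, the conversion described above has the following before/after shape, distilled from the vmull_high_s16 hunks in the arm_neon.h diff further down (the builtin is declared via aarch64-simd-builtins.def):

/* Before: the intrinsic wraps inline assembly, which the compiler
   cannot schedule, combine, or otherwise optimize through.  */
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmull_high_s16 (int16x8_t a, int16x8_t b)
{
  int32x4_t result;
  __asm__ ("smull2 %0.4s,%1.8h,%2.8h"
           : "=w"(result)
           : "w"(a), "w"(b)
           : /* No clobbers */);
  return result;
}

/* After: the same intrinsic expands through a builtin backed by a new
   aarch64-simd.md pattern, so the mid-end can see through it.  */
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmull_high_s16 (int16x8_t __a, int16x8_t __b)
{
  return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);
}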
> + > 2014-12-07 Felix Yang <felix.y...@huawei.com> > Shanyao Chen <chenshan...@huawei.com> > Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > =================================================================== > --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > (revision 0) > +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > (revision 0) > @@ -0,0 +1,111 @@ > +#include <arm_neon.h> > +#include "arm-neon-ref.h" > +#include "compute-ref-data.h" > + > + > +/* Expected results. */ > +VECT_VAR_DECL(expected,int,16,8) [] = { 0xfc48, 0xfcbf, 0xfd36, 0xfdad, > + 0xfe24, 0xfe9b, 0xff12, 0xff89 }; > +VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffff9a0, 0xfffffa28, > + 0xfffffab0, 0xfffffb38 }; > +VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffff7a2, > + 0xfffffffffffff83b }; > +VECT_VAR_DECL(expected,uint,16,8) [] = { 0xa4b0, 0xa55a, 0xa604, 0xa6ae, > + 0xa758, 0xa802, 0xa8ac, 0xa956 }; > +VECT_VAR_DECL(expected,uint,32,4) [] = { 0xbaf73c, 0xbaf7f7, > + 0xbaf8b2, 0xbaf96d }; > +VECT_VAR_DECL(expected,uint,64,2) [] = { 0xcbfffff4d8, > + 0xcbfffff5a4}; > +VECT_VAR_DECL(expected,poly,16,8) [] = { 0x6530, 0x659a, 0x6464, 0x64ce, > + 0x6798, 0x6732, 0x66cc, 0x6666 }; > + > +#ifndef INSN_NAME > +#define INSN_NAME vmull_high > +#define TEST_MSG "VMUL_HIGH" > +#endif > + > +#define FNNAME1(NAME) exec_ ## NAME > +#define FNNAME(NAME) FNNAME1(NAME) > + > +void FNNAME (INSN_NAME) (void) > +{ > +#define DECL_VMUL(T, W, N) \ > + DECL_VARIABLE(vector1, T, W, N); \ > + DECL_VARIABLE(vector2, T, W, N); > + > + /* vector_res = OP(vector1, vector2), then store the result. */ > +#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1) \ > + VECT_VAR(vector_res, T1, W1, N1) = \ > + INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \ > + VECT_VAR(vector2, T1, W, N)); \ > + vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1), \ > + VECT_VAR(vector_res, T1, W1, N1)) > + > +#define TEST_VMULL_HIGH(INSN, Q, T1, T2, W, N, W1, N1) \ > + TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1) > + > +#define CHECK_VMULL_HIGH_RESULTS(test_name,comment) \ > + { \ > + CHECK(test_name, int, 16, 8, PRIx16, expected, comment); \ > + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, uint, 16, 8, PRIx16, expected, comment); \ > + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, poly, 16, 8, PRIx16, expected, comment); \ > + } > + > + DECL_VMUL(int, 8, 16); > + DECL_VMUL(int, 16, 8); > + DECL_VMUL(int, 32, 4); > + DECL_VMUL(uint, 8, 16); > + DECL_VMUL(uint, 16, 8); > + DECL_VMUL(uint, 32, 4); > + DECL_VMUL(poly, 8, 16); > + > + DECL_VARIABLE(vector_res, int, 16, 8); > + DECL_VARIABLE(vector_res, int, 32, 4); > + DECL_VARIABLE(vector_res, int, 64, 2); > + DECL_VARIABLE(vector_res, uint, 16, 8); > + DECL_VARIABLE(vector_res, uint, 32, 4); > + DECL_VARIABLE(vector_res, uint, 64, 2); > + DECL_VARIABLE(vector_res, poly, 16, 8); > + > + clean_results (); > + > + /* Initialize input "vector1" from "buffer". */ > + VLOAD(vector1, buffer, q, int, s, 8, 16); > + VLOAD(vector1, buffer, q, int, s, 16, 8); > + VLOAD(vector1, buffer, q, int, s, 32, 4); > + VLOAD(vector1, buffer, q, uint, u, 8, 16); > + VLOAD(vector1, buffer, q, uint, u, 16, 8); > + VLOAD(vector1, buffer, q, uint, u, 32, 4); > + VLOAD(vector1, buffer, q, poly, p, 8, 16); > + > + /* Choose init value arbitrarily. 
*/ > + VDUP(vector2, q, int, s, 8, 16, 0x77); > + VDUP(vector2, q, int, s, 16, 8, 0x88); > + VDUP(vector2, q, int, s, 32, 4, 0x99); > + VDUP(vector2, q, uint, u, 8, 16, 0xAA); > + VDUP(vector2, q, uint, u, 16, 8, 0xBB); > + VDUP(vector2, q, uint, u, 32, 4, 0xCC); > + VDUP(vector2, q, poly, p, 8, 16, 0xAA); > + > + /* Execute the tests. */ > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 8, 16, 16, 8); > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 16, 8, 32, 4); > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 32, 4, 64, 2); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 8, 16, 16, 8); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 16, 8, 32, 4); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 32, 4, 64, 2); > + TEST_VMULL_HIGH(INSN_NAME, , poly, p, 8, 16, 16, 8); > + > + CHECK_VMULL_HIGH_RESULTS (TEST_MSG, ""); > +} > + > +int main (void) > +{ > + FNNAME (INSN_NAME) (); > + > + return 0; > +} > > Property changes on: > gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c > ___________________________________________________________________ > Added: svn:executable > + * > > Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c > =================================================================== > --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c > (revision 0) > +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c > (revision 0) > @@ -0,0 +1,135 @@ > +#include <arm_neon.h> > +#include "arm-neon-ref.h" > +#include "compute-ref-data.h" > + > +VECT_VAR_DECL(expected, int, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 > }; > +VECT_VAR_DECL(expected, int, 64, 2) [] = { 0x2000, 0x2000}; > +VECT_VAR_DECL(expected, uint, 32, 4) [] = { 0x4000, 0x4000, 0x4000, 0x4000 > }; > +VECT_VAR_DECL(expected, uint, 64, 2) [] = { 0x2000, 0x2000 }; > + > +#define TEST_MSG "VMULL_HIGH_LANE/VMULL_HIGH_LANEQ" > +void exec_vmull_high_lane (void) > +{ > + /* vector_res = vmull_lane(vector,vector2,lane), then store the result. 
> */ > +#define TEST_VMULL_HIGH_LANE(T1, T2, W, W2, N1, N2, L) \ > + VECT_VAR(vector_res, T1, W2, N2) = \ > + vmull##_high_lane_##T2##W(VECT_VAR(vector, T1, W, N1 ), \ > + VECT_VAR(vector2, T1, W, N2), \ > + L); \ > + vst1q_##T2##W2(VECT_VAR(result, T1, W2, N2), VECT_VAR(vector_res, T1, W2, > N2)) > + > +#define CHECK_VMULL_HIGH_LANE_RESULTS(test_name,comment) \ > + { \ > + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ > + } > + > + > + /* With ARM RVCT, we need to declare variables before any executable > + statement */ > + DECL_VARIABLE(vector, int, 16, 8); > + DECL_VARIABLE(vector, int, 32, 4); > + DECL_VARIABLE(vector, uint, 16, 8); > + DECL_VARIABLE(vector, uint, 32, 4); > + DECL_VARIABLE(vector2, int, 16, 4); > + DECL_VARIABLE(vector2, int, 32, 2); > + DECL_VARIABLE(vector2, uint, 16, 4); > + DECL_VARIABLE(vector2, uint, 32, 2); > + > + DECL_VARIABLE(vector_res, int, 32, 4); > + DECL_VARIABLE(vector_res, int, 64, 2); > + DECL_VARIABLE(vector_res, uint, 32, 4); > + DECL_VARIABLE(vector_res, uint, 64, 2); > + > + clean_results (); > + > + /* Initialize vector */ > + VDUP(vector2, , int, s, 16, 4, 0x1000); > + VDUP(vector2, , int, s, 32, 2, 0x1000); > + VDUP(vector2, , uint, u, 16, 4, 0x1000); > + VDUP(vector2, , uint, u, 32, 2, 0x1000); > + > + /* Initialize vector2 */ > + VDUP(vector, q, int, s, 16, 8, 0x4); > + VDUP(vector, q, int, s, 32, 4, 0x2); > + VDUP(vector, q, uint, u, 16, 8, 0x4); > + VDUP(vector, q, uint, u, 32, 4, 0x2); > + > + /* Choose lane arbitrarily */ > + TEST_VMULL_HIGH_LANE(int, s, 16, 32, 8, 4, 2); > + TEST_VMULL_HIGH_LANE(int, s, 32, 64, 4, 2, 1); > + TEST_VMULL_HIGH_LANE(uint, u, 16, 32, 8, 4, 2); > + TEST_VMULL_HIGH_LANE(uint, u, 32, 64, 4, 2, 1); > + > + CHECK_VMULL_HIGH_LANE_RESULTS (TEST_MSG, ""); > +} > + > + > +void exec_vmull_high_laneq (void) > +{ > + /* vector_res = vmull_lane(vector,vector2,lane), then store the result. 
> */ > +#define TEST_VMULL_HIGH_LANEQ(T1, T2, W, W2, N2, N1, L) \ > + VECT_VAR(vector_res, T1, W2, N1) = \ > + vmull##_high_laneq_##T2##W(VECT_VAR(vector, T1, W, N2 ), \ > + VECT_VAR(vector2, T1, W, N2), \ > + L); \ > + vst1q_##T2##W2(VECT_VAR(result, T1, W2, N1), VECT_VAR(vector_res, T1, W2, > N1)) > + > +#define CHECK_VMULL_HIGH_LANEQ_RESULTS(test_name,comment) \ > + { \ > + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ > + } > + > + > + /* With ARM RVCT, we need to declare variables before any executable > + statement */ > + DECL_VARIABLE(vector, int, 16, 8); > + DECL_VARIABLE(vector, int, 32, 4); > + DECL_VARIABLE(vector, uint, 16, 8); > + DECL_VARIABLE(vector, uint, 32, 4); > + DECL_VARIABLE(vector2, int, 16, 8); > + DECL_VARIABLE(vector2, int, 32, 4); > + DECL_VARIABLE(vector2, uint, 16, 8); > + DECL_VARIABLE(vector2, uint, 32, 4); > + > + DECL_VARIABLE(vector_res, int, 32, 4); > + DECL_VARIABLE(vector_res, int, 64, 2); > + DECL_VARIABLE(vector_res, uint, 32, 4); > + DECL_VARIABLE(vector_res, uint, 64, 2); > + > + clean_results (); > + > + /* Initialize vector */ > + VDUP(vector2, q, int, s, 16, 8, 0x1000); > + VDUP(vector2, q, int, s, 32, 4, 0x1000); > + VDUP(vector2, q, uint, u, 16, 8, 0x1000); > + VDUP(vector2, q, uint, u, 32, 4, 0x1000); > + > + /* Initialize vector2 */ > + VDUP(vector, q, int, s, 16, 8, 0x4); > + VDUP(vector, q, int, s, 32, 4, 0x2); > + VDUP(vector, q, uint, u, 16, 8, 0x4); > + VDUP(vector, q, uint, u, 32, 4, 0x2); > + > + /* Choose lane arbitrarily */ > + TEST_VMULL_HIGH_LANEQ(int, s, 16, 32, 8, 4, 2); > + TEST_VMULL_HIGH_LANEQ(int, s, 32, 64, 4, 2, 1); > + TEST_VMULL_HIGH_LANEQ(uint, u, 16, 32, 8, 4, 2); > + TEST_VMULL_HIGH_LANEQ(uint, u, 32, 64, 4, 2, 1); > + > + CHECK_VMULL_HIGH_LANEQ_RESULTS (TEST_MSG, ""); > +} > + > + > + > + > +int main (void) > +{ > + exec_vmull_high_lane(); > + return 0; > +} > > Property changes on: > gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c > ___________________________________________________________________ > Added: svn:executable > + * > > Index: gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c > =================================================================== > --- gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c > (revision 0) > +++ gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c > (revision 0) > @@ -0,0 +1,81 @@ > +#include <arm_neon.h> > +#include "arm-neon-ref.h" > +#include "compute-ref-data.h" > + > + > +/* Expected results. */ > +VECT_VAR_DECL(expected,int,32,4) [] = { 0xfffff73c, 0xfffff7f7, > + 0xfffff8b2, 0xfffff96d }; > +VECT_VAR_DECL(expected,int,64,2) [] = { 0xfffffffffffff4d8, > + 0xfffffffffffff5a4 }; > +VECT_VAR_DECL(expected,uint,32,4) [] = { 0xedf4d8, 0xedf5c6, > + 0xedf6b4, 0xedf7a2 }; > +VECT_VAR_DECL(expected,uint,64,2) [] = { 0xfefffff20e, > + 0xfefffff30d}; > + > +#ifndef INSN_NAME > +#define INSN_NAME vmull_high_n > +#define TEST_MSG "VMULL_HIGH_N" > +#endif > + > +#define FNNAME1(NAME) exec_ ## NAME > +#define FNNAME(NAME) FNNAME1(NAME) > + > +void FNNAME (INSN_NAME) (void) > +{ > +#define DECL_VMUL(T, W, N) \ > + DECL_VARIABLE(vector1, T, W, N); \ > + > + /* vector_res = OP(vector1, vector2), then store the result. 
*/ > +#define TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1, C) \ > + VECT_VAR(vector_res, T1, W1, N1) = \ > + INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \ > + C); \ > + vst1q##_##T2##W1(VECT_VAR(result, T1, W1, N1), \ > + VECT_VAR(vector_res, T1, W1, N1)) > + > +#define TEST_VMULL_HIGH(INSN, Q, T1, T2, W, N, W1, N1, C) \ > + TEST_VMULL_HIGH1(INSN, Q, T1, T2, W, N, W1, N1, C) > + > +#define CHECK_VMULL_HIGH_N_RESULTS(test_name,comment) \ > + { \ > + CHECK(test_name, int, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, int, 64, 2, PRIx64, expected, comment); \ > + CHECK(test_name, uint, 32, 4, PRIx32, expected, comment); \ > + CHECK(test_name, uint, 64, 2, PRIx64, expected, comment); \ > + } > + > + DECL_VMUL(int, 16, 8); > + DECL_VMUL(int, 32, 4); > + DECL_VMUL(uint, 16, 8); > + DECL_VMUL(uint, 32, 4); > + > + DECL_VARIABLE(vector_res, int, 32, 4); > + DECL_VARIABLE(vector_res, int, 64, 2); > + DECL_VARIABLE(vector_res, uint, 32, 4); > + DECL_VARIABLE(vector_res, uint, 64, 2); > + > + clean_results (); > + > + /* Initialize input "vector1" from "buffer". */ > + VLOAD(vector1, buffer, q, int, s, 16, 8); > + VLOAD(vector1, buffer, q, int, s, 32, 4); > + VLOAD(vector1, buffer, q, uint, u, 16, 8); > + VLOAD(vector1, buffer, q, uint, u, 32, 4); > + > + > + /* Execute the tests. */ > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 16, 8, 32, 4, 0xBB); > + TEST_VMULL_HIGH(INSN_NAME, , int, s, 32, 4, 64, 2, 0xCC); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 16, 8, 32, 4, 0xEE); > + TEST_VMULL_HIGH(INSN_NAME, , uint, u, 32, 4, 64, 2, 0xFF); > + > + CHECK_VMULL_HIGH_N_RESULTS (TEST_MSG, ""); > +} > + > +int main (void) > +{ > + FNNAME (INSN_NAME) (); > + > + return 0; > +} > > Property changes on: > gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c > ___________________________________________________________________ > Added: svn:executable > + * > > Index: gcc/testsuite/ChangeLog > =================================================================== > --- gcc/testsuite/ChangeLog (revision 218464) > +++ gcc/testsuite/ChangeLog (working copy) > @@ -1,3 +1,13 @@ > +2014-12-09 Felix Yang <felix.y...@huawei.com> > + Jiji Jiang <jiangj...@huawei.com> > + > + * testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high.c: New > + test. > + * > testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_lane.c: > + New test. > + * testsuite/gcc.target/aarch64/advsimd-intrinsics/vmull_high_n.c: > New > + test. 
> + > 2014-12-07 Christophe Lyon <christophe.l...@linaro.org> > * gcc.target/aarch64/advsimd-intrinsics/vaddhn.c: Actually execute > Index: gcc/config/aarch64/arm_neon.h > =================================================================== > --- gcc/config/aarch64/arm_neon.h (revision 218464) > +++ gcc/config/aarch64/arm_neon.h (working copy) > @@ -7627,671 +7627,6 @@ vmovn_u64 (uint64x2_t a) > return result; > } > -__extension__ static __inline float32x2_t __attribute__ > ((__always_inline__)) > -vmul_n_f32 (float32x2_t a, float32_t b) > -{ > - float32x2_t result; > - __asm__ ("fmul %0.2s,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) > -vmul_n_s16 (int16x4_t a, int16_t b) > -{ > - int16x4_t result; > - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) > -vmul_n_s32 (int32x2_t a, int32_t b) > -{ > - int32x2_t result; > - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint16x4_t __attribute__ > ((__always_inline__)) > -vmul_n_u16 (uint16x4_t a, uint16_t b) > -{ > - uint16x4_t result; > - __asm__ ("mul %0.4h,%1.4h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x2_t __attribute__ > ((__always_inline__)) > -vmul_n_u32 (uint32x2_t a, uint32_t b) > -{ > - uint32x2_t result; > - __asm__ ("mul %0.2s,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -#define vmull_high_lane_s16(a, b, c) \ > - __extension__ \ > - ({ \ > - int16x4_t b_ = (b); \ > - int16x8_t a_ = (a); \ > - int32x4_t result; \ > - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_lane_s32(a, b, c) \ > - __extension__ \ > - ({ \ > - int32x2_t b_ = (b); \ > - int32x4_t a_ = (a); \ > - int64x2_t result; \ > - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_lane_u16(a, b, c) \ > - __extension__ \ > - ({ \ > - uint16x4_t b_ = (b); \ > - uint16x8_t a_ = (a); \ > - uint32x4_t result; \ > - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_lane_u32(a, b, c) \ > - __extension__ \ > - ({ \ > - uint32x2_t b_ = (b); \ > - uint32x4_t a_ = (a); \ > - uint64x2_t result; \ > - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_laneq_s16(a, b, c) \ > - __extension__ \ > - ({ \ > - int16x8_t b_ = (b); \ > - int16x8_t a_ = (a); \ > - int32x4_t result; \ > - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_laneq_s32(a, b, c) \ > - __extension__ \ > - ({ \ > - int32x4_t b_ = (b); \ > - int32x4_t a_ = (a); \ > - int64x2_t result; \ > - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ > - : 
"=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_laneq_u16(a, b, c) \ > - __extension__ \ > - ({ \ > - uint16x8_t b_ = (b); \ > - uint16x8_t a_ = (a); \ > - uint32x4_t result; \ > - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_high_laneq_u32(a, b, c) \ > - __extension__ \ > - ({ \ > - uint32x4_t b_ = (b); \ > - uint32x4_t a_ = (a); \ > - uint64x2_t result; \ > - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmull_high_n_s16 (int16x8_t a, int16_t b) > -{ > - int32x4_t result; > - __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > -vmull_high_n_s32 (int32x4_t a, int32_t b) > -{ > - int64x2_t result; > - __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmull_high_n_u16 (uint16x8_t a, uint16_t b) > -{ > - uint32x4_t result; > - __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > -vmull_high_n_u32 (uint32x4_t a, uint32_t b) > -{ > - uint64x2_t result; > - __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline poly16x8_t __attribute__ > ((__always_inline__)) > -vmull_high_p8 (poly8x16_t a, poly8x16_t b) > -{ > - poly16x8_t result; > - __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > -vmull_high_s8 (int8x16_t a, int8x16_t b) > -{ > - int16x8_t result; > - __asm__ ("smull2 %0.8h,%1.16b,%2.16b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmull_high_s16 (int16x8_t a, int16x8_t b) > -{ > - int32x4_t result; > - __asm__ ("smull2 %0.4s,%1.8h,%2.8h" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > -vmull_high_s32 (int32x4_t a, int32x4_t b) > -{ > - int64x2_t result; > - __asm__ ("smull2 %0.2d,%1.4s,%2.4s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint16x8_t __attribute__ > ((__always_inline__)) > -vmull_high_u8 (uint8x16_t a, uint8x16_t b) > -{ > - uint16x8_t result; > - __asm__ ("umull2 %0.8h,%1.16b,%2.16b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmull_high_u16 (uint16x8_t a, uint16x8_t b) > -{ > - uint32x4_t result; > - __asm__ ("umull2 %0.4s,%1.8h,%2.8h" > - : "=w"(result) > - : "w"(a), "w"(b) > - : 
/* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > -vmull_high_u32 (uint32x4_t a, uint32x4_t b) > -{ > - uint64x2_t result; > - __asm__ ("umull2 %0.2d,%1.4s,%2.4s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -#define vmull_lane_s16(a, b, c) \ > - __extension__ \ > - ({ \ > - int16x4_t b_ = (b); \ > - int16x4_t a_ = (a); \ > - int32x4_t result; \ > - __asm__ ("smull %0.4s,%1.4h,%2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_lane_s32(a, b, c) \ > - __extension__ \ > - ({ \ > - int32x2_t b_ = (b); \ > - int32x2_t a_ = (a); \ > - int64x2_t result; \ > - __asm__ ("smull %0.2d,%1.2s,%2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_lane_u16(a, b, c) \ > - __extension__ \ > - ({ \ > - uint16x4_t b_ = (b); \ > - uint16x4_t a_ = (a); \ > - uint32x4_t result; \ > - __asm__ ("umull %0.4s,%1.4h,%2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_lane_u32(a, b, c) \ > - __extension__ \ > - ({ \ > - uint32x2_t b_ = (b); \ > - uint32x2_t a_ = (a); \ > - uint64x2_t result; \ > - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_laneq_s16(a, b, c) \ > - __extension__ \ > - ({ \ > - int16x8_t b_ = (b); \ > - int16x4_t a_ = (a); \ > - int32x4_t result; \ > - __asm__ ("smull %0.4s, %1.4h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_laneq_s32(a, b, c) \ > - __extension__ \ > - ({ \ > - int32x4_t b_ = (b); \ > - int32x2_t a_ = (a); \ > - int64x2_t result; \ > - __asm__ ("smull %0.2d, %1.2s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_laneq_u16(a, b, c) \ > - __extension__ \ > - ({ \ > - uint16x8_t b_ = (b); \ > - uint16x4_t a_ = (a); \ > - uint32x4_t result; \ > - __asm__ ("umull %0.4s, %1.4h, %2.h[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "x"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmull_laneq_u32(a, b, c) \ > - __extension__ \ > - ({ \ > - uint32x4_t b_ = (b); \ > - uint32x2_t a_ = (a); \ > - uint64x2_t result; \ > - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmull_n_s16 (int16x4_t a, int16_t b) > -{ > - int32x4_t result; > - __asm__ ("smull %0.4s,%1.4h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > -vmull_n_s32 (int32x2_t a, int32_t b) > -{ > - int64x2_t result; > - __asm__ ("smull %0.2d,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmull_n_u16 (uint16x4_t a, uint16_t b) > -{ > - uint32x4_t result; > - __asm__ ("umull %0.4s,%1.4h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} 
> - > -__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > -vmull_n_u32 (uint32x2_t a, uint32_t b) > -{ > - uint64x2_t result; > - __asm__ ("umull %0.2d,%1.2s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline poly16x8_t __attribute__ > ((__always_inline__)) > -vmull_p8 (poly8x8_t a, poly8x8_t b) > -{ > - poly16x8_t result; > - __asm__ ("pmull %0.8h, %1.8b, %2.8b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > -vmull_s8 (int8x8_t a, int8x8_t b) > -{ > - int16x8_t result; > - __asm__ ("smull %0.8h, %1.8b, %2.8b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmull_s16 (int16x4_t a, int16x4_t b) > -{ > - int32x4_t result; > - __asm__ ("smull %0.4s, %1.4h, %2.4h" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > -vmull_s32 (int32x2_t a, int32x2_t b) > -{ > - int64x2_t result; > - __asm__ ("smull %0.2d, %1.2s, %2.2s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint16x8_t __attribute__ > ((__always_inline__)) > -vmull_u8 (uint8x8_t a, uint8x8_t b) > -{ > - uint16x8_t result; > - __asm__ ("umull %0.8h, %1.8b, %2.8b" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmull_u16 (uint16x4_t a, uint16x4_t b) > -{ > - uint32x4_t result; > - __asm__ ("umull %0.4s, %1.4h, %2.4h" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > -vmull_u32 (uint32x2_t a, uint32x2_t b) > -{ > - uint64x2_t result; > - __asm__ ("umull %0.2d, %1.2s, %2.2s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float32x4_t __attribute__ > ((__always_inline__)) > -vmulq_n_f32 (float32x4_t a, float32_t b) > -{ > - float32x4_t result; > - __asm__ ("fmul %0.4s,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float64x2_t __attribute__ > ((__always_inline__)) > -vmulq_n_f64 (float64x2_t a, float64_t b) > -{ > - float64x2_t result; > - __asm__ ("fmul %0.2d,%1.2d,%2.d[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > -vmulq_n_s16 (int16x8_t a, int16_t b) > -{ > - int16x8_t result; > - __asm__ ("mul %0.8h,%1.8h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > -vmulq_n_s32 (int32x4_t a, int32_t b) > -{ > - int32x4_t result; > - __asm__ ("mul %0.4s,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint16x8_t __attribute__ > ((__always_inline__)) > -vmulq_n_u16 (uint16x8_t a, uint16_t b) > -{ > - uint16x8_t 
result; > - __asm__ ("mul %0.8h,%1.8h,%2.h[0]" > - : "=w"(result) > - : "w"(a), "x"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > -vmulq_n_u32 (uint32x4_t a, uint32_t b) > -{ > - uint32x4_t result; > - __asm__ ("mul %0.4s,%1.4s,%2.s[0]" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float32x2_t __attribute__ > ((__always_inline__)) > -vmulx_f32 (float32x2_t a, float32x2_t b) > -{ > - float32x2_t result; > - __asm__ ("fmulx %0.2s,%1.2s,%2.2s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -#define vmulx_lane_f32(a, b, c) \ > - __extension__ \ > - ({ \ > - float32x4_t b_ = (b); \ > - float32x2_t a_ = (a); \ > - float32x2_t result; \ > - __asm__ ("fmulx %0.2s,%1.2s,%2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) > -vmulxd_f64 (float64_t a, float64_t b) > -{ > - float64_t result; > - __asm__ ("fmulx %d0, %d1, %d2" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float32x4_t __attribute__ > ((__always_inline__)) > -vmulxq_f32 (float32x4_t a, float32x4_t b) > -{ > - float32x4_t result; > - __asm__ ("fmulx %0.4s,%1.4s,%2.4s" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -__extension__ static __inline float64x2_t __attribute__ > ((__always_inline__)) > -vmulxq_f64 (float64x2_t a, float64x2_t b) > -{ > - float64x2_t result; > - __asm__ ("fmulx %0.2d,%1.2d,%2.2d" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > -#define vmulxq_lane_f32(a, b, c) \ > - __extension__ \ > - ({ \ > - float32x4_t b_ = (b); \ > - float32x4_t a_ = (a); \ > - float32x4_t result; \ > - __asm__ ("fmulx %0.4s,%1.4s,%2.s[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -#define vmulxq_lane_f64(a, b, c) \ > - __extension__ \ > - ({ \ > - float64x2_t b_ = (b); \ > - float64x2_t a_ = (a); \ > - float64x2_t result; \ > - __asm__ ("fmulx %0.2d,%1.2d,%2.d[%3]" \ > - : "=w"(result) \ > - : "w"(a_), "w"(b_), "i"(c) \ > - : /* No clobbers */); \ > - result; \ > - }) > - > -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) > -vmulxs_f32 (float32_t a, float32_t b) > -{ > - float32_t result; > - __asm__ ("fmulx %s0, %s1, %s2" > - : "=w"(result) > - : "w"(a), "w"(b) > - : /* No clobbers */); > - return result; > -} > - > __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) > vmvn_p8 (poly8x8_t a) > { > @@ -19172,6 +18507,78 @@ vmul_n_f64 (float64x1_t __a, float64_t __b) > return (float64x1_t) { vget_lane_f64 (__a, 0) * __b }; > } > +__extension__ static __inline float32x2_t __attribute__ > ((__always_inline__)) > +vmul_n_f32 (float32x2_t __a, float32_t __b) > +{ > + return __builtin_aarch64_mul_nv2sf (__a, __b); > +} > + > +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) > +vmul_n_s16 (int16x4_t __a, int16_t __b) > +{ > + return __builtin_aarch64_mul_nv4hi (__a, __b); > +} > + > +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) > +vmul_n_s32 (int32x2_t __a, int32_t __b) > +{ > + return __builtin_aarch64_mul_nv2si (__a, __b); > +} > + > 
+__extension__ static __inline uint16x4_t __attribute__ > ((__always_inline__)) > +vmul_n_u16 (uint16x4_t __a, uint16_t __b) > +{ > + return (uint16x4_t) __builtin_aarch64_mul_nv4hi ((int16x4_t)__a, > + (int16_t)__b); > +} > + > +__extension__ static __inline uint32x2_t __attribute__ > ((__always_inline__)) > +vmul_n_u32 (uint32x2_t __a, uint32_t __b) > +{ > + return (uint32x2_t) __builtin_aarch64_mul_nv2si ((int32x2_t)__a, > + (int32_t)__b); > +} > + > +/* vmulq_n */ > + > +__extension__ static __inline float32x4_t __attribute__ > ((__always_inline__)) > +vmulq_n_f32 (float32x4_t __a, float32_t __b) > +{ > + return __builtin_aarch64_mul_nv4sf (__a, __b); > +} > + > +__extension__ static __inline float64x2_t __attribute__ > ((__always_inline__)) > +vmulq_n_f64 (float64x2_t __a, float64_t __b) > +{ > + return __builtin_aarch64_mul_nv2df (__a, __b); > +} > + > +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > +vmulq_n_s16 (int16x8_t __a, int16_t __b) > +{ > + return __builtin_aarch64_mul_nv8hi (__a, __b); > +} > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmulq_n_s32 (int32x4_t __a, int32_t __b) > +{ > + return __builtin_aarch64_mul_nv4si (__a, __b); > +} > + > +__extension__ static __inline uint16x8_t __attribute__ > ((__always_inline__)) > +vmulq_n_u16 (uint16x8_t __a, uint16_t __b) > +{ > + return (uint16x8_t) __builtin_aarch64_mul_nv8hi ((int16x8_t)__a, > + (int16_t)__b); > +} > + > +__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > +vmulq_n_u32 (uint32x4_t __a, uint32_t __b) > +{ > + return (uint32x4_t) __builtin_aarch64_mul_nv4si ((int32x4_t)__a, > + (int32_t)__b); > +} > + > /* vmulq_lane */ > __extension__ static __inline float32x4_t __attribute__ > ((__always_inline__)) > @@ -19249,6 +18656,308 @@ vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, c > return __a * __aarch64_vgetq_lane_u32 (__b, __lane); > } > +/* vmull_high_lane */ > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) > +{ > + return __builtin_aarch64_smull2_lanev8hi (__a, __b, __c); > +} > + > +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > +vmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) > +{ > + return __builtin_aarch64_smull2_lanev4si (__a, __b, __c); > +} > + > +__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > +vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c) > +{ > + return (uint32x4_t) __builtin_aarch64_umull2_lanev8hi ((int16x8_t) __a, > + (int16x4_t) __b, > + __c); > +} > + > +__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > +vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c) > +{ > + return (uint64x2_t) __builtin_aarch64_umull2_lanev4si ((int32x4_t) __a, > + (int32x2_t) __b, > + __c); > +} > + > +/* vmull_high_laneq */ > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) > +{ > + return __builtin_aarch64_smull2_laneqv8hi (__a, __b, __c); > +} > + > +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > +vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) > +{ > + return __builtin_aarch64_smull2_laneqv4si (__a, __b, __c); > +} > + > +__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > 
+vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) > +{ > + return (uint32x4_t) __builtin_aarch64_umull2_laneqv8hi ((int16x8_t)__a, > + (int16x8_t)__b, > + __c); > +} > + > +__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > +vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) > +{ > + return (uint64x2_t) __builtin_aarch64_umull2_laneqv4si ((int32x4_t) __a, > + (int32x4_t) __b, > + __c); > +} > + > +/* vmull_high_n */ > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmull_high_n_s16 (int16x8_t __a, int16_t __b) > +{ > + return __builtin_aarch64_smull2_nv8hi (__a, __b); > +} > + > +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) > +vmull_high_n_s32 (int32x4_t __a, int32_t __b) > +{ > + return __builtin_aarch64_smull2_nv4si (__a, __b); > +} > + > +__extension__ static __inline uint32x4_t __attribute__ > ((__always_inline__)) > +vmull_high_n_u16 (uint16x8_t __a, uint16_t __b) > +{ > + return __builtin_aarch64_umull2_nv8hi_uuu (__a, __b); > +} > + > +__extension__ static __inline uint64x2_t __attribute__ > ((__always_inline__)) > +vmull_high_n_u32 (uint32x4_t __a, uint32_t __b) > +{ > + return __builtin_aarch64_umull2_nv4si_uuu (__a, __b); > +} > + > +/* vmull_high */ > + > +__extension__ static __inline poly16x8_t __attribute__ > ((__always_inline__)) > +vmull_high_p8 (poly8x16_t __a, poly8x16_t __b) > +{ > + return __builtin_aarch64_pmull2v16qi_ppp (__a, __b); > +} > + > +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) > +vmull_high_s8 (int8x16_t __a, int8x16_t __b) > +{ > + return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b); > +} > + > +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) > +vmull_high_s16 (int16x8_t __a, int16x8_t __b) > +{ > + return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);