Hi,
This patch adds the missing AdvSIMD intrinsics vmlsq_laneq_{f32,s16,s32,u16,u32}
to arm_neon.h, together with an execution test for them. OK?
Thanks,
Tejas Belagod
ARM.
Changelog:
2012-06-14 Tejas Belagod <tejas.bela...@arm.com>
gcc/
* config/aarch64/arm_neon.h (vmlsq_laneq_f32, vmlsq_laneq_s16,
vmlsq_laneq_u16, vmlsq_laneq_s32, vmlsq_laneq_u32): New.
gcc/testsuite/
* gcc.target/aarch64/vmlsq_laneq.c: New.
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 58976cc..3b581bd 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -11151,6 +11151,77 @@ vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
result; \
})
+#define vmlsq_laneq_f32(__a, __b, __c, __d) \
+ __extension__ /* Per lane: __a - __b * (lane __d of __c); __d is an "i" (immediate) operand.  */ \
+ ({ \
+ float32x4_t __c_ = (__c); \
+ float32x4_t __b_ = (__b); \
+ float32x4_t __a_ = (__a); \
+ float32x4_t __result; \
+ float32x4_t __t1; /* Receives the fmul product that the following fsub consumes.  */ \
+ __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fsub %0.4s, %0.4s, %1.4s" \
+ : "=w"(__result), "=w"(__t1) \
+ : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) /* "0" ties __a_ to __result.  */ \
+ : /* No clobbers */); \
+ __result; \
+ })
+
+#define vmlsq_laneq_s16(__a, __b, __c, __d) \
+ __extension__ /* Per 16-bit lane: __a - __b * (lane __d of __c); __d is an "i" (immediate) operand.  */ \
+ ({ \
+ int16x8_t __c_ = (__c); \
+ int16x8_t __b_ = (__b); \
+ int16x8_t __a_ = (__a); \
+ int16x8_t __result; \
+ __asm__ ("mls %0.8h, %2.8h, %3.h[%4]" \
+ : "=w"(__result) \
+ : "0"(__a_), "w"(__b_), "x"(__c_), "i"(__d) /* "x": the .h[lane] operand can only name V0-V15.  */ \
+ : /* No clobbers */); \
+ __result; \
+ })
+
+#define vmlsq_laneq_s32(__a, __b, __c, __d) \
+ __extension__ /* Per 32-bit lane: __a - __b * (lane __d of __c); __d is an "i" (immediate) operand.  */ \
+ ({ \
+ int32x4_t __c_ = (__c); \
+ int32x4_t __b_ = (__b); \
+ int32x4_t __a_ = (__a); \
+ int32x4_t __result; \
+ __asm__ ("mls %0.4s, %2.4s, %3.s[%4]" \
+ : "=w"(__result) \
+ : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) /* "0" ties __a_ to __result.  */ \
+ : /* No clobbers */); \
+ __result; \
+ })
+
+#define vmlsq_laneq_u16(__a, __b, __c, __d) \
+ __extension__ /* Per 16-bit lane: __a - __b * (lane __d of __c); __d is an "i" (immediate) operand.  */ \
+ ({ \
+ uint16x8_t __c_ = (__c); \
+ uint16x8_t __b_ = (__b); \
+ uint16x8_t __a_ = (__a); \
+ uint16x8_t __result; \
+ __asm__ ("mls %0.8h, %2.8h, %3.h[%4]" \
+ : "=w"(__result) \
+ : "0"(__a_), "w"(__b_), "x"(__c_), "i"(__d) /* "x": the .h[lane] operand can only name V0-V15.  */ \
+ : /* No clobbers */); \
+ __result; \
+ })
+
+#define vmlsq_laneq_u32(__a, __b, __c, __d) \
+ __extension__ /* Per 32-bit lane: __a - __b * (lane __d of __c); __d is an "i" (immediate) operand.  */ \
+ ({ \
+ uint32x4_t __c_ = (__c); \
+ uint32x4_t __b_ = (__b); \
+ uint32x4_t __a_ = (__a); \
+ uint32x4_t __result; \
+ __asm__ ("mls %0.4s, %2.4s, %3.s[%4]" \
+ : "=w"(__result) \
+ : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) /* "0" ties __a_ to __result.  */ \
+ : /* No clobbers */); \
+ __result; \
+ })
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
{
diff --git a/gcc/testsuite/gcc.target/aarch64/vmlsq_laneq.c b/gcc/testsuite/gcc.target/aarch64/vmlsq_laneq.c
new file mode 100644
index 0000000..dd3fb81
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vmlsq_laneq.c
@@ -0,0 +1,158 @@
+
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+void
+test1 ()
+{
+  /* vmlsq_laneq_s16 using lane 6 of the last vector operand; the 128-bit
+     result is compared as two 64-bit elements.  */
+  int16x8_t minuend, factor, scalars, diff;
+  uint64x2_t bits;
+
+  minuend = vcombine_s16 (vcreate_s16 (UINT64_C (0xffff9ab680000000)),
+                          vcreate_s16 (UINT64_C (0x00000000ffff0000)));
+  factor = vcombine_s16 (vcreate_s16 (UINT64_C (0x32b77fffffff7fff)),
+                         vcreate_s16 (UINT64_C (0x0000ffff00007fff)));
+  scalars = vcombine_s16 (vcreate_s16 (UINT64_C (0x7fff00007fff0000)),
+                          vcreate_s16 (UINT64_C (0x80007fff00000000)));
+  diff = vmlsq_laneq_s16 (minuend, factor, scalars, 6);
+  bits = vreinterpretq_u64_s16 (diff);
+
+  /* First 64-bit element.  */
+  if (vgetq_lane_u64 (bits, 0) != UINT64_C (0xb2b69ab5ffffffff))
+    abort ();
+
+  /* Second 64-bit element.  */
+  if (vgetq_lane_u64 (bits, 1) != UINT64_C (0x00007fffffffffff))
+    abort ();
+}
+
+void
+test2 ()
+{
+  /* vmlsq_laneq_s32 using lane 3 of the last vector operand; the 128-bit
+     result is compared as two 64-bit elements.  */
+  int32x4_t minuend, factor, scalars, diff;
+  uint64x2_t bits;
+
+  minuend = vcombine_s32 (vcreate_s32 (UINT64_C (0x00008000f46f7fff)),
+                          vcreate_s32 (UINT64_C (0x7fffffffffff8000)));
+  factor = vcombine_s32 (vcreate_s32 (UINT64_C (0x7fff7fff0e700000)),
+                         vcreate_s32 (UINT64_C (0xffff000080000000)));
+  scalars = vcombine_s32 (vcreate_s32 (UINT64_C (0x00000000ffff0000)),
+                          vcreate_s32 (UINT64_C (0xd9edea1a8000fb28)));
+  diff = vmlsq_laneq_s32 (minuend, factor, scalars, 3);
+  bits = vreinterpretq_u64_s32 (diff);
+
+  /* First 64-bit element.  */
+  if (vgetq_lane_u64 (bits, 0) != UINT64_C (0xcefb6a1a1d0f7fff))
+    abort ();
+
+  /* Second 64-bit element.  */
+  if (vgetq_lane_u64 (bits, 1) != UINT64_C (0x6a19ffffffff8000))
+    abort ();
+}
+
+void
+test3 ()
+{
+  /* vmlsq_laneq_u16 using lane 7 of the last vector operand; the 128-bit
+     result is compared as two 64-bit elements.  */
+  uint16x8_t minuend, factor, scalars, diff;
+  uint64x2_t bits;
+
+  minuend = vcombine_u16 (vcreate_u16 (UINT64_C (0x000080008000802a)),
+                          vcreate_u16 (UINT64_C (0x7fffffff00007fff)));
+  factor = vcombine_u16 (vcreate_u16 (UINT64_C (0x7fffcdf1ffff0000)),
+                         vcreate_u16 (UINT64_C (0xe2550000ffffffff)));
+  scalars = vcombine_u16 (vcreate_u16 (UINT64_C (0x80007fff80000000)),
+                          vcreate_u16 (UINT64_C (0xbe2100007fffffff)));
+
+  diff = vmlsq_laneq_u16 (minuend, factor, scalars, 7);
+  bits = vreinterpretq_u64_u16 (diff);
+
+  /* First 64-bit element.  */
+  if (vgetq_lane_u64 (bits, 0) != UINT64_C (0x3e2115ef3e21802a))
+    abort ();
+
+  /* Second 64-bit element.  */
+  if (vgetq_lane_u64 (bits, 1) != UINT64_C (0x3d0affffbe213e20))
+    abort ();
+}
+
+void
+test4 ()
+{
+  /* vmlsq_laneq_u32 using lane 2 of the last vector operand; the 128-bit
+     result is compared as two 64-bit elements.  */
+  uint32x4_t minuend, factor, scalars, diff;
+  uint64x2_t bits;
+
+  minuend = vcombine_u32 (vcreate_u32 (UINT64_C (0x3295fe3d7fff7fff)),
+                          vcreate_u32 (UINT64_C (0x7fff00007fff7fff)));
+  factor = vcombine_u32 (vcreate_u32 (UINT64_C (0xffff7fff7fff8000)),
+                         vcreate_u32 (UINT64_C (0x7fff80008000ffff)));
+  scalars = vcombine_u32 (vcreate_u32 (UINT64_C (0x7fff7fff80008000)),
+                          vcreate_u32 (UINT64_C (0x0000800053ab7fff)));
+
+  diff = vmlsq_laneq_u32 (minuend, factor, scalars, 2);
+  bits = vreinterpretq_u64_u32 (diff);
+
+  /* First 64-bit element.  */
+  if (vgetq_lane_u64 (bits, 0) != UINT64_C (0x4640fe3cbffeffff))
+    abort ();
+
+  /* Second 64-bit element.  */
+  if (vgetq_lane_u64 (bits, 1) != UINT64_C (0xbffe8000d3abfffe))
+    abort ();
+}
+
+void
+test5 ()
+{
+  /* vmlsq_laneq_f32 using lane 0 of the last vector operand.  Each of the
+     four result lanes is read back and compared for exact equality with
+     its expected single-precision value.  */
+  float32x4_t minuend, factor, scalars, diff;
+
+  minuend = vcombine_f32 (vcreate_f32 (UINT64_C (0x3f49daf03ef3dc73)),
+                          vcreate_f32 (UINT64_C (0x3f5d467a3ef3dc73)));
+  factor = vcombine_f32 (vcreate_f32 (UINT64_C (0x3d2064c83d10cd28)),
+                         vcreate_f32 (UINT64_C (0x3ea7d1a23d10cd28)));
+  scalars = vcombine_f32 (vcreate_f32 (UINT64_C (0x3f6131993edb1e04)),
+                          vcreate_f32 (UINT64_C (0x3f37f4bf3edb1e04)));
+  diff = vmlsq_laneq_f32 (minuend, factor, scalars, 0);
+
+  /* Lane 0.  */
+  if (vgetq_lane_f32 (diff, 0) != 0.46116194128990173f)
+    abort ();
+
+  /* Lane 1.  */
+  if (vgetq_lane_f32 (diff, 1) != 0.7717385292053223f)
+    abort ();
+
+  /* Lane 2.  */
+  if (vgetq_lane_f32 (diff, 2) != 0.46116194128990173f)
+    abort ();
+
+  /* Lane 3.  */
+  if (vgetq_lane_f32 (diff, 3) != 0.7240825295448303f)
+    abort ();
+}
+
+int
+main (void)
+{
+  /* Integer variants, in order: s16, s32, u16, u32.  */
+  test1 (), test2 (), test3 (), test4 ();
+
+  /* Single-precision variant.  */
+  test5 ();
+
+  return 0;
+}