For the following example using the Adv. SIMD/SVE bridge:
#include <arm_neon.h>
#include <arm_neon_sve_bridge.h>
#include <stdint.h>
svint16_t sub_neon_i16_sve_bridged(svint8_t a, svint8_t b) {
return svset_neonq_s16(svundef_s16(),
vsubq_s16(vmovl_high_s8(svget_neonq(a)),
vmovl_high_s8(svget_neonq(b))));
}
we generate:
sub_neon_i16_sve_bridged(__SVInt8_t, __SVInt8_t):
sxtl2 v0.8h, v0.16b
ssubw2 v0.8h, v0.8h, v1.16b
ret
instead of just:
sub_neon_i16_sve_bridged(__SVInt8_t, __SVInt8_t):
ssubl2 v0.8h, v0.16b, v1.16b
ret
Commit g:abf865732a7313cf79ffa325faed3467ed28d8b8 added a framework to fold
uses of intrinsics combined with lo/hi extractions into the appropriate lowpart
or highpart instructions.
However, this doesn't trigger because the Adv. SIMD-from-SVE extraction code for
vmovl_high_s8(svget_neonq(a))
does not have one argument as a constant, and the framework only supports
folding 2 insns into 1, not 3.
In RTL the above generates:
(insn 7 4 8 2 (set (reg:V8QI 103 [ _6 ])
(vec_select:V8QI (subreg:V16QI (reg/v:VNx16QI 109 [ a ]) 0)
(parallel:V16QI [
(const_int 8 [0x8])
(const_int 9 [0x9])
(const_int 10 [0xa])
(const_int 11 [0xb])
(const_int 12 [0xc])
(const_int 13 [0xd])
(const_int 14 [0xe])
(const_int 15 [0xf])
]))) "":3174:43 -1
(nil))
Since the SVE and the Adv. SIMD modes are tieable, this is a valid instruction
to make; however it's suboptimal in that we can't fold it into the existing
instruction patterns.  Eventually early-ra will split off the SVE reg from the
patterns, but by then we're past combine and insn foldings, so we miss all the
optimizations.
This patch introduces vec_extract optabs for 128-bit and 64-bit Adv. SIMD
vector extraction from SVE registers and emits an explicit, separate
instruction for the subregs.  This gives combine and RTL folding the
opportunity to form the combined instructions, and if they can't we still
arrive at the same RTL after early-ra.
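
As an illustration, a minimal sketch of a 64-bit high-half extraction through
the bridge, of the kind the new offset-1 path is meant to handle (the function
name is made up for the example; the new tests below cover the same shapes):

#include <arm_neon.h>
#include <arm_neon_sve_bridge.h>

/* Widen the high 64-bit half of the low 128 bits of an SVE vector.  With the
   explicit vec_extract expansion this should fold into a single sxtl2.  */
svint16_t widen_high_i8_sve_bridged(svint8_t a) {
  return svset_neonq_s16(svundef_s16(),
                         vmovl_s8(vget_high_s8(svget_neonq(a))));
}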
Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* config/aarch64/aarch64-sve.md (vec_extract<mode><v128>,
vec_extract<mode><v64>): New.
* config/aarch64/iterators.md (V64, v64): New.
* config/aarch64/predicates.md (const0_to_1_operand): New.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/fold_to_highpart_6.c: Update codegen.
* gcc.target/aarch64/sve/fold_to_highpart_1.c: New test.
* gcc.target/aarch64/sve/fold_to_highpart_2.c: New test.
---
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index f459f63d6bb248a53d1e5199646fbb6cf3c9759b..780fb76164f9f26389d5c7a9c6e4ca3e63a63ebc 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3112,6 +3112,48 @@ (define_expand "vec_extract<mode><Vel>"
}
)
+;; Don't allow expansions of SVE to Adv. SIMD registers immediately as subregs.
+;; Doing so prevents combine from matching instructions generated by the
+;; SVE/Adv. SIMD bridge as the SVE modes are not valid inside the instructions.
+;; Eventually early-ra or reload will split them but by then we've lost the
+;; combinations. Instead split them early and allow forwardprop or combine to
+;; push them into instructions where they are actually supported as part of the
+;; instruction.
+(define_expand "vec_extract<mode><v128>"
+ [(match_operand:<V128> 0 "register_operand")
+ (match_operand:SVE_FULL 1 "register_operand")
+ (match_operand:SI 2 "const0_operand")]
+ "TARGET_SVE"
+{
+ emit_move_insn (operands[0],
+ force_lowpart_subreg (<V128>mode, operands[1], <MODE>mode));
+ DONE;
+})
+
+;; Similarly for extractions of 64-bit Adv. SIMD vectors from SVE vectors. For
+;; these extractions we can support offsets 0 and 1 by first extracting a
+;; 128-bit vector and then selecting the appropriate half.
+(define_expand "vec_extract<mode><v64>"
+ [(match_operand:<V64> 0 "register_operand")
+ (match_operand:SVE_FULL_BHS 1 "register_operand")
+ (match_operand:SI 2 "const0_to_1_operand")]
+ "TARGET_SVE"
+{
+ if (const0_rtx == operands[2])
+ emit_move_insn (operands[0],
+ force_lowpart_subreg (<V64>mode, operands[1],
+ <MODE>mode));
+ else
+ {
+ rtx tmp = gen_reg_rtx (<V128>mode);
+ emit_move_insn (tmp,
+ force_lowpart_subreg (<V128>mode, operands[1],
+ <MODE>mode));
+ emit_insn (gen_vec_extract<v128><v64> (operands[0], tmp, operands[2]));
+ }
+ DONE;
+})
+
;; Extract element zero. This is a special case because we want to force
;; the registers to be the same for the second alternative, and then
;; split the instruction into nothing after RA.
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 517b2808b5f725db81709122848817aaafff1f34..9248f038bf42a8eb4297914347cde33fd0f1d5c9 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1816,6 +1816,18 @@ (define_mode_attr v128 [(VNx16QI "v16qi")
(VNx4SI "v4si") (VNx4SF "v4sf")
(VNx2DI "v2di") (VNx2DF "v2df")])
+;; Gives the mode of the 64-bit lowpart of an SVE vector.
+(define_mode_attr V64 [(VNx16QI "V8QI")
+ (VNx8HI "V4HI") (VNx8HF "V4HF") (VNx8BF "V4BF")
+ (VNx4SI "V2SI") (VNx4SF "V2SF")
+ (VNx2DI "DI") (VNx2DF "DF")])
+
+;; ...and again in lower case.
+(define_mode_attr v64 [(VNx16QI "v8qi")
+ (VNx8HI "v4hi") (VNx8HF "v4hf") (VNx8BF "v4bf")
+ (VNx4SI "v2si") (VNx4SF "v2sf")
+ (VNx2DI "di") (VNx2DF "df")])
+
(define_mode_attr vnx [(V4SI "vnx4si") (V2DI "vnx2di")])
;; 64-bit container modes the inner or scalar source mode.
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 42304cef4391e15598bcd22da590c8663f3ffaa5..7a147294150357ed3484609cac2b5acdf2638e52 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -46,6 +46,10 @@ (define_predicate "const0_operand"
(and (match_code "const_int")
(match_test "op == CONST0_RTX (mode)")))
+(define_predicate "const0_to_1_operand"
+ (and (match_code "const_int")
+ (match_test "IN_RANGE (INTVAL (op), 0, 1)")))
+
(define_predicate "const_0_to_7_operand"
(and (match_code "const_int")
(match_test "IN_RANGE (INTVAL (op), 0, 7)")))
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
index 3570d4da34b5ebb4ae507d57b506f8cd87b76ba8..83ef2148fd84f806c243226f2bb4f4b6b1fdba7a 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
@@ -1,6 +1,7 @@
/* { dg-do compile } */
/* { dg-require-effective-target aarch64_little_endian } */
/* { dg-options "-O2 -march=armv8-a+sve" } */
+/* { dg-final { check-function-bodies "**" "" } } */
#include <arm_neon_sve_bridge.h>
@@ -16,6 +17,11 @@ test_addressable ()
return vmovl_s8 (vget_high_s8 (z));
}
+/*
+** test_scalable_type:
+** sxtl2 v0.8h, v0.16b
+** ret
+*/
int16x8_t
test_scalable_type (svint8_t scalable)
{
@@ -34,4 +40,5 @@ test_256b_type (int16x16_t foo)
return vmovl_s16 ((int16x4_t) { foo[4], foo[5], foo[6], foo[7] });
}
-/* { dg-final { scan-assembler-not {sxtl2\t} } } */
+/* { dg-final { scan-assembler-times {sxtl2\t} 1 } } */
+/* { dg-final { scan-assembler-times {sxtl\t} 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..a3d59a498bf6152e16f43271da1b400a79f84959
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+#include <arm_neon_sve_bridge.h>
+#include <stdint.h>
+
+/*
+** sub_neon_i16_sve_bridged:
+** ssubl2 v0.8h, v0.16b, v1.16b
+** ret
+*/
+svint16_t sub_neon_i16_sve_bridged(svint8_t a, svint8_t b) {
+ return svset_neonq_s16(svundef_s16(),
+ vsubq_s16(vmovl_high_s8(svget_neonq(a)),
+ vmovl_high_s8(svget_neonq(b))));
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..6cca4adb86513970b53d6b102b53d10b78401d63
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c
@@ -0,0 +1,295 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+
+// ============================================================================
+// 8 -> 16 : SIGNED
+// ============================================================================
+
+/*
+** add_neon_i16_from_i8_low_sve_bridged:
+** saddl v0.8h, v0.8b, v1.8b
+** ret
+*/
+svint16_t add_neon_i16_from_i8_low_sve_bridged(svint8_t a, svint8_t b) {
+ int16x8_t ar = vmovl_s8(vget_low_s8(svget_neonq(a)));
+ int16x8_t br = vmovl_s8(vget_low_s8(svget_neonq(b)));
+ return svset_neonq_s16(svundef_s16(), vaddq_s16(ar, br));
+}
+
+/*
+** add_neon_i16_from_i8_high_sve_bridged:
+** saddl2 v0.8h, v0.16b, v1.16b
+** ret
+*/
+svint16_t add_neon_i16_from_i8_high_sve_bridged(svint8_t a, svint8_t b) {
+ int16x8_t ar = vmovl_s8(vget_high_s8(svget_neonq(a)));
+ int16x8_t br = vmovl_s8(vget_high_s8(svget_neonq(b)));
+ return svset_neonq_s16(svundef_s16(), vaddq_s16(ar, br));
+}
+
+/*
+** sub_neon_i16_from_i8_low_sve_bridged:
+** ssubl v0.8h, v0.8b, v1.8b
+** ret
+*/
+svint16_t sub_neon_i16_from_i8_low_sve_bridged(svint8_t a, svint8_t b) {
+ int16x8_t ar = vmovl_s8(vget_low_s8(svget_neonq(a)));
+ int16x8_t br = vmovl_s8(vget_low_s8(svget_neonq(b)));
+ return svset_neonq_s16(svundef_s16(), vsubq_s16(ar, br));
+}
+
+/*
+** sub_neon_i16_from_i8_high_sve_bridged:
+** ssubl2 v0.8h, v0.16b, v1.16b
+** ret
+*/
+svint16_t sub_neon_i16_from_i8_high_sve_bridged(svint8_t a, svint8_t b) {
+ int16x8_t ar = vmovl_s8(vget_high_s8(svget_neonq(a)));
+ int16x8_t br = vmovl_s8(vget_high_s8(svget_neonq(b)));
+ return svset_neonq_s16(svundef_s16(), vsubq_s16(ar, br));
+}
+
+// ============================================================================
+// 8 -> 16 : UNSIGNED
+// ============================================================================
+
+/*
+** add_neon_u16_from_u8_low_sve_bridged:
+** uaddl v0.8h, v0.8b, v1.8b
+** ret
+*/
+svuint16_t add_neon_u16_from_u8_low_sve_bridged(svuint8_t a, svuint8_t b) {
+ uint16x8_t ar = vmovl_u8(vget_low_u8(svget_neonq(a)));
+ uint16x8_t br = vmovl_u8(vget_low_u8(svget_neonq(b)));
+ return svset_neonq_u16(svundef_u16(), vaddq_u16(ar, br));
+}
+
+/*
+** add_neon_u16_from_u8_high_sve_bridged:
+** uaddl2 v0.8h, v0.16b, v1.16b
+** ret
+*/
+svuint16_t add_neon_u16_from_u8_high_sve_bridged(svuint8_t a, svuint8_t b) {
+ uint16x8_t ar = vmovl_u8(vget_high_u8(svget_neonq(a)));
+ uint16x8_t br = vmovl_u8(vget_high_u8(svget_neonq(b)));
+ return svset_neonq_u16(svundef_u16(), vaddq_u16(ar, br));
+}
+
+/*
+** sub_neon_u16_from_u8_low_sve_bridged:
+** usubl v0.8h, v0.8b, v1.8b
+** ret
+*/
+svuint16_t sub_neon_u16_from_u8_low_sve_bridged(svuint8_t a, svuint8_t b) {
+ uint16x8_t ar = vmovl_u8(vget_low_u8(svget_neonq(a)));
+ uint16x8_t br = vmovl_u8(vget_low_u8(svget_neonq(b)));
+ return svset_neonq_u16(svundef_u16(), vsubq_u16(ar, br));
+}
+
+/*
+** sub_neon_u16_from_u8_high_sve_bridged:
+** usubl2 v0.8h, v0.16b, v1.16b
+** ret
+*/
+svuint16_t sub_neon_u16_from_u8_high_sve_bridged(svuint8_t a, svuint8_t b) {
+ uint16x8_t ar = vmovl_u8(vget_high_u8(svget_neonq(a)));
+ uint16x8_t br = vmovl_u8(vget_high_u8(svget_neonq(b)));
+ return svset_neonq_u16(svundef_u16(), vsubq_u16(ar, br));
+}
+
+// ============================================================================
+// 16 -> 32 : SIGNED
+// ============================================================================
+
+/*
+** add_neon_i32_from_i16_low_sve_bridged:
+** saddl v0.4s, v0.4h, v1.4h
+** ret
+*/
+svint32_t add_neon_i32_from_i16_low_sve_bridged(svint16_t a, svint16_t b) {
+ int32x4_t ar = vmovl_s16(vget_low_s16(svget_neonq(a)));
+ int32x4_t br = vmovl_s16(vget_low_s16(svget_neonq(b)));
+ return svset_neonq_s32(svundef_s32(), vaddq_s32(ar, br));
+}
+
+/*
+** add_neon_i32_from_i16_high_sve_bridged:
+** saddl2 v0.4s, v0.8h, v1.8h
+** ret
+*/
+svint32_t add_neon_i32_from_i16_high_sve_bridged(svint16_t a, svint16_t b) {
+ int32x4_t ar = vmovl_s16(vget_high_s16(svget_neonq(a)));
+ int32x4_t br = vmovl_s16(vget_high_s16(svget_neonq(b)));
+ return svset_neonq_s32(svundef_s32(), vaddq_s32(ar, br));
+}
+
+/*
+** sub_neon_i32_from_i16_low_sve_bridged:
+** ssubl v0.4s, v0.4h, v1.4h
+** ret
+*/
+svint32_t sub_neon_i32_from_i16_low_sve_bridged(svint16_t a, svint16_t b) {
+ int32x4_t ar = vmovl_s16(vget_low_s16(svget_neonq(a)));
+ int32x4_t br = vmovl_s16(vget_low_s16(svget_neonq(b)));
+ return svset_neonq_s32(svundef_s32(), vsubq_s32(ar, br));
+}
+
+/*
+** sub_neon_i32_from_i16_high_sve_bridged:
+** ssubl2 v0.4s, v0.8h, v1.8h
+** ret
+*/
+svint32_t sub_neon_i32_from_i16_high_sve_bridged(svint16_t a, svint16_t b) {
+ int32x4_t ar = vmovl_s16(vget_high_s16(svget_neonq(a)));
+ int32x4_t br = vmovl_s16(vget_high_s16(svget_neonq(b)));
+ return svset_neonq_s32(svundef_s32(), vsubq_s32(ar, br));
+}
+
+// ============================================================================
+// 16 -> 32 : UNSIGNED
+// ============================================================================
+
+/*
+** add_neon_u32_from_u16_low_sve_bridged:
+** uaddl v0.4s, v0.4h, v1.4h
+** ret
+*/
+svuint32_t add_neon_u32_from_u16_low_sve_bridged(svuint16_t a, svuint16_t b) {
+ uint32x4_t ar = vmovl_u16(vget_low_u16(svget_neonq(a)));
+ uint32x4_t br = vmovl_u16(vget_low_u16(svget_neonq(b)));
+ return svset_neonq_u32(svundef_u32(), vaddq_u32(ar, br));
+}
+
+/*
+** add_neon_u32_from_u16_high_sve_bridged:
+** uaddl2 v0.4s, v0.8h, v1.8h
+** ret
+*/
+svuint32_t add_neon_u32_from_u16_high_sve_bridged(svuint16_t a, svuint16_t b) {
+ uint32x4_t ar = vmovl_u16(vget_high_u16(svget_neonq(a)));
+ uint32x4_t br = vmovl_u16(vget_high_u16(svget_neonq(b)));
+ return svset_neonq_u32(svundef_u32(), vaddq_u32(ar, br));
+}
+
+/*
+** sub_neon_u32_from_u16_low_sve_bridged:
+** usubl v0.4s, v0.4h, v1.4h
+** ret
+*/
+svuint32_t sub_neon_u32_from_u16_low_sve_bridged(svuint16_t a, svuint16_t b) {
+ uint32x4_t ar = vmovl_u16(vget_low_u16(svget_neonq(a)));
+ uint32x4_t br = vmovl_u16(vget_low_u16(svget_neonq(b)));
+ return svset_neonq_u32(svundef_u32(), vsubq_u32(ar, br));
+}
+
+/*
+** sub_neon_u32_from_u16_high_sve_bridged:
+** usubl2 v0.4s, v0.8h, v1.8h
+** ret
+*/
+svuint32_t sub_neon_u32_from_u16_high_sve_bridged(svuint16_t a, svuint16_t b) {
+ uint32x4_t ar = vmovl_u16(vget_high_u16(svget_neonq(a)));
+ uint32x4_t br = vmovl_u16(vget_high_u16(svget_neonq(b)));
+ return svset_neonq_u32(svundef_u32(), vsubq_u32(ar, br));
+}
+
+// ============================================================================
+// 32 -> 64 : SIGNED
+// ============================================================================
+
+/*
+** add_neon_i64_from_i32_low_sve_bridged:
+** saddl v0.2d, v0.2s, v1.2s
+** ret
+*/
+svint64_t add_neon_i64_from_i32_low_sve_bridged(svint32_t a, svint32_t b) {
+ int64x2_t ar = vmovl_s32(vget_low_s32(svget_neonq(a)));
+ int64x2_t br = vmovl_s32(vget_low_s32(svget_neonq(b)));
+ return svset_neonq_s64(svundef_s64(), vaddq_s64(ar, br));
+}
+
+/*
+** add_neon_i64_from_i32_high_sve_bridged:
+** saddl2 v0.2d, v0.4s, v1.4s
+** ret
+*/
+svint64_t add_neon_i64_from_i32_high_sve_bridged(svint32_t a, svint32_t b) {
+ int64x2_t ar = vmovl_s32(vget_high_s32(svget_neonq(a)));
+ int64x2_t br = vmovl_s32(vget_high_s32(svget_neonq(b)));
+ return svset_neonq_s64(svundef_s64(), vaddq_s64(ar, br));
+}
+
+/*
+** sub_neon_i64_from_i32_low_sve_bridged:
+** ssubl v0.2d, v0.2s, v1.2s
+** ret
+*/
+svint64_t sub_neon_i64_from_i32_low_sve_bridged(svint32_t a, svint32_t b) {
+ int64x2_t ar = vmovl_s32(vget_low_s32(svget_neonq(a)));
+ int64x2_t br = vmovl_s32(vget_low_s32(svget_neonq(b)));
+ return svset_neonq_s64(svundef_s64(), vsubq_s64(ar, br));
+}
+
+/*
+** sub_neon_i64_from_i32_high_sve_bridged:
+** ssubl2 v0.2d, v0.4s, v1.4s
+** ret
+*/
+svint64_t sub_neon_i64_from_i32_high_sve_bridged(svint32_t a, svint32_t b) {
+ int64x2_t ar = vmovl_s32(vget_high_s32(svget_neonq(a)));
+ int64x2_t br = vmovl_s32(vget_high_s32(svget_neonq(b)));
+ return svset_neonq_s64(svundef_s64(), vsubq_s64(ar, br));
+}
+
+// ============================================================================
+// 32 -> 64 : UNSIGNED
+// ============================================================================
+
+/*
+** add_neon_u64_from_u32_low_sve_bridged:
+** uaddl v0.2d, v0.2s, v1.2s
+** ret
+*/
+svuint64_t add_neon_u64_from_u32_low_sve_bridged(svuint32_t a, svuint32_t b) {
+ uint64x2_t ar = vmovl_u32(vget_low_u32(svget_neonq(a)));
+ uint64x2_t br = vmovl_u32(vget_low_u32(svget_neonq(b)));
+ return svset_neonq_u64(svundef_u64(), vaddq_u64(ar, br));
+}
+
+/*
+** add_neon_u64_from_u32_high_sve_bridged:
+** uaddl2 v0.2d, v0.4s, v1.4s
+** ret
+*/
+svuint64_t add_neon_u64_from_u32_high_sve_bridged(svuint32_t a, svuint32_t b) {
+ uint64x2_t ar = vmovl_u32(vget_high_u32(svget_neonq(a)));
+ uint64x2_t br = vmovl_u32(vget_high_u32(svget_neonq(b)));
+ return svset_neonq_u64(svundef_u64(), vaddq_u64(ar, br));
+}
+
+/*
+** sub_neon_u64_from_u32_low_sve_bridged:
+** usubl v0.2d, v0.2s, v1.2s
+** ret
+*/
+svuint64_t sub_neon_u64_from_u32_low_sve_bridged(svuint32_t a, svuint32_t b) {
+ uint64x2_t ar = vmovl_u32(vget_low_u32(svget_neonq(a)));
+ uint64x2_t br = vmovl_u32(vget_low_u32(svget_neonq(b)));
+ return svset_neonq_u64(svundef_u64(), vsubq_u64(ar, br));
+}
+
+/*
+** sub_neon_u64_from_u32_high_sve_bridged:
+** usubl2 v0.2d, v0.4s, v1.4s
+** ret
+*/
+svuint64_t sub_neon_u64_from_u32_high_sve_bridged(svuint32_t a, svuint32_t b) {
+ uint64x2_t ar = vmovl_u32(vget_high_u32(svget_neonq(a)));
+ uint64x2_t br = vmovl_u32(vget_high_u32(svget_neonq(b)));
+ return svset_neonq_u64(svundef_u64(), vsubq_u64(ar, br));
+}