For the following example using the Adv. SIMD/SVE bridge:
#include <arm_neon.h>
#include <arm_neon_sve_bridge.h>
#include <stdint.h>
svint16_t sub_neon_i16_sve_bridged(svint8_t a, svint8_t b) {
return svset_neonq_s16(svundef_s16(),
vsubq_s16(vmovl_high_s8(svget_neonq(a)),
vmovl_high_s8(svget_neonq(b))));
}
we generate:
sub_neon_i16_sve_bridged(__SVInt8_t, __SVInt8_t):
sxtl2 v0.8h, v0.16b
ssubw2 v0.8h, v0.8h, v1.16b
ret
instead of just:
sub_neon_i16_sve_bridged(__SVInt8_t, __SVInt8_t):
ssubl2 v0.8h, v0.16b, v1.16b
ret
Commit g:abf865732a7313cf79ffa325faed3467ed28d8b8 added a framework to fold
uses of intrinsics combined with lo/hi extractions into the appropriate lowpart
or highpart instructions.
However, this doesn't trigger because the Adv. SIMD-from-SVE extraction code for
vmovl_high_s8(svget_neonq(a))
does not have one argument as a constant, and the framework only supports
folding 2 insns into 1, not 3.
In RTL the above generates:
(insn 7 4 8 2 (set (reg:V8QI 103 [ _6 ])
(vec_select:V8QI (subreg:V16QI (reg/v:VNx16QI 109 [ a ]) 0)
(parallel:V16QI [
(const_int 8 [0x8])
(const_int 9 [0x9])
(const_int 10 [0xa])
(const_int 11 [0xb])
(const_int 12 [0xc])
(const_int 13 [0xd])
(const_int 14 [0xe])
(const_int 15 [0xf])
]))) "":3174:43 -1
(nil))
Since the SVE and the Adv. SIMD modes are tieable, this is a valid instruction
to make; however it's suboptimal in that we can't fold it into the existing
instruction patterns.  Eventually early-ra will split off the SVE reg from the
patterns, but by then we're past combine and insn foldings, so we miss all the
optimizations.
This patch introduces vec_extract optabs for 128-bit and 64-bit Adv. SIMD
vector extraction from SVE registers and emits an explicit, separate
instruction for the subregs.  This gives combine and RTL folding the
opportunity to form the combined instructions, and if they can't we still
arrive at the same RTL after early-ra.
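
As an illustration, a minimal sketch of a 64-bit high-half extraction through
the bridge, of the kind the new offset-1 path is meant to handle (the function
name is made up for the example; the new tests below cover the same shapes):

#include <arm_neon.h>
#include <arm_neon_sve_bridge.h>

/* Widen the high 64-bit half of the low 128 bits of an SVE vector.  With the
   explicit vec_extract expansion this should fold into a single sxtl2.  */
svint16_t widen_high_i8_sve_bridged(svint8_t a) {
  return svset_neonq_s16(svundef_s16(),
                         vmovl_s8(vget_high_s8(svget_neonq(a))));
}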
Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* config/aarch64/aarch64-sve.md (vec_extract<mode><v128>,
vec_extract<mode><v64>): New.
* config/aarch64/iterators.md (V64, v64): New.
* config/aarch64/predicates.md (const0_to_1_operand): New.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/fold_to_highpart_6.c: Update codegen.
* gcc.target/aarch64/sve/fold_to_highpart_1.c: New test.
* gcc.target/aarch64/sve/fold_to_highpart_2.c: New test.
---
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index f459f63d6bb248a53d1e5199646fbb6cf3c9759b..780fb76164f9f26389d5c7a9c6e4ca3e63a63ebc 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3112,6 +3112,48 @@ (define_expand "vec_extract<mode><Vel>"
}
)
+;; Don't allow expansions of SVE to Adv. SIMD registers immediately as subregs.
+;; Doing so prevents combine from matching instructions generated by the
+;; SVE/Adv. SIMD bridge as the SVE modes are not valid inside the instructions.
+;; Eventually early-ra or reload will split them but by then we've lost the
+;; combinations. Instead split them early and allow forwardprop or combine to
+;; push them into instructions where they are actually supported as part of the
+;; instruction.
+(define_expand "vec_extract<mode><v128>"
+ [(match_operand:<V128> 0 "register_operand")
+ (match_operand:SVE_FULL 1 "register_operand")
+ (match_operand:SI 2 "const0_operand")]
+ "TARGET_SVE"
+{
+ emit_move_insn (operands[0],
+ force_lowpart_subreg (<V128>mode, operands[1], <MODE>mode));
+ DONE;
+})
+
+;; Similarly for extractions of 64-bit Adv. SIMD vectors from SVE vectors. For
+;; these extractions we can support offsets 0 and 1 by first extracting a
+;; 128-bit vector and then selecting the appropriate half.
+(define_expand "vec_extract<mode><v64>"
+ [(match_operand:<V64> 0 "register_operand")
+ (match_operand:SVE_FULL_BHS 1 "register_operand")
+ (match_operand:SI 2 "const0_to_1_operand")]
+ "TARGET_SVE"
+{
+ if (const0_rtx == operands[2])
+ emit_move_insn (operands[0],
+ force_lowpart_subreg (<V64>mode, operands[1],
+ <MODE>mode));
+ else
+ {
+ rtx tmp = gen_reg_rtx (<V128>mode);
+ emit_move_insn (tmp,
+ force_lowpart_subreg (<V128>mode, operands[1],
+ <MODE>mode));
+ emit_insn (gen_vec_extract<v128><v64> (operands[0], tmp, operands[2]));
+ }
+ DONE;
+})
+
;; Extract element zero. This is a special case because we want to force
;; the registers to be the same for the second alternative, and then
;; split the instruction into nothing after RA.
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 517b2808b5f725db81709122848817aaafff1f34..9248f038bf42a8eb4297914347cde33fd0f1d5c9 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1816,6 +1816,18 @@ (define_mode_attr v128 [(VNx16QI "v16qi")
(VNx4SI "v4si") (VNx4SF "v4sf")
(VNx2DI "v2di") (VNx2DF "v2df")])
+;; Gives the mode of the 64-bit lowpart of an SVE vector.
+(define_mode_attr V64 [(VNx16QI "V8QI")
+ (VNx8HI "V4HI") (VNx8HF "V4HF") (VNx8BF "V4BF")
+ (VNx4SI "V2SI") (VNx4SF "V2SF")
+ (VNx2DI "DI") (VNx2DF "DF")])
+
+;; ...and again in lower case.
+(define_mode_attr v64 [(VNx16QI "v8qi")
+ (VNx8HI "v4hi") (VNx8HF "v4hf") (VNx8BF "v4bf")
+ (VNx4SI "v2si") (VNx4SF "v2sf")
+ (VNx2DI "di") (VNx2DF "df")])
+
(define_mode_attr vnx [(V4SI "vnx4si") (V2DI "vnx2di")])
;; 64-bit container modes the inner or scalar source mode.
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 42304cef4391e15598bcd22da590c8663f3ffaa5..7a147294150357ed3484609cac2b5acdf2638e52 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -46,6 +46,10 @@ (define_predicate "const0_operand"
(and (match_code "const_int")
(match_test "op == CONST0_RTX (mode)")))
+(define_predicate "const0_to_1_operand"
+ (and (match_code "const_int")
+ (match_test "IN_RANGE (INTVAL (op), 0, 1)")))
+
(define_predicate "const_0_to_7_operand"
(and (match_code "const_int")
(match_test "IN_RANGE (INTVAL (op), 0, 7)")))
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
index 3570d4da34b5ebb4ae507d57b506f8cd87b76ba8..83ef2148fd84f806c243226f2bb4f4b6b1fdba7a 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
@@ -1,6 +1,7 @@
/* { dg-do compile } */
/* { dg-require-effective-target aarch64_little_endian } */
/* { dg-options "-O2 -march=armv8-a+sve" } */
+/* { dg-final { check-function-bodies "**" "" } } */
#include <arm_neon_sve_bridge.h>
@@ -16,6 +17,11 @@ test_addressable ()
return vmovl_s8 (vget_high_s8 (z));
}
+/*
+** test_scalable_type:
+** sxtl2 v0.8h, v0.16b
+** ret
+*/
int16x8_t
test_scalable_type (svint8_t scalable)
{
@@ -34,4 +40,5 @@ test_256b_type (int16x16_t foo)
return vmovl_s16 ((int16x4_t) { foo[4], foo[5], foo[6], foo[7] });
}
-/* { dg-final { scan-assembler-not {sxtl2\t} } } */
+/* { dg-final { scan-assembler-times {sxtl2\t} 1 } } */
+/* { dg-final { scan-assembler-times {sxtl\t} 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..a3d59a498bf6152e16f43271da1b400a79f84959
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+#include <arm_neon_sve_bridge.h>
+#include <stdint.h>
+
+/*
+** sub_neon_i16_sve_bridged:
+** ssubl2 v0.8h, v0.16b, v1.16b
+** ret
+*/
+svint16_t sub_neon_i16_sve_bridged(svint8_t a, svint8_t b) {
+ return svset_neonq_s16(svundef_s16(),
+ vsubq_s16(vmovl_high_s8(svget_neonq(a)),
+ vmovl_high_s8(svget_neonq(b))));
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..6cca4adb86513970b53d6b102b53d10b78401d63
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c
@@ -0,0 +1,295 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+
+// ============================================================================
+// 8 -> 16 : SIGNED
+// ============================================================================
+
+/*
+** add_neon_i16_from_i8_low_sve_bridged:
+** saddl v0.8h, v0.8b, v1.8b
+** ret
+*/
+svint16_t add_neon_i16_from_i8_low_sve_bridged(svint8_t a, svint8_t b) {
+ int16x8_t ar = vmovl_s8(vget_low_s8(svget_neonq(a)));
+ int16x8_t br = vmovl_s8(vget_low_s8(svget_neonq(b)));
+ return svset_neonq_s16(svundef_s16(), vaddq_s16(ar, br));
+}
+
+/*
+** add_neon_i16_from_i8_high_sve_bridged:
+** saddl2 v0.8h, v0.16b, v1.16b
+** ret
+*/
+svint16_t add_neon_i16_from_i8_high_sve_bridged(svint8_t a, svint8_t b) {
+ int16x8_t ar = vmovl_s8(vget_high_s8(svget_neonq(a)));
+ int16x8_t br = vmovl_s8(vget_high_s8(svget_neonq(b)));
+ return svset_neonq_s16(svundef_s16(), vaddq_s16(ar, br));
+}
+
+/*
+** sub_neon_i16_from_i8_low_sve_bridged:
+** ssubl v0.8h, v0.8b, v1.8b
+** ret
+*/
+svint16_t sub_neon_i16_from_i8_low_sve_bridged(svint8_t a, svint8_t b) {
+ int16x8_t ar = vmovl_s8(vget_low_s8(svget_neonq(a)));
+ int16x8_t br = vmovl_s8(vget_low_s8(svget_neonq(b)));
+ return svset_neonq_s16(svundef_s16(), vsubq_s16(ar, br));
+}
+
+/*
+** sub_neon_i16_from_i8_high_sve_bridged:
+** ssubl2 v0.8h, v0.16b, v1.16b
+** ret
+*/
+svint16_t sub_neon_i16_from_i8_high_sve_bridged(svint8_t a, svint8_t b) {
+ int16x8_t ar = vmovl_s8(vget_high_s8(svget_neonq(a)));
+ int16x8_t br = vmovl_s8(vget_high_s8(svget_neonq(b)));
+ return svset_neonq_s16(svundef_s16(), vsubq_s16(ar, br));
+}
+
+// ============================================================================
+// 8 -> 16 : UNSIGNED
+// ============================================================================
+
+/*
+** add_neon_u16_from_u8_low_sve_bridged:
+** uaddl v0.8h, v0.8b, v1.8b
+** ret
+*/
+svuint16_t add_neon_u16_from_u8_low_sve_bridged(svuint8_t a, svuint8_t b) {
+ uint16x8_t ar = vmovl_u8(vget_low_u8(svget_neonq(a)));
+ uint16x8_t br = vmovl_u8(vget_low_u8(svget_neonq(b)));
+ return svset_neonq_u16(svundef_u16(), vaddq_u16(ar, br));
+}
+
+/*
+** add_neon_u16_from_u8_high_sve_bridged:
+** uaddl2 v0.8h, v0.16b, v1.16b
+** ret
+*/
+svuint16_t add_neon_u16_from_u8_high_sve_bridged(svuint8_t a, svuint8_t b) {
+ uint16x8_t ar = vmovl_u8(vget_high_u8(svget_neonq(a)));
+ uint16x8_t br = vmovl_u8(vget_high_u8(svget_neonq(b)));
+ return svset_neonq_u16(svundef_u16(), vaddq_u16(ar, br));
+}
+
+/*
+** sub_neon_u16_from_u8_low_sve_bridged:
+** usubl v0.8h, v0.8b, v1.8b
+** ret
+*/
+svuint16_t sub_neon_u16_from_u8_low_sve_bridged(svuint8_t a, svuint8_t b) {
+ uint16x8_t ar = vmovl_u8(vget_low_u8(svget_neonq(a)));
+ uint16x8_t br = vmovl_u8(vget_low_u8(svget_neonq(b)));
+ return svset_neonq_u16(svundef_u16(), vsubq_u16(ar, br));
+}
+
+/*
+** sub_neon_u16_from_u8_high_sve_bridged:
+** usubl2 v0.8h, v0.16b, v1.16b
+** ret
+*/
+svuint16_t sub_neon_u16_from_u8_high_sve_bridged(svuint8_t a, svuint8_t b) {
+ uint16x8_t ar = vmovl_u8(vget_high_u8(svget_neonq(a)));
+ uint16x8_t br = vmovl_u8(vget_high_u8(svget_neonq(b)));
+ return svset_neonq_u16(svundef_u16(), vsubq_u16(ar, br));
+}
+
+// ============================================================================
+// 16 -> 32 : SIGNED
+// ============================================================================
+
+/*
+** add_neon_i32_from_i16_low_sve_bridged:
+** saddl v0.4s, v0.4h, v1.4h
+** ret
+*/
+svint32_t add_neon_i32_from_i16_low_sve_bridged(svint16_t a, svint16_t b) {
+ int32x4_t ar = vmovl_s16(vget_low_s16(svget_neonq(a)));
+ int32x4_t br = vmovl_s16(vget_low_s16(svget_neonq(b)));
+ return svset_neonq_s32(svundef_s32(), vaddq_s32(ar, br));
+}
+
+/*
+** add_neon_i32_from_i16_high_sve_bridged:
+** saddl2 v0.4s, v0.8h, v1.8h
+** ret
+*/
+svint32_t add_neon_i32_from_i16_high_sve_bridged(svint16_t a, svint16_t b) {
+ int32x4_t ar = vmovl_s16(vget_high_s16(svget_neonq(a)));
+ int32x4_t br = vmovl_s16(vget_high_s16(svget_neonq(b)));
+ return svset_neonq_s32(svundef_s32(), vaddq_s32(ar, br));
+}
+
+/*
+** sub_neon_i32_from_i16_low_sve_bridged:
+** ssubl v0.4s, v0.4h, v1.4h
+** ret
+*/
+svint32_t sub_neon_i32_from_i16_low_sve_bridged(svint16_t a, svint16_t b) {
+ int32x4_t ar = vmovl_s16(vget_low_s16(svget_neonq(a)));
+ int32x4_t br = vmovl_s16(vget_low_s16(svget_neonq(b)));
+ return svset_neonq_s32(svundef_s32(), vsubq_s32(ar, br));
+}
+
+/*
+** sub_neon_i32_from_i16_high_sve_bridged:
+** ssubl2 v0.4s, v0.8h, v1.8h
+** ret
+*/
+svint32_t sub_neon_i32_from_i16_high_sve_bridged(svint16_t a, svint16_t b) {
+ int32x4_t ar = vmovl_s16(vget_high_s16(svget_neonq(a)));
+ int32x4_t br = vmovl_s16(vget_high_s16(svget_neonq(b)));
+ return svset_neonq_s32(svundef_s32(), vsubq_s32(ar, br));
+}
+
+// ============================================================================
+// 16 -> 32 : UNSIGNED
+// ============================================================================
+
+/*
+** add_neon_u32_from_u16_low_sve_bridged:
+** uaddl v0.4s, v0.4h, v1.4h
+** ret
+*/
+svuint32_t add_neon_u32_from_u16_low_sve_bridged(svuint16_t a, svuint16_t b) {
+ uint32x4_t ar = vmovl_u16(vget_low_u16(svget_neonq(a)));
+ uint32x4_t br = vmovl_u16(vget_low_u16(svget_neonq(b)));
+ return svset_neonq_u32(svundef_u32(), vaddq_u32(ar, br));
+}
+
+/*
+** add_neon_u32_from_u16_high_sve_bridged:
+** uaddl2 v0.4s, v0.8h, v1.8h
+** ret
+*/
+svuint32_t add_neon_u32_from_u16_high_sve_bridged(svuint16_t a, svuint16_t b) {
+ uint32x4_t ar = vmovl_u16(vget_high_u16(svget_neonq(a)));
+ uint32x4_t br = vmovl_u16(vget_high_u16(svget_neonq(b)));
+ return svset_neonq_u32(svundef_u32(), vaddq_u32(ar, br));
+}
+
+/*
+** sub_neon_u32_from_u16_low_sve_bridged:
+** usubl v0.4s, v0.4h, v1.4h
+** ret
+*/
+svuint32_t sub_neon_u32_from_u16_low_sve_bridged(svuint16_t a, svuint16_t b) {
+ uint32x4_t ar = vmovl_u16(vget_low_u16(svget_neonq(a)));
+ uint32x4_t br = vmovl_u16(vget_low_u16(svget_neonq(b)));
+ return svset_neonq_u32(svundef_u32(), vsubq_u32(ar, br));
+}
+
+/*
+** sub_neon_u32_from_u16_high_sve_bridged:
+** usubl2 v0.4s, v0.8h, v1.8h
+** ret
+*/
+svuint32_t sub_neon_u32_from_u16_high_sve_bridged(svuint16_t a, svuint16_t b) {
+ uint32x4_t ar = vmovl_u16(vget_high_u16(svget_neonq(a)));
+ uint32x4_t br = vmovl_u16(vget_high_u16(svget_neonq(b)));
+ return svset_neonq_u32(svundef_u32(), vsubq_u32(ar, br));
+}
+
+// ============================================================================
+// 32 -> 64 : SIGNED
+// ============================================================================
+
+/*
+** add_neon_i64_from_i32_low_sve_bridged:
+** saddl v0.2d, v0.2s, v1.2s
+** ret
+*/
+svint64_t add_neon_i64_from_i32_low_sve_bridged(svint32_t a, svint32_t b) {
+ int64x2_t ar = vmovl_s32(vget_low_s32(svget_neonq(a)));
+ int64x2_t br = vmovl_s32(vget_low_s32(svget_neonq(b)));
+ return svset_neonq_s64(svundef_s64(), vaddq_s64(ar, br));
+}
+
+/*
+** add_neon_i64_from_i32_high_sve_bridged:
+** saddl2 v0.2d, v0.4s, v1.4s
+** ret
+*/
+svint64_t add_neon_i64_from_i32_high_sve_bridged(svint32_t a, svint32_t b) {
+ int64x2_t ar = vmovl_s32(vget_high_s32(svget_neonq(a)));
+ int64x2_t br = vmovl_s32(vget_high_s32(svget_neonq(b)));
+ return svset_neonq_s64(svundef_s64(), vaddq_s64(ar, br));
+}
+
+/*
+** sub_neon_i64_from_i32_low_sve_bridged:
+** ssubl v0.2d, v0.2s, v1.2s
+** ret
+*/
+svint64_t sub_neon_i64_from_i32_low_sve_bridged(svint32_t a, svint32_t b) {
+ int64x2_t ar = vmovl_s32(vget_low_s32(svget_neonq(a)));
+ int64x2_t br = vmovl_s32(vget_low_s32(svget_neonq(b)));
+ return svset_neonq_s64(svundef_s64(), vsubq_s64(ar, br));
+}
+
+/*
+** sub_neon_i64_from_i32_high_sve_bridged:
+** ssubl2 v0.2d, v0.4s, v1.4s
+** ret
+*/
+svint64_t sub_neon_i64_from_i32_high_sve_bridged(svint32_t a, svint32_t b) {
+ int64x2_t ar = vmovl_s32(vget_high_s32(svget_neonq(a)));
+ int64x2_t br = vmovl_s32(vget_high_s32(svget_neonq(b)));
+ return svset_neonq_s64(svundef_s64(), vsubq_s64(ar, br));
+}
+
+// ============================================================================
+// 32 -> 64 : UNSIGNED
+// ============================================================================
+
+/*
+** add_neon_u64_from_u32_low_sve_bridged:
+** uaddl v0.2d, v0.2s, v1.2s
+** ret
+*/
+svuint64_t add_neon_u64_from_u32_low_sve_bridged(svuint32_t a, svuint32_t b) {
+ uint64x2_t ar = vmovl_u32(vget_low_u32(svget_neonq(a)));
+ uint64x2_t br = vmovl_u32(vget_low_u32(svget_neonq(b)));
+ return svset_neonq_u64(svundef_u64(), vaddq_u64(ar, br));
+}
+
+/*
+** add_neon_u64_from_u32_high_sve_bridged:
+** uaddl2 v0.2d, v0.4s, v1.4s
+** ret
+*/
+svuint64_t add_neon_u64_from_u32_high_sve_bridged(svuint32_t a, svuint32_t b) {
+ uint64x2_t ar = vmovl_u32(vget_high_u32(svget_neonq(a)));
+ uint64x2_t br = vmovl_u32(vget_high_u32(svget_neonq(b)));
+ return svset_neonq_u64(svundef_u64(), vaddq_u64(ar, br));
+}
+
+/*
+** sub_neon_u64_from_u32_low_sve_bridged:
+** usubl v0.2d, v0.2s, v1.2s
+** ret
+*/
+svuint64_t sub_neon_u64_from_u32_low_sve_bridged(svuint32_t a, svuint32_t b) {
+ uint64x2_t ar = vmovl_u32(vget_low_u32(svget_neonq(a)));
+ uint64x2_t br = vmovl_u32(vget_low_u32(svget_neonq(b)));
+ return svset_neonq_u64(svundef_u64(), vsubq_u64(ar, br));
+}
+
+/*
+** sub_neon_u64_from_u32_high_sve_bridged:
+** usubl2 v0.2d, v0.4s, v1.4s
+** ret
+*/
+svuint64_t sub_neon_u64_from_u32_high_sve_bridged(svuint32_t a, svuint32_t b) {
+ uint64x2_t ar = vmovl_u32(vget_high_u32(svget_neonq(a)));
+ uint64x2_t br = vmovl_u32(vget_high_u32(svget_neonq(b)));
+ return svset_neonq_u64(svundef_u64(), vsubq_u64(ar, br));
+}