Hi Tamar,
> On 28 Oct 2025, at 01:36, Tamar Christina <[email protected]> wrote:
>
> For this example using the Adv. SIMD/SVE bridge:
>
> #include <arm_neon.h>
> #include <arm_neon_sve_bridge.h>
> #include <stdint.h>
>
> svint16_t sub_neon_i16_sve_bridged(svint8_t a, svint8_t b) {
>   return svset_neonq_s16(svundef_s16(),
>                          vsubq_s16(vmovl_high_s8(svget_neonq(a)),
>                                    vmovl_high_s8(svget_neonq(b))));
> }
>
> we generate:
>
> sub_neon_i16_sve_bridged(__SVInt8_t, __SVInt8_t):
> sxtl2 v0.8h, v0.16b
> ssubw2 v0.8h, v0.8h, v1.16b
> ret
>
> instead of just
>
> sub_neon_i16_sve_bridged(__SVInt8_t, __SVInt8_t):
> ssubl2 v0.8h, v0.16b, v1.16b
> ret
>
> Commit g:abf865732a7313cf79ffa325faed3467ed28d8b8 added a framework to fold
> uses of intrinsics combined with lo/hi extractions into the appropriate
> lowpart or highpart instructions.
>
> However this doesn't trigger because the Adv. SIMD-from-SVE extraction code for
>
> vmovl_high_s8(svget_neonq(a))
>
> does not have one argument as a constant, and the framework only supports
> folding 2 insns into 1, not 3.
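For contrast, the purely Adv. SIMD version of the same computation (no bridge
involved; the function name here is made up for illustration) is the shape that
should already be combined into the single ssubl2 shown above:

#include <arm_neon.h>

/* No SVE bridge in the way: both highpart widenings feed the subtract
   directly, so the compiler can already combine this into one ssubl2.  */
int16x8_t sub_neon_i16(int8x16_t a, int8x16_t b) {
  return vsubq_s16(vmovl_high_s8(a), vmovl_high_s8(b));
}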
>
> In RTL, the bridged example above generates
>
> (insn 7 4 8 2 (set (reg:V8QI 103 [ _6 ])
>         (vec_select:V8QI (subreg:V16QI (reg/v:VNx16QI 109 [ a ]) 0)
>             (parallel:V16QI [
>                     (const_int 8 [0x8])
>                     (const_int 9 [0x9])
>                     (const_int 10 [0xa])
>                     (const_int 11 [0xb])
>                     (const_int 12 [0xc])
>                     (const_int 13 [0xd])
>                     (const_int 14 [0xe])
>                     (const_int 15 [0xf])
>                 ]))) "":3174:43 -1
>      (nil))
>
> Since the SVE and the Adv. SIMD modes are tieable this is a valid instruction
> to make, but it's suboptimal in that we can't fold it into the existing
> instruction patterns.  Eventually early-ra will split off the SVE reg from
> the patterns, but by then we're past combine and insn folding, so we miss
> all of those optimizations.
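In other words, the aim is for expand to emit the subreg as a separate move so
that the vec_select operates on a plain Adv. SIMD register; roughly the
following shape (sketched by hand rather than taken from a dump, register
numbers invented):

(set (reg:V16QI 110) (subreg:V16QI (reg/v:VNx16QI 109 [ a ]) 0))
(set (reg:V8QI 103 [ _6 ])
     (vec_select:V8QI (reg:V16QI 110)
         (parallel [(const_int 8 [0x8]) ... (const_int 15 [0xf])])))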
>
> This patch introduces vec_extract optabs for 128-bit and 64-bit Adv. SIMD
> vector extraction from SVE registers and emits an explicit separate
> instruction for the subregs.  This then gives combine and RTL folding the
> opportunity to form the combined instructions, and if not we still arrive at
> the same RTL after early-ra.
>
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
>
> Ok for master?
I haven't thought the logic through 100%.  It seems sensible, but does it work
for big-endian, or does it need gating to LE-only?
If it all works then ok with a few nits...
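If gating does turn out to be needed, I'd expect it to be as simple as
restricting the expander conditions to little-endian, e.g. along these lines
(just a sketch of the idea, untested):

;; Sketch only: the same expander as in the patch below, but with the
;; enabling condition restricted to little-endian targets.
(define_expand "vec_extract<mode><v128>"
  [(match_operand:<V128> 0 "register_operand")
   (match_operand:SVE_FULL 1 "register_operand")
   (match_operand:SI 2 "const0_operand")]
  "TARGET_SVE && !BYTES_BIG_ENDIAN"
{
  emit_move_insn (operands[0],
                  force_lowpart_subreg (<V128>mode, operands[1], <MODE>mode));
  DONE;
})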
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64-sve.md (vec_extract<mode><v128>,
> vec_extract<mode><v64>): New.
> * config/aarch64/iterators.md (V64, v64): New.
> * config/aarch64/predicates.md (const0_to_1_operand): New.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/simd/fold_to_highpart_6.c: Update codegen.
> * gcc.target/aarch64/sve/fold_to_highpart_1.c: New test.
> * gcc.target/aarch64/sve/fold_to_highpart_2.c: New test.
>
> ---
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index f459f63d6bb248a53d1e5199646fbb6cf3c9759b..780fb76164f9f26389d5c7a9c6e4ca3e63a63ebc 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -3112,6 +3112,48 @@ (define_expand "vec_extract<mode><Vel>"
> }
> )
>
> +;; Don't allow expansions of SVE to Adv. SIMD registers immediately as subregs.
> +;; Doing so prevents combine from matching instructions generated by the
> +;; SVE/Adv. SIMD bridge as the SVE modes are not valid inside the instructions.
> +;; Eventually early-ra or reload will split them but by then we've lost the
> +;; combinations. Instead split them early and allow forwardprop or combine to
...Wording nit, but GCC elsewhere calls it “fwprop” if we are going to
abbreviate it.
> +;; push them into instructions where they are actually supported as part of the
> +;; instruction.
> +(define_expand "vec_extract<mode><v128>"
> + [(match_operand:<V128> 0 "register_operand")
> + (match_operand:SVE_FULL 1 "register_operand")
> + (match_operand:SI 2 "const0_operand")]
> + "TARGET_SVE"
> +{
> + emit_move_insn (operands[0],
> + force_lowpart_subreg (<V128>mode, operands[1], <MODE>mode));
> + DONE;
> +})
> +
> +;; Similarly for extractions of 64-bit Adv. SIMD vectors from SVE vectors.  For
> +;; these extractions we can support offsets 0 and 1 by first extracting a
> +;; 128-bit vector and then selecting the appropriate half.
> +(define_expand "vec_extract<mode><v64>"
> + [(match_operand:<V64> 0 "register_operand")
> + (match_operand:SVE_FULL_BHS 1 "register_operand")
> + (match_operand:SI 2 "const0_to_1_operand")]
> + "TARGET_SVE"
> +{
> + if (const0_rtx == operands[2])
… The preferred way of checking for zero in these places is
operands[2] == CONST0_RTX (SImode).
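That is, roughly (only the rewritten check is new here, the rest is as in the
patch):

  if (operands[2] == CONST0_RTX (SImode))
    emit_move_insn (operands[0],
                    force_lowpart_subreg (<V64>mode, operands[1],
                                          <MODE>mode));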
Ok with those changes.
Thanks,
Kyrill
> + emit_move_insn (operands[0],
> + force_lowpart_subreg (<V64>mode, operands[1],
> + <MODE>mode));
> + else
> + {
> + rtx tmp = gen_reg_rtx (<V128>mode);
> + emit_move_insn (tmp,
> + force_lowpart_subreg (<V128>mode, operands[1],
> + <MODE>mode));
> + emit_insn (gen_vec_extract<v128><v64> (operands[0], tmp, operands[2]));
> + }
> + DONE;
> +})
> +
> ;; Extract element zero. This is a special case because we want to force
> ;; the registers to be the same for the second alternative, and then
> ;; split the instruction into nothing after RA.
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 517b2808b5f725db81709122848817aaafff1f34..9248f038bf42a8eb4297914347cde33fd0f1d5c9 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -1816,6 +1816,18 @@ (define_mode_attr v128 [(VNx16QI "v16qi")
> (VNx4SI "v4si") (VNx4SF "v4sf")
> (VNx2DI "v2di") (VNx2DF "v2df")])
>
> +;; Gives the mode of the 64-bit lowpart of an SVE vector.
> +(define_mode_attr V64 [(VNx16QI "V8QI")
> + (VNx8HI "V4HI") (VNx8HF "V4HF") (VNx8BF "V4BF")
> + (VNx4SI "V2SI") (VNx4SF "V2SF")
> + (VNx2DI "DI") (VNx2DF "DF")])
> +
> +;; ...and again in lower case.
> +(define_mode_attr v64 [(VNx16QI "v8qi")
> + (VNx8HI "v4hi") (VNx8HF "v4hf") (VNx8BF "v4bf")
> + (VNx4SI "v2si") (VNx4SF "v2sf")
> + (VNx2DI "di") (VNx2DF "df")])
> +
> (define_mode_attr vnx [(V4SI "vnx4si") (V2DI "vnx2di")])
>
> ;; 64-bit container modes the inner or scalar source mode.
> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> index 42304cef4391e15598bcd22da590c8663f3ffaa5..7a147294150357ed3484609cac2b5acdf2638e52 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -46,6 +46,10 @@ (define_predicate "const0_operand"
> (and (match_code "const_int")
> (match_test "op == CONST0_RTX (mode)")))
>
> +(define_predicate "const0_to_1_operand"
> + (and (match_code "const_int")
> + (match_test "IN_RANGE (INTVAL (op), 0, 1)")))
> +
> (define_predicate "const_0_to_7_operand"
> (and (match_code "const_int")
> (match_test "IN_RANGE (INTVAL (op), 0, 7)")))
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
> index 3570d4da34b5ebb4ae507d57b506f8cd87b76ba8..83ef2148fd84f806c243226f2bb4f4b6b1fdba7a 100644
> --- a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
> @@ -1,6 +1,7 @@
> /* { dg-do compile } */
> /* { dg-require-effective-target aarch64_little_endian } */
> /* { dg-options "-O2 -march=armv8-a+sve" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
>
> #include <arm_neon_sve_bridge.h>
>
> @@ -16,6 +17,11 @@ test_addressable ()
> return vmovl_s8 (vget_high_s8 (z));
> }
>
> +/*
> +** test_scalable_type:
> +** sxtl2 v0.8h, v0.16b
> +** ret
> +*/
> int16x8_t
> test_scalable_type (svint8_t scalable)
> {
> @@ -34,4 +40,5 @@ test_256b_type (int16x16_t foo)
> return vmovl_s16 ((int16x4_t) { foo[4], foo[5], foo[6], foo[7] });
> }
>
> -/* { dg-final { scan-assembler-not {sxtl2\t} } } */
> +/* { dg-final { scan-assembler-times {sxtl2\t} 1 } } */
> +/* { dg-final { scan-assembler-times {sxtl\t} 3 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..a3d59a498bf6152e16f43271da1b400a79f84959
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O1" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <arm_neon.h>
> +#include <arm_neon_sve_bridge.h>
> +#include <stdint.h>
> +
> +/*
> +** sub_neon_i16_sve_bridged:
> +** ssubl2 v0.8h, v0.16b, v1.16b
> +** ret
> +*/
> +svint16_t sub_neon_i16_sve_bridged(svint8_t a, svint8_t b) {
> + return svset_neonq_s16(svundef_s16(),
> + vsubq_s16(vmovl_high_s8(svget_neonq(a)),
> + vmovl_high_s8(svget_neonq(b))));
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..6cca4adb86513970b53d6b102b53d10b78401d63
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c
> @@ -0,0 +1,295 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O1" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <arm_neon.h>
> +#include <arm_sve.h>
> +#include <arm_neon_sve_bridge.h>
> +
> +// ============================================================================
> +// 8 -> 16 : SIGNED
> +// ============================================================================
> +
> +/*
> +** add_neon_i16_from_i8_low_sve_bridged:
> +** saddl v0.8h, v0.8b, v1.8b
> +** ret
> +*/
> +svint16_t add_neon_i16_from_i8_low_sve_bridged(svint8_t a, svint8_t b) {
> + int16x8_t ar = vmovl_s8(vget_low_s8(svget_neonq(a)));
> + int16x8_t br = vmovl_s8(vget_low_s8(svget_neonq(b)));
> + return svset_neonq_s16(svundef_s16(), vaddq_s16(ar, br));
> +}
> +
> +/*
> +** add_neon_i16_from_i8_high_sve_bridged:
> +** saddl2 v0.8h, v0.16b, v1.16b
> +** ret
> +*/
> +svint16_t add_neon_i16_from_i8_high_sve_bridged(svint8_t a, svint8_t b) {
> + int16x8_t ar = vmovl_s8(vget_high_s8(svget_neonq(a)));
> + int16x8_t br = vmovl_s8(vget_high_s8(svget_neonq(b)));
> + return svset_neonq_s16(svundef_s16(), vaddq_s16(ar, br));
> +}
> +
> +/*
> +** sub_neon_i16_from_i8_low_sve_bridged:
> +** ssubl v0.8h, v0.8b, v1.8b
> +** ret
> +*/
> +svint16_t sub_neon_i16_from_i8_low_sve_bridged(svint8_t a, svint8_t b) {
> + int16x8_t ar = vmovl_s8(vget_low_s8(svget_neonq(a)));
> + int16x8_t br = vmovl_s8(vget_low_s8(svget_neonq(b)));
> + return svset_neonq_s16(svundef_s16(), vsubq_s16(ar, br));
> +}
> +
> +/*
> +** sub_neon_i16_from_i8_high_sve_bridged:
> +** ssubl2 v0.8h, v0.16b, v1.16b
> +** ret
> +*/
> +svint16_t sub_neon_i16_from_i8_high_sve_bridged(svint8_t a, svint8_t b) {
> + int16x8_t ar = vmovl_s8(vget_high_s8(svget_neonq(a)));
> + int16x8_t br = vmovl_s8(vget_high_s8(svget_neonq(b)));
> + return svset_neonq_s16(svundef_s16(), vsubq_s16(ar, br));
> +}
> +
> +// ============================================================================
> +// 8 -> 16 : UNSIGNED
> +// ============================================================================
> +
> +/*
> +** add_neon_u16_from_u8_low_sve_bridged:
> +** uaddl v0.8h, v0.8b, v1.8b
> +** ret
> +*/
> +svuint16_t add_neon_u16_from_u8_low_sve_bridged(svuint8_t a, svuint8_t b) {
> + uint16x8_t ar = vmovl_u8(vget_low_u8(svget_neonq(a)));
> + uint16x8_t br = vmovl_u8(vget_low_u8(svget_neonq(b)));
> + return svset_neonq_u16(svundef_u16(), vaddq_u16(ar, br));
> +}
> +
> +/*
> +** add_neon_u16_from_u8_high_sve_bridged:
> +** uaddl2 v0.8h, v0.16b, v1.16b
> +** ret
> +*/
> +svuint16_t add_neon_u16_from_u8_high_sve_bridged(svuint8_t a, svuint8_t b) {
> + uint16x8_t ar = vmovl_u8(vget_high_u8(svget_neonq(a)));
> + uint16x8_t br = vmovl_u8(vget_high_u8(svget_neonq(b)));
> + return svset_neonq_u16(svundef_u16(), vaddq_u16(ar, br));
> +}
> +
> +/*
> +** sub_neon_u16_from_u8_low_sve_bridged:
> +** usubl v0.8h, v0.8b, v1.8b
> +** ret
> +*/
> +svuint16_t sub_neon_u16_from_u8_low_sve_bridged(svuint8_t a, svuint8_t b) {
> + uint16x8_t ar = vmovl_u8(vget_low_u8(svget_neonq(a)));
> + uint16x8_t br = vmovl_u8(vget_low_u8(svget_neonq(b)));
> + return svset_neonq_u16(svundef_u16(), vsubq_u16(ar, br));
> +}
> +
> +/*
> +** sub_neon_u16_from_u8_high_sve_bridged:
> +** usubl2 v0.8h, v0.16b, v1.16b
> +** ret
> +*/
> +svuint16_t sub_neon_u16_from_u8_high_sve_bridged(svuint8_t a, svuint8_t b) {
> + uint16x8_t ar = vmovl_u8(vget_high_u8(svget_neonq(a)));
> + uint16x8_t br = vmovl_u8(vget_high_u8(svget_neonq(b)));
> + return svset_neonq_u16(svundef_u16(), vsubq_u16(ar, br));
> +}
> +
> +// ============================================================================
> +// 16 -> 32 : SIGNED
> +// ============================================================================
> +
> +/*
> +** add_neon_i32_from_i16_low_sve_bridged:
> +** saddl v0.4s, v0.4h, v1.4h
> +** ret
> +*/
> +svint32_t add_neon_i32_from_i16_low_sve_bridged(svint16_t a, svint16_t b) {
> + int32x4_t ar = vmovl_s16(vget_low_s16(svget_neonq(a)));
> + int32x4_t br = vmovl_s16(vget_low_s16(svget_neonq(b)));
> + return svset_neonq_s32(svundef_s32(), vaddq_s32(ar, br));
> +}
> +
> +/*
> +** add_neon_i32_from_i16_high_sve_bridged:
> +** saddl2 v0.4s, v0.8h, v1.8h
> +** ret
> +*/
> +svint32_t add_neon_i32_from_i16_high_sve_bridged(svint16_t a, svint16_t b) {
> + int32x4_t ar = vmovl_s16(vget_high_s16(svget_neonq(a)));
> + int32x4_t br = vmovl_s16(vget_high_s16(svget_neonq(b)));
> + return svset_neonq_s32(svundef_s32(), vaddq_s32(ar, br));
> +}
> +
> +/*
> +** sub_neon_i32_from_i16_low_sve_bridged:
> +** ssubl v0.4s, v0.4h, v1.4h
> +** ret
> +*/
> +svint32_t sub_neon_i32_from_i16_low_sve_bridged(svint16_t a, svint16_t b) {
> + int32x4_t ar = vmovl_s16(vget_low_s16(svget_neonq(a)));
> + int32x4_t br = vmovl_s16(vget_low_s16(svget_neonq(b)));
> + return svset_neonq_s32(svundef_s32(), vsubq_s32(ar, br));
> +}
> +
> +/*
> +** sub_neon_i32_from_i16_high_sve_bridged:
> +** ssubl2 v0.4s, v0.8h, v1.8h
> +** ret
> +*/
> +svint32_t sub_neon_i32_from_i16_high_sve_bridged(svint16_t a, svint16_t b) {
> + int32x4_t ar = vmovl_s16(vget_high_s16(svget_neonq(a)));
> + int32x4_t br = vmovl_s16(vget_high_s16(svget_neonq(b)));
> + return svset_neonq_s32(svundef_s32(), vsubq_s32(ar, br));
> +}
> +
> +// ============================================================================
> +// 16 -> 32 : UNSIGNED
> +// ============================================================================
> +
> +/*
> +** add_neon_u32_from_u16_low_sve_bridged:
> +** uaddl v0.4s, v0.4h, v1.4h
> +** ret
> +*/
> +svuint32_t add_neon_u32_from_u16_low_sve_bridged(svuint16_t a, svuint16_t b) {
> + uint32x4_t ar = vmovl_u16(vget_low_u16(svget_neonq(a)));
> + uint32x4_t br = vmovl_u16(vget_low_u16(svget_neonq(b)));
> + return svset_neonq_u32(svundef_u32(), vaddq_u32(ar, br));
> +}
> +
> +/*
> +** add_neon_u32_from_u16_high_sve_bridged:
> +** uaddl2 v0.4s, v0.8h, v1.8h
> +** ret
> +*/
> +svuint32_t add_neon_u32_from_u16_high_sve_bridged(svuint16_t a, svuint16_t b) {
> + uint32x4_t ar = vmovl_u16(vget_high_u16(svget_neonq(a)));
> + uint32x4_t br = vmovl_u16(vget_high_u16(svget_neonq(b)));
> + return svset_neonq_u32(svundef_u32(), vaddq_u32(ar, br));
> +}
> +
> +/*
> +** sub_neon_u32_from_u16_low_sve_bridged:
> +** usubl v0.4s, v0.4h, v1.4h
> +** ret
> +*/
> +svuint32_t sub_neon_u32_from_u16_low_sve_bridged(svuint16_t a, svuint16_t b) {
> + uint32x4_t ar = vmovl_u16(vget_low_u16(svget_neonq(a)));
> + uint32x4_t br = vmovl_u16(vget_low_u16(svget_neonq(b)));
> + return svset_neonq_u32(svundef_u32(), vsubq_u32(ar, br));
> +}
> +
> +/*
> +** sub_neon_u32_from_u16_high_sve_bridged:
> +** usubl2 v0.4s, v0.8h, v1.8h
> +** ret
> +*/
> +svuint32_t sub_neon_u32_from_u16_high_sve_bridged(svuint16_t a, svuint16_t b) {
> + uint32x4_t ar = vmovl_u16(vget_high_u16(svget_neonq(a)));
> + uint32x4_t br = vmovl_u16(vget_high_u16(svget_neonq(b)));
> + return svset_neonq_u32(svundef_u32(), vsubq_u32(ar, br));
> +}
> +
> +// ============================================================================
> +// 32 -> 64 : SIGNED
> +// ============================================================================
> +
> +/*
> +** add_neon_i64_from_i32_low_sve_bridged:
> +** saddl v0.2d, v0.2s, v1.2s
> +** ret
> +*/
> +svint64_t add_neon_i64_from_i32_low_sve_bridged(svint32_t a, svint32_t b) {
> + int64x2_t ar = vmovl_s32(vget_low_s32(svget_neonq(a)));
> + int64x2_t br = vmovl_s32(vget_low_s32(svget_neonq(b)));
> + return svset_neonq_s64(svundef_s64(), vaddq_s64(ar, br));
> +}
> +
> +/*
> +** add_neon_i64_from_i32_high_sve_bridged:
> +** saddl2 v0.2d, v0.4s, v1.4s
> +** ret
> +*/
> +svint64_t add_neon_i64_from_i32_high_sve_bridged(svint32_t a, svint32_t b) {
> + int64x2_t ar = vmovl_s32(vget_high_s32(svget_neonq(a)));
> + int64x2_t br = vmovl_s32(vget_high_s32(svget_neonq(b)));
> + return svset_neonq_s64(svundef_s64(), vaddq_s64(ar, br));
> +}
> +
> +/*
> +** sub_neon_i64_from_i32_low_sve_bridged:
> +** ssubl v0.2d, v0.2s, v1.2s
> +** ret
> +*/
> +svint64_t sub_neon_i64_from_i32_low_sve_bridged(svint32_t a, svint32_t b) {
> + int64x2_t ar = vmovl_s32(vget_low_s32(svget_neonq(a)));
> + int64x2_t br = vmovl_s32(vget_low_s32(svget_neonq(b)));
> + return svset_neonq_s64(svundef_s64(), vsubq_s64(ar, br));
> +}
> +
> +/*
> +** sub_neon_i64_from_i32_high_sve_bridged:
> +** ssubl2 v0.2d, v0.4s, v1.4s
> +** ret
> +*/
> +svint64_t sub_neon_i64_from_i32_high_sve_bridged(svint32_t a, svint32_t b) {
> + int64x2_t ar = vmovl_s32(vget_high_s32(svget_neonq(a)));
> + int64x2_t br = vmovl_s32(vget_high_s32(svget_neonq(b)));
> + return svset_neonq_s64(svundef_s64(), vsubq_s64(ar, br));
> +}
> +
> +// ============================================================================
> +// 32 -> 64 : UNSIGNED
> +// ============================================================================
> +
> +/*
> +** add_neon_u64_from_u32_low_sve_bridged:
> +** uaddl v0.2d, v0.2s, v1.2s
> +** ret
> +*/
> +svuint64_t add_neon_u64_from_u32_low_sve_bridged(svuint32_t a, svuint32_t b) {
> + uint64x2_t ar = vmovl_u32(vget_low_u32(svget_neonq(a)));
> + uint64x2_t br = vmovl_u32(vget_low_u32(svget_neonq(b)));
> + return svset_neonq_u64(svundef_u64(), vaddq_u64(ar, br));
> +}
> +
> +/*
> +** add_neon_u64_from_u32_high_sve_bridged:
> +** uaddl2 v0.2d, v0.4s, v1.4s
> +** ret
> +*/
> +svuint64_t add_neon_u64_from_u32_high_sve_bridged(svuint32_t a, svuint32_t b) {
> + uint64x2_t ar = vmovl_u32(vget_high_u32(svget_neonq(a)));
> + uint64x2_t br = vmovl_u32(vget_high_u32(svget_neonq(b)));
> + return svset_neonq_u64(svundef_u64(), vaddq_u64(ar, br));
> +}
> +
> +/*
> +** sub_neon_u64_from_u32_low_sve_bridged:
> +** usubl v0.2d, v0.2s, v1.2s
> +** ret
> +*/
> +svuint64_t sub_neon_u64_from_u32_low_sve_bridged(svuint32_t a, svuint32_t b) {
> + uint64x2_t ar = vmovl_u32(vget_low_u32(svget_neonq(a)));
> + uint64x2_t br = vmovl_u32(vget_low_u32(svget_neonq(b)));
> + return svset_neonq_u64(svundef_u64(), vsubq_u64(ar, br));
> +}
> +
> +/*
> +** sub_neon_u64_from_u32_high_sve_bridged:
> +** usubl2 v0.2d, v0.4s, v1.4s
> +** ret
> +*/
> +svuint64_t sub_neon_u64_from_u32_high_sve_bridged(svuint32_t a, svuint32_t b) {
> + uint64x2_t ar = vmovl_u32(vget_high_u32(svget_neonq(a)));
> + uint64x2_t br = vmovl_u32(vget_high_u32(svget_neonq(b)));
> + return svset_neonq_u64(svundef_u64(), vsubq_u64(ar, br));
> +}
>
>
> --
> <rb19969.patch>