https://github.com/keith-packard created https://github.com/llvm/llvm-project/pull/111235
These two libraries don't build for `-march=armv8-a+nofp -mabi=aapcs-soft` as a couple of uses of floating point instructions and registers have crept in. >From 6201bc34f1213e9f8c477421757509c9c4e678ce Mon Sep 17 00:00:00 2001 From: Keith Packard <kei...@keithp.com> Date: Fri, 4 Oct 2024 21:06:37 -0700 Subject: [PATCH 1/2] [libunwind] Support aarch64 without FPU ldp and stp instructions both require an FPU. Use pairs of ldr or str instructions when the target doesn't have one. Signed-off-by: Keith Packard <kei...@keithp.com> --- libunwind/src/UnwindRegistersRestore.S | 36 +++++++++++++++---------- libunwind/src/UnwindRegistersSave.S | 37 +++++++++++++++----------- 2 files changed, 44 insertions(+), 29 deletions(-) diff --git a/libunwind/src/UnwindRegistersRestore.S b/libunwind/src/UnwindRegistersRestore.S index 180a66582f41b5..13a40d080c4c2d 100644 --- a/libunwind/src/UnwindRegistersRestore.S +++ b/libunwind/src/UnwindRegistersRestore.S @@ -633,6 +633,12 @@ Lnovec: .arch_extension gcs #endif +#if defined(__ARM_FP) && __ARM_FP != 0 +#define LDP(a,b,r,o,p) ldp a, b, [r, o] +#else +#define LDP(a,b,r,o,p) ldr a, [r, o] ; ldr b, [r, p] +#endif + // // extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *); // @@ -642,23 +648,24 @@ Lnovec: .p2align 2 DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto) // skip restore of x0,x1 for now - ldp x2, x3, [x0, #0x010] - ldp x4, x5, [x0, #0x020] - ldp x6, x7, [x0, #0x030] - ldp x8, x9, [x0, #0x040] - ldp x10,x11, [x0, #0x050] - ldp x12,x13, [x0, #0x060] - ldp x14,x15, [x0, #0x070] + LDP(x2, x3, x0, #0x010, #0x018) + LDP(x4, x5, x0, #0x020, #0x028) + LDP(x6, x7, x0, #0x030, #0x038) + LDP(x8, x9, x0, #0x040, #0x048) + LDP(x10, x11, x0, #0x050, #0x058) + LDP(x12, x13, x0, #0x060, #0x068) + LDP(x14, x15, x0, #0x070, #0x078) // x16 and x17 were clobbered by the call into the unwinder, so no point in // restoring them. 
- ldp x18,x19, [x0, #0x090] - ldp x20,x21, [x0, #0x0A0] - ldp x22,x23, [x0, #0x0B0] - ldp x24,x25, [x0, #0x0C0] - ldp x26,x27, [x0, #0x0D0] - ldp x28,x29, [x0, #0x0E0] + LDP(x18, x19, x0, #0x090, #0x098) + LDP(x20, x21, x0, #0x0A0, #0x0A8) + LDP(x22, x23, x0, #0x0B0, #0x0B8) + LDP(x24, x25, x0, #0x0C0, #0x0C8) + LDP(x26, x27, x0, #0x0D0, #0x0D8) + LDP(x28, x29, x0, #0x0E0, #0x0E8) ldr x30, [x0, #0x100] // restore pc into lr +#if defined(__ARM_FP) && __ARM_FP != 0 ldp d0, d1, [x0, #0x110] ldp d2, d3, [x0, #0x120] ldp d4, d5, [x0, #0x130] @@ -676,13 +683,14 @@ DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto) ldp d28,d29, [x0, #0x1F0] ldr d30, [x0, #0x200] ldr d31, [x0, #0x208] +#endif // Finally, restore sp. This must be done after the last read from the // context struct, because it is allocated on the stack, and an exception // could clobber the de-allocated portion of the stack after sp has been // restored. ldr x16, [x0, #0x0F8] - ldp x0, x1, [x0, #0x000] // restore x0,x1 + LDP(x0, x1, x0, #0x000, #0x008) // restore x0,x1 mov sp,x16 // restore sp #if defined(__ARM_FEATURE_GCS_DEFAULT) // If GCS is enabled we need to push the address we're returning to onto the diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S index fab234fcd6f318..922469bc11aa22 100644 --- a/libunwind/src/UnwindRegistersSave.S +++ b/libunwind/src/UnwindRegistersSave.S @@ -718,6 +718,12 @@ LnoR2Fix: #elif defined(__aarch64__) +#if defined(__ARM_FP) && __ARM_FP != 0 +#define STP(a,b,r,o,p) stp a, b, [r, o] +#else +#define STP(a,b,r,o,p) str a, [r, o] ; str b, [r, p] +#endif + // // extern int __unw_getcontext(unw_context_t* thread_state) // @@ -726,21 +732,21 @@ LnoR2Fix: // .p2align 2 DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) - stp x0, x1, [x0, #0x000] - stp x2, x3, [x0, #0x010] - stp x4, x5, [x0, #0x020] - stp x6, x7, [x0, #0x030] - stp x8, x9, [x0, #0x040] - stp x10,x11, [x0, #0x050] - stp x12,x13, [x0, #0x060] - stp x14,x15, [x0, #0x070] - stp 
x16,x17, [x0, #0x080] - stp x18,x19, [x0, #0x090] - stp x20,x21, [x0, #0x0A0] - stp x22,x23, [x0, #0x0B0] - stp x24,x25, [x0, #0x0C0] - stp x26,x27, [x0, #0x0D0] - stp x28,x29, [x0, #0x0E0] + STP(x0, x1, x0, #0x000, #0x008) + STP(x2, x3, x0, #0x010, #0x018) + STP(x4, x5, x0, #0x020, #0x028) + STP(x6, x7, x0, #0x030, #0x038) + STP(x8, x9, x0, #0x040, #0x048) + STP(x10, x11, x0, #0x050, #0x058) + STP(x12, x13, x0, #0x060, #0x068) + STP(x14, x15, x0, #0x070, #0x078) + STP(x16, x17, x0, #0x080, #0x088) + STP(x18, x19, x0, #0x090, #0x098) + STP(x20, x21, x0, #0x0A0, #0x0A8) + STP(x22, x23, x0, #0x0B0, #0x0B8) + STP(x24, x25, x0, #0x0C0, #0x0C8) + STP(x26, x27, x0, #0x0D0, #0x0D8) + STP(x28, x29, x0, #0x0E0, #0x0E8) str x30, [x0, #0x0F0] mov x1,sp str x1, [x0, #0x0F8] @@ -763,6 +769,7 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext) stp d28,d29, [x0, #0x1F0] str d30, [x0, #0x200] str d31, [x0, #0x208] +#endif mov x0, #0 // return UNW_ESUCCESS ret >From 8ba3fedd182b23829abe36aab30ef5ddd55d2fc0 Mon Sep 17 00:00:00 2001 From: Keith Packard <kei...@keithp.com> Date: Fri, 4 Oct 2024 21:08:17 -0700 Subject: [PATCH 2/2] [compiler-rt] Support aarch64 targets without FPU Fall back to the old C implementations of various routines when the target doesn't have an FPU. 
Signed-off-by: Keith Packard <kei...@keithp.com> --- .../builtins/aarch64/sme-libc-mem-routines.S | 2 +- .../lib/builtins/aarch64/sme-libc-routines.c | 77 +++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S index 0318d9a6f1ebd2..72d87fb4fa8586 100644 --- a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S +++ b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S @@ -6,7 +6,7 @@ #include "../assembly.h" -#ifdef __aarch64__ +#if defined(__aarch64__) && __ARM_FP != 0 #define L(l) .L ## l diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c index 315490e73ea2b1..92fb953c03a376 100644 --- a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c +++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c @@ -1,5 +1,82 @@ #include <stddef.h> +#if __ARM_FP == 0 +// WARNING: When building the scalar versions of these functions you need to +// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang +// from recognising a loop idiom and planting calls to memcpy! + +static void *__arm_sc_memcpy_fwd(void *dest, const void *src, + size_t n) __arm_streaming_compatible { + unsigned char *destp = (unsigned char *)dest; + const unsigned char *srcp = (const unsigned char *)src; + for (size_t i = 0; i < n; ++i) + destp[i] = srcp[i]; + + return dest; +} + +// If dest and src overlap then behaviour is undefined, hence we can add the +// restrict keywords here. This also matches the definition of the libc memcpy +// according to the man page. 
+void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src, + size_t n) __arm_streaming_compatible { + return __arm_sc_memcpy_fwd(dest, src, n); +} + +void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible { + unsigned char *destp = (unsigned char *)dest; + unsigned char c8 = (unsigned char)c; + for (size_t i = 0; i < n; ++i) + destp[i] = c8; + + return dest; +} + +static void *__arm_sc_memcpy_rev(void *dest, const void *src, + size_t n) __arm_streaming_compatible { + unsigned char *destp = (unsigned char *)dest; + const unsigned char *srcp = (const unsigned char *)src; + // TODO: Improve performance by copying larger chunks in reverse, or by + // using SVE. + while (n > 0) { + --n; + destp[n] = srcp[n]; + } + return dest; +} + +// Semantically a memmove is equivalent to the following: +// 1. Copy the entire contents of src to a temporary array that does not +// overlap with src or dest. +// 2. Copy the contents of the temporary array into dest. +void *__arm_sc_memmove(void *dest, const void *src, + size_t n) __arm_streaming_compatible { + unsigned char *destp = (unsigned char *)dest; + const unsigned char *srcp = (const unsigned char *)src; + + // If src and dest don't overlap then just invoke memcpy + if ((srcp > (destp + n)) || (destp > (srcp + n))) + return __arm_sc_memcpy_fwd(dest, src, n); + + // Overlap case 1: + // src: Low | -> | High + // dest: Low | -> | High + // Here src is always ahead of dest at a higher address. If we first read a + // chunk of data from src we can safely write the same chunk to dest without + // corrupting future reads of src. + if (srcp > destp) + return __arm_sc_memcpy_fwd(dest, src, n); + + // Overlap case 2: + // src: Low | -> | High + // dest: Low | -> | High + // While we're in the overlap region we're always corrupting future reads of + // src when writing to dest. An efficient way to do this is to copy the data + // in reverse by starting at the highest address. 
+ return __arm_sc_memcpy_rev(dest, src, n); +} +#endif + const void *__arm_sc_memchr(const void *src, int c, size_t n) __arm_streaming_compatible { const unsigned char *srcp = (const unsigned char *)src; _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits