Re: [PATCH v13 04/40] prctl: arch-agnostic prctl for shadow stack
On Tue, Oct 01, 2024 at 11:58:43PM +0100, Mark Brown wrote: > Three architectures (x86, aarch64, riscv) have announced support for shadow stacks with fairly similar functionality. While x86 is using arch_prctl() to control the functionality neither arm64 nor riscv uses that interface so this patch adds arch-agnostic prctl() support to get and set status of shadow stacks and lock the current configuration to prevent further changes. [...] Reviewed-by: Deepak Gupta
[PATCH v13 40/40] KVM: selftests: arm64: Add GCS registers to get-reg-list
GCS adds new registers GCSCR_EL1, GCSCRE0_EL1, GCSPR_EL1 and GCSPR_EL0. Add these to those validated by get-reg-list. Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown --- tools/testing/selftests/kvm/aarch64/get-reg-list.c | 28 ++ 1 file changed, 28 insertions(+) diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c index d43fb3f49050..c17451069a15 100644 --- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c +++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c @@ -29,6 +29,24 @@ static struct feature_id_reg feat_id_regs[] = { 0, 1 }, + { + ARM64_SYS_REG(3, 0, 2, 5, 0), /* GCSCR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 1), /* ID_AA64PFR1_EL1 */ + 44, + 1 + }, + { + ARM64_SYS_REG(3, 0, 2, 5, 1), /* GCSPR_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 1), /* ID_AA64PFR1_EL1 */ + 44, + 1 + }, + { + ARM64_SYS_REG(3, 0, 2, 5, 2), /* GCSCRE0_EL1 */ + ARM64_SYS_REG(3, 0, 0, 4, 1), /* ID_AA64PFR1_EL1 */ + 44, + 1 + }, { ARM64_SYS_REG(3, 0, 10, 2, 2), /* PIRE0_EL1 */ ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ @@ -52,6 +70,12 @@ static struct feature_id_reg feat_id_regs[] = { ARM64_SYS_REG(3, 0, 0, 7, 3), /* ID_AA64MMFR3_EL1 */ 16, 1 + }, + { + ARM64_SYS_REG(3, 3, 2, 5, 1), /* GCSPR_EL0 */ + ARM64_SYS_REG(3, 0, 0, 4, 1), /* ID_AA64PFR1_EL1 */ + 44, + 1 } }; @@ -472,6 +496,9 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 0, 2, 0, 1), /* TTBR1_EL1 */ ARM64_SYS_REG(3, 0, 2, 0, 2), /* TCR_EL1 */ ARM64_SYS_REG(3, 0, 2, 0, 3), /* TCR2_EL1 */ + ARM64_SYS_REG(3, 0, 2, 5, 0), /* GCSCR_EL1 */ + ARM64_SYS_REG(3, 0, 2, 5, 1), /* GCSPR_EL1 */ + ARM64_SYS_REG(3, 0, 2, 5, 2), /* GCSCRE0_EL1 */ ARM64_SYS_REG(3, 0, 5, 1, 0), /* AFSR0_EL1 */ ARM64_SYS_REG(3, 0, 5, 1, 1), /* AFSR1_EL1 */ ARM64_SYS_REG(3, 0, 5, 2, 0), /* ESR_EL1 */ @@ -488,6 +515,7 @@ static __u64 base_regs[] = { ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */ ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */ ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */ + ARM64_SYS_REG(3, 3, 2, 5, 1), /* GCSPR_EL0 */ ARM64_SYS_REG(3, 3, 10, 2, 4), /* POR_EL0 */ ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */ ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */ -- 2.39.2
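For reference, each new feat_id_regs[] entry above pairs the GCS register with ID_AA64PFR1_EL1 as its gating ID register: the 44 is the bit position of the GCS field (bits [47:44]) and the 1 is the minimum field value required for the register to be listed, mirroring the existing PIR/POR entries gated on ID_AA64MMFR3_EL1.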
[PATCH v13 09/40] arm64/gcs: Add manual encodings of GCS instructions
Define C callable functions for GCS instructions used by the kernel. In order to avoid ambitious toolchain requirements for GCS support these are manually encoded, this means we have fixed register numbers which will be a bit limiting for the compiler but none of these should be used in sufficiently fast paths for this to be a problem. Note that GCSSTTR is used to store to EL0. Reviewed-by: Thiago Jung Bauermann Acked-by: Catalin Marinas Signed-off-by: Mark Brown --- arch/arm64/include/asm/gcs.h | 51 arch/arm64/include/asm/uaccess.h | 22 + 2 files changed, 73 insertions(+) diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h new file mode 100644 index ..7c5e95218db6 --- /dev/null +++ b/arch/arm64/include/asm/gcs.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 ARM Ltd. + */ +#ifndef __ASM_GCS_H +#define __ASM_GCS_H + +#include +#include + +static inline void gcsb_dsync(void) +{ + asm volatile(".inst 0xd503227f" : : : "memory"); +} + +static inline void gcsstr(u64 *addr, u64 val) +{ + register u64 *_addr __asm__ ("x0") = addr; + register long _val __asm__ ("x1") = val; + + /* GCSSTTR x1, x0 */ + asm volatile( + ".inst 0xd91f1c01\n" + : + : "rZ" (_val), "r" (_addr) + : "memory"); +} + +static inline void gcsss1(u64 Xt) +{ + asm volatile ( + "sys #3, C7, C7, #2, %0\n" + : + : "rZ" (Xt) + : "memory"); +} + +static inline u64 gcsss2(void) +{ + u64 Xt; + + asm volatile( + "SYSL %0, #3, C7, C7, #3\n" + : "=r" (Xt) + : + : "memory"); + + return Xt; +} + +#endif diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 1aa4ecb73429..0db494b24dd0 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -502,4 +502,26 @@ static inline size_t probe_subpage_writeable(const char __user *uaddr, #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ +#ifdef CONFIG_ARM64_GCS + +static inline int gcssttr(unsigned long __user *addr, unsigned long val) +{ + register unsigned long __user *_addr __asm__ ("x0") = addr; + register unsigned long _val __asm__ ("x1") = val; + int err = 0; + + /* GCSSTTR x1, x0 */ + asm volatile( + "1: .inst 0xd91f1c01\n" + "2: \n" + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0) + : "+r" (err) + : "rZ" (_val), "r" (_addr) + : "memory"); + + return err; +} + +#endif /* CONFIG_ARM64_GCS */ + #endif /* __ASM_UACCESS_H */ -- 2.39.2
[PATCH v13 00/40] arm64/gcs: Provide support for GCS in userspace
The arm64 Guarded Control Stack (GCS) feature provides support for hardware protected stacks of return addresses, intended to provide hardening against return oriented programming (ROP) attacks and to make it easier to gather call stacks for applications such as profiling. When GCS is active a secondary stack called the Guarded Control Stack is maintained, protected with a memory attribute which means that it can only be written with specific GCS operations. The current GCS pointer cannot be directly written to by userspace. When a BL is executed the value stored in LR is also pushed onto the GCS, and when a RET is executed the top of the GCS is popped and compared to LR with a fault being raised if the values do not match. GCS operations may only be performed on GCS pages, a data abort is generated if they are not. The combination of hardware enforcement and lack of extra instructions in the function entry and exit paths should result in something which has less overhead and is more difficult to attack than a purely software implementation like clang's shadow stacks. This series implements support for use of GCS by userspace, along with support for use of GCS within KVM guests. It does not enable use of GCS by either EL1 or EL2, this will be implemented separately. Executables are started without GCS and must use a prctl() to enable it, it is expected that this will be done very early in application execution by the dynamic linker or other startup code. For dynamic linking this will be done by checking that everything in the executable is marked as GCS compatible. x86 has an equivalent feature called shadow stacks, this series depends on the x86 patches for generic memory management support for the new guarded/shadow stack page type and shares APIs as much as possible. As there has been extensive discussion with the wider community around the ABI for shadow stacks I have as far as practical kept implementation decisions close to those for x86, anticipating that review would lead to similar conclusions in the absence of strong reasoning for divergence. The main divergence I am conscious of is that x86 allows shadow stack to be enabled and disabled repeatedly, freeing the shadow stack for the thread whenever disabled, while this implementation keeps the GCS allocated after disable but refuses to reenable it. This is to avoid races with things actively walking the GCS during a disable, we do anticipate that some systems will wish to disable GCS at runtime but are not aware of any demand for subsequently reenabling it. x86 uses an arch_prctl() to manage enable and disable, since only x86 and S/390 use arch_prctl() a generic prctl() was proposed[1] as part of a patch set for the equivalent RISC-V Zicfiss feature which I initially adopted fairly directly but following review feedback has been revised quite a bit. We currently maintain the x86 pattern of implicitly allocating a shadow stack for threads started with shadow stack enabled, there has been some discussion of removing this support and requiring the use of clone3() with explicit allocation of shadow stacks instead. I have no strong feelings either way, implicit allocation is not really consistent with anything else we do and creates the potential for errors around thread exit but on the other hand it is existing ABI on x86 and minimises the changes needed in userspace code. glibc and bionic changes using this ABI have been implemented and tested. 
Headless Android systems have been validated and Ross Burton has used this code to bring up a Yocto system with GCS enabled as standard; a test implementation of V8 support has also been done. uprobes are not currently supported, missing emulation was identified late in review. There is an open issue with support for CRIU, on x86 this required the ability to set the GCS mode via ptrace. This series supports configuring mode bits other than enable/disable via ptrace but it needs to be confirmed if this is sufficient. It is likely that we could relax some of the barriers added here with some more targeted placements, this is left for further study. There is an in-process series adding clone3() support for shadow stacks: https://lore.kernel.org/r/20240819-clone3-shadow-stack-v9-0-962d74f99...@kernel.org Previous versions of this series depended on that, this dependency has been removed in order to make merging easier. [1] https://lore.kernel.org/lkml/20240403234054.2020347-1-de...@rivosinc.com/ Signed-off-by: Mark Brown --- Changes in v13: - Rebase onto v6.12-rc1. - Allocate VM_HIGH_ARCH_6 since protection keys used all the existing bits. - Implement mm_release() and free transparently allocated GCSs there. - Use bit 32 of AT_HWCAP for GCS due to AT_HWCAP2 being filled. - Since we now only set GCSCRE0_EL1 on change ensure that it is initialised with GCSPR_EL0 accessible to EL0. - Fix OOM handling on thread copy. - Link to v12: https://lore.kernel.org/r/20240829-ar
[PATCH v13 03/40] arm64/mm: Restructure arch_validate_flags() for extensibility
Currently arch_validate_flags() is written in a very non-extensible fashion, returning immediately if MTE is not supported and writing the MTE check as a direct return. Since we will want to add more checks for GCS refactor the existing code to be more extensible, no functional change intended. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown --- arch/arm64/include/asm/mman.h | 14 ++ 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index 9e39217b4afb..03b790fd0ad8 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -62,11 +62,17 @@ static inline bool arch_validate_prot(unsigned long prot, static inline bool arch_validate_flags(unsigned long vm_flags) { - if (!system_supports_mte()) - return true; + if (system_supports_mte()) { + /* +* only allow VM_MTE if VM_MTE_ALLOWED has been set +* previously +*/ + if ((vm_flags & VM_MTE) && !(vm_flags & VM_MTE_ALLOWED)) + return false; + } + + return true; - /* only allow VM_MTE if VM_MTE_ALLOWED has been set previously */ - return !(vm_flags & VM_MTE) || (vm_flags & VM_MTE_ALLOWED); } #define arch_validate_flags(vm_flags) arch_validate_flags(vm_flags) -- 2.39.2
[PATCH v13 01/40] mm: Introduce ARCH_HAS_USER_SHADOW_STACK
Since multiple architectures have support for shadow stacks and we need to select support for this feature in several places in the generic code provide a generic config option that the architectures can select. Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Reviewed-by: Deepak Gupta Reviewed-by: Rick Edgecombe Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Catalin Marinas Reviewed-by: Kees Cook Tested-by: Kees Cook Acked-by: Shuah Khan Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown --- arch/x86/Kconfig | 1 + fs/proc/task_mmu.c | 2 +- mm/Kconfig | 6 ++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2852fcd82cbd..8ccae77d40f7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1954,6 +1954,7 @@ config X86_USER_SHADOW_STACK depends on AS_WRUSS depends on X86_64 select ARCH_USES_HIGH_VMA_FLAGS + select ARCH_HAS_USER_SHADOW_STACK select X86_CET help Shadow stack protection is a hardware feature that detects function diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 72f14fd59c2d..23f875e78eae 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -971,7 +971,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR [ilog2(VM_UFFD_MINOR)] = "ui", #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ -#ifdef CONFIG_X86_USER_SHADOW_STACK +#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) diff --git a/mm/Kconfig b/mm/Kconfig index 4c9f5ea13271..4b2a1ef9a161 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1296,6 +1296,12 @@ config NUMA_EMU into virtual nodes when booted with "numa=fake=N", where N is the number of nodes. This is only useful for debugging. +config ARCH_HAS_USER_SHADOW_STACK + bool + help + The architecture has hardware support for userspace shadow call + stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss). + source "mm/damon/Kconfig" endmenu -- 2.39.2
[PATCH v13 07/40] arm64/gcs: Document the ABI for Guarded Control Stacks
Add some documentation of the userspace ABI for Guarded Control Stacks. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Yury Khrustalev Signed-off-by: Mark Brown --- Documentation/arch/arm64/gcs.rst | 230 + Documentation/arch/arm64/index.rst | 1 + 2 files changed, 231 insertions(+) diff --git a/Documentation/arch/arm64/gcs.rst b/Documentation/arch/arm64/gcs.rst new file mode 100644 index ..af58d9151cb7 --- /dev/null +++ b/Documentation/arch/arm64/gcs.rst @@ -0,0 +1,230 @@ +=== +Guarded Control Stack support for AArch64 Linux +=== + +This document outlines briefly the interface provided to userspace by Linux in +order to support use of the ARM Guarded Control Stack (GCS) feature. + +This is an outline of the most important features and issues only and not +intended to be exhaustive. + + + +1. General +--- + +* GCS is an architecture feature intended to provide greater protection + against return oriented programming (ROP) attacks and to simplify the + implementation of features that need to collect stack traces such as + profiling. + +* When GCS is enabled a separate guarded control stack is maintained by the + PE which is writeable only through specific GCS operations. This + stores the call stack only, when a procedure call instruction is + performed the current PC is pushed onto the GCS and on RET the + address in the LR is verified against that on the top of the GCS. + +* When active the current GCS pointer is stored in the system register + GCSPR_EL0. This is readable by userspace but can only be updated + via specific GCS instructions. + +* The architecture provides instructions for switching between guarded + control stacks with checks to ensure that the new stack is a valid + target for switching. + +* The functionality of GCS is similar to that provided by the x86 Shadow + Stack feature, due to sharing of userspace interfaces the ABI refers to + shadow stacks rather than GCS. + +* Support for GCS is reported to userspace via HWCAP_GCS in the aux vector + AT_HWCAP entry. + +* GCS is enabled per thread. While there is support for disabling GCS + at runtime this should be done with great care. + +* GCS memory access faults are reported as normal memory access faults. + +* GCS specific errors (those reported with EC 0x2d) will be reported as + SIGSEGV with a si_code of SEGV_CPERR (control protection error). + +* GCS is supported only for AArch64. + +* On systems where GCS is supported GCSPR_EL0 is always readable by EL0 + regardless of the GCS configuration for the thread. + +* The architecture supports enabling GCS without verifying that return values + in LR match those in the GCS, the LR will be ignored. This is not supported + by Linux. + + + +2. Enabling and disabling Guarded Control Stacks +- + +* GCS is enabled and disabled for a thread via the PR_SET_SHADOW_STACK_STATUS + prctl(), this takes a single flags argument specifying which GCS features + should be used. + +* When set, the PR_SHADOW_STACK_ENABLE flag allocates a Guarded Control Stack + and enables GCS for the thread, enabling the functionality controlled by + GCSCRE0_EL1.{nTR, RVCHKEN, PCRSEL}. + +* When set, the PR_SHADOW_STACK_PUSH flag enables the functionality controlled + by GCSCRE0_EL1.PUSHMEn, allowing explicit GCS pushes. + +* When set, the PR_SHADOW_STACK_WRITE flag enables the functionality controlled + by GCSCRE0_EL1.STREn, allowing explicit stores to the Guarded Control Stack. + +* Any unknown flags will cause PR_SET_SHADOW_STACK_STATUS to return -EINVAL. 
+ +* PR_LOCK_SHADOW_STACK_STATUS is passed a bitmask of features with the same + values as used for PR_SET_SHADOW_STACK_STATUS. Any future changes to the + status of the specified GCS mode bits will be rejected. + +* PR_LOCK_SHADOW_STACK_STATUS allows any bit to be locked, this allows + userspace to prevent changes to any future features. + +* There is no support for a process to remove a lock that has been set for + it. + +* PR_SET_SHADOW_STACK_STATUS and PR_LOCK_SHADOW_STACK_STATUS affect only the + thread that called them, any other running threads will be unaffected. + +* New threads inherit the GCS configuration of the thread that created them. + +* GCS is disabled on exec(). + +* The current GCS configuration for a thread may be read with the + PR_GET_SHADOW_STACK_STATUS prctl(), this returns the same flags that + are passed to PR_SET_SHADOW_STACK_STATUS. + +* If GCS is disabled for a thread after having previously been enabled then + the stack will remain allocated for the lifetime of the thread. At present + any attempt to reenable GCS for the thread will be rejected, this may be + revisited in future. + +* It should be noted that since enabling GCS will result in GCS becoming + active immediately it is not normally possible to return from the function + that invoked the prctl() that enabled GCS.
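To make the enable flow concrete, a minimal userspace sketch (not part of the patch; the constants are assumed to come from the prctl patch earlier in this series, and error handling beyond the return code is elided):

#include <sys/prctl.h>

#ifndef PR_SET_SHADOW_STACK_STATUS
#define PR_SET_SHADOW_STACK_STATUS	75
#define PR_SHADOW_STACK_ENABLE		(1UL << 0)
#endif

/* Enable GCS for the calling thread as early as possible; arg3..arg5
 * must be zero or the kernel rejects the call with -EINVAL. */
static int try_enable_gcs(void)
{
	return prctl(PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0);
}

As the document notes, GCS becomes active immediately, so the caller cannot normally return from the function that issued this prctl(); in practice the dynamic linker or other early startup code is expected to do this.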
[PATCH v13 06/40] arm64: Document boot requirements for Guarded Control Stacks
FEAT_GCS introduces a number of new system registers, we require that access to these registers is not trapped when we identify that the feature is present. There is also a HCRX_EL2 control to make GCS operations functional. Since any function call instruction will cause a fault if GCS is enabled, we also require that the feature be specifically disabled; existing kernels implicitly have this requirement, and especially given that the MMU must be disabled it is difficult to see a situation where leaving GCS enabled would be reasonable. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown --- Documentation/arch/arm64/booting.rst | 32 1 file changed, 32 insertions(+) diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index b57776a68f15..aed6e9f47cf3 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -411,6 +411,38 @@ Before jumping into the kernel, the following conditions must be met: - HFGRWR_EL2.nPIRE0_EL1 (bit 57) must be initialised to 0b1. + - For CPUs with Guarded Control Stacks (FEAT_GCS): + + - GCSCR_EL1 must be initialised to 0. + + - GCSCRE0_EL1 must be initialised to 0. + + - If EL3 is present: + +- SCR_EL3.GCSEn (bit 39) must be initialised to 0b1. + + - If EL2 is present: + +- GCSCR_EL2 must be initialised to 0. + + - If the kernel is entered at EL1 and EL2 is present: + +- HCRX_EL2.GCSEn must be initialised to 0b1. + +- HFGITR_EL2.nGCSEPP (bit 59) must be initialised to 0b1. + +- HFGITR_EL2.nGCSSTR_EL1 (bit 58) must be initialised to 0b1. + +- HFGITR_EL2.nGCSPUSHM_EL1 (bit 57) must be initialised to 0b1. + +- HFGRTR_EL2.nGCS_EL1 (bit 53) must be initialised to 0b1. + +- HFGRTR_EL2.nGCS_EL0 (bit 52) must be initialised to 0b1. + +- HFGWTR_EL2.nGCS_EL1 (bit 53) must be initialised to 0b1. + +- HFGWTR_EL2.nGCS_EL0 (bit 52) must be initialised to 0b1. + The requirements described above for CPU mode, caches, MMUs, architected timers, coherency and system registers apply to all CPUs. All CPUs must enter the kernel in the same exception level. Where the values documented -- 2.39.2
[PATCH v13 04/40] prctl: arch-agnostic prctl for shadow stack
Three architectures (x86, aarch64, riscv) have announced support for shadow stacks with fairly similar functionality. While x86 is using arch_prctl() to control the functionality neither arm64 nor riscv uses that interface so this patch adds arch-agnostic prctl() support to get and set status of shadow stacks and lock the current configuration to prevent further changes, with support for turning on and off individual subfeatures so applications can limit their exposure to features that they do not need. The features are: - PR_SHADOW_STACK_ENABLE: Tracking and enforcement of shadow stacks, including allocation of a shadow stack if one is not already allocated. - PR_SHADOW_STACK_WRITE: Writes to specific addresses in the shadow stack. - PR_SHADOW_STACK_PUSH: Push additional values onto the shadow stack. These features are expected to be inherited by new threads and cleared on exec(), unknown features should be rejected for enable but accepted for locking (in order to allow for future proofing). This is based on a patch originally written by Deepak Gupta but modified fairly heavily, support for indirect landing pads is removed, additional modes added and the locking interface reworked. The set status prctl() is also reworked to just set flags, if setting/reading the shadow stack pointer is required this could be a separate prctl. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Yury Khrustalev Signed-off-by: Mark Brown --- include/linux/mm.h | 4 include/uapi/linux/prctl.h | 22 ++ kernel/sys.c | 30 ++ 3 files changed, 56 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 182bad0c55df..56654306a832 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4221,4 +4221,8 @@ static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) } #endif /* CONFIG_MEM_ALLOC_PROFILING */ +int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); +int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); +int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); + #endif /* _LINUX_MM_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 35791791a879..557a3d2ac1d4 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -328,4 +328,26 @@ struct prctl_mm_map { # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC0x10 /* Clear the aspect on exec */ # define PR_PPC_DEXCR_CTRL_MASK0x1f +/* + * Get the current shadow stack configuration for the current thread, + * this will be the value configured via PR_SET_SHADOW_STACK_STATUS. + */ +#define PR_GET_SHADOW_STACK_STATUS 74 + +/* + * Set the current shadow stack configuration. Enabling the shadow + * stack will cause a shadow stack to be allocated for the thread. + */ +#define PR_SET_SHADOW_STACK_STATUS 75 +# define PR_SHADOW_STACK_ENABLE (1UL << 0) +# define PR_SHADOW_STACK_WRITE (1UL << 1) +# define PR_SHADOW_STACK_PUSH (1UL << 2) + +/* + * Prevent further changes to the specified shadow stack + * configuration. All bits may be locked via this call, including + * undefined bits. 
+ */ +#define PR_LOCK_SHADOW_STACK_STATUS 76 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 4da31f28fda8..3d38a9c7c5c9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2324,6 +2324,21 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; } +int __weak arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status) +{ + return -EINVAL; +} + +int __weak arch_set_shadow_stack_status(struct task_struct *t, unsigned long status) +{ + return -EINVAL; +} + +int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status) +{ + return -EINVAL; +} + #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) #ifdef CONFIG_ANON_VMA_NAME @@ -2784,6 +2799,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_RISCV_SET_ICACHE_FLUSH_CTX: error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); break; + case PR_GET_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_get_shadow_stack_status(me, (unsigned long __user *) arg2); + break; + case PR_SET_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_set_shadow_stack_status(me, arg2); + break; + case PR_LOCK_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_lock_shadow_stack_status(me, arg2); + break; default:
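For illustration, the full userspace lifecycle of the three prctl()s might look like the following sketch (constants as defined in the patch above; return values unchecked for brevity):

#include <sys/prctl.h>

#define PR_GET_SHADOW_STACK_STATUS	74
#define PR_SET_SHADOW_STACK_STATUS	75
#define PR_LOCK_SHADOW_STACK_STATUS	76
#define PR_SHADOW_STACK_ENABLE		(1UL << 0)

static void setup_shadow_stack(void)
{
	unsigned long status;

	/* Enable; this allocates a shadow stack for the thread */
	prctl(PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0);

	/* Read the configuration back; arg2 is a pointer for "get" */
	prctl(PR_GET_SHADOW_STACK_STATUS, &status, 0, 0, 0);

	/* Lock every bit, including undefined ones, so no further
	 * changes to the mode are possible for this thread */
	prctl(PR_LOCK_SHADOW_STACK_STATUS, ~0UL, 0, 0, 0);
}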
[PATCH v13 05/40] mman: Add map_shadow_stack() flags
In preparation for adding arm64 GCS support make the map_shadow_stack() SHADOW_STACK_SET_TOKEN flag generic and add _SET_MARKER. The existing flag indicates that a token usable for stack switch should be added to the top of the newly mapped GCS region while the new flag indicates that a top of stack marker suitable for use by unwinders should be added above that. For arm64 the top of stack marker is all bits 0. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Yury Khrustalev Signed-off-by: Mark Brown --- arch/x86/include/uapi/asm/mman.h | 3 --- include/uapi/asm-generic/mman.h | 4 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 46cdc941f958..ac1e6277212b 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -5,9 +5,6 @@ #define MAP_32BIT 0x40/* only give out 32bit addresses */ #define MAP_ABOVE4G0x80/* only map above 4GB */ -/* Flags for map_shadow_stack(2) */ -#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */ - #include #endif /* _ASM_X86_MMAN_H */ diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h index 57e8195d0b53..5e3d61ddbd8c 100644 --- a/include/uapi/asm-generic/mman.h +++ b/include/uapi/asm-generic/mman.h @@ -19,4 +19,8 @@ #define MCL_FUTURE 2 /* lock all future mappings */ #define MCL_ONFAULT4 /* lock all pages that are faulted in */ +#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */ +#define SHADOW_STACK_SET_MARKER (1ULL << 1) /* Set up a top of stack marker in the shadow stack */ + + #endif /* __ASM_GENERIC_MMAN_H */ -- 2.39.2
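As an illustration of the intended use, userspace would pass these flags to the syscall roughly like this (a sketch; __NR_map_shadow_stack is assumed to be provided by the kernel headers and errors are collapsed to NULL):

#include <unistd.h>
#include <sys/syscall.h>

#define SHADOW_STACK_SET_TOKEN	(1ULL << 0)
#define SHADOW_STACK_SET_MARKER	(1ULL << 1)

/* Map a 64KiB shadow stack with a switch token at the top and an
 * end of stack marker above it. */
static void *alloc_switchable_stack(void)
{
	long addr = syscall(__NR_map_shadow_stack, 0, 64 * 1024,
			    SHADOW_STACK_SET_TOKEN | SHADOW_STACK_SET_MARKER);

	return addr == -1 ? NULL : (void *)addr;
}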
[PATCH v13 02/40] mm: Define VM_HIGH_ARCH_6
The addition of protection keys means that on arm64 we now use all of the currently defined VM_HIGH_ARCH_x bits. In order to allow us to allocate a new flag for GCS pages define VM_HIGH_ARCH_6. Signed-off-by: Mark Brown --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index ecf63d2b0582..182bad0c55df 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -329,12 +329,14 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) +#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS -- 2.39.2
[PATCH v13 12/40] arm64/cpufeature: Runtime detection of Guarded Control Stack (GCS)
Add a cpufeature for GCS, allowing other code to conditionally support it at runtime. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown --- arch/arm64/include/asm/cpufeature.h | 6 ++ arch/arm64/kernel/cpufeature.c | 20 arch/arm64/tools/cpucaps| 1 + 3 files changed, 27 insertions(+) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 3d261cc123c1..69470795f5d2 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -838,6 +838,12 @@ static inline bool system_supports_poe(void) alternative_has_cap_unlikely(ARM64_HAS_S1POE); } +static inline bool system_supports_gcs(void) +{ + return IS_ENABLED(CONFIG_ARM64_GCS) && + alternative_has_cap_unlikely(ARM64_HAS_GCS); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 718728a85430..d1e758e99e0a 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -291,6 +291,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0), @@ -2358,6 +2360,14 @@ static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused) } #endif +#ifdef CONFIG_ARM64_GCS +static void cpu_enable_gcs(const struct arm64_cpu_capabilities *__unused) +{ + /* GCSPR_EL0 is always readable */ + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); +} +#endif + /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) @@ -2889,6 +2899,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_poe, ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP) }, +#endif +#ifdef CONFIG_ARM64_GCS + { + .desc = "Guarded Control Stack (GCS)", + .capability = ARM64_HAS_GCS, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .cpu_enable = cpu_enable_gcs, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP) + }, #endif {}, }; diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index eedb5acc21ed..867d25d4a45a 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -29,6 +29,7 @@ HAS_EVT HAS_FPMR HAS_FGT HAS_FPSIMD +HAS_GCS HAS_GENERIC_AUTH HAS_GENERIC_AUTH_ARCH_QARMA3 HAS_GENERIC_AUTH_ARCH_QARMA5 -- 2.39.2
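Everything else in the series keys off this capability; the pattern later patches use looks roughly like the following sketch (illustrative only, not a quote from the series):

static int some_gcs_operation(void)
{
	/* Patched by the alternatives framework, near-free at runtime */
	if (!system_supports_gcs())
		return 0;

	/* ... GCS system register accesses are safe from here on ... */
	return 0;
}

The IS_ENABLED() check in the helper also lets the compiler drop GCS-only code entirely when CONFIG_ARM64_GCS is not set.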
[PATCH v13 11/40] arm64/gcs: Provide basic EL2 setup to allow GCS usage at EL0 and EL1
There is a control HCRX_EL2.GCSEn which must be set to allow GCS features to take effect at lower ELs, and there are also fine grained traps for GCS usage at EL0 and EL1. Configure all these to allow GCS usage by EL0 and EL1. We also initialise GCSCR_EL1 and GCSCRE0_EL1 to ensure that we can execute function call instructions without faulting regardless of the state when the kernel is started. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown --- arch/arm64/include/asm/el2_setup.h | 30 ++ 1 file changed, 30 insertions(+) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index e0ffdf13a18b..27086a81eae3 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -27,6 +27,14 @@ ubfxx0, x0, #ID_AA64MMFR1_EL1_HCX_SHIFT, #4 cbz x0, .Lskip_hcrx_\@ mov_q x0, HCRX_HOST_FLAGS + +/* Enable GCS if supported */ + mrs_s x1, SYS_ID_AA64PFR1_EL1 + ubfxx1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4 + cbz x1, .Lset_hcrx_\@ + orr x0, x0, #HCRX_EL2_GCSEn + +.Lset_hcrx_\@: msr_s SYS_HCRX_EL2, x0 .Lskip_hcrx_\@: .endm @@ -200,6 +208,16 @@ orr x0, x0, #HFGxTR_EL2_nPOR_EL0 .Lskip_poe_fgt_\@: + /* GCS depends on PIE so we don't check it if PIE is absent */ + mrs_s x1, SYS_ID_AA64PFR1_EL1 + ubfxx1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4 + cbz x1, .Lset_fgt_\@ + + /* Disable traps of access to GCS registers at EL0 and EL1 */ + orr x0, x0, #HFGxTR_EL2_nGCS_EL1_MASK + orr x0, x0, #HFGxTR_EL2_nGCS_EL0_MASK + +.Lset_fgt_\@: msr_s SYS_HFGRTR_EL2, x0 msr_s SYS_HFGWTR_EL2, x0 msr_s SYS_HFGITR_EL2, xzr @@ -215,6 +233,17 @@ .Lskip_fgt_\@: .endm +.macro __init_el2_gcs + mrs_s x1, SYS_ID_AA64PFR1_EL1 + ubfxx1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4 + cbz x1, .Lskip_gcs_\@ + + /* Ensure GCS is not enabled when we start trying to do BLs */ + msr_s SYS_GCSCR_EL1, xzr + msr_s SYS_GCSCRE0_EL1, xzr +.Lskip_gcs_\@: +.endm + .macro __init_el2_nvhe_prepare_eret mov x0, #INIT_PSTATE_EL1 msr spsr_el2, x0 @@ -240,6 +269,7 @@ __init_el2_nvhe_idregs __init_el2_cptr __init_el2_fgt +__init_el2_gcs .endm #ifndef __KVM_NVHE_HYPERVISOR__ -- 2.39.2
[PATCH v13 08/40] arm64/sysreg: Add definitions for architected GCS caps
The architecture defines a format for guarded control stack caps, used to mark the top of an unused GCS in order to limit the potential for exploitation via stack switching. Add definitions associated with these. Reviewed-by: Thiago Jung Bauermann Acked-by: Catalin Marinas Signed-off-by: Mark Brown --- arch/arm64/include/asm/sysreg.h | 20 1 file changed, 20 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 9ea97dddefc4..9c98ff448bd9 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1101,6 +1101,26 @@ /* Initial value for Permission Overlay Extension for EL0 */ #define POR_EL0_INIT POE_RXW +/* + * Definitions for Guarded Control Stack + */ + +#define GCS_CAP_ADDR_MASK GENMASK(63, 12) +#define GCS_CAP_ADDR_SHIFT 12 +#define GCS_CAP_ADDR_WIDTH 52 +#define GCS_CAP_ADDR(x) FIELD_GET(GCS_CAP_ADDR_MASK, x) + +#define GCS_CAP_TOKEN_MASK GENMASK(11, 0) +#define GCS_CAP_TOKEN_SHIFT 0 +#define GCS_CAP_TOKEN_WIDTH 12 +#define GCS_CAP_TOKEN(x) FIELD_GET(GCS_CAP_TOKEN_MASK, x) + +#define GCS_CAP_VALID_TOKEN 0x1 +#define GCS_CAP_IN_PROGRESS_TOKEN 0x5 + +#define GCS_CAP(x) ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | \ + GCS_CAP_VALID_TOKEN) + #define ARM64_FEATURE_FIELD_BITS 4 /* Defined for compatibility only, do not add new users. */ -- 2.39.2
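To illustrate how the definitions compose, a kernel-style sketch (assuming linux/bitfield.h for FIELD_GET; 'addr' is the GCS entry the cap protects):

/* A valid cap stores bits [63:12] of the address it lives at plus the
 * valid token nibble in bits [11:0]. */
static bool gcs_cap_matches(unsigned long addr, unsigned long cap)
{
	return cap == GCS_CAP(addr) &&
	       GCS_CAP_TOKEN(cap) == GCS_CAP_VALID_TOKEN;
}

The second condition is implied by the first for a well-formed cap; it is spelled out here only to show the token accessor.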
[PATCH v13 14/40] mm: Define VM_SHADOW_STACK for arm64 when we support GCS
Use VM_HIGH_ARCH_5 for guarded control stack pages. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown --- Documentation/filesystems/proc.rst | 2 +- include/linux/mm.h | 12 +++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index e834779d9611..6a882c57a7e7 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -579,7 +579,7 @@ encoded manner. The codes are the following: mtarm64 MTE allocation tags are enabled umuserfaultfd missing tracking uwuserfaultfd wr-protect tracking -ssshadow stack page +ssshadow/guarded control stack page slsealed ===== diff --git a/include/linux/mm.h b/include/linux/mm.h index 56654306a832..8852c39c7695 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -367,7 +367,17 @@ extern unsigned int kobjsize(const void *objp); * for more details on the guard size. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_5 -#else +#endif + +#if defined(CONFIG_ARM64_GCS) +/* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. + */ +# define VM_SHADOW_STACK VM_HIGH_ARCH_6 +#endif + +#ifndef VM_SHADOW_STACK # define VM_SHADOW_STACK VM_NONE #endif -- 2.39.2
[PATCH v13 18/40] arm64/hwcap: Add hwcap for GCS
Provide a hwcap to enable userspace to detect support for GCS. Signed-off-by: Mark Brown --- Documentation/arch/arm64/elf_hwcaps.rst | 4 arch/arm64/include/asm/hwcap.h | 1 + arch/arm64/include/uapi/asm/hwcap.h | 3 ++- arch/arm64/kernel/cpufeature.c | 3 +++ arch/arm64/kernel/cpuinfo.c | 1 + 5 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst index 694f67fa07d1..25b41ff74fa0 100644 --- a/Documentation/arch/arm64/elf_hwcaps.rst +++ b/Documentation/arch/arm64/elf_hwcaps.rst @@ -170,6 +170,10 @@ HWCAP_PACG ID_AA64ISAR1_EL1.GPI == 0b0001, as described by Documentation/arch/arm64/pointer-authentication.rst. +HWCAP_GCS +Functionality implied by ID_AA64PFR1_EL1.GCS == 0b1, as +described by Documentation/arch/arm64/gcs.rst. + HWCAP2_DCPODP Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010. diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index a775adddecf2..7bcf1347ca0b 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -92,6 +92,7 @@ #define KERNEL_HWCAP_SB__khwcap_feature(SB) #define KERNEL_HWCAP_PACA __khwcap_feature(PACA) #define KERNEL_HWCAP_PACG __khwcap_feature(PACG) +#define KERNEL_HWCAP_GCS __khwcap_feature(GCS) #define __khwcap2_feature(x) (const_ilog2(HWCAP2_ ## x) + 64) #define KERNEL_HWCAP_DCPODP__khwcap2_feature(DCPODP) diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 055381b2c615..675642ec4d91 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -21,7 +21,7 @@ * HWCAP flags - for AT_HWCAP * * Bits 62 and 63 are reserved for use by libc. - * Bits 32-61 are unallocated for potential use by libc. + * Bits 33-61 are unallocated for potential use by libc. */ #define HWCAP_FP (1 << 0) #define HWCAP_ASIMD(1 << 1) @@ -55,6 +55,7 @@ #define HWCAP_SB (1 << 29) #define HWCAP_PACA (1 << 30) #define HWCAP_PACG (1UL << 31) +#define HWCAP_GCS (1UL << 32) /* * HWCAP2 flags - for AT_HWCAP2 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index d1e758e99e0a..b8655d55f318 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3025,6 +3025,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ZFR0_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM), HWCAP_CAP(ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM), HWCAP_CAP(ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), +#endif +#ifdef CONFIG_ARM64_GCS + HWCAP_CAP(ID_AA64PFR1_EL1, GCS, IMP, CAP_HWCAP, KERNEL_HWCAP_GCS), #endif HWCAP_CAP(ID_AA64PFR1_EL1, SSBS, SSBS2, CAP_HWCAP, KERNEL_HWCAP_SSBS), #ifdef CONFIG_ARM64_BTI diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 44718d0482b3..f2f92c6b1c85 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -80,6 +80,7 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SB] = "sb", [KERNEL_HWCAP_PACA] = "paca", [KERNEL_HWCAP_PACG] = "pacg", + [KERNEL_HWCAP_GCS] = "gcs", [KERNEL_HWCAP_DCPODP] = "dcpodp", [KERNEL_HWCAP_SVE2] = "sve2", [KERNEL_HWCAP_SVEAES] = "sveaes", -- 2.39.2
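For completeness, userspace detection is a one-liner against the aux vector (a sketch; note the flag sits in AT_HWCAP even though it is above bit 31):

#include <sys/auxv.h>

#ifndef HWCAP_GCS
#define HWCAP_GCS	(1UL << 32)
#endif

static int have_gcs(void)
{
	return !!(getauxval(AT_HWCAP) & HWCAP_GCS);
}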
[PATCH v13 24/40] arm64/mm: Implement map_shadow_stack()
As discussed extensively in the changelog for the addition of this syscall on x86 ("x86/shstk: Introduce map_shadow_stack syscall") the existing mmap() and madvise() syscalls do not map entirely well onto the security requirements for guarded control stacks since they lead to windows where memory is allocated but not yet protected or stacks which are not properly and safely initialised. Instead a new syscall map_shadow_stack() has been defined which allocates and initialises a shadow stack page. Implement this for arm64. Two flags are provided, allowing applications to request that the stack be initialised with a valid cap token at the top of the stack and optionally also an end of stack marker above that. We support requesting an end of stack marker alone but since this is a NULL pointer it is indistinguishable from not initialising anything by itself. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Yury Khrustalev Signed-off-by: Mark Brown --- arch/arm64/mm/gcs.c | 64 + 1 file changed, 64 insertions(+) diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c index 61a80de6baf8..5c46ec527b1c 100644 --- a/arch/arm64/mm/gcs.c +++ b/arch/arm64/mm/gcs.c @@ -68,6 +68,70 @@ unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, return addr; } +SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags) +{ + unsigned long alloc_size; + unsigned long __user *cap_ptr; + unsigned long cap_val; + int ret = 0; + int cap_offset; + + if (!system_supports_gcs()) + return -EOPNOTSUPP; + + if (flags & ~(SHADOW_STACK_SET_TOKEN | SHADOW_STACK_SET_MARKER)) + return -EINVAL; + + if (!PAGE_ALIGNED(addr)) + return -EINVAL; + + if (size == 8 || !IS_ALIGNED(size, 8)) + return -EINVAL; + + /* +* An overflow would result in attempting to write the restore token +* to the wrong location. Not catastrophic, but just return the right +* error code and block it. +*/ + alloc_size = PAGE_ALIGN(size); + if (alloc_size < size) + return -EOVERFLOW; + + addr = alloc_gcs(addr, alloc_size); + if (IS_ERR_VALUE(addr)) + return addr; + + /* +* Put a cap token at the end of the allocated region so it +* can be switched to. +*/ + if (flags & SHADOW_STACK_SET_TOKEN) { + /* Leave an extra empty frame as a top of stack marker? */ + if (flags & SHADOW_STACK_SET_MARKER) + cap_offset = 2; + else + cap_offset = 1; + + cap_ptr = (unsigned long __user *)(addr + size - + (cap_offset * sizeof(unsigned long))); + cap_val = GCS_CAP(cap_ptr); + + put_user_gcs(cap_val, cap_ptr, &ret); + if (ret != 0) { + vm_munmap(addr, size); + return -EFAULT; + } + + /* +* Ensure the new cap is ordered before standard +* memory accesses to the same location. +*/ + gcsb_dsync(); + } + + return addr; +} + /* * Apply the GCS mode configured for the specified task to the * hardware. -- 2.39.2
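Spelling out the layout the code above produces for a mapping of size N with both flags set (the marker is the implicit zero fill of the fresh anonymous mapping, per the commit message):

/*
 *   addr + N - 8:   0                       top of stack marker
 *   addr + N - 16:  GCS_CAP(addr + N - 16)  cap token for stack switching
 *   ...
 *   addr:           base of the new GCS
 */

With SHADOW_STACK_SET_TOKEN alone the cap moves up to addr + N - 8 (cap_offset of 1).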
[PATCH v13 25/40] arm64/signal: Set up and restore the GCS context for signal handlers
When invoking a signal handler we use the GCS configuration and stack for the current thread. Since we implement signal return by calling the signal handler with a return address set up pointing to a trampoline in the vDSO we need to also configure any active GCS for this by pushing a frame for the trampoline onto the GCS. If we do not do this then signal return will generate a GCS protection fault. In order to guard against attempts to bypass GCS protections via signal return we only allow returning with GCSPR_EL0 pointing to an address where it was previously preempted by a signal. We do this by pushing a cap onto the GCS, this takes the form of an architectural GCS cap token with the top bit set and token type of 0 which we add on signal entry and validate and pop off on signal return. The combination of the top bit being set and the token type mean that this can't be interpreted as a valid token or address. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown --- arch/arm64/include/asm/gcs.h | 1 + arch/arm64/kernel/signal.c | 118 +-- 2 files changed, 114 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 48c97e63e56a..f50660603ecf 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -9,6 +9,7 @@ #include struct kernel_clone_args; +struct ksignal; static inline void gcsb_dsync(void) { diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 561986947530..b5ab0e229a78 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,15 @@ #include #include +#ifdef CONFIG_ARM64_GCS +#define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) + +static bool gcs_signal_cap_valid(u64 addr, u64 val) +{ + return val == GCS_SIGNAL_CAP(addr); +} +#endif + /* * Do a signal return; undo the signal stack. These are aligned to 128-bit. */ @@ -904,6 +914,58 @@ static int restore_sigframe(struct pt_regs *regs, return err; } +#ifdef CONFIG_ARM64_GCS +static int gcs_restore_signal(void) +{ + unsigned long __user *gcspr_el0; + u64 cap; + int ret; + + if (!system_supports_gcs()) + return 0; + + if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE)) + return 0; + + gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0); + + /* +* Ensure that any changes to the GCS done via GCS operations +* are visible to the normal reads we do to validate the +* token. +*/ + gcsb_dsync(); + + /* +* GCSPR_EL0 should be pointing at a capped GCS, read the cap. +* We don't enforce that this is in a GCS page, if it is not +* then faults will be generated on GCS operations - the main +* concern is to protect GCS pages. +*/ + ret = copy_from_user(&cap, gcspr_el0, sizeof(cap)); + if (ret) + return -EFAULT; + + /* +* Check that the cap is the actual GCS before replacing it. 
+*/ + if (!gcs_signal_cap_valid((u64)gcspr_el0, cap)) + return -EINVAL; + + /* Invalidate the token to prevent reuse */ + put_user_gcs(0, (__user void*)gcspr_el0, &ret); + if (ret != 0) + return -EFAULT; + + write_sysreg_s(gcspr_el0 + 1, SYS_GCSPR_EL0); + + return 0; +} + +#else +static int gcs_restore_signal(void) { return 0; } +#endif + SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); @@ -927,6 +989,9 @@ SYSCALL_DEFINE0(rt_sigreturn) if (restore_sigframe(regs, frame)) goto badframe; + if (gcs_restore_signal()) + goto badframe; + if (restore_altstack(&frame->uc.uc_stack)) goto badframe; @@ -1189,7 +1254,48 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, return 0; } -static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, +#ifdef CONFIG_ARM64_GCS + +static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) +{ + unsigned long __user *gcspr_el0; + int ret = 0; + + if (!system_supports_gcs()) + return 0; + + if (!task_gcs_el0_enabled(current)) + return 0; + + /* +* We are entering a signal handler, current register state is +* active. +*/ + gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0); + + /* +* Push a cap and the GCS entry for the trampoline onto the GCS. +*/ + put_user_gcs((unsigned long)sigtramp, gcspr_el0 - 2, &ret); + put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 1), gcspr_el0 - 1, &ret); + if (ret != 0) + return ret; + +
Re: [PATCH v13 16/40] KVM: arm64: Manage GCS access and registers for guests
On Tue, 01 Oct 2024 23:58:55 +0100, Mark Brown wrote: > @@ -4714,6 +4735,10 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) > kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPOR_EL1 | > HFGxTR_EL2_nPOR_EL0); > > + if (!kvm_has_gcs(kvm)) > + kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nGCS_EL0 | > + HFGxTR_EL2_nGCS_EL1); > + Why are you still allowing the GCS instructions when GCS isn't enabled? M. -- Without deviation from the norm, progress is not possible.
Re: [PATCH RFC v4 0/9] tun: Introduce virtio-net hashing feature
On 2024/10/02 1:31, Stephen Hemminger wrote: On Tue, 1 Oct 2024 14:54:29 +0900 Akihiko Odaki wrote: On 2024/09/30 0:33, Stephen Hemminger wrote: On Sun, 29 Sep 2024 16:10:47 +0900 Akihiko Odaki wrote: On 2024/09/29 11:07, Jason Wang wrote: On Fri, Sep 27, 2024 at 3:51 PM Akihiko Odaki wrote: On 2024/09/27 13:31, Jason Wang wrote: On Fri, Sep 27, 2024 at 10:11 AM Akihiko Odaki wrote: On 2024/09/25 12:30, Jason Wang wrote: On Tue, Sep 24, 2024 at 5:01 PM Akihiko Odaki wrote: virtio-net has two usages of hashes: one is RSS and another is hash reporting. Conventionally the hash calculation was done by the VMM. However, computing the hash after the queue was chosen defeats the purpose of RSS. Another approach is to use an eBPF steering program. This approach has another downside: it cannot report the calculated hash due to the restrictive nature of eBPF. Introduce the code to compute hashes to the kernel in order to overcome these challenges. An alternative solution is to extend the eBPF steering program so that it will be able to report to the userspace, but it is based on context rewrites, which is in feature freeze. We can adopt kfuncs, but they will not be UAPIs. We opt for ioctl to align with other relevant UAPIs (KVM and vhost_net). I wonder if we could clone the skb and reuse some to store the hash, then the steering eBPF program can access these fields without introducing full RSS in the kernel? I don't get how cloning the skb can solve the issue. We can certainly implement the Toeplitz function in the kernel or even with tc-bpf to store a hash value that can be used for the eBPF steering program and virtio hash reporting. However we don't have a means of storing a hash type, which is specific to virtio hash reporting and lacks a corresponding skb field. I may be missing something, but looking at sk_filter_is_valid_access(), it looks to me we can make use of skb->cb[0..4]? I didn't opt to use cb. Below is the rationale: cb is for tail call so it means we reuse the field for a different purpose. The context rewrite allows adding a field without increasing the size of the underlying storage (the real sk_buff) so we should add a new field instead of reusing an existing field to avoid confusion. We are however no longer allowed to add a new field. In my understanding, this is because it is a UAPI, and eBPF maintainers found it is difficult to maintain its stability. Reusing cb for hash reporting is a workaround to avoid having a new field, but it does not solve the underlying problem (i.e., keeping eBPF as stable as UAPI is unreasonably hard). In my opinion, adding an ioctl is a reasonable option to keep the API as stable as other virtualization UAPIs while respecting the underlying intention of the context rewrite feature freeze. Fair enough. Btw, I remember DPDK implements tuntap RSS via eBPF as well (probably via cls or other). It might be worth seeing if there is anything we miss here. Thanks for the information. I wonder why they used cls instead of the steering program. Perhaps it may be due to compatibility with macvtap and ipvtap, which don't support steering programs. Their RSS implementation looks cleaner so I will improve my RSS implementation accordingly. DPDK needs to support flow rules. The specific case is where packets are classified by a flow, then RSS is done across a subset of the queues. The support for flow in the TUN driver is more academic than useful; I fixed it for current BPF, but doubt anyone is using it really. 
A full steering program would be good, but would require much more complexity to take a general set of flow rules and then communicate them to the steering program. It reminded me of RSS context and flow filter. Some physical NICs support using a dedicated RSS context for packets matched with a flow filter, and virtio is also gaining corresponding features. RSS context: https://github.com/oasis-tcs/virtio-spec/issues/178 Flow filter: https://github.com/oasis-tcs/virtio-spec/issues/179 I considered the possibility of supporting these features with tc instead of adding ioctls to tuntap, but it does not seem appropriate for the virtualization use case. In a virtualization use case, tuntap is configured according to requests of guests, and the code processing these requests needs to have minimal permissions for security. This goal is achieved by passing a file descriptor that represents a tuntap from a privileged process (e.g., libvirt) to the process handling guest requests (e.g., QEMU). However, tc is configured with rtnetlink, which does not seem to have an interface to delegate a permission for one particular device to another process. For now I'll continue working on the current approach that is based on ioctl and lacks RSS context and flow filter features. Eventually they are also likely to require new ioctls if they are to be supported with vhost_net. The DPDK flow handling (rte_flow) was started by Mellanox and many of the features are to support what t
[PATCH 29/33] riscv: kernel command line option to opt out of user cfi
This commit adds a kernel command line option with which user CFI can be disabled. Signed-off-by: Deepak Gupta --- arch/riscv/kernel/usercfi.c | 20 1 file changed, 20 insertions(+) diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c index 40c32258b6ec..d92b49261b58 100644 --- a/arch/riscv/kernel/usercfi.c +++ b/arch/riscv/kernel/usercfi.c @@ -17,6 +17,8 @@ #include #include +bool disable_riscv_usercfi; + #define SHSTK_ENTRY_SIZE sizeof(void *) bool is_shstk_enabled(struct task_struct *task) @@ -393,6 +395,9 @@ int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status) unsigned long size = 0, addr = 0; bool enable_shstk = false; + if (disable_riscv_usercfi) + return 0; + if (!cpu_supports_shadow_stack()) return -EINVAL; @@ -472,6 +477,9 @@ int arch_set_indir_br_lp_status(struct task_struct *t, unsigned long status) { bool enable_indir_lp = false; + if (disable_riscv_usercfi) + return 0; + if (!cpu_supports_indirect_br_lp_instr()) return -EINVAL; @@ -504,3 +512,15 @@ int arch_lock_indir_br_lp_status(struct task_struct *task, return 0; } + +static int __init setup_global_riscv_enable(char *str) +{ + if (strcmp(str, "true") == 0) + disable_riscv_usercfi = true; + + pr_info("Setting riscv usercfi to be %s\n", (disable_riscv_usercfi ? "disabled" : "enabled")); + + return 1; +} + +__setup("disable_riscv_usercfi=", setup_global_riscv_enable); -- 2.45.0
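With this applied, booting with disable_riscv_usercfi=true on the kernel command line disables user CFI globally; since the handler only matches the literal string "true", any other value (or omitting the option) leaves the feature enabled.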
[PATCH 26/33] riscv/hwprobe: zicfilp / zicfiss enumeration in hwprobe
Adding enumeration of zicfilp and zicfiss extensions in hwprobe syscall. Signed-off-by: Deepak Gupta --- arch/riscv/include/uapi/asm/hwprobe.h | 2 ++ arch/riscv/kernel/sys_hwprobe.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index 1e153cda57db..d5c5dec9ae6c 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -72,6 +72,8 @@ struct riscv_hwprobe { #defineRISCV_HWPROBE_EXT_ZCF (1ULL << 46) #defineRISCV_HWPROBE_EXT_ZCMOP (1ULL << 47) #defineRISCV_HWPROBE_EXT_ZAWRS (1ULL << 48) +#defineRISCV_HWPROBE_EXT_ZICFILP (1ULL << 49) +#defineRISCV_HWPROBE_EXT_ZICFISS (1ULL << 50) #define RISCV_HWPROBE_KEY_CPUPERF_05 #defineRISCV_HWPROBE_MISALIGNED_UNKNOWN(0 << 0) #defineRISCV_HWPROBE_MISALIGNED_EMULATED (1 << 0) diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index cea0ca2bf2a2..98f72ad7124f 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -107,6 +107,8 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair, EXT_KEY(ZCB); EXT_KEY(ZCMOP); EXT_KEY(ZICBOZ); + EXT_KEY(ZICFILP); + EXT_KEY(ZICFISS); EXT_KEY(ZICOND); EXT_KEY(ZIHINTNTL); EXT_KEY(ZIHINTPAUSE); -- 2.45.0
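For illustration, userspace can query these new bits roughly as follows (a sketch assuming the uapi header and glibc's syscall(2) wrapper; the empty cpu-set arguments query behaviour common to all CPUs):

#include <unistd.h>
#include <sys/syscall.h>
#include <asm/hwprobe.h>

static int have_zicfiss(void)
{
	struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_IMA_EXT_0 };

	/* pairs, pair_count, cpusetsize, cpus, flags */
	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
		return 0;

	return !!(pair.value & RISCV_HWPROBE_EXT_ZICFISS);
}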
[PATCH 30/33] riscv: create a config for shadow stack and landing pad instr support
This patch creates a config for shadow stack support and landing pad instruction support. Shadow stack support and landing pad instruction support can be enabled by selecting `CONFIG_RISCV_USER_CFI`. Selecting `CONFIG_RISCV_USER_CFI` wires up the path to enumerate CPU support and if CPU support exists, the kernel will support CPU assisted user mode CFI. If CONFIG_RISCV_USER_CFI is selected, select `ARCH_USES_HIGH_VMA_FLAGS` and `ARCH_HAS_USER_SHADOW_STACK` for riscv. Signed-off-by: Deepak Gupta --- arch/riscv/Kconfig | 19 +++ 1 file changed, 19 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 808ea66b9537..d0cc2879fcd4 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -245,6 +245,25 @@ config ARCH_HAS_BROKEN_DWARF5 # https://github.com/llvm/llvm-project/commit/7ffabb61a5569444b5ac9322e22e5471cc5e4a77 depends on LD_IS_LLD && LLD_VERSION < 18 +config RISCV_USER_CFI + def_bool y + bool "riscv userspace control flow integrity" + depends on 64BIT && $(cc-option,-mabi=lp64 -march=rv64ima_zicfiss) + depends on RISCV_ALTERNATIVE + select ARCH_HAS_USER_SHADOW_STACK + select ARCH_USES_HIGH_VMA_FLAGS + help + Provides CPU assisted control flow integrity to userspace tasks. + Control flow integrity is provided by implementing shadow stack for + backward edge and indirect branch tracking for forward edge in program. + Shadow stack protection is a hardware feature that detects function + return address corruption. This helps mitigate ROP attacks. + Indirect branch tracking enforces that all indirect branches must land + on a landing pad instruction else CPU will fault. This mitigates against + JOP / COP attacks. Applications must be enabled to use it, and old + user-space does not get protection "for free". + default y + config ARCH_MMAP_RND_BITS_MIN default 18 if 64BIT default 8 -- 2.45.0
[PATCH 15/33] riscv/mm: Implement map_shadow_stack() syscall
As discussed extensively in the changelog for the addition of this syscall on
x86 ("x86/shstk: Introduce map_shadow_stack syscall"), the existing mmap()
and madvise() syscalls do not map entirely well onto the security
requirements for shadow stack memory, since they lead to windows where memory
is allocated but not yet protected, or stacks which are not properly and
safely initialised. Instead a new syscall map_shadow_stack() has been defined
which allocates and initialises a shadow stack page.

This patch implements this syscall for riscv. riscv doesn't require the token
to be set up by the kernel because user mode can do that by itself. However,
to provide compatibility and portability with other architectures, user mode
can specify the token set flag.

Signed-off-by: Deepak Gupta
---
 arch/riscv/kernel/Makefile      |   2 +
 arch/riscv/kernel/usercfi.c     | 145 ++++++++++++++++++++++++++++++++
 include/uapi/asm-generic/mman.h |   4 ++
 3 files changed, 151 insertions(+)

diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 7f88cc4931f5..eb2c94dd0a9d 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -117,3 +117,5 @@ obj-$(CONFIG_COMPAT)	+= compat_vdso/
 obj-$(CONFIG_64BIT)	+= pi/
 obj-$(CONFIG_ACPI)	+= acpi.o
 obj-$(CONFIG_ACPI_NUMA)	+= acpi_numa.o
+
+obj-$(CONFIG_RISCV_USER_CFI) += usercfi.o
diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c
new file mode 100644
index ..ce002eabbdc1
--- /dev/null
+++ b/arch/riscv/kernel/usercfi.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Rivos, Inc.
+ * Deepak Gupta
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define SHSTK_ENTRY_SIZE sizeof(void *)
+
+/*
+ * Writes on shadow stack can either be `sspush` or `ssamoswap`. `sspush` can happen
+ * implicitly on current shadow stack pointed to by CSR_SSP. `ssamoswap` takes pointer to
+ * shadow stack. To keep it simple, we plan to use `ssamoswap` to perform writes on shadow
+ * stack.
+ */
+static noinline unsigned long amo_user_shstk(unsigned long *addr, unsigned long val)
+{
+	/*
+	 * Never expect -1 on shadow stack. Expect return addresses and zero
+	 */
+	unsigned long swap = -1;
+
+	__enable_user_access();
+	asm goto(
+		".option push\n"
+		".option arch, +zicfiss\n"
+		"1: ssamoswap.d %[swap], %[val], %[addr]\n"
+		_ASM_EXTABLE(1b, %l[fault])
+		RISCV_ACQUIRE_BARRIER
+		".option pop\n"
+		: [swap] "=r" (swap), [addr] "+A" (*addr)
+		: [val] "r" (val)
+		: "memory"
+		: fault
+		);
+	__disable_user_access();
+	return swap;
+fault:
+	__disable_user_access();
+	return -1;
+}
+
+/*
+ * Create a restore token on the shadow stack. A token is always XLEN wide
+ * and aligned to XLEN.
+ */ +static int create_rstor_token(unsigned long ssp, unsigned long *token_addr) +{ + unsigned long addr; + + /* Token must be aligned */ + if (!IS_ALIGNED(ssp, SHSTK_ENTRY_SIZE)) + return -EINVAL; + + /* On RISC-V we're constructing token to be function of address itself */ + addr = ssp - SHSTK_ENTRY_SIZE; + + if (amo_user_shstk((unsigned long __user *)addr, (unsigned long) ssp) == -1) + return -EFAULT; + + if (token_addr) + *token_addr = addr; + + return 0; +} + +static unsigned long allocate_shadow_stack(unsigned long addr, unsigned long size, + unsigned long token_offset, + bool set_tok) +{ + int flags = MAP_ANONYMOUS | MAP_PRIVATE; + struct mm_struct *mm = current->mm; + unsigned long populate, tok_loc = 0; + + if (addr) + flags |= MAP_FIXED_NOREPLACE; + + mmap_write_lock(mm); + addr = do_mmap(NULL, addr, size, PROT_READ, flags, + VM_SHADOW_STACK | VM_WRITE, 0, &populate, NULL); + mmap_write_unlock(mm); + + if (!set_tok || IS_ERR_VALUE(addr)) + goto out; + + if (create_rstor_token(addr + token_offset, &tok_loc)) { + vm_munmap(addr, size); + return -EINVAL; + } + + addr = tok_loc; + +out: + return addr; +} + +SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags) +{ + bool set_tok = flags & SHADOW_STACK_SET_TOKEN; + unsigned long aligned_size = 0; + + if (!cpu_supports_shadow_stack()) + return -EOPNOTSUPP; + + /* Anything other than set
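From the user side, the new syscall can be exercised as below. This is a sketch: the syscall number fallback and the SHADOW_STACK_SET_TOKEN fallback are assumptions mirroring the asm-generic uapi added in this series, not values pinned down in the hunk above.

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #ifndef __NR_map_shadow_stack
    #define __NR_map_shadow_stack 453   /* assumed; see include/uapi/asm-generic/unistd.h */
    #endif
    #ifndef SHADOW_STACK_SET_TOKEN
    #define SHADOW_STACK_SET_TOKEN (1ULL << 0)  /* assumed, from uapi/asm-generic/mman.h */
    #endif

    int main(void)
    {
        /* kernel picks the address; 4 KiB shadow stack with a restore token at the top */
        long ssp = syscall(__NR_map_shadow_stack, 0, 4096, SHADOW_STACK_SET_TOKEN);

        if (ssp == -1) {
            perror("map_shadow_stack");
            return 1;
        }
        /* with SET_TOKEN the return value points at the token slot below the top */
        printf("shadow stack token at %#lx\n", ssp);
        return 0;
    }

A program would then pass such a pointer as the shadow stack for a context it later pivots to (for example via the restore token check on sigreturn described later in the series).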
[PATCH 23/33] riscv signal: save and restore of shadow stack for signal
Save the shadow stack pointer in the sigcontext structure while delivering a
signal, and restore the shadow stack pointer from the sigcontext on
sigreturn. As part of the save operation, the kernel uses `ssamoswap` to save
a snapshot of the current shadow stack on the shadow stack itself (which can
be called a save token). During restore on sigreturn, the kernel retrieves
the token from the top of the shadow stack and validates it. This ensures
that user mode can't arbitrarily pivot to any shadow stack address without
having a token, and thus provides a strong security assurance in the window
between signal delivery and sigreturn.

Use an ABI compatible way of saving/restoring the shadow stack pointer on the
signal stack. This follows the Vector extension, where extra registers are
placed in the form of extension header + extension body on the stack. The
extension header indicates the size of the extra architectural state plus the
size of the header itself, and a magic identifier of the extension. Then the
extension body contains the new architectural state in the form defined by
uapi.

Signed-off-by: Andy Chiu
Signed-off-by: Deepak Gupta
---
 arch/riscv/include/asm/usercfi.h         | 10 ++++
 arch/riscv/include/uapi/asm/ptrace.h     |  4 ++
 arch/riscv/include/uapi/asm/sigcontext.h |  1 +
 arch/riscv/kernel/signal.c               | 80 ++++++++++++++++++++++
 arch/riscv/kernel/usercfi.c              | 57 +++++++++++++++
 5 files changed, 152 insertions(+)

diff --git a/arch/riscv/include/asm/usercfi.h b/arch/riscv/include/asm/usercfi.h
index 099204d0cd4a..8da61b005d0a 100644
--- a/arch/riscv/include/asm/usercfi.h
+++ b/arch/riscv/include/asm/usercfi.h
@@ -8,6 +8,7 @@
 #ifndef __ASSEMBLY__
 #include
 #include
+#include

 struct task_struct;
 struct kernel_clone_args;
@@ -35,6 +36,9 @@ bool is_shstk_locked(struct task_struct *task);
 bool is_shstk_allocated(struct task_struct *task);
 void set_shstk_lock(struct task_struct *task);
 void set_shstk_status(struct task_struct *task, bool enable);
+unsigned long get_active_shstk(struct task_struct *task);
+int restore_user_shstk(struct task_struct *tsk, unsigned long shstk_ptr);
+int save_user_shstk(struct task_struct *tsk, unsigned long *saved_shstk_ptr);
 bool is_indir_lp_enabled(struct task_struct *task);
 bool is_indir_lp_locked(struct task_struct *task);
 void set_indir_lp_status(struct task_struct *task, bool enable);
@@ -72,6 +76,12 @@ void set_indir_lp_lock(struct task_struct *task);

 #define set_indir_lp_lock(task)

+#define restore_user_shstk(tsk, shstk_ptr) -EINVAL
+
+#define save_user_shstk(tsk, saved_shstk_ptr) -EINVAL
+
+#define get_active_shstk(task) 0
+
 #endif /* CONFIG_RISCV_USER_CFI */

 #endif /* __ASSEMBLY__ */
diff --git a/arch/riscv/include/uapi/asm/ptrace.h b/arch/riscv/include/uapi/asm/ptrace.h
index a38268b19c3d..659ea3af5680 100644
--- a/arch/riscv/include/uapi/asm/ptrace.h
+++ b/arch/riscv/include/uapi/asm/ptrace.h
@@ -127,6 +127,10 @@ struct __riscv_v_regset_state {
  */
 #define RISCV_MAX_VLENB (8192)

+struct __sc_riscv_cfi_state {
+	unsigned long ss_ptr;	/* shadow stack pointer */
+};
+
 #endif /* __ASSEMBLY__ */
 #endif /* _UAPI_ASM_RISCV_PTRACE_H */
diff --git a/arch/riscv/include/uapi/asm/sigcontext.h b/arch/riscv/include/uapi/asm/sigcontext.h
index cd4f175dc837..f37e4beffe03 100644
--- a/arch/riscv/include/uapi/asm/sigcontext.h
+++ b/arch/riscv/include/uapi/asm/sigcontext.h
@@ -10,6 +10,7 @@

 /* The Magic number for signal context frame header. */
 #define RISCV_V_MAGIC		0x53465457
+#define RISCV_ZICFISS_MAGIC	0x9487
 #define END_MAGIC		0x0

 /* The size of END signal context header.
*/ diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 014ac1024b85..77cbc4a01e49 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -22,11 +22,13 @@ #include #include #include +#include unsigned long signal_minsigstksz __ro_after_init; extern u32 __user_rt_sigreturn[2]; static size_t riscv_v_sc_size __ro_after_init; +static size_t riscv_zicfiss_sc_size __ro_after_init; #define DEBUG_SIG 0 @@ -139,6 +141,62 @@ static long __restore_v_state(struct pt_regs *regs, void __user *sc_vec) return copy_from_user(current->thread.vstate.datap, datap, riscv_v_vsize); } +static long save_cfiss_state(struct pt_regs *regs, void __user *sc_cfi) +{ + struct __sc_riscv_cfi_state __user *state = sc_cfi; + unsigned long ss_ptr = 0; + long err = 0; + + if (!IS_ENABLED(CONFIG_RISCV_USER_CFI) || !is_shstk_enabled(current)) + return 0; + + /* +* Save a pointer to shadow stack itself on shadow stack as a form of token. +* A token on shadow gives following properties +* - Safe save and restore for shadow stack switching. Any save of shadow stack +* must have had saved a token on shadow stack. Similarly any restore of shadow +* stack must check the token before restore. Since writing to shadow stack with +* address o
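For reference, the resulting signal-frame layout can be walked generically. Below is a userspace-side sketch of the header walk: the struct and magic values mirror the uapi hunks above (struct __riscv_ctx_hdr is the existing generic header), while the walking helper itself is illustrative.

    #include <stdint.h>

    /* mirrors struct __riscv_ctx_hdr from uapi/asm/sigcontext.h */
    struct riscv_ctx_hdr {
        uint32_t magic;
        uint32_t size;  /* size of the extension body plus this header */
    };

    #define RISCV_ZICFISS_MAGIC 0x9487  /* from the hunk above */
    #define END_MAGIC 0x0

    /* walk the extension headers that follow sigcontext until END_MAGIC */
    static void *find_ext(struct riscv_ctx_hdr *hdr, uint32_t magic)
    {
        while (hdr->magic != END_MAGIC) {
            if (hdr->magic == magic)
                return hdr + 1;  /* extension body follows its header */
            hdr = (struct riscv_ctx_hdr *)((char *)hdr + hdr->size);
        }
        return 0;
    }

find_ext(first_hdr, RISCV_ZICFISS_MAGIC) would then yield the struct __sc_riscv_cfi_state body holding the saved shadow stack pointer.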
[PATCH 10/33] riscv: usercfi state for task and save/restore of CSR_SSP on trap entry/exit
Carve out space in the arch-specific thread struct for CFI status and shadow
stack in usermode on riscv.

This patch does the following:
- defines a new structure cfi_status with a status bit for the cfi feature
- defines shadow stack pointer, base and size in the cfi_status structure
- defines offsets to the new member fields in thread in asm-offsets.c
- saves and restores the shadow stack pointer on trap entry (U --> S) and
  exit (S --> U)

Shadow stack save/restore is gated on feature availability and implemented
using alternatives. The CSR could be context switched in `switch_to` as well,
but as soon as kernel shadow stack support gets rolled in, the shadow stack
pointer will need to be switched at the trap entry/exit point (much like
`sp`). It can be argued that the deployment scenario of the kernel using
shadow stacks may not be as prevalent as user mode using this feature. But
even if there is some minimal deployment of kernel shadow stacks, that means
it needs to be supported. And thus the save/restore of the shadow stack
pointer lives in entry.S instead of in `switch_to.h`.

Signed-off-by: Deepak Gupta
Reviewed-by: Charlie Jenkins
---
 arch/riscv/include/asm/processor.h   |  1 +
 arch/riscv/include/asm/thread_info.h |  3 +++
 arch/riscv/include/asm/usercfi.h     | 24 ++++++++++++++++++++++++
 arch/riscv/kernel/asm-offsets.c      |  4 ++++
 arch/riscv/kernel/entry.S            | 26 ++++++++++++++++++++++++++
 5 files changed, 58 insertions(+)

diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index 9ea0021a1a75..0e05c9682b3c 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -14,6 +14,7 @@

 #include
 #include
+#include

 #define arch_get_mmap_end(addr, len, flags)	\
 ({						\
diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
index e494871071da..ed9e6cbacaa5 100644
--- a/arch/riscv/include/asm/thread_info.h
+++ b/arch/riscv/include/asm/thread_info.h
@@ -58,6 +58,9 @@ struct thread_info {
 	int			cpu;
 	unsigned long		syscall_work;	/* SYSCALL_WORK_ flags */
 	unsigned long		envcfg;
+#ifdef CONFIG_RISCV_USER_CFI
+	struct cfi_status	user_cfi_state;
+#endif
 #ifdef CONFIG_SHADOW_CALL_STACK
 	void			*scs_base;
 	void			*scs_sp;
diff --git a/arch/riscv/include/asm/usercfi.h b/arch/riscv/include/asm/usercfi.h
new file mode 100644
index ..4fa201b4fc4e
--- /dev/null
+++ b/arch/riscv/include/asm/usercfi.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (C) 2024 Rivos, Inc.
+ * Deepak Gupta
+ */
+#ifndef _ASM_RISCV_USERCFI_H
+#define _ASM_RISCV_USERCFI_H
+
+#ifndef __ASSEMBLY__
+#include
+
+#ifdef CONFIG_RISCV_USER_CFI
+struct cfi_status {
+	unsigned long ubcfi_en : 1; /* Enable for backward cfi.
*/ + unsigned long rsvd : ((sizeof(unsigned long)*8) - 1); + unsigned long user_shdw_stk; /* Current user shadow stack pointer */ + unsigned long shdw_stk_base; /* Base address of shadow stack */ + unsigned long shdw_stk_size; /* size of shadow stack */ +}; + +#endif /* CONFIG_RISCV_USER_CFI */ + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_RISCV_USERCFI_H */ diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index e94180ba432f..766bd33f10cb 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -52,6 +52,10 @@ void asm_offsets(void) #endif OFFSET(TASK_TI_CPU_NUM, task_struct, thread_info.cpu); +#ifdef CONFIG_RISCV_USER_CFI + OFFSET(TASK_TI_CFI_STATUS, task_struct, thread_info.user_cfi_state); + OFFSET(TASK_TI_USER_SSP, task_struct, thread_info.user_cfi_state.user_shdw_stk); +#endif OFFSET(TASK_THREAD_F0, task_struct, thread.fstate.f[0]); OFFSET(TASK_THREAD_F1, task_struct, thread.fstate.f[1]); OFFSET(TASK_THREAD_F2, task_struct, thread.fstate.f[2]); diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index c200d329d4bd..8f7f477517e3 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -147,6 +147,20 @@ SYM_CODE_START(handle_exception) REG_L s0, TASK_TI_USER_SP(tp) csrrc s1, CSR_STATUS, t0 + /* +* If previous mode was U, capture shadow stack pointer and save it away +* Zero CSR_SSP at the same time for sanitization. +*/ + ALTERNATIVE("nop; nop; nop; nop", + __stringify(\ + andi s2, s1, SR_SPP;\ + bnez s2, skip_ssp_save; \ + csrrw s2, CSR_SSP, x0; \ + REG_S s2, TASK_TI_USER_SSP(tp); \ + skip_ssp_save:), + 0, + RISCV_ISA_EXT_ZICFISS, + CONF
[PATCH 09/33] riscv: zicfiss / zicfilp extension csr and bit definitions
zicfiss and zicfilp extension gets enabled via b3 and b2 in *envcfg CSR. menvcfg controls enabling for S/HS mode. henvcfg control enabling for VS while senvcfg controls enabling for U/VU mode. zicfilp extension extends *status CSR to hold `expected landing pad` bit. A trap or interrupt can occur between an indirect jmp/call and target instr. `expected landing pad` bit from CPU is recorded into xstatus CSR so that when supervisor performs xret, `expected landing pad` state of CPU can be restored. zicfiss adds one new CSR - CSR_SSP: CSR_SSP contains current shadow stack pointer. Signed-off-by: Deepak Gupta Reviewed-by: Charlie Jenkins --- arch/riscv/include/asm/csr.h | 16 1 file changed, 16 insertions(+) diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 25966995da04..af7ed9bedaee 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -18,6 +18,15 @@ #define SR_MPP _AC(0x1800, UL) /* Previously Machine */ #define SR_SUM _AC(0x0004, UL) /* Supervisor User Memory Access */ +/* zicfilp landing pad status bit */ +#define SR_SPELP _AC(0x0080, UL) +#define SR_MPELP _AC(0x0200, UL) +#ifdef CONFIG_RISCV_M_MODE +#define SR_ELP SR_MPELP +#else +#define SR_ELP SR_SPELP +#endif + #define SR_FS _AC(0x6000, UL) /* Floating-point Status */ #define SR_FS_OFF _AC(0x, UL) #define SR_FS_INITIAL _AC(0x2000, UL) @@ -197,6 +206,8 @@ #define ENVCFG_PBMTE (_AC(1, ULL) << 62) #define ENVCFG_CBZE(_AC(1, UL) << 7) #define ENVCFG_CBCFE (_AC(1, UL) << 6) +#define ENVCFG_LPE (_AC(1, UL) << 2) +#define ENVCFG_SSE (_AC(1, UL) << 3) #define ENVCFG_CBIE_SHIFT 4 #define ENVCFG_CBIE(_AC(0x3, UL) << ENVCFG_CBIE_SHIFT) #define ENVCFG_CBIE_ILL_AC(0x0, UL) @@ -215,6 +226,11 @@ #define SMSTATEEN0_HSENVCFG(_ULL(1) << SMSTATEEN0_HSENVCFG_SHIFT) #define SMSTATEEN0_SSTATEEN0_SHIFT 63 #define SMSTATEEN0_SSTATEEN0 (_ULL(1) << SMSTATEEN0_SSTATEEN0_SHIFT) +/* + * zicfiss user mode csr + * CSR_SSP holds current shadow stack pointer. + */ +#define CSR_SSP 0x011 /* symbolic CSR names: */ #define CSR_CYCLE 0xc00 -- 2.45.0
[PATCH 08/33] riscv: zicfiss / zicfilp enumeration
This patch adds support for detecting zicfiss and zicfilp. zicfiss and
zicfilp stand for the unprivileged integer spec extensions for shadow stack
and branch tracking on indirect branches, respectively.

This patch looks for zicfiss and zicfilp in the device tree and accordingly
lights up the bit in the CPU feature bitmap. Furthermore, this patch adds
detection utility functions to return whether shadow stack or landing pads
are supported by the CPU.

Signed-off-by: Deepak Gupta
---
 arch/riscv/include/asm/cpufeature.h | 13 +++++++++++++
 arch/riscv/include/asm/hwcap.h      |  2 ++
 arch/riscv/include/asm/processor.h  |  1 +
 arch/riscv/kernel/cpufeature.c      |  2 ++
 4 files changed, 18 insertions(+)

diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
index ce9a995730c1..344b8e8cd3e8 100644
--- a/arch/riscv/include/asm/cpufeature.h
+++ b/arch/riscv/include/asm/cpufeature.h
@@ -8,6 +8,7 @@

 #include
 #include
+#include
 #include
 #include
 #include
@@ -180,4 +181,16 @@ static __always_inline bool riscv_cpu_has_extension_unlikely(int cpu, const unsigned long ext)
 	return __riscv_isa_extension_available(hart_isa[cpu].isa, ext);
 }

+static inline bool cpu_supports_shadow_stack(void)
+{
+	return (IS_ENABLED(CONFIG_RISCV_USER_CFI) &&
+		riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICFISS));
+}
+
+static inline bool cpu_supports_indirect_br_lp_instr(void)
+{
+	return (IS_ENABLED(CONFIG_RISCV_USER_CFI) &&
+		riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICFILP));
+}
+
 #endif
diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
index 46d9de54179e..10d315a6ef0e 100644
--- a/arch/riscv/include/asm/hwcap.h
+++ b/arch/riscv/include/asm/hwcap.h
@@ -93,6 +93,8 @@
 #define RISCV_ISA_EXT_ZCMOP		84
 #define RISCV_ISA_EXT_ZAWRS		85
 #define RISCV_ISA_EXT_SVVPTC		86
+#define RISCV_ISA_EXT_ZICFILP		87
+#define RISCV_ISA_EXT_ZICFISS		88

 #define RISCV_ISA_EXT_XLINUXENVCFG	127
diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index efa1b3519b23..9ea0021a1a75 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -13,6 +13,7 @@

 #include
 #include
+#include

 #define arch_get_mmap_end(addr, len, flags)	\
 ({						\
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index 7117366d80db..96a1375d7171 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -317,6 +317,8 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
 							  riscv_ext_zicbom_validate),
 	__RISCV_ISA_EXT_SUPERSET_VALIDATE(zicboz, RISCV_ISA_EXT_ZICBOZ, riscv_xlinuxenvcfg_exts,
 					  riscv_ext_zicboz_validate),
+	__RISCV_ISA_EXT_SUPERSET(zicfilp, RISCV_ISA_EXT_ZICFILP, riscv_xlinuxenvcfg_exts),
+	__RISCV_ISA_EXT_SUPERSET(zicfiss, RISCV_ISA_EXT_ZICFISS, riscv_xlinuxenvcfg_exts),
 	__RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
 	__RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND),
 	__RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),
-- 
2.45.0
[PATCH 12/33] riscv mm: manufacture shadow stack pte
This patch implements creating shadow stack PTEs (on riscv). Creating a
shadow stack PTE on riscv means clearing RWX and then setting W=1.

Signed-off-by: Deepak Gupta
Reviewed-by: Alexandre Ghiti
---
 arch/riscv/include/asm/pgtable.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 4948a1f18ae8..2c6edc8d04a3 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -421,6 +421,11 @@ static inline pte_t pte_mkwrite_novma(pte_t pte)
 	return __pte(pte_val(pte) | _PAGE_WRITE);
 }

+static inline pte_t pte_mkwrite_shstk(pte_t pte)
+{
+	return __pte((pte_val(pte) & ~(_PAGE_LEAF)) | _PAGE_WRITE);
+}
+
 /* static inline pte_t pte_mkexec(pte_t pte) */

 static inline pte_t pte_mkdirty(pte_t pte)
@@ -738,6 +743,11 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
 	return pte_pmd(pte_mkwrite_novma(pmd_pte(pmd)));
 }

+static inline pmd_t pmd_mkwrite_shstk(pmd_t pte)
+{
+	return __pmd((pmd_val(pte) & ~(_PAGE_LEAF)) | _PAGE_WRITE);
+}
+
 static inline pmd_t pmd_wrprotect(pmd_t pmd)
 {
 	return pte_pmd(pte_wrprotect(pmd_pte(pmd)));
-- 
2.45.0
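As a sanity check on the bit arithmetic, here is a tiny userspace model. The bit positions follow the RISC-V privileged spec (V=0, R=1, W=2, X=3), and _PAGE_LEAF mirrors riscv's R|W|X definition; the kernel helpers above do the same computation on real PTE values.

    #include <assert.h>

    #define _PAGE_READ  (1UL << 1)
    #define _PAGE_WRITE (1UL << 2)
    #define _PAGE_EXEC  (1UL << 3)
    #define _PAGE_LEAF  (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)

    int main(void)
    {
        unsigned long pte = _PAGE_READ | _PAGE_WRITE;   /* ordinary RW page, XWR = 011 */

        /* pte_mkwrite_shstk: clear RWX, then set W -> XWR = 010, the shadow stack encoding */
        pte = (pte & ~_PAGE_LEAF) | _PAGE_WRITE;
        assert(pte == _PAGE_WRITE);
        return 0;
    }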
[PATCH 19/33] riscv: Implements arch agnostic shadow stack prctls
Implement architecture agnostic prctls() interface for setting and getting shadow stack status. prctls implemented are PR_GET_SHADOW_STACK_STATUS, PR_SET_SHADOW_STACK_STATUS and PR_LOCK_SHADOW_STACK_STATUS. As part of PR_SET_SHADOW_STACK_STATUS/PR_GET_SHADOW_STACK_STATUS, only PR_SHADOW_STACK_ENABLE is implemented because RISCV allows each mode to write to their own shadow stack using `sspush` or `ssamoswap`. PR_LOCK_SHADOW_STACK_STATUS locks current configuration of shadow stack enabling. Signed-off-by: Deepak Gupta --- arch/riscv/include/asm/usercfi.h | 18 ++- arch/riscv/kernel/process.c | 8 +++ arch/riscv/kernel/usercfi.c | 107 +++ 3 files changed, 132 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/usercfi.h b/arch/riscv/include/asm/usercfi.h index 719e28e043c8..52850a2c79cf 100644 --- a/arch/riscv/include/asm/usercfi.h +++ b/arch/riscv/include/asm/usercfi.h @@ -7,6 +7,7 @@ #ifndef __ASSEMBLY__ #include +#include struct task_struct; struct kernel_clone_args; @@ -14,7 +15,8 @@ struct kernel_clone_args; #ifdef CONFIG_RISCV_USER_CFI struct cfi_status { unsigned long ubcfi_en : 1; /* Enable for backward cfi. */ - unsigned long rsvd : ((sizeof(unsigned long)*8) - 1); + unsigned long ubcfi_locked : 1; + unsigned long rsvd : ((sizeof(unsigned long)*8) - 2); unsigned long user_shdw_stk; /* Current user shadow stack pointer */ unsigned long shdw_stk_base; /* Base address of shadow stack */ unsigned long shdw_stk_size; /* size of shadow stack */ @@ -27,6 +29,12 @@ void set_shstk_base(struct task_struct *task, unsigned long shstk_addr, unsigned unsigned long get_shstk_base(struct task_struct *task, unsigned long *size); void set_active_shstk(struct task_struct *task, unsigned long shstk_addr); bool is_shstk_enabled(struct task_struct *task); +bool is_shstk_locked(struct task_struct *task); +bool is_shstk_allocated(struct task_struct *task); +void set_shstk_lock(struct task_struct *task); +void set_shstk_status(struct task_struct *task, bool enable); + +#define PR_SHADOW_STACK_SUPPORTED_STATUS_MASK (PR_SHADOW_STACK_ENABLE) #else @@ -42,6 +50,14 @@ bool is_shstk_enabled(struct task_struct *task); #define is_shstk_enabled(task) false +#define is_shstk_locked(task) false + +#define is_shstk_allocated(task) false + +#define set_shstk_lock(task) + +#define set_shstk_status(task, enable) + #endif /* CONFIG_RISCV_USER_CFI */ #endif /* __ASSEMBLY__ */ diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index f6f58b1ed905..f7dec532657f 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -152,6 +152,14 @@ void start_thread(struct pt_regs *regs, unsigned long pc, regs->epc = pc; regs->sp = sp; + /* +* clear shadow stack state on exec. +* libc will set it later via prctl. +*/ + set_shstk_status(current, false); + set_shstk_base(current, 0, 0); + set_active_shstk(current, 0); + #ifdef CONFIG_64BIT regs->status &= ~SR_UXL; diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c index 7a7f0b57b2d4..c77abe552c88 100644 --- a/arch/riscv/kernel/usercfi.c +++ b/arch/riscv/kernel/usercfi.c @@ -24,6 +24,16 @@ bool is_shstk_enabled(struct task_struct *task) return task->thread_info.user_cfi_state.ubcfi_en ? true : false; } +bool is_shstk_allocated(struct task_struct *task) +{ + return task->thread_info.user_cfi_state.shdw_stk_base ? true : false; +} + +bool is_shstk_locked(struct task_struct *task) +{ + return task->thread_info.user_cfi_state.ubcfi_locked ? 
true : false; +} + void set_shstk_base(struct task_struct *task, unsigned long shstk_addr, unsigned long size) { task->thread_info.user_cfi_state.shdw_stk_base = shstk_addr; @@ -42,6 +52,23 @@ void set_active_shstk(struct task_struct *task, unsigned long shstk_addr) task->thread_info.user_cfi_state.user_shdw_stk = shstk_addr; } +void set_shstk_status(struct task_struct *task, bool enable) +{ + task->thread_info.user_cfi_state.ubcfi_en = enable ? 1 : 0; + + if (enable) + task->thread_info.envcfg |= ENVCFG_SSE; + else + task->thread_info.envcfg &= ~ENVCFG_SSE; + + csr_write(CSR_ENVCFG, task->thread_info.envcfg); +} + +void set_shstk_lock(struct task_struct *task) +{ + task->thread_info.user_cfi_state.ubcfi_locked = 1; +} + /* * If size is 0, then to be compatible with regular stack we want it to be as big as * regular stack. Else PAGE_ALIGN it and return back @@ -264,3 +291,83 @@ void shstk_release(struct task_struct *tsk) vm_munmap(base, size); set_shstk_base(tsk, 0, 0); } + +int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status) +{ + unsigned long bcfi_status = 0; + + if (!cpu_supports_shadow_stack()) + return
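For illustration, a userspace task (in practice, the dynamic loader, before handing control to instrumented code) could drive these prctls as below. The numeric fallbacks are assumed from the arch-agnostic shadow stack prctl series this builds on; treat them as illustrative.

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_GET_SHADOW_STACK_STATUS
    #define PR_GET_SHADOW_STACK_STATUS  74  /* assumed values from the arch-agnostic series */
    #define PR_SET_SHADOW_STACK_STATUS  75
    #define PR_LOCK_SHADOW_STACK_STATUS 76
    #define PR_SHADOW_STACK_ENABLE      (1UL << 0)
    #endif

    int main(void)
    {
        unsigned long status = 0;

        /* allocates a shadow stack and flips ENVCFG_SSE for this task */
        if (prctl(PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0))
            return 1;   /* -EINVAL when zicfiss or kernel config support is missing */

        prctl(PR_GET_SHADOW_STACK_STATUS, &status, 0, 0, 0);
        printf("shadow stack %sabled\n", (status & PR_SHADOW_STACK_ENABLE) ? "en" : "dis");

        /* lock the configuration so a compromised task can't disable it later */
        return prctl(PR_LOCK_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0);
    }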
[PATCH 20/33] riscv: Implements arch agnostic indirect branch tracking prctls
prctls implemented are: PR_SET_INDIR_BR_LP_STATUS, PR_GET_INDIR_BR_LP_STATUS and PR_LOCK_INDIR_BR_LP_STATUS. On trap entry, ELP state is recorded in sstatus image on stack and SR_ELP in CSR_STATUS is cleared. Signed-off-by: Deepak Gupta --- arch/riscv/include/asm/usercfi.h | 16 - arch/riscv/kernel/entry.S| 2 +- arch/riscv/kernel/process.c | 5 +++ arch/riscv/kernel/usercfi.c | 76 4 files changed, 97 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/usercfi.h b/arch/riscv/include/asm/usercfi.h index 52850a2c79cf..099204d0cd4a 100644 --- a/arch/riscv/include/asm/usercfi.h +++ b/arch/riscv/include/asm/usercfi.h @@ -16,7 +16,9 @@ struct kernel_clone_args; struct cfi_status { unsigned long ubcfi_en : 1; /* Enable for backward cfi. */ unsigned long ubcfi_locked : 1; - unsigned long rsvd : ((sizeof(unsigned long)*8) - 2); + unsigned long ufcfi_en : 1; /* Enable for forward cfi. Note that ELP goes in sstatus */ + unsigned long ufcfi_locked : 1; + unsigned long rsvd : ((sizeof(unsigned long)*8) - 4); unsigned long user_shdw_stk; /* Current user shadow stack pointer */ unsigned long shdw_stk_base; /* Base address of shadow stack */ unsigned long shdw_stk_size; /* size of shadow stack */ @@ -33,6 +35,10 @@ bool is_shstk_locked(struct task_struct *task); bool is_shstk_allocated(struct task_struct *task); void set_shstk_lock(struct task_struct *task); void set_shstk_status(struct task_struct *task, bool enable); +bool is_indir_lp_enabled(struct task_struct *task); +bool is_indir_lp_locked(struct task_struct *task); +void set_indir_lp_status(struct task_struct *task, bool enable); +void set_indir_lp_lock(struct task_struct *task); #define PR_SHADOW_STACK_SUPPORTED_STATUS_MASK (PR_SHADOW_STACK_ENABLE) @@ -58,6 +64,14 @@ void set_shstk_status(struct task_struct *task, bool enable); #define set_shstk_status(task, enable) +#define is_indir_lp_enabled(task) false + +#define is_indir_lp_locked(task) false + +#define set_indir_lp_status(task, enable) + +#define set_indir_lp_lock(task) + #endif /* CONFIG_RISCV_USER_CFI */ #endif /* __ASSEMBLY__ */ diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 8f7f477517e3..a1f258fd7bbc 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -143,7 +143,7 @@ SYM_CODE_START(handle_exception) * Disable the FPU/Vector to detect illegal usage of floating point * or vector in kernel space. */ - li t0, SR_SUM | SR_FS_VS + li t0, SR_SUM | SR_FS_VS | SR_ELP REG_L s0, TASK_TI_USER_SP(tp) csrrc s1, CSR_STATUS, t0 diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index f7dec532657f..5207f018415c 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -159,6 +159,11 @@ void start_thread(struct pt_regs *regs, unsigned long pc, set_shstk_status(current, false); set_shstk_base(current, 0, 0); set_active_shstk(current, 0); + /* +* disable indirect branch tracking on exec. +* libc will enable it later via prctl. +*/ + set_indir_lp_status(current, false); #ifdef CONFIG_64BIT regs->status &= ~SR_UXL; diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c index c77abe552c88..8da509afdbe9 100644 --- a/arch/riscv/kernel/usercfi.c +++ b/arch/riscv/kernel/usercfi.c @@ -69,6 +69,32 @@ void set_shstk_lock(struct task_struct *task) task->thread_info.user_cfi_state.ubcfi_locked = 1; } +bool is_indir_lp_enabled(struct task_struct *task) +{ + return task->thread_info.user_cfi_state.ufcfi_en ? 
true : false; +} + +bool is_indir_lp_locked(struct task_struct *task) +{ + return task->thread_info.user_cfi_state.ufcfi_locked ? true : false; +} + +void set_indir_lp_status(struct task_struct *task, bool enable) +{ + task->thread_info.user_cfi_state.ufcfi_en = enable ? 1 : 0; + + if (enable) + task->thread_info.envcfg |= ENVCFG_LPE; + else + task->thread_info.envcfg &= ~ENVCFG_LPE; + + csr_write(CSR_ENVCFG, task->thread_info.envcfg); +} + +void set_indir_lp_lock(struct task_struct *task) +{ + task->thread_info.user_cfi_state.ufcfi_locked = 1; +} /* * If size is 0, then to be compatible with regular stack we want it to be as big as * regular stack. Else PAGE_ALIGN it and return back @@ -371,3 +397,53 @@ int arch_lock_shadow_stack_status(struct task_struct *task, return 0; } + +int arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status) +{ + unsigned long fcfi_status = 0; + + if (!cpu_supports_indirect_br_lp_instr()) + return -EINVAL; + + /* indirect branch tracking is enabled on the task or not */ + fcfi_status |= (is_indir_lp_enabled(t) ? PR_INDIR_BR_LP_ENABLE : 0); + +
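A task-side sketch of the query-and-pin pattern these prctls enable. It assumes the PR_{GET,LOCK}_INDIR_BR_LP_STATUS and PR_INDIR_BR_LP_ENABLE constants from this series' include/uapi/linux/prctl.h; the exact lock-argument semantics should be taken from the patch above.

    #include <stdio.h>
    #include <sys/prctl.h>
    #include <linux/prctl.h>    /* constants provided by a kernel tree with this series */

    int main(void)
    {
        unsigned long status = 0;

        if (prctl(PR_GET_INDIR_BR_LP_STATUS, &status, 0, 0, 0))
            return 1;   /* -EINVAL when zicfilp is absent */

        if (status & PR_INDIR_BR_LP_ENABLE) {
            /* pin the setting: later PR_SET_INDIR_BR_LP_STATUS calls should now fail */
            prctl(PR_LOCK_INDIR_BR_LP_STATUS, 0, 0, 0, 0);
            puts("indirect branch tracking enabled and locked");
        }
        return 0;
    }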
[PATCH 13/33] riscv mmu: teach pte_mkwrite to manufacture shadow stack PTEs
pte_mkwrite creates PTEs with WRITE encodings for underlying arch. Underlying arch can have two types of writeable mappings. One that can be written using regular store instructions. Another one that can only be written using specialized store instructions (like shadow stack stores). pte_mkwrite can select write PTE encoding based on VMA range (i.e. VM_SHADOW_STACK) Signed-off-by: Deepak Gupta Reviewed-by: Alexandre Ghiti --- arch/riscv/include/asm/pgtable.h | 7 +++ arch/riscv/mm/pgtable.c | 17 + 2 files changed, 24 insertions(+) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 2c6edc8d04a3..7963ab11d924 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -416,6 +416,10 @@ static inline pte_t pte_wrprotect(pte_t pte) /* static inline pte_t pte_mkread(pte_t pte) */ +struct vm_area_struct; +pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma); +#define pte_mkwrite pte_mkwrite + static inline pte_t pte_mkwrite_novma(pte_t pte) { return __pte(pte_val(pte) | _PAGE_WRITE); @@ -738,6 +742,9 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd) return pte_pmd(pte_mkyoung(pmd_pte(pmd))); } +pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); +#define pmd_mkwrite pmd_mkwrite + static inline pmd_t pmd_mkwrite_novma(pmd_t pmd) { return pte_pmd(pte_mkwrite_novma(pmd_pte(pmd))); diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c index 4ae67324f992..be5d38546bb3 100644 --- a/arch/riscv/mm/pgtable.c +++ b/arch/riscv/mm/pgtable.c @@ -155,3 +155,20 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, return pmd; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHADOW_STACK) + return pte_mkwrite_shstk(pte); + + return pte_mkwrite_novma(pte); +} + +pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHADOW_STACK) + return pmd_mkwrite_shstk(pmd); + + return pmd_mkwrite_novma(pmd); +} + -- 2.45.0
[PATCH 14/33] riscv mmu: write protect and shadow stack
`fork` implements copy on write (COW) by making pages readonly in both child
and parent. ptep_set_wrprotect and pte_wrprotect clear _PAGE_WRITE in the
PTE. The assumption is that the page is readable, and on a write fault copy
on write happens.

To implement COW on shadow stack pages, clearing the W bit makes them
XWR = 000. This results in a wrong PTE encoding which says no permissions but
has V=1 and the PFN field pointing to the final page. Instead the desired
behavior is to turn it into a readable page, take an access (load/store)
fault on sspush/sspop (shadow stack), and then perform COW on such pages.
This way regular reads are still allowed and don't lead to COW, maintaining
the current behavior of COW on non-shadow-stack but writeable memory.

On the other hand it doesn't interfere with existing COW for read-write
memory. The assumption is always that _PAGE_READ must have been set, and
thus setting _PAGE_READ is harmless.

Signed-off-by: Deepak Gupta
Reviewed-by: Alexandre Ghiti
---
 arch/riscv/include/asm/pgtable.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 7963ab11d924..fdab7d74437d 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -411,7 +411,7 @@ static inline int pte_devmap(pte_t pte)

 static inline pte_t pte_wrprotect(pte_t pte)
 {
-	return __pte(pte_val(pte) & ~(_PAGE_WRITE));
+	return __pte((pte_val(pte) & ~(_PAGE_WRITE)) | (_PAGE_READ));
 }

 /* static inline pte_t pte_mkread(pte_t pte) */
@@ -612,7 +612,15 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 static inline void ptep_set_wrprotect(struct mm_struct *mm,
 				      unsigned long address, pte_t *ptep)
 {
-	atomic_long_and(~(unsigned long)_PAGE_WRITE, (atomic_long_t *)ptep);
+	pte_t read_pte = READ_ONCE(*ptep);
+	/*
+	 * ptep_set_wrprotect can be called for shadow stack ranges too.
+	 * shadow stack memory is XWR = 010 and thus clearing _PAGE_WRITE will lead to
+	 * encoding 000b which is wrong encoding with V = 1. This should lead to page fault
+	 * but we don't want this wrong configuration to be set in page tables.
+	 */
+	atomic_long_set((atomic_long_t *)ptep,
+			((pte_val(read_pte) & ~(unsigned long)_PAGE_WRITE) | _PAGE_READ));
 }

 #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-- 
2.45.0
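To spell out the encodings involved: a shadow stack PTE is XWR = 010. Naively clearing W would give XWR = 000, which with V=1 is a nonsensical encoding the kernel never wants in page tables. Clearing W while setting R instead gives XWR = 001, an ordinary read-only page: regular loads proceed as before, while an sspush/ssamoswap to that page takes an access fault, on which the kernel can perform the COW.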
[PATCH 27/33] riscv: Add Firmware Feature SBI extensions definitions
From: Clément Léger Add necessary SBI definitions to use the FWFT extension. Signed-off-by: Clément Léger --- arch/riscv/include/asm/sbi.h | 27 +++ 1 file changed, 27 insertions(+) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 98f631b051db..754e5cdabf46 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -34,6 +34,7 @@ enum sbi_ext_id { SBI_EXT_PMU = 0x504D55, SBI_EXT_DBCN = 0x4442434E, SBI_EXT_STA = 0x535441, + SBI_EXT_FWFT = 0x46574654, /* Experimentals extensions must lie within this range */ SBI_EXT_EXPERIMENTAL_START = 0x0800, @@ -281,6 +282,32 @@ struct sbi_sta_struct { #define SBI_SHMEM_DISABLE -1 +/* SBI function IDs for FW feature extension */ +#define SBI_EXT_FWFT_SET 0x0 +#define SBI_EXT_FWFT_GET 0x1 + +enum sbi_fwft_feature_t { + SBI_FWFT_MISALIGNED_EXC_DELEG = 0x0, + SBI_FWFT_LANDING_PAD= 0x1, + SBI_FWFT_SHADOW_STACK = 0x2, + SBI_FWFT_DOUBLE_TRAP= 0x3, + SBI_FWFT_PTE_AD_HW_UPDATING = 0x4, + SBI_FWFT_LOCAL_RESERVED_START = 0x5, + SBI_FWFT_LOCAL_RESERVED_END = 0x3fff, + SBI_FWFT_LOCAL_PLATFORM_START = 0x4000, + SBI_FWFT_LOCAL_PLATFORM_END = 0x7fff, + + SBI_FWFT_GLOBAL_RESERVED_START = 0x8000, + SBI_FWFT_GLOBAL_RESERVED_END= 0xbfff, + SBI_FWFT_GLOBAL_PLATFORM_START = 0xc000, + SBI_FWFT_GLOBAL_PLATFORM_END= 0x, +}; + +#define SBI_FWFT_GLOBAL_FEATURE_BIT(1 << 31) +#define SBI_FWFT_PLATFORM_FEATURE_BIT (1 << 30) + +#define SBI_FWFT_SET_FLAG_LOCK (1 << 0) + /* SBI spec version fields */ #define SBI_SPEC_VERSION_DEFAULT 0x1 #define SBI_SPEC_VERSION_MAJOR_SHIFT 24 -- 2.45.0
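For orientation, a kernel-side caller could use these definitions roughly as below. sbi_ecall() and sbi_err_map_linux_errno() are the existing helpers in arch/riscv/kernel/sbi.c; the wrapper name is made up for illustration, and a later patch in this series performs the equivalent call from head.S.

    #include <asm/sbi.h>

    /* illustrative: ask firmware to allow S-mode shadow stack accesses and lock the setting */
    static int fwft_enable_shadow_stack(void)
    {
        struct sbiret ret;

        ret = sbi_ecall(SBI_EXT_FWFT, SBI_EXT_FWFT_SET,
                        SBI_FWFT_SHADOW_STACK, 1, SBI_FWFT_SET_FLAG_LOCK,
                        0, 0, 0);

        return ret.error ? sbi_err_map_linux_errno(ret.error) : 0;
    }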
[PATCH 16/33] riscv/shstk: If needed allocate a new shadow stack on clone
Userspace specifies CLONE_VM to share address space and spawn new thread. `clone` allow userspace to specify a new stack for new thread. However there is no way to specify new shadow stack base address without changing API. This patch allocates a new shadow stack whenever CLONE_VM is given. In case of CLONE_VFORK, parent is suspended until child finishes and thus can child use parent shadow stack. In case of !CLONE_VM, COW kicks in because entire address space is copied from parent to child. `clone3` is extensible and can provide mechanisms using which shadow stack as an input parameter can be provided. This is not settled yet and being extensively discussed on mailing list. Once that's settled, this commit will adapt to that. Signed-off-by: Deepak Gupta --- arch/riscv/include/asm/usercfi.h | 25 arch/riscv/kernel/process.c | 11 +++- arch/riscv/kernel/usercfi.c | 121 +++ 3 files changed, 156 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/usercfi.h b/arch/riscv/include/asm/usercfi.h index 4fa201b4fc4e..719e28e043c8 100644 --- a/arch/riscv/include/asm/usercfi.h +++ b/arch/riscv/include/asm/usercfi.h @@ -8,6 +8,9 @@ #ifndef __ASSEMBLY__ #include +struct task_struct; +struct kernel_clone_args; + #ifdef CONFIG_RISCV_USER_CFI struct cfi_status { unsigned long ubcfi_en : 1; /* Enable for backward cfi. */ @@ -17,6 +20,28 @@ struct cfi_status { unsigned long shdw_stk_size; /* size of shadow stack */ }; +unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, + const struct kernel_clone_args *args); +void shstk_release(struct task_struct *tsk); +void set_shstk_base(struct task_struct *task, unsigned long shstk_addr, unsigned long size); +unsigned long get_shstk_base(struct task_struct *task, unsigned long *size); +void set_active_shstk(struct task_struct *task, unsigned long shstk_addr); +bool is_shstk_enabled(struct task_struct *task); + +#else + +#define shstk_alloc_thread_stack(tsk, args) 0 + +#define shstk_release(tsk) + +#define get_shstk_base(task, size) 0 + +#define set_shstk_base(task, shstk_addr, size) + +#define set_active_shstk(task, shstk_addr) + +#define is_shstk_enabled(task) false + #endif /* CONFIG_RISCV_USER_CFI */ #endif /* __ASSEMBLY__ */ diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 1f2574fb2edb..f6f58b1ed905 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -28,6 +28,7 @@ #include #include #include +#include #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) #include @@ -203,7 +204,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) void exit_thread(struct task_struct *tsk) { - + shstk_release(tsk); } int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) @@ -211,6 +212,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) unsigned long clone_flags = args->flags; unsigned long usp = args->stack; unsigned long tls = args->tls; + unsigned long ssp = 0; struct pt_regs *childregs = task_pt_regs(p); memset(&p->thread.s, 0, sizeof(p->thread.s)); @@ -225,11 +227,18 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) p->thread.s[0] = (unsigned long)args->fn; p->thread.s[1] = (unsigned long)args->fn_arg; } else { + /* allocate new shadow stack if needed. 
In case of CLONE_VM we have to */ + ssp = shstk_alloc_thread_stack(p, args); + if (IS_ERR_VALUE(ssp)) + return PTR_ERR((void *)ssp); + *childregs = *(current_pt_regs()); /* Turn off status.VS */ riscv_v_vstate_off(childregs); if (usp) /* User fork */ childregs->sp = usp; + /* if needed, set new ssp */ + ssp ? set_active_shstk(p, ssp) : 0; if (clone_flags & CLONE_SETTLS) childregs->tp = tls; childregs->a0 = 0; /* Return value of fork() */ diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c index ce002eabbdc1..7a7f0b57b2d4 100644 --- a/arch/riscv/kernel/usercfi.c +++ b/arch/riscv/kernel/usercfi.c @@ -19,6 +19,41 @@ #define SHSTK_ENTRY_SIZE sizeof(void *) +bool is_shstk_enabled(struct task_struct *task) +{ + return task->thread_info.user_cfi_state.ubcfi_en ? true : false; +} + +void set_shstk_base(struct task_struct *task, unsigned long shstk_addr, unsigned long size) +{ + task->thread_info.user_cfi_state.shdw_stk_base = shstk_addr; + task->thread_info.user_cfi_state.shdw_stk_size = size; +} + +unsigned long get_shstk_base(struct task_struct *task, unsigned long *size) +{ + if (size) +
[PATCH 21/33] riscv/traps: Introduce software check exception
zicfiss / zicfilp introduces a new exception to priv isa `software check exception` with cause code = 18. This patch implements software check exception. Additionally it implements a cfi violation handler which checks for code in xtval. If xtval=2, it means that sw check exception happened because of an indirect branch not landing on 4 byte aligned PC or not landing on `lpad` instruction or label value embedded in `lpad` not matching label value setup in `x7`. If xtval=3, it means that sw check exception happened because of mismatch between link register (x1 or x5) and top of shadow stack (on execution of `sspopchk`). In case of cfi violation, SIGSEGV is raised with code=SEGV_CPERR. SEGV_CPERR was introduced by x86 shadow stack patches. Signed-off-by: Deepak Gupta --- arch/riscv/include/asm/asm-prototypes.h | 1 + arch/riscv/include/asm/entry-common.h | 2 ++ arch/riscv/kernel/entry.S | 3 +++ arch/riscv/kernel/traps.c | 42 + 4 files changed, 48 insertions(+) diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h index cd627ec289f1..5a27cefd7805 100644 --- a/arch/riscv/include/asm/asm-prototypes.h +++ b/arch/riscv/include/asm/asm-prototypes.h @@ -51,6 +51,7 @@ DECLARE_DO_ERROR_INFO(do_trap_ecall_u); DECLARE_DO_ERROR_INFO(do_trap_ecall_s); DECLARE_DO_ERROR_INFO(do_trap_ecall_m); DECLARE_DO_ERROR_INFO(do_trap_break); +DECLARE_DO_ERROR_INFO(do_trap_software_check); asmlinkage void handle_bad_stack(struct pt_regs *regs); asmlinkage void do_page_fault(struct pt_regs *regs); diff --git a/arch/riscv/include/asm/entry-common.h b/arch/riscv/include/asm/entry-common.h index 2293e535f865..4068c7e5452a 100644 --- a/arch/riscv/include/asm/entry-common.h +++ b/arch/riscv/include/asm/entry-common.h @@ -39,4 +39,6 @@ static inline int handle_misaligned_store(struct pt_regs *regs) } #endif +bool handle_user_cfi_violation(struct pt_regs *regs); + #endif /* _ASM_RISCV_ENTRY_COMMON_H */ diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index a1f258fd7bbc..aaef4604d841 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -471,6 +471,9 @@ SYM_DATA_START_LOCAL(excp_vect_table) RISCV_PTR do_page_fault /* load page fault */ RISCV_PTR do_trap_unknown RISCV_PTR do_page_fault /* store page fault */ + RISCV_PTR do_trap_unknown /* cause=16 */ + RISCV_PTR do_trap_unknown /* cause=17 */ + RISCV_PTR do_trap_software_check /* cause=18 is sw check exception */ SYM_DATA_END_LABEL(excp_vect_table, SYM_L_LOCAL, excp_vect_table_end) #ifndef CONFIG_MMU diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 51ebfd23e007..225b1d198ab6 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -354,6 +354,48 @@ void do_trap_ecall_u(struct pt_regs *regs) } +#define CFI_TVAL_FCFI_CODE 2 +#define CFI_TVAL_BCFI_CODE 3 +/* handle cfi violations */ +bool handle_user_cfi_violation(struct pt_regs *regs) +{ + bool ret = false; + unsigned long tval = csr_read(CSR_TVAL); + + if (((tval == CFI_TVAL_FCFI_CODE) && cpu_supports_indirect_br_lp_instr()) || + ((tval == CFI_TVAL_BCFI_CODE) && cpu_supports_shadow_stack())) { + do_trap_error(regs, SIGSEGV, SEGV_CPERR, regs->epc, + "Oops - control flow violation"); + ret = true; + } + + return ret; +} +/* + * software check exception is defined with risc-v cfi spec. Software check + * exception is raised when:- + * a) An indirect branch doesn't land on 4 byte aligned PC or `lpad` + *instruction or `label` value programmed in `lpad` instr doesn't + *match with value setup in `x7`. 
reported code in `xtval` is 2. + * b) `sspopchk` instruction finds a mismatch between top of shadow stack (ssp) + *and x1/x5. reported code in `xtval` is 3. + */ +asmlinkage __visible __trap_section void do_trap_software_check(struct pt_regs *regs) +{ + if (user_mode(regs)) { + irqentry_enter_from_user_mode(regs); + + /* not a cfi violation, then merge into flow of unknown trap handler */ + if (!handle_user_cfi_violation(regs)) + do_trap_unknown(regs); + + irqentry_exit_to_user_mode(regs); + } else { + /* sw check exception coming from kernel is a bug in kernel */ + die(regs, "Kernel BUG"); + } +} + #ifdef CONFIG_MMU asmlinkage __visible noinstr void do_page_fault(struct pt_regs *regs) { -- 2.45.0
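On the receiving side, userspace observes these violations as SIGSEGV with si_code == SEGV_CPERR. A minimal handler sketch follows; the SEGV_CPERR fallback value mirrors the generic siginfo uapi introduced by the x86 shadow stack work.

    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    #ifndef SEGV_CPERR
    #define SEGV_CPERR 10   /* control protection fault, per uapi/asm-generic/siginfo.h */
    #endif

    static void handler(int sig, siginfo_t *info, void *uc)
    {
        if (info->si_code == SEGV_CPERR)
            write(2, "control-flow violation\n", 23);  /* async-signal-safe */
        _exit(1);
    }

    int main(void)
    {
        struct sigaction sa = { .sa_sigaction = handler, .sa_flags = SA_SIGINFO };

        sigaction(SIGSEGV, &sa, NULL);
        /* ... run zicfilp/zicfiss instrumented code; a cfi violation now reports SEGV_CPERR ... */
        return 0;
    }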
[PATCH 31/33] riscv: Documentation for landing pad / indirect branch tracking
Adding documentation on landing pads aka indirect branch tracking on riscv
and the kernel interfaces exposed so that user tasks can enable it.

Signed-off-by: Deepak Gupta
---
 Documentation/arch/riscv/index.rst   |   1 +
 Documentation/arch/riscv/zicfilp.rst | 115 +++++++++++++++++++++++++++
 2 files changed, 116 insertions(+)

diff --git a/Documentation/arch/riscv/index.rst b/Documentation/arch/riscv/index.rst
index eecf347ce849..be7237b69682 100644
--- a/Documentation/arch/riscv/index.rst
+++ b/Documentation/arch/riscv/index.rst
@@ -14,6 +14,7 @@ RISC-V architecture
     uabi
     vector
     cmodx
+    zicfilp

     features
diff --git a/Documentation/arch/riscv/zicfilp.rst b/Documentation/arch/riscv/zicfilp.rst
new file mode 100644
index ..a188d78fcde6
--- /dev/null
+++ b/Documentation/arch/riscv/zicfilp.rst
@@ -0,0 +1,115 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Author: Deepak Gupta
+:Date: 12 January 2024
+
+===================================================
+Tracking indirect control transfers on RISC-V Linux
+===================================================
+
+This document briefly describes the interface provided to userspace by Linux
+to enable indirect branch tracking for user mode applications on RISC-V.
+
+1. Feature Overview
+-------------------
+
+Memory corruption issues usually result in crashes; however, in the hands of
+an adversary, and if used creatively, they can result in a variety of
+security issues.
+
+One of those security issues can be code re-use attacks on a program, where
+an adversary can use corrupt function pointers and chain them together to
+perform jump oriented programming (JOP) or call oriented programming (COP),
+thus compromising the control flow integrity (CFI) of the program.
+
+Function pointers live in read-write memory and thus are susceptible to
+corruption, allowing an adversary to reach any program counter (PC) in the
+address space. On RISC-V the zicfilp extension enforces a restriction on
+such indirect control transfers:
+
+- indirect control transfers must land on a landing pad instruction ``lpad``.
+  There are two exceptions to this rule:
+
+  - rs1 = x1 or rs1 = x5, i.e. a return from a function, and returns are
+    protected using shadow stack (see zicfiss.rst)
+
+  - rs1 = x7. On RISC-V the compiler usually does the below to reach a
+    function which is beyond the offset possible with a J-type instruction::
+
+      auipc x7,
+      jalr (x7)
+
+    Such forms of indirect control transfer are still immutable and don't
+    rely on memory, and thus rs1=x7 is exempted from tracking and considered
+    a software guarded jump.
+
+The ``lpad`` instruction is a pseudo of ``auipc rd,`` with ``rd=x0`` and is
+a HINT nop. The ``lpad`` instruction must be aligned on a 4 byte boundary
+and compares its 20 bit immediate with x7. If ``imm_20bit`` == 0, the CPU
+doesn't perform any comparison with ``x7``. If ``imm_20bit`` != 0, then
+``imm_20bit`` must match ``x7``, else the CPU will raise a
+``software check exception`` (``cause=18``) with ``*tval = 2``.
+
+The compiler can generate a hash over function signatures and set it
+(truncated to 20 bits) in x7 at call sites, and function prologues can have
+``lpad`` with the same function hash. This further reduces the number of
+program counters a call site can reach.
+
+2. ELF and psABI
+----------------
+
+The toolchain sets up :c:macro:`GNU_PROPERTY_RISCV_FEATURE_1_FCFI` for
+property :c:macro:`GNU_PROPERTY_RISCV_FEATURE_1_AND` in the notes section of
+the object file.
+
+3. Linux enabling
+-----------------
+
+User space programs can have multiple shared objects loaded in their address
+space, and it's a difficult task to make sure all the dependencies have been
+compiled with support for indirect branch tracking. Thus it's left to the
+dynamic loader to enable indirect branch tracking for the program.
+
+4. prctl() enabling
+-------------------
+
+:c:macro:`PR_SET_INDIR_BR_LP_STATUS` / :c:macro:`PR_GET_INDIR_BR_LP_STATUS` /
+:c:macro:`PR_LOCK_INDIR_BR_LP_STATUS` are three prctls added to manage
+indirect branch tracking. The prctls are arch agnostic and return -EINVAL on
+other arches.
+
+* prctl(PR_SET_INDIR_BR_LP_STATUS, unsigned long arg)
+
+If arg1 is :c:macro:`PR_INDIR_BR_LP_ENABLE` and the CPU supports ``zicfilp``,
+then the kernel will enable indirect branch tracking for the task. The
+dynamic loader can issue this :c:macro:`prctl` once it has determined that
+all the objects loaded in the address space support indirect branch tracking.
+Additionally, if there is a `dlopen` to an object which wasn't compiled with
+``zicfilp``, the dynamic loader can issue this prctl with arg1 set to 0 (i.e.
+:c:macro:`PR_INDIR_BR_LP_ENABLE` being clear).
+
+* prctl(PR_GET_INDIR_BR_LP_STATUS, unsigned long arg)
+
+Returns the current status of indirect branch tracking. If enabled it'll
+return :c:macro:`PR_INDIR_BR_LP_ENABLE`.
+
+* prctl(PR_LOCK_INDIR_BR_LP_STATUS, unsigned long arg)
+
+Locks the current status of indirect branch tracking on the task. User space
+may want to run with a strict security posture and
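To make section 4 concrete, below is a dynamic-loader-style sketch. It assumes <linux/prctl.h> from a kernel tree with this series applied, which is what provides PR_SET_INDIR_BR_LP_STATUS and PR_INDIR_BR_LP_ENABLE; nothing here is part of a released uapi yet.

    #include <sys/prctl.h>
    #include <linux/prctl.h>    /* constants provided by a kernel tree with this series */

    /* illustrative: what a dynamic loader might do once all DSOs are known good */
    static int enable_indirect_branch_tracking(int all_objects_support_zicfilp)
    {
        unsigned long arg = all_objects_support_zicfilp ? PR_INDIR_BR_LP_ENABLE : 0;

        return prctl(PR_SET_INDIR_BR_LP_STATUS, arg, 0, 0, 0);
    }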
[PATCH 28/33] riscv: enable kernel access to shadow stack memory via FWFT sbi call
The kernel will have to perform shadow stack operations on the user shadow
stack: during signal delivery and sigreturn, a shadow stack token must be
created and validated, respectively. Thus kernel access to shadow stack
memory must be enabled.

In the future, when kernel shadow stacks are enabled for the Linux kernel,
access must be enabled as early as possible for better coverage and to
prevent imbalance between the regular stack and the shadow stack. Right
after `relocate_enable_mmu` has been done is as early as it can be enabled.

Signed-off-by: Deepak Gupta
---
 arch/riscv/kernel/asm-offsets.c |  4 ++++
 arch/riscv/kernel/head.S        | 12 ++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index 766bd33f10cb..a22ab8a41672 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -517,4 +517,8 @@ void asm_offsets(void)
 	DEFINE(FREGS_A6,	offsetof(struct ftrace_regs, a6));
 	DEFINE(FREGS_A7,	offsetof(struct ftrace_regs, a7));
 #endif
+	DEFINE(SBI_EXT_FWFT, SBI_EXT_FWFT);
+	DEFINE(SBI_EXT_FWFT_SET, SBI_EXT_FWFT_SET);
+	DEFINE(SBI_FWFT_SHADOW_STACK, SBI_FWFT_SHADOW_STACK);
+	DEFINE(SBI_FWFT_SET_FLAG_LOCK, SBI_FWFT_SET_FLAG_LOCK);
 }
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index 356d5397b2a2..6244408ca917 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -164,6 +164,12 @@ secondary_start_sbi:
 	call relocate_enable_mmu
 #endif
 	call .Lsetup_trap_vector
+	li a7, SBI_EXT_FWFT
+	li a6, SBI_EXT_FWFT_SET
+	li a0, SBI_FWFT_SHADOW_STACK
+	li a1, 1 /* enable supervisor access to shadow stack */
+	li a2, SBI_FWFT_SET_FLAG_LOCK
+	ecall
 	scs_load_current
 	call smp_callin
 #endif /* CONFIG_SMP */
@@ -320,6 +326,12 @@ SYM_CODE_START(_start_kernel)
 	la tp, init_task
 	la sp, init_thread_union + THREAD_SIZE
 	addi sp, sp, -PT_SIZE_ON_STACK
+	li a7, SBI_EXT_FWFT
+	li a6, SBI_EXT_FWFT_SET
+	li a0, SBI_FWFT_SHADOW_STACK
+	li a1, 1 /* enable supervisor access to shadow stack */
+	li a2, SBI_FWFT_SET_FLAG_LOCK
+	ecall
 	scs_load_current
 #ifdef CONFIG_KASAN
-- 
2.45.0
[PATCH 02/33] mm: helper `is_shadow_stack_vma` to check shadow stack vma
VM_SHADOW_STACK (an alias of VM_HIGH_ARCH_5) is used to encode a shadow
stack VMA on three architectures (x86 shadow stack, arm GCS and RISC-V
shadow stack). On architectures that don't implement shadow stacks, it's
VM_NONE.

Introduce a helper `is_shadow_stack_vma` to determine whether a VMA is a
shadow stack VMA or not.

Signed-off-by: Deepak Gupta
---
 mm/gup.c |  2 +-
 mm/vma.h | 10 +++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index a82890b46a36..8e6e14179f6c 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1282,7 +1282,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
 		    !writable_file_mapping_allowed(vma, gup_flags))
 			return -EFAULT;

-		if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
+		if (!(vm_flags & VM_WRITE) || is_shadow_stack_vma(vm_flags)) {
 			if (!(gup_flags & FOLL_FORCE))
 				return -EFAULT;
 			/* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
diff --git a/mm/vma.h b/mm/vma.h
index 819f994cf727..0f238dc37231 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -357,7 +357,7 @@ static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
 }

 /*
- * These three helpers classifies VMAs for virtual memory accounting.
+ * These four helpers classify VMAs for virtual memory accounting.
  */

 /*
@@ -368,6 +368,11 @@ static inline bool is_exec_mapping(vm_flags_t flags)
 	return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
 }

+static inline bool is_shadow_stack_vma(vm_flags_t vm_flags)
+{
+	return !!(vm_flags & VM_SHADOW_STACK);
+}
+
 /*
  * Stack area (including shadow stacks)
  *
@@ -376,7 +381,7 @@ static inline bool is_exec_mapping(vm_flags_t flags)
  */
 static inline bool is_stack_mapping(vm_flags_t flags)
 {
-	return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK);
+	return ((flags & VM_STACK) == VM_STACK) || is_shadow_stack_vma(flags);
 }

 /*
@@ -387,7 +392,6 @@ static inline bool is_data_mapping(vm_flags_t flags)
 	return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
 }

-
 static inline void vma_iter_config(struct vma_iterator *vmi,
 		unsigned long index, unsigned long last)
 {
-- 
2.45.0
[PATCH 01/33] mm: Introduce ARCH_HAS_USER_SHADOW_STACK
From: Mark Brown Since multiple architectures have support for shadow stacks and we need to select support for this feature in several places in the generic code provide a generic config option that the architectures can select. Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Signed-off-by: Mark Brown Reviewed-by: Rick Edgecombe Reviewed-by: Deepak Gupta Reviewed-by: Carlos Bilbao --- arch/x86/Kconfig | 1 + fs/proc/task_mmu.c | 2 +- include/linux/mm.h | 2 +- mm/Kconfig | 6 ++ 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2852fcd82cbd..8ccae77d40f7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1954,6 +1954,7 @@ config X86_USER_SHADOW_STACK depends on AS_WRUSS depends on X86_64 select ARCH_USES_HIGH_VMA_FLAGS + select ARCH_HAS_USER_SHADOW_STACK select X86_CET help Shadow stack protection is a hardware feature that detects function diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 72f14fd59c2d..23f875e78eae 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -971,7 +971,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR [ilog2(VM_UFFD_MINOR)] = "ui", #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ -#ifdef CONFIG_X86_USER_SHADOW_STACK +#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) diff --git a/include/linux/mm.h b/include/linux/mm.h index ecf63d2b0582..57533b9cae95 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -354,7 +354,7 @@ extern unsigned int kobjsize(const void *objp); #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ -#ifdef CONFIG_X86_USER_SHADOW_STACK +#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK /* * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of * support core mm. diff --git a/mm/Kconfig b/mm/Kconfig index 4c9f5ea13271..4b2a1ef9a161 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1296,6 +1296,12 @@ config NUMA_EMU into virtual nodes when booted with "numa=fake=N", where N is the number of nodes. This is only useful for debugging. +config ARCH_HAS_USER_SHADOW_STACK + bool + help + The architecture has hardware support for userspace shadow call + stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss). + source "mm/damon/Kconfig" endmenu -- 2.45.0
[PATCH 00/33] riscv control-flow integrity for usermode
v5 for cpu assisted riscv user mode control flow integrity. zicfiss and zicfilp [1] are ratified riscv CPU extensions. Changes in this version are - rebased on v6.12-rc1 - Fixed schema related issues in device tree file - Fixed some of the documentation related issues in zicfilp/ss.rst (style issues and added index) - added `SHADOW_STACK_SET_MARKER` so that implementation can define base of shadow stack. - Fixed warnings on definitions added in usercfi.h when CONFIG_RISCV_USER_CFI is not selected. - Adopted context header based signal handling as proposed by Andy Chiu - Added support for enabling kernel mode access to shadow stack using FWFT [4] v4 [3] and v3 [2] are earlier versions of patch series. To get more information on kernel interactions with respect to zicfilp and zicfiss, patch series adds documentation for `zicfilp` and `zicfiss` Documentation/arch/riscv/zicfiss.rst Documentation/arch/riscv/zicfilp.rst How to test this series === Toolchain - $ git clone g...@github.com:sifive/riscv-gnu-toolchain.git -b cfi-dev $ riscv-gnu-toolchain/configure --prefix= --with-arch=rv64gc_zicfilp_zicfiss --enable-linux --disable-gdb --with-extra-multilib-test="rv64gc_zicfilp_zicfiss-lp64d:-static" $ make -j$(nproc) Qemu $ git clone g...@github.com:deepak0414/qemu.git -b zicfilp_zicfiss_ratified_master_july11 $ cd qemu $ mkdir build $ cd build $ ../configure --target-list=riscv64-softmmu $ make -j$(nproc) Opensbi --- $ git clone g...@github.com:deepak0414/opensbi.git -b v6_cfi_spec_split_opensbi $ make CROSS_COMPILE= -j$(nproc) PLATFORM=generic Linux - Running defconfig is fine. CFI is enabled by default if the toolchain supports it. $ make ARCH=riscv CROSS_COMPILE=/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) defconfig $ make ARCH=riscv CROSS_COMPILE=/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) Running --- Modify your qemu command to have: -bios /build/platform/generic/firmware/fw_dynamic.bin -cpu rv64,zicfilp=true,zicfiss=true,zimop=true,zcmop=true vDSO related Opens (in the flux) = I am listing these opens for laying out plan and what to expect in future patch sets. And of course for the sake of discussion. Shadow stack and landing pad enabling in vDSO -- vDSO must have shadow stack and landing pad support compiled in for task to have shadow stack and landing pad support. This patch series doesn't enable that (yet). Enabling shadow stack support in vDSO should be straight forward (intend to do that in next versions of patch set). Enabling landing pad support in vDSO requires some collaboration with toolchain folks to follow a single label scheme for all object binaries. This is necessary to ensure that all indirect call-sites are setting correct label and target landing pads are decorated with same label scheme. How many vDSOs --- Shadow stack instructions are carved out of zimop (may be operations) and if CPU doesn't implement zimop, they're illegal instructions. Kernel could be running on a CPU which may or may not implement zimop. And thus kernel will have to carry 2 different vDSOs and expose the appropriate one depending on whether CPU implements zimop or not. 
[1] - https://github.com/riscv/riscv-cfi
[2] - https://lore.kernel.org/lkml/20240403234054.2020347-1-de...@rivosinc.com/
[3] - https://lore.kernel.org/all/20240912231650.3740732-1-de...@rivosinc.com/
[4] - https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-firmware-features.adoc
---
changelog
---------
v4
--
- rebased on 6.11-rc6
- envcfg: converged with Samuel Holland's patches for envcfg management on a per-thread basis
- vma_is_shadow_stack is renamed to is_vma_shadow_stack
- picked up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` patch
- signal context: using extended context management to maintain compatibility
- fixed `-Wmissing-prototypes` compiler warnings for prctl functions
- documentation fixes and typo corrections

v3
--
envcfg: the logic to pick up the base envcfg had a bug where `ENVCFG_CBZE` could have been picked on a per-task basis even though the CPU didn't implement it. Fixed in this series.
dt-bindings: as suggested, split into a separate commit. Fixed the messaging that the spec is in public review.
arch_is_shadow_stack change: arch_is_shadow_stack changed to vma_is_shadow_stack.
hwprobe: zicfiss / zicfilp, if present, will get enumerated in hwprobe.
selftests: as suggested, added object and binary filenames to .gitignore. The selftest binary needs to be compiled with a CFI-enabled compiler anyway, which will make sure that landing pads and shadow stacks are enabled. Thus removed the separate enable/disable tests. Cleaned up the tests a bit.

v2
--
- Using config `CONFIG_RISCV_USER_CFI`, kernel support for RISC-V control-flow integrity for user-mode programs can be compiled into the kernel.
- Enabling of control-flow integrity for user programs is left to the user runtime.
- This patch
[PATCH 03/33] riscv: Enable cbo.zero only when all harts support Zicboz
From: Samuel Holland Currently, we enable cbo.zero for usermode on each hart that supports the Zicboz extension. This means that the [ms]envcfg CSR value may differ between harts. Other features, such as pointer masking and CFI, require setting [ms]envcfg bits on a per-thread basis. The combination of these two adds quite some complexity and overhead to context switching, as we would need to maintain two separate masks for the per-hart and per-thread bits. Andrew Jones, who originally added Zicboz support, writes[1][2]: I've approached Zicboz the same way I would approach all extensions, which is to be per-hart. I'm not currently aware of a platform that is / will be composed of harts where some have Zicboz and others don't, but there's nothing stopping a platform like that from being built. So, how about we add code that confirms Zicboz is on all harts. If any hart does not have it, then we complain loudly and disable it on all the other harts. If it was just a hardware description bug, then it'll get fixed. If there's actually a platform which doesn't have Zicboz on all harts, then, when the issue is reported, we can decide to not support it, support it with defconfig, or support it under a Kconfig guard which must be enabled by the user. Let's follow his suggested solution and require the extension to be available on all harts, so the envcfg CSR value does not need to change when a thread migrates between harts. Since we are doing this for all extensions with fields in envcfg, the CSR itself only needs to be saved/ restored when it is present on all harts. This should not be a regression as no known hardware has asymmetric Zicboz support, but if anyone reports seeing the warning, we will re-evaluate our solution. Link: https://lore.kernel.org/linux-riscv/20240322-168f191eeb8479b2ea169a5e@orel/ [1] Link: https://lore.kernel.org/linux-riscv/20240323-28943722feb57a41fb0ff488@orel/ [2] Reviewed-by: Andrew Jones Reviewed-by: Conor Dooley Reviewed-by: Deepak Gupta Signed-off-by: Samuel Holland --- arch/riscv/kernel/cpufeature.c | 7 ++- arch/riscv/kernel/suspend.c| 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 3a8eeaa9310c..e560a253e99b 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -28,6 +28,8 @@ #define NUM_ALPHA_EXTS ('z' - 'a' + 1) +static bool any_cpu_has_zicboz; + unsigned long elf_hwcap __read_mostly; /* Host ISA bitmap */ @@ -98,6 +100,7 @@ static int riscv_ext_zicboz_validate(const struct riscv_isa_ext_data *data, pr_err("Zicboz disabled as cboz-block-size present, but is not a power-of-2\n"); return -EINVAL; } + any_cpu_has_zicboz = true; return 0; } @@ -919,8 +922,10 @@ unsigned long riscv_get_elf_hwcap(void) void riscv_user_isa_enable(void) { - if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICBOZ)) + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICBOZ)) csr_set(CSR_ENVCFG, ENVCFG_CBZE); + else if (any_cpu_has_zicboz) + pr_warn_once("Zicboz disabled as it is unavailable on some harts\n"); } #ifdef CONFIG_RISCV_ALTERNATIVE diff --git a/arch/riscv/kernel/suspend.c b/arch/riscv/kernel/suspend.c index c8cec0cc5833..9a8a0dc035b2 100644 --- a/arch/riscv/kernel/suspend.c +++ b/arch/riscv/kernel/suspend.c @@ -14,7 +14,7 @@ void suspend_save_csrs(struct suspend_context *context) { - if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_XLINUXENVCFG)) + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_XLINUXENVCFG)) 
context->envcfg = csr_read(CSR_ENVCFG); context->tvec = csr_read(CSR_TVEC); context->ie = csr_read(CSR_IE); @@ -37,7 +37,7 @@ void suspend_save_csrs(struct suspend_context *context) void suspend_restore_csrs(struct suspend_context *context) { csr_write(CSR_SCRATCH, 0); - if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_XLINUXENVCFG)) + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_XLINUXENVCFG)) csr_write(CSR_ENVCFG, context->envcfg); csr_write(CSR_TVEC, context->tvec); csr_write(CSR_IE, context->ie); -- 2.45.0
[PATCH 06/33] riscv/Kconfig: enable HAVE_EXIT_THREAD for riscv
riscv will need an implementation of exit_thread() to clean up the shadow stack when a thread exits. If the current thread has shadow stack enabled, a shadow stack is allocated by default for any new thread. Signed-off-by: Deepak Gupta Reviewed-by: Charlie Jenkins ---
arch/riscv/Kconfig | 1 +
arch/riscv/kernel/process.c | 5 +
2 files changed, 6 insertions(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 22dc5ea4196c..808ea66b9537 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -182,6 +182,7 @@ config RISCV
 	select HAVE_SAMPLE_FTRACE_DIRECT_MULTI
 	select HAVE_STACKPROTECTOR
 	select HAVE_SYSCALL_TRACEPOINTS
+	select HAVE_EXIT_THREAD
 	select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU
 	select IRQ_DOMAIN
 	select IRQ_FORCED_THREADING

diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index e3142d8a6e28..1f2574fb2edb 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -201,6 +201,11 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 	return 0;
 }

+void exit_thread(struct task_struct *tsk)
+{
+
+}
+
 int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 {
 	unsigned long clone_flags = args->flags;
-- 2.45.0
[PATCH 07/33] riscv: zicfilp / zicfiss in dt-bindings (extensions.yaml)
Make an entry for cfi extensions in extensions.yaml. Signed-off-by: Deepak Gupta --- Documentation/devicetree/bindings/riscv/extensions.yaml | 14 ++ 1 file changed, 14 insertions(+) diff --git a/Documentation/devicetree/bindings/riscv/extensions.yaml b/Documentation/devicetree/bindings/riscv/extensions.yaml index 2cf2026cff57..356c60fd6cc8 100644 --- a/Documentation/devicetree/bindings/riscv/extensions.yaml +++ b/Documentation/devicetree/bindings/riscv/extensions.yaml @@ -368,6 +368,20 @@ properties: The standard Zicboz extension for cache-block zeroing as ratified in commit 3dd606f ("Create cmobase-v1.0.pdf") of riscv-CMOs. +- const: zicfilp + description: | +The standard Zicfilp extension for enforcing forward edge +control-flow integrity as ratified in commit 3f8e450 ("merge +pull request #227 from ved-rivos/0709") of riscv-cfi +github repo. + +- const: zicfiss + description: | +The standard Zicfiss extension for enforcing backward edge +control-flow integrity as ratified in commit 3f8e450 ("merge +pull request #227 from ved-rivos/0709") of riscv-cfi +github repo. + - const: zicntr description: The standard Zicntr extension for base counters and timers, as -- 2.45.0
[PATCH 17/33] prctl: arch-agnostic prctl for shadow stack
From: Mark Brown Three architectures (x86, aarch64, riscv) have announced support for shadow stacks with fairly similar functionality. While x86 is using arch_prctl() to control the functionality neither arm64 nor riscv uses that interface so this patch adds arch-agnostic prctl() support to get and set status of shadow stacks and lock the current configuration to prevent further changes, with support for turning on and off individual subfeatures so applications can limit their exposure to features that they do not need. The features are: - PR_SHADOW_STACK_ENABLE: Tracking and enforcement of shadow stacks, including allocation of a shadow stack if one is not already allocated. - PR_SHADOW_STACK_WRITE: Writes to specific addresses in the shadow stack. - PR_SHADOW_STACK_PUSH: Push additional values onto the shadow stack. - PR_SHADOW_STACK_DISABLE: Allow to disable shadow stack. Note once locked, disable must fail. These features are expected to be inherited by new threads and cleared on exec(), unknown features should be rejected for enable but accepted for locking (in order to allow for future proofing). This is based on a patch originally written by Deepak Gupta but later modified by Mark Brown for arm's GCS patch series. Signed-off-by: Mark Brown Co-developed-by: Deepak Gupta --- include/linux/mm.h | 3 +++ include/uapi/linux/prctl.h | 21 + kernel/sys.c | 30 ++ 3 files changed, 54 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 57533b9cae95..54e2b3f1cc30 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4146,6 +4146,9 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn) { return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE); } +int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); +int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); +int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); void vma_pgtable_walk_begin(struct vm_area_struct *vma); void vma_pgtable_walk_end(struct vm_area_struct *vma); diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 35791791a879..b8d7b6361754 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -327,5 +327,26 @@ struct prctl_mm_map { # define PR_PPC_DEXCR_CTRL_SET_ONEXEC 0x8 /* Set the aspect on exec */ # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC0x10 /* Clear the aspect on exec */ # define PR_PPC_DEXCR_CTRL_MASK0x1f +/* + * Get the current shadow stack configuration for the current thread, + * this will be the value configured via PR_SET_SHADOW_STACK_STATUS. + */ +#define PR_GET_SHADOW_STACK_STATUS 74 + +/* + * Set the current shadow stack configuration. Enabling the shadow + * stack will cause a shadow stack to be allocated for the thread. + */ +#define PR_SET_SHADOW_STACK_STATUS 75 +# define PR_SHADOW_STACK_ENABLE (1UL << 0) +# define PR_SHADOW_STACK_WRITE (1UL << 1) +# define PR_SHADOW_STACK_PUSH (1UL << 2) + +/* + * Prevent further changes to the specified shadow stack + * configuration. All bits may be locked via this call, including + * undefined bits. 
+ */ +#define PR_LOCK_SHADOW_STACK_STATUS 76 #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 4da31f28fda8..3d38a9c7c5c9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2324,6 +2324,21 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; } +int __weak arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status) +{ + return -EINVAL; +} + +int __weak arch_set_shadow_stack_status(struct task_struct *t, unsigned long status) +{ + return -EINVAL; +} + +int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status) +{ + return -EINVAL; +} + #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) #ifdef CONFIG_ANON_VMA_NAME @@ -2784,6 +2799,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_RISCV_SET_ICACHE_FLUSH_CTX: error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); break; + case PR_GET_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_get_shadow_stack_status(me, (unsigned long __user *) arg2); + break; + case PR_SET_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_set_shadow_stack_status(me, arg2); + break; + case PR_LOCK_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_lock_shadow_stack_status(me, arg2); +
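For illustration, a thread would drive this interface roughly as below. This is a minimal userspace sketch, not part of the patch; the PR_* values are taken from the uapi additions above and defined locally in case libc headers do not carry them yet.

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_GET_SHADOW_STACK_STATUS
#define PR_GET_SHADOW_STACK_STATUS	74
#define PR_SET_SHADOW_STACK_STATUS	75
#define PR_LOCK_SHADOW_STACK_STATUS	76
#define PR_SHADOW_STACK_ENABLE		(1UL << 0)
#endif

int main(void)
{
	unsigned long status = 0;

	/* Enable shadow stack tracking; allocates a shadow stack if needed. */
	if (prctl(PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0))
		perror("enable");

	/* Read back the configured flags (arg2 is a user pointer here). */
	if (prctl(PR_GET_SHADOW_STACK_STATUS, &status, 0, 0, 0))
		perror("get");
	printf("shadow stack status: %#lx\n", status);

	/* Lock ENABLE so it can no longer be changed; disabling must now fail. */
	if (prctl(PR_LOCK_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0))
		perror("lock");

	return 0;
}

Note that all unused arguments must be zero: the prctl() cases above return -EINVAL if arg3, arg4 or arg5 are set.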
[PATCH 18/33] prctl: arch-agnostic prctl for indirect branch tracking
Three architectures (x86, aarch64, riscv) support the indirect branch tracking feature in a very similar fashion. At a very high level, indirect branch tracking is a CPU feature where the CPU tracks branches which use a memory operand to perform control transfer in a program. As part of this tracking, on an indirect branch the CPU enters a state where it expects a landing pad instruction at the branch target, and if one is not found the CPU raises a fault (architecture dependent).

x86 landing pad instr - `ENDBRANCH`
aarch64 landing pad instr - `BTI`
riscv landing pad instr - `lpad`

Given that three major arches support indirect branch tracking, this patch makes the `prctl` for indirect branch tracking arch-agnostic. To allow userspace to enable this feature for itself, the following prctls are defined:
- PR_GET_INDIR_BR_LP_STATUS: Gets the currently configured status for indirect branch tracking.
- PR_SET_INDIR_BR_LP_STATUS: Sets a configuration for indirect branch tracking. The following status options are allowed:
  - PR_INDIR_BR_LP_ENABLE: Enables indirect branch tracking on the user thread.
  - PR_INDIR_BR_LP_DISABLE: Disables indirect branch tracking on the user thread.
- PR_LOCK_INDIR_BR_LP_STATUS: Locks the configured status for indirect branch tracking for the user thread.

Signed-off-by: Deepak Gupta
---
include/linux/cpu.h | 4
include/uapi/linux/prctl.h | 27 +++
kernel/sys.c | 30 ++
3 files changed, 61 insertions(+)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index bdcec1732445..eff56aae05d7 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -203,4 +203,8 @@ static inline bool cpu_mitigations_auto_nosmt(void)
 }
 #endif

+int arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status);
+int arch_set_indir_br_lp_status(struct task_struct *t, unsigned long status);
+int arch_lock_indir_br_lp_status(struct task_struct *t, unsigned long status);
+
 #endif /* _LINUX_CPU_H_ */

diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index b8d7b6361754..41ffb53490a4 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -349,4 +349,31 @@ struct prctl_mm_map {
  */
 #define PR_LOCK_SHADOW_STACK_STATUS 76

+/*
+ * Get the current indirect branch tracking configuration for the current
+ * thread, this will be the value configured via PR_SET_INDIR_BR_LP_STATUS.
+ */
+#define PR_GET_INDIR_BR_LP_STATUS 77
+
+/*
+ * Set the indirect branch tracking configuration. PR_INDIR_BR_LP_ENABLE will
+ * enable the cpu feature for the user thread, to track all indirect branches
+ * and ensure they land on an arch-defined landing pad instruction.
+ * x86 - If enabled, an indirect branch must land on an `ENDBRANCH` instruction.
+ * aarch64 - If enabled, an indirect branch must land on a `BTI` instruction.
+ * riscv - If enabled, an indirect branch must land on an `lpad` instruction.
+ * PR_INDIR_BR_LP_DISABLE will disable the feature for the user thread, and
+ * indirect branches will no longer be tracked by the cpu to land on an
+ * arch-defined landing pad instruction.
+ */
+#define PR_SET_INDIR_BR_LP_STATUS 78
+# define PR_INDIR_BR_LP_ENABLE (1UL << 0)
+
+/*
+ * Prevent further changes to the specified indirect branch tracking
+ * configuration. All bits may be locked via this call, including
+ * undefined bits.
+ */
+#define PR_LOCK_INDIR_BR_LP_STATUS 79
+
 #endif /* _LINUX_PRCTL_H */

diff --git a/kernel/sys.c b/kernel/sys.c
index 3d38a9c7c5c9..dafa31485584 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2339,6 +2339,21 @@ int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long st
 	return -EINVAL;
 }

+int __weak arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status)
+{
+	return -EINVAL;
+}
+
+int __weak arch_set_indir_br_lp_status(struct task_struct *t, unsigned long status)
+{
+	return -EINVAL;
+}
+
+int __weak arch_lock_indir_br_lp_status(struct task_struct *t, unsigned long status)
+{
+	return -EINVAL;
+}
+
 #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)

 #ifdef CONFIG_ANON_VMA_NAME
@@ -2814,6 +2829,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			return -EINVAL;
 		error = arch_lock_shadow_stack_status(me, arg2);
 		break;
+	case PR_GET_INDIR_BR_LP_STATUS:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = arch_get_indir_br_lp_status(me, (unsigned long __user *) arg2);
+		break;
+	case PR_SET_INDIR_BR_LP_STATUS:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = arch_set_indir_br_lp_status(me, arg2);
+		break;
+	case PR_LOCK_INDIR_BR_LP_STATUS:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+
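The new prctls mirror the shadow stack ones. A minimal userspace sketch (not part of the patch; PR_* values come from the uapi additions above and are defined locally in case libc headers lack them):

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_GET_INDIR_BR_LP_STATUS
#define PR_GET_INDIR_BR_LP_STATUS	77
#define PR_SET_INDIR_BR_LP_STATUS	78
#define PR_LOCK_INDIR_BR_LP_STATUS	79
#define PR_INDIR_BR_LP_ENABLE		(1UL << 0)
#endif

int main(void)
{
	unsigned long status = 0;

	/* From here on, indirect branches must land on a landing pad. */
	if (prctl(PR_SET_INDIR_BR_LP_STATUS, PR_INDIR_BR_LP_ENABLE, 0, 0, 0))
		perror("enable");

	if (prctl(PR_GET_INDIR_BR_LP_STATUS, &status, 0, 0, 0))
		perror("get");
	printf("indirect branch tracking status: %#lx\n", status);

	/* Prevent the configuration from being changed again. */
	if (prctl(PR_LOCK_INDIR_BR_LP_STATUS, PR_INDIR_BR_LP_ENABLE, 0, 0, 0))
		perror("lock");

	return 0;
}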
[PATCH 11/33] riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE
`arch_calc_vm_prot_bits` is implemented on risc-v to return VM_READ | VM_WRITE if PROT_WRITE is specified. Similarly `riscv_sys_mmap` is updated to convert all incoming PROT_WRITE to (PROT_WRITE | PROT_READ). This is to make sure that any existing apps using PROT_WRITE still work. Earlier `protection_map[VM_WRITE]` used to pick read-write PTE encodings. Now `protection_map[VM_WRITE]` will always pick PAGE_SHADOWSTACK PTE encodings for shadow stack. The above changes ensure that existing apps continue to work because, underneath, the kernel will be picking `protection_map[VM_WRITE|VM_READ]` PTE encodings.

Signed-off-by: Deepak Gupta
---
arch/riscv/include/asm/mman.h | 24
arch/riscv/include/asm/pgtable.h | 1 +
arch/riscv/kernel/sys_riscv.c | 10 ++
arch/riscv/mm/init.c | 2 +-
mm/mmap.c | 1 +
5 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/include/asm/mman.h b/arch/riscv/include/asm/mman.h
new file mode 100644
index 000000000000..ef9fedf32546
--- /dev/null
+++ b/arch/riscv/include/asm/mman.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_MMAN_H__
+#define __ASM_MMAN_H__
+
+#include
+#include
+#include
+
+static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
+						   unsigned long pkey __always_unused)
+{
+	unsigned long ret = 0;
+
+	/*
+	 * If PROT_WRITE was specified, force it to VM_READ | VM_WRITE.
+	 * Only VM_WRITE means shadow stack.
+	 */
+	if (prot & PROT_WRITE)
+		ret = (VM_READ | VM_WRITE);
+	return ret;
+}
+#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
+
+#endif /* ! __ASM_MMAN_H__ */

diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index e79f15293492..4948a1f18ae8 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -177,6 +177,7 @@ extern struct pt_alloc_ops pt_ops __meminitdata;
 #define PAGE_READ_EXEC		__pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC)
 #define PAGE_WRITE_EXEC		__pgprot(_PAGE_BASE | _PAGE_READ | \
 					 _PAGE_EXEC | _PAGE_WRITE)
+#define PAGE_SHADOWSTACK	__pgprot(_PAGE_BASE | _PAGE_WRITE)

 #define PAGE_COPY		PAGE_READ
 #define PAGE_COPY_EXEC		PAGE_READ_EXEC

diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c
index d77afe05578f..43a448bf254b 100644
--- a/arch/riscv/kernel/sys_riscv.c
+++ b/arch/riscv/kernel/sys_riscv.c
@@ -7,6 +7,7 @@

 #include
 #include
+#include

 static long riscv_sys_mmap(unsigned long addr, unsigned long len,
 			   unsigned long prot, unsigned long flags,
@@ -16,6 +17,15 @@ static long riscv_sys_mmap(unsigned long addr, unsigned long len,
 	if (unlikely(offset & (~PAGE_MASK >> page_shift_offset)))
 		return -EINVAL;

+	/*
+	 * If PROT_WRITE is specified then extend that to PROT_READ.
+	 * protection_map[VM_WRITE] is now going to select shadow stack encodings,
+	 * so specifying PROT_WRITE actually should select protection_map[VM_WRITE | VM_READ].
+	 * If a user wants to create a shadow stack, they should use the
+	 * `map_shadow_stack` syscall.
+	 */
+	if (unlikely((prot & PROT_WRITE) && !(prot & PROT_READ)))
+		prot |= PROT_READ;
+
 	return ksys_mmap_pgoff(addr, len, prot, flags, fd,
 			       offset >> (PAGE_SHIFT - page_shift_offset));
 }

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 0e8c20adcd98..964810aeb405 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -326,7 +326,7 @@ pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
 static const pgprot_t protection_map[16] = {
 	[VM_NONE]		= PAGE_NONE,
 	[VM_READ]		= PAGE_READ,
-	[VM_WRITE]		= PAGE_COPY,
+	[VM_WRITE]		= PAGE_SHADOWSTACK,
 	[VM_WRITE | VM_READ]	= PAGE_COPY,
 	[VM_EXEC]		= PAGE_EXEC,
 	[VM_EXEC | VM_READ]	= PAGE_READ_EXEC,

diff --git a/mm/mmap.c b/mm/mmap.c
index dd4b35a25aeb..b56f1e8cbfc6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -47,6 +47,7 @@
 #include
 #include
 #include
+#include

 #include
 #include
-- 2.45.0
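To illustrate the user-visible semantics after this change (a minimal sketch, not part of the patch): PROT_WRITE alone still yields readable and writable memory, while shadow stack memory is only created via the map_shadow_stack() syscall.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	/* PROT_WRITE is silently widened to PROT_READ | PROT_WRITE. */
	char *p = mmap(NULL, 4096, PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	strcpy(p, "hello");	/* writable, as before */
	printf("%s\n", p);	/* and readable, despite asking for PROT_WRITE only */

	munmap(p, 4096);
	return 0;
}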
[PATCH 33/33] kselftest/riscv: kselftest for user mode cfi
Adds kselftests for the RISC-V control-flow integrity implementation for user mode. There is not a lot going on in the kernel for enabling landing pads for user mode. The CFI selftests are intended to be compiled with a zicfilp- and zicfiss-enabled compiler, so the kselftest simply checks whether landing pad and shadow stack are enabled for the binary and the process. The selftest then registers a signal handler for SIGSEGV. Any control-flow violation is reported as SIGSEGV with si_code = SEGV_CPERR. The test will fail on receiving any SEGV_CPERR.

The shadow stack part has more changes in the kernel, and thus there are separate tests for it:
- exercise the `map_shadow_stack` syscall
- `fork` test to make sure COW works for shadow stack pages
- gup tests. As of today the kernel uses FOLL_FORCE when access happens to memory via /proc/<pid>/mem. Make sure that is not broken for shadow stack.
- signal test. Make sure signal delivery results in token creation on the shadow stack and consumes (and verifies) the token on sigreturn.
- shadow stack protection test. Attempts to write to shadow stack memory using regular store instructions must result in access faults.

Signed-off-by: Deepak Gupta
---
tools/testing/selftests/riscv/Makefile | 2 +-
tools/testing/selftests/riscv/cfi/.gitignore | 3 +
tools/testing/selftests/riscv/cfi/Makefile | 10 +
tools/testing/selftests/riscv/cfi/cfi_rv_test.h | 83 +
tools/testing/selftests/riscv/cfi/riscv_cfi_test.c | 82 +
tools/testing/selftests/riscv/cfi/shadowstack.c | 362 +
tools/testing/selftests/riscv/cfi/shadowstack.h | 37 +++
7 files changed, 578 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/riscv/Makefile b/tools/testing/selftests/riscv/Makefile
index 7ce03d832b64..6e142fe004ab 100644
--- a/tools/testing/selftests/riscv/Makefile
+++ b/tools/testing/selftests/riscv/Makefile
@@ -5,7 +5,7 @@
 ARCH ?= $(shell uname -m 2>/dev/null || echo not)

 ifneq (,$(filter $(ARCH),riscv))
-RISCV_SUBTARGETS ?= hwprobe vector mm sigreturn
+RISCV_SUBTARGETS ?= hwprobe vector mm sigreturn cfi
 else
 RISCV_SUBTARGETS :=
 endif

diff --git a/tools/testing/selftests/riscv/cfi/.gitignore b/tools/testing/selftests/riscv/cfi/.gitignore
new file mode 100644
index 000000000000..ce7623f9da28
--- /dev/null
+++ b/tools/testing/selftests/riscv/cfi/.gitignore
@@ -0,0 +1,3 @@
+cfitests
+riscv_cfi_test
+shadowstack
\ No newline at end of file

diff --git a/tools/testing/selftests/riscv/cfi/Makefile b/tools/testing/selftests/riscv/cfi/Makefile
new file mode 100644
index 000000000000..b65f7ff38a32
--- /dev/null
+++ b/tools/testing/selftests/riscv/cfi/Makefile
@@ -0,0 +1,10 @@
+CFLAGS += -I$(top_srcdir)/tools/include
+
+CFLAGS += -march=rv64gc_zicfilp_zicfiss
+
+TEST_GEN_PROGS := cfitests
+
+include ../../lib.mk
+
+$(OUTPUT)/cfitests: riscv_cfi_test.c shadowstack.c
+	$(CC) -o$@ $(CFLAGS) $(LDFLAGS) $^

diff --git a/tools/testing/selftests/riscv/cfi/cfi_rv_test.h b/tools/testing/selftests/riscv/cfi/cfi_rv_test.h
new file mode 100644
index 000000000000..fa1cf7183672
--- /dev/null
+++ b/tools/testing/selftests/riscv/cfi/cfi_rv_test.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef SELFTEST_RISCV_CFI_H
+#define SELFTEST_RISCV_CFI_H
+#include
+#include
+#include "shadowstack.h"
+
+#define RISCV_CFI_SELFTEST_COUNT RISCV_SHADOW_STACK_TESTS
+
+#define CHILD_EXIT_CODE_SSWRITE		10
+#define CHILD_EXIT_CODE_SIG_TEST	11
+
+#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \
+({ \
+	register long _num __asm__ ("a7") = (num); \
+	register long _arg1 __asm__ ("a0") = (long)(arg1); \
+	register long _arg2 __asm__ ("a1") = (long)(arg2); \
+	register long _arg3 __asm__ ("a2") = (long)(arg3); \
+	register long _arg4 __asm__ ("a3") = (long)(arg4); \
+	register long _arg5 __asm__ ("a4") = (long)(arg5); \
+	\
+	__asm__ volatile ( \
+		"ecall\n" \
+		: "+r"(_arg1) \
+		: "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \
+		  "r"(_num) \
+		: "memory", "cc" \
+	);
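For reference, the raw syscall wrapper above would be used along these lines to create a shadow stack mapping from within the selftest. This is a hedged sketch: the `__NR_map_shadow_stack` macro and the `SHADOW_STACK_SET_TOKEN` flag are assumptions based on the rest of this series, and `ksft_exit_fail_msg()` comes from the kselftest harness headers.

	unsigned long shstk;

	/* Hypothetical usage: allocate a 4 KiB shadow stack with a token. */
	shstk = my_syscall5(__NR_map_shadow_stack, 0, 4096,
			    SHADOW_STACK_SET_TOKEN, 0, 0);
	if ((long)shstk < 0)
		ksft_exit_fail_msg("map_shadow_stack failed\n");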
[PATCH 32/33] riscv: Documentation for shadow stack on riscv
Adding documentation on shadow stack for user mode on riscv and the kernel interfaces exposed so that user tasks can enable it.

Signed-off-by: Deepak Gupta
---
Documentation/arch/riscv/index.rst | 1 +
Documentation/arch/riscv/zicfiss.rst | 176 +++
2 files changed, 177 insertions(+)

diff --git a/Documentation/arch/riscv/index.rst b/Documentation/arch/riscv/index.rst
index be7237b69682..e240eb0ceb70 100644
--- a/Documentation/arch/riscv/index.rst
+++ b/Documentation/arch/riscv/index.rst
@@ -15,6 +15,7 @@ RISC-V architecture
     vector
     cmodx
     zicfilp
+    zicfiss

     features

diff --git a/Documentation/arch/riscv/zicfiss.rst b/Documentation/arch/riscv/zicfiss.rst
new file mode 100644
index 000000000000..5ba389f15b3f
--- /dev/null
+++ b/Documentation/arch/riscv/zicfiss.rst
@@ -0,0 +1,176 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Author: Deepak Gupta
+:Date: 12 January 2024
+
+=========================================================
+Shadow stack to protect function returns on RISC-V Linux
+=========================================================
+
+This document briefly describes the interface provided to userspace by Linux
+to enable shadow stack for user mode applications on RISC-V.
+
+1. Feature Overview
+-------------------
+
+Memory corruption issues usually result in crashes; however, in the hands of
+an adversary, and if used creatively, they can result in a variety of
+security issues.
+
+One of those security issues is code re-use attacks on a program, where an
+adversary can use corrupted return addresses present on the stack and chain
+them together to perform return oriented programming (ROP), thus compromising
+the control flow integrity (CFI) of the program.
+
+Return addresses live on the stack, and thus in read-write memory, and are
+therefore susceptible to corruption, allowing an adversary to reach any
+program counter (PC) in the address space. On RISC-V, the ``zicfiss``
+extension provides an alternate stack, termed the shadow stack, on which
+return addresses can be safely placed in the prologue of a function and
+retrieved in the epilogue. The ``zicfiss`` extension makes the following
+changes:
+
+- PTE encodings for shadow stack virtual memory
+  An earlier reserved encoding in first stage translation i.e.
+  PTE.R=0, PTE.W=1, PTE.X=0 becomes the PTE encoding for shadow stack pages.
+
+- The ``sspush x1/x5`` instruction pushes (stores) ``x1/x5`` to the shadow
+  stack.
+
+- The ``sspopchk x1/x5`` instruction pops (loads) from the shadow stack and
+  compares with ``x1/x5``; if they are unequal, the CPU raises a ``software
+  check exception`` with ``*tval = 3``.
+
+The compiler toolchain makes sure that function prologues have ``sspush
+x1/x5`` to save the return address on the shadow stack in addition to the
+regular stack. Similarly, function epilogues have ``ld x5, offset(x2)``
+followed by ``sspopchk x5`` to ensure that the value popped from the regular
+stack matches the value popped from the shadow stack.

+2. Shadow stack protections and linux memory manager
+----------------------------------------------------
+
+As mentioned earlier, shadow stack pages get new page table encodings, and
+thus have some special properties assigned to them and to the instructions
+that operate on them, as below:
+
+- Regular stores to shadow stack memory raise access store faults. This way
+  shadow stack memory is protected from stray inadvertent writes.
+
+- Regular loads from shadow stack memory are allowed. This allows stack trace
+  utilities or backtrace functions to read the true (untampered) call stack.
+
+- Only shadow stack instructions can generate a shadow stack load or shadow
+  stack store.
+
+- A shadow stack load / shadow stack store on read-only memory raises an
+  AMO/store page fault. Thus both ``sspush x1/x5`` and ``sspopchk x1/x5``
+  will raise an AMO/store page fault. This simplifies COW handling in the
+  kernel. During fork, the kernel can convert shadow stack pages into
+  read-only memory (as it does for regular read-write memory), and as soon
+  as a subsequent ``sspush`` or ``sspopchk`` in userspace is encountered,
+  the kernel can perform COW.
+
+- A shadow stack load / shadow stack store on read-write or
+  read-write-execute memory raises an access fault. This is a fatal
+  condition because a shadow stack should never be operating on read-write,
+  read-write-execute memory.
+
+3. ELF and psABI
+----------------
+
+The toolchain sets up :c:macro:`GNU_PROPERTY_RISCV_FEATURE_1_BCFI` for
+property :c:macro:`GNU_PROPERTY_RISCV_FEATURE_1_AND` in the notes section of
+the object file.
+
+4. Linux enabling
+-----------------
+
+User space programs can have multiple shared objects loaded in their address
+space, and it is a difficult task to make sure all the dependencies have been
+compiled with support for shadow stack. Thus it is left to the dynamic loader
+to enable shadow stack for the program.
+
+5. prctl() enabling
+-------------------
+
+:c:macro:`PR_SET_SHADOW_STACK_STATUS` / :c:macro:`PR_GET_SHADOW_STACK_STATUS` /
+:c:macro:`PR_LOCK_SHADOW_STACK_STATUS` are three prctls added to manage
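The protection rules in section 2 can be exercised from userspace with a small program. This is a sketch, not part of the documentation patch, and assumes the `__NR_map_shadow_stack` syscall from this series is available and that shadow stack is enabled for the process:

#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static void handler(int sig)
{
	static const char msg[] = "regular store to shadow stack faulted, as documented\n";

	write(1, msg, sizeof(msg) - 1);
	_exit(0);
}

int main(void)
{
	unsigned long *ss;
	long ret;

	signal(SIGSEGV, handler);

	ret = syscall(__NR_map_shadow_stack, 0, 4096, 0);
	if (ret == -1) {
		perror("map_shadow_stack");
		return 1;
	}
	ss = (unsigned long *)ret;

	/* Regular loads are allowed, e.g. for backtracing. */
	printf("first shadow stack entry: %#lx\n", ss[0]);

	/* A regular store must raise an access fault (SIGSEGV). */
	ss[0] = 0xdead;

	return 1; /* not reached if the fault is delivered */
}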
Re: [PATCH 17/33] prctl: arch-agnostic prctl for shadow stack
On Tue, Oct 01, 2024 at 09:06:22AM -0700, Deepak Gupta wrote: > From: Mark Brown > This is based on a patch originally written by Deepak Gupta but later > modified by Mark Brown for arm's GCS patch series. > > Signed-off-by: Mark Brown > Co-developed-by: Deepak Gupta > --- You need to add your own signoff to this when reposting, see submitting-patches.rst. signature.asc Description: PGP signature
[PATCH 05/33] riscv: Call riscv_user_isa_enable() only on the boot hart
From: Samuel Holland Now that the [ms]envcfg CSR value is maintained per thread, not per hart, riscv_user_isa_enable() only needs to be called once during boot, to set the value for the init task. This also allows it to be marked as __init. Reviewed-by: Andrew Jones Reviewed-by: Conor Dooley Reviewed-by: Deepak Gupta Signed-off-by: Samuel Holland --- arch/riscv/include/asm/cpufeature.h | 2 +- arch/riscv/kernel/cpufeature.c | 4 ++-- arch/riscv/kernel/smpboot.c | 2 -- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index 45f9c1171a48..ce9a995730c1 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -31,7 +31,7 @@ DECLARE_PER_CPU(struct riscv_cpuinfo, riscv_cpuinfo); /* Per-cpu ISA extensions. */ extern struct riscv_isainfo hart_isa[NR_CPUS]; -void riscv_user_isa_enable(void); +void __init riscv_user_isa_enable(void); #define _RISCV_ISA_EXT_DATA(_name, _id, _subset_exts, _subset_exts_size, _validate) { \ .name = #_name, \ diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index c0986291696a..7117366d80db 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -920,12 +920,12 @@ unsigned long riscv_get_elf_hwcap(void) return hwcap; } -void riscv_user_isa_enable(void) +void __init riscv_user_isa_enable(void) { if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICBOZ)) current->thread_info.envcfg |= ENVCFG_CBZE; else if (any_cpu_has_zicboz) - pr_warn_once("Zicboz disabled as it is unavailable on some harts\n"); + pr_warn("Zicboz disabled as it is unavailable on some harts\n"); } #ifdef CONFIG_RISCV_ALTERNATIVE diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index 0f8f1c95ac38..e36d20205bd7 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -233,8 +233,6 @@ asmlinkage __visible void smp_callin(void) numa_add_cpu(curr_cpuid); set_cpu_online(curr_cpuid, true); - riscv_user_isa_enable(); - /* * Remote cache and TLB flushes are ignored while the CPU is offline, * so flush them both right now just in case. -- 2.45.0
[PATCH 24/33] riscv/kernel: update __show_regs to print shadow stack register
Update __show_regs to print the captured shadow stack pointer as well. On tasks where shadow stack is disabled, it will simply print 0.

Signed-off-by: Deepak Gupta
Reviewed-by: Alexandre Ghiti
---
arch/riscv/kernel/process.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index 5207f018415c..6db0fde3701e 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -89,8 +89,8 @@ void __show_regs(struct pt_regs *regs)
 		regs->s8, regs->s9, regs->s10);
 	pr_cont(" s11: " REG_FMT " t3 : " REG_FMT " t4 : " REG_FMT "\n",
 		regs->s11, regs->t3, regs->t4);
-	pr_cont(" t5 : " REG_FMT " t6 : " REG_FMT "\n",
-		regs->t5, regs->t6);
+	pr_cont(" t5 : " REG_FMT " t6 : " REG_FMT " ssp : " REG_FMT "\n",
+		regs->t5, regs->t6, get_active_shstk(current));
 	pr_cont("status: " REG_FMT " badaddr: " REG_FMT " cause: " REG_FMT "\n",
 		regs->status, regs->badaddr, regs->cause);
-- 2.45.0
Re: [PATCH RFC v4 0/9] tun: Introduce virtio-net hashing feature
On Tue, 1 Oct 2024 14:54:29 +0900 Akihiko Odaki wrote: > On 2024/09/30 0:33, Stephen Hemminger wrote: > > On Sun, 29 Sep 2024 16:10:47 +0900 > > Akihiko Odaki wrote: > > > >> On 2024/09/29 11:07, Jason Wang wrote: > >>> On Fri, Sep 27, 2024 at 3:51 PM Akihiko Odaki > >>> wrote: > > On 2024/09/27 13:31, Jason Wang wrote: > > On Fri, Sep 27, 2024 at 10:11 AM Akihiko Odaki > > wrote: > >> > >> On 2024/09/25 12:30, Jason Wang wrote: > >>> On Tue, Sep 24, 2024 at 5:01 PM Akihiko Odaki > >>> wrote: > > virtio-net have two usage of hashes: one is RSS and another is hash > reporting. Conventionally the hash calculation was done by the VMM. > However, computing the hash after the queue was chosen defeats the > purpose of RSS. > > Another approach is to use eBPF steering program. This approach has > another downside: it cannot report the calculated hash due to the > restrictive nature of eBPF. > > Introduce the code to compute hashes to the kernel in order to > overcome > thse challenges. > > An alternative solution is to extend the eBPF steering program so > that it > will be able to report to the userspace, but it is based on context > rewrites, which is in feature freeze. We can adopt kfuncs, but they > will > not be UAPIs. We opt to ioctl to align with other relevant UAPIs (KVM > and vhost_net). > > >>> > >>> I wonder if we could clone the skb and reuse some to store the hash, > >>> then the steering eBPF program can access these fields without > >>> introducing full RSS in the kernel? > >> > >> I don't get how cloning the skb can solve the issue. > >> > >> We can certainly implement Toeplitz function in the kernel or even with > >> tc-bpf to store a hash value that can be used for eBPF steering program > >> and virtio hash reporting. However we don't have a means of storing a > >> hash type, which is specific to virtio hash reporting and lacks a > >> corresponding skb field. > > > > I may miss something but looking at sk_filter_is_valid_access(). It > > looks to me we can make use of skb->cb[0..4]? > > I didn't opt to using cb. Below is the rationale: > > cb is for tail call so it means we reuse the field for a different > purpose. The context rewrite allows adding a field without increasing > the size of the underlying storage (the real sk_buff) so we should add a > new field instead of reusing an existing field to avoid confusion. > > We are however no longer allowed to add a new field. In my > understanding, this is because it is an UAPI, and eBPF maintainers found > it is difficult to maintain its stability. > > Reusing cb for hash reporting is a workaround to avoid having a new > field, but it does not solve the underlying problem (i.e., keeping eBPF > as stable as UAPI is unreasonably hard). In my opinion, adding an ioctl > is a reasonable option to keep the API as stable as other virtualization > UAPIs while respecting the underlying intention of the context rewrite > feature freeze. > >>> > >>> Fair enough. > >>> > >>> Btw, I remember DPDK implements tuntap RSS via eBPF as well (probably > >>> via cls or other). It might worth to see if anything we miss here. > >> > >> Thanks for the information. I wonder why they used cls instead of > >> steering program. Perhaps it may be due to compatibility with macvtap > >> and ipvtap, which don't steering program. > >> > >> Their RSS implementation looks cleaner so I will improve my RSS > >> implementation accordingly. > >> > > > > DPDK needs to support flow rules. 
The specific case is where packets > > are classified by a flow, then RSS is done across a subset of the queues. > > The support for flow in TUN driver is more academic than useful, > > I fixed it for current BPF, but doubt anyone is using it really. > > > > A full steering program would be good, but would require much more > > complexity to take a general set of flow rules then communicate that > > to the steering program. > > > > It reminded me of RSS context and flow filter. Some physical NICs > support to use a dedicated RSS context for packets matched with flow > filter, and virtio is also gaining corresponding features. > > RSS context: https://github.com/oasis-tcs/virtio-spec/issues/178 > Flow filter: https://github.com/oasis-tcs/virtio-spec/issues/179 > > I considered about the possibility of supporting these features with tc > instead of adding ioctls to tuntap, but it seems not appropriate for > virtualization use case. > > In a virtualization use case, tuntap is configured according to requests > of guests, and the code processing these requests need to have mi
[PATCH 25/33] riscv/ptrace: riscv cfi status and state via ptrace and in core files
Expose a new register type NT_RISCV_USER_CFI for risc-v CFI status and state. Intentionally, both landing pad and shadow stack status and state are rolled into the CFI state. Creating two different NT_RISCV_USER_XXX types would not be useful and would waste a note type. Enabling or disabling the feature is not allowed via the ptrace set interface. However, setting the `elp` state or the shadow stack pointer is allowed via the ptrace set interface. It is expected that `gdb` might need to fix up the `elp` state or the shadow stack pointer.

Signed-off-by: Deepak Gupta
---
arch/riscv/include/uapi/asm/ptrace.h | 18
arch/riscv/kernel/ptrace.c | 83
include/uapi/linux/elf.h | 1 +
3 files changed, 102 insertions(+)

diff --git a/arch/riscv/include/uapi/asm/ptrace.h b/arch/riscv/include/uapi/asm/ptrace.h
index 659ea3af5680..e6571fba8a8a 100644
--- a/arch/riscv/include/uapi/asm/ptrace.h
+++ b/arch/riscv/include/uapi/asm/ptrace.h
@@ -131,6 +131,24 @@ struct __sc_riscv_cfi_state {
 	unsigned long ss_ptr;	/* shadow stack pointer */
 };

+struct __cfi_status {
+	/* indirect branch tracking state */
+	__u64 lp_en : 1;
+	__u64 lp_lock : 1;
+	__u64 elp_state : 1;
+
+	/* shadow stack status */
+	__u64 shstk_en : 1;
+	__u64 shstk_lock : 1;
+
+	__u64 rsvd : (sizeof(__u64) * 8 - 5);
+};
+
+struct user_cfi_state {
+	struct __cfi_status cfi_status;
+	__u64 shstk_ptr;
+};
+
 #endif /* __ASSEMBLY__ */
 #endif /* _UAPI_ASM_RISCV_PTRACE_H */

diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
index 92731ff8c79a..c69b20ea6e79 100644
--- a/arch/riscv/kernel/ptrace.c
+++ b/arch/riscv/kernel/ptrace.c
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include

 enum riscv_regset {
 	REGSET_X,
@@ -28,6 +29,9 @@ enum riscv_regset {
 #ifdef CONFIG_RISCV_ISA_V
 	REGSET_V,
 #endif
+#ifdef CONFIG_RISCV_USER_CFI
+	REGSET_CFI,
+#endif
 };

 static int riscv_gpr_get(struct task_struct *target,
@@ -152,6 +156,75 @@ static int riscv_vr_set(struct task_struct *target,
 }
 #endif

+#ifdef CONFIG_RISCV_USER_CFI
+static int riscv_cfi_get(struct task_struct *target,
+			 const struct user_regset *regset,
+			 struct membuf to)
+{
+	struct user_cfi_state user_cfi;
+	struct pt_regs *regs;
+
+	regs = task_pt_regs(target);
+
+	user_cfi.cfi_status.lp_en = is_indir_lp_enabled(target);
+	user_cfi.cfi_status.lp_lock = is_indir_lp_locked(target);
+	user_cfi.cfi_status.elp_state = (regs->status & SR_ELP);
+
+	user_cfi.cfi_status.shstk_en = is_shstk_enabled(target);
+	user_cfi.cfi_status.shstk_lock = is_shstk_locked(target);
+	user_cfi.shstk_ptr = get_active_shstk(target);
+
+	return membuf_write(&to, &user_cfi, sizeof(user_cfi));
+}
+
+/*
+ * Does it make sense to allow enabling / disabling of cfi via ptrace?
+ * Not allowing enable / disable / locking control via ptrace for now.
+ * Setting the shadow stack pointer is allowed; GDB might use it to unwind
+ * or for some other fixup. Similarly, gdb might want to suppress elp and
+ * may want to reset the elp state.
+ */
+static int riscv_cfi_set(struct task_struct *target,
+			 const struct user_regset *regset,
+			 unsigned int pos, unsigned int count,
+			 const void *kbuf, const void __user *ubuf)
+{
+	int ret;
+	struct user_cfi_state user_cfi;
+	struct pt_regs *regs;
+
+	regs = task_pt_regs(target);
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_cfi, 0, -1);
+	if (ret)
+		return ret;
+
+	/*
+	 * Not allowing enabling or locking of shadow stack or landing pad.
+	 * There is no disabling of shadow stack or landing pad via ptrace
+	 * either. The rsvd field must be zero so that those bits can be
+	 * used in the future.
+	 */
+	if (user_cfi.cfi_status.lp_en || user_cfi.cfi_status.lp_lock ||
+	    user_cfi.cfi_status.shstk_en || user_cfi.cfi_status.shstk_lock ||
+	    user_cfi.cfi_status.rsvd)
+		return -EINVAL;
+
+	/* If lpad is enabled on target and ptrace requests to set / clear elp, do that */
+	if (is_indir_lp_enabled(target)) {
+		if (user_cfi.cfi_status.elp_state) /* set elp state */
+			regs->status |= SR_ELP;
+		else
+			regs->status &= ~SR_ELP; /* clear elp state */
+	}
+
+	/* If shadow stack enabled on target, set new shadow stack pointer */
+	if (is_shstk_enabled(target))
+		set_active_shstk(target, user_cfi.shstk_ptr);
+
+	return 0;
+}
+#endif
+
 static const struct user_regset riscv_user_regset[] = {
 	[REGSET_X] = {
 		.core_note_type = NT_PRSTATUS,
@@ -182,6 +255,16 @@ static const struct user_regset riscv_user_regset[] = {
 		.set = riscv_
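For illustration, a debugger would read this regset roughly as follows. A hedged userspace sketch, not part of the patch: it assumes NT_RISCV_USER_CFI comes from the elf.h addition in this patch and struct user_cfi_state from the uapi ptrace.h above.

#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <linux/elf.h>
#include <asm/ptrace.h>

/* Dump a stopped tracee's CFI state via PTRACE_GETREGSET. */
static int dump_cfi_state(pid_t pid)
{
	struct user_cfi_state cfi;
	struct iovec iov = { .iov_base = &cfi, .iov_len = sizeof(cfi) };

	if (ptrace(PTRACE_GETREGSET, pid, NT_RISCV_USER_CFI, &iov) == -1) {
		perror("PTRACE_GETREGSET");
		return -1;
	}

	printf("lp_en=%llu shstk_en=%llu shstk_ptr=%#llx\n",
	       (unsigned long long)cfi.cfi_status.lp_en,
	       (unsigned long long)cfi.cfi_status.shstk_en,
	       (unsigned long long)cfi.shstk_ptr);
	return 0;
}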
[PATCH 22/33] riscv: signal: abstract header saving for setup_sigcontext
From: Andy Chiu The function save_v_state() served two purposes. First, it saved extension context into the signal stack. Then, it constructed the extension header if there was no fault. The second part is independent of the extension itself. As a result, we can pull that part out, so future extensions may reuse it. This patch adds arch_ext_list and makes setup_sigcontext() go through all possible extensions' save() callbacks. The callback returns a positive value indicating the size of the successfully saved extension. Then the kernel proceeds to construct the header for that extension. The kernel skips an extension if it does not exist, or if the saving fails for some reason. The error code is propagated out in the latter case. This patch does not introduce any functional changes.

Signed-off-by: Andy Chiu
---
arch/riscv/kernel/signal.c | 60 ++
1 file changed, 39 insertions(+), 21 deletions(-)

diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c
index dcd282419456..014ac1024b85 100644
--- a/arch/riscv/kernel/signal.c
+++ b/arch/riscv/kernel/signal.c
@@ -68,18 +68,18 @@ static long save_fp_state(struct pt_regs *regs,
 #define restore_fp_state(task, regs) (0)
 #endif

-#ifdef CONFIG_RISCV_ISA_V
-
-static long save_v_state(struct pt_regs *regs, void __user **sc_vec)
+static long save_v_state(struct pt_regs *regs, void __user *sc_vec)
 {
-	struct __riscv_ctx_hdr __user *hdr;
 	struct __sc_riscv_v_state __user *state;
 	void __user *datap;
 	long err;

-	hdr = *sc_vec;
-	/* Place state to the user's signal context space after the hdr */
-	state = (struct __sc_riscv_v_state __user *)(hdr + 1);
+	if (!IS_ENABLED(CONFIG_RISCV_ISA_V) ||
+	    !(has_vector() && riscv_v_vstate_query(regs)))
+		return 0;
+
+	/* Place state to the user's signal context space */
+	state = (struct __sc_riscv_v_state __user *)sc_vec;

 	/* Point datap right after the end of __sc_riscv_v_state */
 	datap = state + 1;
@@ -97,15 +97,11 @@ static long save_v_state(struct pt_regs *regs, void __user **sc_vec)
 	err |= __put_user((__force void *)datap, &state->v_state.datap);
 	/* Copy the whole vector content to user space datap. */
 	err |= __copy_to_user(datap, current->thread.vstate.datap, riscv_v_vsize);
-	/* Copy magic to the user space after saving all vector conetext */
-	err |= __put_user(RISCV_V_MAGIC, &hdr->magic);
-	err |= __put_user(riscv_v_sc_size, &hdr->size);
 	if (unlikely(err))
-		return err;
+		return -EFAULT;

-	/* Only progress the sv_vec if everything has done successfully */
-	*sc_vec += riscv_v_sc_size;
-	return 0;
+	/* Only return the size if everything has done successfully */
+	return riscv_v_sc_size;
 }

 /*
@@ -142,10 +138,19 @@ static long __restore_v_state(struct pt_regs *regs, void __user *sc_vec)
 	 */
 	return copy_from_user(current->thread.vstate.datap, datap, riscv_v_vsize);
 }
-#else
-#define save_v_state(task, regs) (0)
-#define __restore_v_state(task, regs) (0)
-#endif
+
+struct arch_ext_priv {
+	__u32 magic;
+	long (*save)(struct pt_regs *regs, void __user *sc_vec);
+};
+
+struct arch_ext_priv arch_ext_list[] = {
+	{
+		.magic = RISCV_V_MAGIC,
+		.save = &save_v_state,
+	},
+};
+
+const size_t nr_arch_exts = ARRAY_SIZE(arch_ext_list);

 static long restore_sigcontext(struct pt_regs *regs,
 	struct sigcontext __user *sc)
@@ -276,7 +281,8 @@ static long setup_sigcontext(struct rt_sigframe __user *frame,
 {
 	struct sigcontext __user *sc = &frame->uc.uc_mcontext;
 	struct __riscv_ctx_hdr __user *sc_ext_ptr = &sc->sc_extdesc.hdr;
-	long err;
+	struct arch_ext_priv *arch_ext;
+	long err, i, ext_size;

 	/* sc_regs is structured the same as the start of pt_regs */
 	err = __copy_to_user(&sc->sc_regs, regs, sizeof(sc->sc_regs));
@@ -284,8 +290,20 @@ static long setup_sigcontext(struct rt_sigframe __user *frame,
 	if (has_fpu())
 		err |= save_fp_state(regs, &sc->sc_fpregs);
 	/* Save the vector state. */
-	if (has_vector() && riscv_v_vstate_query(regs))
-		err |= save_v_state(regs, (void __user **)&sc_ext_ptr);
+	for (i = 0; i < nr_arch_exts; i++) {
+		arch_ext = &arch_ext_list[i];
+		if (!arch_ext->save)
+			continue;
+
+		ext_size = arch_ext->save(regs, sc_ext_ptr + 1);
+		if (ext_size <= 0) {
+			err |= ext_size;
+		} else {
+			err |= __put_user(arch_ext->magic, &sc_ext_ptr->magic);
+			err |= __put_user(ext_size, &sc_ext_ptr->size);
+			sc_ext_ptr = (void *)sc_ext_ptr + ext_size;
+		}
+	}
 	/* Wr
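To make the intended reuse concrete, a hypothetical future extension would hook in roughly as below. This is a sketch, not part of the patch; the FOO names (config symbol, magic value) are invented for illustration.

/*
 * Sketch of a later extension reusing the header logic: the save()
 * callback writes its payload at sc_vec and returns the number of bytes
 * the frame pointer should advance by (header + payload, mirroring how
 * save_v_state() returns riscv_v_sc_size), 0 to skip, or negative on
 * fault. setup_sigcontext() then writes the magic/size header.
 */
static long save_foo_state(struct pt_regs *regs, void __user *sc_vec)
{
	unsigned long __user *payload = sc_vec;

	if (!IS_ENABLED(CONFIG_RISCV_FOO))	/* invented config symbol */
		return 0;			/* nothing saved, no header */

	if (__put_user(0UL /* foo state */, payload))
		return -EFAULT;

	return sizeof(struct __riscv_ctx_hdr) + sizeof(unsigned long);
}

/* ...plus an extra entry next to the vector one in arch_ext_list: */
	{
		.magic = RISCV_FOO_MAGIC,	/* invented magic value */
		.save = &save_foo_state,
	},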
[PATCH 04/33] riscv: Add support for per-thread envcfg CSR values
From: Samuel Holland Some bits in the [ms]envcfg CSR, such as the CFI state and pointer masking mode, need to be controlled on a per-thread basis. Support this by keeping a copy of the CSR value in struct thread_struct and writing it during context switches. It is safe to discard the old CSR value during the context switch because the CSR is modified only by software, so the CSR will remain in sync with the copy in thread_struct. Use ALTERNATIVE directly instead of riscv_has_extension_unlikely() to minimize branchiness in the context switching code. Since thread_struct is copied during fork(), setting the value for the init task sets the default value for all other threads. Reviewed-by: Andrew Jones Reviewed-by: Deepak Gupta Signed-off-by: Samuel Holland --- arch/riscv/include/asm/switch_to.h | 8 arch/riscv/include/asm/thread_info.h | 1 + arch/riscv/kernel/cpufeature.c | 2 +- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/switch_to.h b/arch/riscv/include/asm/switch_to.h index 7594df37cc9f..dd4a36ff4356 100644 --- a/arch/riscv/include/asm/switch_to.h +++ b/arch/riscv/include/asm/switch_to.h @@ -70,6 +70,13 @@ static __always_inline bool has_fpu(void) { return false; } #define __switch_to_fpu(__prev, __next) do { } while (0) #endif +static inline void __switch_to_envcfg(struct task_struct *next) +{ + asm volatile (ALTERNATIVE("nop", "csrw " __stringify(CSR_ENVCFG) ", %0", + 0, RISCV_ISA_EXT_XLINUXENVCFG, 1) + :: "r" (next->thread_info.envcfg) : "memory"); +} + extern struct task_struct *__switch_to(struct task_struct *, struct task_struct *); @@ -103,6 +110,7 @@ do { \ __switch_to_vector(__prev, __next); \ if (switch_to_should_flush_icache(__next)) \ local_flush_icache_all(); \ + __switch_to_envcfg(__next); \ ((last) = __switch_to(__prev, __next)); \ } while (0) diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h index ebe52f96da34..e494871071da 100644 --- a/arch/riscv/include/asm/thread_info.h +++ b/arch/riscv/include/asm/thread_info.h @@ -57,6 +57,7 @@ struct thread_info { longuser_sp;/* User stack pointer */ int cpu; unsigned long syscall_work; /* SYSCALL_WORK_ flags */ + unsigned long envcfg; #ifdef CONFIG_SHADOW_CALL_STACK void*scs_base; void*scs_sp; diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index e560a253e99b..c0986291696a 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -923,7 +923,7 @@ unsigned long riscv_get_elf_hwcap(void) void riscv_user_isa_enable(void) { if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICBOZ)) - csr_set(CSR_ENVCFG, ENVCFG_CBZE); + current->thread_info.envcfg |= ENVCFG_CBZE; else if (any_cpu_has_zicboz) pr_warn_once("Zicboz disabled as it is unavailable on some harts\n"); } -- 2.45.0
Re: [PATCH 05/12] mm/memory: Add dax_insert_pfn
On Sun, 22 Sep 2024 03:41:57 +0200 Dan Williams wrote: > [ add s390 folks to comment on CONFIG_FS_DAX_LIMITED ] [...] > > @@ -2516,6 +2545,44 @@ static vm_fault_t __vm_insert_mixed(struct > > vm_area_struct *vma, > > return VM_FAULT_NOPAGE; > > } > > > > +vm_fault_t dax_insert_pfn(struct vm_fault *vmf, pfn_t pfn_t, bool write) > > +{ > > + struct vm_area_struct *vma = vmf->vma; > > + pgprot_t pgprot = vma->vm_page_prot; > > + unsigned long pfn = pfn_t_to_pfn(pfn_t); > > + struct page *page = pfn_to_page(pfn); > > The problem here is that we stubbornly have __dcssblk_direct_access() to > worry about. That is the only dax driver that does not return > pfn_valid() pfns. > > In fact, it looks like __dcssblk_direct_access() is the only thing > standing in the way of the removal of pfn_t. > > It turns out it has been 3 years since the last time the question of > bringing s390 fully into the ZONE_DEVICE regime was raised: > > https://lore.kernel.org/all/20210820210318.187742e8@thinkpad/ > > Given that this series removes PTE_DEVMAP which was a stumbling block, > would it be feasible to remove CONFIG_FS_DAX_LIMITED for a few kernel > cycles until someone from the s390 side can circle back to add full > ZONE_DEVICE support? Yes, see also my reply to your "dcssblk: Mark DAX broken" patch. Thanks Alistair for your effort, making ZONE_DEVICE usable w/o extra PTE bit!
Re: [PATCH v5 4/5] KVM: selftests: Add test for PSCI SYSTEM_OFF2
On Thu, Sep 26, 2024 at 07:37:59PM +0100, David Woodhouse wrote: > +static void guest_test_system_off2(void) > +{ > + uint64_t ret; > + > + /* assert that SYSTEM_OFF2 is discoverable */ > + GUEST_ASSERT(psci_features(PSCI_1_3_FN_SYSTEM_OFF2) & > + BIT(PSCI_1_3_HIBERNATE_TYPE_OFF)); > + GUEST_ASSERT(psci_features(PSCI_1_3_FN64_SYSTEM_OFF2) & > + BIT(PSCI_1_3_HIBERNATE_TYPE_OFF)); > + Can you also assert that the guest gets INVALID_PARAMETERS if it sets arg1 or arg2 to a reserved value? > + ret = psci_system_off2(PSCI_1_3_HIBERNATE_TYPE_OFF); > + GUEST_SYNC(ret); > +} > + > +static void host_test_system_off2(void) > +{ > + struct kvm_vcpu *source, *target; > + uint64_t psci_version = 0; > + struct kvm_run *run; > + struct kvm_vm *vm; > + > + vm = setup_vm(guest_test_system_off2, &source, &target); > + vcpu_get_reg(target, KVM_REG_ARM_PSCI_VERSION, &psci_version); > + TEST_ASSERT(psci_version >= PSCI_VERSION(0, 2), > + "Unexpected PSCI version %lu.%lu", > + PSCI_VERSION_MAJOR(psci_version), > + PSCI_VERSION_MINOR(psci_version)); > + > + if (psci_version < PSCI_VERSION(1,3)) > + goto skip; I'm not following this. Is there a particular reason why we'd want to skip for v1.2 and fail the test for anything less than that? Just do TEST_REQUIRE(psci_version >= PSCI_VERSION(1, 3)), it makes the requirements obvious in the case someone runs new selftests on an old kernel. -- Thanks, Oliver
Re: [PATCH v5 2/5] KVM: arm64: Add PSCI v1.3 SYSTEM_OFF2 function for hibernation
Hi David, On Thu, Sep 26, 2024 at 07:37:57PM +0100, David Woodhouse wrote: > @@ -392,6 +403,32 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 > minor) > break; > } > break; > + case PSCI_1_3_FN_SYSTEM_OFF2: > + kvm_psci_narrow_to_32bit(vcpu); > + fallthrough; > + case PSCI_1_3_FN64_SYSTEM_OFF2: > + if (minor < 3) > + break; > + > + arg = smccc_get_arg1(vcpu); > + if (arg != PSCI_1_3_HIBERNATE_TYPE_OFF) { > + val = PSCI_RET_INVALID_PARAMS; > + break; > + } This is missing a check that arg2 must be zero. > + kvm_psci_system_off2(vcpu); > + /* > + * We shouldn't be going back to guest VCPU after > + * receiving SYSTEM_OFF2 request. > + * > + * If user space accidentally/deliberately resumes > + * guest VCPU after SYSTEM_OFF2 request then guest > + * VCPU should see internal failure from PSCI return > + * value. To achieve this, we preload r0 (or x0) with > + * PSCI return value INTERNAL_FAILURE. > + */ > + val = PSCI_RET_INTERNAL_FAILURE; > + ret = 0; > + break; > default: > return kvm_psci_0_2_call(vcpu); > } > -- > 2.44.0 > -- Thanks, Oliver
Re: [PATCH v5 3/5] KVM: arm64: Add support for PSCI v1.2 and v1.3
On Thu, Sep 26, 2024 at 07:37:58PM +0100, David Woodhouse wrote: > From: David Woodhouse Please, add changelogs to your patches. What we really need here is the detail on *why* we can just bump the PSCI version like this, i.e. no new required ABI. On top of that, you could mention that KVM has made the implementation choice to provide SYSTEM_OFF2 unconditionally in its PSCIv1.3 implementation. > Signed-off-by: David Woodhouse > --- > arch/arm64/kvm/hypercalls.c | 2 ++ > arch/arm64/kvm/psci.c | 6 +- > include/kvm/arm_psci.h | 4 +++- > 3 files changed, 10 insertions(+), 2 deletions(-) > > diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c > index 5763d979d8ca..9c6267ca2b82 100644 > --- a/arch/arm64/kvm/hypercalls.c > +++ b/arch/arm64/kvm/hypercalls.c > @@ -575,6 +575,8 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const > struct kvm_one_reg *reg) > case KVM_ARM_PSCI_0_2: > case KVM_ARM_PSCI_1_0: > case KVM_ARM_PSCI_1_1: > + case KVM_ARM_PSCI_1_2: > + case KVM_ARM_PSCI_1_3: > if (!wants_02) > return -EINVAL; > vcpu->kvm->arch.psci_version = val; > diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c > index fd0f82464f7d..5177dda5a411 100644 > --- a/arch/arm64/kvm/psci.c > +++ b/arch/arm64/kvm/psci.c > @@ -328,7 +328,7 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 > minor) > > switch(psci_fn) { > case PSCI_0_2_FN_PSCI_VERSION: > - val = minor == 0 ? KVM_ARM_PSCI_1_0 : KVM_ARM_PSCI_1_1; > + val = PSCI_VERSION(1, minor); > break; > case PSCI_1_0_FN_PSCI_FEATURES: > arg = smccc_get_arg1(vcpu); > @@ -486,6 +486,10 @@ int kvm_psci_call(struct kvm_vcpu *vcpu) > } > > switch (version) { > + case KVM_ARM_PSCI_1_3: > + return kvm_psci_1_x_call(vcpu, 3); > + case KVM_ARM_PSCI_1_2: > + return kvm_psci_1_x_call(vcpu, 2); > case KVM_ARM_PSCI_1_1: > return kvm_psci_1_x_call(vcpu, 1); > case KVM_ARM_PSCI_1_0: > diff --git a/include/kvm/arm_psci.h b/include/kvm/arm_psci.h > index e8fb624013d1..cbaec804eb83 100644 > --- a/include/kvm/arm_psci.h > +++ b/include/kvm/arm_psci.h > @@ -14,8 +14,10 @@ > #define KVM_ARM_PSCI_0_2 PSCI_VERSION(0, 2) > #define KVM_ARM_PSCI_1_0 PSCI_VERSION(1, 0) > #define KVM_ARM_PSCI_1_1 PSCI_VERSION(1, 1) > +#define KVM_ARM_PSCI_1_2 PSCI_VERSION(1, 2) > +#define KVM_ARM_PSCI_1_3 PSCI_VERSION(1, 3) > > -#define KVM_ARM_PSCI_LATEST KVM_ARM_PSCI_1_1 > +#define KVM_ARM_PSCI_LATEST KVM_ARM_PSCI_1_3 > > static inline int kvm_psci_version(struct kvm_vcpu *vcpu) > { > -- > 2.44.0 > -- Thanks, Oliver
[PATCH] docs: dev-tools: Add documentation for the device focused kselftests
Add documentation for the kselftests focused on testing devices and point to it from the kselftest documentation. There are multiple tests in this category, so the aim of this page is to make it clear when to run each test.

Signed-off-by: Nícolas F. R. A. Prado
---
This patch depends on patch "kselftest: devices: Add test to detect missing devices" [1], since this patch documents that test.

[1] https://lore.kernel.org/all/20240928-kselftest-dev-exist-v2-1-fab07de6b...@collabora.com
---
Documentation/dev-tools/kselftest.rst | 9 ++
Documentation/dev-tools/testing-devices.rst | 47 +
2 files changed, 56 insertions(+)

diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst
index f3766e326d1e..fdb1df86783a 100644
--- a/Documentation/dev-tools/kselftest.rst
+++ b/Documentation/dev-tools/kselftest.rst
@@ -31,6 +31,15 @@ kselftest runs as a userspace process. Tests that can be written/run in
 userspace may wish to use the `Test Harness`_. Tests that need to be run in
 kernel space may wish to use a `Test Module`_.

+Documentation on the tests
+==========================
+
+For documentation on the kselftests themselves, see:
+
+.. toctree::
+
+   testing-devices
+
 Running the selftests (hotplug tests are run in limited mode)
 =============================================================

diff --git a/Documentation/dev-tools/testing-devices.rst b/Documentation/dev-tools/testing-devices.rst
new file mode 100644
index 000000000000..ab26adb99051
--- /dev/null
+++ b/Documentation/dev-tools/testing-devices.rst
@@ -0,0 +1,47 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. Copyright (c) 2024 Collabora Ltd
+
+=============================
+Device testing with kselftest
+=============================
+
+
+There are a few different kselftests available for testing devices generically,
+with some overlap in coverage and different requirements. This document aims to
+give an overview of each one.
+
+Note: Paths in this document are relative to the kselftest folder
+(``tools/testing/selftests``).
+
+Device oriented kselftests:
+
+* Devicetree (``dt``)
+
+  * **Coverage**: Probe status for devices described in Devicetree
+  * **Requirements**: None
+
+* Error logs (``devices/error_logs``)
+
+  * **Coverage**: Error (or more critical) log messages presence coming from any
+    device
+  * **Requirements**: None
+
+* Discoverable bus (``devices/probe``)
+
+  * **Coverage**: Presence and probe status of USB or PCI devices that have been
+    described in the reference file
+  * **Requirements**: Manually describe the devices that should be tested in a
+    YAML reference file (see ``devices/probe/boards/google,spherion.yaml`` for
+    an example)
+
+* Exist (``devices/exist``)
+
+  * **Coverage**: Presence of all devices
+  * **Requirements**: Generate the reference (see ``devices/exist/README.rst``
+    for details) on a known-good kernel
+
+Therefore, the suggestion is to enable the error log and devicetree tests on all
+(DT-based) platforms, since they don't have any requirements. Then, to greatly
+improve coverage, generate the reference for each platform and enable the exist
+test. The discoverable bus test can be used to verify the probe status of
+specific USB or PCI devices, but is probably not worth it for most cases.

---
base-commit: cea5425829f77e476b03702426f6b3701299b925
change-id: 20241001-kselftest-device-docs-6c8a411109b5

Best regards,
--
Nícolas F. R. A. Prado
Re: [PATCH doc] docs: gcov: fix link to LCOV website
On 26.09.2024 15:09, Matthieu Baerts (NGI0) wrote: > The previous website hosted on SourceForge is no longer available since > January 2024 according to archive.org [1]. > > It looks like the website has been officially moved to GitHub in June > 2022 [2]. Best to redirect readers to the new location then. > > Link: > https://web.archive.org/web/20240105235756/https://ltp.sourceforge.net/coverage/lcov.php > [1] > Link: https://github.com/linux-test-project/lcov/commit/6da8399c7a7a [2] > Signed-off-by: Matthieu Baerts (NGI0) Yes, let's update the link. Acked-by: Peter Oberparleiter -- Peter Oberparleiter Linux on IBM Z Development - IBM Germany R&D
Re: [PATCH v3] kernel-docs: Add new section for Rust learning materials
On Sun, Sep 22, 2024 at 6:04 PM Carlos Bilbao wrote: > > + * Title: **Experiment: Improving the Rust Book** > + > + :Author: Cognitive Engineering Lab at Brown University > + :URL: https://rust-book.cs.brown.edu/ > + :Date: Accessed Sep 22 2024 > + :Keywords: rust, blog. > + :Description: From the website: "The goal of this experiment is to > +evaluate and improve the content of the Rust Book to help people > +learn Rust more effectively.". Perhaps this could go closer to the Rust book entry since it is a variant of it. Or are these sorted in a particular way? > + * Title: **Opsem-team** (repository) > + > + :Author: Operational semantics team > + :URL: https://github.com/rust-lang/opsem-team/tree/main Nit: I think you can remove the `/tree/main` part of the URL to simplify. > + :Date: Accessed Sep 22 2024 Since these are repositories, and elsewhere you say "rolling version", should that one have a concrete date, or should some of these ones have a rolling date too? One more that I remembered and that we could have here if we have other videos/podcasts/... is Crust of Rust (and probably the book from the same author). In any case, I think it looks good, thanks! Acked-by: Miguel Ojeda Cheers, Miguel
Re: [PATCH 17/33] prctl: arch-agnostic prctl for shadow stack
On Tue, Oct 01, 2024 at 05:15:08PM +0100, Mark Brown wrote: On Tue, Oct 01, 2024 at 09:06:22AM -0700, Deepak Gupta wrote: From: Mark Brown This is based on a patch originally written by Deepak Gupta but later modified by Mark Brown for arm's GCS patch series. Signed-off-by: Mark Brown Co-developed-by: Deepak Gupta --- You need to add your own signoff to this when reposting, see submitting-patches.rst. Thanks. Will do that.
[PATCH v13 16/40] KVM: arm64: Manage GCS access and registers for guests
GCS introduces a number of system registers for EL1 and EL0, on systems with GCS we need to context switch them and expose them to VMMs to allow guests to use GCS. In order to allow guests to use GCS we also need to configure HCRX_EL2.GCSEn, if this is not set GCS instructions will be noops and CHKFEAT will report GCS as disabled. Also enable fine grained traps for access to the GCS registers by guests which do not have the feature enabled. In order to allow userspace to control availability of the feature to guests we enable writability for only ID_AA64PFR1_EL1.GCS, this is a deliberately conservative choice to avoid errors due to oversights. Further fields should be made writable in future. Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown --- arch/arm64/include/asm/kvm_host.h | 12 arch/arm64/include/asm/vncr_mapping.h | 2 ++ arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 31 ++ arch/arm64/kvm/sys_regs.c | 27 +- 4 files changed, 71 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 329619c6fa96..31887d3f3de1 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -448,6 +448,10 @@ enum vcpu_sysreg { POR_EL0,/* Permission Overlay Register 0 (EL0) */ + /* Guarded Control Stack registers */ + GCSCRE0_EL1,/* Guarded Control Stack Control (EL0) */ + GCSPR_EL0, /* Guarded Control Stack Pointer (EL0) */ + /* FP/SIMD/SVE */ SVCR, FPMR, @@ -525,6 +529,10 @@ enum vcpu_sysreg { VNCR(POR_EL1), /* Permission Overlay Register 1 (EL1) */ + /* Guarded Control Stack registers */ + VNCR(GCSPR_EL1),/* Guarded Control Stack Pointer (EL1) */ + VNCR(GCSCR_EL1),/* Guarded Control Stack Control (EL1) */ + VNCR(HFGRTR_EL2), VNCR(HFGWTR_EL2), VNCR(HFGITR_EL2), @@ -1495,4 +1503,8 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); (system_supports_fpmr() && \ kvm_has_feat((k), ID_AA64PFR2_EL1, FPMR, IMP)) +#define kvm_has_gcs(k) \ + (system_supports_gcs() && \ +kvm_has_feat((k), ID_AA64PFR1_EL1, GCS, IMP)) + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/vncr_mapping.h b/arch/arm64/include/asm/vncr_mapping.h index 06f8ec0906a6..e289064148b3 100644 --- a/arch/arm64/include/asm/vncr_mapping.h +++ b/arch/arm64/include/asm/vncr_mapping.h @@ -89,6 +89,8 @@ #define VNCR_PMSIRR_EL1 0x840 #define VNCR_PMSLATFR_EL1 0x848 #define VNCR_TRFCR_EL1 0x880 +#define VNCR_GCSPR_EL1 0x8C0 +#define VNCR_GCSCR_EL1 0x8D0 #define VNCR_MPAM1_EL1 0x900 #define VNCR_MPAMHCR_EL20x930 #define VNCR_MPAMVPMV_EL2 0x938 diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index 1579a3c08a36..70bd61430834 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -17,6 +17,7 @@ #include static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt); +static inline bool ctxt_has_gcs(struct kvm_cpu_context *ctxt); static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) { @@ -31,6 +32,11 @@ static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt) { ctxt_sys_reg(ctxt, TPIDR_EL0) = read_sysreg(tpidr_el0); ctxt_sys_reg(ctxt, TPIDRRO_EL0) = read_sysreg(tpidrro_el0); + + if (ctxt_has_gcs(ctxt)) { + ctxt_sys_reg(ctxt, GCSPR_EL0) = read_sysreg_s(SYS_GCSPR_EL0); + ctxt_sys_reg(ctxt, GCSCRE0_EL1) = read_sysreg_s(SYS_GCSCRE0_EL1); + } } static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt) @@ -83,6 +89,17 @@ static inline bool ctxt_has_s1poe(struct kvm_cpu_context 
*ctxt) return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, S1POE, IMP); } +static inline bool ctxt_has_gcs(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu; + + if (!cpus_have_final_cap(ARM64_HAS_GCS)) + return false; + + vcpu = ctxt_to_vcpu(ctxt); + return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64PFR1_EL1, GCS, IMP); +} + static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) { ctxt_sys_reg(ctxt, SCTLR_EL1) = read_sysreg_el1(SYS_SCTLR); @@ -96,6 +113,10 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) if (ctxt_has_s1pie(ctxt)) { ctxt_sys_reg(ctxt, PIR_EL1) = read_sysreg_el1(SYS_PIR); ctxt_sys_reg(ctxt, PIRE0_EL1) = read_sysreg_el1(SYS_PIRE0); + if (ctxt_has_gcs(ctxt)) { +
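As an illustration of how the registers exposed here reach a VMM, a hedged sketch of the usual KVM_GET_ONE_REG flow from userspace; the encoding for GCSPR_EL0 follows the sysreg (op0=3, op1=3, CRn=2, CRm=5, op2=1, i.e. S3_3_C2_C5_1), and read_gcspr_el0() is a hypothetical helper, not part of this series:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: read the guest's GCSPR_EL0 from a stopped vCPU. */
    static int read_gcspr_el0(int vcpu_fd, __u64 *val)
    {
    	struct kvm_one_reg reg = {
    		.id   = ARM64_SYS_REG(3, 3, 2, 5, 1),	/* GCSPR_EL0 */
    		.addr = (__u64)val,
    	};

    	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
    }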
[PATCH v13 17/40] arm64/idreg: Add override for GCS
Hook up an override for GCS, allowing it to be disabled from the command line by specifying arm64.nogcs in case there are problems. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Catalin Marinas Signed-off-by: Mark Brown --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ arch/arm64/kernel/pi/idreg-override.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1518343bbe22..c1b00f709734 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -446,6 +446,9 @@ arm64.nobti [ARM64] Unconditionally disable Branch Target Identification support + arm64.nogcs [ARM64] Unconditionally disable Guarded Control Stack + support + arm64.nomops[ARM64] Unconditionally disable Memory Copy and Memory Set instructions support diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index 29d4b6244a6f..2bb709d78405 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -133,6 +133,7 @@ static const struct ftr_set_desc pfr1 __prel64_initconst = { .override = &id_aa64pfr1_override, .fields = { FIELD("bt", ID_AA64PFR1_EL1_BT_SHIFT, NULL ), + FIELD("gcs", ID_AA64PFR1_EL1_GCS_SHIFT, NULL), FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL), FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter), {} @@ -215,6 +216,7 @@ static const struct { { "arm64.nosve","id_aa64pfr0.sve=0" }, { "arm64.nosme","id_aa64pfr1.sme=0" }, { "arm64.nobti","id_aa64pfr1.bt=0" }, + { "arm64.nogcs","id_aa64pfr1.gcs=0" }, { "arm64.nopauth", "id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 " "id_aa64isar1.api=0 id_aa64isar1.apa=0 " -- 2.39.2
[PATCH v13 10/40] arm64/gcs: Provide put_user_gcs()
In order for EL1 to write to an EL0 GCS it must use the GCSSTTR instruction rather than a normal STTR. Provide a put_user_gcs() which does this. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown --- arch/arm64/include/asm/uaccess.h | 18 ++ 1 file changed, 18 insertions(+) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 0db494b24dd0..5b91803201ef 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -522,6 +522,24 @@ static inline int gcssttr(unsigned long __user *addr, unsigned long val) return err; } +static inline void put_user_gcs(unsigned long val, unsigned long __user *addr, + int *err) +{ + int ret; + + if (!access_ok((char __user *)addr, sizeof(u64))) { + *err = -EFAULT; + return; + } + + uaccess_ttbr0_enable(); + ret = gcssttr(addr, val); + if (ret != 0) + *err = ret; + uaccess_ttbr0_disable(); +} + + #endif /* CONFIG_ARM64_GCS */ #endif /* __ASM_UACCESS_H */ -- 2.39.2
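For callers, the helper reports failure through an error pointer rather than a return value. A minimal usage sketch based only on the signature above; the caller, its token value and gcspr_el0 local are hypothetical:

    int err = 0;
    unsigned long __user *slot = (unsigned long __user *)(gcspr_el0 - 8);

    /* Write a value (e.g. a cap token) into the user's GCS. */
    put_user_gcs(token, slot, &err);
    if (err)
    	return err;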
[PATCH v13 21/40] arm64/gcs: Context switch GCS state for EL0
There are two registers controlling the GCS state of EL0, GCSPR_EL0 which is the current GCS pointer and GCSCRE0_EL1 which has enable bits for the specific GCS functionality enabled for EL0. Manage these on context switch and process lifetime events, GCS is reset on exec(). Also ensure that any changes to the GCS memory are visible to other PEs and that changes from other PEs are visible on this one by issuing a GCSB DSYNC when moving to or from a thread with GCS. Since the current GCS configuration of a thread will be visible to userspace we store the configuration in the format used with userspace and provide a helper which configures the system register as needed. On systems that support GCS we always allow access to GCSPR_EL0, this facilitates reporting of GCS faults if userspace implements disabling of GCS on error - the GCS can still be discovered and examined even if GCS has been disabled. Reviewed-by: Catalin Marinas Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown --- arch/arm64/include/asm/gcs.h | 24 +++ arch/arm64/include/asm/processor.h | 6 arch/arm64/kernel/process.c| 62 ++ arch/arm64/mm/Makefile | 1 + arch/arm64/mm/gcs.c| 42 ++ 5 files changed, 135 insertions(+) diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 7c5e95218db6..04594ef59dad 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -48,4 +48,28 @@ static inline u64 gcsss2(void) return Xt; } +#ifdef CONFIG_ARM64_GCS + +static inline bool task_gcs_el0_enabled(struct task_struct *task) +{ + return current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE; +} + +void gcs_set_el0_mode(struct task_struct *task); +void gcs_free(struct task_struct *task); +void gcs_preserve_current_state(void); + +#else + +static inline bool task_gcs_el0_enabled(struct task_struct *task) +{ + return false; +} + +static inline void gcs_set_el0_mode(struct task_struct *task) { } +static inline void gcs_free(struct task_struct *task) { } +static inline void gcs_preserve_current_state(void) { } + +#endif + #endif diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 1438424f0064..5260788247d8 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -185,6 +185,12 @@ struct thread_struct { u64 svcr; u64 tpidr2_el0; u64 por_el0; +#ifdef CONFIG_ARM64_GCS + unsigned intgcs_el0_mode; + u64 gcspr_el0; + u64 gcs_base; + u64 gcs_size; +#endif }; static inline unsigned int thread_get_vl(struct thread_struct *thread, diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 0540653fbf38..aedcf332f422 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -280,6 +281,25 @@ static void flush_poe(void) write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); } +#ifdef CONFIG_ARM64_GCS + +static void flush_gcs(void) +{ + if (!system_supports_gcs()) + return; + + gcs_free(current); + current->thread.gcs_el0_mode = 0; + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); + write_sysreg_s(0, SYS_GCSPR_EL0); +} + +#else + +static void flush_gcs(void) { } + +#endif + void flush_thread(void) { fpsimd_flush_thread(); @@ -287,6 +307,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(current); flush_tagged_addr_state(); flush_poe(); + flush_gcs(); } void arch_release_task_struct(struct task_struct *tsk) @@ -484,6 +505,46 @@ static void entry_task_switch(struct task_struct *next) __this_cpu_write(__entry_task, next); } 
+#ifdef CONFIG_ARM64_GCS + +void gcs_preserve_current_state(void) +{ + current->thread.gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); +} + +static void gcs_thread_switch(struct task_struct *next) +{ + if (!system_supports_gcs()) + return; + + /* GCSPR_EL0 is always readable */ + gcs_preserve_current_state(); + write_sysreg_s(next->thread.gcspr_el0, SYS_GCSPR_EL0); + + if (current->thread.gcs_el0_mode != next->thread.gcs_el0_mode) + gcs_set_el0_mode(next); + + /* +* Ensure that GCS memory effects of the 'prev' thread are +* ordered before other memory accesses with release semantics +* (or preceded by a DMB) on the current PE. In addition, any +* memory accesses with acquire semantics (or succeeded by a +* DMB) are ordered before GCS memory effects of the 'next' +* thread. This will ensure that the GCS memory effects are +* visible to other PEs in case
[PATCH v13 22/40] arm64/gcs: Ensure that new threads have a GCS
When a new thread is created by a thread with GCS enabled the GCS needs to be specified along with the regular stack. Unfortunately plain clone() is not extensible and existing clone3() users will not specify a stack so all existing code would be broken if we mandated specifying the stack explicitly. For compatibility with these cases, and with x86 (which did not initially implement clone3() support for shadow stacks), when a thread is created with GCS enabled but no GCS is specified we allocate one for it. We follow the extensively discussed x86 implementation and allocate min(RLIMIT_STACK/2, 2G). Since the GCS only stores the call stack and not any variables this should be more than sufficient for most applications. GCSs allocated via this mechanism will be freed when the thread exits. Reviewed-by: Thiago Jung Bauermann Acked-by: Yury Khrustalev Signed-off-by: Mark Brown --- arch/arm64/include/asm/gcs.h | 9 + arch/arm64/include/asm/mmu_context.h | 9 + arch/arm64/kernel/process.c | 32 + arch/arm64/mm/gcs.c | 69 4 files changed, 119 insertions(+) diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 04594ef59dad..c1f274fdb9c0 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -8,6 +8,8 @@ #include #include +struct kernel_clone_args; + static inline void gcsb_dsync(void) { asm volatile(".inst 0xd503227f" : : : "memory"); } @@ -58,6 +60,8 @@ static inline bool task_gcs_el0_enabled(struct task_struct *task) void gcs_set_el0_mode(struct task_struct *task); void gcs_free(struct task_struct *task); void gcs_preserve_current_state(void); +unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, +const struct kernel_clone_args *args); #else @@ -69,6 +73,11 @@ static inline bool task_gcs_el0_enabled(struct task_struct *task) static inline void gcs_set_el0_mode(struct task_struct *task) { } static inline void gcs_free(struct task_struct *task) { } static inline void gcs_preserve_current_state(void) { } +static inline unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, + const struct kernel_clone_args *args) +{ + return -ENOTSUPP; +} #endif diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 7c09d47e09cb..48b3d9553b67 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -311,6 +312,14 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return por_el0_allows_pkey(vma_pkey(vma), write, execute); } +#define deactivate_mm deactivate_mm +static inline void deactivate_mm(struct task_struct *tsk, + struct mm_struct *mm) +{ + gcs_free(tsk); +} + + #include #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index aedcf332f422..fdd095480c3f 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -294,9 +294,35 @@ static void flush_gcs(void) write_sysreg_s(0, SYS_GCSPR_EL0); } +static int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + unsigned long gcs; + + if (!system_supports_gcs()) + return 0; + + p->thread.gcs_base = 0; + p->thread.gcs_size = 0; + + gcs = gcs_alloc_thread_stack(p, args); + if (IS_ERR_VALUE(gcs)) + return PTR_ERR((void *)gcs); + + p->thread.gcs_el0_mode = current->thread.gcs_el0_mode; + p->thread.gcs_el0_locked = current->thread.gcs_el0_locked; + + return 0; +} + #else static void flush_gcs(void) { } +static 
int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + return 0; +} #endif @@ -313,6 +339,7 @@ void flush_thread(void) void arch_release_task_struct(struct task_struct *tsk) { fpsimd_release_task(tsk); + gcs_free(tsk); } int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) @@ -376,6 +403,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) unsigned long stack_start = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); + int ret; memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context)); @@ -420,6 +448,10 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) p->thread.uw.tp_value = tls; p->thread.tpidr2_el0 = 0; } + + ret = copy_thread_gcs(p, args); + if (ret != 0) +
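A compact sketch of the default sizing policy described in the commit message, min(RLIMIT_STACK/2, 2G) rounded to pages, composed from standard kernel helpers (rlimit(), min_t(), PAGE_ALIGN, SZ_2G); the real gcs_alloc_thread_stack() also has to honour a caller-supplied size and perform the mapping, so treat this as illustrative only:

    /* Sketch: default GCS size when clone3() did not supply one. */
    static unsigned long gcs_default_size(void)
    {
    	unsigned long size = rlimit(RLIMIT_STACK) / 2;

    	return PAGE_ALIGN(min_t(unsigned long, size, SZ_2G));
    }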
[PATCH v13 19/40] arm64/traps: Handle GCS exceptions
A new exception code is defined for GCS-specific faults other than standard load/store faults, for example GCS token validation failures; add handling for this. These faults are reported to userspace as segfaults with code SEGV_CPERR (protection error), mirroring the reporting for x86 shadow stack errors. GCS faults due to memory load/store operations generate data aborts with a flag set; these will be handled separately as part of the data abort handling. Since we do not currently enable GCS for EL1 we should not get any faults there, but while we're at it we wire things up there, treating any GCS fault as fatal. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown --- arch/arm64/include/asm/esr.h | 28 +++- arch/arm64/include/asm/exception.h | 2 ++ arch/arm64/kernel/entry-common.c | 23 +++ arch/arm64/kernel/traps.c | 11 +++ 4 files changed, 63 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index da6d2c1c0b03..d1b1a33f9a8b 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -51,7 +51,8 @@ #define ESR_ELx_EC_FP_EXC32UL(0x28) /* Unallocated EC: 0x29 - 0x2B */ #define ESR_ELx_EC_FP_EXC64UL(0x2C) -/* Unallocated EC: 0x2D - 0x2E */ +#define ESR_ELx_EC_GCS UL(0x2D) +/* Unallocated EC: 0x2E */ #define ESR_ELx_EC_SERROR UL(0x2F) #define ESR_ELx_EC_BREAKPT_LOW UL(0x30) #define ESR_ELx_EC_BREAKPT_CUR UL(0x31) @@ -386,6 +387,31 @@ #define ESR_ELx_MOPS_ISS_SRCREG(esr) (((esr) & (UL(0x1f) << 5)) >> 5) #define ESR_ELx_MOPS_ISS_SIZEREG(esr) (((esr) & (UL(0x1f) << 0)) >> 0) +/* ISS field definitions for GCS */ +#define ESR_ELx_ExType_SHIFT (20) +#define ESR_ELx_ExType_MASKGENMASK(23, 20) +#define ESR_ELx_Raddr_SHIFT(10) +#define ESR_ELx_Raddr_MASK GENMASK(14, 10) +#define ESR_ELx_Rn_SHIFT (5) +#define ESR_ELx_Rn_MASKGENMASK(9, 5) +#define ESR_ELx_Rvalue_SHIFT 5 +#define ESR_ELx_Rvalue_MASKGENMASK(9, 5) +#define ESR_ELx_IT_SHIFT (0) +#define ESR_ELx_IT_MASKGENMASK(4, 0) + +#define ESR_ELx_ExType_DATA_CHECK 0 +#define ESR_ELx_ExType_EXLOCK 1 +#define ESR_ELx_ExType_STR 2 + +#define ESR_ELx_IT_RET 0 +#define ESR_ELx_IT_GCSPOPM 1 +#define ESR_ELx_IT_RET_KEYA2 +#define ESR_ELx_IT_RET_KEYB3 +#define ESR_ELx_IT_GCSSS1 4 +#define ESR_ELx_IT_GCSSS2 5 +#define ESR_ELx_IT_GCSPOPCX6 +#define ESR_ELx_IT_GCSPOPX 7 + #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index f296662590c7..674518464718 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -57,6 +57,8 @@ void do_el0_undef(struct pt_regs *regs, unsigned long esr); void do_el1_undef(struct pt_regs *regs, unsigned long esr); void do_el0_bti(struct pt_regs *regs); void do_el1_bti(struct pt_regs *regs, unsigned long esr); +void do_el0_gcs(struct pt_regs *regs, unsigned long esr); +void do_el1_gcs(struct pt_regs *regs, unsigned long esr); void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr, struct pt_regs *regs); void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs); diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 3fcd9d080bf2..fe74813009bd 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -463,6 +463,15 @@ static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) exit_to_kernel_mode(regs); } +static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) +{ + enter_from_kernel_mode(regs); + local_daif_inherit(regs); + 
do_el1_gcs(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); +} + static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -505,6 +514,9 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_BTI: el1_bti(regs, esr); break; + case ESR_ELx_EC_GCS: + el1_gcs(regs, esr); + break; case ESR_ELx_EC_BREAKPT_CUR: case ESR_ELx_EC_SOFTSTP_CUR: case ESR_ELx_EC_WATCHPT_CUR: @@ -684,6 +696,14 @@ static void noinstr el0_mops(struct pt_regs *regs, unsigned long esr) exit_to_user_mode(regs); } +static void noinstr el0_gcs(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_gcs(regs, esr); + exit_to_user_mode(regs); +} + static void noinstr el0_inv(struct pt_regs *regs, unsigned long e
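With the ISS fields defined above, classifying a GCS exception is a simple mask-and-shift. A small sketch of the kind of decode the handlers can do, using only the macros from this patch (the helper name is hypothetical):

    /* Sketch: check for an exclusive-lock (EXLOCK) GCS exception. */
    static bool esr_is_gcs_exlock(unsigned long esr)
    {
    	unsigned long extype;

    	extype = (esr & ESR_ELx_ExType_MASK) >> ESR_ELx_ExType_SHIFT;
    	return extype == ESR_ELx_ExType_EXLOCK;
    }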
[PATCH v13 27/40] arm64/ptrace: Expose GCS via ptrace and core files
Provide a new register type NT_ARM_GCS reporting the current GCS mode and pointer for EL0. Due to the interactions with allocation and deallocation of Guarded Control Stacks we do not permit any changes to the GCS mode via ptrace, only GCSPR_EL0 may be changed. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown --- arch/arm64/include/uapi/asm/ptrace.h | 8 + arch/arm64/kernel/ptrace.c | 62 +++- include/uapi/linux/elf.h | 1 + 3 files changed, 70 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 7fa2f7036aa7..0f39ba4f3efd 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -324,6 +324,14 @@ struct user_za_header { #define ZA_PT_SIZE(vq) \ (ZA_PT_ZA_OFFSET + ZA_PT_ZA_SIZE(vq)) +/* GCS state (NT_ARM_GCS) */ + +struct user_gcs { + __u64 features_enabled; + __u64 features_locked; + __u64 gcspr_el0; +}; + #endif /* __ASSEMBLY__ */ #endif /* _UAPI__ASM_PTRACE_H */ diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index b756578aeaee..6c1dcfe6d25a 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1473,6 +1474,52 @@ static int poe_set(struct task_struct *target, const struct } #endif +#ifdef CONFIG_ARM64_GCS +static int gcs_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct user_gcs user_gcs; + + if (!system_supports_gcs()) + return -EINVAL; + + if (target == current) + gcs_preserve_current_state(); + + user_gcs.features_enabled = target->thread.gcs_el0_mode; + user_gcs.features_locked = target->thread.gcs_el0_locked; + user_gcs.gcspr_el0 = target->thread.gcspr_el0; + + return membuf_write(&to, &user_gcs, sizeof(user_gcs)); +} + +static int gcs_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + struct user_gcs user_gcs; + + if (!system_supports_gcs()) + return -EINVAL; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_gcs, 0, -1); + if (ret) + return ret; + + if (user_gcs.features_enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + target->thread.gcs_el0_mode = user_gcs.features_enabled; + target->thread.gcs_el0_locked = user_gcs.features_locked; + target->thread.gcspr_el0 = user_gcs.gcspr_el0; + + return 0; +} +#endif + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -1503,7 +1550,10 @@ enum aarch64_regset { REGSET_TAGGED_ADDR_CTRL, #endif #ifdef CONFIG_ARM64_POE - REGSET_POE + REGSET_POE, +#endif +#ifdef CONFIG_ARM64_GCS + REGSET_GCS, #endif }; @@ -1674,6 +1724,16 @@ static const struct user_regset aarch64_regsets[] = { .set = poe_set, }, #endif +#ifdef CONFIG_ARM64_GCS + [REGSET_GCS] = { + .core_note_type = NT_ARM_GCS, + .n = sizeof(struct user_gcs) / sizeof(u64), + .size = sizeof(u64), + .align = sizeof(u64), + .regset_get = gcs_get, + .set = gcs_set, + }, +#endif }; static const struct user_regset_view user_aarch64_view = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index b9935988da5c..9adc218fb6df 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -443,6 +443,7 @@ typedef struct elf64_shdr { #define NT_ARM_ZT 0x40d /* ARM SME ZT registers */ #define NT_ARM_FPMR0x40e /* ARM floating point mode register */ #define NT_ARM_POE 0x40f /* ARM POE registers */ +#define NT_ARM_GCS 
0x410 /* ARM GCS state */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ #define NT_VMCOREDD0x700 /* Vmcore Device Dump Note */ #define NT_MIPS_DSP0x800 /* MIPS DSP ASE registers */ -- 2.39.2
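From a debugger's point of view this is the standard regset flow. A hedged userspace sketch reading the new regset from a stopped tracee, assuming the NT_ARM_GCS and struct user_gcs definitions added by this patch are visible via the uapi headers:

    #include <stdio.h>
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/uio.h>
    #include <linux/elf.h>
    #include <asm/ptrace.h>

    /* Sketch: dump a tracee's GCS state via PTRACE_GETREGSET. */
    static int dump_gcs(pid_t pid)
    {
    	struct user_gcs gcs;
    	struct iovec iov = { .iov_base = &gcs, .iov_len = sizeof(gcs) };

    	if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &iov) < 0)
    		return -1;

    	printf("enabled %llx locked %llx gcspr_el0 %llx\n",
    	       gcs.features_enabled, gcs.features_locked,
    	       gcs.gcspr_el0);
    	return 0;
    }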
[PATCH v13 26/40] arm64/signal: Expose GCS state in signal frames
Add a context for the GCS state and include it in the signal context when running on a system that supports GCS. We reuse the same flags that the prctl() uses to specify which GCS features are enabled and also provide the current GCS pointer. We do not support enabling GCS via signal return; there is a conflict between specifying GCSPR_EL0 and allocation of a new GCS, and this is not an anticipated use case. We also enforce GCS configuration locking on signal return. Reviewed-by: Catalin Marinas Reviewed-by: Thiago Jung Bauermann Acked-by: Yury Khrustalev Signed-off-by: Mark Brown --- arch/arm64/include/uapi/asm/sigcontext.h | 9 +++ arch/arm64/kernel/signal.c | 109 +++ 2 files changed, 118 insertions(+) diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index bb7af77a30a7..d42f7a92238b 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -183,6 +183,15 @@ struct zt_context { __u16 __reserved[3]; }; +#define GCS_MAGIC 0x47435300 + +struct gcs_context { + struct _aarch64_ctx head; + __u64 gcspr; + __u64 features_enabled; + __u64 reserved; +}; + #endif /* !__ASSEMBLY__ */ #include diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index b5ab0e229a78..62d666278264 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -66,6 +66,7 @@ struct rt_sigframe_user_layout { unsigned long fpsimd_offset; unsigned long esr_offset; + unsigned long gcs_offset; unsigned long sve_offset; unsigned long tpidr2_offset; unsigned long za_offset; @@ -198,6 +199,8 @@ struct user_ctxs { u32 fpmr_size; struct poe_context __user *poe; u32 poe_size; + struct gcs_context __user *gcs; + u32 gcs_size; }; static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) @@ -643,6 +646,82 @@ extern int restore_zt_context(struct user_ctxs *user); #endif /* ! CONFIG_ARM64_SME */ +#ifdef CONFIG_ARM64_GCS + +static int preserve_gcs_context(struct gcs_context __user *ctx) +{ + int err = 0; + u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); + + /* +* If GCS is enabled we will add a cap token to the frame, +* include it in the GCSPR_EL0 we report to support stack +* switching via sigreturn if GCS is enabled. We do not allow +* enabling via sigreturn so the token is only relevant for +* threads with GCS enabled. 
+*/ + if (task_gcs_el0_enabled(current)) + gcspr -= 8; + + __put_user_error(GCS_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(gcspr, &ctx->gcspr, err); + __put_user_error(0, &ctx->reserved, err); + __put_user_error(current->thread.gcs_el0_mode, +&ctx->features_enabled, err); + + return err; +} + +static int restore_gcs_context(struct user_ctxs *user) +{ + u64 gcspr, enabled; + int err = 0; + + if (user->gcs_size != sizeof(*user->gcs)) + return -EINVAL; + + __get_user_error(gcspr, &user->gcs->gcspr, err); + __get_user_error(enabled, &user->gcs->features_enabled, err); + if (err) + return err; + + /* Don't allow unknown modes */ + if (enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + err = gcs_check_locked(current, enabled); + if (err != 0) + return err; + + /* Don't allow enabling */ + if (!task_gcs_el0_enabled(current) && + (enabled & PR_SHADOW_STACK_ENABLE)) + return -EINVAL; + + /* If we are disabling disable everything */ + if (!(enabled & PR_SHADOW_STACK_ENABLE)) + enabled = 0; + + current->thread.gcs_el0_mode = enabled; + + /* +* We let userspace set GCSPR_EL0 to anything here, we will +* validate later in gcs_restore_signal(). +*/ + write_sysreg_s(gcspr, SYS_GCSPR_EL0); + + return 0; +} + +#else /* ! CONFIG_ARM64_GCS */ + +/* Turn any non-optimised out attempts to use these into a link error: */ +extern int preserve_gcs_context(void __user *ctx); +extern int restore_gcs_context(struct user_ctxs *user); + +#endif /* ! CONFIG_ARM64_GCS */ + static int parse_user_sigframe(struct user_ctxs *user, struct rt_sigframe __user *sf) { @@ -661,6 +740,7 @@ static int parse_user_sigframe(struct user_ctxs *user, user->zt = NULL; user->fpmr = NULL; user->poe = NULL; + user->gcs = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; @@ -777,6 +857,17 @@ static int parse_user_sigframe(struct user_ctxs *user, user->fpmr_size = size; break; +
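On the userspace side the new record is found the same way as the other optional records, by walking the _aarch64_ctx chain in the frame's __reserved area. A sketch, assuming the arm64 glibc ucontext layout and the GCS_MAGIC/struct gcs_context definitions from this patch:

    #include <stddef.h>
    #include <ucontext.h>
    #include <asm/sigcontext.h>

    /* Sketch: locate the GCS record in a signal frame, if present. */
    static struct gcs_context *find_gcs_context(ucontext_t *uc)
    {
    	struct _aarch64_ctx *head =
    		(struct _aarch64_ctx *)uc->uc_mcontext.__reserved;

    	while (head->magic && head->size) {
    		if (head->magic == GCS_MAGIC)
    			return (struct gcs_context *)head;
    		head = (struct _aarch64_ctx *)((char *)head + head->size);
    	}

    	return NULL;
    }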
[PATCH v13 28/40] arm64: Add Kconfig for Guarded Control Stack (GCS)
Provide a Kconfig option allowing the user to select if GCS support is built into the kernel. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown --- arch/arm64/Kconfig | 21 + 1 file changed, 21 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3e29b44d2d7b..dcb12f041c13 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2178,6 +2178,27 @@ config ARCH_PKEY_BITS endmenu # "ARMv8.9 architectural features" +menu "v9.4 architectural features" + +config ARM64_GCS + bool "Enable support for Guarded Control Stack (GCS)" + default y + select ARCH_HAS_USER_SHADOW_STACK + select ARCH_USES_HIGH_VMA_FLAGS + depends on !UPROBES + help + Guarded Control Stack (GCS) provides support for a separate + stack with restricted access which contains only return + addresses. This can be used to harden against some attacks + by comparing return address used by the program with what is + stored in the GCS, and may also be used to efficiently obtain + the call stack for applications such as profiling. + + The feature is detected at runtime, and will remain disabled + if the system does not implement the feature. + +endmenu # "v9.4 architectural features" + config ARM64_SVE bool "ARM Scalable Vector Extension support" default y -- 2.39.2
[PATCH v13 29/40] kselftest/arm64: Verify the GCS hwcap
Add coverage of the GCS hwcap to the hwcap selftest, using a read of GCSPR_EL0 to generate SIGILL without having to worry about enabling GCS. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown --- tools/testing/selftests/arm64/abi/hwcap.c | 19 +++ 1 file changed, 19 insertions(+) diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index f2d6007a2b98..1f07772ae578 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -98,6 +98,17 @@ static void fpmr_sigill(void) asm volatile("mrs x0, S3_3_C4_C4_2" : : : "x0"); } +static void gcs_sigill(void) +{ + unsigned long *gcspr; + + asm volatile( + "mrs%0, S3_3_C2_C5_1" + : "=r" (gcspr) + : + : "cc"); +} + static void ilrcpc_sigill(void) { /* LDAPUR W0, [SP, #8] */ @@ -534,6 +545,14 @@ static const struct hwcap_data { .sigill_fn = fpmr_sigill, .sigill_reliable = true, }, + { + .name = "GCS", + .at_hwcap = AT_HWCAP, + .hwcap_bit = HWCAP_GCS, + .cpuinfo = "gcs", + .sigill_fn = gcs_sigill, + .sigill_reliable = true, + }, { .name = "JSCVT", .at_hwcap = AT_HWCAP, -- 2.39.2
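Outside of the selftest, the same probe is a one-liner against the auxiliary vector; a sketch, with HWCAP_GCS as added earlier in this series:

    #include <stdbool.h>
    #include <sys/auxv.h>
    #include <asm/hwcap.h>

    /* Sketch: runtime GCS detection, equivalent to the hwcap check above. */
    static bool have_gcs(void)
    {
    	return getauxval(AT_HWCAP) & HWCAP_GCS;
    }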
[PATCH v13 34/40] kselftest/arm64: Add very basic GCS test program
This test program just covers the basic GCS ABI, covering aspects of the ABI as standalone features without attempting to integrate things. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown --- tools/testing/selftests/arm64/Makefile| 2 +- tools/testing/selftests/arm64/gcs/.gitignore | 1 + tools/testing/selftests/arm64/gcs/Makefile| 18 ++ tools/testing/selftests/arm64/gcs/basic-gcs.c | 357 ++ tools/testing/selftests/arm64/gcs/gcs-util.h | 90 +++ 5 files changed, 467 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile index 28b93cab8c0d..22029e60eff3 100644 --- a/tools/testing/selftests/arm64/Makefile +++ b/tools/testing/selftests/arm64/Makefile @@ -4,7 +4,7 @@ ARCH ?= $(shell uname -m 2>/dev/null || echo not) ifneq (,$(filter $(ARCH),aarch64 arm64)) -ARM64_SUBTARGETS ?= tags signal pauth fp mte bti abi +ARM64_SUBTARGETS ?= tags signal pauth fp mte bti abi gcs else ARM64_SUBTARGETS := endif diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore new file mode 100644 index ..0e5e695ecba5 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/.gitignore @@ -0,0 +1 @@ +basic-gcs diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile new file mode 100644 index ..61a30f483429 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2023 ARM Limited +# +# In order to avoid interaction with the toolchain and dynamic linker the +# portions of these tests that interact with the GCS are implemented using +# nolibc. +# + +TEST_GEN_PROGS := basic-gcs + +include ../../lib.mk + +$(OUTPUT)/basic-gcs: basic-gcs.c + $(CC) -g -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ + -static -include ../../../../include/nolibc/nolibc.h \ + -I../../../../../usr/include \ + -std=gnu99 -I../.. -g \ + -ffreestanding -Wall $^ -o $@ -lgcc diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c new file mode 100644 index ..3fb9742342a3 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Limited. + */ + +#include +#include + +#include + +#include +#include +#include + +#include "kselftest.h" +#include "gcs-util.h" + +/* nolibc doesn't have sysconf(), just hard code the maximum */ +static size_t page_size = 65536; + +static __attribute__((noinline)) void valid_gcs_function(void) +{ + /* Do something the compiler can't optimise out */ + my_syscall1(__NR_prctl, PR_SVE_GET_VL); +} + +static inline int gcs_set_status(unsigned long mode) +{ + bool enabling = mode & PR_SHADOW_STACK_ENABLE; + int ret; + unsigned long new_mode; + + /* +* The prctl takes 1 argument but we need to ensure that the +* other 3 values passed in registers to the syscall are zero +* since the kernel validates them. +*/ + ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, mode, + 0, 0, 0); + + if (ret == 0) { + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, + &new_mode, 0, 0, 0); + if (ret == 0) { + if (new_mode != mode) { + ksft_print_msg("Mode set to %lx not %lx\n", + new_mode, mode); + ret = -EINVAL; + } + } else { + ksft_print_msg("Failed to validate mode: %d\n", ret); + } + + if (enabling != chkfeat_gcs()) { + ksft_print_msg("%senabled by prctl but %senabled in CHKFEAT\n", + enabling ? 
"" : "not ", + chkfeat_gcs() ? "" : "not "); + ret = -EINVAL; + } + } + + return ret; +} + +/* Try to read the status */ +static bool read_status(void) +{ + unsigned long state; + int ret; + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, + &state, 0, 0, 0); + if (ret != 0) { + ksft_print_msg("Failed to read state: %d\n", ret); + return false; + } + + return state & PR_SHADOW_STACK_ENABLE; +} + +/* Just a straight enable */ +static bool base_enable(void) +{ + int ret; + + ret = gcs_set_status(PR_SHADOW_STACK_ENABLE); + if (ret) { + ksft_print_msg("PR_SHADOW_STACK_ENABLE failed %d\n", ret); + return fa
[PATCH v13 13/40] arm64/mm: Allocate PIE slots for EL0 guarded control stack
Pages used for guarded control stacks need to be described to the hardware using the Permission Indirection Extension, GCS is not supported without PIE. In order to support copy on write for guarded stacks we allocate two values, one for active GCSs and one for GCS pages marked as read only prior to copy. Since the actual effect is defined using PIE the specific bit pattern used does not matter to the hardware but we choose two values which differ only in PTE_WRITE in order to help share code with non-PIE cases. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown --- arch/arm64/include/asm/pgtable-prot.h | 14 -- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 2a11d0c10760..4e4bcd676f4c 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -144,15 +144,23 @@ static inline bool __pure lpa2_is_enabled(void) /* 6:PTE_PXN | PTE_WRITE*/ /* 7: PAGE_SHARED_EXEC PTE_PXN | PTE_WRITE | PTE_USER */ /* 8: PAGE_KERNEL_ROX PTE_UXN */ -/* 9: PTE_UXN | PTE_USER */ +/* 9: PAGE_GCS_RO PTE_UXN | PTE_USER */ /* a: PAGE_KERNEL_EXEC PTE_UXN | PTE_WRITE*/ -/* b: PTE_UXN | PTE_WRITE | PTE_USER */ +/* b: PAGE_GCS PTE_UXN | PTE_WRITE | PTE_USER */ /* c: PAGE_KERNEL_RO PTE_UXN | PTE_PXN*/ /* d: PAGE_READONLYPTE_UXN | PTE_PXN | PTE_USER */ /* e: PAGE_KERNEL PTE_UXN | PTE_PXN | PTE_WRITE*/ /* f: PAGE_SHARED PTE_UXN | PTE_PXN | PTE_WRITE | PTE_USER */ +#define _PAGE_GCS (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_WRITE | PTE_USER) +#define _PAGE_GCS_RO (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_USER) + +#define PAGE_GCS __pgprot(_PAGE_GCS) +#define PAGE_GCS_RO__pgprot(_PAGE_GCS_RO) + #define PIE_E0 ( \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS), PIE_GCS) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO),PIE_R) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \ @@ -160,6 +168,8 @@ static inline bool __pure lpa2_is_enabled(void) PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED),PIE_RW_O)) #define PIE_E1 ( \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO),PIE_NONE_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \ -- 2.39.2
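Since the two protection values are deliberately chosen to differ only in PTE_WRITE, that invariant can be checked at build time; a one-line sketch derived from the definitions above:

    /* PAGE_GCS and PAGE_GCS_RO differ only in PTE_WRITE, by construction. */
    static_assert((_PAGE_GCS ^ _PAGE_GCS_RO) == PTE_WRITE);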
[PATCH v13 33/40] kselftest/arm64: Always run signals tests with GCS enabled
Since it is not possible to return from the function that enabled GCS without disabling GCS it is very inconvenient to use the signal handling tests to cover GCS when GCS is not enabled by the toolchain and runtime, something that no current distribution does. Since none of the testcases do anything with stacks that would cause problems with GCS we can sidestep this issue by unconditionally enabling GCS on startup and exiting with a call to exit() rather than a return from main(). Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown --- .../testing/selftests/arm64/signal/test_signals.c | 17 - .../selftests/arm64/signal/test_signals_utils.h| 29 ++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/signal/test_signals.c b/tools/testing/selftests/arm64/signal/test_signals.c index 00051b40d71e..1304c8ec0f2f 100644 --- a/tools/testing/selftests/arm64/signal/test_signals.c +++ b/tools/testing/selftests/arm64/signal/test_signals.c @@ -7,6 +7,10 @@ * Each test provides its own tde struct tdescr descriptor to link with * this wrapper. Framework provides common helpers. */ + +#include +#include + #include #include "test_signals.h" @@ -16,6 +20,16 @@ struct tdescr *current = &tde; int main(int argc, char *argv[]) { + /* +* Ensure GCS is at least enabled throughout the tests if +* supported, otherwise the inability to return from the +* function that enabled GCS makes it very inconvenient to set +* up test cases. The prctl() may fail if GCS was locked by +* libc setup code. +*/ + if (getauxval(AT_HWCAP) & HWCAP_GCS) + gcs_set_state(PR_SHADOW_STACK_ENABLE); + ksft_print_msg("%s :: %s\n", current->name, current->descr); if (test_setup(current) && test_init(current)) { test_run(current); @@ -23,5 +37,6 @@ int main(int argc, char *argv[]) } test_result(current); - return current->result; + /* Do not return in case GCS was enabled */ + exit(current->result); } diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.h b/tools/testing/selftests/arm64/signal/test_signals_utils.h index 762c8fe9c54a..1e80808ee105 100644 --- a/tools/testing/selftests/arm64/signal/test_signals_utils.h +++ b/tools/testing/selftests/arm64/signal/test_signals_utils.h @@ -18,6 +18,35 @@ void test_cleanup(struct tdescr *td); int test_run(struct tdescr *td); void test_result(struct tdescr *td); +#ifndef __NR_prctl +#define __NR_prctl 167 +#endif + +/* + * The prctl takes 1 argument but we need to ensure that the other + * values passed in registers to the syscall are zero since the kernel + * validates them. + */ +#define gcs_set_state(state) \ + ({ \ + register long _num __asm__ ("x8") = __NR_prctl;\ + register long _arg1 __asm__ ("x0") = PR_SET_SHADOW_STACK_STATUS; \ + register long _arg2 __asm__ ("x1") = (long)(state); \ + register long _arg3 __asm__ ("x2") = 0; \ + register long _arg4 __asm__ ("x3") = 0; \ + register long _arg5 __asm__ ("x4") = 0; \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_arg3), "r"(_arg4), \ + "r"(_arg5), "r"(_num) \ + : "memory", "cc"\ + ); \ + _arg1; \ + }) + static inline bool feats_ok(struct tdescr *td) { if (td->feats_incompatible & td->feats_supported) -- 2.39.2
[PATCH v13 30/40] kselftest/arm64: Add GCS as a detected feature in the signal tests
In preparation for testing GCS related signal handling add it as a feature we check for in the signal handling support code. Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown --- tools/testing/selftests/arm64/signal/test_signals.h | 2 ++ tools/testing/selftests/arm64/signal/test_signals_utils.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/tools/testing/selftests/arm64/signal/test_signals.h b/tools/testing/selftests/arm64/signal/test_signals.h index 1e6273d81575..7ada43688c02 100644 --- a/tools/testing/selftests/arm64/signal/test_signals.h +++ b/tools/testing/selftests/arm64/signal/test_signals.h @@ -35,6 +35,7 @@ enum { FSME_BIT, FSME_FA64_BIT, FSME2_BIT, + FGCS_BIT, FMAX_END }; @@ -43,6 +44,7 @@ enum { #define FEAT_SME (1UL << FSME_BIT) #define FEAT_SME_FA64 (1UL << FSME_FA64_BIT) #define FEAT_SME2 (1UL << FSME2_BIT) +#define FEAT_GCS (1UL << FGCS_BIT) /* * A descriptor used to describe and configure a test case. diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c index 0dc948db3a4a..dcc49e3ce1eb 100644 --- a/tools/testing/selftests/arm64/signal/test_signals_utils.c +++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c @@ -30,6 +30,7 @@ static char const *const feats_names[FMAX_END] = { " SME ", " FA64 ", " SME2 ", + " GCS ", }; #define MAX_FEATS_SZ 128 @@ -329,6 +330,8 @@ int test_init(struct tdescr *td) td->feats_supported |= FEAT_SME_FA64; if (getauxval(AT_HWCAP2) & HWCAP2_SME2) td->feats_supported |= FEAT_SME2; + if (getauxval(AT_HWCAP) & HWCAP_GCS) + td->feats_supported |= FEAT_GCS; if (feats_ok(td)) { if (td->feats_required & td->feats_supported) fprintf(stderr, -- 2.39.2
[PATCH v13 36/40] kselftest/arm64: Add test coverage for GCS mode locking
Verify that we can lock individual GCS mode bits, that other modes aren't affected and as a side effect also that every combination of modes can be enabled. Normally the inability to reenable GCS after disabling it would be an issue with testing but fortunately the kselftest_harness runs each test within a fork()ed child. This can be inconvenient for some kinds of testing but here it means that each test is in a separate process and therefore won't be affected by other tests in the suite. Once we get toolchains with support for enabling GCS by default we will need to take care to not do that in the build system but there are no such toolchains yet so it is not yet an issue. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown --- tools/testing/selftests/arm64/gcs/.gitignore| 1 + tools/testing/selftests/arm64/gcs/Makefile | 2 +- tools/testing/selftests/arm64/gcs/gcs-locking.c | 200 3 files changed, 202 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore index 5810c4a163d4..0c86f53f68ad 100644 --- a/tools/testing/selftests/arm64/gcs/.gitignore +++ b/tools/testing/selftests/arm64/gcs/.gitignore @@ -1,2 +1,3 @@ basic-gcs libc-gcs +gcs-locking diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile index a8fdf21e9a47..2173d6275956 100644 --- a/tools/testing/selftests/arm64/gcs/Makefile +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -6,7 +6,7 @@ # nolibc. # -TEST_GEN_PROGS := basic-gcs libc-gcs +TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking LDLIBS+=-lpthread diff --git a/tools/testing/selftests/arm64/gcs/gcs-locking.c b/tools/testing/selftests/arm64/gcs/gcs-locking.c new file mode 100644 index ..989f75a491b7 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/gcs-locking.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Limited. + * + * Tests for GCS mode locking. These tests rely on both having GCS + * unconfigured on entry and on the kselftest harness running each + * test in a fork()ed process which will have its own mode. 
+ */ + +#include + +#include +#include + +#include + +#include "kselftest_harness.h" + +#include "gcs-util.h" + +#define my_syscall2(num, arg1, arg2) \ +({\ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1);\ + register long _arg2 __asm__ ("x1") = (long)(arg2);\ + register long _arg3 __asm__ ("x2") = 0; \ + register long _arg4 __asm__ ("x3") = 0; \ + register long _arg5 __asm__ ("x4") = 0; \ + \ + __asm__ volatile ( \ + "svc #0\n"\ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_arg3), "r"(_arg4), \ + "r"(_arg5), "r"(_num) \ + : "memory", "cc" \ + );\ + _arg1;\ +}) + +/* No mode bits are rejected for locking */ +TEST(lock_all_modes) +{ + int ret; + + ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, ULONG_MAX, 0, 0, 0); + ASSERT_EQ(ret, 0); +} + +FIXTURE(valid_modes) +{ +}; + +FIXTURE_VARIANT(valid_modes) +{ + unsigned long mode; +}; + +FIXTURE_VARIANT_ADD(valid_modes, enable) +{ + .mode = PR_SHADOW_STACK_ENABLE, +}; + +FIXTURE_VARIANT_ADD(valid_modes, enable_write) +{ + .mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE, +}; + +FIXTURE_VARIANT_ADD(valid_modes, enable_push) +{ + .mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH, +}; + +FIXTURE_VARIANT_ADD(valid_modes, enable_write_push) +{ + .mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | + PR_SHADOW_STACK_PUSH, +}; + +FIXTURE_SETUP(valid_modes) +{ +} + +FIXTURE_TEARDOWN(valid_modes) +{ +} + +/* We can set the mode at all */ +TEST_F(valid_modes, set) +{ + int ret; + + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + variant->mode); + ASSERT_EQ(ret, 0); + + _exit(0); +} + +/* Enabling, locking then disabling is rejected */ +TEST_F(valid_modes, enable_l
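The semantics being exercised: once a mode bit is locked, later PR_SET_SHADOW_STACK_STATUS attempts to change it should fail. A hedged sketch of the pattern outside the harness; EBUSY is an assumption here, the exact errno is the arch code's choice:

    #include <errno.h>
    #include <sys/prctl.h>

    /* Sketch: lock the enable bit, then check it can no longer be cleared. */
    static int check_lock(void)
    {
    	if (prctl(PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE,
    		  0, 0, 0))
    		return -1;

    	if (prctl(PR_LOCK_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE,
    		  0, 0, 0))
    		return -1;

    	/* This change should now be refused (EBUSY assumed). */
    	if (prctl(PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0) == -1 &&
    	    errno == EBUSY)
    		return 0;	/* locked as expected */

    	return -1;
    }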
[PATCH v13 39/40] kselftest/arm64: Enable GCS for the FP stress tests
While it's a bit off topic for them the floating point stress tests do give us some coverage of context thrashing cases, and also of active signal delivery separate to the relatively complicated framework in the actual signals tests. Have the tests enable GCS on startup, ignoring failures so they continue to work as before on systems without GCS. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown --- tools/testing/selftests/arm64/fp/assembler.h | 15 +++ tools/testing/selftests/arm64/fp/fpsimd-test.S | 2 ++ tools/testing/selftests/arm64/fp/sve-test.S| 2 ++ tools/testing/selftests/arm64/fp/za-test.S | 2 ++ tools/testing/selftests/arm64/fp/zt-test.S | 2 ++ 5 files changed, 23 insertions(+) diff --git a/tools/testing/selftests/arm64/fp/assembler.h b/tools/testing/selftests/arm64/fp/assembler.h index 9b38a0da407d..1fc46a5642c2 100644 --- a/tools/testing/selftests/arm64/fp/assembler.h +++ b/tools/testing/selftests/arm64/fp/assembler.h @@ -65,4 +65,19 @@ endfunction bl puts .endm +#define PR_SET_SHADOW_STACK_STATUS 75 +# define PR_SHADOW_STACK_ENABLE (1UL << 0) + +.macro enable_gcs + // Run with GCS + mov x0, PR_SET_SHADOW_STACK_STATUS + mov x1, PR_SHADOW_STACK_ENABLE + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x8, #__NR_prctl + svc #0 +.endm + #endif /* ! ASSEMBLER_H */ diff --git a/tools/testing/selftests/arm64/fp/fpsimd-test.S b/tools/testing/selftests/arm64/fp/fpsimd-test.S index 8b960d01ed2e..b16fb7f42e3e 100644 --- a/tools/testing/selftests/arm64/fp/fpsimd-test.S +++ b/tools/testing/selftests/arm64/fp/fpsimd-test.S @@ -215,6 +215,8 @@ endfunction // Main program entry point .globl _start function _start + enable_gcs + mov x23, #0 // signal count mov w0, #SIGINT diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S index fff60e2a25ad..2fb4f0b84476 100644 --- a/tools/testing/selftests/arm64/fp/sve-test.S +++ b/tools/testing/selftests/arm64/fp/sve-test.S @@ -378,6 +378,8 @@ endfunction // Main program entry point .globl _start function _start + enable_gcs + mov x23, #0 // Irritation signal count mov w0, #SIGINT diff --git a/tools/testing/selftests/arm64/fp/za-test.S b/tools/testing/selftests/arm64/fp/za-test.S index 095b45531640..b2603aba99de 100644 --- a/tools/testing/selftests/arm64/fp/za-test.S +++ b/tools/testing/selftests/arm64/fp/za-test.S @@ -231,6 +231,8 @@ endfunction // Main program entry point .globl _start function _start + enable_gcs + mov x23, #0 // signal count mov w0, #SIGINT diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S index b5c81e81a379..8d9609a49008 100644 --- a/tools/testing/selftests/arm64/fp/zt-test.S +++ b/tools/testing/selftests/arm64/fp/zt-test.S @@ -200,6 +200,8 @@ endfunction // Main program entry point .globl _start function _start + enable_gcs + mov x23, #0 // signal count mov w0, #SIGINT -- 2.39.2
[PATCH v13 38/40] kselftest/arm64: Add a GCS stress test
Add a stress test which runs one more process than we have CPUs spinning through a very recursive function with frequent syscalls immediately prior to return and signals being injected every 100ms. The goal is to flag up any scheduling related issues, for example failure to ensure that barriers are inserted when moving a GCS using task to another CPU. The test runs for a configurable amount of time, defaulting to 10 seconds. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown --- tools/testing/selftests/arm64/gcs/.gitignore | 2 + tools/testing/selftests/arm64/gcs/Makefile | 6 +- tools/testing/selftests/arm64/gcs/asm-offsets.h| 0 .../selftests/arm64/gcs/gcs-stress-thread.S| 311 tools/testing/selftests/arm64/gcs/gcs-stress.c | 530 + 5 files changed, 848 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore index 0c86f53f68ad..1e8d1f6b27f2 100644 --- a/tools/testing/selftests/arm64/gcs/.gitignore +++ b/tools/testing/selftests/arm64/gcs/.gitignore @@ -1,3 +1,5 @@ basic-gcs libc-gcs gcs-locking +gcs-stress +gcs-stress-thread diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile index 2173d6275956..d8b06ca51e22 100644 --- a/tools/testing/selftests/arm64/gcs/Makefile +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -6,7 +6,8 @@ # nolibc. # -TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking +TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking gcs-stress +TEST_GEN_PROGS_EXTENDED := gcs-stress-thread LDLIBS+=-lpthread @@ -18,3 +19,6 @@ $(OUTPUT)/basic-gcs: basic-gcs.c -I../../../../../usr/include \ -std=gnu99 -I../.. -g \ -ffreestanding -Wall $^ -o $@ -lgcc + +$(OUTPUT)/gcs-stress-thread: gcs-stress-thread.S + $(CC) -nostdlib $^ -o $@ diff --git a/tools/testing/selftests/arm64/gcs/asm-offsets.h b/tools/testing/selftests/arm64/gcs/asm-offsets.h new file mode 100644 index ..e69de29bb2d1 diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S b/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S new file mode 100644 index ..b88b25217da5 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S @@ -0,0 +1,311 @@ +// Program that loops for ever doing lots of recursions and system calls, +// intended to be used as part of a stress test for GCS context switching. +// +// Copyright 2015-2023 Arm Ltd + +#include + +#define sa_sz 32 +#define sa_flags 8 +#define sa_handler 0 +#define sa_mask_sz 8 + +#define si_code 8 + +#define SIGINT 2 +#define SIGABRT 6 +#define SIGUSR1 10 +#define SIGSEGV 11 +#define SIGUSR2 12 +#define SIGTERM 15 +#define SEGV_CPERR 10 + +#define SA_NODEFER 1073741824 +#define SA_SIGINFO 4 +#define ucontext_regs 184 + +#define PR_SET_SHADOW_STACK_STATUS 75 +# define PR_SHADOW_STACK_ENABLE (1UL << 0) + +#defineGCSPR_EL0 S3_3_C2_C5_1 + +.macro function name + .macro endfunction + .type \name, @function + .purgem endfunction + .endm +\name: +.endm + +// Print a single character x0 to stdout +// Clobbers x0-x2,x8 +function putc + str x0, [sp, #-16]! 
+ + mov x0, #1 // STDOUT_FILENO + mov x1, sp + mov x2, #1 + mov x8, #__NR_write + svc #0 + + add sp, sp, #16 + ret +endfunction +.globl putc + +// Print a NUL-terminated string starting at address x0 to stdout +// Clobbers x0-x3,x8 +function puts + mov x1, x0 + + mov x2, #0 +0: ldrbw3, [x0], #1 + cbz w3, 1f + add x2, x2, #1 + b 0b + +1: mov w0, #1 // STDOUT_FILENO + mov x8, #__NR_write + svc #0 + + ret +endfunction +.globl puts + +// Utility macro to print a literal string +// Clobbers x0-x4,x8 +.macro puts string + .pushsection .rodata.str1.1, "aMS", @progbits, 1 +.L__puts_literal\@: .string "\string" + .popsection + + ldr x0, =.L__puts_literal\@ + bl puts +.endm + +// Print an unsigned decimal number x0 to stdout +// Clobbers x0-x4,x8 +function putdec + mov x1, sp + str x30, [sp, #-32]!// Result can't be > 20 digits + + mov x2, #0 + strbw2, [x1, #-1]! // Write the NUL terminator + + mov x2, #10 +0: udivx3, x0, x2 // div-mod loop to generate the digits + msubx0, x3, x2, x0 + add w0, w0, #'0' + strbw0, [x1, #-1]! + mov x0, x3 + cbnzx3, 0b + + ldrbw0, [x1] + cbnzw0, 1f + mov w0, #'0'// Print "0" for 0, not "" + strbw0, [x1, #-1]! + +1: mov x0, x1 + bl puts + + ldr x30, [sp], #32 + ret
[PATCH v13 35/40] kselftest/arm64: Add a GCS test program built with the system libc
There are things, such as threads, which nolibc struggles with and which we want to add coverage for; the ABI allows us to test most of these even if libc itself does not understand GCS, so add a test application built using the system libc. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown --- tools/testing/selftests/arm64/gcs/.gitignore | 1 + tools/testing/selftests/arm64/gcs/Makefile | 4 +- tools/testing/selftests/arm64/gcs/gcs-util.h | 10 + tools/testing/selftests/arm64/gcs/libc-gcs.c | 728 +++ 4 files changed, 742 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore index 0e5e695ecba5..5810c4a163d4 100644 --- a/tools/testing/selftests/arm64/gcs/.gitignore +++ b/tools/testing/selftests/arm64/gcs/.gitignore @@ -1 +1,2 @@ basic-gcs +libc-gcs diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile index 61a30f483429..a8fdf21e9a47 100644 --- a/tools/testing/selftests/arm64/gcs/Makefile +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -6,7 +6,9 @@ # nolibc. # -TEST_GEN_PROGS := basic-gcs +TEST_GEN_PROGS := basic-gcs libc-gcs + +LDLIBS+=-lpthread include ../../lib.mk diff --git a/tools/testing/selftests/arm64/gcs/gcs-util.h b/tools/testing/selftests/arm64/gcs/gcs-util.h index 1ae6864d3f86..c99a6b39ac14 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-util.h +++ b/tools/testing/selftests/arm64/gcs/gcs-util.h @@ -16,6 +16,16 @@ #define __NR_prctl 167 #endif +#ifndef NT_ARM_GCS +#define NT_ARM_GCS 0x410 + +struct user_gcs { + __u64 features_enabled; + __u64 features_locked; + __u64 gcspr_el0; +}; +#endif + /* Shadow Stack/Guarded Control Stack interface */ #define PR_GET_SHADOW_STACK_STATUS 74 #define PR_SET_SHADOW_STACK_STATUS 75 diff --git a/tools/testing/selftests/arm64/gcs/libc-gcs.c b/tools/testing/selftests/arm64/gcs/libc-gcs.c new file mode 100644 index 000000000000..17b2fabfec38 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/libc-gcs.c @@ -0,0 +1,728 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 ARM Limited.
+ */
+
+#define _GNU_SOURCE
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include "kselftest_harness.h"
+
+#include "gcs-util.h"
+
+#define my_syscall2(num, arg1, arg2) \
+({ \
+	register long _num __asm__ ("x8") = (num); \
+	register long _arg1 __asm__ ("x0") = (long)(arg1); \
+	register long _arg2 __asm__ ("x1") = (long)(arg2); \
+	register long _arg3 __asm__ ("x2") = 0; \
+	register long _arg4 __asm__ ("x3") = 0; \
+	register long _arg5 __asm__ ("x4") = 0; \
+	\
+	__asm__ volatile ( \
+		"svc #0\n" \
+		: "=r"(_arg1) \
+		: "r"(_arg1), "r"(_arg2), \
+		  "r"(_arg3), "r"(_arg4), \
+		  "r"(_arg5), "r"(_num) \
+		: "memory", "cc" \
+	); \
+	_arg1; \
+})
+
+static noinline void gcs_recurse(int depth)
+{
+	if (depth)
+		gcs_recurse(depth - 1);
+
+	/* Prevent tail call optimization so we actually recurse */
+	asm volatile("dsb sy" : : : "memory");
+}
+
+/* Smoke test that a function call and return works */
+TEST(can_call_function)
+{
+	gcs_recurse(0);
+}
+
+static void *gcs_test_thread(void *arg)
+{
+	int ret;
+	unsigned long mode;
+
+	/*
+	 * Some libcs don't seem to fill unused arguments with 0 but
+	 * the kernel validates this so we supply all 5 arguments.
+	 */
+	ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0);
+	if (ret != 0) {
+		ksft_print_msg("PR_GET_SHADOW_STACK_STATUS failed: %d\n", ret);
+		return NULL;
+	}
+
+	if (!(mode & PR_SHADOW_STACK_ENABLE)) {
+		ksft_print_msg("GCS not enabled in thread, mode is %lu\n",
+			       mode);
+		return NULL;
+	}
+
+	/* Just in case... */
+	gcs_recurse(0);
+
+	/* Use a non-NULL value to indicate a pas
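As an aside, the my_syscall2() macro above exists because of the requirement mentioned in that comment: the kernel validates that the unused prctl() arguments are zero, and not every libc wrapper guarantees that. A standalone sketch of the same raw call, mirroring the macro (prctl constants taken from gcs-util.h earlier in this series):

	#define __NR_prctl			167
	#define PR_GET_SHADOW_STACK_STATUS	74

	/*
	 * Raw prctl() with every unused argument explicitly zeroed; the
	 * kernel rejects the call if x2-x4 are not zero. Sketch only.
	 */
	static long get_gcs_status(unsigned long *mode)
	{
		register long x8 __asm__("x8") = __NR_prctl;
		register long x0 __asm__("x0") = PR_GET_SHADOW_STACK_STATUS;
		register long x1 __asm__("x1") = (long)mode;
		register long x2 __asm__("x2") = 0;
		register long x3 __asm__("x3") = 0;
		register long x4 __asm__("x4") = 0;

		__asm__ volatile("svc #0"
				 : "+r" (x0)
				 : "r" (x1), "r" (x2), "r" (x3), "r" (x4), "r" (x8)
				 : "memory", "cc");

		return x0;
	}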
[PATCH v13 37/40] kselftest/arm64: Add GCS signal tests
Do some testing of the signal handling for GCS, checking that a GCS frame has the expected information in it and that the expected signals are delivered for invalid operations. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown --- tools/testing/selftests/arm64/signal/.gitignore | 1 + .../selftests/arm64/signal/test_signals_utils.h | 10 +++ .../arm64/signal/testcases/gcs_exception_fault.c | 62 +++ .../selftests/arm64/signal/testcases/gcs_frame.c | 88 ++ .../arm64/signal/testcases/gcs_write_fault.c | 67 5 files changed, 228 insertions(+) diff --git a/tools/testing/selftests/arm64/signal/.gitignore b/tools/testing/selftests/arm64/signal/.gitignore index b2f2bfd5c6aa..b257db665a35 100644 --- a/tools/testing/selftests/arm64/signal/.gitignore +++ b/tools/testing/selftests/arm64/signal/.gitignore @@ -3,6 +3,7 @@ mangle_* fake_sigreturn_* fpmr_* poe_* +gcs_* sme_* ssve_* sve_* diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.h b/tools/testing/selftests/arm64/signal/test_signals_utils.h index 1e80808ee105..36fc12b3cd60 100644 --- a/tools/testing/selftests/arm64/signal/test_signals_utils.h +++ b/tools/testing/selftests/arm64/signal/test_signals_utils.h @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -47,6 +48,15 @@ void test_result(struct tdescr *td); _arg1; \ })
+
+static inline __attribute__((always_inline)) uint64_t get_gcspr_el0(void)
+{
+	uint64_t val;
+
+	asm volatile("mrs %0, S3_3_C2_C5_1" : "=r" (val));
+
+	return val;
+}
+
 static inline bool feats_ok(struct tdescr *td)
 {
 	if (td->feats_incompatible & td->feats_supported)
diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c b/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c
new file mode 100644
index 000000000000..6228448b2ae7
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 ARM Limited
+ */
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+/*
+ * We should get this from asm/siginfo.h but the testsuite is being
+ * clever with redefining siginfo_t.
+ */
+#ifndef SEGV_CPERR
+#define SEGV_CPERR 10
+#endif
+
+static inline void gcsss1(uint64_t Xt)
+{
+	asm volatile (
+		"sys #3, C7, C7, #2, %0\n"
+		:
+		: "rZ" (Xt)
+		: "memory");
+}
+
+static int gcs_op_fault_trigger(struct tdescr *td)
+{
+	/*
+	 * The slot below our current GCS should be in a valid GCS but
+	 * must not have a valid cap in it.
+	 */
+	gcsss1(get_gcspr_el0() - 8);
+
+	return 0;
+}
+
+static int gcs_op_fault_signal(struct tdescr *td, siginfo_t *si,
+			       ucontext_t *uc)
+{
+	ASSERT_GOOD_CONTEXT(uc);
+
+	return 1;
+}
+
+struct tdescr tde = {
+	.name = "Invalid GCS operation",
+	.descr = "An invalid GCS operation generates the expected signal",
+	.feats_required = FEAT_GCS,
+	.timeout = 3,
+	.sig_ok = SIGSEGV,
+	.sig_ok_code = SEGV_CPERR,
+	.sanity_disabled = true,
+	.trigger = gcs_op_fault_trigger,
+	.run = gcs_op_fault_signal,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c b/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c
new file mode 100644
index 000000000000..b405d82321da
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 ARM Limited
+ */
+
+#include
+#include
+#include
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+static union {
+	ucontext_t uc;
+	char buf[1024 * 64];
+} context;
+
+static int gcs_regs(struct tdescr *td, siginfo_t *si, ucontext_t *uc)
+{
+	size_t offset;
+	struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context);
+	struct gcs_context *gcs;
+	unsigned long expected, gcspr;
+	uint64_t *u64_val;
+	int ret;
+
+	ret = prctl(PR_GET_SHADOW_STACK_STATUS, &expected, 0, 0, 0);
+	if (ret != 0) {
+		fprintf(stderr, "Unable to query GCS status\n");
+		return 1;
+	}
+
+	/* We expect a cap to be added to the GCS in the signal frame */
+	gcspr = get_gcspr_el0();
+	gcspr -= 8;
+	fprintf(stderr, "Expecting GCSPR_EL0 %lx\n", gcspr);
+
+	if (!get_current_context(td, &context.uc, sizeof(context))) {
+		fprintf(stderr, "Failed getting context\n");
+		return 1;
+	}
+
+	/* Ensure that the signal restore token was consumed */
+	u64_val = (uint64_t *)get_gcspr_el0() + 1;
+
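The "gcspr -= 8" above encodes the ABI expectation that signal delivery pushes one 8-byte cap onto the GCS. Stated as a standalone check (a sketch; in the real test the two values come from GCSPR_EL0 and the gcs_context record in the signal frame):

	#include <assert.h>
	#include <stdint.h>

	/*
	 * Invariant checked by gcs_regs(): on signal entry the kernel pushes
	 * an 8-byte cap, so the GCSPR_EL0 reported in the signal frame sits
	 * one slot below the interrupted value. Sketch only.
	 */
	static void check_signal_gcspr(uint64_t gcspr_before, uint64_t gcspr_in_frame)
	{
		assert(gcspr_in_frame == gcspr_before - 8);
	}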
[PATCH v13 20/40] arm64/mm: Handle GCS data aborts
All GCS operations at EL0 must happen on a page which is marked as having UnprivGCS access, including read operations. If a GCS operation attempts to access a page without this then it will generate a data abort with the GCS bit set in ESR_EL1.ISS2. EL0 may validly generate such faults, for example due to copy on write which will cause the GCS data to be stored in a read only page with no GCS permissions until the actual copy happens. Since UnprivGCS allows both reads and writes to the GCS (though only through GCS operations) we need to ensure that the memory management subsystem handles GCS accesses as writes at all times. Do this by adding FAULT_FLAG_WRITE to any GCS page faults, and add handling to ensure that invalid cases are identified as such early on so the memory management core does not think they will succeed. The core cannot distinguish between VMAs which are generally writeable and VMAs which are only writeable through GCS operations. EL1 may validly write to an EL0 GCS for management purposes (e.g. while initialising with cap tokens). We also report any GCS faults in VMAs not marked as part of a GCS as access violations, causing a fault to be delivered to userspace if it attempts to do GCS operations outside a GCS. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown --- arch/arm64/mm/fault.c | 40 1 file changed, 40 insertions(+) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 8b281cf308b3..c2f89a678ac0 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -504,6 +504,14 @@ static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
 					false);
 }
 
+static bool is_gcs_fault(unsigned long esr)
+{
+	if (!esr_is_data_abort(esr))
+		return false;
+
+	return ESR_ELx_ISS2(esr) & ESR_ELx_GCS;
+}
+
 static bool is_el0_instruction_abort(unsigned long esr)
 {
 	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
@@ -518,6 +526,23 @@ static bool is_write_abort(unsigned long esr)
 	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
 }
 
+static bool is_invalid_gcs_access(struct vm_area_struct *vma, u64 esr)
+{
+	if (!system_supports_gcs())
+		return false;
+
+	if (unlikely(is_gcs_fault(esr))) {
+		/* GCS accesses must be performed on a GCS page */
+		if (!(vma->vm_flags & VM_SHADOW_STACK))
+			return true;
+	} else if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) {
+		/* Only GCS operations can write to a GCS page */
+		return esr_is_data_abort(esr) && is_write_abort(esr);
+	}
+
+	return false;
+}
+
 static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 				   struct pt_regs *regs)
 {
@@ -554,6 +579,14 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 		/* It was exec fault */
 		vm_flags = VM_EXEC;
 		mm_flags |= FAULT_FLAG_INSTRUCTION;
+	} else if (is_gcs_fault(esr)) {
+		/*
+		 * The GCS permission on a page implies both read and
+		 * write so always handle any GCS fault as a write fault;
+		 * we need to trigger CoW even for GCS reads.
+		 */
+		vm_flags = VM_WRITE;
+		mm_flags |= FAULT_FLAG_WRITE;
 	} else if (is_write_abort(esr)) {
 		/* It was write fault */
 		vm_flags = VM_WRITE;
@@ -587,6 +620,13 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 	if (!vma)
 		goto lock_mmap;
 
+	if (is_invalid_gcs_access(vma, esr)) {
+		vma_end_read(vma);
+		fault = 0;
+		si_code = SEGV_ACCERR;
+		goto bad_area;
+	}
+
 	if (!(vma->vm_flags & vm_flags)) {
 		vma_end_read(vma);
 		fault = 0;
-- 2.39.2
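The copy-on-write scenario described above can be hit from entirely ordinary code; a sketch of the sequence (assumes a binary that already runs with GCS enabled):

	#include <sys/wait.h>
	#include <unistd.h>

	/*
	 * After fork() the child's GCS pages are read-only CoW copies. The
	 * first function call in the child writes a return address to the
	 * GCS, which must be handled as a write fault so the kernel breaks
	 * CoW rather than delivering SIGSEGV. Sketch of the scenario only,
	 * not a complete test.
	 */
	static void child_work(void) { }	/* call + return touches the GCS */

	int main(void)
	{
		pid_t pid = fork();

		if (pid == 0) {
			child_work();
			_exit(0);
		}
		waitpid(pid, NULL, 0);
		return 0;
	}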
[PATCH v13 31/40] kselftest/arm64: Add framework support for GCS to signal handling tests
Teach the framework about the GCS signal context, avoiding warnings on the unknown context. Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown --- tools/testing/selftests/arm64/signal/testcases/testcases.c | 7 +++ tools/testing/selftests/arm64/signal/testcases/testcases.h | 1 + 2 files changed, 8 insertions(+) diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c index e6daa94fcd2e..0c1a6b26afac 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.c +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c @@ -198,6 +198,13 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err)
 			*err = "Bad size for fpmr_context";
 		new_flags |= FPMR_CTX;
 		break;
+	case GCS_MAGIC:
+		if (flags & GCS_CTX)
+			*err = "Multiple GCS_MAGIC";
+		if (head->size != sizeof(struct gcs_context))
+			*err = "Bad size for gcs_context";
+		new_flags |= GCS_CTX;
+		break;
 	case EXTRA_MAGIC:
 		if (flags & EXTRA_CTX)
 			*err = "Multiple EXTRA_MAGIC";
diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h b/tools/testing/selftests/arm64/signal/testcases/testcases.h
index 9872b8912714..98b97efdda23 100644
--- a/tools/testing/selftests/arm64/signal/testcases/testcases.h
+++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h
@@ -20,6 +20,7 @@
 #define EXTRA_CTX (1 << 3)
 #define ZT_CTX (1 << 4)
 #define FPMR_CTX (1 << 5)
+#define GCS_CTX (1 << 6)
 
 #define KSFT_BAD_MAGIC 0xdeadbeef
-- 2.39.2
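For reference, the gcs_context record being validated here is defined in the sigcontext uapi added earlier in the series; a sketch of its expected shape, following the selftests' usual ifndef-guard pattern (the exact field names here are assumptions, the authoritative definition lives in arch/arm64/include/uapi/asm/sigcontext.h):

	#include <asm/sigcontext.h>

	#ifndef GCS_MAGIC
	#define GCS_MAGIC	0x47435300

	/* Sketch of the record checked in validate_reserved() above. */
	struct gcs_context {
		struct _aarch64_ctx head;	/* .magic == GCS_MAGIC, .size == sizeof() */
		__u64 gcspr;			/* GCSPR_EL0 at signal delivery */
		__u64 features_enabled;		/* PR_SHADOW_STACK_* flags */
		__u64 reserved;
	};
	#endif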
[PATCH v13 32/40] kselftest/arm64: Allow signals tests to specify an expected si_code
Currently we ignore si_code unless the expected signal is a SIGSEGV, in which case we enforce it being SEGV_ACCERR. Allow test cases to specify exactly which si_code should be generated so we can validate this, and test for other segfault codes. Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown --- .../testing/selftests/arm64/signal/test_signals.h | 4 +++ .../selftests/arm64/signal/test_signals_utils.c | 29 ++ 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/arm64/signal/test_signals.h b/tools/testing/selftests/arm64/signal/test_signals.h index 7ada43688c02..ee75a2c25ce7 100644 --- a/tools/testing/selftests/arm64/signal/test_signals.h +++ b/tools/testing/selftests/arm64/signal/test_signals.h @@ -71,6 +71,10 @@ struct tdescr {
 	 * Zero when no signal is expected on success
 	 */
 	int sig_ok;
+	/*
+	 * expected si_code for sig_ok, or 0 to not check
+	 */
+	int sig_ok_code;
 	/* signum expected on unsupported CPU features. */
 	int sig_unsupp;
 	/* a timeout in second for test completion */
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c
index dcc49e3ce1eb..5d3621921cfe 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.c
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c
@@ -143,16 +143,25 @@ static bool handle_signal_ok(struct tdescr *td,
 			"current->token ZEROED...test is probably broken!\n");
 		abort();
 	}
-	/*
-	 * Trying to narrow down the SEGV to the ones generated by Kernel itself
-	 * via arm64_notify_segfault(). This is a best-effort check anyway, and
-	 * the si_code check may need to change if this aspect of the kernel
-	 * ABI changes.
-	 */
-	if (td->sig_ok == SIGSEGV && si->si_code != SEGV_ACCERR) {
-		fprintf(stdout,
-			"si_code != SEGV_ACCERR...test is probably broken!\n");
-		abort();
+	if (td->sig_ok_code) {
+		if (si->si_code != td->sig_ok_code) {
+			fprintf(stdout, "si_code is %d not %d\n",
+				si->si_code, td->sig_ok_code);
+			abort();
+		}
+	} else {
+		/*
+		 * Trying to narrow down the SEGV to the ones
+		 * generated by Kernel itself via
+		 * arm64_notify_segfault(). This is a best-effort
+		 * check anyway, and the si_code check may need to
+		 * change if this aspect of the kernel ABI changes.
+		 */
+		if (td->sig_ok == SIGSEGV && si->si_code != SEGV_ACCERR) {
+			fprintf(stdout,
+				"si_code != SEGV_ACCERR...test is probably broken!\n");
+			abort();
+		}
 	}
 	td->pass = 1;
 /*
-- 2.39.2
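With this in place a test case can pin the exact si_code it expects. A usage excerpt, mirroring the GCS exception-fault test from patch 37 earlier in this digest:

	/* Descriptor excerpt: expect SIGSEGV with the GCS-specific code. */
	struct tdescr tde = {
		.name = "Invalid GCS operation",
		.descr = "An invalid GCS operation generates the expected signal",
		.sig_ok = SIGSEGV,
		.sig_ok_code = SEGV_CPERR,	/* matched exactly by handle_signal_ok() */
		/* ... trigger/run callbacks as usual ... */
	};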
[PATCH v13 23/40] arm64/gcs: Implement shadow stack prctl() interface
Implement the architecture-neutral prctl() interface for setting the shadow stack status; this supports setting and reading the current GCS configuration for the current thread. Userspace can enable basic GCS functionality and additionally enable support for GCS pushes and arbitrary GCS stores. It is expected that this prctl() will be called very early in application startup, for example by the dynamic linker, and not subsequently adjusted during normal operation. Users should carefully note that after enabling GCS for a thread, GCS will become active with no call stack, so it is not normally possible to return from the function that invoked the prctl(). State is stored per thread; enabling GCS for a thread causes a GCS to be allocated for that thread. Userspace may lock the current GCS configuration by specifying PR_SHADOW_STACK_ENABLE_LOCK; this prevents any further changes to the GCS configuration via any means. If GCS is not being enabled then all flags other than _LOCK are ignored; it is not possible to enable stores or pushes without enabling GCS. When disabling the GCS we do not free the allocated stack; this allows for inspection of the GCS after disabling as part of fault reporting. Since it is not an expected use case, and since it presents some complications in determining what to do with previously initialised data on the GCS, attempts to reenable GCS after this are rejected. This can be revisited if a use case arises. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown --- arch/arm64/include/asm/gcs.h | 22 +++ arch/arm64/include/asm/processor.h | 1 + arch/arm64/mm/gcs.c | 79 ++ 3 files changed, 102 insertions(+) diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index c1f274fdb9c0..48c97e63e56a 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -50,6 +50,9 @@ static inline u64 gcsss2(void)
 	return Xt;
 }
 
+#define PR_SHADOW_STACK_SUPPORTED_STATUS_MASK \
+	(PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | PR_SHADOW_STACK_PUSH)
+
 #ifdef CONFIG_ARM64_GCS
 
 static inline bool task_gcs_el0_enabled(struct task_struct *task)
@@ -63,6 +66,20 @@ void gcs_preserve_current_state(void);
 unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
 				     const struct kernel_clone_args *args);
 
+static inline int gcs_check_locked(struct task_struct *task,
+				   unsigned long new_val)
+{
+	unsigned long cur_val = task->thread.gcs_el0_mode;
+
+	cur_val &= task->thread.gcs_el0_locked;
+	new_val &= task->thread.gcs_el0_locked;
+
+	if (cur_val != new_val)
+		return -EBUSY;
+
+	return 0;
+}
+
 #else
 
 static inline bool task_gcs_el0_enabled(struct task_struct *task)
@@ -78,6 +95,11 @@ static inline unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
 {
 	return -ENOTSUPP;
 }
+static inline int gcs_check_locked(struct task_struct *task,
+				   unsigned long new_val)
+{
+	return 0;
+}
 
 #endif
 
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 5260788247d8..37fefdc3d3a3 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -187,6 +187,7 @@ struct thread_struct {
 	u64 por_el0;
 #ifdef CONFIG_ARM64_GCS
 	unsigned int gcs_el0_mode;
+	unsigned int gcs_el0_locked;
 	u64 gcspr_el0;
 	u64 gcs_base;
 	u64 gcs_size;
diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c
index 3c7a18f57ea9..61a80de6baf8 100644
--- a/arch/arm64/mm/gcs.c
+++ b/arch/arm64/mm/gcs.c
@@ -109,3 +109,82 @@ void gcs_free(struct task_struct *task)
 	task->thread.gcs_base = 0;
 	task->thread.gcs_size = 0;
 }
+
+int arch_set_shadow_stack_status(struct task_struct *task, unsigned long arg)
+{
+	unsigned long gcs, size;
+	int ret;
+
+	if (!system_supports_gcs())
+		return -EINVAL;
+
+	if (is_compat_thread(task_thread_info(task)))
+		return -EINVAL;
+
+	/* Reject unknown flags */
+	if (arg & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK)
+		return -EINVAL;
+
+	ret = gcs_check_locked(task, arg);
+	if (ret != 0)
+		return ret;
+
+	/* If we are enabling GCS then make sure we have a stack */
+	if (arg & PR_SHADOW_STACK_ENABLE &&
+	    !task_gcs_el0_enabled(task)) {
+		/* Do not allow GCS to be reenabled */
+		if (task->thread.gcs_base || task->thread.gcspr_el0)
+			return -EINVAL;
+
+		if (task != current)
+			return -EBUSY;
+
+		size = gcs_size(0);
+		gcs = alloc_gcs(0, size);
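A sketch of the expected early-startup usage from userspace follows. The GET/SET prctl numbers are taken from gcs-util.h earlier in this series; the LOCK number is an assumption, and note the caveat above about not returning through frames older than the enabling prctl():

	#include <stdlib.h>
	#include <sys/prctl.h>

	#define PR_SET_SHADOW_STACK_STATUS	75
	#define PR_LOCK_SHADOW_STACK_STATUS	76	/* assumed from the series */
	#define PR_SHADOW_STACK_ENABLE		(1UL << 0)

	int main(void)
	{
		/*
		 * Enable GCS; from here on the GCS records an empty call
		 * stack, so returning from main() would fail its GCS check.
		 */
		if (prctl(PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE,
			  0, 0, 0))
			exit(1);	/* unsupported, or configuration locked */

		/* Lock the configuration so nothing can disable GCS later. */
		prctl(PR_LOCK_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0);

		/* Calls made (and returned from) after this point are tracked. */
		exit(0);	/* exit() rather than return, per the note above */
	}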
[PATCH v13 15/40] arm64/mm: Map pages for guarded control stack
Map pages flagged as being part of a GCS as such rather than using the full set of generic VM flags. This is done using a conditional rather than extending the size of protection_map since that would make for a very sparse array. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown --- arch/arm64/include/asm/mman.h | 9 + arch/arm64/mm/mmap.c | 9 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index 03b790fd0ad8..f6d784f8e6e0 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -71,6 +71,15 @@ static inline bool arch_validate_flags(unsigned long vm_flags)
 		return false;
 	}
 
+	if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) {
+		/* An executable GCS isn't a good idea. */
+		if (vm_flags & VM_EXEC)
+			return false;
+
+		/* The memory management core should prevent this */
+		VM_WARN_ON(vm_flags & VM_SHARED);
+	}
+
 	return true;
 }
 
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 7e3ad97e27d8..07aeab8a7606 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -83,8 +83,15 @@ arch_initcall(adjust_protection_map);
 
 pgprot_t vm_get_page_prot(unsigned long vm_flags)
 {
-	pteval_t prot = pgprot_val(protection_map[vm_flags &
-				   (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
+	pteval_t prot;
+
+	/* Short circuit GCS to avoid bloating the table. */
+	if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) {
+		prot = _PAGE_GCS_RO;
+	} else {
+		prot = pgprot_val(protection_map[vm_flags &
+				  (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
+	}
 
 	if (vm_flags & VM_ARM64_BTI)
 		prot |= PTE_GP;
-- 2.39.2
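For completeness, memory that takes this vm_get_page_prot() path is created with the map_shadow_stack() syscall added earlier in the series, not by passing a new PROT_* flag to mmap(). A usage sketch (the syscall number and token flag are reproduced from the series; treat them as assumptions here):

	#include <stddef.h>
	#include <unistd.h>

	#ifndef __NR_map_shadow_stack
	#define __NR_map_shadow_stack	453
	#endif
	#ifndef SHADOW_STACK_SET_TOKEN
	#define SHADOW_STACK_SET_TOKEN	(1ULL << 0)	/* place a cap at the top */
	#endif

	/*
	 * Allocate a GCS; the kernel maps it with the GCS-specific
	 * protections chosen by vm_get_page_prot() above, rather than
	 * with ordinary PROT_* permissions. Sketch only.
	 */
	static void *alloc_user_gcs(size_t size)
	{
		return (void *)syscall(__NR_map_shadow_stack, 0, size,
				       SHADOW_STACK_SET_TOKEN);
	}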