Re: [PATCH v13 04/40] prctl: arch-agnostic prctl for shadow stack

2024-10-01 Thread Deepak Gupta

On Tue, Oct 01, 2024 at 11:58:43PM +0100, Mark Brown wrote:

Three architectures (x86, aarch64, riscv) have announced support for
shadow stacks with fairly similar functionality.  While x86 is using
arch_prctl() to control the functionality, neither arm64 nor riscv uses
that interface, so this patch adds arch-agnostic prctl() support to
get and set status of shadow stacks and lock the current configuration to
prevent further changes, with support for turning on and off individual
subfeatures so applications can limit their exposure to features that
they do not need.  The features are:

 - PR_SHADOW_STACK_ENABLE: Tracking and enforcement of shadow stacks,
   including allocation of a shadow stack if one is not already
   allocated.
 - PR_SHADOW_STACK_WRITE: Writes to specific addresses in the shadow
   stack.
 - PR_SHADOW_STACK_PUSH: Push additional values onto the shadow stack.

These features are expected to be inherited by new threads and cleared
on exec(); unknown features should be rejected for enable but accepted
for locking (in order to allow for future proofing).

This is based on a patch originally written by Deepak Gupta but modified
fairly heavily, support for indirect landing pads is removed, additional
modes added and the locking interface reworked.  The set status prctl()
is also reworked to just set flags; if setting/reading the shadow stack
pointer is required this could be a separate prctl.

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Acked-by: Yury Khrustalev 
Signed-off-by: Mark Brown 
---
include/linux/mm.h |  4 
include/uapi/linux/prctl.h | 22 ++
kernel/sys.c   | 30 ++
3 files changed, 56 insertions(+)


Reviewed-by: Deepak Gupta 







[PATCH v13 40/40] KVM: selftests: arm64: Add GCS registers to get-reg-list

2024-10-01 Thread Mark Brown
GCS adds new registers GCSCR_EL1, GCSCRE0_EL1, GCSPR_EL1 and GCSPR_EL0. Add
these to those validated by get-reg-list.

Reviewed-by: Thiago Jung Bauermann 
Signed-off-by: Mark Brown 
---
 tools/testing/selftests/kvm/aarch64/get-reg-list.c | 28 ++
 1 file changed, 28 insertions(+)

diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c 
b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
index d43fb3f49050..c17451069a15 100644
--- a/tools/testing/selftests/kvm/aarch64/get-reg-list.c
+++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
@@ -29,6 +29,24 @@ static struct feature_id_reg feat_id_regs[] = {
0,
1
},
+   {
+   ARM64_SYS_REG(3, 0, 2, 5, 0),   /* GCSCR_EL1 */
+   ARM64_SYS_REG(3, 0, 0, 4, 1),   /* ID_AA64PFR1_EL1 */
+   44,
+   1
+   },
+   {
+   ARM64_SYS_REG(3, 0, 2, 5, 1),   /* GCSPR_EL1 */
+   ARM64_SYS_REG(3, 0, 0, 4, 1),   /* ID_AA64PFR1_EL1 */
+   44,
+   1
+   },
+   {
+   ARM64_SYS_REG(3, 0, 2, 5, 2),   /* GCSCRE0_EL1 */
+   ARM64_SYS_REG(3, 0, 0, 4, 1),   /* ID_AA64PFR1_EL1 */
+   44,
+   1
+   },
{
ARM64_SYS_REG(3, 0, 10, 2, 2),  /* PIRE0_EL1 */
ARM64_SYS_REG(3, 0, 0, 7, 3),   /* ID_AA64MMFR3_EL1 */
@@ -52,6 +70,12 @@ static struct feature_id_reg feat_id_regs[] = {
ARM64_SYS_REG(3, 0, 0, 7, 3),   /* ID_AA64MMFR3_EL1 */
16,
1
+   },
+   {
+   ARM64_SYS_REG(3, 3, 2, 5, 1),   /* GCSPR_EL0 */
+   ARM64_SYS_REG(3, 0, 0, 4, 1),   /* ID_AA64PFR1_EL1 */
+   44,
+   1
}
 };
 
@@ -472,6 +496,9 @@ static __u64 base_regs[] = {
ARM64_SYS_REG(3, 0, 2, 0, 1),   /* TTBR1_EL1 */
ARM64_SYS_REG(3, 0, 2, 0, 2),   /* TCR_EL1 */
ARM64_SYS_REG(3, 0, 2, 0, 3),   /* TCR2_EL1 */
+   ARM64_SYS_REG(3, 0, 2, 5, 0),   /* GCSCR_EL1 */
+   ARM64_SYS_REG(3, 0, 2, 5, 1),   /* GCSPR_EL1 */
+   ARM64_SYS_REG(3, 0, 2, 5, 2),   /* GCSCRE0_EL1 */
ARM64_SYS_REG(3, 0, 5, 1, 0),   /* AFSR0_EL1 */
ARM64_SYS_REG(3, 0, 5, 1, 1),   /* AFSR1_EL1 */
ARM64_SYS_REG(3, 0, 5, 2, 0),   /* ESR_EL1 */
@@ -488,6 +515,7 @@ static __u64 base_regs[] = {
ARM64_SYS_REG(3, 0, 13, 0, 4),  /* TPIDR_EL1 */
ARM64_SYS_REG(3, 0, 14, 1, 0),  /* CNTKCTL_EL1 */
ARM64_SYS_REG(3, 2, 0, 0, 0),   /* CSSELR_EL1 */
+   ARM64_SYS_REG(3, 3, 2, 5, 1),   /* GCSPR_EL0 */
ARM64_SYS_REG(3, 3, 10, 2, 4),  /* POR_EL0 */
ARM64_SYS_REG(3, 3, 13, 0, 2),  /* TPIDR_EL0 */
ARM64_SYS_REG(3, 3, 13, 0, 3),  /* TPIDRRO_EL0 */

-- 
2.39.2




[PATCH v13 09/40] arm64/gcs: Add manual encodings of GCS instructions

2024-10-01 Thread Mark Brown
Define C callable functions for GCS instructions used by the kernel. In
order to avoid ambitious toolchain requirements for GCS support these are
manually encoded; this means we have fixed register numbers which will be
a bit limiting for the compiler but none of these should be used in
sufficiently fast paths for this to be a problem.

Note that GCSSTTR is used to store to EL0.

Reviewed-by: Thiago Jung Bauermann 
Acked-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 arch/arm64/include/asm/gcs.h | 51 
 arch/arm64/include/asm/uaccess.h | 22 +
 2 files changed, 73 insertions(+)

diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h
new file mode 100644
index ..7c5e95218db6
--- /dev/null
+++ b/arch/arm64/include/asm/gcs.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 ARM Ltd.
+ */
+#ifndef __ASM_GCS_H
+#define __ASM_GCS_H
+
+#include <asm/types.h>
+#include <asm/uaccess.h>
+
+static inline void gcsb_dsync(void)
+{
+   asm volatile(".inst 0xd503227f" : : : "memory");
+}
+
+static inline void gcsstr(u64 *addr, u64 val)
+{
+   register u64 *_addr __asm__ ("x0") = addr;
+   register long _val __asm__ ("x1") = val;
+
+   /* GCSSTTR x1, x0 */
+   asm volatile(
+   ".inst 0xd91f1c01\n"
+   :
+   : "rZ" (_val), "r" (_addr)
+   : "memory");
+}
+
+static inline void gcsss1(u64 Xt)
+{
+   asm volatile (
+   "sys #3, C7, C7, #2, %0\n"
+   :
+   : "rZ" (Xt)
+   : "memory");
+}
+
+static inline u64 gcsss2(void)
+{
+   u64 Xt;
+
+   asm volatile(
+   "SYSL %0, #3, C7, C7, #3\n"
+   : "=r" (Xt)
+   :
+   : "memory");
+
+   return Xt;
+}
+
+#endif
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 1aa4ecb73429..0db494b24dd0 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -502,4 +502,26 @@ static inline size_t probe_subpage_writeable(const char 
__user *uaddr,
 
 #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */
 
+#ifdef CONFIG_ARM64_GCS
+
+static inline int gcssttr(unsigned long __user *addr, unsigned long val)
+{
+   register unsigned long __user *_addr __asm__ ("x0") = addr;
+   register unsigned long _val __asm__ ("x1") = val;
+   int err = 0;
+
+   /* GCSSTTR x1, x0 */
+   asm volatile(
+   "1: .inst 0xd91f1c01\n"
+   "2: \n"
+   _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0)
+   : "+r" (err)
+   : "rZ" (_val), "r" (_addr)
+   : "memory");
+
+   return err;
+}
+
+#endif /* CONFIG_ARM64_GCS */
+
 #endif /* __ASM_UACCESS_H */

-- 
2.39.2




[PATCH v13 00/40] arm64/gcs: Provide support for GCS in userspace

2024-10-01 Thread Mark Brown
The arm64 Guarded Control Stack (GCS) feature provides support for
hardware protected stacks of return addresses, intended to provide
hardening against return oriented programming (ROP) attacks and to make
it easier to gather call stacks for applications such as profiling.

When GCS is active a secondary stack called the Guarded Control Stack is
maintained, protected with a memory attribute which means that it can
only be written with specific GCS operations.  The current GCS pointer
can not be directly written to by userspace.  When a BL is executed the
value stored in LR is also pushed onto the GCS, and when a RET is
executed the top of the GCS is popped and compared to LR with a fault
being raised if the values do not match.  GCS operations may only be
performed on GCS pages, a data abort is generated if they are not.

The combination of hardware enforcement and lack of extra instructions
in the function entry and exit paths should result in something which
has less overhead and is more difficult to attack than a purely software
implementation like clang's shadow stacks.

This series implements support for use of GCS by userspace, along with
support for use of GCS within KVM guests.  It does not enable use of GCS
by either EL1 or EL2; this will be implemented separately.  Executables
are started without GCS and must use a prctl() to enable it; it is
expected that this will be done very early in application execution by
the dynamic linker or other startup code.  For dynamic linking this will
be done by checking that everything in the executable is marked as GCS
compatible.

x86 has an equivalent feature called shadow stacks; this series depends
on the x86 patches for generic memory management support for the new
guarded/shadow stack page type and shares APIs as much as possible.  As
there has been extensive discussion with the wider community around the
ABI for shadow stacks I have as far as practical kept implementation
decisions close to those for x86, anticipating that review would lead to
similar conclusions in the absence of strong reasoning for divergence.

The main divergence I am conscious of is that x86 allows shadow stack to
be enabled and disabled repeatedly, freeing the shadow stack for the
thread whenever disabled, while this implementation keeps the GCS
allocated after disable but refuses to reenable it.  This is to avoid
races with things actively walking the GCS during a disable; we do
anticipate that some systems will wish to disable GCS at runtime but are
not aware of any demand for subsequently reenabling it.

x86 uses an arch_prctl() to manage enable and disable, since only x86
and S/390 use arch_prctl() a generic prctl() was proposed[1] as part of a
patch set for the equivalent RISC-V Zicfiss feature which I initially
adopted fairly directly but following review feedback has been revised
quite a bit.

We currently maintain the x86 pattern of implicitly allocating a shadow
stack for threads started with shadow stack enabled, there has been some
discussion of removing this support and requiring the use of clone3()
with explicit allocation of shadow stacks instead.  I have no strong
feelings either way; implicit allocation is not really consistent with
anything else we do and creates the potential for errors around thread
exit but on the other hand it is existing ABI on x86 and minimises the
changes needed in userspace code.

glibc and bionic changes using this ABI have been implemented and
tested.  Headless Android systems have been validated and Ross Burton
has used this code to bring up a Yocto system with GCS enabled as
standard; a test implementation of V8 support has also been
done.

uprobes are not currently supported; missing emulation was identified
late in review.

There is an open issue with support for CRIU, on x86 this required the
ability to set the GCS mode via ptrace.  This series supports
configuring mode bits other than enable/disable via ptrace but it needs
to be confirmed if this is sufficient.

It is likely that we could relax some of the barriers added here with
some more targeted placements, this is left for further study.

There is an in process series adding clone3() support for shadow stacks:

   
https://lore.kernel.org/r/20240819-clone3-shadow-stack-v9-0-962d74f99...@kernel.org

Previous versions of this series depended on that, this dependency has
been removed in order to make merging easier.

[1] https://lore.kernel.org/lkml/20240403234054.2020347-1-de...@rivosinc.com/

Signed-off-by: Mark Brown 
---
Changes in v13:
- Rebase onto v6.12-rc1.
- Allocate VM_HIGH_ARCH_6 since protection keys used all the existing
  bits.
- Implement mm_release() and free transparently allocated GCSs there.
- Use bit 32 of AT_HWCAP for GCS due to AT_HWCAP2 being filled.
- Since we now only set GCSCRE0_EL1 on change ensure that it is
  initialised with GCSPR_EL0 accessible to EL0.
- Fix OOM handling on thread copy.
- Link to v12: 
https://lore.kernel.org/r/20240829-ar

[PATCH v13 03/40] arm64/mm: Restructure arch_validate_flags() for extensibility

2024-10-01 Thread Mark Brown
Currently arch_validate_flags() is written in a very non-extensible
fashion, returning immediately if MTE is not supported and writing the MTE
check as a direct return. Since we will want to add more checks for GCS
refactor the existing code to be more extensible, no functional change
intended.

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 arch/arm64/include/asm/mman.h | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h
index 9e39217b4afb..03b790fd0ad8 100644
--- a/arch/arm64/include/asm/mman.h
+++ b/arch/arm64/include/asm/mman.h
@@ -62,11 +62,17 @@ static inline bool arch_validate_prot(unsigned long prot,
 
 static inline bool arch_validate_flags(unsigned long vm_flags)
 {
-   if (!system_supports_mte())
-   return true;
+   if (system_supports_mte()) {
+   /*
+* only allow VM_MTE if VM_MTE_ALLOWED has been set
+* previously
+*/
+   if ((vm_flags & VM_MTE) && !(vm_flags & VM_MTE_ALLOWED))
+   return false;
+   }
+
+   return true;
 
-   /* only allow VM_MTE if VM_MTE_ALLOWED has been set previously */
-   return !(vm_flags & VM_MTE) || (vm_flags & VM_MTE_ALLOWED);
 }
 #define arch_validate_flags(vm_flags) arch_validate_flags(vm_flags)
 

-- 
2.39.2




[PATCH v13 01/40] mm: Introduce ARCH_HAS_USER_SHADOW_STACK

2024-10-01 Thread Mark Brown
Since multiple architectures have support for shadow stacks and we need to
select support for this feature in several places in the generic code
provide a generic config option that the architectures can select.

Suggested-by: David Hildenbrand 
Acked-by: David Hildenbrand 
Reviewed-by: Deepak Gupta 
Reviewed-by: Rick Edgecombe 
Reviewed-by: Mike Rapoport (IBM) 
Reviewed-by: Catalin Marinas 
Reviewed-by: Kees Cook 
Tested-by: Kees Cook 
Acked-by: Shuah Khan 
Reviewed-by: Thiago Jung Bauermann 
Signed-off-by: Mark Brown 
---
 arch/x86/Kconfig   | 1 +
 fs/proc/task_mmu.c | 2 +-
 mm/Kconfig | 6 ++
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2852fcd82cbd..8ccae77d40f7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1954,6 +1954,7 @@ config X86_USER_SHADOW_STACK
depends on AS_WRUSS
depends on X86_64
select ARCH_USES_HIGH_VMA_FLAGS
+   select ARCH_HAS_USER_SHADOW_STACK
select X86_CET
help
  Shadow stack protection is a hardware feature that detects function
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 72f14fd59c2d..23f875e78eae 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -971,7 +971,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct 
vm_area_struct *vma)
 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
[ilog2(VM_UFFD_MINOR)]  = "ui",
 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
-#ifdef CONFIG_X86_USER_SHADOW_STACK
+#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
[ilog2(VM_SHADOW_STACK)] = "ss",
 #endif
 #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
diff --git a/mm/Kconfig b/mm/Kconfig
index 4c9f5ea13271..4b2a1ef9a161 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1296,6 +1296,12 @@ config NUMA_EMU
  into virtual nodes when booted with "numa=fake=N", where N is the
  number of nodes. This is only useful for debugging.
 
+config ARCH_HAS_USER_SHADOW_STACK
+   bool
+   help
+ The architecture has hardware support for userspace shadow call
+  stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss).
+
 source "mm/damon/Kconfig"
 
 endmenu

-- 
2.39.2




[PATCH v13 07/40] arm64/gcs: Document the ABI for Guarded Control Stacks

2024-10-01 Thread Mark Brown
Add some documentation of the userspace ABI for Guarded Control Stacks.

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Acked-by: Yury Khrustalev 
Signed-off-by: Mark Brown 
---
 Documentation/arch/arm64/gcs.rst   | 230 +
 Documentation/arch/arm64/index.rst |   1 +
 2 files changed, 231 insertions(+)

diff --git a/Documentation/arch/arm64/gcs.rst b/Documentation/arch/arm64/gcs.rst
new file mode 100644
index ..af58d9151cb7
--- /dev/null
+++ b/Documentation/arch/arm64/gcs.rst
@@ -0,0 +1,230 @@
+===============================================
+Guarded Control Stack support for AArch64 Linux
+===============================================
+
+This document outlines briefly the interface provided to userspace by Linux in
+order to support use of the ARM Guarded Control Stack (GCS) feature.
+
+This is an outline of the most important features and issues only and not
+intended to be exhaustive.
+
+
+
+1.  General
+-----------
+
+* GCS is an architecture feature intended to provide greater protection
+  against return oriented programming (ROP) attacks and to simplify the
+  implementation of features that need to collect stack traces such as
+  profiling.
+
+* When GCS is enabled a separate guarded control stack is maintained by the
+  PE which is writeable only through specific GCS operations.  This
+  stores the call stack only; when a procedure call instruction is
+  performed the current PC is pushed onto the GCS, and on RET the
+  address in the LR is verified against that on the top of the GCS.
+
+* When active the current GCS pointer is stored in the system register
+  GCSPR_EL0.  This is readable by userspace but can only be updated
+  via specific GCS instructions.
+
+* The architecture provides instructions for switching between guarded
+  control stacks with checks to ensure that the new stack is a valid
+  target for switching.
+
+* The functionality of GCS is similar to that provided by the x86 Shadow
+  Stack feature, due to sharing of userspace interfaces the ABI refers to
+  shadow stacks rather than GCS.
+
+* Support for GCS is reported to userspace via HWCAP_GCS in the aux vector
+  AT_HWCAP2 entry.
+
+* GCS is enabled per thread.  While there is support for disabling GCS
+  at runtime this should be done with great care.
+
+* GCS memory access faults are reported as normal memory access faults.
+
+* GCS specific errors (those reported with EC 0x2d) will be reported as
+  SIGSEGV with a si_code of SEGV_CPERR (control protection error).
+
+* GCS is supported only for AArch64.
+
+* On systems where GCS is supported GCSPR_EL0 is always readable by EL0
+  regardless of the GCS configuration for the thread.
+
+* The architecture supports enabling GCS without verifying that return values
+  in LR match those in the GCS; the LR will be ignored.  This is not supported
+  by Linux.
+
+
+
+2.  Enabling and disabling Guarded Control Stacks
+-------------------------------------------------
+
+* GCS is enabled and disabled for a thread via the PR_SET_SHADOW_STACK_STATUS
+  prctl(), this takes a single flags argument specifying which GCS features
+  should be used.
+
+* When set the PR_SHADOW_STACK_ENABLE flag allocates a Guarded Control Stack
+  and enables GCS for the thread, enabling the functionality controlled by
+  GCSCRE0_EL1.{nTR, RVCHKEN, PCRSEL}.
+
+* When set the PR_SHADOW_STACK_PUSH flag enables the functionality controlled
+  by GCSCRE0_EL1.PUSHMEn, allowing explicit GCS pushes.
+
+* When set the PR_SHADOW_STACK_WRITE flag enables the functionality controlled
+  by GCSCRE0_EL1.STREn, allowing explicit stores to the Guarded Control Stack.
+
+* Any unknown flags will cause PR_SET_SHADOW_STACK_STATUS to return -EINVAL.
+
+* PR_LOCK_SHADOW_STACK_STATUS is passed a bitmask of features with the same
+  values as used for PR_SET_SHADOW_STACK_STATUS.  Any future changes to the
+  status of the specified GCS mode bits will be rejected.
+
+* PR_LOCK_SHADOW_STACK_STATUS allows any bit to be locked; this allows
+  userspace to prevent changes to any future features.
+
+* There is no support for a process to remove a lock that has been set for
+  it.
+
+* PR_SET_SHADOW_STACK_STATUS and PR_LOCK_SHADOW_STACK_STATUS affect only the
+  thread that called them, any other running threads will be unaffected.
+
+* New threads inherit the GCS configuration of the thread that created them.
+
+* GCS is disabled on exec().
+
+* The current GCS configuration for a thread may be read with the
+  PR_GET_SHADOW_STACK_STATUS prctl(); this returns the same flags that
+  are passed to PR_SET_SHADOW_STACK_STATUS.
+
+* If GCS is disabled for a thread after having previously been enabled then
+  the stack will remain allocated for the lifetime of the thread.  At present
+  any attempt to reenable GCS for the thread will be rejected; this may be
+  revisited in future.
+
+* It should be noted that since enabling GCS will result in GCS becoming
+  active immediately it i

[PATCH v13 06/40] arm64: Document boot requirements for Guarded Control Stacks

2024-10-01 Thread Mark Brown
FEAT_GCS introduces a number of new system registers; we require that
access to these registers is not trapped when we identify that the feature
is present.  There is also a HCRX_EL2 control to make GCS operations
functional.

Since any function call instruction will cause a fault if GCS is
enabled, we also require that the feature be specifically disabled.
Existing kernels implicitly have this requirement, and especially given
that the MMU must be disabled it is difficult to see a situation where
leaving GCS enabled would be reasonable.

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 Documentation/arch/arm64/booting.rst | 32 
 1 file changed, 32 insertions(+)

diff --git a/Documentation/arch/arm64/booting.rst 
b/Documentation/arch/arm64/booting.rst
index b57776a68f15..aed6e9f47cf3 100644
--- a/Documentation/arch/arm64/booting.rst
+++ b/Documentation/arch/arm64/booting.rst
@@ -411,6 +411,38 @@ Before jumping into the kernel, the following conditions 
must be met:
 
 - HFGRWR_EL2.nPIRE0_EL1 (bit 57) must be initialised to 0b1.
 
+ - For CPUs with Guarded Control Stacks (FEAT_GCS):
+
+  - GCSCR_EL1 must be initialised to 0.
+
+  - GCSCRE0_EL1 must be initialised to 0.
+
+  - If EL3 is present:
+
+- SCR_EL3.GCSEn (bit 39) must be initialised to 0b1.
+
+  - If EL2 is present:
+
+- GCSCR_EL2 must be initialised to 0.
+
+ - If the kernel is entered at EL1 and EL2 is present:
+
+- HCRX_EL2.GCSEn must be initialised to 0b1.
+
+- HFGITR_EL2.nGCSEPP (bit 59) must be initialised to 0b1.
+
+- HFGITR_EL2.nGCSSTR_EL1 (bit 58) must be initialised to 0b1.
+
+- HFGITR_EL2.nGCSPUSHM_EL1 (bit 57) must be initialised to 0b1.
+
+- HFGRTR_EL2.nGCS_EL1 (bit 53) must be initialised to 0b1.
+
+- HFGRTR_EL2.nGCS_EL0 (bit 52) must be initialised to 0b1.
+
+- HFGWTR_EL2.nGCS_EL1 (bit 53) must be initialised to 0b1.
+
+- HFGWTR_EL2.nGCS_EL0 (bit 52) must be initialised to 0b1.
+
 The requirements described above for CPU mode, caches, MMUs, architected
 timers, coherency and system registers apply to all CPUs.  All CPUs must
 enter the kernel in the same exception level.  Where the values documented

-- 
2.39.2




[PATCH v13 04/40] prctl: arch-agnostic prctl for shadow stack

2024-10-01 Thread Mark Brown
Three architectures (x86, aarch64, riscv) have announced support for
shadow stacks with fairly similar functionality.  While x86 is using
arch_prctl() to control the functionality, neither arm64 nor riscv uses
that interface, so this patch adds arch-agnostic prctl() support to
get and set status of shadow stacks and lock the current configuration to
prevent further changes, with support for turning on and off individual
subfeatures so applications can limit their exposure to features that
they do not need.  The features are:

  - PR_SHADOW_STACK_ENABLE: Tracking and enforcement of shadow stacks,
including allocation of a shadow stack if one is not already
allocated.
  - PR_SHADOW_STACK_WRITE: Writes to specific addresses in the shadow
stack.
  - PR_SHADOW_STACK_PUSH: Push additional values onto the shadow stack.

These features are expected to be inherited by new threads and cleared
on exec(); unknown features should be rejected for enable but accepted
for locking (in order to allow for future proofing).

This is based on a patch originally written by Deepak Gupta but modified
fairly heavily, support for indirect landing pads is removed, additional
modes added and the locking interface reworked.  The set status prctl()
is also reworked to just set flags; if setting/reading the shadow stack
pointer is required this could be a separate prctl.
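
For illustration only, a minimal userspace sketch of the intended
calling convention (not part of the patch; the constants mirror the
prctl.h additions below and may not be in libc headers yet):

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_GET_SHADOW_STACK_STATUS
#define PR_GET_SHADOW_STACK_STATUS	74
#define PR_SET_SHADOW_STACK_STATUS	75
#define PR_LOCK_SHADOW_STACK_STATUS	76
#define PR_SHADOW_STACK_ENABLE		(1UL << 0)
#endif

int main(void)
{
	unsigned long status;

	/* Enable tracking/enforcement only, no writes or pushes */
	if (prctl(PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0))
		perror("enable");

	/* Read the configuration back; arg3..arg5 must be zero */
	if (prctl(PR_GET_SHADOW_STACK_STATUS, &status, 0, 0, 0) == 0)
		printf("shadow stack status: %lx\n", status);

	/* Lock the enable bit so this thread can never change it again */
	if (prctl(PR_LOCK_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0))
		perror("lock");

	return 0;
}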

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Acked-by: Yury Khrustalev 
Signed-off-by: Mark Brown 
---
 include/linux/mm.h |  4 
 include/uapi/linux/prctl.h | 22 ++
 kernel/sys.c   | 30 ++
 3 files changed, 56 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 182bad0c55df..56654306a832 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4221,4 +4221,8 @@ static inline void pgalloc_tag_copy(struct folio *new, 
struct folio *old)
 }
 #endif /* CONFIG_MEM_ALLOC_PROFILING */
 
+int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user 
*status);
+int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
+int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
+
 #endif /* _LINUX_MM_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 35791791a879..557a3d2ac1d4 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -328,4 +328,26 @@ struct prctl_mm_map {
# define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */
# define PR_PPC_DEXCR_CTRL_MASK 0x1f
 
+/*
+ * Get the current shadow stack configuration for the current thread,
+ * this will be the value configured via PR_SET_SHADOW_STACK_STATUS.
+ */
+#define PR_GET_SHADOW_STACK_STATUS  74
+
+/*
+ * Set the current shadow stack configuration.  Enabling the shadow
+ * stack will cause a shadow stack to be allocated for the thread.
+ */
+#define PR_SET_SHADOW_STACK_STATUS  75
+# define PR_SHADOW_STACK_ENABLE (1UL << 0)
+# define PR_SHADOW_STACK_WRITE (1UL << 1)
+# define PR_SHADOW_STACK_PUSH  (1UL << 2)
+
+/*
+ * Prevent further changes to the specified shadow stack
+ * configuration.  All bits may be locked via this call, including
+ * undefined bits.
+ */
+#define PR_LOCK_SHADOW_STACK_STATUS  76
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 4da31f28fda8..3d38a9c7c5c9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2324,6 +2324,21 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct 
*t, unsigned long which,
return -EINVAL;
 }
 
+int __weak arch_get_shadow_stack_status(struct task_struct *t, unsigned long 
__user *status)
+{
+   return -EINVAL;
+}
+
+int __weak arch_set_shadow_stack_status(struct task_struct *t, unsigned long 
status)
+{
+   return -EINVAL;
+}
+
+int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long 
status)
+{
+   return -EINVAL;
+}
+
 #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
 
 #ifdef CONFIG_ANON_VMA_NAME
@@ -2784,6 +2799,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, 
unsigned long, arg3,
case PR_RISCV_SET_ICACHE_FLUSH_CTX:
error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
break;
+   case PR_GET_SHADOW_STACK_STATUS:
+   if (arg3 || arg4 || arg5)
+   return -EINVAL;
+   error = arch_get_shadow_stack_status(me, (unsigned long __user 
*) arg2);
+   break;
+   case PR_SET_SHADOW_STACK_STATUS:
+   if (arg3 || arg4 || arg5)
+   return -EINVAL;
+   error = arch_set_shadow_stack_status(me, arg2);
+   break;
+   case PR_LOCK_SHADOW_STACK_STATUS:
+   if (arg3 || arg4 || arg5)
+   return -EINVAL;
+   error = arch_lock_shadow_stack_status(me, arg2);
+   break;
def

[PATCH v13 05/40] mman: Add map_shadow_stack() flags

2024-10-01 Thread Mark Brown
In preparation for adding arm64 GCS support make the map_shadow_stack()
SHADOW_STACK_SET_TOKEN flag generic and add _SET_MARKER. The existing
flag indicates that a token usable for stack switch should be added to
the top of the newly mapped GCS region while the new flag indicates that
a top of stack marker suitable for use by unwinders should be added
above that.

For arm64 the top of stack marker is all bits 0.
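
As an illustration (a sketch matching the arm64 implementation later in
this series, not part of the patch), a mapping of size bytes created
with both flags set ends with:

  addr + size - 8 : 0          top of stack marker (SHADOW_STACK_SET_MARKER)
  addr + size - 16: cap token  stack switch target (SHADOW_STACK_SET_TOKEN)
  addr ...        : unused GCS space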

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Acked-by: Yury Khrustalev 
Signed-off-by: Mark Brown 
---
 arch/x86/include/uapi/asm/mman.h | 3 ---
 include/uapi/asm-generic/mman.h  | 4 
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 46cdc941f958..ac1e6277212b 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -5,9 +5,6 @@
 #define MAP_32BIT  0x40/* only give out 32bit addresses */
 #define MAP_ABOVE4G0x80/* only map above 4GB */
 
-/* Flags for map_shadow_stack(2) */
-#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in 
the shadow stack */
-
 #include <asm-generic/mman.h>
 
 #endif /* _ASM_X86_MMAN_H */
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
index 57e8195d0b53..5e3d61ddbd8c 100644
--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -19,4 +19,8 @@
 #define MCL_FUTURE 2   /* lock all future mappings */
 #define MCL_ONFAULT4   /* lock all pages that are faulted in */
 
+#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in 
the shadow stack */
+#define SHADOW_STACK_SET_MARKER (1ULL << 1) /* Set up a top of stack 
marker in the shadow stack */
+
+
 #endif /* __ASM_GENERIC_MMAN_H */

-- 
2.39.2




[PATCH v13 02/40] mm: Define VM_HIGH_ARCH_6

2024-10-01 Thread Mark Brown
The addition of protection keys means that on arm64 we now use all of the
currently defined VM_HIGH_ARCH_x bits. In order to allow us to allocate a
new flag for GCS pages define VM_HIGH_ARCH_6.

Signed-off-by: Mark Brown 
---
 include/linux/mm.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecf63d2b0582..182bad0c55df 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -329,12 +329,14 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_HIGH_ARCH_BIT_3 35  /* bit only usable on 64-bit 
architectures */
 #define VM_HIGH_ARCH_BIT_4 36  /* bit only usable on 64-bit 
architectures */
 #define VM_HIGH_ARCH_BIT_5 37  /* bit only usable on 64-bit 
architectures */
+#define VM_HIGH_ARCH_BIT_6 38  /* bit only usable on 64-bit 
architectures */
 #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
 #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
 #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
 #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
 #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
 #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
+#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6)
 #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
 
 #ifdef CONFIG_ARCH_HAS_PKEYS

-- 
2.39.2




[PATCH v13 12/40] arm64/cpufeature: Runtime detection of Guarded Control Stack (GCS)

2024-10-01 Thread Mark Brown
Add a cpufeature for GCS, allowing other code to conditionally support it
at runtime.

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 arch/arm64/include/asm/cpufeature.h |  6 ++
 arch/arm64/kernel/cpufeature.c  | 20 
 arch/arm64/tools/cpucaps|  1 +
 3 files changed, 27 insertions(+)

diff --git a/arch/arm64/include/asm/cpufeature.h 
b/arch/arm64/include/asm/cpufeature.h
index 3d261cc123c1..69470795f5d2 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -838,6 +838,12 @@ static inline bool system_supports_poe(void)
alternative_has_cap_unlikely(ARM64_HAS_S1POE);
 }
 
+static inline bool system_supports_gcs(void)
+{
+   return IS_ENABLED(CONFIG_ARM64_GCS) &&
+   alternative_has_cap_unlikely(ARM64_HAS_GCS);
+}
+
 int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
 bool try_emulate_mrs(struct pt_regs *regs, u32 isn);
 
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 718728a85430..d1e758e99e0a 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -291,6 +291,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
 };
 
 static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
+   ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS),
+  FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 
4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
   FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 
4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 
ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0),
@@ -2358,6 +2360,14 @@ static void cpu_enable_poe(const struct 
arm64_cpu_capabilities *__unused)
 }
 #endif
 
+#ifdef CONFIG_ARM64_GCS
+static void cpu_enable_gcs(const struct arm64_cpu_capabilities *__unused)
+{
+   /* GCSPR_EL0 is always readable */
+   write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1);
+}
+#endif
+
 /* Internal helper functions to match cpu capability type */
 static bool
 cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap)
@@ -2889,6 +2899,16 @@ static const struct arm64_cpu_capabilities 
arm64_features[] = {
.cpu_enable = cpu_enable_poe,
ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP)
},
+#endif
+#ifdef CONFIG_ARM64_GCS
+   {
+   .desc = "Guarded Control Stack (GCS)",
+   .capability = ARM64_HAS_GCS,
+   .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+   .cpu_enable = cpu_enable_gcs,
+   .matches = has_cpuid_feature,
+   ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP)
+   },
 #endif
{},
 };
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index eedb5acc21ed..867d25d4a45a 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -29,6 +29,7 @@ HAS_EVT
 HAS_FPMR
 HAS_FGT
 HAS_FPSIMD
+HAS_GCS
 HAS_GENERIC_AUTH
 HAS_GENERIC_AUTH_ARCH_QARMA3
 HAS_GENERIC_AUTH_ARCH_QARMA5

-- 
2.39.2




[PATCH v13 11/40] arm64/gcs: Provide basic EL2 setup to allow GCS usage at EL0 and EL1

2024-10-01 Thread Mark Brown
There is a control HCRX_EL2.GCSEn which must be set to allow GCS
features to take effect at lower ELs and also fine grained traps for GCS
usage at EL0 and EL1.  Configure all these to allow GCS usage by EL0 and
EL1.

We also initialise GCSCR_EL1 and GCSCRE0_EL1 to ensure that we can
execute function call instructions without faulting regardless of the
state when the kernel is started.

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 arch/arm64/include/asm/el2_setup.h | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/arch/arm64/include/asm/el2_setup.h 
b/arch/arm64/include/asm/el2_setup.h
index e0ffdf13a18b..27086a81eae3 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -27,6 +27,14 @@
ubfx    x0, x0, #ID_AA64MMFR1_EL1_HCX_SHIFT, #4
cbz x0, .Lskip_hcrx_\@
mov_q   x0, HCRX_HOST_FLAGS
+
+/* Enable GCS if supported */
+   mrs_s   x1, SYS_ID_AA64PFR1_EL1
+   ubfx    x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4
+   cbz x1, .Lset_hcrx_\@
+   orr x0, x0, #HCRX_EL2_GCSEn
+
+.Lset_hcrx_\@:
msr_s   SYS_HCRX_EL2, x0
 .Lskip_hcrx_\@:
 .endm
@@ -200,6 +208,16 @@
orr x0, x0, #HFGxTR_EL2_nPOR_EL0
 
 .Lskip_poe_fgt_\@:
+   /* GCS depends on PIE so we don't check it if PIE is absent */
+   mrs_s   x1, SYS_ID_AA64PFR1_EL1
+   ubfx    x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4
+   cbz x1, .Lset_fgt_\@
+
+   /* Disable traps of access to GCS registers at EL0 and EL1 */
+   orr x0, x0, #HFGxTR_EL2_nGCS_EL1_MASK
+   orr x0, x0, #HFGxTR_EL2_nGCS_EL0_MASK
+
+.Lset_fgt_\@:
msr_s   SYS_HFGRTR_EL2, x0
msr_s   SYS_HFGWTR_EL2, x0
msr_s   SYS_HFGITR_EL2, xzr
@@ -215,6 +233,17 @@
 .Lskip_fgt_\@:
 .endm
 
+.macro __init_el2_gcs
+   mrs_s   x1, SYS_ID_AA64PFR1_EL1
+   ubfx    x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4
+   cbz x1, .Lskip_gcs_\@
+
+   /* Ensure GCS is not enabled when we start trying to do BLs */
+   msr_s   SYS_GCSCR_EL1, xzr
+   msr_s   SYS_GCSCRE0_EL1, xzr
+.Lskip_gcs_\@:
+.endm
+
 .macro __init_el2_nvhe_prepare_eret
mov x0, #INIT_PSTATE_EL1
msr spsr_el2, x0
@@ -240,6 +269,7 @@
__init_el2_nvhe_idregs
__init_el2_cptr
__init_el2_fgt
+__init_el2_gcs
 .endm
 
 #ifndef __KVM_NVHE_HYPERVISOR__

-- 
2.39.2




[PATCH v13 08/40] arm64/sysreg: Add definitions for architected GCS caps

2024-10-01 Thread Mark Brown
The architecture defines a format for guarded control stack caps, used
to mark the top of an unused GCS in order to limit the potential for
exploitation via stack switching. Add definitions associated with these.
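
As a worked example (illustrative, not part of the patch): a cap stored
at address 0xffffb0000ff8 gives GCS_CAP() = 0xffffb0000001, with bits
[63:12] recording the address bits of the cap's location and bits
[11:0] holding the token type: 0x1 for a valid cap, 0x5 for an
in-progress stack switch.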

Reviewed-by: Thiago Jung Bauermann 
Acked-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 arch/arm64/include/asm/sysreg.h | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 9ea97dddefc4..9c98ff448bd9 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -1101,6 +1101,26 @@
 /* Initial value for Permission Overlay Extension for EL0 */
 #define POR_EL0_INIT   POE_RXW
 
+/*
+ * Definitions for Guarded Control Stack
+ */
+
+#define GCS_CAP_ADDR_MASK  GENMASK(63, 12)
+#define GCS_CAP_ADDR_SHIFT 12
+#define GCS_CAP_ADDR_WIDTH 52
+#define GCS_CAP_ADDR(x)FIELD_GET(GCS_CAP_ADDR_MASK, x)
+
+#define GCS_CAP_TOKEN_MASK GENMASK(11, 0)
+#define GCS_CAP_TOKEN_SHIFT0
+#define GCS_CAP_TOKEN_WIDTH12
+#define GCS_CAP_TOKEN(x)   FIELD_GET(GCS_CAP_TOKEN_MASK, x)
+
+#define GCS_CAP_VALID_TOKEN0x1
+#define GCS_CAP_IN_PROGRESS_TOKEN  0x5
+
+#define GCS_CAP(x) ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | \
+  GCS_CAP_VALID_TOKEN)
+
 #define ARM64_FEATURE_FIELD_BITS   4
 
 /* Defined for compatibility only, do not add new users. */

-- 
2.39.2




[PATCH v13 14/40] mm: Define VM_SHADOW_STACK for arm64 when we support GCS

2024-10-01 Thread Mark Brown
Use VM_HIGH_ARCH_6 for guarded control stack pages.

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 Documentation/filesystems/proc.rst |  2 +-
 include/linux/mm.h | 12 +++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/Documentation/filesystems/proc.rst 
b/Documentation/filesystems/proc.rst
index e834779d9611..6a882c57a7e7 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -579,7 +579,7 @@ encoded manner. The codes are the following:
mt    arm64 MTE allocation tags are enabled
um    userfaultfd missing tracking
uw    userfaultfd wr-protect tracking
-ss    shadow stack page
+ss    shadow/guarded control stack page
sl    sealed
==    =======================================
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 56654306a832..8852c39c7695 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -367,7 +367,17 @@ extern unsigned int kobjsize(const void *objp);
  * for more details on the guard size.
  */
 # define VM_SHADOW_STACK   VM_HIGH_ARCH_5
-#else
+#endif
+
+#if defined(CONFIG_ARM64_GCS)
+/*
+ * arm64's Guarded Control Stack implements similar functionality and
+ * has similar constraints to shadow stacks.
+ */
+# define VM_SHADOW_STACK   VM_HIGH_ARCH_6
+#endif
+
+#ifndef VM_SHADOW_STACK
 # define VM_SHADOW_STACK   VM_NONE
 #endif
 

-- 
2.39.2




[PATCH v13 18/40] arm64/hwcap: Add hwcap for GCS

2024-10-01 Thread Mark Brown
Provide a hwcap to enable userspace to detect support for GCS.
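
For illustration, a minimal sketch of the userspace detection this
enables (not part of the patch; HWCAP_GCS is bit 32 of AT_HWCAP as
defined below, with a fallback define for older headers):

#include <stdio.h>
#include <sys/auxv.h>

#ifndef HWCAP_GCS
#define HWCAP_GCS	(1UL << 32)
#endif

int main(void)
{
	/* AT_HWCAP is a 64-bit unsigned long on arm64, so bit 32 is valid */
	if (getauxval(AT_HWCAP) & HWCAP_GCS)
		printf("GCS supported\n");
	else
		printf("GCS not supported\n");
	return 0;
}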

Signed-off-by: Mark Brown 
---
 Documentation/arch/arm64/elf_hwcaps.rst | 4 
 arch/arm64/include/asm/hwcap.h  | 1 +
 arch/arm64/include/uapi/asm/hwcap.h | 3 ++-
 arch/arm64/kernel/cpufeature.c  | 3 +++
 arch/arm64/kernel/cpuinfo.c | 1 +
 5 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/Documentation/arch/arm64/elf_hwcaps.rst 
b/Documentation/arch/arm64/elf_hwcaps.rst
index 694f67fa07d1..25b41ff74fa0 100644
--- a/Documentation/arch/arm64/elf_hwcaps.rst
+++ b/Documentation/arch/arm64/elf_hwcaps.rst
@@ -170,6 +170,10 @@ HWCAP_PACG
 ID_AA64ISAR1_EL1.GPI == 0b0001, as described by
 Documentation/arch/arm64/pointer-authentication.rst.
 
+HWCAP_GCS
+Functionality implied by ID_AA64PFR1_EL1.GCS == 0b1, as
+described by Documentation/arch/arm64/gcs.rst.
+
 HWCAP2_DCPODP
 Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010.
 
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index a775adddecf2..7bcf1347ca0b 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -92,6 +92,7 @@
 #define KERNEL_HWCAP_SB    __khwcap_feature(SB)
 #define KERNEL_HWCAP_PACA  __khwcap_feature(PACA)
 #define KERNEL_HWCAP_PACG  __khwcap_feature(PACG)
+#define KERNEL_HWCAP_GCS   __khwcap_feature(GCS)
 
 #define __khwcap2_feature(x)   (const_ilog2(HWCAP2_ ## x) + 64)
 #define KERNEL_HWCAP_DCPODP    __khwcap2_feature(DCPODP)
diff --git a/arch/arm64/include/uapi/asm/hwcap.h 
b/arch/arm64/include/uapi/asm/hwcap.h
index 055381b2c615..675642ec4d91 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -21,7 +21,7 @@
  * HWCAP flags - for AT_HWCAP
  *
  * Bits 62 and 63 are reserved for use by libc.
- * Bits 32-61 are unallocated for potential use by libc.
+ * Bits 33-61 are unallocated for potential use by libc.
  */
 #define HWCAP_FP   (1 << 0)
 #define HWCAP_ASIMD(1 << 1)
@@ -55,6 +55,7 @@
 #define HWCAP_SB   (1 << 29)
 #define HWCAP_PACA (1 << 30)
 #define HWCAP_PACG (1UL << 31)
+#define HWCAP_GCS  (1UL << 32)
 
 /*
  * HWCAP2 flags - for AT_HWCAP2
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index d1e758e99e0a..b8655d55f318 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -3025,6 +3025,9 @@ static const struct arm64_cpu_capabilities 
arm64_elf_hwcaps[] = {
HWCAP_CAP(ID_AA64ZFR0_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM),
HWCAP_CAP(ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, 
KERNEL_HWCAP_SVEF32MM),
HWCAP_CAP(ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, 
KERNEL_HWCAP_SVEF64MM),
+#endif
+#ifdef CONFIG_ARM64_GCS
+   HWCAP_CAP(ID_AA64PFR1_EL1, GCS, IMP, CAP_HWCAP, KERNEL_HWCAP_GCS),
 #endif
HWCAP_CAP(ID_AA64PFR1_EL1, SSBS, SSBS2, CAP_HWCAP, KERNEL_HWCAP_SSBS),
 #ifdef CONFIG_ARM64_BTI
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 44718d0482b3..f2f92c6b1c85 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -80,6 +80,7 @@ static const char *const hwcap_str[] = {
[KERNEL_HWCAP_SB]   = "sb",
[KERNEL_HWCAP_PACA] = "paca",
[KERNEL_HWCAP_PACG] = "pacg",
+   [KERNEL_HWCAP_GCS]  = "gcs",
[KERNEL_HWCAP_DCPODP]   = "dcpodp",
[KERNEL_HWCAP_SVE2] = "sve2",
[KERNEL_HWCAP_SVEAES]   = "sveaes",

-- 
2.39.2




[PATCH v13 24/40] arm64/mm: Implement map_shadow_stack()

2024-10-01 Thread Mark Brown
As discussed extensively in the changelog for the addition of this
syscall on x86 ("x86/shstk: Introduce map_shadow_stack syscall") the
existing mmap() and madvise() syscalls do not map entirely well onto the
security requirements for guarded control stacks since they lead to
windows where memory is allocated but not yet protected or stacks which
are not properly and safely initialised. Instead a new syscall
map_shadow_stack() has been defined which allocates and initialises a
shadow stack page.

Implement this for arm64.  Two flags are provided, allowing applications
to request that the stack be initialised with a valid cap token at the
top of the stack and optionally also an end of stack marker above that.
We support requesting an end of stack marker alone but since this is a
NULL pointer it is indistinguishable from not initialising anything by
itself.
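
For illustration only, a minimal userspace sketch (not part of the
patch; the flag values are from patch 05/40 and the syscall number is
taken from the generic syscall table, so check your headers):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_map_shadow_stack
#define __NR_map_shadow_stack	453
#endif
#define SHADOW_STACK_SET_TOKEN	(1ULL << 0)
#define SHADOW_STACK_SET_MARKER	(1ULL << 1)

int main(void)
{
	/* Kernel chooses the address; 64k stack with cap token and marker */
	long gcs = syscall(__NR_map_shadow_stack, 0, 65536,
			   SHADOW_STACK_SET_TOKEN | SHADOW_STACK_SET_MARKER);

	if (gcs < 0) {
		perror("map_shadow_stack");
		return 1;
	}

	/* With both flags the cap sits one entry below the zero marker */
	printf("GCS at %#lx, cap token at %#lx\n", (unsigned long)gcs,
	       (unsigned long)gcs + 65536 - 2 * sizeof(unsigned long));
	return 0;
}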

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Acked-by: Yury Khrustalev 
Signed-off-by: Mark Brown 
---
 arch/arm64/mm/gcs.c | 64 +
 1 file changed, 64 insertions(+)

diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c
index 61a80de6baf8..5c46ec527b1c 100644
--- a/arch/arm64/mm/gcs.c
+++ b/arch/arm64/mm/gcs.c
@@ -68,6 +68,70 @@ unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
return addr;
 }
 
+SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, 
unsigned int, flags)
+{
+   unsigned long alloc_size;
+   unsigned long __user *cap_ptr;
+   unsigned long cap_val;
+   int ret = 0;
+   int cap_offset;
+
+   if (!system_supports_gcs())
+   return -EOPNOTSUPP;
+
+   if (flags & ~(SHADOW_STACK_SET_TOKEN | SHADOW_STACK_SET_MARKER))
+   return -EINVAL;
+
+   if (!PAGE_ALIGNED(addr))
+   return -EINVAL;
+
+   if (size == 8 || !IS_ALIGNED(size, 8))
+   return -EINVAL;
+
+   /*
+* An overflow would result in attempting to write the restore token
+* to the wrong location. Not catastrophic, but just return the right
+* error code and block it.
+*/
+   alloc_size = PAGE_ALIGN(size);
+   if (alloc_size < size)
+   return -EOVERFLOW;
+
+   addr = alloc_gcs(addr, alloc_size);
+   if (IS_ERR_VALUE(addr))
+   return addr;
+
+   /*
+* Put a cap token at the end of the allocated region so it
+* can be switched to.
+*/
+   if (flags & SHADOW_STACK_SET_TOKEN) {
+   /* Leave an extra empty frame as a top of stack marker? */
+   if (flags & SHADOW_STACK_SET_MARKER)
+   cap_offset = 2;
+   else
+   cap_offset = 1;
+
+   cap_ptr = (unsigned long __user *)(addr + size -
+  (cap_offset * 
sizeof(unsigned long)));
+   cap_val = GCS_CAP(cap_ptr);
+
+   put_user_gcs(cap_val, cap_ptr, &ret);
+   if (ret != 0) {
+   vm_munmap(addr, size);
+   return -EFAULT;
+   }
+
+   /*
+* Ensure the new cap is ordered before standard
+* memory accesses to the same location.
+*/
+   gcsb_dsync();
+   }
+
+   return addr;
+}
+
 /*
  * Apply the GCS mode configured for the specified task to the
  * hardware.

-- 
2.39.2




[PATCH v13 25/40] arm64/signal: Set up and restore the GCS context for signal handlers

2024-10-01 Thread Mark Brown
When invoking a signal handler we use the GCS configuration and stack
for the current thread.

Since we implement signal return by calling the signal handler with a
return address set up pointing to a trampoline in the vDSO we need to
also configure any active GCS for this by pushing a frame for the
trampoline onto the GCS.  If we do not do this then signal return will
generate a GCS protection fault.

In order to guard against attempts to bypass GCS protections via signal
return we only allow returning with GCSPR_EL0 pointing to an address
where it was previously preempted by a signal.  We do this by pushing a
cap onto the GCS, this takes the form of an architectural GCS cap token
with the top bit set and token type of 0 which we add on signal entry
and validate and pop off on signal return.  The combination of the top
bit being set and the token type mean that this can't be interpreted as
a valid token or address.
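
As an illustration (a sketch of the layout created by gcs_signal_entry()
below, not part of the patch), with GCSPR_EL0 = P when the signal is
delivered:

  P      : interrupted thread's entries (old top of stack)
  P - 8  : signal cap, GCS_SIGNAL_CAP(P - 8)
  P - 16 : vDSO trampoline address, consumed when the handler returns

On sigreturn GCSPR_EL0 then points at the cap, which is validated,
invalidated to prevent reuse, and popped.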

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 arch/arm64/include/asm/gcs.h |   1 +
 arch/arm64/kernel/signal.c   | 118 +--
 2 files changed, 114 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h
index 48c97e63e56a..f50660603ecf 100644
--- a/arch/arm64/include/asm/gcs.h
+++ b/arch/arm64/include/asm/gcs.h
@@ -9,6 +9,7 @@
 #include 
 
 struct kernel_clone_args;
+struct ksignal;
 
 static inline void gcsb_dsync(void)
 {
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 561986947530..b5ab0e229a78 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include <asm/gcs.h>
 #include 
 #include 
 #include 
@@ -34,6 +35,15 @@
 #include 
 #include 
 
+#ifdef CONFIG_ARM64_GCS
+#define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK)
+
+static bool gcs_signal_cap_valid(u64 addr, u64 val)
+{
+   return val == GCS_SIGNAL_CAP(addr);
+}
+#endif
+
 /*
  * Do a signal return; undo the signal stack. These are aligned to 128-bit.
  */
@@ -904,6 +914,58 @@ static int restore_sigframe(struct pt_regs *regs,
return err;
 }
 
+#ifdef CONFIG_ARM64_GCS
+static int gcs_restore_signal(void)
+{
+   unsigned long __user *gcspr_el0;
+   u64 cap;
+   int ret;
+
+   if (!system_supports_gcs())
+   return 0;
+
+   if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE))
+   return 0;
+
+   gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0);
+
+   /*
+* Ensure that any changes to the GCS done via GCS operations
+* are visible to the normal reads we do to validate the
+* token.
+*/
+   gcsb_dsync();
+
+   /*
+* GCSPR_EL0 should be pointing at a capped GCS, read the cap.
+* We don't enforce that this is in a GCS page, if it is not
+* then faults will be generated on GCS operations - the main
+* concern is to protect GCS pages.
+*/
+   ret = copy_from_user(&cap, gcspr_el0, sizeof(cap));
+   if (ret)
+   return -EFAULT;
+
+   /*
+* Check that the cap is the actual GCS before replacing it.
+*/
+   if (!gcs_signal_cap_valid((u64)gcspr_el0, cap))
+   return -EINVAL;
+
+   /* Invalidate the token to prevent reuse */
+   put_user_gcs(0, (__user void*)gcspr_el0, &ret);
+   if (ret != 0)
+   return -EFAULT;
+
+   write_sysreg_s(gcspr_el0 + 1, SYS_GCSPR_EL0);
+
+   return 0;
+}
+
+#else
+static int gcs_restore_signal(void) { return 0; }
+#endif
+
 SYSCALL_DEFINE0(rt_sigreturn)
 {
struct pt_regs *regs = current_pt_regs();
@@ -927,6 +989,9 @@ SYSCALL_DEFINE0(rt_sigreturn)
if (restore_sigframe(regs, frame))
goto badframe;
 
+   if (gcs_restore_signal())
+   goto badframe;
+
if (restore_altstack(&frame->uc.uc_stack))
goto badframe;
 
@@ -1189,7 +1254,48 @@ static int get_sigframe(struct rt_sigframe_user_layout 
*user,
return 0;
 }
 
-static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
+#ifdef CONFIG_ARM64_GCS
+
+static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig)
+{
+   unsigned long __user *gcspr_el0;
+   int ret = 0;
+
+   if (!system_supports_gcs())
+   return 0;
+
+   if (!task_gcs_el0_enabled(current))
+   return 0;
+
+   /*
+* We are entering a signal handler, current register state is
+* active.
+*/
+   gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0);
+
+   /*
+* Push a cap and the GCS entry for the trampoline onto the GCS.
+*/
+   put_user_gcs((unsigned long)sigtramp, gcspr_el0 - 2, &ret);
+   put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 1), gcspr_el0 - 1, &ret);
+   if (ret != 0)
+   return ret;
+
+

Re: [PATCH v13 16/40] KVM: arm64: Manage GCS access and registers for guests

2024-10-01 Thread Marc Zyngier
On Tue, 01 Oct 2024 23:58:55 +0100,
Mark Brown  wrote:

> @@ -4714,6 +4735,10 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu)
>   kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nPOR_EL1 |
>   HFGxTR_EL2_nPOR_EL0);
>  
> + if (!kvm_has_gcs(kvm))
> + kvm->arch.fgu[HFGxTR_GROUP] |= (HFGxTR_EL2_nGCS_EL0 |
> + HFGxTR_EL2_nGCS_EL1);
> +

Why are you still allowing the GCS instructions when GCS isn't
enabled?

M.

-- 
Without deviation from the norm, progress is not possible.



Re: [PATCH RFC v4 0/9] tun: Introduce virtio-net hashing feature

2024-10-01 Thread Akihiko Odaki

On 2024/10/02 1:31, Stephen Hemminger wrote:

On Tue, 1 Oct 2024 14:54:29 +0900
Akihiko Odaki  wrote:


On 2024/09/30 0:33, Stephen Hemminger wrote:

On Sun, 29 Sep 2024 16:10:47 +0900
Akihiko Odaki  wrote:
   

On 2024/09/29 11:07, Jason Wang wrote:

On Fri, Sep 27, 2024 at 3:51 PM Akihiko Odaki  wrote:


On 2024/09/27 13:31, Jason Wang wrote:

On Fri, Sep 27, 2024 at 10:11 AM Akihiko Odaki  wrote:


On 2024/09/25 12:30, Jason Wang wrote:

On Tue, Sep 24, 2024 at 5:01 PM Akihiko Odaki  wrote:


virtio-net has two uses for hashes: one is RSS and another is hash
reporting. Conventionally the hash calculation was done by the VMM.
However, computing the hash after the queue was chosen defeats the
purpose of RSS.

Another approach is to use eBPF steering program. This approach has
another downside: it cannot report the calculated hash due to the
restrictive nature of eBPF.

Introduce the code to compute hashes in the kernel in order to overcome
these challenges.

An alternative solution is to extend the eBPF steering program so that it
will be able to report to the userspace, but it is based on context
rewrites, which is in feature freeze. We can adopt kfuncs, but they will
not be UAPIs. We opt for an ioctl to align with other relevant UAPIs (KVM
and vhost_net).
 


I wonder if we could clone the skb and reuse some to store the hash,
then the steering eBPF program can access these fields without
introducing full RSS in the kernel?


I don't get how cloning the skb can solve the issue.

We can certainly implement Toeplitz function in the kernel or even with
tc-bpf to store a hash value that can be used for eBPF steering program
and virtio hash reporting. However we don't have a means of storing a
hash type, which is specific to virtio hash reporting and lacks a
corresponding skb field.
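
(For reference, a minimal sketch of the kind of Toeplitz function being
discussed, not the proposed kernel code; the input would be the fields
selected by the hash type, e.g. addresses and ports, and the caller
must supply a key of at least len + 4 bytes:)

#include <stddef.h>
#include <stdint.h>

static uint32_t toeplitz_hash(const uint8_t *key, const uint8_t *data,
			      size_t len)
{
	uint32_t hash = 0;
	/* 32-bit window over the key, advanced one bit per input bit */
	uint32_t window = ((uint32_t)key[0] << 24) | ((uint32_t)key[1] << 16) |
			  ((uint32_t)key[2] << 8) | key[3];
	size_t i;
	int bit;

	for (i = 0; i < len; i++) {
		for (bit = 7; bit >= 0; bit--) {
			/* XOR in the window for every set input bit */
			if (data[i] & (1u << bit))
				hash ^= window;
			/* slide the window: shift in the next key bit */
			window = (window << 1) | ((key[i + 4] >> bit) & 1);
		}
	}

	return hash;
}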


I may miss something but looking at sk_filter_is_valid_access(). It
looks to me we can make use of skb->cb[0..4]?


I didn't opt to using cb. Below is the rationale:

cb is for tail call so it means we reuse the field for a different
purpose. The context rewrite allows adding a field without increasing
the size of the underlying storage (the real sk_buff) so we should add a
new field instead of reusing an existing field to avoid confusion.

We are however no longer allowed to add a new field. In my
understanding, this is because it is an UAPI, and eBPF maintainers found
it is difficult to maintain its stability.

Reusing cb for hash reporting is a workaround to avoid having a new
field, but it does not solve the underlying problem (i.e., keeping eBPF
as stable as UAPI is unreasonably hard). In my opinion, adding an ioctl
is a reasonable option to keep the API as stable as other virtualization
UAPIs while respecting the underlying intention of the context rewrite
feature freeze.


Fair enough.

Btw, I remember DPDK implements tuntap RSS via eBPF as well (probably
via cls or other). It might worth to see if anything we miss here.


Thanks for the information. I wonder why they used cls instead of
steering program. Perhaps it may be due to compatibility with macvtap
and ipvtap, which don't support the steering program.

Their RSS implementation looks cleaner so I will improve my RSS
implementation accordingly.
  


DPDK needs to support flow rules. The specific case is where packets
are classified by a flow, then RSS is done across a subset of the queues.
The support for flow in TUN driver is more academic than useful,
I fixed it for current BPF, but doubt anyone is using it really.

A full steering program would be good, but would require much more
complexity to take a general set of flow rules then communicate that
to the steering program.
   


It reminded me of RSS context and flow filter. Some physical NICs
support to use a dedicated RSS context for packets matched with flow
filter, and virtio is also gaining corresponding features.

RSS context: https://github.com/oasis-tcs/virtio-spec/issues/178
Flow filter: https://github.com/oasis-tcs/virtio-spec/issues/179

I considered about the possibility of supporting these features with tc
instead of adding ioctls to tuntap, but it seems not appropriate for
virtualization use case.

In a virtualization use case, tuntap is configured according to requests
of guests, and the code processing these requests need to have minimal
permissions for security. This goal is achieved by passing a file
descriptor that represents a tuntap from a privileged process (e.g.,
libvirt) to the process handling guest requests (e.g., QEMU).

However, tc is configured with rtnetlink, which does not seem to have an
interface to delegate a permission for one particular device to another
process.

For now I'll continue working on the current approach that is based on
ioctl and lacks RSS context and flow filter features. Eventually they
are also likely to require new ioctls if they are to be supported with
vhost_net.


The DPDK flow handling (rte_flow) was started by Mellanox and many of
the features are to support what t

[PATCH 29/33] riscv: kernel command line option to opt out of user cfi

2024-10-01 Thread Deepak Gupta
This commit adds a kernel command line option with which user CFI can be
disabled.
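
With the implementation below, only the exact value "true" disables
user CFI; anything else leaves it enabled.  For example, booting with:

    disable_riscv_usercfi=true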

Signed-off-by: Deepak Gupta 
---
 arch/riscv/kernel/usercfi.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c
index 40c32258b6ec..d92b49261b58 100644
--- a/arch/riscv/kernel/usercfi.c
+++ b/arch/riscv/kernel/usercfi.c
@@ -17,6 +17,8 @@
 #include 
 #include 
 
+bool disable_riscv_usercfi;
+
 #define SHSTK_ENTRY_SIZE sizeof(void *)
 
 bool is_shstk_enabled(struct task_struct *task)
@@ -393,6 +395,9 @@ int arch_set_shadow_stack_status(struct task_struct *t, 
unsigned long status)
unsigned long size = 0, addr = 0;
bool enable_shstk = false;
 
+   if (disable_riscv_usercfi)
+   return 0;
+
if (!cpu_supports_shadow_stack())
return -EINVAL;
 
@@ -472,6 +477,9 @@ int arch_set_indir_br_lp_status(struct task_struct *t, 
unsigned long status)
 {
bool enable_indir_lp = false;
 
+   if (disable_riscv_usercfi)
+   return 0;
+
if (!cpu_supports_indirect_br_lp_instr())
return -EINVAL;
 
@@ -504,3 +512,15 @@ int arch_lock_indir_br_lp_status(struct task_struct *task,
 
return 0;
 }
+
+static int __init setup_global_riscv_enable(char *str)
+{
+   if (strcmp(str, "true") == 0)
+   disable_riscv_usercfi = true;
+
+   pr_info("Setting riscv usercfi to be %s\n", (disable_riscv_usercfi ? "disabled" : "enabled"));
+
+   return 1;
+}
+
+__setup("disable_riscv_usercfi=", setup_global_riscv_enable);

-- 
2.45.0




[PATCH 26/33] riscv/hwprobe: zicfilp / zicfiss enumeration in hwprobe

2024-10-01 Thread Deepak Gupta
Add enumeration of the zicfilp and zicfiss extensions to the hwprobe syscall.
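
For illustration, userspace could probe for both extensions roughly as
below (a sketch; assumes kernel headers exposing __NR_riscv_hwprobe and
the new keys):

  #include <asm/hwprobe.h>
  #include <stdio.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  int main(void)
  {
          struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_IMA_EXT_0 };

          /* one pair, all online cpus (cpusetsize = 0), no flags */
          if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
                  return 1;

          printf("zicfilp: %s\n", (pair.value & RISCV_HWPROBE_EXT_ZICFILP) ? "yes" : "no");
          printf("zicfiss: %s\n", (pair.value & RISCV_HWPROBE_EXT_ZICFISS) ? "yes" : "no");
          return 0;
  }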

Signed-off-by: Deepak Gupta 
---
 arch/riscv/include/uapi/asm/hwprobe.h | 2 ++
 arch/riscv/kernel/sys_hwprobe.c   | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/arch/riscv/include/uapi/asm/hwprobe.h 
b/arch/riscv/include/uapi/asm/hwprobe.h
index 1e153cda57db..d5c5dec9ae6c 100644
--- a/arch/riscv/include/uapi/asm/hwprobe.h
+++ b/arch/riscv/include/uapi/asm/hwprobe.h
@@ -72,6 +72,8 @@ struct riscv_hwprobe {
 #define	RISCV_HWPROBE_EXT_ZCF		(1ULL << 46)
 #define	RISCV_HWPROBE_EXT_ZCMOP		(1ULL << 47)
 #define	RISCV_HWPROBE_EXT_ZAWRS		(1ULL << 48)
+#define	RISCV_HWPROBE_EXT_ZICFILP	(1ULL << 49)
+#define	RISCV_HWPROBE_EXT_ZICFISS	(1ULL << 50)
 #define RISCV_HWPROBE_KEY_CPUPERF_0	5
 #define	RISCV_HWPROBE_MISALIGNED_UNKNOWN	(0 << 0)
 #define	RISCV_HWPROBE_MISALIGNED_EMULATED	(1 << 0)
diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
index cea0ca2bf2a2..98f72ad7124f 100644
--- a/arch/riscv/kernel/sys_hwprobe.c
+++ b/arch/riscv/kernel/sys_hwprobe.c
@@ -107,6 +107,8 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair,
EXT_KEY(ZCB);
EXT_KEY(ZCMOP);
EXT_KEY(ZICBOZ);
+   EXT_KEY(ZICFILP);
+   EXT_KEY(ZICFISS);
EXT_KEY(ZICOND);
EXT_KEY(ZIHINTNTL);
EXT_KEY(ZIHINTPAUSE);

-- 
2.45.0




[PATCH 30/33] riscv: create a config for shadow stack and landing pad instr support

2024-10-01 Thread Deepak Gupta
This patch creates a config for shadow stack support and landing pad
instruction support. Both can be enabled by selecting
`CONFIG_RISCV_USER_CFI`, which wires up the path to enumerate CPU support;
if CPU support exists, the kernel will support CPU assisted user mode cfi.

If CONFIG_RISCV_USER_CFI is selected, select `ARCH_USES_HIGH_VMA_FLAGS`
and `ARCH_HAS_USER_SHADOW_STACK` for riscv.

Signed-off-by: Deepak Gupta 
---
 arch/riscv/Kconfig | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 808ea66b9537..d0cc2879fcd4 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -245,6 +245,25 @@ config ARCH_HAS_BROKEN_DWARF5
# 
https://github.com/llvm/llvm-project/commit/7ffabb61a5569444b5ac9322e22e5471cc5e4a77
depends on LD_IS_LLD && LLD_VERSION < 18
 
+config RISCV_USER_CFI
+   def_bool y
+   bool "riscv userspace control flow integrity"
+   depends on 64BIT && $(cc-option,-mabi=lp64 -march=rv64ima_zicfiss)
+   depends on RISCV_ALTERNATIVE
+   select ARCH_HAS_USER_SHADOW_STACK
+   select ARCH_USES_HIGH_VMA_FLAGS
+   help
+ Provides CPU assisted control flow integrity to userspace tasks.
+ Control flow integrity is provided by implementing shadow stack for
+ backward edge and indirect branch tracking for forward edge in program.
+ Shadow stack protection is a hardware feature that detects function
+ return address corruption. This helps mitigate ROP attacks.
+ Indirect branch tracking enforces that all indirect branches must land
+ on a landing pad instruction else CPU will fault. This mitigates against
+ JOP / COP attacks. Applications must be enabled to use it, and old
+ user-space does not get protection "for free".
+ default y
+
 config ARCH_MMAP_RND_BITS_MIN
default 18 if 64BIT
default 8

-- 
2.45.0




[PATCH 15/33] riscv/mm: Implement map_shadow_stack() syscall

2024-10-01 Thread Deepak Gupta
As discussed extensively in the changelog for the addition of this
syscall on x86 ("x86/shstk: Introduce map_shadow_stack syscall") the
existing mmap() and madvise() syscalls do not map entirely well onto the
security requirements for shadow stack memory since they lead to windows
where memory is allocated but not yet protected or stacks which are not
properly and safely initialised. Instead a new syscall map_shadow_stack()
has been defined which allocates and initialises a shadow stack page.

This patch implements this syscall for riscv. riscv doesn't require a token
to be set up by the kernel because user mode can do that by itself. However, to
provide compatibility and portability with other architectures, user mode
can specify the token set flag.
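
For example, a threading library could carve out a new shadow stack with a
restore token at the top along these lines (a sketch; assumes headers
exposing __NR_map_shadow_stack and SHADOW_STACK_SET_TOKEN):

  #include <sys/syscall.h>
  #include <unistd.h>

  /* returns address of the restore token, or NULL on failure;
   * with addr == NULL the kernel picks a suitable address */
  static void *new_shadow_stack(size_t size)
  {
          long ssp = syscall(__NR_map_shadow_stack, NULL, size,
                             SHADOW_STACK_SET_TOKEN);

          return ssp == -1 ? NULL : (void *)ssp;
  }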

Signed-off-by: Deepak Gupta 
---
 arch/riscv/kernel/Makefile  |   2 +
 arch/riscv/kernel/usercfi.c | 145 
 include/uapi/asm-generic/mman.h |   4 ++
 3 files changed, 151 insertions(+)

diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 7f88cc4931f5..eb2c94dd0a9d 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -117,3 +117,5 @@ obj-$(CONFIG_COMPAT)+= compat_vdso/
 obj-$(CONFIG_64BIT)+= pi/
 obj-$(CONFIG_ACPI) += acpi.o
 obj-$(CONFIG_ACPI_NUMA)+= acpi_numa.o
+
+obj-$(CONFIG_RISCV_USER_CFI) += usercfi.o
diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c
new file mode 100644
index ..ce002eabbdc1
--- /dev/null
+++ b/arch/riscv/kernel/usercfi.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Rivos, Inc.
+ * Deepak Gupta 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define SHSTK_ENTRY_SIZE sizeof(void *)
+
+/*
+ * Writes on shadow stack can either be `sspush` or `ssamoswap`. `sspush` can happen
+ * implicitly on current shadow stack pointed to by CSR_SSP. `ssamoswap` takes pointer to
+ * shadow stack. To keep it simple, we plan to use `ssamoswap` to perform writes on shadow
+ * stack.
+ */
+static noinline unsigned long amo_user_shstk(unsigned long *addr, unsigned long val)
+{
+   /*
+* Never expect -1 on shadow stack. Expect return addresses and zero
+*/
+   unsigned long swap = -1;
+
+   __enable_user_access();
+   asm goto(
+   ".option push\n"
+   ".option arch, +zicfiss\n"
+   "1: ssamoswap.d %[swap], %[val], %[addr]\n"
+   _ASM_EXTABLE(1b, %l[fault])
+   RISCV_ACQUIRE_BARRIER
+   ".option pop\n"
+   : [swap] "=r" (swap), [addr] "+A" (*addr)
+   : [val] "r" (val)
+   : "memory"
+   : fault
+   );
+   __disable_user_access();
+   return swap;
+fault:
+   __disable_user_access();
+   return -1;
+}
+
+/*
+ * Create a restore token on the shadow stack.  A token is always XLEN wide
+ * and aligned to XLEN.
+ */
+static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
+{
+   unsigned long addr;
+
+   /* Token must be aligned */
+   if (!IS_ALIGNED(ssp, SHSTK_ENTRY_SIZE))
+   return -EINVAL;
+
+   /* On RISC-V we're constructing token to be function of address itself */
+   addr = ssp - SHSTK_ENTRY_SIZE;
+
+   if (amo_user_shstk((unsigned long __user *)addr, (unsigned long) ssp) == -1)
+   return -EFAULT;
+
+   if (token_addr)
+   *token_addr = addr;
+
+   return 0;
+}
+
+static unsigned long allocate_shadow_stack(unsigned long addr, unsigned long size,
+   unsigned long token_offset,
+   bool set_tok)
+{
+   int flags = MAP_ANONYMOUS | MAP_PRIVATE;
+   struct mm_struct *mm = current->mm;
+   unsigned long populate, tok_loc = 0;
+
+   if (addr)
+   flags |= MAP_FIXED_NOREPLACE;
+
+   mmap_write_lock(mm);
+   addr = do_mmap(NULL, addr, size, PROT_READ, flags,
+   VM_SHADOW_STACK | VM_WRITE, 0, &populate, NULL);
+   mmap_write_unlock(mm);
+
+   if (!set_tok || IS_ERR_VALUE(addr))
+   goto out;
+
+   if (create_rstor_token(addr + token_offset, &tok_loc)) {
+   vm_munmap(addr, size);
+   return -EINVAL;
+   }
+
+   addr = tok_loc;
+
+out:
+   return addr;
+}
+
+SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
+{
+   bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
+   unsigned long aligned_size = 0;
+
+   if (!cpu_supports_shadow_stack())
+   return -EOPNOTSUPP;
+
+   /* Anything other than set 

[PATCH 23/33] riscv signal: save and restore of shadow stack for signal

2024-10-01 Thread Deepak Gupta
Save shadow stack pointer in sigcontext structure while delivering signal.
Restore shadow stack pointer from sigcontext on sigreturn.

As part of the save operation, the kernel uses `ssamoswap` to save a
snapshot of the current shadow stack on the shadow stack itself (which can
be called a save token). During restore on sigreturn, the kernel retrieves
the token from the top of the shadow stack and validates it. This ensures
that user mode can't arbitrarily pivot to any shadow stack address without
having a token, and thus provides a strong security assurance between
signal delivery and the sigreturn window.

Use an ABI compatible way of saving/restoring the shadow stack pointer into
the signal stack. This follows the approach of the Vector extension, where
extra registers are placed in the form of extension header + extension body
on the stack. The extension header indicates the size of the extra
architectural state plus the size of the header itself, and a magic
identifier of the extension. Then the extension body contains the new
architectural state in the form defined by uapi.
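
Conceptually, the frame for this extension looks as below (a sketch based
on the hunks in this patch; __riscv_ctx_hdr is the existing generic
extension header):

  struct __riscv_ctx_hdr {
          __u32 magic;    /* RISCV_ZICFISS_MAGIC for this extension */
          __u32 size;     /* sizeof(header) + sizeof(body) */
  };

  /* body that follows the header */
  struct __sc_riscv_cfi_state {
          unsigned long ss_ptr;   /* shadow stack pointer */
  };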

Signed-off-by: Andy Chiu 
Signed-off-by: Deepak Gupta 
---
 arch/riscv/include/asm/usercfi.h | 10 
 arch/riscv/include/uapi/asm/ptrace.h |  4 ++
 arch/riscv/include/uapi/asm/sigcontext.h |  1 +
 arch/riscv/kernel/signal.c   | 80 
 arch/riscv/kernel/usercfi.c  | 57 +++
 5 files changed, 152 insertions(+)

diff --git a/arch/riscv/include/asm/usercfi.h b/arch/riscv/include/asm/usercfi.h
index 099204d0cd4a..8da61b005d0a 100644
--- a/arch/riscv/include/asm/usercfi.h
+++ b/arch/riscv/include/asm/usercfi.h
@@ -8,6 +8,7 @@
 #ifndef __ASSEMBLY__
 #include 
 #include 
+#include 
 
 struct task_struct;
 struct kernel_clone_args;
@@ -35,6 +36,9 @@ bool is_shstk_locked(struct task_struct *task);
 bool is_shstk_allocated(struct task_struct *task);
 void set_shstk_lock(struct task_struct *task);
 void set_shstk_status(struct task_struct *task, bool enable);
+unsigned long get_active_shstk(struct task_struct *task);
+int restore_user_shstk(struct task_struct *tsk, unsigned long shstk_ptr);
+int save_user_shstk(struct task_struct *tsk, unsigned long *saved_shstk_ptr);
 bool is_indir_lp_enabled(struct task_struct *task);
 bool is_indir_lp_locked(struct task_struct *task);
 void set_indir_lp_status(struct task_struct *task, bool enable);
@@ -72,6 +76,12 @@ void set_indir_lp_lock(struct task_struct *task);
 
 #define set_indir_lp_lock(task)
 
+#define restore_user_shstk(tsk, shstk_ptr) -EINVAL
+
+#define save_user_shstk(tsk, saved_shstk_ptr) -EINVAL
+
+#define get_active_shstk(task) 0
+
 #endif /* CONFIG_RISCV_USER_CFI */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/riscv/include/uapi/asm/ptrace.h 
b/arch/riscv/include/uapi/asm/ptrace.h
index a38268b19c3d..659ea3af5680 100644
--- a/arch/riscv/include/uapi/asm/ptrace.h
+++ b/arch/riscv/include/uapi/asm/ptrace.h
@@ -127,6 +127,10 @@ struct __riscv_v_regset_state {
  */
 #define RISCV_MAX_VLENB (8192)
 
+struct __sc_riscv_cfi_state {
+   unsigned long ss_ptr;   /* shadow stack pointer */
+};
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _UAPI_ASM_RISCV_PTRACE_H */
diff --git a/arch/riscv/include/uapi/asm/sigcontext.h 
b/arch/riscv/include/uapi/asm/sigcontext.h
index cd4f175dc837..f37e4beffe03 100644
--- a/arch/riscv/include/uapi/asm/sigcontext.h
+++ b/arch/riscv/include/uapi/asm/sigcontext.h
@@ -10,6 +10,7 @@
 
 /* The Magic number for signal context frame header. */
 #define RISCV_V_MAGIC  0x53465457
+#define RISCV_ZICFISS_MAGIC0x9487
 #define END_MAGIC  0x0
 
 /* The size of END signal context header. */
diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c
index 014ac1024b85..77cbc4a01e49 100644
--- a/arch/riscv/kernel/signal.c
+++ b/arch/riscv/kernel/signal.c
@@ -22,11 +22,13 @@
 #include 
 #include 
 #include 
+#include 
 
 unsigned long signal_minsigstksz __ro_after_init;
 
 extern u32 __user_rt_sigreturn[2];
 static size_t riscv_v_sc_size __ro_after_init;
+static size_t riscv_zicfiss_sc_size __ro_after_init;
 
 #define DEBUG_SIG 0
 
@@ -139,6 +141,62 @@ static long __restore_v_state(struct pt_regs *regs, void __user *sc_vec)
	return copy_from_user(current->thread.vstate.datap, datap, riscv_v_vsize);
 }
 
+static long save_cfiss_state(struct pt_regs *regs, void __user *sc_cfi)
+{
+   struct __sc_riscv_cfi_state __user *state = sc_cfi;
+   unsigned long ss_ptr = 0;
+   long err = 0;
+
+   if (!IS_ENABLED(CONFIG_RISCV_USER_CFI) || !is_shstk_enabled(current))
+   return 0;
+
+   /*
+* Save a pointer to shadow stack itself on shadow stack as a form of token.
+* A token on shadow gives following properties
+* - Safe save and restore for shadow stack switching. Any save of shadow stack
+*   must have had saved a token on shadow stack. Similarly any restore of shadow
+*   stack must check the token before restore. Since writing to shadow stack with
+*   address o

[PATCH 10/33] riscv: usercfi state for task and save/restore of CSR_SSP on trap entry/exit

2024-10-01 Thread Deepak Gupta
Carves out space in arch specific thread struct for cfi status and shadow
stack in usermode on riscv.

This patch does the following:
- defines a new structure cfi_status with status bit for cfi feature
- defines shadow stack pointer, base and size in cfi_status structure
- defines offsets to new member fields in thread in asm-offsets.c
- Saves and restore shadow stack pointer on trap entry (U --> S) and exit
  (S --> U)

Shadow stack save/restore is gated on feature availability and implemented
using alternatives. The CSR could be context switched in `switch_to` as
well, but as soon as kernel shadow stack support gets rolled in, the shadow
stack pointer will need to be switched at trap entry/exit (much like `sp`).
It can be argued that the kernel shadow stack deployment scenario may not
be as prevalent as user mode using this feature. But even a minimal
deployment of kernel shadow stacks means it needs to be supported, hence
the save/restore of the shadow stack pointer in entry.S instead of in
`switch_to.h`.

Signed-off-by: Deepak Gupta 
Reviewed-by: Charlie Jenkins 
---
 arch/riscv/include/asm/processor.h   |  1 +
 arch/riscv/include/asm/thread_info.h |  3 +++
 arch/riscv/include/asm/usercfi.h | 24 
 arch/riscv/kernel/asm-offsets.c  |  4 
 arch/riscv/kernel/entry.S| 26 ++
 5 files changed, 58 insertions(+)

diff --git a/arch/riscv/include/asm/processor.h 
b/arch/riscv/include/asm/processor.h
index 9ea0021a1a75..0e05c9682b3c 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -14,6 +14,7 @@
 
 #include 
 #include 
+#include 
 
 #define arch_get_mmap_end(addr, len, flags)\
 ({ \
diff --git a/arch/riscv/include/asm/thread_info.h 
b/arch/riscv/include/asm/thread_info.h
index e494871071da..ed9e6cbacaa5 100644
--- a/arch/riscv/include/asm/thread_info.h
+++ b/arch/riscv/include/asm/thread_info.h
@@ -58,6 +58,9 @@ struct thread_info {
int cpu;
unsigned long   syscall_work;   /* SYSCALL_WORK_ flags */
unsigned long envcfg;
+#ifdef CONFIG_RISCV_USER_CFI
+   struct cfi_status   user_cfi_state;
+#endif
 #ifdef CONFIG_SHADOW_CALL_STACK
void*scs_base;
void*scs_sp;
diff --git a/arch/riscv/include/asm/usercfi.h b/arch/riscv/include/asm/usercfi.h
new file mode 100644
index ..4fa201b4fc4e
--- /dev/null
+++ b/arch/riscv/include/asm/usercfi.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (C) 2024 Rivos, Inc.
+ * Deepak Gupta 
+ */
+#ifndef _ASM_RISCV_USERCFI_H
+#define _ASM_RISCV_USERCFI_H
+
+#ifndef __ASSEMBLY__
+#include 
+
+#ifdef CONFIG_RISCV_USER_CFI
+struct cfi_status {
+   unsigned long ubcfi_en : 1; /* Enable for backward cfi. */
+   unsigned long rsvd : ((sizeof(unsigned long)*8) - 1);
+   unsigned long user_shdw_stk; /* Current user shadow stack pointer */
+   unsigned long shdw_stk_base; /* Base address of shadow stack */
+   unsigned long shdw_stk_size; /* size of shadow stack */
+};
+
+#endif /* CONFIG_RISCV_USER_CFI */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_RISCV_USERCFI_H */
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index e94180ba432f..766bd33f10cb 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -52,6 +52,10 @@ void asm_offsets(void)
 #endif
 
OFFSET(TASK_TI_CPU_NUM, task_struct, thread_info.cpu);
+#ifdef CONFIG_RISCV_USER_CFI
+   OFFSET(TASK_TI_CFI_STATUS, task_struct, thread_info.user_cfi_state);
+   OFFSET(TASK_TI_USER_SSP, task_struct, thread_info.user_cfi_state.user_shdw_stk);
+#endif
OFFSET(TASK_THREAD_F0,  task_struct, thread.fstate.f[0]);
OFFSET(TASK_THREAD_F1,  task_struct, thread.fstate.f[1]);
OFFSET(TASK_THREAD_F2,  task_struct, thread.fstate.f[2]);
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index c200d329d4bd..8f7f477517e3 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -147,6 +147,20 @@ SYM_CODE_START(handle_exception)
 
REG_L s0, TASK_TI_USER_SP(tp)
csrrc s1, CSR_STATUS, t0
+   /*
+* If previous mode was U, capture shadow stack pointer and save it away
+* Zero CSR_SSP at the same time for sanitization.
+*/
+   ALTERNATIVE("nop; nop; nop; nop",
+   __stringify(\
+   andi s2, s1, SR_SPP;\
+   bnez s2, skip_ssp_save; \
+   csrrw s2, CSR_SSP, x0;  \
+   REG_S s2, TASK_TI_USER_SSP(tp); \
+   skip_ssp_save:),
+   0,
+   RISCV_ISA_EXT_ZICFISS,
+   CONF

[PATCH 09/33] riscv: zicfiss / zicfilp extension csr and bit definitions

2024-10-01 Thread Deepak Gupta
The zicfiss and zicfilp extensions get enabled via bits 3 and 2
respectively in the *envcfg CSR. menvcfg controls enabling for S/HS mode,
henvcfg controls enabling for VS mode, while senvcfg controls enabling for
U/VU mode.

The zicfilp extension extends the *status CSR to hold an `expected landing
pad` bit. A trap or interrupt can occur between an indirect jmp/call and
the target instruction. The `expected landing pad` bit from the CPU is
recorded into the xstatus CSR so that when the supervisor performs an xret,
the `expected landing pad` state of the CPU can be restored.

zicfiss adds one new CSR
- CSR_SSP: CSR_SSP contains current shadow stack pointer.
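
Once SSE is enabled for a privilege mode, that mode can read its shadow
stack pointer directly; e.g. (a sketch, needs an assembler that accepts
zicfiss):

  static inline unsigned long read_ssp(void)
  {
          unsigned long ssp;

          asm volatile(".option push\n"
                       ".option arch, +zicfiss\n"
                       "csrr %0, ssp\n"
                       ".option pop\n"
                       : "=r" (ssp));
          return ssp;
  }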

Signed-off-by: Deepak Gupta 
Reviewed-by: Charlie Jenkins 
---
 arch/riscv/include/asm/csr.h | 16 
 1 file changed, 16 insertions(+)

diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
index 25966995da04..af7ed9bedaee 100644
--- a/arch/riscv/include/asm/csr.h
+++ b/arch/riscv/include/asm/csr.h
@@ -18,6 +18,15 @@
 #define SR_MPP _AC(0x1800, UL) /* Previously Machine */
 #define SR_SUM _AC(0x0004, UL) /* Supervisor User Memory Access */
 
+/* zicfilp landing pad status bit */
+#define SR_SPELP   _AC(0x0080, UL)
+#define SR_MPELP   _AC(0x0200, UL)
+#ifdef CONFIG_RISCV_M_MODE
+#define SR_ELP SR_MPELP
+#else
+#define SR_ELP SR_SPELP
+#endif
+
 #define SR_FS  _AC(0x6000, UL) /* Floating-point Status */
 #define SR_FS_OFF  _AC(0x, UL)
 #define SR_FS_INITIAL  _AC(0x2000, UL)
@@ -197,6 +206,8 @@
 #define ENVCFG_PBMTE   (_AC(1, ULL) << 62)
 #define ENVCFG_CBZE(_AC(1, UL) << 7)
 #define ENVCFG_CBCFE   (_AC(1, UL) << 6)
+#define ENVCFG_LPE (_AC(1, UL) << 2)
+#define ENVCFG_SSE (_AC(1, UL) << 3)
 #define ENVCFG_CBIE_SHIFT  4
 #define ENVCFG_CBIE(_AC(0x3, UL) << ENVCFG_CBIE_SHIFT)
 #define ENVCFG_CBIE_ILL_AC(0x0, UL)
@@ -215,6 +226,11 @@
 #define SMSTATEEN0_HSENVCFG(_ULL(1) << SMSTATEEN0_HSENVCFG_SHIFT)
 #define SMSTATEEN0_SSTATEEN0_SHIFT 63
 #define SMSTATEEN0_SSTATEEN0   (_ULL(1) << SMSTATEEN0_SSTATEEN0_SHIFT)
+/*
+ * zicfiss user mode csr
+ * CSR_SSP holds current shadow stack pointer.
+ */
+#define CSR_SSP 0x011
 
 /* symbolic CSR names: */
 #define CSR_CYCLE  0xc00

-- 
2.45.0




[PATCH 08/33] riscv: zicfiss / zicfilp enumeration

2024-10-01 Thread Deepak Gupta
This patch adds support for detecting zicfiss and zicfilp. zicfiss and
zicfilp stand for the unprivileged integer spec extensions for shadow stack
and branch tracking on indirect branches, respectively.

This patch looks for zicfiss and zicfilp in the device tree and accordingly
lights up the bit in the CPU feature bitmap. Furthermore, this patch adds
detection utility functions that return whether shadow stacks or landing
pads are supported by the CPU.
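
On device tree based systems this only requires the extensions to be listed
in the ISA extension property; an illustrative fragment:

  cpu@0 {
          ...
          riscv,isa-extensions = "i", "m", "a", "zicsr", "zicfilp", "zicfiss";
  };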

Signed-off-by: Deepak Gupta 
---
 arch/riscv/include/asm/cpufeature.h | 13 +
 arch/riscv/include/asm/hwcap.h  |  2 ++
 arch/riscv/include/asm/processor.h  |  1 +
 arch/riscv/kernel/cpufeature.c  |  2 ++
 4 files changed, 18 insertions(+)

diff --git a/arch/riscv/include/asm/cpufeature.h 
b/arch/riscv/include/asm/cpufeature.h
index ce9a995730c1..344b8e8cd3e8 100644
--- a/arch/riscv/include/asm/cpufeature.h
+++ b/arch/riscv/include/asm/cpufeature.h
@@ -8,6 +8,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -180,4 +181,16 @@ static __always_inline bool riscv_cpu_has_extension_unlikely(int cpu, const unsigned long ext)
	return __riscv_isa_extension_available(hart_isa[cpu].isa, ext);
 }
 
+static inline bool cpu_supports_shadow_stack(void)
+{
+   return (IS_ENABLED(CONFIG_RISCV_USER_CFI) &&
+   riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICFISS));
+}
+
+static inline bool cpu_supports_indirect_br_lp_instr(void)
+{
+   return (IS_ENABLED(CONFIG_RISCV_USER_CFI) &&
+   riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICFILP));
+}
+
 #endif
diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
index 46d9de54179e..10d315a6ef0e 100644
--- a/arch/riscv/include/asm/hwcap.h
+++ b/arch/riscv/include/asm/hwcap.h
@@ -93,6 +93,8 @@
 #define RISCV_ISA_EXT_ZCMOP84
 #define RISCV_ISA_EXT_ZAWRS85
 #define RISCV_ISA_EXT_SVVPTC   86
+#define RISCV_ISA_EXT_ZICFILP  87
+#define RISCV_ISA_EXT_ZICFISS  88
 
 #define RISCV_ISA_EXT_XLINUXENVCFG 127
 
diff --git a/arch/riscv/include/asm/processor.h 
b/arch/riscv/include/asm/processor.h
index efa1b3519b23..9ea0021a1a75 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -13,6 +13,7 @@
 #include 
 
 #include 
+#include 
 
 #define arch_get_mmap_end(addr, len, flags)\
 ({ \
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index 7117366d80db..96a1375d7171 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -317,6 +317,8 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
  riscv_ext_zicbom_validate),
__RISCV_ISA_EXT_SUPERSET_VALIDATE(zicboz, RISCV_ISA_EXT_ZICBOZ, 
riscv_xlinuxenvcfg_exts,
  riscv_ext_zicboz_validate),
+   __RISCV_ISA_EXT_SUPERSET(zicfilp, RISCV_ISA_EXT_ZICFILP, 
riscv_xlinuxenvcfg_exts),
+   __RISCV_ISA_EXT_SUPERSET(zicfiss, RISCV_ISA_EXT_ZICFISS, 
riscv_xlinuxenvcfg_exts),
__RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
__RISCV_ISA_EXT_DATA(zicond, RISCV_ISA_EXT_ZICOND),
__RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR),

-- 
2.45.0




[PATCH 12/33] riscv mm: manufacture shadow stack pte

2024-10-01 Thread Deepak Gupta
This patch implements creating shadow stack PTEs (on riscv). Creating a
shadow stack PTE on riscv means clearing RWX and then setting W=1.
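
For reference, the relevant leaf PTE XWR encodings then look as below (per
the privileged spec, with Zicfiss giving meaning to the previously reserved
010 encoding):

  XWR = 000 : not a leaf PTE (pointer to next level)
  XWR = 001 : read-only
  XWR = 010 : shadow stack (writeable only via sspush/ssamoswap)
  XWR = 011 : read-write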

Signed-off-by: Deepak Gupta 
Reviewed-by: Alexandre Ghiti 
---
 arch/riscv/include/asm/pgtable.h | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 4948a1f18ae8..2c6edc8d04a3 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -421,6 +421,11 @@ static inline pte_t pte_mkwrite_novma(pte_t pte)
return __pte(pte_val(pte) | _PAGE_WRITE);
 }
 
+static inline pte_t pte_mkwrite_shstk(pte_t pte)
+{
+   return __pte((pte_val(pte) & ~(_PAGE_LEAF)) | _PAGE_WRITE);
+}
+
 /* static inline pte_t pte_mkexec(pte_t pte) */
 
 static inline pte_t pte_mkdirty(pte_t pte)
@@ -738,6 +743,11 @@ static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
return pte_pmd(pte_mkwrite_novma(pmd_pte(pmd)));
 }
 
+static inline pmd_t pmd_mkwrite_shstk(pmd_t pte)
+{
+   return __pmd((pmd_val(pte) & ~(_PAGE_LEAF)) | _PAGE_WRITE);
+}
+
 static inline pmd_t pmd_wrprotect(pmd_t pmd)
 {
return pte_pmd(pte_wrprotect(pmd_pte(pmd)));

-- 
2.45.0




[PATCH 19/33] riscv: Implements arch agnostic shadow stack prctls

2024-10-01 Thread Deepak Gupta
Implement the architecture agnostic prctl() interface for setting and
getting shadow stack status.

prctls implemented are PR_GET_SHADOW_STACK_STATUS,
PR_SET_SHADOW_STACK_STATUS and PR_LOCK_SHADOW_STACK_STATUS.

As part of PR_SET_SHADOW_STACK_STATUS/PR_GET_SHADOW_STACK_STATUS, only
PR_SHADOW_STACK_ENABLE is implemented because RISC-V allows each mode to
write to its own shadow stack using `sspush` or `ssamoswap`.

PR_LOCK_SHADOW_STACK_STATUS locks current configuration of shadow stack
enabling.
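
Taken together, a dynamic loader could drive these roughly as below (a
sketch; the constants come from the arch-agnostic prctl series):

  #include <sys/prctl.h>

  static int enable_and_lock_shadow_stack(void)
  {
          unsigned long status = 0;

          /* fails with -EINVAL if the kernel/CPU lack support */
          if (prctl(PR_GET_SHADOW_STACK_STATUS, &status, 0, 0, 0))
                  return -1;

          if (prctl(PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0))
                  return -1;

          /* forbid any further changes to the configuration */
          return prctl(PR_LOCK_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0);
  }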

Signed-off-by: Deepak Gupta 
---
 arch/riscv/include/asm/usercfi.h |  18 ++-
 arch/riscv/kernel/process.c  |   8 +++
 arch/riscv/kernel/usercfi.c  | 107 +++
 3 files changed, 132 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/include/asm/usercfi.h b/arch/riscv/include/asm/usercfi.h
index 719e28e043c8..52850a2c79cf 100644
--- a/arch/riscv/include/asm/usercfi.h
+++ b/arch/riscv/include/asm/usercfi.h
@@ -7,6 +7,7 @@
 
 #ifndef __ASSEMBLY__
 #include 
+#include 
 
 struct task_struct;
 struct kernel_clone_args;
@@ -14,7 +15,8 @@ struct kernel_clone_args;
 #ifdef CONFIG_RISCV_USER_CFI
 struct cfi_status {
unsigned long ubcfi_en : 1; /* Enable for backward cfi. */
-   unsigned long rsvd : ((sizeof(unsigned long)*8) - 1);
+   unsigned long ubcfi_locked : 1;
+   unsigned long rsvd : ((sizeof(unsigned long)*8) - 2);
unsigned long user_shdw_stk; /* Current user shadow stack pointer */
unsigned long shdw_stk_base; /* Base address of shadow stack */
unsigned long shdw_stk_size; /* size of shadow stack */
@@ -27,6 +29,12 @@ void set_shstk_base(struct task_struct *task, unsigned long 
shstk_addr, unsigned
 unsigned long get_shstk_base(struct task_struct *task, unsigned long *size);
 void set_active_shstk(struct task_struct *task, unsigned long shstk_addr);
 bool is_shstk_enabled(struct task_struct *task);
+bool is_shstk_locked(struct task_struct *task);
+bool is_shstk_allocated(struct task_struct *task);
+void set_shstk_lock(struct task_struct *task);
+void set_shstk_status(struct task_struct *task, bool enable);
+
+#define PR_SHADOW_STACK_SUPPORTED_STATUS_MASK (PR_SHADOW_STACK_ENABLE)
 
 #else
 
@@ -42,6 +50,14 @@ bool is_shstk_enabled(struct task_struct *task);
 
 #define is_shstk_enabled(task) false
 
+#define is_shstk_locked(task) false
+
+#define is_shstk_allocated(task) false
+
+#define set_shstk_lock(task)
+
+#define set_shstk_status(task, enable)
+
 #endif /* CONFIG_RISCV_USER_CFI */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index f6f58b1ed905..f7dec532657f 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -152,6 +152,14 @@ void start_thread(struct pt_regs *regs, unsigned long pc,
regs->epc = pc;
regs->sp = sp;
 
+   /*
+* clear shadow stack state on exec.
+* libc will set it later via prctl.
+*/
+   set_shstk_status(current, false);
+   set_shstk_base(current, 0, 0);
+   set_active_shstk(current, 0);
+
 #ifdef CONFIG_64BIT
regs->status &= ~SR_UXL;
 
diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c
index 7a7f0b57b2d4..c77abe552c88 100644
--- a/arch/riscv/kernel/usercfi.c
+++ b/arch/riscv/kernel/usercfi.c
@@ -24,6 +24,16 @@ bool is_shstk_enabled(struct task_struct *task)
return task->thread_info.user_cfi_state.ubcfi_en ? true : false;
 }
 
+bool is_shstk_allocated(struct task_struct *task)
+{
+   return task->thread_info.user_cfi_state.shdw_stk_base ? true : false;
+}
+
+bool is_shstk_locked(struct task_struct *task)
+{
+   return task->thread_info.user_cfi_state.ubcfi_locked ? true : false;
+}
+
 void set_shstk_base(struct task_struct *task, unsigned long shstk_addr, unsigned long size)
 {
task->thread_info.user_cfi_state.shdw_stk_base = shstk_addr;
@@ -42,6 +52,23 @@ void set_active_shstk(struct task_struct *task, unsigned long shstk_addr)
task->thread_info.user_cfi_state.user_shdw_stk = shstk_addr;
 }
 
+void set_shstk_status(struct task_struct *task, bool enable)
+{
+   task->thread_info.user_cfi_state.ubcfi_en = enable ? 1 : 0;
+
+   if (enable)
+   task->thread_info.envcfg |= ENVCFG_SSE;
+   else
+   task->thread_info.envcfg &= ~ENVCFG_SSE;
+
+   csr_write(CSR_ENVCFG, task->thread_info.envcfg);
+}
+
+void set_shstk_lock(struct task_struct *task)
+{
+   task->thread_info.user_cfi_state.ubcfi_locked = 1;
+}
+
 /*
  * If size is 0, then to be compatible with regular stack we want it to be as big as
  * regular stack. Else PAGE_ALIGN it and return back
@@ -264,3 +291,83 @@ void shstk_release(struct task_struct *tsk)
vm_munmap(base, size);
set_shstk_base(tsk, 0, 0);
 }
+
+int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status)
+{
+   unsigned long bcfi_status = 0;
+
+   if (!cpu_supports_shadow_stack())
+   return

[PATCH 20/33] riscv: Implements arch agnostic indirect branch tracking prctls

2024-10-01 Thread Deepak Gupta
prctls implemented are:
PR_SET_INDIR_BR_LP_STATUS, PR_GET_INDIR_BR_LP_STATUS and
PR_LOCK_INDIR_BR_LP_STATUS.

On trap entry, ELP state is recorded in sstatus image on stack and SR_ELP
in CSR_STATUS is cleared.
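
Usage mirrors the shadow stack prctls; e.g. (sketch):

  prctl(PR_SET_INDIR_BR_LP_STATUS, PR_INDIR_BR_LP_ENABLE, 0, 0, 0);
  prctl(PR_LOCK_INDIR_BR_LP_STATUS, PR_INDIR_BR_LP_ENABLE, 0, 0, 0);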

Signed-off-by: Deepak Gupta 
---
 arch/riscv/include/asm/usercfi.h | 16 -
 arch/riscv/kernel/entry.S|  2 +-
 arch/riscv/kernel/process.c  |  5 +++
 arch/riscv/kernel/usercfi.c  | 76 
 4 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/include/asm/usercfi.h b/arch/riscv/include/asm/usercfi.h
index 52850a2c79cf..099204d0cd4a 100644
--- a/arch/riscv/include/asm/usercfi.h
+++ b/arch/riscv/include/asm/usercfi.h
@@ -16,7 +16,9 @@ struct kernel_clone_args;
 struct cfi_status {
unsigned long ubcfi_en : 1; /* Enable for backward cfi. */
unsigned long ubcfi_locked : 1;
-   unsigned long rsvd : ((sizeof(unsigned long)*8) - 2);
+   unsigned long ufcfi_en : 1; /* Enable for forward cfi. Note that ELP goes in sstatus */
+   unsigned long ufcfi_locked : 1;
+   unsigned long rsvd : ((sizeof(unsigned long)*8) - 4);
unsigned long user_shdw_stk; /* Current user shadow stack pointer */
unsigned long shdw_stk_base; /* Base address of shadow stack */
unsigned long shdw_stk_size; /* size of shadow stack */
@@ -33,6 +35,10 @@ bool is_shstk_locked(struct task_struct *task);
 bool is_shstk_allocated(struct task_struct *task);
 void set_shstk_lock(struct task_struct *task);
 void set_shstk_status(struct task_struct *task, bool enable);
+bool is_indir_lp_enabled(struct task_struct *task);
+bool is_indir_lp_locked(struct task_struct *task);
+void set_indir_lp_status(struct task_struct *task, bool enable);
+void set_indir_lp_lock(struct task_struct *task);
 
 #define PR_SHADOW_STACK_SUPPORTED_STATUS_MASK (PR_SHADOW_STACK_ENABLE)
 
@@ -58,6 +64,14 @@ void set_shstk_status(struct task_struct *task, bool enable);
 
 #define set_shstk_status(task, enable)
 
+#define is_indir_lp_enabled(task) false
+
+#define is_indir_lp_locked(task) false
+
+#define set_indir_lp_status(task, enable)
+
+#define set_indir_lp_lock(task)
+
 #endif /* CONFIG_RISCV_USER_CFI */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 8f7f477517e3..a1f258fd7bbc 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -143,7 +143,7 @@ SYM_CODE_START(handle_exception)
 * Disable the FPU/Vector to detect illegal usage of floating point
 * or vector in kernel space.
 */
-   li t0, SR_SUM | SR_FS_VS
+   li t0, SR_SUM | SR_FS_VS | SR_ELP
 
REG_L s0, TASK_TI_USER_SP(tp)
csrrc s1, CSR_STATUS, t0
diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index f7dec532657f..5207f018415c 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -159,6 +159,11 @@ void start_thread(struct pt_regs *regs, unsigned long pc,
set_shstk_status(current, false);
set_shstk_base(current, 0, 0);
set_active_shstk(current, 0);
+   /*
+* disable indirect branch tracking on exec.
+* libc will enable it later via prctl.
+*/
+   set_indir_lp_status(current, false);
 
 #ifdef CONFIG_64BIT
regs->status &= ~SR_UXL;
diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c
index c77abe552c88..8da509afdbe9 100644
--- a/arch/riscv/kernel/usercfi.c
+++ b/arch/riscv/kernel/usercfi.c
@@ -69,6 +69,32 @@ void set_shstk_lock(struct task_struct *task)
task->thread_info.user_cfi_state.ubcfi_locked = 1;
 }
 
+bool is_indir_lp_enabled(struct task_struct *task)
+{
+   return task->thread_info.user_cfi_state.ufcfi_en ? true : false;
+}
+
+bool is_indir_lp_locked(struct task_struct *task)
+{
+   return task->thread_info.user_cfi_state.ufcfi_locked ? true : false;
+}
+
+void set_indir_lp_status(struct task_struct *task, bool enable)
+{
+   task->thread_info.user_cfi_state.ufcfi_en = enable ? 1 : 0;
+
+   if (enable)
+   task->thread_info.envcfg |= ENVCFG_LPE;
+   else
+   task->thread_info.envcfg &= ~ENVCFG_LPE;
+
+   csr_write(CSR_ENVCFG, task->thread_info.envcfg);
+}
+
+void set_indir_lp_lock(struct task_struct *task)
+{
+   task->thread_info.user_cfi_state.ufcfi_locked = 1;
+}
 /*
  * If size is 0, then to be compatible with regular stack we want it to be as big as
  * regular stack. Else PAGE_ALIGN it and return back
@@ -371,3 +397,53 @@ int arch_lock_shadow_stack_status(struct task_struct *task,
 
return 0;
 }
+
+int arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status)
+{
+   unsigned long fcfi_status = 0;
+
+   if (!cpu_supports_indirect_br_lp_instr())
+   return -EINVAL;
+
+   /* indirect branch tracking is enabled on the task or not */
+   fcfi_status |= (is_indir_lp_enabled(t) ? PR_INDIR_BR_LP_ENABLE : 0);
+
+  

[PATCH 13/33] riscv mmu: teach pte_mkwrite to manufacture shadow stack PTEs

2024-10-01 Thread Deepak Gupta
pte_mkwrite creates PTEs with WRITE encodings for the underlying arch. The
underlying arch can have two types of writeable mappings: one that can be
written using regular store instructions, and another that can only be
written using specialized store instructions (like shadow stack stores).
pte_mkwrite can select the write PTE encoding based on the VMA range (i.e.
VM_SHADOW_STACK).

Signed-off-by: Deepak Gupta 
Reviewed-by: Alexandre Ghiti 
---
 arch/riscv/include/asm/pgtable.h |  7 +++
 arch/riscv/mm/pgtable.c  | 17 +
 2 files changed, 24 insertions(+)

diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 2c6edc8d04a3..7963ab11d924 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -416,6 +416,10 @@ static inline pte_t pte_wrprotect(pte_t pte)
 
 /* static inline pte_t pte_mkread(pte_t pte) */
 
+struct vm_area_struct;
+pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma);
+#define pte_mkwrite pte_mkwrite
+
 static inline pte_t pte_mkwrite_novma(pte_t pte)
 {
return __pte(pte_val(pte) | _PAGE_WRITE);
@@ -738,6 +742,9 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd)
return pte_pmd(pte_mkyoung(pmd_pte(pmd)));
 }
 
+pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+#define pmd_mkwrite pmd_mkwrite
+
 static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
 {
return pte_pmd(pte_mkwrite_novma(pmd_pte(pmd)));
diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c
index 4ae67324f992..be5d38546bb3 100644
--- a/arch/riscv/mm/pgtable.c
+++ b/arch/riscv/mm/pgtable.c
@@ -155,3 +155,20 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
return pmd;
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+   if (vma->vm_flags & VM_SHADOW_STACK)
+   return pte_mkwrite_shstk(pte);
+
+   return pte_mkwrite_novma(pte);
+}
+
+pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+{
+   if (vma->vm_flags & VM_SHADOW_STACK)
+   return pmd_mkwrite_shstk(pmd);
+
+   return pmd_mkwrite_novma(pmd);
+}
+

-- 
2.45.0




[PATCH 14/33] riscv mmu: write protect and shadow stack

2024-10-01 Thread Deepak Gupta
`fork` implements copy on write (COW) by making pages readonly in both
child and parent.

ptep_set_wrprotect and pte_wrprotect clear _PAGE_WRITE in the PTE. The
assumption is that the page is readable, and on a fault copy on write
happens.

To implement COW on shadow stack pages, clearing the W bit makes them
XWR = 000. This results in a wrong PTE setting which says no permissions
but V=1 and the PFN field pointing to the final page. Instead, the desired
behavior is to turn it into a readable page, take an access (load/store)
fault on sspush/sspop (shadow stack) and then perform COW on such pages.
This way regular reads are still allowed and do not lead to COW,
maintaining the current behavior of COW on non-shadow-stack but writeable
memory.

On the other hand it doesn't interfere with existing COW for read-write
memory. The assumption is always that _PAGE_READ must have been set and
thus setting _PAGE_READ is harmless.

Signed-off-by: Deepak Gupta 
Reviewed-by: Alexandre Ghiti 
---
 arch/riscv/include/asm/pgtable.h | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 7963ab11d924..fdab7d74437d 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -411,7 +411,7 @@ static inline int pte_devmap(pte_t pte)
 
 static inline pte_t pte_wrprotect(pte_t pte)
 {
-   return __pte(pte_val(pte) & ~(_PAGE_WRITE));
+   return __pte((pte_val(pte) & ~(_PAGE_WRITE)) | (_PAGE_READ));
 }
 
 /* static inline pte_t pte_mkread(pte_t pte) */
@@ -612,7 +612,15 @@ static inline pte_t ptep_get_and_clear(struct mm_struct 
*mm,
 static inline void ptep_set_wrprotect(struct mm_struct *mm,
  unsigned long address, pte_t *ptep)
 {
-   atomic_long_and(~(unsigned long)_PAGE_WRITE, (atomic_long_t *)ptep);
+   pte_t read_pte = READ_ONCE(*ptep);
+   /*
+* ptep_set_wrprotect can be called for shadow stack ranges too.
+* shadow stack memory is XWR = 010 and thus clearing _PAGE_WRITE will lead to
+* encoding 000b which is wrong encoding with V = 1. This should lead to page fault
+* but we dont want this wrong configuration to be set in page tables.
+*/
+   atomic_long_set((atomic_long_t *)ptep,
+   ((pte_val(read_pte) & ~(unsigned long)_PAGE_WRITE) | _PAGE_READ));
 }
 
 #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH

-- 
2.45.0




[PATCH 27/33] riscv: Add Firmware Feature SBI extensions definitions

2024-10-01 Thread Deepak Gupta
From: Clément Léger 

Add necessary SBI definitions to use the FWFT extension.
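
With these definitions in place, C code can flip a feature through the
usual ecall helper; e.g. enabling and locking S-mode shadow stack access
(a sketch mirroring what a later patch does in assembly):

  struct sbiret ret;

  ret = sbi_ecall(SBI_EXT_FWFT, SBI_EXT_FWFT_SET,
                  SBI_FWFT_SHADOW_STACK, 1 /* enable */,
                  SBI_FWFT_SET_FLAG_LOCK, 0, 0, 0);
  if (ret.error)
          pr_err("FWFT: enabling shadow stack failed: %ld\n", ret.error);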

Signed-off-by: Clément Léger 
---
 arch/riscv/include/asm/sbi.h | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
index 98f631b051db..754e5cdabf46 100644
--- a/arch/riscv/include/asm/sbi.h
+++ b/arch/riscv/include/asm/sbi.h
@@ -34,6 +34,7 @@ enum sbi_ext_id {
SBI_EXT_PMU = 0x504D55,
SBI_EXT_DBCN = 0x4442434E,
SBI_EXT_STA = 0x535441,
+   SBI_EXT_FWFT = 0x46574654,
 
/* Experimentals extensions must lie within this range */
SBI_EXT_EXPERIMENTAL_START = 0x0800,
@@ -281,6 +282,32 @@ struct sbi_sta_struct {
 
 #define SBI_SHMEM_DISABLE  -1
 
+/* SBI function IDs for FW feature extension */
+#define SBI_EXT_FWFT_SET   0x0
+#define SBI_EXT_FWFT_GET   0x1
+
+enum sbi_fwft_feature_t {
+   SBI_FWFT_MISALIGNED_EXC_DELEG   = 0x0,
+   SBI_FWFT_LANDING_PAD= 0x1,
+   SBI_FWFT_SHADOW_STACK   = 0x2,
+   SBI_FWFT_DOUBLE_TRAP= 0x3,
+   SBI_FWFT_PTE_AD_HW_UPDATING = 0x4,
+   SBI_FWFT_LOCAL_RESERVED_START   = 0x5,
+   SBI_FWFT_LOCAL_RESERVED_END = 0x3fff,
+   SBI_FWFT_LOCAL_PLATFORM_START   = 0x4000,
+   SBI_FWFT_LOCAL_PLATFORM_END = 0x7fff,
+
+   SBI_FWFT_GLOBAL_RESERVED_START  = 0x8000,
+   SBI_FWFT_GLOBAL_RESERVED_END= 0xbfff,
+   SBI_FWFT_GLOBAL_PLATFORM_START  = 0xc000,
+   SBI_FWFT_GLOBAL_PLATFORM_END= 0x,
+};
+
+#define SBI_FWFT_GLOBAL_FEATURE_BIT(1 << 31)
+#define SBI_FWFT_PLATFORM_FEATURE_BIT  (1 << 30)
+
+#define SBI_FWFT_SET_FLAG_LOCK (1 << 0)
+
 /* SBI spec version fields */
 #define SBI_SPEC_VERSION_DEFAULT   0x1
 #define SBI_SPEC_VERSION_MAJOR_SHIFT   24

-- 
2.45.0




[PATCH 16/33] riscv/shstk: If needed allocate a new shadow stack on clone

2024-10-01 Thread Deepak Gupta
Userspace specifies CLONE_VM to share the address space and spawn a new
thread. `clone` allows userspace to specify a new stack for the new thread.
However, there is no way to specify a new shadow stack base address without
changing the API. This patch allocates a new shadow stack whenever CLONE_VM
is given.

In case of CLONE_VFORK, the parent is suspended until the child finishes,
and thus the child can use the parent's shadow stack. In case of !CLONE_VM,
COW kicks in because the entire address space is copied from parent to
child.

`clone3` is extensible and can provide a mechanism by which a shadow stack
can be supplied as an input parameter. This is not settled yet and is being
extensively discussed on the mailing list. Once that's settled, this commit
will adapt to it.

Signed-off-by: Deepak Gupta 
---
 arch/riscv/include/asm/usercfi.h |  25 
 arch/riscv/kernel/process.c  |  11 +++-
 arch/riscv/kernel/usercfi.c  | 121 +++
 3 files changed, 156 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/include/asm/usercfi.h b/arch/riscv/include/asm/usercfi.h
index 4fa201b4fc4e..719e28e043c8 100644
--- a/arch/riscv/include/asm/usercfi.h
+++ b/arch/riscv/include/asm/usercfi.h
@@ -8,6 +8,9 @@
 #ifndef __ASSEMBLY__
 #include 
 
+struct task_struct;
+struct kernel_clone_args;
+
 #ifdef CONFIG_RISCV_USER_CFI
 struct cfi_status {
unsigned long ubcfi_en : 1; /* Enable for backward cfi. */
@@ -17,6 +20,28 @@ struct cfi_status {
unsigned long shdw_stk_size; /* size of shadow stack */
 };
 
+unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
+   const struct kernel_clone_args *args);
+void shstk_release(struct task_struct *tsk);
+void set_shstk_base(struct task_struct *task, unsigned long shstk_addr, unsigned long size);
+unsigned long get_shstk_base(struct task_struct *task, unsigned long *size);
+void set_active_shstk(struct task_struct *task, unsigned long shstk_addr);
+bool is_shstk_enabled(struct task_struct *task);
+
+#else
+
+#define shstk_alloc_thread_stack(tsk, args) 0
+
+#define shstk_release(tsk)
+
+#define get_shstk_base(task, size) 0
+
+#define set_shstk_base(task, shstk_addr, size)
+
+#define set_active_shstk(task, shstk_addr)
+
+#define is_shstk_enabled(task) false
+
 #endif /* CONFIG_RISCV_USER_CFI */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index 1f2574fb2edb..f6f58b1ed905 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -28,6 +28,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK)
 #include 
@@ -203,7 +204,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 
 void exit_thread(struct task_struct *tsk)
 {
-
+   shstk_release(tsk);
 }
 
 int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
@@ -211,6 +212,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
unsigned long clone_flags = args->flags;
unsigned long usp = args->stack;
unsigned long tls = args->tls;
+   unsigned long ssp = 0;
struct pt_regs *childregs = task_pt_regs(p);
 
memset(&p->thread.s, 0, sizeof(p->thread.s));
@@ -225,11 +227,18 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
p->thread.s[0] = (unsigned long)args->fn;
p->thread.s[1] = (unsigned long)args->fn_arg;
} else {
+   /* allocate new shadow stack if needed. In case of CLONE_VM we have to */
+   ssp = shstk_alloc_thread_stack(p, args);
+   if (IS_ERR_VALUE(ssp))
+   return PTR_ERR((void *)ssp);
+
*childregs = *(current_pt_regs());
/* Turn off status.VS */
riscv_v_vstate_off(childregs);
if (usp) /* User fork */
childregs->sp = usp;
+   /* if needed, set new ssp */
+   ssp ? set_active_shstk(p, ssp) : 0;
if (clone_flags & CLONE_SETTLS)
childregs->tp = tls;
childregs->a0 = 0; /* Return value of fork() */
diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c
index ce002eabbdc1..7a7f0b57b2d4 100644
--- a/arch/riscv/kernel/usercfi.c
+++ b/arch/riscv/kernel/usercfi.c
@@ -19,6 +19,41 @@
 
 #define SHSTK_ENTRY_SIZE sizeof(void *)
 
+bool is_shstk_enabled(struct task_struct *task)
+{
+   return task->thread_info.user_cfi_state.ubcfi_en ? true : false;
+}
+
+void set_shstk_base(struct task_struct *task, unsigned long shstk_addr, unsigned long size)
+{
+   task->thread_info.user_cfi_state.shdw_stk_base = shstk_addr;
+   task->thread_info.user_cfi_state.shdw_stk_size = size;
+}
+
+unsigned long get_shstk_base(struct task_struct *task, unsigned long *size)
+{
+   if (size)
+  

[PATCH 21/33] riscv/traps: Introduce software check exception

2024-10-01 Thread Deepak Gupta
zicfiss / zicfilp introduces a new exception to priv isa `software check
exception` with cause code = 18. This patch implements software check
exception.

Additionally it implements a cfi violation handler which checks for code
in xtval. If xtval=2, it means that sw check exception happened because of
an indirect branch not landing on 4 byte aligned PC or not landing on
`lpad` instruction or label value embedded in `lpad` not matching label
value setup in `x7`. If xtval=3, it means that sw check exception happened
because of mismatch between link register (x1 or x5) and top of shadow
stack (on execution of `sspopchk`).

In case of cfi violation, SIGSEGV is raised with code=SEGV_CPERR.
SEGV_CPERR was introduced by x86 shadow stack patches.
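
From userspace a violation is then observable as a normal SIGSEGV with the
new si_code; e.g. (a sketch, with a fallback define for older headers):

  #include <signal.h>
  #include <stdio.h>
  #include <unistd.h>

  #ifndef SEGV_CPERR
  #define SEGV_CPERR 10
  #endif

  static void handler(int sig, siginfo_t *info, void *ucontext)
  {
          if (info->si_code == SEGV_CPERR)
                  fprintf(stderr, "cfi violation at %p\n", info->si_addr);
          _exit(1);
  }

  int main(void)
  {
          struct sigaction sa = {
                  .sa_sigaction = handler,
                  .sa_flags = SA_SIGINFO,
          };

          sigaction(SIGSEGV, &sa, NULL);
          /* ... run code that violates lpad / sspopchk ... */
          return 0;
  }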

Signed-off-by: Deepak Gupta 
---
 arch/riscv/include/asm/asm-prototypes.h |  1 +
 arch/riscv/include/asm/entry-common.h   |  2 ++
 arch/riscv/kernel/entry.S   |  3 +++
 arch/riscv/kernel/traps.c   | 42 +
 4 files changed, 48 insertions(+)

diff --git a/arch/riscv/include/asm/asm-prototypes.h 
b/arch/riscv/include/asm/asm-prototypes.h
index cd627ec289f1..5a27cefd7805 100644
--- a/arch/riscv/include/asm/asm-prototypes.h
+++ b/arch/riscv/include/asm/asm-prototypes.h
@@ -51,6 +51,7 @@ DECLARE_DO_ERROR_INFO(do_trap_ecall_u);
 DECLARE_DO_ERROR_INFO(do_trap_ecall_s);
 DECLARE_DO_ERROR_INFO(do_trap_ecall_m);
 DECLARE_DO_ERROR_INFO(do_trap_break);
+DECLARE_DO_ERROR_INFO(do_trap_software_check);
 
 asmlinkage void handle_bad_stack(struct pt_regs *regs);
 asmlinkage void do_page_fault(struct pt_regs *regs);
diff --git a/arch/riscv/include/asm/entry-common.h 
b/arch/riscv/include/asm/entry-common.h
index 2293e535f865..4068c7e5452a 100644
--- a/arch/riscv/include/asm/entry-common.h
+++ b/arch/riscv/include/asm/entry-common.h
@@ -39,4 +39,6 @@ static inline int handle_misaligned_store(struct pt_regs 
*regs)
 }
 #endif
 
+bool handle_user_cfi_violation(struct pt_regs *regs);
+
 #endif /* _ASM_RISCV_ENTRY_COMMON_H */
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index a1f258fd7bbc..aaef4604d841 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -471,6 +471,9 @@ SYM_DATA_START_LOCAL(excp_vect_table)
RISCV_PTR do_page_fault   /* load page fault */
RISCV_PTR do_trap_unknown
RISCV_PTR do_page_fault   /* store page fault */
+   RISCV_PTR do_trap_unknown /* cause=16 */
+   RISCV_PTR do_trap_unknown /* cause=17 */
+   RISCV_PTR do_trap_software_check /* cause=18 is sw check exception */
 SYM_DATA_END_LABEL(excp_vect_table, SYM_L_LOCAL, excp_vect_table_end)
 
 #ifndef CONFIG_MMU
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 51ebfd23e007..225b1d198ab6 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -354,6 +354,48 @@ void do_trap_ecall_u(struct pt_regs *regs)
 
 }
 
+#define CFI_TVAL_FCFI_CODE 2
+#define CFI_TVAL_BCFI_CODE 3
+/* handle cfi violations */
+bool handle_user_cfi_violation(struct pt_regs *regs)
+{
+   bool ret = false;
+   unsigned long tval = csr_read(CSR_TVAL);
+
+   if (((tval == CFI_TVAL_FCFI_CODE) && cpu_supports_indirect_br_lp_instr()) ||
+   ((tval == CFI_TVAL_BCFI_CODE) && cpu_supports_shadow_stack())) {
+   do_trap_error(regs, SIGSEGV, SEGV_CPERR, regs->epc,
+ "Oops - control flow violation");
+   ret = true;
+   }
+
+   return ret;
+}
+/*
+ * software check exception is defined with risc-v cfi spec. Software check
+ * exception is raised when:-
+ * a) An indirect branch doesn't land on 4 byte aligned PC or `lpad`
+ *    instruction or `label` value programmed in `lpad` instr doesn't
+ *    match with value setup in `x7`. reported code in `xtval` is 2.
+ * b) `sspopchk` instruction finds a mismatch between top of shadow stack (ssp)
+ *and x1/x5. reported code in `xtval` is 3.
+ */
+asmlinkage __visible __trap_section void do_trap_software_check(struct pt_regs *regs)
+{
+   if (user_mode(regs)) {
+   irqentry_enter_from_user_mode(regs);
+
+   /* not a cfi violation, then merge into flow of unknown trap handler */
+   if (!handle_user_cfi_violation(regs))
+   do_trap_unknown(regs);
+
+   irqentry_exit_to_user_mode(regs);
+   } else {
+   /* sw check exception coming from kernel is a bug in kernel */
+   die(regs, "Kernel BUG");
+   }
+}
+
 #ifdef CONFIG_MMU
 asmlinkage __visible noinstr void do_page_fault(struct pt_regs *regs)
 {

-- 
2.45.0




[PATCH 31/33] riscv: Documentation for landing pad / indirect branch tracking

2024-10-01 Thread Deepak Gupta
Adding documentation on landing pad aka indirect branch tracking on riscv
and kernel interfaces exposed so that user tasks can enable it.

Signed-off-by: Deepak Gupta 
---
 Documentation/arch/riscv/index.rst   |   1 +
 Documentation/arch/riscv/zicfilp.rst | 115 +++
 2 files changed, 116 insertions(+)

diff --git a/Documentation/arch/riscv/index.rst 
b/Documentation/arch/riscv/index.rst
index eecf347ce849..be7237b69682 100644
--- a/Documentation/arch/riscv/index.rst
+++ b/Documentation/arch/riscv/index.rst
@@ -14,6 +14,7 @@ RISC-V architecture
 uabi
 vector
 cmodx
+zicfilp
 
 features
 
diff --git a/Documentation/arch/riscv/zicfilp.rst 
b/Documentation/arch/riscv/zicfilp.rst
new file mode 100644
index ..a188d78fcde6
--- /dev/null
+++ b/Documentation/arch/riscv/zicfilp.rst
@@ -0,0 +1,115 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Author: Deepak Gupta 
+:Date:   12 January 2024
+
+
+Tracking indirect control transfers on RISC-V Linux
+
+
+This document briefly describes the interface provided to userspace by Linux
+to enable indirect branch tracking for user mode applications on RISC-V.
+
+1. Feature Overview
+
+
+Memory corruption issues usually result in crashes; however, in the hands of
+an adversary, and if used creatively, they can result in a variety of security
+issues.
+
+One of those security issues can be a code re-use attack on a program, where an
+adversary can use corrupt function pointers and chain them together to perform
+jump oriented programming (JOP) or call oriented programming (COP), thus
+compromising the control flow integrity (CFI) of the program.
+
+Function pointers live in read-write memory and thus are susceptible to
+corruption, allowing an adversary to reach any program counter (PC) in the
+address space. On RISC-V the zicfilp extension enforces a restriction on such
+indirect control transfers:
+
+- indirect control transfers must land on a landing pad instruction ``lpad``.
+  There are two exceptions to this rule:
+
+  - rs1 = x1 or rs1 = x5, i.e. a return from a function; returns are
+    protected using shadow stack (see zicfiss.rst)
+
+  - rs1 = x7. On RISC-V the compiler usually does the below to reach a
+    function which is beyond the offset possible with a J-type instruction::
+
+      auipc x7, 
+      jalr (x7)
+
+    Such forms of indirect control transfer are still immutable and don't rely
+    on memory, and thus rs1=x7 is exempted from tracking and considered
+    software guarded jumps.
+
+The ``lpad`` instruction is a pseudo of ``auipc rd, `` with ``rd=x0`` and
+is a HINT nop. The ``lpad`` instruction must be aligned on a 4 byte boundary
+and compares the 20 bit immediate with x7. If ``imm_20bit`` == 0, the CPU
+doesn't perform any comparison with ``x7``. If ``imm_20bit`` != 0, then
+``imm_20bit`` must match ``x7``, else the CPU will raise a ``software check
+exception`` (``cause=18``) with ``*tval = 2``.
+
+The compiler can generate a hash over function signatures and set it up
+(truncated to 20bit) in x7 at callsites, and function prologues can have
+``lpad`` with the same function hash. This further reduces the number of
+program counters a call site can reach.
+
+2. ELF and psABI
+-
+
+Toolchain sets up :c:macro:`GNU_PROPERTY_RISCV_FEATURE_1_FCFI` for property
+:c:macro:`GNU_PROPERTY_RISCV_FEATURE_1_AND` in notes section of the object 
file.
+
+3. Linux enabling
+--
+
+User space programs can have multiple shared objects loaded in their address
+space and it's a difficult task to make sure all the dependencies have been
+compiled with support for indirect branch tracking. Thus it's left to the
+dynamic loader to enable indirect branch tracking for the program.
+
+4. prctl() enabling
+
+
+:c:macro:`PR_SET_INDIR_BR_LP_STATUS` / :c:macro:`PR_GET_INDIR_BR_LP_STATUS` /
+:c:macro:`PR_LOCK_INDIR_BR_LP_STATUS` are three prctls added to manage indirect
+branch tracking. The prctls are arch agnostic and return -EINVAL on other arches.
+
+* prctl(PR_SET_INDIR_BR_LP_STATUS, unsigned long arg)
+
+If arg1 is :c:macro:`PR_INDIR_BR_LP_ENABLE` and if the CPU supports ``zicfilp``
+then the kernel will enable indirect branch tracking for the task. The dynamic
+loader can issue this :c:macro:`prctl` once it has determined that all the
+objects loaded in the address space support indirect branch tracking.
+Additionally, if there is a `dlopen` to an object which wasn't compiled with
+``zicfilp``, the dynamic loader can issue this prctl with arg1 set to 0 (i.e.
+:c:macro:`PR_INDIR_BR_LP_ENABLE` being clear).
+
+* prctl(PR_GET_INDIR_BR_LP_STATUS, unsigned long arg)
+
+Returns the current status of indirect branch tracking. If enabled, it'll
+return :c:macro:`PR_INDIR_BR_LP_ENABLE`.
+
+* prctl(PR_LOCK_INDIR_BR_LP_STATUS, unsigned long arg)
+
+Locks current status of indirect branch tracking on the task. User space may
+want to run with strict security posture and

[PATCH 28/33] riscv: enable kernel access to shadow stack memory via FWFT sbi call

2024-10-01 Thread Deepak Gupta
The kernel will have to perform shadow stack operations on the user shadow
stack. For example, during signal delivery and sigreturn, a shadow stack
token must be created and validated, respectively. Thus shadow stack access
for the kernel must be enabled.

In future, when kernel shadow stacks are enabled for the linux kernel, this
must be enabled as early as possible for better coverage and to prevent
imbalance between the regular stack and the shadow stack. After
`relocate_enable_mmu` has been done, this is as early as it can be enabled.

Signed-off-by: Deepak Gupta 
---
 arch/riscv/kernel/asm-offsets.c |  4 
 arch/riscv/kernel/head.S| 12 
 2 files changed, 16 insertions(+)

diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index 766bd33f10cb..a22ab8a41672 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -517,4 +517,8 @@ void asm_offsets(void)
DEFINE(FREGS_A6,offsetof(struct ftrace_regs, a6));
DEFINE(FREGS_A7,offsetof(struct ftrace_regs, a7));
 #endif
+   DEFINE(SBI_EXT_FWFT, SBI_EXT_FWFT);
+   DEFINE(SBI_EXT_FWFT_SET, SBI_EXT_FWFT_SET);
+   DEFINE(SBI_FWFT_SHADOW_STACK, SBI_FWFT_SHADOW_STACK);
+   DEFINE(SBI_FWFT_SET_FLAG_LOCK, SBI_FWFT_SET_FLAG_LOCK);
 }
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index 356d5397b2a2..6244408ca917 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -164,6 +164,12 @@ secondary_start_sbi:
call relocate_enable_mmu
 #endif
call .Lsetup_trap_vector
+   li a7, SBI_EXT_FWFT
+   li a6, SBI_EXT_FWFT_SET
+   li a0, SBI_FWFT_SHADOW_STACK
+   li a1, 1 /* enable supervisor to access shadow stack access */
+   li a2, SBI_FWFT_SET_FLAG_LOCK
+   ecall
scs_load_current
call smp_callin
 #endif /* CONFIG_SMP */
@@ -320,6 +326,12 @@ SYM_CODE_START(_start_kernel)
la tp, init_task
la sp, init_thread_union + THREAD_SIZE
addi sp, sp, -PT_SIZE_ON_STACK
+   li a7, SBI_EXT_FWFT
+   li a6, SBI_EXT_FWFT_SET
+   li a0, SBI_FWFT_SHADOW_STACK
+   li a1, 1 /* enable supervisor to access shadow stack access */
+   li a2, SBI_FWFT_SET_FLAG_LOCK
+   ecall
scs_load_current
 
 #ifdef CONFIG_KASAN

-- 
2.45.0




[PATCH 02/33] mm: helper `is_shadow_stack_vma` to check shadow stack vma

2024-10-01 Thread Deepak Gupta
VM_SHADOW_STACK (an alias of VM_HIGH_ARCH_5) is used to encode a shadow
stack VMA on three architectures (x86 shadow stack, arm GCS and RISC-V
shadow stack). In case an architecture doesn't implement shadow stacks,
it's VM_NONE. Introduce a helper `is_shadow_stack_vma` to determine whether
a VMA is a shadow stack VMA or not.

Signed-off-by: Deepak Gupta 
---
 mm/gup.c |  2 +-
 mm/vma.h | 10 +++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index a82890b46a36..8e6e14179f6c 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1282,7 +1282,7 @@ static int check_vma_flags(struct vm_area_struct *vma, 
unsigned long gup_flags)
!writable_file_mapping_allowed(vma, gup_flags))
return -EFAULT;
 
-   if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
+   if (!(vm_flags & VM_WRITE) || is_shadow_stack_vma(vm_flags)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
/* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
diff --git a/mm/vma.h b/mm/vma.h
index 819f994cf727..0f238dc37231 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -357,7 +357,7 @@ static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
 }
 
 /*
- * These three helpers classifies VMAs for virtual memory accounting.
+ * These four helpers classify VMAs for virtual memory accounting.
  */
 
 /*
@@ -368,6 +368,11 @@ static inline bool is_exec_mapping(vm_flags_t flags)
return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
 }
 
+static inline bool is_shadow_stack_vma(vm_flags_t vm_flags)
+{
+   return !!(vm_flags & VM_SHADOW_STACK);
+}
+
 /*
  * Stack area (including shadow stacks)
  *
@@ -376,7 +381,7 @@ static inline bool is_exec_mapping(vm_flags_t flags)
  */
 static inline bool is_stack_mapping(vm_flags_t flags)
 {
-   return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK);
+   return ((flags & VM_STACK) == VM_STACK) || is_shadow_stack_vma(flags);
 }
 
 /*
@@ -387,7 +392,6 @@ static inline bool is_data_mapping(vm_flags_t flags)
return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
 }
 
-
 static inline void vma_iter_config(struct vma_iterator *vmi,
unsigned long index, unsigned long last)
 {

-- 
2.45.0




[PATCH 01/33] mm: Introduce ARCH_HAS_USER_SHADOW_STACK

2024-10-01 Thread Deepak Gupta
From: Mark Brown 

Since multiple architectures have support for shadow stacks and we need to
select support for this feature in several places in the generic code
provide a generic config option that the architectures can select.

Suggested-by: David Hildenbrand 
Acked-by: David Hildenbrand 
Signed-off-by: Mark Brown 
Reviewed-by: Rick Edgecombe 
Reviewed-by: Deepak Gupta 
Reviewed-by: Carlos Bilbao 
---
 arch/x86/Kconfig   | 1 +
 fs/proc/task_mmu.c | 2 +-
 include/linux/mm.h | 2 +-
 mm/Kconfig | 6 ++
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2852fcd82cbd..8ccae77d40f7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1954,6 +1954,7 @@ config X86_USER_SHADOW_STACK
depends on AS_WRUSS
depends on X86_64
select ARCH_USES_HIGH_VMA_FLAGS
+   select ARCH_HAS_USER_SHADOW_STACK
select X86_CET
help
  Shadow stack protection is a hardware feature that detects function
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 72f14fd59c2d..23f875e78eae 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -971,7 +971,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
[ilog2(VM_UFFD_MINOR)]  = "ui",
 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
-#ifdef CONFIG_X86_USER_SHADOW_STACK
+#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
[ilog2(VM_SHADOW_STACK)] = "ss",
 #endif
 #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecf63d2b0582..57533b9cae95 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -354,7 +354,7 @@ extern unsigned int kobjsize(const void *objp);
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
 
-#ifdef CONFIG_X86_USER_SHADOW_STACK
+#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
 /*
  * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of
  * support core mm.
diff --git a/mm/Kconfig b/mm/Kconfig
index 4c9f5ea13271..4b2a1ef9a161 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1296,6 +1296,12 @@ config NUMA_EMU
  into virtual nodes when booted with "numa=fake=N", where N is the
  number of nodes. This is only useful for debugging.
 
+config ARCH_HAS_USER_SHADOW_STACK
+   bool
+   help
+ The architecture has hardware support for userspace shadow call
+  stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss).
+
 source "mm/damon/Kconfig"
 
 endmenu

-- 
2.45.0




[PATCH 00/33] riscv control-flow integrity for usermode

2024-10-01 Thread Deepak Gupta
v5 for cpu assisted riscv user mode control flow integrity.
zicfiss and zicfilp [1] are ratified riscv CPU extensions.

Changes in this version are
- rebased on v6.12-rc1
- Fixed schema related issues in device tree file
- Fixed some of the documentation related issues in zicfilp/ss.rst
  (style issues and added index)
- added `SHADOW_STACK_SET_MARKER` so that implementation can define base
  of shadow stack.
- Fixed warnings on definitions added in usercfi.h when
  CONFIG_RISCV_USER_CFI is not selected.
- Adopted context header based signal handling as proposed by Andy Chiu
- Added support for enabling kernel mode access to shadow stack using
  FWFT [4]

v4 [3] and v3 [2] are earlier versions of patch series.

To get more information on kernel interactions with respect to
zicfilp and zicfiss, the patch series adds documentation for
`zicfilp` and `zicfiss`:

Documentation/arch/riscv/zicfiss.rst
Documentation/arch/riscv/zicfilp.rst

How to test this series
===

Toolchain
-
$ git clone g...@github.com:sifive/riscv-gnu-toolchain.git -b cfi-dev
$ riscv-gnu-toolchain/configure --prefix= \
    --with-arch=rv64gc_zicfilp_zicfiss --enable-linux --disable-gdb \
    --with-extra-multilib-test="rv64gc_zicfilp_zicfiss-lp64d:-static"
$ make -j$(nproc)

Qemu

$ git clone g...@github.com:deepak0414/qemu.git -b zicfilp_zicfiss_ratified_master_july11
$ cd qemu
$ mkdir build
$ cd build
$ ../configure --target-list=riscv64-softmmu
$ make -j$(nproc)

Opensbi
---
$ git clone g...@github.com:deepak0414/opensbi.git -b v6_cfi_spec_split_opensbi
$ make CROSS_COMPILE= -j$(nproc) PLATFORM=generic

Linux
-
Running defconfig is fine. CFI is enabled by default if the toolchain
supports it.

$ make ARCH=riscv CROSS_COMPILE=/build/bin/riscv64-unknown-linux-gnu- -j$(nproc) defconfig
$ make ARCH=riscv CROSS_COMPILE=/build/bin/riscv64-unknown-linux-gnu- -j$(nproc)

Running
---

Modify your qemu command to have:
-bios /build/platform/generic/firmware/fw_dynamic.bin
-cpu rv64,zicfilp=true,zicfiss=true,zimop=true,zcmop=true

vDSO related opens (in flux)
============================

I am listing these opens to lay out the plan and what to expect in future
patch sets, and of course for the sake of discussion.

Shadow stack and landing pad enabling in vDSO
--
vDSO must have shadow stack and landing pad support compiled in for a task
to have shadow stack and landing pad support. This patch series doesn't
enable that (yet). Enabling shadow stack support in the vDSO should be
straightforward (I intend to do that in the next versions of the patch set).
Enabling landing pad support in the vDSO requires some collaboration with
toolchain folks to follow a single label scheme for all object binaries. This
is necessary to ensure that all indirect call-sites set the correct label and
target landing pads are decorated with the same label scheme.

How many vDSOs
---
Shadow stack instructions are carved out of Zimop (may-be operations), and if
the CPU doesn't implement Zimop, they're illegal instructions. The kernel
could be running on a CPU which may or may not implement Zimop, and thus the
kernel will have to carry two different vDSOs and expose the appropriate one
depending on whether the CPU implements Zimop or not.

[1] - https://github.com/riscv/riscv-cfi
[2] - https://lore.kernel.org/lkml/20240403234054.2020347-1-de...@rivosinc.com/
[3] - https://lore.kernel.org/all/20240912231650.3740732-1-de...@rivosinc.com/
[4] - https://github.com/riscv-non-isa/riscv-sbi-doc/blob/master/src/ext-firmware-features.adoc

---
changelog
-

v4
--
- rebased on 6.11-rc6
- envcfg: Converged with Samuel Holland's patches for envcfg management on a
per-thread basis.
- vma_is_shadow_stack is renamed to is_vma_shadow_stack
- picked up Mark Brown's `ARCH_HAS_USER_SHADOW_STACK` patch
- signal context: using extended context management to maintain compatibility.
- fixed `-Wmissing-prototypes` compiler warnings for prctl functions
- Documentation fixes and amending typos.

v3
--
envcfg:
logic to pick up base envcfg had a bug where `ENVCFG_CBZE` could have been
picked on a per-task basis, even though the CPU didn't implement it. Fixed in
this series.

dt-bindings:
As suggested, split into separate commit. fixed the messaging that spec is
in public review

arch_is_shadow_stack change:
arch_is_shadow_stack changed to vma_is_shadow_stack

hwprobe:
zicfiss / zicfilp if present will get enumerated in hwprobe

selftests:
As suggested, added object and binary filenames to .gitignore
The selftest binary anyway needs to be compiled with a cfi-enabled compiler,
which will make sure that landing pad and shadow stack are enabled. Thus
removed separate enable/disable tests. Cleaned up the tests a bit.

v2
--

- Using config `CONFIG_RISCV_USER_CFI`, kernel support for riscv control flow
integrity for user mode programs can be compiled in the kernel.

- Enabling of control flow integrity for user programs is left to user runtime

- This patch

[PATCH 03/33] riscv: Enable cbo.zero only when all harts support Zicboz

2024-10-01 Thread Deepak Gupta
From: Samuel Holland 

Currently, we enable cbo.zero for usermode on each hart that supports
the Zicboz extension. This means that the [ms]envcfg CSR value may
differ between harts. Other features, such as pointer masking and CFI,
require setting [ms]envcfg bits on a per-thread basis. The combination
of these two adds quite some complexity and overhead to context
switching, as we would need to maintain two separate masks for the
per-hart and per-thread bits. Andrew Jones, who originally added Zicboz
support, writes[1][2]:

  I've approached Zicboz the same way I would approach all
  extensions, which is to be per-hart. I'm not currently aware of
  a platform that is / will be composed of harts where some have
  Zicboz and others don't, but there's nothing stopping a platform
  like that from being built.

  So, how about we add code that confirms Zicboz is on all harts.
  If any hart does not have it, then we complain loudly and disable
  it on all the other harts. If it was just a hardware description
  bug, then it'll get fixed. If there's actually a platform which
  doesn't have Zicboz on all harts, then, when the issue is reported,
  we can decide to not support it, support it with defconfig, or
  support it under a Kconfig guard which must be enabled by the user.

Let's follow his suggested solution and require the extension to be
available on all harts, so the envcfg CSR value does not need to change
when a thread migrates between harts. Since we are doing this for all
extensions with fields in envcfg, the CSR itself only needs to be saved/
restored when it is present on all harts.

This should not be a regression as no known hardware has asymmetric
Zicboz support, but if anyone reports seeing the warning, we will
re-evaluate our solution.

Link: 
https://lore.kernel.org/linux-riscv/20240322-168f191eeb8479b2ea169a5e@orel/ [1]
Link: 
https://lore.kernel.org/linux-riscv/20240323-28943722feb57a41fb0ff488@orel/ [2]
Reviewed-by: Andrew Jones 
Reviewed-by: Conor Dooley 
Reviewed-by: Deepak Gupta 
Signed-off-by: Samuel Holland 
---
 arch/riscv/kernel/cpufeature.c | 7 ++-
 arch/riscv/kernel/suspend.c| 4 ++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index 3a8eeaa9310c..e560a253e99b 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -28,6 +28,8 @@
 
 #define NUM_ALPHA_EXTS ('z' - 'a' + 1)
 
+static bool any_cpu_has_zicboz;
+
 unsigned long elf_hwcap __read_mostly;
 
 /* Host ISA bitmap */
@@ -98,6 +100,7 @@ static int riscv_ext_zicboz_validate(const struct riscv_isa_ext_data *data,
		pr_err("Zicboz disabled as cboz-block-size present, but is not a power-of-2\n");
return -EINVAL;
}
+   any_cpu_has_zicboz = true;
return 0;
 }
 
@@ -919,8 +922,10 @@ unsigned long riscv_get_elf_hwcap(void)
 
 void riscv_user_isa_enable(void)
 {
-	if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICBOZ))
+	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICBOZ))
 		csr_set(CSR_ENVCFG, ENVCFG_CBZE);
+	else if (any_cpu_has_zicboz)
+		pr_warn_once("Zicboz disabled as it is unavailable on some harts\n");
 }
 
 #ifdef CONFIG_RISCV_ALTERNATIVE
diff --git a/arch/riscv/kernel/suspend.c b/arch/riscv/kernel/suspend.c
index c8cec0cc5833..9a8a0dc035b2 100644
--- a/arch/riscv/kernel/suspend.c
+++ b/arch/riscv/kernel/suspend.c
@@ -14,7 +14,7 @@
 
 void suspend_save_csrs(struct suspend_context *context)
 {
-	if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_XLINUXENVCFG))
+	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_XLINUXENVCFG))
context->envcfg = csr_read(CSR_ENVCFG);
context->tvec = csr_read(CSR_TVEC);
context->ie = csr_read(CSR_IE);
@@ -37,7 +37,7 @@ void suspend_save_csrs(struct suspend_context *context)
 void suspend_restore_csrs(struct suspend_context *context)
 {
csr_write(CSR_SCRATCH, 0);
-	if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_XLINUXENVCFG))
+	if (riscv_has_extension_unlikely(RISCV_ISA_EXT_XLINUXENVCFG))
csr_write(CSR_ENVCFG, context->envcfg);
csr_write(CSR_TVEC, context->tvec);
csr_write(CSR_IE, context->ie);

-- 
2.45.0




[PATCH 06/33] riscv/Kconfig: enable HAVE_EXIT_THREAD for riscv

2024-10-01 Thread Deepak Gupta
riscv will need an implementation of exit_thread to clean up the shadow stack
when a thread exits. If the current thread had shadow stack enabled, a shadow
stack is allocated by default for any new thread.
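
For context, a rough sketch of what a later patch in this series could hang
off this hook; shstk_release() is a hypothetical helper name used only for
illustration:

	void exit_thread(struct task_struct *tsk)
	{
		/* hypothetical: free this thread's shadow stack, if any */
		shstk_release(tsk);
	}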

Signed-off-by: Deepak Gupta 
Reviewed-by: Charlie Jenkins 
---
 arch/riscv/Kconfig  | 1 +
 arch/riscv/kernel/process.c | 5 +
 2 files changed, 6 insertions(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 22dc5ea4196c..808ea66b9537 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -182,6 +182,7 @@ config RISCV
select HAVE_SAMPLE_FTRACE_DIRECT_MULTI
select HAVE_STACKPROTECTOR
select HAVE_SYSCALL_TRACEPOINTS
+   select HAVE_EXIT_THREAD
select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU
select IRQ_DOMAIN
select IRQ_FORCED_THREADING
diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index e3142d8a6e28..1f2574fb2edb 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -201,6 +201,11 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
return 0;
 }
 
+void exit_thread(struct task_struct *tsk)
+{
+
+}
+
 int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 {
unsigned long clone_flags = args->flags;

-- 
2.45.0




[PATCH 07/33] riscv: zicfilp / zicfiss in dt-bindings (extensions.yaml)

2024-10-01 Thread Deepak Gupta
Make an entry for cfi extensions in extensions.yaml.

Signed-off-by: Deepak Gupta 
---
 Documentation/devicetree/bindings/riscv/extensions.yaml | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/Documentation/devicetree/bindings/riscv/extensions.yaml 
b/Documentation/devicetree/bindings/riscv/extensions.yaml
index 2cf2026cff57..356c60fd6cc8 100644
--- a/Documentation/devicetree/bindings/riscv/extensions.yaml
+++ b/Documentation/devicetree/bindings/riscv/extensions.yaml
@@ -368,6 +368,20 @@ properties:
 The standard Zicboz extension for cache-block zeroing as ratified
 in commit 3dd606f ("Create cmobase-v1.0.pdf") of riscv-CMOs.
 
+- const: zicfilp
+  description: |
+The standard Zicfilp extension for enforcing forward edge
+control-flow integrity as ratified in commit 3f8e450 ("merge
+pull request #227 from ved-rivos/0709") of riscv-cfi
+github repo.
+
+- const: zicfiss
+  description: |
+The standard Zicfiss extension for enforcing backward edge
+control-flow integrity as ratified in commit 3f8e450 ("merge
+pull request #227 from ved-rivos/0709") of riscv-cfi
+github repo.
+
 - const: zicntr
   description:
 The standard Zicntr extension for base counters and timers, as

-- 
2.45.0




[PATCH 17/33] prctl: arch-agnostic prctl for shadow stack

2024-10-01 Thread Deepak Gupta
From: Mark Brown 

Three architectures (x86, aarch64, riscv) have announced support for
shadow stacks with fairly similar functionality.  While x86 is using
arch_prctl() to control the functionality neither arm64 nor riscv uses
that interface so this patch adds arch-agnostic prctl() support to
get and set status of shadow stacks and lock the current configuration to
prevent further changes, with support for turning on and off individual
subfeatures so applications can limit their exposure to features that
they do not need.  The features are:

  - PR_SHADOW_STACK_ENABLE: Tracking and enforcement of shadow stacks,
including allocation of a shadow stack if one is not already
allocated.
  - PR_SHADOW_STACK_WRITE: Writes to specific addresses in the shadow
stack.
  - PR_SHADOW_STACK_PUSH: Push additional values onto the shadow stack.
  - PR_SHADOW_STACK_DISABLE: Allow to disable shadow stack.
Note once locked, disable must fail.

These features are expected to be inherited by new threads and cleared
on exec(), unknown features should be rejected for enable but accepted
for locking (in order to allow for future proofing).

This is based on a patch originally written by Deepak Gupta but later
modified by Mark Brown for arm's GCS patch series.

Signed-off-by: Mark Brown 
Co-developed-by: Deepak Gupta 
---
 include/linux/mm.h |  3 +++
 include/uapi/linux/prctl.h | 21 +
 kernel/sys.c   | 30 ++
 3 files changed, 54 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 57533b9cae95..54e2b3f1cc30 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4146,6 +4146,9 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
 {
	return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE);
 }
+int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status);
+int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
+int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
 
 void vma_pgtable_walk_begin(struct vm_area_struct *vma);
 void vma_pgtable_walk_end(struct vm_area_struct *vma);
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 35791791a879..b8d7b6361754 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -327,5 +327,26 @@ struct prctl_mm_map {
 # define PR_PPC_DEXCR_CTRL_SET_ONEXEC   0x8 /* Set the aspect on exec */
 # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */
 # define PR_PPC_DEXCR_CTRL_MASK 0x1f
+/*
+ * Get the current shadow stack configuration for the current thread,
+ * this will be the value configured via PR_SET_SHADOW_STACK_STATUS.
+ */
+#define PR_GET_SHADOW_STACK_STATUS  74
+
+/*
+ * Set the current shadow stack configuration.  Enabling the shadow
+ * stack will cause a shadow stack to be allocated for the thread.
+ */
+#define PR_SET_SHADOW_STACK_STATUS  75
+# define PR_SHADOW_STACK_ENABLE (1UL << 0)
+# define PR_SHADOW_STACK_WRITE (1UL << 1)
+# define PR_SHADOW_STACK_PUSH  (1UL << 2)
+
+/*
+ * Prevent further changes to the specified shadow stack
+ * configuration.  All bits may be locked via this call, including
+ * undefined bits.
+ */
+#define PR_LOCK_SHADOW_STACK_STATUS  76
 
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 4da31f28fda8..3d38a9c7c5c9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2324,6 +2324,21 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
	return -EINVAL;
 }
 
+int __weak arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status)
+{
+	return -EINVAL;
+}
+
+int __weak arch_set_shadow_stack_status(struct task_struct *t, unsigned long status)
+{
+	return -EINVAL;
+}
+
+int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status)
+{
+	return -EINVAL;
+}
+
 #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
 
 #ifdef CONFIG_ANON_VMA_NAME
@@ -2784,6 +2799,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
	case PR_RISCV_SET_ICACHE_FLUSH_CTX:
		error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
		break;
+	case PR_GET_SHADOW_STACK_STATUS:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = arch_get_shadow_stack_status(me, (unsigned long __user *) arg2);
+		break;
+   case PR_SET_SHADOW_STACK_STATUS:
+   if (arg3 || arg4 || arg5)
+   return -EINVAL;
+   error = arch_set_shadow_stack_status(me, arg2);
+   break;
+   case PR_LOCK_SHADOW_STACK_STATUS:
+   if (arg3 || arg4 || arg5)
+   return -EINVAL;
+   error = arch_lock_shadow_stack_status(me, arg2);
+ 
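
For reference, a minimal userspace sketch of driving this interface, assuming
only the prctl constants defined in this patch (error handling abbreviated):

	#include <stdio.h>
	#include <sys/prctl.h>

	unsigned long status = 0;

	/* Enable shadow stacks for this thread, then lock the config. */
	if (prctl(PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0))
		perror("PR_SET_SHADOW_STACK_STATUS");
	if (!prctl(PR_GET_SHADOW_STACK_STATUS, &status, 0, 0, 0) &&
	    (status & PR_SHADOW_STACK_ENABLE))
		prctl(PR_LOCK_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE,
		      0, 0, 0);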

[PATCH 18/33] prctl: arch-agnostic prctl for indirect branch tracking

2024-10-01 Thread Deepak Gupta
Three architectures (x86, aarch64, riscv) support an indirect branch
tracking feature in a very similar fashion. At a very high level, indirect
branch tracking is a CPU feature where the CPU tracks branches which use a
memory operand to perform control transfer in a program. As part of this
tracking, on an indirect branch the CPU enters a state where it expects a
landing pad instruction at the target, and if one is not found, the CPU
raises a fault (architecture dependent).

x86 landing pad instr - `ENDBRANCH`
aarch64 landing pad instr - `BTI`
riscv landing pad instr - `lpad`

Given that three major arches have support for indirect branch tracking,
this patch makes the `prctl` for indirect branch tracking arch-agnostic.

To allow userspace to enable this feature for itself, the following prctls
are defined:
 - PR_GET_INDIR_BR_LP_STATUS: Gets the current configured status for indirect
   branch tracking.
 - PR_SET_INDIR_BR_LP_STATUS: Sets a configuration for indirect branch
   tracking.
   The following status options are allowed:
   - PR_INDIR_BR_LP_ENABLE: Enables indirect branch tracking on the user
     thread.
   - PR_INDIR_BR_LP_DISABLE: Disables indirect branch tracking on the user
     thread.
 - PR_LOCK_INDIR_BR_LP_STATUS: Locks the configured status for indirect branch
   tracking for the user thread.

Signed-off-by: Deepak Gupta 
---
 include/linux/cpu.h|  4 
 include/uapi/linux/prctl.h | 27 +++
 kernel/sys.c   | 30 ++
 3 files changed, 61 insertions(+)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index bdcec1732445..eff56aae05d7 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -203,4 +203,8 @@ static inline bool cpu_mitigations_auto_nosmt(void)
 }
 #endif
 
+int arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status);
+int arch_set_indir_br_lp_status(struct task_struct *t, unsigned long status);
+int arch_lock_indir_br_lp_status(struct task_struct *t, unsigned long status);
+
 #endif /* _LINUX_CPU_H_ */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index b8d7b6361754..41ffb53490a4 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -349,4 +349,31 @@ struct prctl_mm_map {
  */
 #define PR_LOCK_SHADOW_STACK_STATUS  76
 
+/*
+ * Get the current indirect branch tracking configuration for the current
+ * thread, this will be the value configured via PR_SET_INDIR_BR_LP_STATUS.
+ */
+#define PR_GET_INDIR_BR_LP_STATUS  77
+
+/*
+ * Set the indirect branch tracking configuration. PR_INDIR_BR_LP_ENABLE will
+ * enable the cpu feature for the user thread, to track all indirect branches
+ * and ensure they land on an arch-defined landing pad instruction.
+ * x86 - If enabled, an indirect branch must land on an `ENDBRANCH` instruction.
+ * arm64 - If enabled, an indirect branch must land on a `BTI` instruction.
+ * riscv - If enabled, an indirect branch must land on an `lpad` instruction.
+ * PR_INDIR_BR_LP_DISABLE will disable the feature for the user thread, and
+ * indirect branches will no longer be tracked by the cpu to land on an
+ * arch-defined landing pad instruction.
+ */
+#define PR_SET_INDIR_BR_LP_STATUS  78
+# define PR_INDIR_BR_LP_ENABLE(1UL << 0)
+
+/*
+ * Prevent further changes to the specified indirect branch tracking
+ * configuration.  All bits may be locked via this call, including
+ * undefined bits.
+ */
+#define PR_LOCK_INDIR_BR_LP_STATUS  79
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 3d38a9c7c5c9..dafa31485584 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2339,6 +2339,21 @@ int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long st
return -EINVAL;
 }
 
+int __weak arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status)
+{
+	return -EINVAL;
+}
+
+int __weak arch_set_indir_br_lp_status(struct task_struct *t, unsigned long status)
+{
+	return -EINVAL;
+}
+
+int __weak arch_lock_indir_br_lp_status(struct task_struct *t, unsigned long status)
+{
+	return -EINVAL;
+}
+
 #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
 
 #ifdef CONFIG_ANON_VMA_NAME
@@ -2814,6 +2829,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
			return -EINVAL;
		error = arch_lock_shadow_stack_status(me, arg2);
		break;
+	case PR_GET_INDIR_BR_LP_STATUS:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = arch_get_indir_br_lp_status(me, (unsigned long __user *) arg2);
+		break;
+   break;
+   case PR_SET_INDIR_BR_LP_STATUS:
+   if (arg3 || arg4 || arg5)
+   return -EINVAL;
+   error = arch_set_indir_br_lp_status(me, arg2);
+   break;
+   case PR_LOCK_INDIR_BR_LP_STATUS:
+   if (arg3 || arg4 || arg5)
+   return -EINVAL;
+  
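
As with the shadow stack prctls, a minimal userspace sketch, assuming only the
constants defined above:

	/* Enable landing-pad tracking for this thread, then lock it. */
	if (!prctl(PR_SET_INDIR_BR_LP_STATUS, PR_INDIR_BR_LP_ENABLE, 0, 0, 0))
		prctl(PR_LOCK_INDIR_BR_LP_STATUS, PR_INDIR_BR_LP_ENABLE,
		      0, 0, 0);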

[PATCH 11/33] riscv/mm : ensure PROT_WRITE leads to VM_READ | VM_WRITE

2024-10-01 Thread Deepak Gupta
`arch_calc_vm_prot_bits` is implemented on risc-v to return VM_READ |
VM_WRITE if PROT_WRITE is specified. Similarly `riscv_sys_mmap` is
updated to convert all incoming PROT_WRITE to (PROT_WRITE | PROT_READ).
This is to make sure that any existing apps using PROT_WRITE still work.

Earlier `protection_map[VM_WRITE]` used to pick read-write PTE encodings.
Now `protection_map[VM_WRITE]` will always pick PAGE_SHADOWSTACK PTE
encodings for shadow stack. The above changes ensure that existing apps
continue to work because underneath the kernel will pick
`protection_map[VM_WRITE|VM_READ]` PTE encodings.
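
For illustration, the user-visible effect, sketched under the assumptions
described above (not part of the patch):

	#include <sys/mman.h>

	/* After this change, a PROT_WRITE-only anonymous mapping is promoted
	 * to read+write; shadow stack memory can only be created via the
	 * map_shadow_stack() syscall. */
	void *p = mmap(NULL, 4096, PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	/* p is readable as well as writable here. */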

Signed-off-by: Deepak Gupta 
---
 arch/riscv/include/asm/mman.h| 24 
 arch/riscv/include/asm/pgtable.h |  1 +
 arch/riscv/kernel/sys_riscv.c| 10 ++
 arch/riscv/mm/init.c |  2 +-
 mm/mmap.c|  1 +
 5 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/include/asm/mman.h b/arch/riscv/include/asm/mman.h
new file mode 100644
index ..ef9fedf32546
--- /dev/null
+++ b/arch/riscv/include/asm/mman.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_MMAN_H__
+#define __ASM_MMAN_H__
+
+#include 
+#include 
+#include 
+
+static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
+   unsigned long pkey __always_unused)
+{
+   unsigned long ret = 0;
+
+   /*
+* If PROT_WRITE was specified, force it to VM_READ | VM_WRITE.
+* Only VM_WRITE means shadow stack.
+*/
+   if (prot & PROT_WRITE)
+   ret = (VM_READ | VM_WRITE);
+   return ret;
+}
+#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
+
+#endif /* ! __ASM_MMAN_H__ */
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index e79f15293492..4948a1f18ae8 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -177,6 +177,7 @@ extern struct pt_alloc_ops pt_ops __meminitdata;
 #define PAGE_READ_EXEC __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC)
 #define PAGE_WRITE_EXEC	__pgprot(_PAGE_BASE | _PAGE_READ |	\
					 _PAGE_EXEC | _PAGE_WRITE)
 _PAGE_EXEC | _PAGE_WRITE)
+#define PAGE_SHADOWSTACK   __pgprot(_PAGE_BASE | _PAGE_WRITE)
 
 #define PAGE_COPY  PAGE_READ
 #define PAGE_COPY_EXEC PAGE_READ_EXEC
diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c
index d77afe05578f..43a448bf254b 100644
--- a/arch/riscv/kernel/sys_riscv.c
+++ b/arch/riscv/kernel/sys_riscv.c
@@ -7,6 +7,7 @@
 
 #include 
 #include 
+#include 
 
 static long riscv_sys_mmap(unsigned long addr, unsigned long len,
   unsigned long prot, unsigned long flags,
@@ -16,6 +17,15 @@ static long riscv_sys_mmap(unsigned long addr, unsigned long len,
if (unlikely(offset & (~PAGE_MASK >> page_shift_offset)))
return -EINVAL;
 
+	/*
+	 * If PROT_WRITE is specified then extend that to PROT_READ.
+	 * protection_map[VM_WRITE] is now going to select shadow stack
+	 * encodings, so specifying PROT_WRITE should actually select
+	 * protection_map[VM_WRITE | VM_READ]. If a user wants to create
+	 * a shadow stack, they should use the `map_shadow_stack` syscall.
+	 */
+   if (unlikely((prot & PROT_WRITE) && !(prot & PROT_READ)))
+   prot |= PROT_READ;
+
return ksys_mmap_pgoff(addr, len, prot, flags, fd,
   offset >> (PAGE_SHIFT - page_shift_offset));
 }
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 0e8c20adcd98..964810aeb405 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -326,7 +326,7 @@ pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
 static const pgprot_t protection_map[16] = {
[VM_NONE]   = PAGE_NONE,
[VM_READ]   = PAGE_READ,
-   [VM_WRITE]  = PAGE_COPY,
+   [VM_WRITE]  = PAGE_SHADOWSTACK,
[VM_WRITE | VM_READ]= PAGE_COPY,
[VM_EXEC]   = PAGE_EXEC,
[VM_EXEC | VM_READ] = PAGE_READ_EXEC,
diff --git a/mm/mmap.c b/mm/mmap.c
index dd4b35a25aeb..b56f1e8cbfc6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -47,6 +47,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 

-- 
2.45.0




[PATCH 33/33] kselftest/riscv: kselftest for user mode cfi

2024-10-01 Thread Deepak Gupta
Adds kselftests for the RISC-V control flow integrity implementation for user
mode. There is not a lot going on in the kernel for enabling landing pads for
user mode. The cfi selftests are intended to be compiled with a zicfilp- and
zicfiss-enabled compiler. Thus the kselftest simply checks whether landing pad
and shadow stack are enabled for the binary and the process. The selftest then
registers a signal handler for SIGSEGV. Any control flow violation is reported
as SIGSEGV with si_code = SEGV_CPERR. The test will fail on receiving any
SEGV_CPERR. The shadow stack part has more changes in the kernel and thus there
are separate tests for that:
- Exercise the `map_shadow_stack` syscall (a minimal sketch follows this list)
- `fork` test to make sure COW works for shadow stack pages
- gup tests
  As of today the kernel uses FOLL_FORCE when access happens to memory via
  /proc//mem. Not breaking that for shadow stack.
- signal test. Make sure signal delivery results in token creation on the
  shadow stack and consumes (and verifies) the token on sigreturn
- shadow stack protection test. Attempts to write using regular store
  instructions on shadow stack memory must result in access faults
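
A minimal sketch of the direct syscall usage exercised by the first test,
assuming __NR_map_shadow_stack is provided by the kernel headers:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	long shstk = syscall(__NR_map_shadow_stack, 0, 4 * 4096, 0);

	if (shstk == -1)
		perror("map_shadow_stack");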

Signed-off-by: Deepak Gupta 
---
 tools/testing/selftests/riscv/Makefile |   2 +-
 tools/testing/selftests/riscv/cfi/.gitignore   |   3 +
 tools/testing/selftests/riscv/cfi/Makefile |  10 +
 tools/testing/selftests/riscv/cfi/cfi_rv_test.h|  83 +
 tools/testing/selftests/riscv/cfi/riscv_cfi_test.c |  82 +
 tools/testing/selftests/riscv/cfi/shadowstack.c| 362 +
 tools/testing/selftests/riscv/cfi/shadowstack.h|  37 +++
 7 files changed, 578 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/riscv/Makefile 
b/tools/testing/selftests/riscv/Makefile
index 7ce03d832b64..6e142fe004ab 100644
--- a/tools/testing/selftests/riscv/Makefile
+++ b/tools/testing/selftests/riscv/Makefile
@@ -5,7 +5,7 @@
 ARCH ?= $(shell uname -m 2>/dev/null || echo not)
 
 ifneq (,$(filter $(ARCH),riscv))
-RISCV_SUBTARGETS ?= hwprobe vector mm sigreturn
+RISCV_SUBTARGETS ?= hwprobe vector mm sigreturn cfi
 else
 RISCV_SUBTARGETS :=
 endif
diff --git a/tools/testing/selftests/riscv/cfi/.gitignore 
b/tools/testing/selftests/riscv/cfi/.gitignore
new file mode 100644
index ..ce7623f9da28
--- /dev/null
+++ b/tools/testing/selftests/riscv/cfi/.gitignore
@@ -0,0 +1,3 @@
+cfitests
+riscv_cfi_test
+shadowstack
\ No newline at end of file
diff --git a/tools/testing/selftests/riscv/cfi/Makefile 
b/tools/testing/selftests/riscv/cfi/Makefile
new file mode 100644
index ..b65f7ff38a32
--- /dev/null
+++ b/tools/testing/selftests/riscv/cfi/Makefile
@@ -0,0 +1,10 @@
+CFLAGS += -I$(top_srcdir)/tools/include
+
+CFLAGS += -march=rv64gc_zicfilp_zicfiss
+
+TEST_GEN_PROGS := cfitests
+
+include ../../lib.mk
+
+$(OUTPUT)/cfitests: riscv_cfi_test.c shadowstack.c
+   $(CC) -o$@ $(CFLAGS) $(LDFLAGS) $^
diff --git a/tools/testing/selftests/riscv/cfi/cfi_rv_test.h 
b/tools/testing/selftests/riscv/cfi/cfi_rv_test.h
new file mode 100644
index ..fa1cf7183672
--- /dev/null
+++ b/tools/testing/selftests/riscv/cfi/cfi_rv_test.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef SELFTEST_RISCV_CFI_H
+#define SELFTEST_RISCV_CFI_H
+#include 
+#include 
+#include "shadowstack.h"
+
+#define RISCV_CFI_SELFTEST_COUNT RISCV_SHADOW_STACK_TESTS
+
+#define CHILD_EXIT_CODE_SSWRITE10
+#define CHILD_EXIT_CODE_SIG_TEST   11
+
+#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5)			\
+({									\
+	register long _num  __asm__ ("a7") = (num);			\
+	register long _arg1 __asm__ ("a0") = (long)(arg1);		\
+	register long _arg2 __asm__ ("a1") = (long)(arg2);		\
+	register long _arg3 __asm__ ("a2") = (long)(arg3);		\
+	register long _arg4 __asm__ ("a3") = (long)(arg4);		\
+	register long _arg5 __asm__ ("a4") = (long)(arg5);		\
+									\
+	__asm__ volatile (						\
+		"ecall\n"						\
+		: "+r"(_arg1)						\
+		: "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5),	\
+		  "r"(_num)						\
+		: "memory", "cc"					\
+	);

[PATCH 32/33] riscv: Documentation for shadow stack on riscv

2024-10-01 Thread Deepak Gupta
Adding documentation on shadow stack for user mode on riscv and kernel
interfaces exposed so that user tasks can enable it.

Signed-off-by: Deepak Gupta 
---
 Documentation/arch/riscv/index.rst   |   1 +
 Documentation/arch/riscv/zicfiss.rst | 176 +++
 2 files changed, 177 insertions(+)

diff --git a/Documentation/arch/riscv/index.rst 
b/Documentation/arch/riscv/index.rst
index be7237b69682..e240eb0ceb70 100644
--- a/Documentation/arch/riscv/index.rst
+++ b/Documentation/arch/riscv/index.rst
@@ -15,6 +15,7 @@ RISC-V architecture
 vector
 cmodx
 zicfilp
+zicfiss
 
 features
 
diff --git a/Documentation/arch/riscv/zicfiss.rst 
b/Documentation/arch/riscv/zicfiss.rst
new file mode 100644
index ..5ba389f15b3f
--- /dev/null
+++ b/Documentation/arch/riscv/zicfiss.rst
@@ -0,0 +1,176 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Author: Deepak Gupta 
+:Date:   12 January 2024
+
+=========================================================
+Shadow stack to protect function returns on RISC-V Linux
+=========================================================
+
+This document briefly describes the interface provided to userspace by Linux
+to enable shadow stacks for user mode applications on RISC-V.
+
+1. Feature Overview
+-------------------
+
+Memory corruption issues usually result in crashes; however, in the hands of
+an adversary, and if used creatively, they can result in a variety of security
+issues.
+
+One of those security issues is code re-use attacks on a program, where an
+adversary can use corrupted return addresses present on the stack and chain
+them together to perform return oriented programming (ROP), thus compromising
+the control flow integrity (CFI) of the program.
+
+Return addresses live on the stack, and thus in read-write memory, and so are
+susceptible to corruption, allowing an adversary to reach any program counter
+(PC) in the address space. On RISC-V, the ``zicfiss`` extension provides an
+alternate stack termed the shadow stack, on which return addresses can be
+safely placed in the prologue of a function and retrieved in the epilogue.
+The ``zicfiss`` extension makes the following changes:
+
+- PTE encodings for shadow stack virtual memory
+  An earlier reserved encoding in first stage translation i.e.
+  PTE.R=0, PTE.W=1, PTE.X=0  becomes PTE encoding for shadow stack pages.
+
+- ``sspush x1/x5`` instruction pushes (stores) ``x1/x5`` to shadow stack.
+
+- ``sspopchk x1/x5`` instruction pops (loads) from the shadow stack and
+  compares with ``x1/x5``; if unequal, the CPU raises a ``software check
+  exception`` with ``*tval = 3``
+
+The compiler toolchain makes sure that function prologues have ``sspush x1/x5``
+to save the return address on the shadow stack in addition to the regular
+stack. Similarly, function epilogues have ``ld x5, offset(x2)`` followed by
+``sspopchk x5`` to ensure that the value popped from the regular stack matches
+the value popped from the shadow stack.
+
+2. Shadow stack protections and linux memory manager
+-----------------------------------------------------
+
+As mentioned earlier, shadow stack pages get new page table encodings and thus
+have some special properties assigned to them, as do the instructions that
+operate on them, as below:
+
+- Regular stores to shadow stack memory raise store access faults. This way
+  shadow stack memory is protected from stray inadvertent writes.
+
+- Regular loads from shadow stack memory are allowed. This allows stack trace
+  utilities or backtrace functions to read the true call stack (not tampered
+  with).
+
+- Only shadow stack instructions can generate shadow stack load or shadow stack
+  store.
+
+- A shadow stack load / shadow stack store on read-only memory raises an
+  AMO/store page fault. Thus both ``sspush x1/x5`` and ``sspopchk x1/x5`` will
+  raise an AMO/store page fault. This simplifies COW handling in the kernel:
+  during fork, the kernel can convert shadow stack pages into read-only memory
+  (as it does for regular read-write memory), and as soon as a subsequent
+  ``sspush`` or ``sspopchk`` in userspace is encountered, the kernel can
+  perform COW.
+
+- A shadow stack load / shadow stack store on read-write, read-write-execute
+  memory raises an access fault. This is a fatal condition because shadow
+  stack instructions should never operate on read-write, read-write-execute
+  memory.
+
+3. ELF and psABI
+----------------
+
+The toolchain sets up :c:macro:`GNU_PROPERTY_RISCV_FEATURE_1_BCFI` for the
+property :c:macro:`GNU_PROPERTY_RISCV_FEATURE_1_AND` in the notes section of
+the object file.
+
+4. Linux enabling
+-----------------
+
+A user space program can have multiple shared objects loaded in its address
+space, and it is a difficult task to make sure all the dependencies have been
+compiled with shadow stack support. Thus it is left to the dynamic loader to
+enable shadow stack for the program.
+
+5. prctl() enabling
+-------------------
+
+:c:macro:`PR_SET_SHADOW_STACK_STATUS` / :c:macro:`PR_GET_SHADOW_STACK_STATUS` /
+:c:macro:`PR_LOCK_SHADOW_STACK_STATUS` are three prctls added to manage shadow
+stack status for a thread.

Re: [PATCH 17/33] prctl: arch-agnostic prctl for shadow stack

2024-10-01 Thread Mark Brown
On Tue, Oct 01, 2024 at 09:06:22AM -0700, Deepak Gupta wrote:
> From: Mark Brown 

> This is based on a patch originally written by Deepak Gupta but later
> modified by Mark Brown for arm's GCS patch series.
> 
> Signed-off-by: Mark Brown 
> Co-developed-by: Deepak Gupta 
> ---

You need to add your own signoff to this when reposting, see
submitting-patches.rst.


signature.asc
Description: PGP signature


[PATCH 05/33] riscv: Call riscv_user_isa_enable() only on the boot hart

2024-10-01 Thread Deepak Gupta
From: Samuel Holland 

Now that the [ms]envcfg CSR value is maintained per thread, not per
hart, riscv_user_isa_enable() only needs to be called once during boot,
to set the value for the init task. This also allows it to be marked as
__init.

Reviewed-by: Andrew Jones 
Reviewed-by: Conor Dooley 
Reviewed-by: Deepak Gupta 
Signed-off-by: Samuel Holland 
---
 arch/riscv/include/asm/cpufeature.h | 2 +-
 arch/riscv/kernel/cpufeature.c  | 4 ++--
 arch/riscv/kernel/smpboot.c | 2 --
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/riscv/include/asm/cpufeature.h 
b/arch/riscv/include/asm/cpufeature.h
index 45f9c1171a48..ce9a995730c1 100644
--- a/arch/riscv/include/asm/cpufeature.h
+++ b/arch/riscv/include/asm/cpufeature.h
@@ -31,7 +31,7 @@ DECLARE_PER_CPU(struct riscv_cpuinfo, riscv_cpuinfo);
 /* Per-cpu ISA extensions. */
 extern struct riscv_isainfo hart_isa[NR_CPUS];
 
-void riscv_user_isa_enable(void);
+void __init riscv_user_isa_enable(void);
 
 #define _RISCV_ISA_EXT_DATA(_name, _id, _subset_exts, _subset_exts_size, _validate) {	\
	.name = #_name,								\
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index c0986291696a..7117366d80db 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -920,12 +920,12 @@ unsigned long riscv_get_elf_hwcap(void)
return hwcap;
 }
 
-void riscv_user_isa_enable(void)
+void __init riscv_user_isa_enable(void)
 {
if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICBOZ))
current->thread_info.envcfg |= ENVCFG_CBZE;
else if (any_cpu_has_zicboz)
-	pr_warn_once("Zicboz disabled as it is unavailable on some harts\n");
+   pr_warn("Zicboz disabled as it is unavailable on some harts\n");
 }
 
 #ifdef CONFIG_RISCV_ALTERNATIVE
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index 0f8f1c95ac38..e36d20205bd7 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -233,8 +233,6 @@ asmlinkage __visible void smp_callin(void)
numa_add_cpu(curr_cpuid);
set_cpu_online(curr_cpuid, true);
 
-   riscv_user_isa_enable();
-
/*
 * Remote cache and TLB flushes are ignored while the CPU is offline,
 * so flush them both right now just in case.

-- 
2.45.0




[PATCH 24/33] riscv/kernel: update __show_regs to print shadow stack register

2024-10-01 Thread Deepak Gupta
Update __show_regs to print the captured shadow stack pointer as well.
On tasks where shadow stack is disabled, it'll simply print 0.

Signed-off-by: Deepak Gupta 
Reviewed-by: Alexandre Ghiti 
---
 arch/riscv/kernel/process.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index 5207f018415c..6db0fde3701e 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -89,8 +89,8 @@ void __show_regs(struct pt_regs *regs)
regs->s8, regs->s9, regs->s10);
pr_cont(" s11: " REG_FMT " t3 : " REG_FMT " t4 : " REG_FMT "\n",
regs->s11, regs->t3, regs->t4);
-   pr_cont(" t5 : " REG_FMT " t6 : " REG_FMT "\n",
-   regs->t5, regs->t6);
+   pr_cont(" t5 : " REG_FMT " t6 : " REG_FMT " ssp : " REG_FMT "\n",
+   regs->t5, regs->t6, get_active_shstk(current));
 
pr_cont("status: " REG_FMT " badaddr: " REG_FMT " cause: " REG_FMT "\n",
regs->status, regs->badaddr, regs->cause);

-- 
2.45.0




Re: [PATCH RFC v4 0/9] tun: Introduce virtio-net hashing feature

2024-10-01 Thread Stephen Hemminger
On Tue, 1 Oct 2024 14:54:29 +0900
Akihiko Odaki  wrote:

> On 2024/09/30 0:33, Stephen Hemminger wrote:
> > On Sun, 29 Sep 2024 16:10:47 +0900
> > Akihiko Odaki  wrote:
> >   
> >> On 2024/09/29 11:07, Jason Wang wrote:  
> >>> On Fri, Sep 27, 2024 at 3:51 PM Akihiko Odaki  
> >>> wrote:  
> 
>  On 2024/09/27 13:31, Jason Wang wrote:  
> > On Fri, Sep 27, 2024 at 10:11 AM Akihiko Odaki 
> >  wrote:  
> >>
> >> On 2024/09/25 12:30, Jason Wang wrote:  
> >>> On Tue, Sep 24, 2024 at 5:01 PM Akihiko Odaki 
> >>>  wrote:  
> 
>  virtio-net have two usage of hashes: one is RSS and another is hash
>  reporting. Conventionally the hash calculation was done by the VMM.
>  However, computing the hash after the queue was chosen defeats the
>  purpose of RSS.
> 
>  Another approach is to use eBPF steering program. This approach has
>  another downside: it cannot report the calculated hash due to the
>  restrictive nature of eBPF.
> 
>  Introduce the code to compute hashes to the kernel in order to 
>  overcome
 these challenges.
> 
>  An alternative solution is to extend the eBPF steering program so 
>  that it
>  will be able to report to the userspace, but it is based on context
>  rewrites, which is in feature freeze. We can adopt kfuncs, but they 
>  will
>  not be UAPIs. We opt to ioctl to align with other relevant UAPIs (KVM
>  and vhost_net).
>  
> >>>
> >>> I wonder if we could clone the skb and reuse some to store the hash,
> >>> then the steering eBPF program can access these fields without
> >>> introducing full RSS in the kernel?  
> >>
> >> I don't get how cloning the skb can solve the issue.
> >>
> >> We can certainly implement Toeplitz function in the kernel or even with
> >> tc-bpf to store a hash value that can be used for eBPF steering program
> >> and virtio hash reporting. However we don't have a means of storing a
> >> hash type, which is specific to virtio hash reporting and lacks a
> >> corresponding skb field.  
> >
> > I may miss something but looking at sk_filter_is_valid_access(). It
> > looks to me we can make use of skb->cb[0..4]?  
> 
>  I didn't opt to using cb. Below is the rationale:
> 
>  cb is for tail call so it means we reuse the field for a different
>  purpose. The context rewrite allows adding a field without increasing
>  the size of the underlying storage (the real sk_buff) so we should add a
>  new field instead of reusing an existing field to avoid confusion.
> 
>  We are however no longer allowed to add a new field. In my
>  understanding, this is because it is an UAPI, and eBPF maintainers found
>  it is difficult to maintain its stability.
> 
>  Reusing cb for hash reporting is a workaround to avoid having a new
>  field, but it does not solve the underlying problem (i.e., keeping eBPF
>  as stable as UAPI is unreasonably hard). In my opinion, adding an ioctl
>  is a reasonable option to keep the API as stable as other virtualization
>  UAPIs while respecting the underlying intention of the context rewrite
>  feature freeze.  
> >>>
> >>> Fair enough.
> >>>
> >>> Btw, I remember DPDK implements tuntap RSS via eBPF as well (probably
> >>> via cls or other). It might worth to see if anything we miss here.  
> >>
> >> Thanks for the information. I wonder why they used cls instead of
> >> steering program. Perhaps it may be due to compatibility with macvtap
> >> and ipvtap, which don't steering program.
> >>
> >> Their RSS implementation looks cleaner so I will improve my RSS
> >> implementation accordingly.
> >>  
> > 
> > DPDK needs to support flow rules. The specific case is where packets
> > are classified by a flow, then RSS is done across a subset of the queues.
> > The support for flow in TUN driver is more academic than useful,
> > I fixed it for current BPF, but doubt anyone is using it really.
> > 
> > A full steering program would be good, but would require much more
> > complexity to take a general set of flow rules then communicate that
> > to the steering program.
> >   
> 
> It reminded me of RSS context and flow filter. Some physical NICs 
> support to use a dedicated RSS context for packets matched with flow 
> filter, and virtio is also gaining corresponding features.
> 
> RSS context: https://github.com/oasis-tcs/virtio-spec/issues/178
> Flow filter: https://github.com/oasis-tcs/virtio-spec/issues/179
> 
> I considered about the possibility of supporting these features with tc 
> instead of adding ioctls to tuntap, but it seems not appropriate for 
> virtualization use case.
> 
> In a virtualization use case, tuntap is configured according to requests 
> of guests, and the code processing these requests need to have mi

[PATCH 25/33] riscv/ptrace: riscv cfi status and state via ptrace and in core files

2024-10-01 Thread Deepak Gupta
Expose a new register type NT_RISCV_USER_CFI for risc-v cfi status and
state. Intentionally, both landing pad and shadow stack status and state
are rolled into cfi state. Creating two different NT_RISCV_USER_XXX types
would not be useful and would waste a note type. Enabling or disabling of
the feature is not allowed via the ptrace set interface. However, setting
the `elp` state or the shadow stack pointer is allowed via the ptrace set
interface. It is expected that `gdb` might need to fix up the `elp` state
or the shadow stack pointer.
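
For illustration, a debugger-side sketch of reading this regset, assuming the
note type and struct user_cfi_state added below:

	#include <elf.h>
	#include <stdio.h>
	#include <sys/ptrace.h>
	#include <sys/uio.h>

	struct user_cfi_state cfi;
	struct iovec iov = { .iov_base = &cfi, .iov_len = sizeof(cfi) };

	if (ptrace(PTRACE_GETREGSET, pid, NT_RISCV_USER_CFI, &iov) == 0)
		printf("shadow stack ptr: %#llx\n",
		       (unsigned long long)cfi.shstk_ptr);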

Signed-off-by: Deepak Gupta 
---
 arch/riscv/include/uapi/asm/ptrace.h | 18 
 arch/riscv/kernel/ptrace.c   | 83 
 include/uapi/linux/elf.h |  1 +
 3 files changed, 102 insertions(+)

diff --git a/arch/riscv/include/uapi/asm/ptrace.h 
b/arch/riscv/include/uapi/asm/ptrace.h
index 659ea3af5680..e6571fba8a8a 100644
--- a/arch/riscv/include/uapi/asm/ptrace.h
+++ b/arch/riscv/include/uapi/asm/ptrace.h
@@ -131,6 +131,24 @@ struct __sc_riscv_cfi_state {
unsigned long ss_ptr;   /* shadow stack pointer */
 };
 
+struct __cfi_status {
+   /* indirect branch tracking state */
+   __u64 lp_en : 1;
+   __u64 lp_lock : 1;
+   __u64 elp_state : 1;
+
+   /* shadow stack status */
+   __u64 shstk_en : 1;
+   __u64 shstk_lock : 1;
+
+   __u64 rsvd : (sizeof(__u64) * 8) - 5;
+};
+
+struct user_cfi_state {
+   struct __cfi_status cfi_status;
+   __u64 shstk_ptr;
+};
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _UAPI_ASM_RISCV_PTRACE_H */
diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
index 92731ff8c79a..c69b20ea6e79 100644
--- a/arch/riscv/kernel/ptrace.c
+++ b/arch/riscv/kernel/ptrace.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 enum riscv_regset {
REGSET_X,
@@ -28,6 +29,9 @@ enum riscv_regset {
 #ifdef CONFIG_RISCV_ISA_V
REGSET_V,
 #endif
+#ifdef CONFIG_RISCV_USER_CFI
+   REGSET_CFI,
+#endif
 };
 
 static int riscv_gpr_get(struct task_struct *target,
@@ -152,6 +156,75 @@ static int riscv_vr_set(struct task_struct *target,
 }
 #endif
 
+#ifdef CONFIG_RISCV_USER_CFI
+static int riscv_cfi_get(struct task_struct *target,
+   const struct user_regset *regset,
+   struct membuf to)
+{
+   struct user_cfi_state user_cfi;
+   struct pt_regs *regs;
+
+   regs = task_pt_regs(target);
+
+   user_cfi.cfi_status.lp_en = is_indir_lp_enabled(target);
+   user_cfi.cfi_status.lp_lock = is_indir_lp_locked(target);
+   user_cfi.cfi_status.elp_state = (regs->status & SR_ELP);
+
+   user_cfi.cfi_status.shstk_en = is_shstk_enabled(target);
+   user_cfi.cfi_status.shstk_lock = is_shstk_locked(target);
+   user_cfi.shstk_ptr = get_active_shstk(target);
+
+   return membuf_write(&to, &user_cfi, sizeof(user_cfi));
+}
+
+/*
+ * Does it make sense to allow enabling / disabling of cfi via ptrace?
+ * Not allowing enable / disable / locking control via ptrace for now.
+ * Setting the shadow stack pointer is allowed. GDB might use it to unwind or
+ * perform some other fixup. Similarly, gdb might want to suppress elp and may
+ * want to reset elp state.
+ */
+static int riscv_cfi_set(struct task_struct *target,
+   const struct user_regset *regset,
+   unsigned int pos, unsigned int count,
+   const void *kbuf, const void __user *ubuf)
+{
+   int ret;
+   struct user_cfi_state user_cfi;
+   struct pt_regs *regs;
+
+   regs = task_pt_regs(target);
+
+   ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_cfi, 0, -1);
+   if (ret)
+   return ret;
+
+	/*
+	 * Not allowing enabling or locking of shadow stack or landing pad.
+	 * There is no disabling of shadow stack or landing pad via ptrace.
+	 * The rsvd field must be zero so that those bits can be used in the
+	 * future.
+	 */
+	if (user_cfi.cfi_status.lp_en || user_cfi.cfi_status.lp_lock ||
+	    user_cfi.cfi_status.shstk_en || user_cfi.cfi_status.shstk_lock ||
+	    user_cfi.cfi_status.rsvd)
+		return -EINVAL;
+
+	/* If lpad is enabled on target and ptrace requests to set / clear elp, do that */
+   if (is_indir_lp_enabled(target)) {
+   if (user_cfi.cfi_status.elp_state) /* set elp state */
+   regs->status |= SR_ELP;
+   else
+   regs->status &= ~SR_ELP; /* clear elp state */
+   }
+
+   /* If shadow stack enabled on target, set new shadow stack pointer */
+   if (is_shstk_enabled(target))
+   set_active_shstk(target, user_cfi.shstk_ptr);
+
+   return 0;
+}
+#endif
+
 static const struct user_regset riscv_user_regset[] = {
[REGSET_X] = {
.core_note_type = NT_PRSTATUS,
@@ -182,6 +255,16 @@ static const struct user_regset riscv_user_regset[] = {
.set = riscv_

[PATCH 22/33] riscv: signal: abstract header saving for setup_sigcontext

2024-10-01 Thread Deepak Gupta
From: Andy Chiu 

The function save_v_state() served two purposes. First, it saved
extension context into the signal stack. Then, it constructed the
extension header if there was no fault. The second part is independent
of the extension itself. As a result, we can pull that part out, so
future extensions may reuse it. This patch adds arch_ext_list and makes
setup_sigcontext() go through all possible extensions' save() callback.
The callback returns a positive value indicating the size of the
successfully saved extension. Then the kernel proceeds to construct the
header for that extension. The kernel skips an extension if it does
not exist, or if the saving fails for some reason. The error code is
propagated out in the latter case.

This patch does not introduce any functional changes.
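
For context, a sketch of how a future extension would hook into this list; the
CFI magic value and save callback named here are hypothetical:

	struct arch_ext_priv arch_ext_list[] = {
		{
			.magic = RISCV_V_MAGIC,
			.save  = &save_v_state,
		},
		{
			.magic = RISCV_CFI_MAGIC,   /* hypothetical */
			.save  = &save_cfi_state,   /* hypothetical */
		},
	};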

Signed-off-by: Andy Chiu 
---
 arch/riscv/kernel/signal.c | 60 ++
 1 file changed, 39 insertions(+), 21 deletions(-)

diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c
index dcd282419456..014ac1024b85 100644
--- a/arch/riscv/kernel/signal.c
+++ b/arch/riscv/kernel/signal.c
@@ -68,18 +68,18 @@ static long save_fp_state(struct pt_regs *regs,
 #define restore_fp_state(task, regs) (0)
 #endif
 
-#ifdef CONFIG_RISCV_ISA_V
-
-static long save_v_state(struct pt_regs *regs, void __user **sc_vec)
+static long save_v_state(struct pt_regs *regs, void __user *sc_vec)
 {
-   struct __riscv_ctx_hdr __user *hdr;
struct __sc_riscv_v_state __user *state;
void __user *datap;
long err;
 
-   hdr = *sc_vec;
-   /* Place state to the user's signal context space after the hdr */
-   state = (struct __sc_riscv_v_state __user *)(hdr + 1);
+   if (!IS_ENABLED(CONFIG_RISCV_ISA_V) ||
+   !(has_vector() && riscv_v_vstate_query(regs)))
+   return 0;
+
+	/* Place state in the user's signal context space */
+   state = (struct __sc_riscv_v_state __user *)sc_vec;
/* Point datap right after the end of __sc_riscv_v_state */
datap = state + 1;
 
@@ -97,15 +97,11 @@ static long save_v_state(struct pt_regs *regs, void __user **sc_vec)
err |= __put_user((__force void *)datap, &state->v_state.datap);
/* Copy the whole vector content to user space datap. */
err |= __copy_to_user(datap, current->thread.vstate.datap, 
riscv_v_vsize);
-   /* Copy magic to the user space after saving  all vector conetext */
-   err |= __put_user(RISCV_V_MAGIC, &hdr->magic);
-   err |= __put_user(riscv_v_sc_size, &hdr->size);
if (unlikely(err))
-   return err;
+   return -EFAULT;
 
-   /* Only progress the sv_vec if everything has done successfully  */
-   *sc_vec += riscv_v_sc_size;
-   return 0;
+	/* Only return the size if everything has been done successfully */
+   return riscv_v_sc_size;
 }
 
 /*
@@ -142,10 +138,19 @@ static long __restore_v_state(struct pt_regs *regs, void __user *sc_vec)
 */
return copy_from_user(current->thread.vstate.datap, datap, 
riscv_v_vsize);
 }
-#else
-#define save_v_state(task, regs) (0)
-#define __restore_v_state(task, regs) (0)
-#endif
+
+struct arch_ext_priv {
+   __u32 magic;
+   long (*save)(struct pt_regs *regs, void __user *sc_vec);
+};
+
+struct arch_ext_priv arch_ext_list[] = {
+   {
+   .magic = RISCV_V_MAGIC,
+   .save = &save_v_state,
+   },
+};
+const size_t nr_arch_exts = ARRAY_SIZE(arch_ext_list);
 
 static long restore_sigcontext(struct pt_regs *regs,
struct sigcontext __user *sc)
@@ -276,7 +281,8 @@ static long setup_sigcontext(struct rt_sigframe __user *frame,
 {
struct sigcontext __user *sc = &frame->uc.uc_mcontext;
struct __riscv_ctx_hdr __user *sc_ext_ptr = &sc->sc_extdesc.hdr;
-   long err;
+   struct arch_ext_priv *arch_ext;
+   long err, i, ext_size;
 
/* sc_regs is structured the same as the start of pt_regs */
err = __copy_to_user(&sc->sc_regs, regs, sizeof(sc->sc_regs));
@@ -284,8 +290,20 @@ static long setup_sigcontext(struct rt_sigframe __user *frame,
if (has_fpu())
err |= save_fp_state(regs, &sc->sc_fpregs);
/* Save the vector state. */
-   if (has_vector() && riscv_v_vstate_query(regs))
-   err |= save_v_state(regs, (void __user **)&sc_ext_ptr);
+   for (i = 0; i < nr_arch_exts; i++) {
+   arch_ext = &arch_ext_list[i];
+   if (!arch_ext->save)
+   continue;
+
+   ext_size = arch_ext->save(regs, sc_ext_ptr + 1);
+   if (ext_size <= 0) {
+   err |= ext_size;
+   } else {
+   err |= __put_user(arch_ext->magic, &sc_ext_ptr->magic);
+   err |= __put_user(ext_size, &sc_ext_ptr->size);
+   sc_ext_ptr = (void *)sc_ext_ptr + ext_size;
+   }
+   }
/* Wr

[PATCH 04/33] riscv: Add support for per-thread envcfg CSR values

2024-10-01 Thread Deepak Gupta
From: Samuel Holland 

Some bits in the [ms]envcfg CSR, such as the CFI state and pointer
masking mode, need to be controlled on a per-thread basis. Support this
by keeping a copy of the CSR value in struct thread_struct and writing
it during context switches. It is safe to discard the old CSR value
during the context switch because the CSR is modified only by software,
so the CSR will remain in sync with the copy in thread_struct.

Use ALTERNATIVE directly instead of riscv_has_extension_unlikely() to
minimize branchiness in the context switching code.

Since thread_struct is copied during fork(), setting the value for the
init task sets the default value for all other threads.

Reviewed-by: Andrew Jones 
Reviewed-by: Deepak Gupta 
Signed-off-by: Samuel Holland 
---
 arch/riscv/include/asm/switch_to.h   | 8 
 arch/riscv/include/asm/thread_info.h | 1 +
 arch/riscv/kernel/cpufeature.c   | 2 +-
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/include/asm/switch_to.h 
b/arch/riscv/include/asm/switch_to.h
index 7594df37cc9f..dd4a36ff4356 100644
--- a/arch/riscv/include/asm/switch_to.h
+++ b/arch/riscv/include/asm/switch_to.h
@@ -70,6 +70,13 @@ static __always_inline bool has_fpu(void) { return false; }
 #define __switch_to_fpu(__prev, __next) do { } while (0)
 #endif
 
+static inline void __switch_to_envcfg(struct task_struct *next)
+{
+   asm volatile (ALTERNATIVE("nop", "csrw " __stringify(CSR_ENVCFG) ", %0",
+ 0, RISCV_ISA_EXT_XLINUXENVCFG, 1)
+   :: "r" (next->thread_info.envcfg) : "memory");
+}
+
 extern struct task_struct *__switch_to(struct task_struct *,
   struct task_struct *);
 
@@ -103,6 +110,7 @@ do {
\
__switch_to_vector(__prev, __next); \
if (switch_to_should_flush_icache(__next))  \
local_flush_icache_all();   \
+   __switch_to_envcfg(__next); \
((last) = __switch_to(__prev, __next)); \
 } while (0)
 
diff --git a/arch/riscv/include/asm/thread_info.h 
b/arch/riscv/include/asm/thread_info.h
index ebe52f96da34..e494871071da 100644
--- a/arch/riscv/include/asm/thread_info.h
+++ b/arch/riscv/include/asm/thread_info.h
@@ -57,6 +57,7 @@ struct thread_info {
longuser_sp;/* User stack pointer */
int cpu;
unsigned long   syscall_work;   /* SYSCALL_WORK_ flags */
+   unsigned long envcfg;
 #ifdef CONFIG_SHADOW_CALL_STACK
void*scs_base;
void*scs_sp;
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index e560a253e99b..c0986291696a 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -923,7 +923,7 @@ unsigned long riscv_get_elf_hwcap(void)
 void riscv_user_isa_enable(void)
 {
if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICBOZ))
-   csr_set(CSR_ENVCFG, ENVCFG_CBZE);
+   current->thread_info.envcfg |= ENVCFG_CBZE;
else if (any_cpu_has_zicboz)
pr_warn_once("Zicboz disabled as it is unavailable on some 
harts\n");
 }

-- 
2.45.0




Re: [PATCH 05/12] mm/memory: Add dax_insert_pfn

2024-10-01 Thread Gerald Schaefer
On Sun, 22 Sep 2024 03:41:57 +0200
Dan Williams  wrote:

> [ add s390 folks to comment on CONFIG_FS_DAX_LIMITED ]

[...]

> > @@ -2516,6 +2545,44 @@ static vm_fault_t __vm_insert_mixed(struct 
> > vm_area_struct *vma,
> > return VM_FAULT_NOPAGE;
> >  }
> >  
> > +vm_fault_t dax_insert_pfn(struct vm_fault *vmf, pfn_t pfn_t, bool write)
> > +{
> > +   struct vm_area_struct *vma = vmf->vma;
> > +   pgprot_t pgprot = vma->vm_page_prot;
> > +   unsigned long pfn = pfn_t_to_pfn(pfn_t);
> > +   struct page *page = pfn_to_page(pfn);  
> 
> The problem here is that we stubbornly have __dcssblk_direct_access() to
> worry about. That is the only dax driver that does not return
> pfn_valid() pfns.
> 
> In fact, it looks like __dcssblk_direct_access() is the only thing
> standing in the way of the removal of pfn_t.
> 
> It turns out it has been 3 years since the last time the question of
> bringing s390 fully into the ZONE_DEVICE regime was raised:
> 
> https://lore.kernel.org/all/20210820210318.187742e8@thinkpad/
> 
> Given that this series removes PTE_DEVMAP which was a stumbling block,
> would it be feasible to remove CONFIG_FS_DAX_LIMITED for a few kernel
> cycles until someone from the s390 side can circle back to add full
> ZONE_DEVICE support?

Yes, see also my reply to your "dcssblk: Mark DAX broken" patch.
Thanks Alistair for your effort, making ZONE_DEVICE usable w/o extra
PTE bit!



Re: [PATCH v5 4/5] KVM: selftests: Add test for PSCI SYSTEM_OFF2

2024-10-01 Thread Oliver Upton
On Thu, Sep 26, 2024 at 07:37:59PM +0100, David Woodhouse wrote:
> +static void guest_test_system_off2(void)
> +{
> + uint64_t ret;
> +
> + /* assert that SYSTEM_OFF2 is discoverable */
> + GUEST_ASSERT(psci_features(PSCI_1_3_FN_SYSTEM_OFF2) &
> +  BIT(PSCI_1_3_HIBERNATE_TYPE_OFF));
> + GUEST_ASSERT(psci_features(PSCI_1_3_FN64_SYSTEM_OFF2) &
> +  BIT(PSCI_1_3_HIBERNATE_TYPE_OFF));
> +

Can you also assert that the guest gets INVALID_PARAMETERS if it sets
arg1 or arg2 to a reserved value?
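
For instance, something along these lines (an illustrative sketch only,
not tested against this series; it assumes the smccc_hvc() helper and
PSCI_RET_INVALID_PARAMS definitions already used by psci_test.c, and
treats ~0 as a reserved hibernate type):

	static void guest_test_invalid_system_off2(void)
	{
		struct arm_smccc_res res;

		/* Reserved hibernate type in arg1 */
		smccc_hvc(PSCI_1_3_FN64_SYSTEM_OFF2, ~0UL, 0, 0, 0, 0, 0, 0,
			  &res);
		GUEST_ASSERT(res.a0 == PSCI_RET_INVALID_PARAMS);

		/* Valid hibernate type but non-zero (reserved) arg2 */
		smccc_hvc(PSCI_1_3_FN64_SYSTEM_OFF2,
			  PSCI_1_3_HIBERNATE_TYPE_OFF, 1, 0, 0, 0, 0, 0,
			  &res);
		GUEST_ASSERT(res.a0 == PSCI_RET_INVALID_PARAMS);
	}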

> + ret = psci_system_off2(PSCI_1_3_HIBERNATE_TYPE_OFF);
> + GUEST_SYNC(ret);
> +}
> +
> +static void host_test_system_off2(void)
> +{
> + struct kvm_vcpu *source, *target;
> + uint64_t psci_version = 0;
> + struct kvm_run *run;
> + struct kvm_vm *vm;
> +
> + vm = setup_vm(guest_test_system_off2, &source, &target);
> + vcpu_get_reg(target, KVM_REG_ARM_PSCI_VERSION, &psci_version);
> + TEST_ASSERT(psci_version >= PSCI_VERSION(0, 2),
> + "Unexpected PSCI version %lu.%lu",
> + PSCI_VERSION_MAJOR(psci_version),
> + PSCI_VERSION_MINOR(psci_version));
> +
> + if (psci_version < PSCI_VERSION(1,3))
> + goto skip;

I'm not following this. Is there a particular reason why we'd want to
skip for v1.2 and fail the test for anything less than that?

Just do TEST_REQUIRE(psci_version >= PSCI_VERSION(1, 3)), it makes the
requirements obvious in the case someone runs new selftests on an old
kernel.

-- 
Thanks,
Oliver



Re: [PATCH v5 2/5] KVM: arm64: Add PSCI v1.3 SYSTEM_OFF2 function for hibernation

2024-10-01 Thread Oliver Upton
Hi David,

On Thu, Sep 26, 2024 at 07:37:57PM +0100, David Woodhouse wrote:
> @@ -392,6 +403,32 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 
> minor)
>   break;
>   }
>   break;
> + case PSCI_1_3_FN_SYSTEM_OFF2:
> + kvm_psci_narrow_to_32bit(vcpu);
> + fallthrough;
> + case PSCI_1_3_FN64_SYSTEM_OFF2:
> + if (minor < 3)
> + break;
> +
> + arg = smccc_get_arg1(vcpu);
> + if (arg != PSCI_1_3_HIBERNATE_TYPE_OFF) {
> + val = PSCI_RET_INVALID_PARAMS;
> + break;
> + }

This is missing a check that arg2 must be zero.
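
i.e. something like this (sketch only, using the existing
smccc_get_arg2() helper):

	arg = smccc_get_arg1(vcpu);
	if (arg != PSCI_1_3_HIBERNATE_TYPE_OFF || smccc_get_arg2(vcpu)) {
		val = PSCI_RET_INVALID_PARAMS;
		break;
	}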

> + kvm_psci_system_off2(vcpu);
> + /*
> +  * We shouldn't be going back to guest VCPU after
> +  * receiving SYSTEM_OFF2 request.
> +  *
> +  * If user space accidentally/deliberately resumes
> +  * guest VCPU after SYSTEM_OFF2 request then guest
> +  * VCPU should see internal failure from PSCI return
> +  * value. To achieve this, we preload r0 (or x0) with
> +  * PSCI return value INTERNAL_FAILURE.
> +  */
> + val = PSCI_RET_INTERNAL_FAILURE;
> + ret = 0;
> + break;
>   default:
>   return kvm_psci_0_2_call(vcpu);
>   }
> -- 
> 2.44.0
>

-- 
Thanks,
Oliver



Re: [PATCH v5 3/5] KVM: arm64: Add support for PSCI v1.2 and v1.3

2024-10-01 Thread Oliver Upton
On Thu, Sep 26, 2024 at 07:37:58PM +0100, David Woodhouse wrote:
> From: David Woodhouse 

Please, add changelogs to your patches.

What we really need here is the detail on *why* we can just bump the
PSCI version like this, i.e. no new required ABI. On top of that, you
could mention that KVM has made the implementation choice to provide
SYSTEM_OFF2 unconditionally in its PSCIv1.3 implementation.

> Signed-off-by: David Woodhouse 
> ---
>  arch/arm64/kvm/hypercalls.c | 2 ++
>  arch/arm64/kvm/psci.c   | 6 +-
>  include/kvm/arm_psci.h  | 4 +++-
>  3 files changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
> index 5763d979d8ca..9c6267ca2b82 100644
> --- a/arch/arm64/kvm/hypercalls.c
> +++ b/arch/arm64/kvm/hypercalls.c
> @@ -575,6 +575,8 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const 
> struct kvm_one_reg *reg)
>   case KVM_ARM_PSCI_0_2:
>   case KVM_ARM_PSCI_1_0:
>   case KVM_ARM_PSCI_1_1:
> + case KVM_ARM_PSCI_1_2:
> + case KVM_ARM_PSCI_1_3:
>   if (!wants_02)
>   return -EINVAL;
>   vcpu->kvm->arch.psci_version = val;
> diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
> index fd0f82464f7d..5177dda5a411 100644
> --- a/arch/arm64/kvm/psci.c
> +++ b/arch/arm64/kvm/psci.c
> @@ -328,7 +328,7 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 
> minor)
>  
>   switch(psci_fn) {
>   case PSCI_0_2_FN_PSCI_VERSION:
> - val = minor == 0 ? KVM_ARM_PSCI_1_0 : KVM_ARM_PSCI_1_1;
> + val = PSCI_VERSION(1, minor);
>   break;
>   case PSCI_1_0_FN_PSCI_FEATURES:
>   arg = smccc_get_arg1(vcpu);
> @@ -486,6 +486,10 @@ int kvm_psci_call(struct kvm_vcpu *vcpu)
>   }
>  
>   switch (version) {
> + case KVM_ARM_PSCI_1_3:
> + return kvm_psci_1_x_call(vcpu, 3);
> + case KVM_ARM_PSCI_1_2:
> + return kvm_psci_1_x_call(vcpu, 2);
>   case KVM_ARM_PSCI_1_1:
>   return kvm_psci_1_x_call(vcpu, 1);
>   case KVM_ARM_PSCI_1_0:
> diff --git a/include/kvm/arm_psci.h b/include/kvm/arm_psci.h
> index e8fb624013d1..cbaec804eb83 100644
> --- a/include/kvm/arm_psci.h
> +++ b/include/kvm/arm_psci.h
> @@ -14,8 +14,10 @@
>  #define KVM_ARM_PSCI_0_2 PSCI_VERSION(0, 2)
>  #define KVM_ARM_PSCI_1_0 PSCI_VERSION(1, 0)
>  #define KVM_ARM_PSCI_1_1 PSCI_VERSION(1, 1)
> +#define KVM_ARM_PSCI_1_2 PSCI_VERSION(1, 2)
> +#define KVM_ARM_PSCI_1_3 PSCI_VERSION(1, 3)
>  
> -#define KVM_ARM_PSCI_LATEST  KVM_ARM_PSCI_1_1
> +#define KVM_ARM_PSCI_LATEST  KVM_ARM_PSCI_1_3
>  
>  static inline int kvm_psci_version(struct kvm_vcpu *vcpu)
>  {
> -- 
> 2.44.0
> 

-- 
Thanks,
Oliver



[PATCH] docs: dev-tools: Add documentation for the device focused kselftests

2024-10-01 Thread Nícolas F . R . A . Prado
Add documentation for the kselftests focused on testing devices and
point to it from the kselftest documentation. There are multiple tests
in this category so the aim of this page is to make it clear when to run
each test.

Signed-off-by: Nícolas F. R. A. Prado 
---
This patch depends on patch "kselftest: devices: Add test to detect
missing devices" [1], since this patch documents that test.

[1] 
https://lore.kernel.org/all/20240928-kselftest-dev-exist-v2-1-fab07de6b...@collabora.com
---
 Documentation/dev-tools/kselftest.rst   |  9 ++
 Documentation/dev-tools/testing-devices.rst | 47 +
 2 files changed, 56 insertions(+)

diff --git a/Documentation/dev-tools/kselftest.rst 
b/Documentation/dev-tools/kselftest.rst
index f3766e326d1e..fdb1df86783a 100644
--- a/Documentation/dev-tools/kselftest.rst
+++ b/Documentation/dev-tools/kselftest.rst
@@ -31,6 +31,15 @@ kselftest runs as a userspace process.  Tests that can be 
written/run in
 userspace may wish to use the `Test Harness`_.  Tests that need to be
 run in kernel space may wish to use a `Test Module`_.
 
+Documentation on the tests
+==========================
+
+For documentation on the kselftests themselves, see:
+
+.. toctree::
+
+   testing-devices
+
 Running the selftests (hotplug tests are run in limited mode)
 =============================================================
 
diff --git a/Documentation/dev-tools/testing-devices.rst 
b/Documentation/dev-tools/testing-devices.rst
new file mode 100644
index ..ab26adb99051
--- /dev/null
+++ b/Documentation/dev-tools/testing-devices.rst
@@ -0,0 +1,47 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. Copyright (c) 2024 Collabora Ltd
+
+=============================
+Device testing with kselftest
+=============================
+
+
+There are a few different kselftests available for testing devices generically,
+with some overlap in coverage and different requirements. This document aims to
+give an overview of each one.
+
+Note: Paths in this document are relative to the kselftest folder
+(``tools/testing/selftests``).
+
+Device oriented kselftests:
+
+* Devicetree (``dt``)
+
+  * **Coverage**: Probe status for devices described in Devicetree
+  * **Requirements**: None
+
+* Error logs (``devices/error_logs``)
+
+  * **Coverage**: Error (or more critical) log messages presence coming from 
any
+device
+  * **Requirements**: None
+
+* Discoverable bus (``devices/probe``)
+
+  * **Coverage**: Presence and probe status of USB or PCI devices that have 
been
+described in the reference file
+  * **Requirements**: Manually describe the devices that should be tested in a
+YAML reference file (see ``devices/probe/boards/google,spherion.yaml`` for
+an example)
+
+* Exist (``devices/exist``)
+
+  * **Coverage**: Presence of all devices
+  * **Requirements**: Generate the reference (see ``devices/exist/README.rst``
+for details) on a known-good kernel
+
+Therefore, the suggestion is to enable the error log and devicetree tests on 
all
+(DT-based) platforms, since they don't have any requirements. Then to greatly
+improve coverage, generate the reference for each platform and enable the exist
+test. The discoverable bus test can be used to verify the probe status of
+specific USB or PCI devices, but is probably not worth it for most cases.

---
base-commit: cea5425829f77e476b03702426f6b3701299b925
change-id: 20241001-kselftest-device-docs-6c8a411109b5

Best regards,
-- 
Nícolas F. R. A. Prado 




Re: [PATCH doc] docs: gcov: fix link to LCOV website

2024-10-01 Thread Peter Oberparleiter
On 26.09.2024 15:09, Matthieu Baerts (NGI0) wrote:
> The previous website hosted on SourceForge is no longer available since
> January 2024 according to archive.org [1].
> 
> It looks like the website has been officially moved to GitHub in June
> 2022 [2]. Best to redirect readers to the new location then.
> 
> Link: 
> https://web.archive.org/web/20240105235756/https://ltp.sourceforge.net/coverage/lcov.php
>  [1]
> Link: https://github.com/linux-test-project/lcov/commit/6da8399c7a7a [2]
> Signed-off-by: Matthieu Baerts (NGI0) 

Yes, let's update the link.

Acked-by: Peter Oberparleiter 


-- 
Peter Oberparleiter
Linux on IBM Z Development - IBM Germany R&D



Re: [PATCH v3] kernel-docs: Add new section for Rust learning materials

2024-10-01 Thread Miguel Ojeda
On Sun, Sep 22, 2024 at 6:04 PM Carlos Bilbao
 wrote:
>
> +   * Title: **Experiment: Improving the Rust Book**
> +
> +  :Author: Cognitive Engineering Lab at Brown University
> +  :URL: https://rust-book.cs.brown.edu/
> +  :Date: Accessed Sep 22 2024
> +  :Keywords: rust, blog.
> +  :Description: From the website: "The goal of this experiment is to
> +evaluate and improve the content of the Rust Book to help people
> +learn Rust more effectively.".

Perhaps this could go closer to the Rust book entry since it is a variant of it.

Or are these sorted in a particular way?

> +   * Title: **Opsem-team** (repository)
> +
> +  :Author: Operational semantics team
> +  :URL: https://github.com/rust-lang/opsem-team/tree/main

Nit: I think you can remove the `/tree/main` part of the URL to simplify.

> +  :Date: Accessed Sep 22 2024

Since these are repositories, and elsewhere you say "rolling version",
should that one have a concrete date, or should some of these ones
have a rolling date too?

One more that I remembered and that we could have here if we have
other videos/podcasts/... is Crust of Rust (and probably the book from
the same author).

In any case, I think it looks good, thanks!

Acked-by: Miguel Ojeda 

Cheers,
Miguel



Re: [PATCH 17/33] prctl: arch-agnostic prctl for shadow stack

2024-10-01 Thread Deepak Gupta

On Tue, Oct 01, 2024 at 05:15:08PM +0100, Mark Brown wrote:

On Tue, Oct 01, 2024 at 09:06:22AM -0700, Deepak Gupta wrote:

From: Mark Brown 



This is based on a patch originally written by Deepak Gupta but later
modified by Mark Brown for arm's GCS patch series.

Signed-off-by: Mark Brown 
Co-developed-by: Deepak Gupta 
---


You need to add your own signoff to this when reposting, see
submitting-patches.rst.


Thanks. Will do that.






[PATCH v13 16/40] KVM: arm64: Manage GCS access and registers for guests

2024-10-01 Thread Mark Brown
GCS introduces a number of system registers for EL1 and EL0; on systems
with GCS we need to context switch them and expose them to VMMs to allow
guests to use GCS.

In order to allow guests to use GCS we also need to configure
HCRX_EL2.GCSEn; if this is not set GCS instructions will be noops and
CHKFEAT will report GCS as disabled.  Also enable fine grained traps for
access to the GCS registers by guests which do not have the feature
enabled.

In order to allow userspace to control availability of the feature to
guests we enable writability for only ID_AA64PFR1_EL1.GCS, this is a
deliberately conservative choice to avoid errors due to oversights.
Further fields should be made writable in future.

Reviewed-by: Thiago Jung Bauermann 
Signed-off-by: Mark Brown 
---
 arch/arm64/include/asm/kvm_host.h  | 12 
 arch/arm64/include/asm/vncr_mapping.h  |  2 ++
 arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 31 ++
 arch/arm64/kvm/sys_regs.c  | 27 +-
 4 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 329619c6fa96..31887d3f3de1 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -448,6 +448,10 @@ enum vcpu_sysreg {
 
POR_EL0,/* Permission Overlay Register 0 (EL0) */
 
+   /* Guarded Control Stack registers */
+   GCSCRE0_EL1,/* Guarded Control Stack Control (EL0) */
+   GCSPR_EL0,  /* Guarded Control Stack Pointer (EL0) */
+
/* FP/SIMD/SVE */
SVCR,
FPMR,
@@ -525,6 +529,10 @@ enum vcpu_sysreg {
 
VNCR(POR_EL1),  /* Permission Overlay Register 1 (EL1) */
 
+   /* Guarded Control Stack registers */
+   VNCR(GCSPR_EL1),/* Guarded Control Stack Pointer (EL1) */
+   VNCR(GCSCR_EL1),/* Guarded Control Stack Control (EL1) */
+
VNCR(HFGRTR_EL2),
VNCR(HFGWTR_EL2),
VNCR(HFGITR_EL2),
@@ -1495,4 +1503,8 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val);
(system_supports_fpmr() &&  \
 kvm_has_feat((k), ID_AA64PFR2_EL1, FPMR, IMP))
 
+#define kvm_has_gcs(k) \
+   (system_supports_gcs() &&   \
+kvm_has_feat((k), ID_AA64PFR1_EL1, GCS, IMP))
+
 #endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/vncr_mapping.h 
b/arch/arm64/include/asm/vncr_mapping.h
index 06f8ec0906a6..e289064148b3 100644
--- a/arch/arm64/include/asm/vncr_mapping.h
+++ b/arch/arm64/include/asm/vncr_mapping.h
@@ -89,6 +89,8 @@
 #define VNCR_PMSIRR_EL1 0x840
 #define VNCR_PMSLATFR_EL1   0x848
 #define VNCR_TRFCR_EL1  0x880
+#define VNCR_GCSPR_EL1 0x8C0
+#define VNCR_GCSCR_EL1 0x8D0
 #define VNCR_MPAM1_EL1  0x900
 #define VNCR_MPAMHCR_EL20x930
 #define VNCR_MPAMVPMV_EL2   0x938
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h 
b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index 1579a3c08a36..70bd61430834 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -17,6 +17,7 @@
 #include 
 
 static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt);
+static inline bool ctxt_has_gcs(struct kvm_cpu_context *ctxt);
 
 static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
 {
@@ -31,6 +32,11 @@ static inline void __sysreg_save_user_state(struct 
kvm_cpu_context *ctxt)
 {
ctxt_sys_reg(ctxt, TPIDR_EL0)   = read_sysreg(tpidr_el0);
ctxt_sys_reg(ctxt, TPIDRRO_EL0) = read_sysreg(tpidrro_el0);
+
+   if (ctxt_has_gcs(ctxt)) {
+   ctxt_sys_reg(ctxt, GCSPR_EL0) = read_sysreg_s(SYS_GCSPR_EL0);
+   ctxt_sys_reg(ctxt, GCSCRE0_EL1) = 
read_sysreg_s(SYS_GCSCRE0_EL1);
+   }
 }
 
 static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt)
@@ -83,6 +89,17 @@ static inline bool ctxt_has_s1poe(struct kvm_cpu_context 
*ctxt)
return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64MMFR3_EL1, S1POE, 
IMP);
 }
 
+static inline bool ctxt_has_gcs(struct kvm_cpu_context *ctxt)
+{
+   struct kvm_vcpu *vcpu;
+
+   if (!cpus_have_final_cap(ARM64_HAS_GCS))
+   return false;
+
+   vcpu = ctxt_to_vcpu(ctxt);
+   return kvm_has_feat(kern_hyp_va(vcpu->kvm), ID_AA64PFR1_EL1, GCS, IMP);
+}
+
 static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
 {
ctxt_sys_reg(ctxt, SCTLR_EL1)   = read_sysreg_el1(SYS_SCTLR);
@@ -96,6 +113,10 @@ static inline void __sysreg_save_el1_state(struct 
kvm_cpu_context *ctxt)
if (ctxt_has_s1pie(ctxt)) {
ctxt_sys_reg(ctxt, PIR_EL1) = 
read_sysreg_el1(SYS_PIR);
ctxt_sys_reg(ctxt, PIRE0_EL1)   = 
read_sysreg_el1(SYS_PIRE0);
+   if (ctxt_has_gcs(ctxt)) {
+  

[PATCH v13 17/40] arm64/idreg: Add override for GCS

2024-10-01 Thread Mark Brown
Hook up an override for GCS, allowing it to be disabled from the command
line by specifying arm64.nogcs in case there are problems.

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Acked-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 Documentation/admin-guide/kernel-parameters.txt | 3 +++
 arch/arm64/kernel/pi/idreg-override.c   | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 1518343bbe22..c1b00f709734 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -446,6 +446,9 @@
arm64.nobti [ARM64] Unconditionally disable Branch Target
Identification support
 
+   arm64.nogcs [ARM64] Unconditionally disable Guarded Control Stack
+   support
+
arm64.nomops[ARM64] Unconditionally disable Memory Copy and Memory
Set instructions support
 
diff --git a/arch/arm64/kernel/pi/idreg-override.c 
b/arch/arm64/kernel/pi/idreg-override.c
index 29d4b6244a6f..2bb709d78405 100644
--- a/arch/arm64/kernel/pi/idreg-override.c
+++ b/arch/arm64/kernel/pi/idreg-override.c
@@ -133,6 +133,7 @@ static const struct ftr_set_desc pfr1 __prel64_initconst = {
.override   = &id_aa64pfr1_override,
.fields = {
FIELD("bt", ID_AA64PFR1_EL1_BT_SHIFT, NULL ),
+   FIELD("gcs", ID_AA64PFR1_EL1_GCS_SHIFT, NULL),
FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL),
FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter),
{}
@@ -215,6 +216,7 @@ static const struct {
{ "arm64.nosve","id_aa64pfr0.sve=0" },
{ "arm64.nosme","id_aa64pfr1.sme=0" },
{ "arm64.nobti","id_aa64pfr1.bt=0" },
+   { "arm64.nogcs","id_aa64pfr1.gcs=0" },
{ "arm64.nopauth",
  "id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 "
  "id_aa64isar1.api=0 id_aa64isar1.apa=0 "

-- 
2.39.2




[PATCH v13 10/40] arm64/gcs: Provide put_user_gcs()

2024-10-01 Thread Mark Brown
In order for EL1 to write to an EL0 GCS it must use the GCSSTTR instruction
rather than a normal STTR. Provide a put_user_gcs() which does this.
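
A typical caller looks something like this (an illustrative sketch,
with val and gcspr as hypothetical local variables):

	int err = 0;

	/* Attempt the GCS write; err is set to a negative errno on fault */
	put_user_gcs(val, (unsigned long __user *)gcspr, &err);
	if (err)
		return err;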

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 arch/arm64/include/asm/uaccess.h | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 0db494b24dd0..5b91803201ef 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -522,6 +522,24 @@ static inline int gcssttr(unsigned long __user *addr, 
unsigned long val)
return err;
 }
 
+static inline void put_user_gcs(unsigned long val, unsigned long __user *addr,
+   int *err)
+{
+   int ret;
+
+   if (!access_ok((char __user *)addr, sizeof(u64))) {
+   *err = -EFAULT;
+   return;
+   }
+
+   uaccess_ttbr0_enable();
+   ret = gcssttr(addr, val);
+   if (ret != 0)
+   *err = ret;
+   uaccess_ttbr0_disable();
+}
+
+
 #endif /* CONFIG_ARM64_GCS */
 
 #endif /* __ASM_UACCESS_H */

-- 
2.39.2




[PATCH v13 21/40] arm64/gcs: Context switch GCS state for EL0

2024-10-01 Thread Mark Brown
There are two registers controlling the GCS state of EL0, GCSPR_EL0 which
is the current GCS pointer and GCSCRE0_EL1 which has enable bits for the
specific GCS functionality enabled for EL0. Manage these on context switch
and process lifetime events; GCS is reset on exec().  Also ensure that
any changes to the GCS memory are visible to other PEs and that changes
from other PEs are visible on this one by issuing a GCSB DSYNC when
moving to or from a thread with GCS.

Since the current GCS configuration of a thread will be visible to
userspace we store the configuration in the format used with userspace
and provide a helper which configures the system register as needed.
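
For illustration, a minimal sketch of such a helper, assuming the
architecture's GCSCRE0_EL1 field names (RVCHKEN, PCRSEL, STREn,
PUSHMEn):

	void gcs_set_el0_mode(struct task_struct *task)
	{
		u64 gcscre0_el1 = GCSCRE0_EL1_nTR;

		if (task->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE)
			gcscre0_el1 |= GCSCRE0_EL1_RVCHKEN |
				       GCSCRE0_EL1_PCRSEL;

		if (task->thread.gcs_el0_mode & PR_SHADOW_STACK_WRITE)
			gcscre0_el1 |= GCSCRE0_EL1_STREn;

		if (task->thread.gcs_el0_mode & PR_SHADOW_STACK_PUSH)
			gcscre0_el1 |= GCSCRE0_EL1_PUSHMEn;

		write_sysreg_s(gcscre0_el1, SYS_GCSCRE0_EL1);
	}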

On systems that support GCS we always allow access to GCSPR_EL0; this
facilitates reporting of GCS faults if userspace implements disabling of
GCS on error - the GCS can still be discovered and examined even if GCS
has been disabled.

Reviewed-by: Catalin Marinas 
Reviewed-by: Thiago Jung Bauermann 
Signed-off-by: Mark Brown 
---
 arch/arm64/include/asm/gcs.h   | 24 +++
 arch/arm64/include/asm/processor.h |  6 
 arch/arm64/kernel/process.c| 62 ++
 arch/arm64/mm/Makefile |  1 +
 arch/arm64/mm/gcs.c| 42 ++
 5 files changed, 135 insertions(+)

diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h
index 7c5e95218db6..04594ef59dad 100644
--- a/arch/arm64/include/asm/gcs.h
+++ b/arch/arm64/include/asm/gcs.h
@@ -48,4 +48,28 @@ static inline u64 gcsss2(void)
return Xt;
 }
 
+#ifdef CONFIG_ARM64_GCS
+
+static inline bool task_gcs_el0_enabled(struct task_struct *task)
+{
+   return task->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE;
+}
+
+void gcs_set_el0_mode(struct task_struct *task);
+void gcs_free(struct task_struct *task);
+void gcs_preserve_current_state(void);
+
+#else
+
+static inline bool task_gcs_el0_enabled(struct task_struct *task)
+{
+   return false;
+}
+
+static inline void gcs_set_el0_mode(struct task_struct *task) { }
+static inline void gcs_free(struct task_struct *task) { }
+static inline void gcs_preserve_current_state(void) { }
+
+#endif
+
 #endif
diff --git a/arch/arm64/include/asm/processor.h 
b/arch/arm64/include/asm/processor.h
index 1438424f0064..5260788247d8 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -185,6 +185,12 @@ struct thread_struct {
u64 svcr;
u64 tpidr2_el0;
u64 por_el0;
+#ifdef CONFIG_ARM64_GCS
+   unsigned intgcs_el0_mode;
+   u64 gcspr_el0;
+   u64 gcs_base;
+   u64 gcs_size;
+#endif
 };
 
 static inline unsigned int thread_get_vl(struct thread_struct *thread,
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 0540653fbf38..aedcf332f422 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -49,6 +49,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -280,6 +281,25 @@ static void flush_poe(void)
write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0);
 }
 
+#ifdef CONFIG_ARM64_GCS
+
+static void flush_gcs(void)
+{
+   if (!system_supports_gcs())
+   return;
+
+   gcs_free(current);
+   current->thread.gcs_el0_mode = 0;
+   write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1);
+   write_sysreg_s(0, SYS_GCSPR_EL0);
+}
+
+#else
+
+static void flush_gcs(void) { }
+
+#endif
+
 void flush_thread(void)
 {
fpsimd_flush_thread();
@@ -287,6 +307,7 @@ void flush_thread(void)
flush_ptrace_hw_breakpoint(current);
flush_tagged_addr_state();
flush_poe();
+   flush_gcs();
 }
 
 void arch_release_task_struct(struct task_struct *tsk)
@@ -484,6 +505,46 @@ static void entry_task_switch(struct task_struct *next)
__this_cpu_write(__entry_task, next);
 }
 
+#ifdef CONFIG_ARM64_GCS
+
+void gcs_preserve_current_state(void)
+{
+   current->thread.gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0);
+}
+
+static void gcs_thread_switch(struct task_struct *next)
+{
+   if (!system_supports_gcs())
+   return;
+
+   /* GCSPR_EL0 is always readable */
+   gcs_preserve_current_state();
+   write_sysreg_s(next->thread.gcspr_el0, SYS_GCSPR_EL0);
+
+   if (current->thread.gcs_el0_mode != next->thread.gcs_el0_mode)
+   gcs_set_el0_mode(next);
+
+   /*
+* Ensure that GCS memory effects of the 'prev' thread are
+* ordered before other memory accesses with release semantics
+* (or preceded by a DMB) on the current PE. In addition, any
+* memory accesses with acquire semantics (or succeeded by a
+* DMB) are ordered before GCS memory effects of the 'next'
+* thread. This will ensure that the GCS memory effects are
+* visible to other PEs in case

[PATCH v13 22/40] arm64/gcs: Ensure that new threads have a GCS

2024-10-01 Thread Mark Brown
When a new thread is created by a thread with GCS enabled the GCS needs
to be specified along with the regular stack.

Unfortunately plain clone() is not extensible and existing clone3()
users will not specify a stack so all existing code would be broken if
we mandated specifying the stack explicitly.  For compatibility with
these cases, and also with x86 (which did not initially implement
clone3() support for shadow stacks), we allocate a GCS when a thread
with GCS enabled is created without one being specified.
We follow the extensively discussed x86 implementation and allocate
min(RLIMIT_STACK/2, 2G).  Since the GCS only stores the call stack and not
any variables this should be more than sufficient for most applications.

GCSs allocated via this mechanism will be freed when the thread exits.
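
The size calculation boils down to something like this (an illustrative
sketch of the policy described above, not the posted code):

	static unsigned long gcs_size(unsigned long size)
	{
		if (size)
			return PAGE_ALIGN(size);

		/* Allocate RLIMIT_STACK/2, clamped to [PAGE_SIZE, 2G] */
		size = PAGE_ALIGN(min_t(unsigned long long,
					rlimit(RLIMIT_STACK) / 2, SZ_2G));
		return max(PAGE_SIZE, size);
	}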

Reviewed-by: Thiago Jung Bauermann 
Acked-by: Yury Khrustalev 
Signed-off-by: Mark Brown 
---
 arch/arm64/include/asm/gcs.h |  9 +
 arch/arm64/include/asm/mmu_context.h |  9 +
 arch/arm64/kernel/process.c  | 32 +
 arch/arm64/mm/gcs.c  | 69 
 4 files changed, 119 insertions(+)

diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h
index 04594ef59dad..c1f274fdb9c0 100644
--- a/arch/arm64/include/asm/gcs.h
+++ b/arch/arm64/include/asm/gcs.h
@@ -8,6 +8,8 @@
 #include 
 #include 
 
+struct kernel_clone_args;
+
 static inline void gcsb_dsync(void)
 {
asm volatile(".inst 0xd503227f" : : : "memory");
@@ -58,6 +60,8 @@ static inline bool task_gcs_el0_enabled(struct task_struct 
*task)
 void gcs_set_el0_mode(struct task_struct *task);
 void gcs_free(struct task_struct *task);
 void gcs_preserve_current_state(void);
+unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
+const struct kernel_clone_args *args);
 
 #else
 
@@ -69,6 +73,11 @@ static inline bool task_gcs_el0_enabled(struct task_struct 
*task)
 static inline void gcs_set_el0_mode(struct task_struct *task) { }
 static inline void gcs_free(struct task_struct *task) { }
 static inline void gcs_preserve_current_state(void) { }
+static inline unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
+  const struct 
kernel_clone_args *args)
+{
+   return -ENOTSUPP;
+}
 
 #endif
 
diff --git a/arch/arm64/include/asm/mmu_context.h 
b/arch/arm64/include/asm/mmu_context.h
index 7c09d47e09cb..48b3d9553b67 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -311,6 +312,14 @@ static inline bool arch_vma_access_permitted(struct 
vm_area_struct *vma,
return por_el0_allows_pkey(vma_pkey(vma), write, execute);
 }
 
+#define deactivate_mm deactivate_mm
+static inline void deactivate_mm(struct task_struct *tsk,
+   struct mm_struct *mm)
+{
+   gcs_free(tsk);
+}
+
+
 #include 
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index aedcf332f422..fdd095480c3f 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -294,9 +294,35 @@ static void flush_gcs(void)
write_sysreg_s(0, SYS_GCSPR_EL0);
 }
 
+static int copy_thread_gcs(struct task_struct *p,
+  const struct kernel_clone_args *args)
+{
+   unsigned long gcs;
+
+   if (!system_supports_gcs())
+   return 0;
+
+   p->thread.gcs_base = 0;
+   p->thread.gcs_size = 0;
+
+   gcs = gcs_alloc_thread_stack(p, args);
+   if (IS_ERR_VALUE(gcs))
+   return PTR_ERR((void *)gcs);
+
+   p->thread.gcs_el0_mode = current->thread.gcs_el0_mode;
+   p->thread.gcs_el0_locked = current->thread.gcs_el0_locked;
+
+   return 0;
+}
+
 #else
 
 static void flush_gcs(void) { }
+static int copy_thread_gcs(struct task_struct *p,
+  const struct kernel_clone_args *args)
+{
+   return 0;
+}
 
 #endif
 
@@ -313,6 +339,7 @@ void flush_thread(void)
 void arch_release_task_struct(struct task_struct *tsk)
 {
fpsimd_release_task(tsk);
+   gcs_free(tsk);
 }
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
@@ -376,6 +403,7 @@ int copy_thread(struct task_struct *p, const struct 
kernel_clone_args *args)
unsigned long stack_start = args->stack;
unsigned long tls = args->tls;
struct pt_regs *childregs = task_pt_regs(p);
+   int ret;
 
memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context));
 
@@ -420,6 +448,10 @@ int copy_thread(struct task_struct *p, const struct 
kernel_clone_args *args)
p->thread.uw.tp_value = tls;
p->thread.tpidr2_el0 = 0;
}
+
+   ret = copy_thread_gcs(p, args);
+   if (ret != 0)
+

[PATCH v13 19/40] arm64/traps: Handle GCS exceptions

2024-10-01 Thread Mark Brown
A new exception code is defined for GCS specific faults other than
standard load/store faults, for example GCS token validation failures;
add handling for this. These faults are reported to userspace as
segfaults with code SEGV_CPERR (protection error), mirroring the
reporting for x86 shadow stack errors.

GCS faults due to memory load/store operations generate data aborts with
a flag set, these will be handled separately as part of the data abort
handling.

Since we do not currently enable GCS for EL1 we should not get any faults
there, but while we are at it we wire up EL1 handling too, treating any
GCS fault as fatal.
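
Roughly, the handlers inject the segfault described above for EL0 and
die for EL1 (an illustrative sketch, not the quoted hunks):

	void do_el0_gcs(struct pt_regs *regs, unsigned long esr)
	{
		force_signal_inject(SIGSEGV, SEGV_CPERR, regs->pc, 0);
	}

	void do_el1_gcs(struct pt_regs *regs, unsigned long esr)
	{
		die("Oops - GCS", regs, esr);
	}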

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 arch/arm64/include/asm/esr.h   | 28 +++-
 arch/arm64/include/asm/exception.h |  2 ++
 arch/arm64/kernel/entry-common.c   | 23 +++
 arch/arm64/kernel/traps.c  | 11 +++
 4 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index da6d2c1c0b03..d1b1a33f9a8b 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -51,7 +51,8 @@
 #define ESR_ELx_EC_FP_EXC32UL(0x28)
 /* Unallocated EC: 0x29 - 0x2B */
 #define ESR_ELx_EC_FP_EXC64UL(0x2C)
-/* Unallocated EC: 0x2D - 0x2E */
+#define ESR_ELx_EC_GCS UL(0x2D)
+/* Unallocated EC:  0x2E */
 #define ESR_ELx_EC_SERROR  UL(0x2F)
 #define ESR_ELx_EC_BREAKPT_LOW UL(0x30)
 #define ESR_ELx_EC_BREAKPT_CUR UL(0x31)
@@ -386,6 +387,31 @@
 #define ESR_ELx_MOPS_ISS_SRCREG(esr)   (((esr) & (UL(0x1f) << 5)) >> 5)
 #define ESR_ELx_MOPS_ISS_SIZEREG(esr)  (((esr) & (UL(0x1f) << 0)) >> 0)
 
+/* ISS field definitions for GCS */
+#define ESR_ELx_ExType_SHIFT   (20)
+#define ESR_ELx_ExType_MASKGENMASK(23, 20)
+#define ESR_ELx_Raddr_SHIFT(10)
+#define ESR_ELx_Raddr_MASK GENMASK(14, 10)
+#define ESR_ELx_Rn_SHIFT   (5)
+#define ESR_ELx_Rn_MASKGENMASK(9, 5)
+#define ESR_ELx_Rvalue_SHIFT   5
+#define ESR_ELx_Rvalue_MASKGENMASK(9, 5)
+#define ESR_ELx_IT_SHIFT   (0)
+#define ESR_ELx_IT_MASKGENMASK(4, 0)
+
+#define ESR_ELx_ExType_DATA_CHECK  0
+#define ESR_ELx_ExType_EXLOCK  1
+#define ESR_ELx_ExType_STR 2
+
+#define ESR_ELx_IT_RET 0
+#define ESR_ELx_IT_GCSPOPM 1
+#define ESR_ELx_IT_RET_KEYA2
+#define ESR_ELx_IT_RET_KEYB3
+#define ESR_ELx_IT_GCSSS1  4
+#define ESR_ELx_IT_GCSSS2  5
+#define ESR_ELx_IT_GCSPOPCX6
+#define ESR_ELx_IT_GCSPOPX 7
+
 #ifndef __ASSEMBLY__
 #include 
 
diff --git a/arch/arm64/include/asm/exception.h 
b/arch/arm64/include/asm/exception.h
index f296662590c7..674518464718 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -57,6 +57,8 @@ void do_el0_undef(struct pt_regs *regs, unsigned long esr);
 void do_el1_undef(struct pt_regs *regs, unsigned long esr);
 void do_el0_bti(struct pt_regs *regs);
 void do_el1_bti(struct pt_regs *regs, unsigned long esr);
+void do_el0_gcs(struct pt_regs *regs, unsigned long esr);
+void do_el1_gcs(struct pt_regs *regs, unsigned long esr);
 void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
struct pt_regs *regs);
 void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs);
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 3fcd9d080bf2..fe74813009bd 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -463,6 +463,15 @@ static void noinstr el1_bti(struct pt_regs *regs, unsigned 
long esr)
exit_to_kernel_mode(regs);
 }
 
+static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr)
+{
+   enter_from_kernel_mode(regs);
+   local_daif_inherit(regs);
+   do_el1_gcs(regs, esr);
+   local_daif_mask();
+   exit_to_kernel_mode(regs);
+}
+
 static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr)
 {
unsigned long far = read_sysreg(far_el1);
@@ -505,6 +514,9 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs 
*regs)
case ESR_ELx_EC_BTI:
el1_bti(regs, esr);
break;
+   case ESR_ELx_EC_GCS:
+   el1_gcs(regs, esr);
+   break;
case ESR_ELx_EC_BREAKPT_CUR:
case ESR_ELx_EC_SOFTSTP_CUR:
case ESR_ELx_EC_WATCHPT_CUR:
@@ -684,6 +696,14 @@ static void noinstr el0_mops(struct pt_regs *regs, 
unsigned long esr)
exit_to_user_mode(regs);
 }
 
+static void noinstr el0_gcs(struct pt_regs *regs, unsigned long esr)
+{
+   enter_from_user_mode(regs);
+   local_daif_restore(DAIF_PROCCTX);
+   do_el0_gcs(regs, esr);
+   exit_to_user_mode(regs);
+}
+
 static void noinstr el0_inv(struct pt_regs *regs, unsigned long e

[PATCH v13 27/40] arm64/ptrace: Expose GCS via ptrace and core files

2024-10-01 Thread Mark Brown
Provide a new register type NT_ARM_GCS reporting the current GCS mode
and pointer for EL0.  Due to the interactions with allocation and
deallocation of Guarded Control Stacks we do not permit any changes to
the GCS mode via ptrace; only GCSPR_EL0 may be changed.

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 arch/arm64/include/uapi/asm/ptrace.h |  8 +
 arch/arm64/kernel/ptrace.c   | 62 +++-
 include/uapi/linux/elf.h |  1 +
 3 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/uapi/asm/ptrace.h 
b/arch/arm64/include/uapi/asm/ptrace.h
index 7fa2f7036aa7..0f39ba4f3efd 100644
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -324,6 +324,14 @@ struct user_za_header {
 #define ZA_PT_SIZE(vq) \
(ZA_PT_ZA_OFFSET + ZA_PT_ZA_SIZE(vq))
 
+/* GCS state (NT_ARM_GCS) */
+
+struct user_gcs {
+   __u64 features_enabled;
+   __u64 features_locked;
+   __u64 gcspr_el0;
+};
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _UAPI__ASM_PTRACE_H */
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index b756578aeaee..6c1dcfe6d25a 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1473,6 +1474,52 @@ static int poe_set(struct task_struct *target, const 
struct
 }
 #endif
 
+#ifdef CONFIG_ARM64_GCS
+static int gcs_get(struct task_struct *target,
+  const struct user_regset *regset,
+  struct membuf to)
+{
+   struct user_gcs user_gcs;
+
+   if (!system_supports_gcs())
+   return -EINVAL;
+
+   if (target == current)
+   gcs_preserve_current_state();
+
+   user_gcs.features_enabled = target->thread.gcs_el0_mode;
+   user_gcs.features_locked = target->thread.gcs_el0_locked;
+   user_gcs.gcspr_el0 = target->thread.gcspr_el0;
+
+   return membuf_write(&to, &user_gcs, sizeof(user_gcs));
+}
+
+static int gcs_set(struct task_struct *target, const struct
+  user_regset *regset, unsigned int pos,
+  unsigned int count, const void *kbuf, const
+  void __user *ubuf)
+{
+   int ret;
+   struct user_gcs user_gcs;
+
+   if (!system_supports_gcs())
+   return -EINVAL;
+
+   ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_gcs, 0, -1);
+   if (ret)
+   return ret;
+
+   if (user_gcs.features_enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK)
+   return -EINVAL;
+
+   target->thread.gcs_el0_mode = user_gcs.features_enabled;
+   target->thread.gcs_el0_locked = user_gcs.features_locked;
+   target->thread.gcspr_el0 = user_gcs.gcspr_el0;
+
+   return 0;
+}
+#endif
+
 enum aarch64_regset {
REGSET_GPR,
REGSET_FPR,
@@ -1503,7 +1550,10 @@ enum aarch64_regset {
REGSET_TAGGED_ADDR_CTRL,
 #endif
 #ifdef CONFIG_ARM64_POE
-   REGSET_POE
+   REGSET_POE,
+#endif
+#ifdef CONFIG_ARM64_GCS
+   REGSET_GCS,
 #endif
 };
 
@@ -1674,6 +1724,16 @@ static const struct user_regset aarch64_regsets[] = {
.set = poe_set,
},
 #endif
+#ifdef CONFIG_ARM64_GCS
+   [REGSET_GCS] = {
+   .core_note_type = NT_ARM_GCS,
+   .n = sizeof(struct user_gcs) / sizeof(u64),
+   .size = sizeof(u64),
+   .align = sizeof(u64),
+   .regset_get = gcs_get,
+   .set = gcs_set,
+   },
+#endif
 };
 
 static const struct user_regset_view user_aarch64_view = {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index b9935988da5c..9adc218fb6df 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -443,6 +443,7 @@ typedef struct elf64_shdr {
 #define NT_ARM_ZT  0x40d   /* ARM SME ZT registers */
 #define NT_ARM_FPMR0x40e   /* ARM floating point mode register */
 #define NT_ARM_POE 0x40f   /* ARM POE registers */
+#define NT_ARM_GCS 0x410   /* ARM GCS state */
 #define NT_ARC_V2  0x600   /* ARCv2 accumulator/extra registers */
 #define NT_VMCOREDD0x700   /* Vmcore Device Dump Note */
 #define NT_MIPS_DSP0x800   /* MIPS DSP ASE registers */

-- 
2.39.2




[PATCH v13 26/40] arm64/signal: Expose GCS state in signal frames

2024-10-01 Thread Mark Brown
Add a context for the GCS state and include it in the signal context when
running on a system that supports GCS. We reuse the same flags that the
prctl() uses to specify which GCS features are enabled and also provide the
current GCS pointer.

We do not support enabling GCS via signal return, there is a conflict
between specifying GCSPR_EL0 and allocation of a new GCS and this is not
an ancticipated use case.  We also enforce GCS configuration locking on
signal return.

Reviewed-by: Catalin Marinas 
Reviewed-by: Thiago Jung Bauermann 
Acked-by: Yury Khrustalev 
Signed-off-by: Mark Brown 
---
 arch/arm64/include/uapi/asm/sigcontext.h |   9 +++
 arch/arm64/kernel/signal.c   | 109 +++
 2 files changed, 118 insertions(+)

diff --git a/arch/arm64/include/uapi/asm/sigcontext.h 
b/arch/arm64/include/uapi/asm/sigcontext.h
index bb7af77a30a7..d42f7a92238b 100644
--- a/arch/arm64/include/uapi/asm/sigcontext.h
+++ b/arch/arm64/include/uapi/asm/sigcontext.h
@@ -183,6 +183,15 @@ struct zt_context {
__u16 __reserved[3];
 };
 
+#define GCS_MAGIC  0x47435300
+
+struct gcs_context {
+   struct _aarch64_ctx head;
+   __u64 gcspr;
+   __u64 features_enabled;
+   __u64 reserved;
+};
+
 #endif /* !__ASSEMBLY__ */
 
 #include 
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index b5ab0e229a78..62d666278264 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -66,6 +66,7 @@ struct rt_sigframe_user_layout {
 
unsigned long fpsimd_offset;
unsigned long esr_offset;
+   unsigned long gcs_offset;
unsigned long sve_offset;
unsigned long tpidr2_offset;
unsigned long za_offset;
@@ -198,6 +199,8 @@ struct user_ctxs {
u32 fpmr_size;
struct poe_context __user *poe;
u32 poe_size;
+   struct gcs_context __user *gcs;
+   u32 gcs_size;
 };
 
 static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
@@ -643,6 +646,82 @@ extern int restore_zt_context(struct user_ctxs *user);
 
 #endif /* ! CONFIG_ARM64_SME */
 
+#ifdef CONFIG_ARM64_GCS
+
+static int preserve_gcs_context(struct gcs_context __user *ctx)
+{
+   int err = 0;
+   u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0);
+
+   /*
+* If GCS is enabled we will add a cap token to the frame,
+* include it in the GCSPR_EL0 we report to support stack
+* switching via sigreturn if GCS is enabled.  We do not allow
+* enabling via sigreturn so the token is only relevant for
+* threads with GCS enabled.
+*/
+   if (task_gcs_el0_enabled(current))
+   gcspr -= 8;
+
+   __put_user_error(GCS_MAGIC, &ctx->head.magic, err);
+   __put_user_error(sizeof(*ctx), &ctx->head.size, err);
+   __put_user_error(gcspr, &ctx->gcspr, err);
+   __put_user_error(0, &ctx->reserved, err);
+   __put_user_error(current->thread.gcs_el0_mode,
+&ctx->features_enabled, err);
+
+   return err;
+}
+
+static int restore_gcs_context(struct user_ctxs *user)
+{
+   u64 gcspr, enabled;
+   int err = 0;
+
+   if (user->gcs_size != sizeof(*user->gcs))
+   return -EINVAL;
+
+   __get_user_error(gcspr, &user->gcs->gcspr, err);
+   __get_user_error(enabled, &user->gcs->features_enabled, err);
+   if (err)
+   return err;
+
+   /* Don't allow unknown modes */
+   if (enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK)
+   return -EINVAL;
+
+   err = gcs_check_locked(current, enabled);
+   if (err != 0)
+   return err;
+
+   /* Don't allow enabling */
+   if (!task_gcs_el0_enabled(current) &&
+   (enabled & PR_SHADOW_STACK_ENABLE))
+   return -EINVAL;
+
+   /* If we are disabling disable everything */
+   if (!(enabled & PR_SHADOW_STACK_ENABLE))
+   enabled = 0;
+
+   current->thread.gcs_el0_mode = enabled;
+
+   /*
+* We let userspace set GCSPR_EL0 to anything here, we will
+* validate later in gcs_restore_signal().
+*/
+   write_sysreg_s(gcspr, SYS_GCSPR_EL0);
+
+   return 0;
+}
+
+#else /* ! CONFIG_ARM64_GCS */
+
+/* Turn any non-optimised out attempts to use these into a link error: */
+extern int preserve_gcs_context(void __user *ctx);
+extern int restore_gcs_context(struct user_ctxs *user);
+
+#endif /* ! CONFIG_ARM64_GCS */
+
 static int parse_user_sigframe(struct user_ctxs *user,
   struct rt_sigframe __user *sf)
 {
@@ -661,6 +740,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
user->zt = NULL;
user->fpmr = NULL;
user->poe = NULL;
+   user->gcs = NULL;
 
if (!IS_ALIGNED((unsigned long)base, 16))
goto invalid;
@@ -777,6 +857,17 @@ static int parse_user_sigframe(struct user_ctxs *user,
user->fpmr_size = size;
break;
 
+

[PATCH v13 28/40] arm64: Add Kconfig for Guarded Control Stack (GCS)

2024-10-01 Thread Mark Brown
Provide a Kconfig option allowing the user to select if GCS support is
built into the kernel.

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 arch/arm64/Kconfig | 21 +
 1 file changed, 21 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3e29b44d2d7b..dcb12f041c13 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2178,6 +2178,27 @@ config ARCH_PKEY_BITS
 
 endmenu # "ARMv8.9 architectural features"
 
+menu "v9.4 architectural features"
+
+config ARM64_GCS
+   bool "Enable support for Guarded Control Stack (GCS)"
+   default y
+   select ARCH_HAS_USER_SHADOW_STACK
+   select ARCH_USES_HIGH_VMA_FLAGS
+   depends on !UPROBES
+   help
+ Guarded Control Stack (GCS) provides support for a separate
+ stack with restricted access which contains only return
+ addresses.  This can be used to harden against some attacks
+ by comparing return address used by the program with what is
+ stored in the GCS, and may also be used to efficiently obtain
+ the call stack for applications such as profiling.
+
+ The feature is detected at runtime, and will remain disabled
+ if the system does not implement the feature.
+
+endmenu # "v9.4 architectural features"
+
 config ARM64_SVE
bool "ARM Scalable Vector Extension support"
default y

-- 
2.39.2




[PATCH v13 29/40] kselftest/arm64: Verify the GCS hwcap

2024-10-01 Thread Mark Brown
Add coverage of the GCS hwcap to the hwcap selftest, using a read of
GCSPR_EL0 to generate SIGILL without having to worry about enabling GCS.

Reviewed-by: Thiago Jung Bauermann 
Tested-by: Thiago Jung Bauermann 
Signed-off-by: Mark Brown 
---
 tools/testing/selftests/arm64/abi/hwcap.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/tools/testing/selftests/arm64/abi/hwcap.c 
b/tools/testing/selftests/arm64/abi/hwcap.c
index f2d6007a2b98..1f07772ae578 100644
--- a/tools/testing/selftests/arm64/abi/hwcap.c
+++ b/tools/testing/selftests/arm64/abi/hwcap.c
@@ -98,6 +98,17 @@ static void fpmr_sigill(void)
asm volatile("mrs x0, S3_3_C4_C4_2" : : : "x0");
 }
 
+static void gcs_sigill(void)
+{
+   unsigned long *gcspr;
+
+   asm volatile(
+   "mrs%0, S3_3_C2_C5_1"
+   : "=r" (gcspr)
+   :
+   : "cc");
+}
+
 static void ilrcpc_sigill(void)
 {
/* LDAPUR W0, [SP, #8] */
@@ -534,6 +545,14 @@ static const struct hwcap_data {
.sigill_fn = fpmr_sigill,
.sigill_reliable = true,
},
+   {
+   .name = "GCS",
+   .at_hwcap = AT_HWCAP,
+   .hwcap_bit = HWCAP_GCS,
+   .cpuinfo = "gcs",
+   .sigill_fn = gcs_sigill,
+   .sigill_reliable = true,
+   },
{
.name = "JSCVT",
.at_hwcap = AT_HWCAP,

-- 
2.39.2




[PATCH v13 34/40] kselftest/arm64: Add very basic GCS test program

2024-10-01 Thread Mark Brown
This test program just covers the basic GCS ABI, exercising aspects of the
ABI as standalone features without attempting to integrate things.

Reviewed-by: Thiago Jung Bauermann 
Tested-by: Thiago Jung Bauermann 
Signed-off-by: Mark Brown 
---
 tools/testing/selftests/arm64/Makefile|   2 +-
 tools/testing/selftests/arm64/gcs/.gitignore  |   1 +
 tools/testing/selftests/arm64/gcs/Makefile|  18 ++
 tools/testing/selftests/arm64/gcs/basic-gcs.c | 357 ++
 tools/testing/selftests/arm64/gcs/gcs-util.h  |  90 +++
 5 files changed, 467 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/arm64/Makefile 
b/tools/testing/selftests/arm64/Makefile
index 28b93cab8c0d..22029e60eff3 100644
--- a/tools/testing/selftests/arm64/Makefile
+++ b/tools/testing/selftests/arm64/Makefile
@@ -4,7 +4,7 @@
 ARCH ?= $(shell uname -m 2>/dev/null || echo not)
 
 ifneq (,$(filter $(ARCH),aarch64 arm64))
-ARM64_SUBTARGETS ?= tags signal pauth fp mte bti abi
+ARM64_SUBTARGETS ?= tags signal pauth fp mte bti abi gcs
 else
 ARM64_SUBTARGETS :=
 endif
diff --git a/tools/testing/selftests/arm64/gcs/.gitignore 
b/tools/testing/selftests/arm64/gcs/.gitignore
new file mode 100644
index ..0e5e695ecba5
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/.gitignore
@@ -0,0 +1 @@
+basic-gcs
diff --git a/tools/testing/selftests/arm64/gcs/Makefile 
b/tools/testing/selftests/arm64/gcs/Makefile
new file mode 100644
index ..61a30f483429
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/Makefile
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2023 ARM Limited
+#
+# In order to avoid interaction with the toolchain and dynamic linker the
+# portions of these tests that interact with the GCS are implemented using
+# nolibc.
+#
+
+TEST_GEN_PROGS := basic-gcs
+
+include ../../lib.mk
+
+$(OUTPUT)/basic-gcs: basic-gcs.c
+   $(CC) -g -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \
+   -static -include ../../../../include/nolibc/nolibc.h \
+   -I../../../../../usr/include \
+   -std=gnu99 -I../.. -g \
+   -ffreestanding -Wall $^ -o $@ -lgcc
diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c 
b/tools/testing/selftests/arm64/gcs/basic-gcs.c
new file mode 100644
index ..3fb9742342a3
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c
@@ -0,0 +1,357 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 ARM Limited.
+ */
+
+#include 
+#include 
+
+#include 
+
+#include 
+#include 
+#include 
+
+#include "kselftest.h"
+#include "gcs-util.h"
+
+/* nolibc doesn't have sysconf(), just hard code the maximum */
+static size_t page_size = 65536;
+
+static  __attribute__((noinline)) void valid_gcs_function(void)
+{
+   /* Do something the compiler can't optimise out */
+   my_syscall1(__NR_prctl, PR_SVE_GET_VL);
+}
+
+static inline int gcs_set_status(unsigned long mode)
+{
+   bool enabling = mode & PR_SHADOW_STACK_ENABLE;
+   int ret;
+   unsigned long new_mode;
+
+   /*
+* The prctl takes 1 argument but we need to ensure that the
+* other 3 values passed in registers to the syscall are zero
+* since the kernel validates them.
+*/
+   ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, mode,
+ 0, 0, 0);
+
+   if (ret == 0) {
+   ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS,
+ &new_mode, 0, 0, 0);
+   if (ret == 0) {
+   if (new_mode != mode) {
+   ksft_print_msg("Mode set to %lx not %lx\n",
+  new_mode, mode);
+   ret = -EINVAL;
+   }
+   } else {
+   ksft_print_msg("Failed to validate mode: %d\n", ret);
+   }
+
+   if (enabling != chkfeat_gcs()) {
+   ksft_print_msg("%senabled by prctl but %senabled in 
CHKFEAT\n",
+  enabling ? "" : "not ",
+  chkfeat_gcs() ? "" : "not ");
+   ret = -EINVAL;
+   }
+   }
+
+   return ret;
+}
+
+/* Try to read the status */
+static bool read_status(void)
+{
+   unsigned long state;
+   int ret;
+
+   ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS,
+ &state, 0, 0, 0);
+   if (ret != 0) {
+   ksft_print_msg("Failed to read state: %d\n", ret);
+   return false;
+   }
+
+   return state & PR_SHADOW_STACK_ENABLE;
+}
+
+/* Just a straight enable */
+static bool base_enable(void)
+{
+   int ret;
+
+   ret = gcs_set_status(PR_SHADOW_STACK_ENABLE);
+   if (ret) {
+   ksft_print_msg("PR_SHADOW_STACK_ENABLE failed %d\n", ret);
+   return fa

[PATCH v13 13/40] arm64/mm: Allocate PIE slots for EL0 guarded control stack

2024-10-01 Thread Mark Brown
Pages used for guarded control stacks need to be described to the hardware
using the Permission Indirection Extension; GCS is not supported without
PIE. In order to support copy on write for guarded stacks we allocate two
values, one for active GCSs and one for GCS pages marked as read only prior
to copy.

Since the actual effect is defined using PIE the specific bit pattern used
does not matter to the hardware but we choose two values which differ only
in PTE_WRITE in order to help share code with non-PIE cases.

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 arch/arm64/include/asm/pgtable-prot.h | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable-prot.h 
b/arch/arm64/include/asm/pgtable-prot.h
index 2a11d0c10760..4e4bcd676f4c 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -144,15 +144,23 @@ static inline bool __pure lpa2_is_enabled(void)
 /* 6:PTE_PXN | PTE_WRITE*/
 /* 7: PAGE_SHARED_EXEC   PTE_PXN | PTE_WRITE | PTE_USER */
 /* 8: PAGE_KERNEL_ROX  PTE_UXN  */
-/* 9:  PTE_UXN |   PTE_USER */
+/* 9: PAGE_GCS_RO  PTE_UXN |   PTE_USER */
 /* a: PAGE_KERNEL_EXEC PTE_UXN |   PTE_WRITE*/
-/* b:  PTE_UXN |   PTE_WRITE | PTE_USER */
+/* b: PAGE_GCS PTE_UXN |   PTE_WRITE | PTE_USER */
 /* c: PAGE_KERNEL_RO   PTE_UXN | PTE_PXN*/
 /* d: PAGE_READONLYPTE_UXN | PTE_PXN | PTE_USER */
 /* e: PAGE_KERNEL  PTE_UXN | PTE_PXN | PTE_WRITE*/
 /* f: PAGE_SHARED  PTE_UXN | PTE_PXN | PTE_WRITE | PTE_USER */
 
+#define _PAGE_GCS  (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_WRITE | 
PTE_USER)
+#define _PAGE_GCS_RO   (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_USER)
+
+#define PAGE_GCS   __pgprot(_PAGE_GCS)
+#define PAGE_GCS_RO__pgprot(_PAGE_GCS_RO)
+
 #define PIE_E0 ( \
+   PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS),   PIE_GCS)  | \
+   PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO),PIE_R)   | \
PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY),  PIE_X_O) | \
PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O)  | \
PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC),   PIE_RWX_O) | \
@@ -160,6 +168,8 @@ static inline bool __pure lpa2_is_enabled(void)
PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED),PIE_RW_O))
 
 #define PIE_E1 ( \
+   PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS),   PIE_NONE_O) | \
+   PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO),PIE_NONE_O) | \
PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY),  PIE_NONE_O) | \
PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R)  | \
PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC),   PIE_RW) | \

-- 
2.39.2




[PATCH v13 33/40] kselftest/arm64: Always run signals tests with GCS enabled

2024-10-01 Thread Mark Brown
Since it is not possible to return from the function that enabled GCS
without disabling GCS, it is very inconvenient to use the signal handling
tests to cover GCS when GCS is not enabled by the toolchain and runtime,
something that no current distribution does. Since none of the testcases
do anything with stacks that would cause problems with GCS we can sidestep
this issue by unconditionally enabling GCS on startup and exiting with a
call to exit() rather than a return from main().

Reviewed-by: Thiago Jung Bauermann 
Signed-off-by: Mark Brown 
---
 .../testing/selftests/arm64/signal/test_signals.c  | 17 -
 .../selftests/arm64/signal/test_signals_utils.h| 29 ++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/arm64/signal/test_signals.c 
b/tools/testing/selftests/arm64/signal/test_signals.c
index 00051b40d71e..1304c8ec0f2f 100644
--- a/tools/testing/selftests/arm64/signal/test_signals.c
+++ b/tools/testing/selftests/arm64/signal/test_signals.c
@@ -7,6 +7,10 @@
  * Each test provides its own tde struct tdescr descriptor to link with
  * this wrapper. Framework provides common helpers.
  */
+
+#include 
+#include 
+
 #include 
 
 #include "test_signals.h"
@@ -16,6 +20,16 @@ struct tdescr *current = &tde;
 
 int main(int argc, char *argv[])
 {
+   /*
+* Ensure GCS is at least enabled throughout the tests if
+* supported, otherwise the inability to return from the
+* function that enabled GCS makes it very inconvenient to set
+* up test cases.  The prctl() may fail if GCS was locked by
+* libc setup code.
+*/
+   if (getauxval(AT_HWCAP) & HWCAP_GCS)
+   gcs_set_state(PR_SHADOW_STACK_ENABLE);
+
ksft_print_msg("%s :: %s\n", current->name, current->descr);
if (test_setup(current) && test_init(current)) {
test_run(current);
@@ -23,5 +37,6 @@ int main(int argc, char *argv[])
}
test_result(current);
 
-   return current->result;
+   /* Do not return in case GCS was enabled */
+   exit(current->result);
 }
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.h 
b/tools/testing/selftests/arm64/signal/test_signals_utils.h
index 762c8fe9c54a..1e80808ee105 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.h
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.h
@@ -18,6 +18,35 @@ void test_cleanup(struct tdescr *td);
 int test_run(struct tdescr *td);
 void test_result(struct tdescr *td);
 
+#ifndef __NR_prctl
+#define __NR_prctl 167
+#endif
+
+/*
+ * The prctl takes 1 argument but we need to ensure that the other
+ * values passed in registers to the syscall are zero since the kernel
+ * validates them.
+ */
+#define gcs_set_state(state)   \
+   ({  \
+   register long _num  __asm__ ("x8") = __NR_prctl;\
+   register long _arg1 __asm__ ("x0") = PR_SET_SHADOW_STACK_STATUS; \
+   register long _arg2 __asm__ ("x1") = (long)(state); \
+   register long _arg3 __asm__ ("x2") = 0; \
+   register long _arg4 __asm__ ("x3") = 0; \
+   register long _arg5 __asm__ ("x4") = 0; \
+ \
+   __asm__  volatile ( \
+   "svc #0\n"  \
+   : "=r"(_arg1)   \
+   : "r"(_arg1), "r"(_arg2),   \
+ "r"(_arg3), "r"(_arg4),   \
+ "r"(_arg5), "r"(_num) \
+   : "memory", "cc"\
+   );  \
+   _arg1;  \
+   })
+
 static inline bool feats_ok(struct tdescr *td)
 {
if (td->feats_incompatible & td->feats_supported)

-- 
2.39.2




[PATCH v13 30/40] kselftest/arm64: Add GCS as a detected feature in the signal tests

2024-10-01 Thread Mark Brown
In preparation for testing GCS-related signal handling, add it as a
feature we check for in the signal handling support code.

Reviewed-by: Thiago Jung Bauermann 
Signed-off-by: Mark Brown 
---
 tools/testing/selftests/arm64/signal/test_signals.h   | 2 ++
 tools/testing/selftests/arm64/signal/test_signals_utils.c | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/tools/testing/selftests/arm64/signal/test_signals.h 
b/tools/testing/selftests/arm64/signal/test_signals.h
index 1e6273d81575..7ada43688c02 100644
--- a/tools/testing/selftests/arm64/signal/test_signals.h
+++ b/tools/testing/selftests/arm64/signal/test_signals.h
@@ -35,6 +35,7 @@ enum {
FSME_BIT,
FSME_FA64_BIT,
FSME2_BIT,
+   FGCS_BIT,
FMAX_END
 };
 
@@ -43,6 +44,7 @@ enum {
 #define FEAT_SME   (1UL << FSME_BIT)
 #define FEAT_SME_FA64  (1UL << FSME_FA64_BIT)
 #define FEAT_SME2  (1UL << FSME2_BIT)
+#define FEAT_GCS   (1UL << FGCS_BIT)
 
 /*
  * A descriptor used to describe and configure a test case.
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c 
b/tools/testing/selftests/arm64/signal/test_signals_utils.c
index 0dc948db3a4a..dcc49e3ce1eb 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.c
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c
@@ -30,6 +30,7 @@ static char const *const feats_names[FMAX_END] = {
" SME ",
" FA64 ",
" SME2 ",
+   " GCS ",
 };
 
 #define MAX_FEATS_SZ   128
@@ -329,6 +330,8 @@ int test_init(struct tdescr *td)
td->feats_supported |= FEAT_SME_FA64;
if (getauxval(AT_HWCAP2) & HWCAP2_SME2)
td->feats_supported |= FEAT_SME2;
+   if (getauxval(AT_HWCAP) & HWCAP_GCS)
+   td->feats_supported |= FEAT_GCS;
if (feats_ok(td)) {
if (td->feats_required & td->feats_supported)
fprintf(stderr,

-- 
2.39.2




[PATCH v13 36/40] kselftest/arm64: Add test coverage for GCS mode locking

2024-10-01 Thread Mark Brown
Verify that we can lock individual GCS mode bits, that other modes
aren't affected and, as a side effect, that every combination of modes
can be enabled.

Normally the inability to reenable GCS after disabling it would be an
issue with testing but fortunately the kselftest_harness runs each test
within a fork()ed child.  This can be inconvenient for some kinds of
testing but here it means that each test runs in a separate process and
therefore won't be affected by other tests in the suite.

Once we get toolchains with support for enabling GCS by default we will
need to take care to not do that in the build system but there are no
such toolchains yet so it is not yet an issue.
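
The lock semantics being exercised can be sketched roughly as follows
(illustrative only, assuming GCS is supported, the PR_* constants are
visible and nothing has been locked yet):

  #include <errno.h>
  #include <sys/prctl.h>

  static int lock_demo(void)
  {
          /* Lock the ENABLE bit in its current (disabled) state. */
          prctl(PR_LOCK_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE,
                0, 0, 0);

          /* Flipping a locked bit is now rejected with EBUSY while
           * any unlocked bits remain settable. */
          if (prctl(PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE,
                    0, 0, 0) != 0)
                  return -errno;  /* expect -EBUSY */

          return 0;
  }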

Reviewed-by: Thiago Jung Bauermann 
Tested-by: Thiago Jung Bauermann 
Signed-off-by: Mark Brown 
---
 tools/testing/selftests/arm64/gcs/.gitignore|   1 +
 tools/testing/selftests/arm64/gcs/Makefile  |   2 +-
 tools/testing/selftests/arm64/gcs/gcs-locking.c | 200 
 3 files changed, 202 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/arm64/gcs/.gitignore 
b/tools/testing/selftests/arm64/gcs/.gitignore
index 5810c4a163d4..0c86f53f68ad 100644
--- a/tools/testing/selftests/arm64/gcs/.gitignore
+++ b/tools/testing/selftests/arm64/gcs/.gitignore
@@ -1,2 +1,3 @@
 basic-gcs
 libc-gcs
+gcs-locking
diff --git a/tools/testing/selftests/arm64/gcs/Makefile 
b/tools/testing/selftests/arm64/gcs/Makefile
index a8fdf21e9a47..2173d6275956 100644
--- a/tools/testing/selftests/arm64/gcs/Makefile
+++ b/tools/testing/selftests/arm64/gcs/Makefile
@@ -6,7 +6,7 @@
 # nolibc.
 #
 
-TEST_GEN_PROGS := basic-gcs libc-gcs
+TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking
 
 LDLIBS+=-lpthread
 
diff --git a/tools/testing/selftests/arm64/gcs/gcs-locking.c 
b/tools/testing/selftests/arm64/gcs/gcs-locking.c
new file mode 100644
index ..989f75a491b7
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/gcs-locking.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 ARM Limited.
+ *
+ * Tests for GCS mode locking.  These tests rely on both having GCS
+ * unconfigured on entry and on the kselftest harness running each
+ * test in a fork()ed process which will have its own mode.
+ */
+
+#include 
+
+#include 
+#include 
+
+#include 
+
+#include "kselftest_harness.h"
+
+#include "gcs-util.h"
+
+#define my_syscall2(num, arg1, arg2)  \
+({\
+   register long _num  __asm__ ("x8") = (num);   \
+   register long _arg1 __asm__ ("x0") = (long)(arg1);\
+   register long _arg2 __asm__ ("x1") = (long)(arg2);\
+   register long _arg3 __asm__ ("x2") = 0;   \
+   register long _arg4 __asm__ ("x3") = 0;   \
+   register long _arg5 __asm__ ("x4") = 0;   \
+ \
+   __asm__  volatile (   \
+   "svc #0\n"\
+   : "=r"(_arg1) \
+   : "r"(_arg1), "r"(_arg2), \
+ "r"(_arg3), "r"(_arg4), \
+ "r"(_arg5), "r"(_num)   \
+   : "memory", "cc"  \
+   );\
+   _arg1;\
+})
+
+/* No mode bits are rejected for locking */
+TEST(lock_all_modes)
+{
+   int ret;
+
+   ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, ULONG_MAX, 0, 0, 0);
+   ASSERT_EQ(ret, 0);
+}
+
+FIXTURE(valid_modes)
+{
+};
+
+FIXTURE_VARIANT(valid_modes)
+{
+   unsigned long mode;
+};
+
+FIXTURE_VARIANT_ADD(valid_modes, enable)
+{
+   .mode = PR_SHADOW_STACK_ENABLE,
+};
+
+FIXTURE_VARIANT_ADD(valid_modes, enable_write)
+{
+   .mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE,
+};
+
+FIXTURE_VARIANT_ADD(valid_modes, enable_push)
+{
+   .mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH,
+};
+
+FIXTURE_VARIANT_ADD(valid_modes, enable_write_push)
+{
+   .mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE |
+   PR_SHADOW_STACK_PUSH,
+};
+
+FIXTURE_SETUP(valid_modes)
+{
+}
+
+FIXTURE_TEARDOWN(valid_modes)
+{
+}
+
+/* We can set the mode at all */
+TEST_F(valid_modes, set)
+{
+   int ret;
+
+   ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+ variant->mode);
+   ASSERT_EQ(ret, 0);
+
+   _exit(0);
+}
+
+/* Enabling, locking then disabling is rejected */
+TEST_F(valid_modes, enable_l

[PATCH v13 39/40] kselftest/arm64: Enable GCS for the FP stress tests

2024-10-01 Thread Mark Brown
While it's a bit off topic for them, the floating point stress tests do
give us some coverage of context thrashing cases, and also of active
signal delivery separate from the relatively complicated framework in the
actual signals tests. Have the tests enable GCS on startup, ignoring
failures so they continue to work as before on systems without GCS.
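
In C terms the hook added below behaves roughly like this (a sketch
only; the tests are freestanding assembly so the real implementation
is the enable_gcs macro in the diff):

  #include <sys/prctl.h>

  #ifndef PR_SET_SHADOW_STACK_STATUS
  #define PR_SET_SHADOW_STACK_STATUS      75
  #define PR_SHADOW_STACK_ENABLE          (1UL << 0)
  #endif

  static void enable_gcs(void)
  {
          /* Best effort: the result is deliberately ignored so that
           * systems without GCS keep running the tests unchanged. */
          prctl(PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE,
                0, 0, 0);
  }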

Reviewed-by: Thiago Jung Bauermann 
Tested-by: Thiago Jung Bauermann 
Signed-off-by: Mark Brown 
---
 tools/testing/selftests/arm64/fp/assembler.h   | 15 +++
 tools/testing/selftests/arm64/fp/fpsimd-test.S |  2 ++
 tools/testing/selftests/arm64/fp/sve-test.S|  2 ++
 tools/testing/selftests/arm64/fp/za-test.S |  2 ++
 tools/testing/selftests/arm64/fp/zt-test.S |  2 ++
 5 files changed, 23 insertions(+)

diff --git a/tools/testing/selftests/arm64/fp/assembler.h 
b/tools/testing/selftests/arm64/fp/assembler.h
index 9b38a0da407d..1fc46a5642c2 100644
--- a/tools/testing/selftests/arm64/fp/assembler.h
+++ b/tools/testing/selftests/arm64/fp/assembler.h
@@ -65,4 +65,19 @@ endfunction
bl  puts
 .endm
 
+#define PR_SET_SHADOW_STACK_STATUS  75
+# define PR_SHADOW_STACK_ENABLE (1UL << 0)
+
+.macro enable_gcs
+   // Run with GCS
+   mov x0, PR_SET_SHADOW_STACK_STATUS
+   mov x1, PR_SHADOW_STACK_ENABLE
+   mov x2, xzr
+   mov x3, xzr
+   mov x4, xzr
+   mov x5, xzr
+   mov x8, #__NR_prctl
+   svc #0
+.endm
+
 #endif /* ! ASSEMBLER_H */
diff --git a/tools/testing/selftests/arm64/fp/fpsimd-test.S 
b/tools/testing/selftests/arm64/fp/fpsimd-test.S
index 8b960d01ed2e..b16fb7f42e3e 100644
--- a/tools/testing/selftests/arm64/fp/fpsimd-test.S
+++ b/tools/testing/selftests/arm64/fp/fpsimd-test.S
@@ -215,6 +215,8 @@ endfunction
 // Main program entry point
 .globl _start
 function _start
+   enable_gcs
+
mov x23, #0 // signal count
 
mov w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/sve-test.S 
b/tools/testing/selftests/arm64/fp/sve-test.S
index fff60e2a25ad..2fb4f0b84476 100644
--- a/tools/testing/selftests/arm64/fp/sve-test.S
+++ b/tools/testing/selftests/arm64/fp/sve-test.S
@@ -378,6 +378,8 @@ endfunction
 // Main program entry point
 .globl _start
 function _start
+   enable_gcs
+
mov x23, #0 // Irritation signal count
 
mov w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/za-test.S 
b/tools/testing/selftests/arm64/fp/za-test.S
index 095b45531640..b2603aba99de 100644
--- a/tools/testing/selftests/arm64/fp/za-test.S
+++ b/tools/testing/selftests/arm64/fp/za-test.S
@@ -231,6 +231,8 @@ endfunction
 // Main program entry point
 .globl _start
 function _start
+   enable_gcs
+
mov x23, #0 // signal count
 
mov w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/zt-test.S 
b/tools/testing/selftests/arm64/fp/zt-test.S
index b5c81e81a379..8d9609a49008 100644
--- a/tools/testing/selftests/arm64/fp/zt-test.S
+++ b/tools/testing/selftests/arm64/fp/zt-test.S
@@ -200,6 +200,8 @@ endfunction
 // Main program entry point
 .globl _start
 function _start
+   enable_gcs
+
mov x23, #0 // signal count
 
mov w0, #SIGINT

-- 
2.39.2




[PATCH v13 38/40] kselftest/arm64: Add a GCS stress test

2024-10-01 Thread Mark Brown
Add a stress test which runs one more process than we have CPUs spinning
through a very recursive function with frequent syscalls immediately prior
to return and signals being injected every 100ms. The goal is to flag up
any scheduling-related issues, for example failure to ensure that barriers
are inserted when moving a GCS-using task to another CPU. The test runs for
a configurable amount of time, defaulting to 10 seconds.
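
The signal injection described above amounts to something like the
following sketch (simplified, and the choice of SIGUSR1 is an
assumption for illustration rather than the actual gcs-stress.c
logic):

  #include <signal.h>
  #include <sys/types.h>
  #include <unistd.h>

  static void inject_signals(const pid_t *pids, int npids, int secs)
  {
          /* Poke every worker every 100ms for the run time. */
          for (int tick = 0; tick < secs * 10; tick++) {
                  usleep(100 * 1000);
                  for (int i = 0; i < npids; i++)
                          kill(pids[i], SIGUSR1);
          }
  }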

Reviewed-by: Thiago Jung Bauermann 
Tested-by: Thiago Jung Bauermann 
Signed-off-by: Mark Brown 
---
 tools/testing/selftests/arm64/gcs/.gitignore   |   2 +
 tools/testing/selftests/arm64/gcs/Makefile |   6 +-
 tools/testing/selftests/arm64/gcs/asm-offsets.h|   0
 .../selftests/arm64/gcs/gcs-stress-thread.S| 311 
 tools/testing/selftests/arm64/gcs/gcs-stress.c | 530 +
 5 files changed, 848 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/arm64/gcs/.gitignore 
b/tools/testing/selftests/arm64/gcs/.gitignore
index 0c86f53f68ad..1e8d1f6b27f2 100644
--- a/tools/testing/selftests/arm64/gcs/.gitignore
+++ b/tools/testing/selftests/arm64/gcs/.gitignore
@@ -1,3 +1,5 @@
 basic-gcs
 libc-gcs
 gcs-locking
+gcs-stress
+gcs-stress-thread
diff --git a/tools/testing/selftests/arm64/gcs/Makefile 
b/tools/testing/selftests/arm64/gcs/Makefile
index 2173d6275956..d8b06ca51e22 100644
--- a/tools/testing/selftests/arm64/gcs/Makefile
+++ b/tools/testing/selftests/arm64/gcs/Makefile
@@ -6,7 +6,8 @@
 # nolibc.
 #
 
-TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking
+TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking gcs-stress
+TEST_GEN_PROGS_EXTENDED := gcs-stress-thread
 
 LDLIBS+=-lpthread
 
@@ -18,3 +19,6 @@ $(OUTPUT)/basic-gcs: basic-gcs.c
-I../../../../../usr/include \
-std=gnu99 -I../.. -g \
-ffreestanding -Wall $^ -o $@ -lgcc
+
+$(OUTPUT)/gcs-stress-thread: gcs-stress-thread.S
+   $(CC) -nostdlib $^ -o $@
diff --git a/tools/testing/selftests/arm64/gcs/asm-offsets.h 
b/tools/testing/selftests/arm64/gcs/asm-offsets.h
new file mode 100644
index ..e69de29bb2d1
diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S 
b/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S
new file mode 100644
index ..b88b25217da5
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S
@@ -0,0 +1,311 @@
+// Program that loops for ever doing lots of recursions and system calls,
+// intended to be used as part of a stress test for GCS context switching.
+//
+// Copyright 2015-2023 Arm Ltd
+
+#include 
+
+#define sa_sz 32
+#define sa_flags 8
+#define sa_handler 0
+#define sa_mask_sz 8
+
+#define si_code 8
+
+#define SIGINT 2
+#define SIGABRT 6
+#define SIGUSR1 10
+#define SIGSEGV 11
+#define SIGUSR2 12
+#define SIGTERM 15
+#define SEGV_CPERR 10
+
+#define SA_NODEFER 1073741824
+#define SA_SIGINFO 4
+#define ucontext_regs 184
+
+#define PR_SET_SHADOW_STACK_STATUS  75
+# define PR_SHADOW_STACK_ENABLE (1UL << 0)
+
+#defineGCSPR_EL0 S3_3_C2_C5_1
+
+.macro function name
+   .macro endfunction
+   .type \name, @function
+   .purgem endfunction
+   .endm
+\name:
+.endm
+
+// Print a single character x0 to stdout
+// Clobbers x0-x2,x8
+function putc
+   str x0, [sp, #-16]!
+
+   mov x0, #1  // STDOUT_FILENO
+   mov x1, sp
+   mov x2, #1
+   mov x8, #__NR_write
+   svc #0
+
+   add sp, sp, #16
+   ret
+endfunction
+.globl putc
+
+// Print a NUL-terminated string starting at address x0 to stdout
+// Clobbers x0-x3,x8
+function puts
+   mov x1, x0
+
+   mov x2, #0
+0: ldrbw3, [x0], #1
+   cbz w3, 1f
+   add x2, x2, #1
+   b   0b
+
+1: mov w0, #1  // STDOUT_FILENO
+   mov x8, #__NR_write
+   svc #0
+
+   ret
+endfunction
+.globl puts
+
+// Utility macro to print a literal string
+// Clobbers x0-x4,x8
+.macro puts string
+   .pushsection .rodata.str1.1, "aMS", @progbits, 1
+.L__puts_literal\@: .string "\string"
+   .popsection
+
+   ldr x0, =.L__puts_literal\@
+   bl  puts
+.endm
+
+// Print an unsigned decimal number x0 to stdout
+// Clobbers x0-x4,x8
+function putdec
+   mov x1, sp
+   str x30, [sp, #-32]!// Result can't be > 20 digits
+
+   mov x2, #0
+   strbw2, [x1, #-1]!  // Write the NUL terminator
+
+   mov x2, #10
+0: udivx3, x0, x2  // div-mod loop to generate the digits
+   msubx0, x3, x2, x0
+   add w0, w0, #'0'
+   strbw0, [x1, #-1]!
+   mov x0, x3
+   cbnzx3, 0b
+
+   ldrbw0, [x1]
+   cbnzw0, 1f
+   mov w0, #'0'// Print "0" for 0, not ""
+   strbw0, [x1, #-1]!
+
+1: mov x0, x1
+   bl  puts
+
+   ldr x30, [sp], #32
+   ret

[PATCH v13 35/40] kselftest/arm64: Add a GCS test program built with the system libc

2024-10-01 Thread Mark Brown
There are things like threads which nolibc struggles with and which we
want to add coverage for, and the ABI allows us to test most of these
even if libc itself does not understand GCS, so add a test application
built using the system libc.

Reviewed-by: Thiago Jung Bauermann 
Tested-by: Thiago Jung Bauermann 
Signed-off-by: Mark Brown 
---
 tools/testing/selftests/arm64/gcs/.gitignore |   1 +
 tools/testing/selftests/arm64/gcs/Makefile   |   4 +-
 tools/testing/selftests/arm64/gcs/gcs-util.h |  10 +
 tools/testing/selftests/arm64/gcs/libc-gcs.c | 728 +++
 4 files changed, 742 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/arm64/gcs/.gitignore 
b/tools/testing/selftests/arm64/gcs/.gitignore
index 0e5e695ecba5..5810c4a163d4 100644
--- a/tools/testing/selftests/arm64/gcs/.gitignore
+++ b/tools/testing/selftests/arm64/gcs/.gitignore
@@ -1 +1,2 @@
 basic-gcs
+libc-gcs
diff --git a/tools/testing/selftests/arm64/gcs/Makefile 
b/tools/testing/selftests/arm64/gcs/Makefile
index 61a30f483429..a8fdf21e9a47 100644
--- a/tools/testing/selftests/arm64/gcs/Makefile
+++ b/tools/testing/selftests/arm64/gcs/Makefile
@@ -6,7 +6,9 @@
 # nolibc.
 #
 
-TEST_GEN_PROGS := basic-gcs
+TEST_GEN_PROGS := basic-gcs libc-gcs
+
+LDLIBS+=-lpthread
 
 include ../../lib.mk
 
diff --git a/tools/testing/selftests/arm64/gcs/gcs-util.h 
b/tools/testing/selftests/arm64/gcs/gcs-util.h
index 1ae6864d3f86..c99a6b39ac14 100644
--- a/tools/testing/selftests/arm64/gcs/gcs-util.h
+++ b/tools/testing/selftests/arm64/gcs/gcs-util.h
@@ -16,6 +16,16 @@
 #define __NR_prctl 167
 #endif
 
+#ifndef NT_ARM_GCS
+#define NT_ARM_GCS 0x410
+
+struct user_gcs {
+   __u64 features_enabled;
+   __u64 features_locked;
+   __u64 gcspr_el0;
+};
+#endif
+
 /* Shadow Stack/Guarded Control Stack interface */
 #define PR_GET_SHADOW_STACK_STATUS 74
 #define PR_SET_SHADOW_STACK_STATUS  75
diff --git a/tools/testing/selftests/arm64/gcs/libc-gcs.c 
b/tools/testing/selftests/arm64/gcs/libc-gcs.c
new file mode 100644
index ..17b2fabfec38
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/libc-gcs.c
@@ -0,0 +1,728 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 ARM Limited.
+ */
+
+#define _GNU_SOURCE
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include 
+
+#include "kselftest_harness.h"
+
+#include "gcs-util.h"
+
+#define my_syscall2(num, arg1, arg2)  \
+({\
+   register long _num  __asm__ ("x8") = (num);   \
+   register long _arg1 __asm__ ("x0") = (long)(arg1);\
+   register long _arg2 __asm__ ("x1") = (long)(arg2);\
+   register long _arg3 __asm__ ("x2") = 0;   \
+   register long _arg4 __asm__ ("x3") = 0;   \
+   register long _arg5 __asm__ ("x4") = 0;   \
+ \
+   __asm__  volatile (   \
+   "svc #0\n"\
+   : "=r"(_arg1) \
+   : "r"(_arg1), "r"(_arg2), \
+ "r"(_arg3), "r"(_arg4), \
+ "r"(_arg5), "r"(_num)   \
+   : "memory", "cc"  \
+   );\
+   _arg1;\
+})
+
+static noinline void gcs_recurse(int depth)
+{
+   if (depth)
+   gcs_recurse(depth - 1);
+
+   /* Prevent tail call optimization so we actually recurse */
+   asm volatile("dsb sy" : : : "memory");
+}
+
+/* Smoke test that a function call and return works */
+TEST(can_call_function)
+{
+   gcs_recurse(0);
+}
+
+static void *gcs_test_thread(void *arg)
+{
+   int ret;
+   unsigned long mode;
+
+   /*
+* Some libcs don't seem to fill unused arguments with 0 but
+* the kernel validates this so we supply all 5 arguments.
+*/
+   ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0);
+   if (ret != 0) {
+   ksft_print_msg("PR_GET_SHADOW_STACK_STATUS failed: %d\n", ret);
+   return NULL;
+   }
+
+   if (!(mode & PR_SHADOW_STACK_ENABLE)) {
+   ksft_print_msg("GCS not enabled in thread, mode is %lu\n",
+  mode);
+   return NULL;
+   }
+
+   /* Just in case... */
+   gcs_recurse(0);
+
+   /* Use a non-NULL value to indicate a pas

[PATCH v13 37/40] kselftest/arm64: Add GCS signal tests

2024-10-01 Thread Mark Brown
Do some testing of the signal handling for GCS, checking that a GCS
frame has the expected information in it and that the expected signals
are delivered for invalid operations.

Reviewed-by: Thiago Jung Bauermann 
Tested-by: Thiago Jung Bauermann 
Signed-off-by: Mark Brown 
---
 tools/testing/selftests/arm64/signal/.gitignore|  1 +
 .../selftests/arm64/signal/test_signals_utils.h| 10 +++
 .../arm64/signal/testcases/gcs_exception_fault.c   | 62 +++
 .../selftests/arm64/signal/testcases/gcs_frame.c   | 88 ++
 .../arm64/signal/testcases/gcs_write_fault.c   | 67 
 5 files changed, 228 insertions(+)

diff --git a/tools/testing/selftests/arm64/signal/.gitignore 
b/tools/testing/selftests/arm64/signal/.gitignore
index b2f2bfd5c6aa..b257db665a35 100644
--- a/tools/testing/selftests/arm64/signal/.gitignore
+++ b/tools/testing/selftests/arm64/signal/.gitignore
@@ -3,6 +3,7 @@ mangle_*
 fake_sigreturn_*
 fpmr_*
 poe_*
+gcs_*
 sme_*
 ssve_*
 sve_*
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.h 
b/tools/testing/selftests/arm64/signal/test_signals_utils.h
index 1e80808ee105..36fc12b3cd60 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.h
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.h
@@ -6,6 +6,7 @@
 
 #include 
 #include 
+#include 
 #include 
 
 #include 
@@ -47,6 +48,15 @@ void test_result(struct tdescr *td);
_arg1;  \
})
 
+static inline __attribute__((always_inline)) uint64_t get_gcspr_el0(void)
+{
+   uint64_t val;
+
+   asm volatile("mrs %0, S3_3_C2_C5_1" : "=r" (val));
+
+   return val;
+}
+
 static inline bool feats_ok(struct tdescr *td)
 {
if (td->feats_incompatible & td->feats_supported)
diff --git 
a/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c 
b/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c
new file mode 100644
index ..6228448b2ae7
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 ARM Limited
+ */
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+/*
+ * We should get this from asm/siginfo.h but the testsuite is being
+ * clever with redefining siginfo_t.
+ */
+#ifndef SEGV_CPERR
+#define SEGV_CPERR 10
+#endif
+
+static inline void gcsss1(uint64_t Xt)
+{
+   asm volatile (
+   "sys #3, C7, C7, #2, %0\n"
+   :
+   : "rZ" (Xt)
+   : "memory");
+}
+
+static int gcs_op_fault_trigger(struct tdescr *td)
+{
+   /*
+* The slot below our current GCS should be in a valid GCS but
+* must not have a valid cap in it.
+*/
+   gcsss1(get_gcspr_el0() - 8);
+
+   return 0;
+}
+
+static int gcs_op_fault_signal(struct tdescr *td, siginfo_t *si,
+ ucontext_t *uc)
+{
+   ASSERT_GOOD_CONTEXT(uc);
+
+   return 1;
+}
+
+struct tdescr tde = {
+   .name = "Invalid GCS operation",
+   .descr = "An invalid GCS operation generates the expected signal",
+   .feats_required = FEAT_GCS,
+   .timeout = 3,
+   .sig_ok = SIGSEGV,
+   .sig_ok_code = SEGV_CPERR,
+   .sanity_disabled = true,
+   .trigger = gcs_op_fault_trigger,
+   .run = gcs_op_fault_signal,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c 
b/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c
new file mode 100644
index ..b405d82321da
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 ARM Limited
+ */
+
+#include 
+#include 
+#include 
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+static union {
+   ucontext_t uc;
+   char buf[1024 * 64];
+} context;
+
+static int gcs_regs(struct tdescr *td, siginfo_t *si, ucontext_t *uc)
+{
+   size_t offset;
+   struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context);
+   struct gcs_context *gcs;
+   unsigned long expected, gcspr;
+   uint64_t *u64_val;
+   int ret;
+
+   ret = prctl(PR_GET_SHADOW_STACK_STATUS, &expected, 0, 0, 0);
+   if (ret != 0) {
+   fprintf(stderr, "Unable to query GCS status\n");
+   return 1;
+   }
+
+   /* We expect a cap to be added to the GCS in the signal frame */
+   gcspr = get_gcspr_el0();
+   gcspr -= 8;
+   fprintf(stderr, "Expecting GCSPR_EL0 %lx\n", gcspr);
+
+   if (!get_current_context(td, &context.uc, sizeof(context))) {
+   fprintf(stderr, "Failed getting context\n");
+   return 1;
+   }
+
+   /* Ensure that the signal restore token was consumed */
+   u64_val = (uint64_t *)get_gcspr_el0() + 1;
+  

[PATCH v13 20/40] arm64/mm: Handle GCS data aborts

2024-10-01 Thread Mark Brown
All GCS operations at EL0 must happen on a page which is marked as
having UnprivGCS access, including read operations.  If a GCS operation
attempts to access a page without this then it will generate a data
abort with the GCS bit set in ESR_EL1.ISS2.

EL0 may validly generate such faults, for example due to copy on write
which will cause the GCS data to be stored in a read only page with no
GCS permissions until the actual copy happens.  Since UnprivGCS allows
both reads and writes to the GCS (though only through GCS operations) we
need to ensure that the memory management subsystem handles GCS accesses
as writes at all times.  Do this by adding FAULT_FLAG_WRITE to any GCS
page faults, adding handling to ensure that invalid cases are
identified as such early so the memory management core does not think
they will
succeed.  The core cannot distinguish between VMAs which are generally
writeable and VMAs which are only writeable through GCS operations.

EL1 may validly write to EL0 GCS for management purposes (e.g. while
initialising with cap tokens).

We also report any GCS faults in VMAs not marked as part of a GCS as
access violations, causing a fault to be delivered to userspace if it
attempts to do GCS operations outside a GCS.

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 arch/arm64/mm/fault.c | 40 
 1 file changed, 40 insertions(+)

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 8b281cf308b3..c2f89a678ac0 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -504,6 +504,14 @@ static bool fault_from_pkey(unsigned long esr, struct 
vm_area_struct *vma,
false);
 }
 
+static bool is_gcs_fault(unsigned long esr)
+{
+   if (!esr_is_data_abort(esr))
+   return false;
+
+   return ESR_ELx_ISS2(esr) & ESR_ELx_GCS;
+}
+
 static bool is_el0_instruction_abort(unsigned long esr)
 {
return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
@@ -518,6 +526,23 @@ static bool is_write_abort(unsigned long esr)
return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
 }
 
+static bool is_invalid_gcs_access(struct vm_area_struct *vma, u64 esr)
+{
+   if (!system_supports_gcs())
+   return false;
+
+   if (unlikely(is_gcs_fault(esr))) {
+   /* GCS accesses must be performed on a GCS page */
+   if (!(vma->vm_flags & VM_SHADOW_STACK))
+   return true;
+   } else if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) {
+   /* Only GCS operations can write to a GCS page */
+   return esr_is_data_abort(esr) && is_write_abort(esr);
+   }
+
+   return false;
+}
+
 static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
   struct pt_regs *regs)
 {
@@ -554,6 +579,14 @@ static int __kprobes do_page_fault(unsigned long far, 
unsigned long esr,
/* It was exec fault */
vm_flags = VM_EXEC;
mm_flags |= FAULT_FLAG_INSTRUCTION;
+   } else if (is_gcs_fault(esr)) {
+   /*
+* The GCS permission on a page implies both read and
+* write so always handle any GCS fault as a write fault,
+* we need to trigger CoW even for GCS reads.
+*/
+   vm_flags = VM_WRITE;
+   mm_flags |= FAULT_FLAG_WRITE;
} else if (is_write_abort(esr)) {
/* It was write fault */
vm_flags = VM_WRITE;
@@ -587,6 +620,13 @@ static int __kprobes do_page_fault(unsigned long far, 
unsigned long esr,
if (!vma)
goto lock_mmap;
 
+   if (is_invalid_gcs_access(vma, esr)) {
+   vma_end_read(vma);
+   fault = 0;
+   si_code = SEGV_ACCERR;
+   goto bad_area;
+   }
+
if (!(vma->vm_flags & vm_flags)) {
vma_end_read(vma);
fault = 0;

-- 
2.39.2




[PATCH v13 31/40] kselftest/arm64: Add framework support for GCS to signal handling tests

2024-10-01 Thread Mark Brown
Teach the framework about the GCS signal context, avoiding warnings on
the unknown context.

Reviewed-by: Thiago Jung Bauermann 
Signed-off-by: Mark Brown 
---
 tools/testing/selftests/arm64/signal/testcases/testcases.c | 7 +++
 tools/testing/selftests/arm64/signal/testcases/testcases.h | 1 +
 2 files changed, 8 insertions(+)

diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c 
b/tools/testing/selftests/arm64/signal/testcases/testcases.c
index e6daa94fcd2e..0c1a6b26afac 100644
--- a/tools/testing/selftests/arm64/signal/testcases/testcases.c
+++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c
@@ -198,6 +198,13 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, 
char **err)
*err = "Bad size for fpmr_context";
new_flags |= FPMR_CTX;
break;
+   case GCS_MAGIC:
+   if (flags & GCS_CTX)
+   *err = "Multiple GCS_MAGIC";
+   if (head->size != sizeof(struct gcs_context))
+   *err = "Bad size for gcs_context";
+   new_flags |= GCS_CTX;
+   break;
case EXTRA_MAGIC:
if (flags & EXTRA_CTX)
*err = "Multiple EXTRA_MAGIC";
diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h 
b/tools/testing/selftests/arm64/signal/testcases/testcases.h
index 9872b8912714..98b97efdda23 100644
--- a/tools/testing/selftests/arm64/signal/testcases/testcases.h
+++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h
@@ -20,6 +20,7 @@
 #define EXTRA_CTX  (1 << 3)
 #define ZT_CTX (1 << 4)
 #define FPMR_CTX   (1 << 5)
+#define GCS_CTX(1 << 6)
 
 #define KSFT_BAD_MAGIC 0xdeadbeef
 

-- 
2.39.2




[PATCH v13 32/40] kselftest/arm64: Allow signals tests to specify an expected si_code

2024-10-01 Thread Mark Brown
Currently we ignore si_code unless the expected signal is a SIGSEGV, in
which case we enforce it being SEGV_ACCERR. Allow test cases to specify
exactly which si_code should be generated so we can validate this, and
test for other segfault codes.
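
For example, the GCS signal tests elsewhere in this series use the new
field to check for GCS-specific faults:

  struct tdescr tde = {
          /* ... */
          .sig_ok = SIGSEGV,
          .sig_ok_code = SEGV_CPERR,
          /* ... */
  };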

Reviewed-by: Thiago Jung Bauermann 
Signed-off-by: Mark Brown 
---
 .../testing/selftests/arm64/signal/test_signals.h  |  4 +++
 .../selftests/arm64/signal/test_signals_utils.c| 29 ++
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/tools/testing/selftests/arm64/signal/test_signals.h 
b/tools/testing/selftests/arm64/signal/test_signals.h
index 7ada43688c02..ee75a2c25ce7 100644
--- a/tools/testing/selftests/arm64/signal/test_signals.h
+++ b/tools/testing/selftests/arm64/signal/test_signals.h
@@ -71,6 +71,10 @@ struct tdescr {
 * Zero when no signal is expected on success
 */
int sig_ok;
+   /*
+* expected si_code for sig_ok, or 0 to not check
+*/
+   int sig_ok_code;
/* signum expected on unsupported CPU features. */
int sig_unsupp;
/* a timeout in second for test completion */
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c 
b/tools/testing/selftests/arm64/signal/test_signals_utils.c
index dcc49e3ce1eb..5d3621921cfe 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.c
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c
@@ -143,16 +143,25 @@ static bool handle_signal_ok(struct tdescr *td,
"current->token ZEROED...test is probably broken!\n");
abort();
}
-   /*
-* Trying to narrow down the SEGV to the ones generated by Kernel itself
-* via arm64_notify_segfault(). This is a best-effort check anyway, and
-* the si_code check may need to change if this aspect of the kernel
-* ABI changes.
-*/
-   if (td->sig_ok == SIGSEGV && si->si_code != SEGV_ACCERR) {
-   fprintf(stdout,
-   "si_code != SEGV_ACCERR...test is probably broken!\n");
-   abort();
+   if (td->sig_ok_code) {
+   if (si->si_code != td->sig_ok_code) {
+   fprintf(stdout, "si_code is %d not %d\n",
+   si->si_code, td->sig_ok_code);
+   abort();
+   }
+   } else {
+   /*
+* Trying to narrow down the SEGV to the ones
+* generated by Kernel itself via
+* arm64_notify_segfault(). This is a best-effort
+* check anyway, and the si_code check may need to
+* change if this aspect of the kernel ABI changes.
+*/
+   if (td->sig_ok == SIGSEGV && si->si_code != SEGV_ACCERR) {
+   fprintf(stdout,
+   "si_code != SEGV_ACCERR...test is probably 
broken!\n");
+   abort();
+   }
}
td->pass = 1;
/*

-- 
2.39.2




[PATCH v13 23/40] arm64/gcs: Implement shadow stack prctl() interface

2024-10-01 Thread Mark Brown
Implement the architecture-neutral prctl() interface for setting the
shadow stack status; this supports setting and reading the current GCS
configuration for the current thread.

Userspace can enable basic GCS functionality and additionally support
for GCS pushes and arbitrary GCS stores.  It is expected that this
prctl() will be called very early in application startup, for example
by the dynamic linker, and not subsequently adjusted during normal
operation.  Users should carefully note that after enabling GCS for a
thread, GCS will become active with no call stack, so it is not
normally possible to return from the function that invoked the prctl().

State is stored per thread; enabling GCS for a thread causes a GCS to
be allocated for that thread.

Userspace may lock the current GCS configuration via the
PR_LOCK_SHADOW_STACK_STATUS prctl(); this prevents any further changes
to the GCS configuration via any means.

If GCS is not being enabled then all other flags are ignored; it is
not possible to enable stores or pops without enabling GCS.

When disabling the GCS we do not free the allocated stack; this allows
for inspection of the GCS after disabling as part of fault reporting.
Since it is not an expected use case, and since it presents some
complications in determining what to do with previously initialised
data on the GCS, attempts to reenable GCS after this are rejected.
This can be revisited if a use case arises.
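
An early-startup caller might therefore use the interface along these
lines (an illustrative sketch with error handling elided; the hook
name is hypothetical):

  #include <limits.h>
  #include <sys/prctl.h>

  void early_gcs_setup(void)      /* hypothetical startup hook */
  {
          /* Enable GCS before there is any call stack that will
           * later need to be returned through. */
          prctl(PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE,
                0, 0, 0);

          /* Optionally lock all feature bits against any further
           * reconfiguration. */
          prctl(PR_LOCK_SHADOW_STACK_STATUS, ULONG_MAX, 0, 0, 0);
  }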

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 arch/arm64/include/asm/gcs.h   | 22 +++
 arch/arm64/include/asm/processor.h |  1 +
 arch/arm64/mm/gcs.c| 79 ++
 3 files changed, 102 insertions(+)

diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h
index c1f274fdb9c0..48c97e63e56a 100644
--- a/arch/arm64/include/asm/gcs.h
+++ b/arch/arm64/include/asm/gcs.h
@@ -50,6 +50,9 @@ static inline u64 gcsss2(void)
return Xt;
 }
 
+#define PR_SHADOW_STACK_SUPPORTED_STATUS_MASK \
+   (PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | PR_SHADOW_STACK_PUSH)
+
 #ifdef CONFIG_ARM64_GCS
 
 static inline bool task_gcs_el0_enabled(struct task_struct *task)
@@ -63,6 +66,20 @@ void gcs_preserve_current_state(void);
 unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
 const struct kernel_clone_args *args);
 
+static inline int gcs_check_locked(struct task_struct *task,
+  unsigned long new_val)
+{
+   unsigned long cur_val = task->thread.gcs_el0_mode;
+
+   cur_val &= task->thread.gcs_el0_locked;
+   new_val &= task->thread.gcs_el0_locked;
+
+   if (cur_val != new_val)
+   return -EBUSY;
+
+   return 0;
+}
+
 #else
 
 static inline bool task_gcs_el0_enabled(struct task_struct *task)
@@ -78,6 +95,11 @@ static inline unsigned long gcs_alloc_thread_stack(struct 
task_struct *tsk,
 {
return -ENOTSUPP;
 }
+static inline int gcs_check_locked(struct task_struct *task,
+  unsigned long new_val)
+{
+   return 0;
+}
 
 #endif
 
diff --git a/arch/arm64/include/asm/processor.h 
b/arch/arm64/include/asm/processor.h
index 5260788247d8..37fefdc3d3a3 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -187,6 +187,7 @@ struct thread_struct {
u64 por_el0;
 #ifdef CONFIG_ARM64_GCS
unsigned intgcs_el0_mode;
+   unsigned intgcs_el0_locked;
u64 gcspr_el0;
u64 gcs_base;
u64 gcs_size;
diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c
index 3c7a18f57ea9..61a80de6baf8 100644
--- a/arch/arm64/mm/gcs.c
+++ b/arch/arm64/mm/gcs.c
@@ -109,3 +109,82 @@ void gcs_free(struct task_struct *task)
task->thread.gcs_base = 0;
task->thread.gcs_size = 0;
 }
+
+int arch_set_shadow_stack_status(struct task_struct *task, unsigned long arg)
+{
+   unsigned long gcs, size;
+   int ret;
+
+   if (!system_supports_gcs())
+   return -EINVAL;
+
+   if (is_compat_thread(task_thread_info(task)))
+   return -EINVAL;
+
+   /* Reject unknown flags */
+   if (arg & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK)
+   return -EINVAL;
+
+   ret = gcs_check_locked(task, arg);
+   if (ret != 0)
+   return ret;
+
+   /* If we are enabling GCS then make sure we have a stack */
+   if (arg & PR_SHADOW_STACK_ENABLE &&
+   !task_gcs_el0_enabled(task)) {
+   /* Do not allow GCS to be reenabled */
+   if (task->thread.gcs_base || task->thread.gcspr_el0)
+   return -EINVAL;
+
+   if (task != current)
+   return -EBUSY;
+
+   size = gcs_size(0);
+   gcs = alloc_gcs(0, size);
+

[PATCH v13 15/40] arm64/mm: Map pages for guarded control stack

2024-10-01 Thread Mark Brown
Map pages flagged as being part of a GCS as such rather than using the
full set of generic VM flags.

This is done using a conditional rather than extending the size of
protection_map since that would make for a very sparse array.

Reviewed-by: Thiago Jung Bauermann 
Reviewed-by: Catalin Marinas 
Signed-off-by: Mark Brown 
---
 arch/arm64/include/asm/mman.h | 9 +
 arch/arm64/mm/mmap.c  | 9 -
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h
index 03b790fd0ad8..f6d784f8e6e0 100644
--- a/arch/arm64/include/asm/mman.h
+++ b/arch/arm64/include/asm/mman.h
@@ -71,6 +71,15 @@ static inline bool arch_validate_flags(unsigned long 
vm_flags)
return false;
}
 
+   if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) {
+   /* An executable GCS isn't a good idea. */
+   if (vm_flags & VM_EXEC)
+   return false;
+
+   /* The memory management core should prevent this */
+   VM_WARN_ON(vm_flags & VM_SHARED);
+   }
+
return true;
 
 }
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 7e3ad97e27d8..07aeab8a7606 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -83,8 +83,15 @@ arch_initcall(adjust_protection_map);
 
 pgprot_t vm_get_page_prot(unsigned long vm_flags)
 {
-   pteval_t prot = pgprot_val(protection_map[vm_flags &
+   pteval_t prot;
+
+   /* Short circuit GCS to avoid bloating the table. */
+   if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) {
+   prot = _PAGE_GCS_RO;
+   } else {
+   prot = pgprot_val(protection_map[vm_flags &
   (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
+   }
 
if (vm_flags & VM_ARM64_BTI)
prot |= PTE_GP;

-- 
2.39.2