Re: [PATCH V11 03/17] riscv: Use Zicbop in arch_xchg when available

2023-09-15 Thread Leonardo Bras
On Thu, Sep 14, 2023 at 03:47:59PM +0200, Andrew Jones wrote:
> On Sun, Sep 10, 2023 at 04:28:57AM -0400, guo...@kernel.org wrote:
> > From: Guo Ren 
> > 
> > Cache-block prefetch instructions are HINTs to the hardware to
> > indicate that software intends to perform a particular type of
> > memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> > improve the arch_xchg for qspinlock xchg_tail.
> > 
> > Signed-off-by: Guo Ren 
> > Signed-off-by: Guo Ren 
> > ---
> >  arch/riscv/Kconfig | 15 +++
> >  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
> >  arch/riscv/include/asm/hwcap.h |  1 +
> >  arch/riscv/include/asm/insn-def.h  |  5 +
> >  arch/riscv/include/asm/processor.h | 13 +
> >  arch/riscv/kernel/cpufeature.c |  1 +
> >  6 files changed, 38 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index e9ae6fa232c3..2c346fe169c1 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
> >  
> >If you don't know what to do here, say Y.
> >  
> > +config RISCV_ISA_ZICBOP
> > +   bool "Zicbop extension support for cache block prefetch"
> > +   depends on MMU
> > +   depends on RISCV_ALTERNATIVE
> > +   default y
> > +   help
> > +  Adds support to dynamically detect the presence of the ZICBOP
> > +  extension (Cache Block Prefetch Operations) and enable its
> > +  usage.
> > +
> > +  The Zicbop extension can be used to prefetch cache blocks for
> > +  read/write/instruction fetch.
> > +
> > +  If you don't know what to do here, say Y.
> > +
> >  config TOOLCHAIN_HAS_ZIHINTPAUSE
> > bool
> > default y
> > diff --git a/arch/riscv/include/asm/cmpxchg.h 
> > b/arch/riscv/include/asm/cmpxchg.h
> > index 702725727671..56eff7a9d2d2 100644
> > --- a/arch/riscv/include/asm/cmpxchg.h
> > +++ b/arch/riscv/include/asm/cmpxchg.h
> > @@ -11,6 +11,7 @@
> >  
> >  #include 
> >  #include 
> > +#include 
> >  
> >  #define __arch_xchg_masked(prepend, append, r, p, n)   
> > \
> >  ({ \
> > @@ -25,6 +26,7 @@
> > \
> > __asm__ __volatile__ (  \
> >prepend  \
> > +  PREFETCHW_ASM(%5)\
> >"0:  lr.w %0, %2\n"  \
> >"and  %1, %0, %z4\n" \
> >"or   %1, %1, %z3\n" \
> > @@ -32,7 +34,7 @@
> >"bnez %1, 0b\n"  \
> >append   \
> >: "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))   \
> > -  : "rJ" (__newx), "rJ" (~__mask)  \
> > +  : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b) \
> >: "memory"); \
> > \
> > r = (__typeof__(*(p)))((__retx & __mask) >> __s);   \
> > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > index b7b58258f6c7..78b7b8b53778 100644
> > --- a/arch/riscv/include/asm/hwcap.h
> > +++ b/arch/riscv/include/asm/hwcap.h
> > @@ -58,6 +58,7 @@
> >  #define RISCV_ISA_EXT_ZICSR40
> >  #define RISCV_ISA_EXT_ZIFENCEI 41
> >  #define RISCV_ISA_EXT_ZIHPM42
> > +#define RISCV_ISA_EXT_ZICBOP   43
> >  
> >  #define RISCV_ISA_EXT_MAX  64
> >  
> > diff --git a/arch/riscv/include/asm/insn-def.h 
> > b/arch/riscv/include/asm/insn-def.h
> > index 6960beb75f32..dc590d331894 100644
> > --- a/arch/riscv/include/asm/insn-def.h
> > +++ b/arch/riscv/include/asm/insn-def.h
> > @@ -134,6 +134,7 @@
> >  
> >  #define RV_OPCODE_MISC_MEM RV_OPCODE(15)
> >  #define RV_OPCODE_SYSTEM   RV_OPCODE(115)
> > +#define RV_OPCODE_PREFETCH RV_OPCODE(19)
> 
> This should be named RV_OPCODE_OP_IMM and be placed in
> numerical order with the others, i.e. above SYSTEM.
> 
> >  
> >  #define HFENCE_VVMA(vaddr, asid)   \
> > INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),  \
> > @@ -196,4 +197,8 @@
> > INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),  \
> >RS1(base), SIMM12(4))
> >  
> > +#define CBO_prefetchw(base)\
> 
> Please name this 'PREFETCH_w' and it should take an immediate parameter,
> even if we intend to pass 0 for it.

It makes sense.

The mnemonic in the previously mentioned documentation is:

prefetch.w offset(base)

So yeah, makes sense to have both offset and base as parameters for 
CBO_prefetchw (or PREFETCH_w, I have no strong preference).
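
For reference, the encoding being discussed can be reproduced with a small
standalone sketch. This is illustrative only and not part of the patch; it
assumes the Zicbop layout in which the prefetch instructions reuse the
OP-IMM/ORI encoding (major opcode 19, funct3 0b110, rd fixed to x0) and the
low five immediate bits, the rs2 slot in a store-style reading, hold 0b00011
to select the "w" variant, so only offset[11:5] is encodable:

/*
 * Userspace sketch: build the 32-bit encoding of "prefetch.w offset(base)".
 * Assumptions (not taken from the patch): OP-IMM opcode, funct3 = 0b110,
 * rd = x0, imm[4:0] = 0b00011, offset must be a multiple of 32.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t prefetch_w(unsigned int base_reg, int32_t offset)
{
	uint32_t imm11_5 = ((uint32_t)offset >> 5) & 0x7f;	/* offset[11:5] */

	return (imm11_5 << 25) |		/* imm[11:5], S-type style      */
	       (0x3u << 20) |			/* 0b00011 selects prefetch.w   */
	       ((base_reg & 0x1f) << 15) |	/* rs1 = base                   */
	       (0x6u << 12) |			/* funct3 = 0b110               */
	       (0x0u << 7) |			/* rd = x0                      */
	       0x13u;				/* opcode = OP-IMM (19 decimal) */
}

int main(void)
{
	/* prefetch.w 0(a0); a0 is x10, expected word 0x00356013 */
	printf(".word 0x%08x\n", prefetch_w(10, 0));
	return 0;
}

The field that would be rs2 is really part of a store-style immediate, which
is why the discussion later in this thread moves from INSN_R() to INSN_S().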

Re: [PATCH V11 05/17] riscv: qspinlock: Add basic queued_spinlock support

2023-09-15 Thread Leonardo Bras
On Fri, Sep 15, 2023 at 10:10:25AM +0800, Guo Ren wrote:
> On Thu, Sep 14, 2023 at 5:43 PM Leonardo Bras  wrote:
> >
> > On Thu, Sep 14, 2023 at 12:46:56PM +0800, Guo Ren wrote:
> > > On Thu, Sep 14, 2023 at 4:29 AM Leonardo Bras  wrote:
> > > >
> > > > On Sun, Sep 10, 2023 at 04:28:59AM -0400, guo...@kernel.org wrote:
> > > > > From: Guo Ren 
> > > > >
> > > > > The requirements of qspinlock have been documented by commit:
> > > > > a8ad07e5240c ("asm-generic: qspinlock: Indicate the use of mixed-size
> > > > > atomics").
> > > > >
> > > > > Although the RISC-V ISA only gives a weaker forward-progress guarantee
> > > > > for LR/SC, which doesn't satisfy the qspinlock requirements above, that
> > > > > doesn't prevent RISC-V vendors from implementing a strong forward-progress
> > > > > guarantee for LR/SC in their microarchitecture to match the xchg_tail
> > > > > requirement. The T-HEAD C9xx processor is one such implementation.
> > > > >
> > > > > We've tested the patch on SOPHGO sg2042 & th1520 and passed the stress
> > > > > test on Fedora & Ubuntu & OpenEuler ... Here is the performance
> > > > > comparison between qspinlock and ticket_lock on sg2042 (64 cores):
> > > > >
> > > > > sysbench test=threads threads=32 yields=100 lock=8 (+13.8%):
> > > > >   queued_spinlock 0.5109/0.00
> > > > >   ticket_spinlock 0.5814/0.00
> > > > >
> > > > > perf futex/hash (+6.7%):
> > > > >   queued_spinlock 1444393 operations/sec (+- 0.09%)
> > > > >   ticket_spinlock 1353215 operations/sec (+- 0.15%)
> > > > >
> > > > > perf futex/wake-parallel (+8.6%):
> > > > >   queued_spinlock (waking 1/64 threads) in 0.0253 ms (+-2.90%)
> > > > >   ticket_spinlock (waking 1/64 threads) in 0.0275 ms (+-3.12%)
> > > > >
> > > > > perf futex/requeue (+4.2%):
> > > > >   queued_spinlock Requeued 64 of 64 threads in 0.0785 ms (+-0.55%)
> > > > >   ticket_spinlock Requeued 64 of 64 threads in 0.0818 ms (+-4.12%)
> > > > >
> > > > > System Benchmarks (+6.4%)
> > > > >   queued_spinlock:
> > > > > System Benchmarks Index Values               BASELINE       RESULT    INDEX
> > > > > Dhrystone 2 using register variables         116700.0  628613745.4  53865.8
> > > > > Double-Precision Whetstone                       55.0     182422.8  33167.8
> > > > > Execl Throughput                                 43.0      13116.6   3050.4
> > > > > File Copy 1024 bufsize 2000 maxblocks          3960.0    7762306.2  19601.8
> > > > > File Copy 256 bufsize 500 maxblocks            1655.0    3417556.8  20649.9
> > > > > File Copy 4096 bufsize 8000 maxblocks          5800.0    7427995.7  12806.9
> > > > > Pipe Throughput                               12440.0   23058600.5  18535.9
> > > > > Pipe-based Context Switching                   4000.0    2835617.7   7089.0
> > > > > Process Creation                                126.0      12537.3    995.0
> > > > > Shell Scripts (1 concurrent)                     42.4      57057.4  13456.9
> > > > > Shell Scripts (8 concurrent)                      6.0       7367.1  12278.5
> > > > > System Call Overhead                          15000.0   33308301.3  22205.5
> > > > >
> > > > > System Benchmarks Index Score                                       12426.1
> > > > >
> > > > >   ticket_spinlock:
> > > > > System Benchmarks Index Values               BASELINE       RESULT    INDEX
> > > > > Dhrystone 2 using register variables         116700.0  626541701.9  53688.2
> > > > > Double-Precision Whetstone                       55.0     181921.0  33076.5
> > > > > Execl Throughput                                 43.0      12625.1   2936.1
> > > > > File Copy 1024 bufsize 2000 maxblocks          3960.0    6553792.9  16550.0
> > > > > File Copy 256 bufsize 500 maxblocks            1655.0    3189231.6  19270.3
> > > > > File Copy 4096 bufsize 8000 maxblocks          5800.0    7221277.0  12450.5
> > > > > Pipe Throughput                               12440.0   20594018.7  16554.7
> > > > > Pipe-based Context Switching                   4000.0    2571117.7   6427.8
> > > > > Process Creation                                126.0      10798.4    857.0
> > > > > Shell Scripts (1 concurrent)                     42.4      57227.5  13497.1
> > > > > Shell Scripts (8 concurrent)                      6.0       7329.2  12215.3
> > > > > System Call Overhead                          15000.0   30766778.4  20511.2
> > > > >
> > > > > System Benchmarks Index Score
> > > > 
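
The xchg_tail() mentioned above only swaps the 16-bit tail field of the
32-bit qspinlock word; on RISC-V that is done by a masked LR/SC loop (the
__arch_xchg_masked macro touched in patch 03). As a rough, self-contained
illustration of the same idea in portable C, using a compare-and-swap retry
loop in place of LR/SC and assuming, for the example only, that the tail
lives in the upper 16 bits:

/*
 * Userspace sketch, not kernel code: exchange a 16-bit field embedded in a
 * 32-bit word by retrying on the whole word, which is what a masked LR/SC
 * loop does when the hardware has no 16-bit atomic exchange.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static uint16_t xchg_tail16(_Atomic uint32_t *lock, uint16_t new_tail)
{
	uint32_t old = atomic_load_explicit(lock, memory_order_relaxed);
	uint32_t new;

	do {
		/* keep the low half, replace the tail half */
		new = (old & 0x0000ffffu) | ((uint32_t)new_tail << 16);
	} while (!atomic_compare_exchange_weak_explicit(lock, &old, new,
							memory_order_acq_rel,
							memory_order_relaxed));
	return (uint16_t)(old >> 16);
}

int main(void)
{
	_Atomic uint32_t lock = 0x00010000u;	/* tail = 1, rest clear */
	uint16_t prev = xchg_tail16(&lock, 2);

	printf("previous tail %u, lock word now 0x%08x\n",
	       prev, (unsigned)atomic_load(&lock));
	return 0;
}

The forward-progress guarantee discussed in the commit message is about
whether this retry loop is guaranteed to finish under contention.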

[RFC PATCH 00/14] Prevent cross-cache attacks in the SLUB allocator

2023-09-15 Thread Matteo Rizzo
The goal of this patch series is to deterministically prevent cross-cache
attacks in the SLUB allocator.

Use-after-free bugs are normally exploited by making the memory allocator
reuse the victim object's memory for an object with a different type. This
creates a type confusion which is a very powerful attack primitive.

There are generally two ways to create such type confusions in the kernel:
one way is to make SLUB reuse the freed object's address for another object
of a different type which lives in the same slab cache. This only works in
slab caches that can contain objects of different types (i.e. the kmalloc
caches) and the attacker is limited to objects that belong to the same size
class as the victim object.

The other way is to use a "cross-cache attack": make SLUB return the page
containing the victim object to the page allocator and then make it use the
same page for a different slab cache or other objects that contain
attacker-controlled data. This gives attackers access to all objects rather
than just the ones in the same size class as the target and lets attackers
target objects allocated from dedicated caches such as struct file.

This patch series prevents cross-cache attacks by making sure that once a
virtual address is used for a slab cache, it is never reused for anything
except other slabs in that same cache.


Jann Horn (13):
  mm/slub: add is_slab_addr/is_slab_page helpers
  mm/slub: move kmem_cache_order_objects to slab.h
  mm: use virt_to_slab instead of folio_slab
  mm/slub: create folio_set/clear_slab helpers
  mm/slub: pass additional args to alloc_slab_page
  mm/slub: pass slab pointer to the freeptr decode helper
  security: introduce CONFIG_SLAB_VIRTUAL
  mm/slub: add the slab freelists to kmem_cache
  x86: Create virtual memory region for SLUB
  mm/slub: allocate slabs from virtual memory
  mm/slub: introduce the deallocated_pages sysfs attribute
  mm/slub: sanity-check freepointers
  security: add documentation for SLAB_VIRTUAL

Matteo Rizzo (1):
  mm/slub: don't try to dereference invalid freepointers

 Documentation/arch/x86/x86_64/mm.rst   |   4 +-
 Documentation/security/self-protection.rst | 102 
 arch/x86/include/asm/page_64.h |  10 +
 arch/x86/include/asm/pgtable_64_types.h|  21 +
 arch/x86/mm/init_64.c  |  19 +-
 arch/x86/mm/kaslr.c|   9 +
 arch/x86/mm/mm_internal.h  |   4 +
 arch/x86/mm/physaddr.c |  10 +
 include/linux/slab.h   |   8 +
 include/linux/slub_def.h   |  25 +-
 init/main.c|   1 +
 kernel/resource.c  |   2 +-
 lib/slub_kunit.c   |   4 +
 mm/memcontrol.c|   2 +-
 mm/slab.h  | 145 +
 mm/slab_common.c   |  21 +-
 mm/slub.c  | 641 +++--
 mm/usercopy.c  |  12 +-
 security/Kconfig.hardening |  16 +
 19 files changed, 977 insertions(+), 79 deletions(-)


base-commit: 46a9ea6681907a3be6b6b0d43776dccc62cad6cf
-- 
2.42.0.459.ge4e396fd5e-goog
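
As a userspace analogy of the core idea (not the kernel implementation, and
with every constant invented for the example): a reservation-style allocator
can keep a range of virtual addresses bound to one "cache" forever while
still returning the physical pages behind it to the OS, which is what the
series does for SLUB and what the PartitionAlloc design referenced in patch
14 does in userspace:

/*
 * Sketch: reserve virtual space up front, attach and detach physical
 * backing on demand, never hand the addresses to anyone else.
 */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define REGION_SZ (1UL << 20)	/* 1 MiB virtual reservation per "cache" */

int main(void)
{
	/* reserve virtual addresses only, no physical backing yet */
	char *region = mmap(NULL, REGION_SZ, PROT_NONE,
			    MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
	if (region == MAP_FAILED)
		return 1;

	/* "grow the cache": back the first page and use it */
	mprotect(region, 4096, PROT_READ | PROT_WRITE);
	memset(region, 0xaa, 4096);

	/*
	 * "shrink the cache": drop the physical page but keep the virtual
	 * range, so the addresses can never be reused for anything else.
	 */
	madvise(region, 4096, MADV_DONTNEED);
	mprotect(region, 4096, PROT_NONE);

	printf("virtual range %p stays reserved for this cache\n", region);
	return 0;
}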



[RFC PATCH 01/14] mm/slub: don't try to dereference invalid freepointers

2023-09-15 Thread Matteo Rizzo
slab_free_freelist_hook tries to read a freelist pointer from the
current object even when freeing a single object. This is invalid
because single objects don't actually contain a freelist pointer when
they're freed, and that memory holds other data instead. This causes
problems when checking the integrity of freelist pointers in
get_freepointer.

Signed-off-by: Matteo Rizzo 
---
 mm/slub.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/slub.c b/mm/slub.c
index f7940048138c..a7dae207c2d2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1820,7 +1820,9 @@ static inline bool slab_free_freelist_hook(struct 
kmem_cache *s,
 
do {
object = next;
-   next = get_freepointer(s, object);
+   /* Single objects don't actually contain a freepointer */
+   if (object != old_tail)
+   next = get_freepointer(s, object);
 
/* If object's reuse doesn't have to be delayed */
if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
-- 
2.42.0.459.ge4e396fd5e-goog



[RFC PATCH 02/14] mm/slub: add is_slab_addr/is_slab_page helpers

2023-09-15 Thread Matteo Rizzo
From: Jann Horn 

This is refactoring in preparation for adding two different
implementations (for SLAB_VIRTUAL enabled and disabled).

virt_to_folio(x) expands to _compound_head(virt_to_page(x)) and
virt_to_head_page(x) also expands to _compound_head(virt_to_page(x))

so PageSlab(virt_to_head_page(res)) should be equivalent to
is_slab_addr(res).

Signed-off-by: Jann Horn 
Co-developed-by: Matteo Rizzo 
Signed-off-by: Matteo Rizzo 
---
 include/linux/slab.h | 1 +
 kernel/resource.c| 2 +-
 mm/slab.h| 9 +
 mm/slab_common.c | 5 ++---
 mm/slub.c| 6 +++---
 5 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 8228d1276a2f..a2d82010d269 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -793,4 +793,5 @@ int slab_dead_cpu(unsigned int cpu);
 #define slab_dead_cpu  NULL
 #endif
 
+#define is_slab_addr(addr) folio_test_slab(virt_to_folio(addr))
 #endif /* _LINUX_SLAB_H */
diff --git a/kernel/resource.c b/kernel/resource.c
index b1763b2fd7ef..c829e5f97292 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -158,7 +158,7 @@ static void free_resource(struct resource *res)
 * buddy and trying to be smart and reusing them eventually in
 * alloc_resource() overcomplicates resource handling.
 */
-   if (res && PageSlab(virt_to_head_page(res)))
+   if (res && is_slab_addr(res))
kfree(res);
 }
 
diff --git a/mm/slab.h b/mm/slab.h
index 799a315695c6..25e41dd6087e 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -169,6 +169,15 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), 
sizeof(freelist_aba_t)
  */
 #define slab_page(s) folio_page(slab_folio(s), 0)
 
+/**
+ * is_slab_page - Checks if a page is really a slab page
+ * @s: The slab
+ *
+ * Checks if s points to a slab page.
+ *
+ * Return: true if s points to a slab and false otherwise.
+ */
+#define is_slab_page(s) folio_test_slab(slab_folio(s))
 /*
  * If network-based swap is enabled, sl*b must keep track of whether pages
  * were allocated from pfmemalloc reserves.
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e99e821065c3..79102d24f099 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1063,7 +1063,7 @@ void kfree(const void *object)
return;
 
folio = virt_to_folio(object);
-   if (unlikely(!folio_test_slab(folio))) {
+   if (unlikely(!is_slab_addr(object))) {
free_large_kmalloc(folio, (void *)object);
return;
}
@@ -1094,8 +1094,7 @@ size_t __ksize(const void *object)
return 0;
 
folio = virt_to_folio(object);
-
-   if (unlikely(!folio_test_slab(folio))) {
+   if (unlikely(!is_slab_addr(object))) {
if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE))
return 0;
if (WARN_ON(object != folio_address(folio)))
diff --git a/mm/slub.c b/mm/slub.c
index a7dae207c2d2..b69916ab7aa8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1259,7 +1259,7 @@ static int check_slab(struct kmem_cache *s, struct slab 
*slab)
 {
int maxobj;
 
-   if (!folio_test_slab(slab_folio(slab))) {
+   if (!is_slab_page(slab)) {
slab_err(s, slab, "Not a valid slab page");
return 0;
}
@@ -1454,7 +1454,7 @@ static noinline bool alloc_debug_processing(struct 
kmem_cache *s,
return true;
 
 bad:
-   if (folio_test_slab(slab_folio(slab))) {
+   if (is_slab_page(slab)) {
/*
 * If this is a slab page then lets do the best we can
 * to avoid issues in the future. Marking all objects
@@ -1484,7 +1484,7 @@ static inline int free_consistency_checks(struct 
kmem_cache *s,
return 0;
 
if (unlikely(s != slab->slab_cache)) {
-   if (!folio_test_slab(slab_folio(slab))) {
+   if (!is_slab_page(slab)) {
slab_err(s, slab, "Attempt to free object(0x%p) outside 
of slab",
 object);
} else if (!slab->slab_cache) {
-- 
2.42.0.459.ge4e396fd5e-goog



[RFC PATCH 03/14] mm/slub: move kmem_cache_order_objects to slab.h

2023-09-15 Thread Matteo Rizzo
From: Jann Horn 

This is refactoring for SLAB_VIRTUAL. The implementation needs to know
the order of the virtual memory region allocated to each slab to know
how much physical memory to allocate when the slab is reused. We reuse
kmem_cache_order_objects for this, so we have to move it before struct
slab.

Signed-off-by: Jann Horn 
Co-developed-by: Matteo Rizzo 
Signed-off-by: Matteo Rizzo 
---
 include/linux/slub_def.h |  9 -
 mm/slab.h| 22 ++
 mm/slub.c| 12 
 3 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index deb90cf4bffb..0adf5ba8241b 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -83,15 +83,6 @@ struct kmem_cache_cpu {
 #define slub_percpu_partial_read_once(c)   NULL
 #endif // CONFIG_SLUB_CPU_PARTIAL
 
-/*
- * Word size structure that can be atomically updated or read and that
- * contains both the order and the number of objects that a slab of the
- * given order would contain.
- */
-struct kmem_cache_order_objects {
-   unsigned int x;
-};
-
 /*
  * Slab cache management.
  */
diff --git a/mm/slab.h b/mm/slab.h
index 25e41dd6087e..3fe0d1e26e26 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -38,6 +38,15 @@ typedef union {
freelist_full_t full;
 } freelist_aba_t;
 
+/*
+ * Word size structure that can be atomically updated or read and that
+ * contains both the order and the number of objects that a slab of the
+ * given order would contain.
+ */
+struct kmem_cache_order_objects {
+   unsigned int x;
+};
+
 /* Reuses the bits in struct page */
 struct slab {
unsigned long __page_flags;
@@ -227,6 +236,19 @@ static inline struct slab *virt_to_slab(const void *addr)
return folio_slab(folio);
 }
 
+#define OO_SHIFT   16
+#define OO_MASK((1 << OO_SHIFT) - 1)
+
+static inline unsigned int oo_order(struct kmem_cache_order_objects x)
+{
+   return x.x >> OO_SHIFT;
+}
+
+static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
+{
+   return x.x & OO_MASK;
+}
+
 static inline int slab_order(const struct slab *slab)
 {
return folio_order((struct folio *)slab_folio(slab));
diff --git a/mm/slub.c b/mm/slub.c
index b69916ab7aa8..df2529c03bd3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -284,8 +284,6 @@ static inline bool kmem_cache_has_cpu_partial(struct 
kmem_cache *s)
  */
 #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
 
-#define OO_SHIFT   16
-#define OO_MASK((1 << OO_SHIFT) - 1)
 #define MAX_OBJS_PER_PAGE  32767 /* since slab.objects is u15 */
 
 /* Internal SLUB flags */
@@ -473,16 +471,6 @@ static inline struct kmem_cache_order_objects 
oo_make(unsigned int order,
return x;
 }
 
-static inline unsigned int oo_order(struct kmem_cache_order_objects x)
-{
-   return x.x >> OO_SHIFT;
-}
-
-static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
-{
-   return x.x & OO_MASK;
-}
-
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
 {
-- 
2.42.0.459.ge4e396fd5e-goog
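
Since the helpers are only moved, not changed, a standalone sketch of the
encoding they implement may still be useful: the page order lives above
OO_SHIFT and the object count in the low bits of a single word. oo_make()
below is a simplified stand-in that takes the object count directly instead
of deriving it from the object size as the kernel does:

/* Userspace illustration of the kmem_cache_order_objects packing. */
#include <stdio.h>

#define OO_SHIFT 16
#define OO_MASK  ((1U << OO_SHIFT) - 1)

struct kmem_cache_order_objects {
	unsigned int x;
};

static struct kmem_cache_order_objects oo_make(unsigned int order,
					       unsigned int objects)
{
	struct kmem_cache_order_objects oo = {
		.x = (order << OO_SHIFT) | objects,
	};

	return oo;
}

static unsigned int oo_order(struct kmem_cache_order_objects x)
{
	return x.x >> OO_SHIFT;
}

static unsigned int oo_objects(struct kmem_cache_order_objects x)
{
	return x.x & OO_MASK;
}

int main(void)
{
	/* e.g. an order-1 slab (two 4 KiB pages) holding 128 64-byte objects */
	struct kmem_cache_order_objects oo = oo_make(1, 128);

	printf("order=%u objects=%u\n", oo_order(oo), oo_objects(oo));
	return 0;
}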



[RFC PATCH 04/14] mm: use virt_to_slab instead of folio_slab

2023-09-15 Thread Matteo Rizzo
From: Jann Horn 

This is refactoring in preparation for the introduction of SLAB_VIRTUAL
which does not implement folio_slab.

With SLAB_VIRTUAL there is no longer a 1:1 correspondence between slabs
and pages of physical memory used by the slab allocator. There is no way
to look up the slab which corresponds to a specific page of physical
memory without iterating over all slabs or over the page tables. Instead
of doing that, we can look up the slab starting from its virtual address
which can still be performed cheaply with both SLAB_VIRTUAL enabled and
disabled.

Signed-off-by: Jann Horn 
Co-developed-by: Matteo Rizzo 
Signed-off-by: Matteo Rizzo 
---
 mm/memcontrol.c  |  2 +-
 mm/slab_common.c | 12 +++-
 mm/slub.c| 14 ++
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e8ca4bdcb03c..0ab9f5323db7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2936,7 +2936,7 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio 
*folio, void *p)
struct slab *slab;
unsigned int off;
 
-   slab = folio_slab(folio);
+   slab = virt_to_slab(p);
objcgs = slab_objcgs(slab);
if (!objcgs)
return NULL;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 79102d24f099..42ceaf7e9f47 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1062,13 +1062,13 @@ void kfree(const void *object)
if (unlikely(ZERO_OR_NULL_PTR(object)))
return;
 
-   folio = virt_to_folio(object);
if (unlikely(!is_slab_addr(object))) {
+   folio = virt_to_folio(object);
free_large_kmalloc(folio, (void *)object);
return;
}
 
-   slab = folio_slab(folio);
+   slab = virt_to_slab(object);
s = slab->slab_cache;
__kmem_cache_free(s, (void *)object, _RET_IP_);
 }
@@ -1089,12 +1089,13 @@ EXPORT_SYMBOL(kfree);
 size_t __ksize(const void *object)
 {
struct folio *folio;
+   struct kmem_cache *s;
 
if (unlikely(object == ZERO_SIZE_PTR))
return 0;
 
-   folio = virt_to_folio(object);
if (unlikely(!is_slab_addr(object))) {
+   folio = virt_to_folio(object);
if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE))
return 0;
if (WARN_ON(object != folio_address(folio)))
@@ -1102,11 +1103,12 @@ size_t __ksize(const void *object)
return folio_size(folio);
}
 
+   s = virt_to_slab(object)->slab_cache;
 #ifdef CONFIG_SLUB_DEBUG
-   skip_orig_size_check(folio_slab(folio)->slab_cache, object);
+   skip_orig_size_check(s, object);
 #endif
 
-   return slab_ksize(folio_slab(folio)->slab_cache);
+   return slab_ksize(s);
 }
 
 void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
diff --git a/mm/slub.c b/mm/slub.c
index df2529c03bd3..ad33d9e1601d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3848,25 +3848,23 @@ int build_detached_freelist(struct kmem_cache *s, 
size_t size,
 {
int lookahead = 3;
void *object;
-   struct folio *folio;
+   struct slab *slab;
size_t same;
 
object = p[--size];
-   folio = virt_to_folio(object);
+   slab = virt_to_slab(object);
if (!s) {
/* Handle kalloc'ed objects */
-   if (unlikely(!folio_test_slab(folio))) {
-   free_large_kmalloc(folio, object);
+   if (unlikely(slab == NULL)) {
+   free_large_kmalloc(virt_to_folio(object), object);
df->slab = NULL;
return size;
}
-   /* Derive kmem_cache from object */
-   df->slab = folio_slab(folio);
-   df->s = df->slab->slab_cache;
+   df->s = slab->slab_cache;
} else {
-   df->slab = folio_slab(folio);
df->s = cache_from_obj(s, object); /* Support for memcg */
}
+   df->slab = slab;
 
/* Start new detached freelist */
df->tail = object;
-- 
2.42.0.459.ge4e396fd5e-goog



[RFC PATCH 05/14] mm/slub: create folio_set/clear_slab helpers

2023-09-15 Thread Matteo Rizzo
From: Jann Horn 

This is refactoring in preparation for SLAB_VIRTUAL. Extract this code
into separate functions so that it's not duplicated in the code that
allocates and frees pages with SLAB_VIRTUAL enabled.

Signed-off-by: Jann Horn 
Co-developed-by: Matteo Rizzo 
Signed-off-by: Matteo Rizzo 
---
 mm/slub.c | 32 ++--
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index ad33d9e1601d..9b87afade125 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1849,6 +1849,26 @@ static void *setup_object(struct kmem_cache *s, void 
*object)
 /*
  * Slab allocation and freeing
  */
+
+static void folio_set_slab(struct folio *folio, struct slab *slab)
+{
+   __folio_set_slab(folio);
+   /* Make the flag visible before any changes to folio->mapping */
+   smp_wmb();
+
+   if (folio_is_pfmemalloc(folio))
+   slab_set_pfmemalloc(slab);
+}
+
+static void folio_clear_slab(struct folio *folio, struct slab *slab)
+{
+   __slab_clear_pfmemalloc(slab);
+   folio->mapping = NULL;
+   /* Make the mapping reset visible before clearing the flag */
+   smp_wmb();
+   __folio_clear_slab(folio);
+}
+
 static inline struct slab *alloc_slab_page(gfp_t flags, int node,
struct kmem_cache_order_objects oo)
 {
@@ -1865,11 +1885,7 @@ static inline struct slab *alloc_slab_page(gfp_t flags, 
int node,
return NULL;
 
slab = folio_slab(folio);
-   __folio_set_slab(folio);
-   /* Make the flag visible before any changes to folio->mapping */
-   smp_wmb();
-   if (folio_is_pfmemalloc(folio))
-   slab_set_pfmemalloc(slab);
+   folio_set_slab(folio, slab);
 
return slab;
 }
@@ -2067,11 +2083,7 @@ static void __free_slab(struct kmem_cache *s, struct 
slab *slab)
int order = folio_order(folio);
int pages = 1 << order;
 
-   __slab_clear_pfmemalloc(slab);
-   folio->mapping = NULL;
-   /* Make the mapping reset visible before clearing the flag */
-   smp_wmb();
-   __folio_clear_slab(folio);
+   folio_clear_slab(folio, slab);
mm_account_reclaimed_pages(pages);
unaccount_slab(slab, order, s);
__free_pages(&folio->page, order);
-- 
2.42.0.459.ge4e396fd5e-goog



[RFC PATCH 06/14] mm/slub: pass additional args to alloc_slab_page

2023-09-15 Thread Matteo Rizzo
From: Jann Horn 

This is refactoring in preparation for SLAB_VIRTUAL.

The implementation of SLAB_VIRTUAL needs access to struct kmem_cache in
alloc_slab_page in order to take unused slabs from the slab freelist,
which is per-cache.

In addition to that it passes two different sets of GFP flags.
meta_gfp_flags is used for the memory backing the metadata region and
page tables, and gfp_flags for the data memory.

Signed-off-by: Jann Horn 
Co-developed-by: Matteo Rizzo 
Signed-off-by: Matteo Rizzo 
---
 mm/slub.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 9b87afade125..eaa1256aff89 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1869,7 +1869,8 @@ static void folio_clear_slab(struct folio *folio, struct 
slab *slab)
__folio_clear_slab(folio);
 }
 
-static inline struct slab *alloc_slab_page(gfp_t flags, int node,
+static inline struct slab *alloc_slab_page(struct kmem_cache *s,
+   gfp_t meta_flags, gfp_t flags, int node,
struct kmem_cache_order_objects oo)
 {
struct folio *folio;
@@ -2020,7 +2021,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, 
gfp_t flags, int node)
if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > 
oo_order(s->min))
alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;
 
-   slab = alloc_slab_page(alloc_gfp, node, oo);
+   slab = alloc_slab_page(s, flags, alloc_gfp, node, oo);
if (unlikely(!slab)) {
oo = s->min;
alloc_gfp = flags;
@@ -2028,7 +2029,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, 
gfp_t flags, int node)
 * Allocation may have failed due to fragmentation.
 * Try a lower order alloc if possible
 */
-   slab = alloc_slab_page(alloc_gfp, node, oo);
+   slab = alloc_slab_page(s, flags, alloc_gfp, node, oo);
if (unlikely(!slab))
return NULL;
stat(s, ORDER_FALLBACK);
-- 
2.42.0.459.ge4e396fd5e-goog



[RFC PATCH 07/14] mm/slub: pass slab pointer to the freeptr decode helper

2023-09-15 Thread Matteo Rizzo
From: Jann Horn 

This is refactoring in preparation for checking freeptrs for corruption
inside freelist_ptr_decode().

Signed-off-by: Jann Horn 
Co-developed-by: Matteo Rizzo 
Signed-off-by: Matteo Rizzo 
---
 mm/slub.c | 43 +++
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index eaa1256aff89..42e7cc0b4452 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -383,7 +383,8 @@ static inline freeptr_t freelist_ptr_encode(const struct 
kmem_cache *s,
 }
 
 static inline void *freelist_ptr_decode(const struct kmem_cache *s,
-   freeptr_t ptr, unsigned long ptr_addr)
+   freeptr_t ptr, unsigned long ptr_addr,
+   struct slab *slab)
 {
void *decoded;
 
@@ -395,7 +396,8 @@ static inline void *freelist_ptr_decode(const struct 
kmem_cache *s,
return decoded;
 }
 
-static inline void *get_freepointer(struct kmem_cache *s, void *object)
+static inline void *get_freepointer(struct kmem_cache *s, void *object,
+   struct slab *slab)
 {
unsigned long ptr_addr;
freeptr_t p;
@@ -403,7 +405,7 @@ static inline void *get_freepointer(struct kmem_cache *s, 
void *object)
object = kasan_reset_tag(object);
ptr_addr = (unsigned long)object + s->offset;
p = *(freeptr_t *)(ptr_addr);
-   return freelist_ptr_decode(s, p, ptr_addr);
+   return freelist_ptr_decode(s, p, ptr_addr, slab);
 }
 
 #ifndef CONFIG_SLUB_TINY
@@ -424,18 +426,19 @@ static void prefetch_freepointer(const struct kmem_cache 
*s, void *object)
  * get_freepointer_safe() returns initialized memory.
  */
 __no_kmsan_checks
-static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
+static inline void *get_freepointer_safe(struct kmem_cache *s, void *object,
+struct slab *slab)
 {
unsigned long freepointer_addr;
freeptr_t p;
 
if (!debug_pagealloc_enabled_static())
-   return get_freepointer(s, object);
+   return get_freepointer(s, object, slab);
 
object = kasan_reset_tag(object);
freepointer_addr = (unsigned long)object + s->offset;
copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p));
-   return freelist_ptr_decode(s, p, freepointer_addr);
+   return freelist_ptr_decode(s, p, freepointer_addr, slab);
 }
 
 static inline void set_freepointer(struct kmem_cache *s, void *object, void 
*fp)
@@ -627,7 +630,7 @@ static void __fill_map(unsigned long *obj_map, struct 
kmem_cache *s,
 
bitmap_zero(obj_map, slab->objects);
 
-   for (p = slab->freelist; p; p = get_freepointer(s, p))
+   for (p = slab->freelist; p; p = get_freepointer(s, p, slab))
set_bit(__obj_to_index(s, addr, p), obj_map);
 }
 
@@ -937,7 +940,7 @@ static void print_trailer(struct kmem_cache *s, struct slab 
*slab, u8 *p)
print_slab_info(slab);
 
pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
-  p, p - addr, get_freepointer(s, p));
+  p, p - addr, get_freepointer(s, p, slab));
 
if (s->flags & SLAB_RED_ZONE)
print_section(KERN_ERR, "Redzone  ", p - s->red_left_pad,
@@ -1230,7 +1233,7 @@ static int check_object(struct kmem_cache *s, struct slab 
*slab,
return 1;
 
/* Check free pointer validity */
-   if (!check_valid_pointer(s, slab, get_freepointer(s, p))) {
+   if (!check_valid_pointer(s, slab, get_freepointer(s, p, slab))) {
object_err(s, slab, p, "Freepointer corrupt");
/*
 * No choice but to zap it and thus lose the remainder
@@ -1298,7 +1301,7 @@ static int on_freelist(struct kmem_cache *s, struct slab 
*slab, void *search)
break;
}
object = fp;
-   fp = get_freepointer(s, object);
+   fp = get_freepointer(s, object, slab);
nr++;
}
 
@@ -1810,7 +1813,7 @@ static inline bool slab_free_freelist_hook(struct 
kmem_cache *s,
object = next;
/* Single objects don't actually contain a freepointer */
if (object != old_tail)
-   next = get_freepointer(s, object);
+   next = get_freepointer(s, object, virt_to_slab(object));
 
/* If object's reuse doesn't have to be delayed */
if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
@@ -2161,7 +2164,7 @@ static void *alloc_single_from_partial(struct kmem_cache 
*s,
lockdep_assert_held(&n->list_lock);
 
object = slab->freelist;
-   slab->freelist = get_freepointer(s, object);
+   slab->freelist = get_freepointer(s, object, slab);
slab->inuse++;
 
if (!alloc_debug_processing(s, slab, object, orig_size)
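
(The hunk above is truncated by the archive.) For context on what
freelist_ptr_decode() actually undoes, the extra slab argument added in this
patch being only plumbing for the corruption check introduced later in the
series, here is a userspace sketch of the XOR-plus-byteswap obfuscation SLUB
applies when CONFIG_SLAB_FREELIST_HARDENED is enabled. The constants are
made up for the example:

/*
 * Sketch of hardened freepointer encoding: stored = ptr ^ secret ^
 * byteswap(address where the pointer is stored). XOR is its own inverse,
 * so decoding mirrors encoding.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t encode(uint64_t ptr, uint64_t secret, uint64_t ptr_addr)
{
	return ptr ^ secret ^ __builtin_bswap64(ptr_addr);
}

static uint64_t decode(uint64_t stored, uint64_t secret, uint64_t ptr_addr)
{
	return stored ^ secret ^ __builtin_bswap64(ptr_addr);
}

int main(void)
{
	uint64_t secret = 0x2545f4914f6cdd1dULL;	/* per-cache random key  */
	uint64_t obj    = 0xffff888012345000ULL;	/* current object        */
	uint64_t next   = 0xffff888012345400ULL;	/* next free object      */
	uint64_t stored = encode(next, secret, obj);	/* assume s->offset == 0 */

	printf("stored  0x%016llx\n", (unsigned long long)stored);
	printf("decoded 0x%016llx\n",
	       (unsigned long long)decode(stored, secret, obj));
	return 0;
}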

[RFC PATCH 08/14] security: introduce CONFIG_SLAB_VIRTUAL

2023-09-15 Thread Matteo Rizzo
From: Jann Horn 

SLAB_VIRTUAL is a mitigation for the SLUB allocator which prevents reuse
of virtual addresses across different slab caches and therefore makes
some types of use-after-free bugs unexploitable.

SLAB_VIRTUAL is incompatible with KASAN and we believe it's not worth
adding support for it. This is because SLAB_VIRTUAL and KASAN are aimed
at two different use cases: KASAN is meant for catching bugs as early as
possible in debug/fuzz/testing builds, and it's not meant to be used in
production. SLAB_VIRTUAL on the other hand is an exploit mitigation that
doesn't attempt to highlight bugs but instead tries to make them
unexploitable. It doesn't make sense to enable it in debugging builds or
during fuzzing, and instead we expect that it will be enabled in
production kernels.

SLAB_VIRTUAL is not currently compatible with KFENCE, removing this
limitation is future work.

Signed-off-by: Jann Horn 
Co-developed-by: Matteo Rizzo 
Signed-off-by: Matteo Rizzo 
---
 security/Kconfig.hardening | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening
index 0f295961e773..9f4e6e38aa76 100644
--- a/security/Kconfig.hardening
+++ b/security/Kconfig.hardening
@@ -355,4 +355,18 @@ config GCC_PLUGIN_RANDSTRUCT
   * https://grsecurity.net/
   * https://pax.grsecurity.net/
 
+config SLAB_VIRTUAL
+   bool "Allocate slab objects from virtual memory"
+   depends on SLUB && !SLUB_TINY
+   # If KFENCE support is desired, it could be implemented on top of our
+   # virtual memory allocation facilities
+   depends on !KFENCE
+   # ASAN support will require that shadow memory is allocated
+   # appropriately.
+   depends on !KASAN
+   help
+ Allocate slab objects from kernel-virtual memory, and ensure that
+ virtual memory used as a slab cache is never reused to store
+ objects from other slab caches or non-slab data.
+
 endmenu
-- 
2.42.0.459.ge4e396fd5e-goog



[RFC PATCH 09/14] mm/slub: add the slab freelists to kmem_cache

2023-09-15 Thread Matteo Rizzo
From: Jann Horn 

With SLAB_VIRTUAL enabled, unused slabs which still have virtual memory
allocated to them but no physical memory are kept in a per-cache list so
that they can be reused later if the cache needs to grow again.

Signed-off-by: Jann Horn 
Co-developed-by: Matteo Rizzo 
Signed-off-by: Matteo Rizzo 
---
 include/linux/slub_def.h | 16 
 mm/slub.c| 23 +++
 2 files changed, 39 insertions(+)

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 0adf5ba8241b..693e9bb34edc 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -86,6 +86,20 @@ struct kmem_cache_cpu {
 /*
  * Slab cache management.
  */
+struct kmem_cache_virtual {
+#ifdef CONFIG_SLAB_VIRTUAL
+   /* Protects freed_slabs and freed_slabs_min */
+   spinlock_t freed_slabs_lock;
+   /*
+* Slabs on this list have virtual memory of size oo allocated to them
+* but no physical memory
+*/
+   struct list_head freed_slabs;
+   /* Same as freed_slabs but with memory of size min */
+   struct list_head freed_slabs_min;
+#endif
+};
+
 struct kmem_cache {
 #ifndef CONFIG_SLUB_TINY
struct kmem_cache_cpu __percpu *cpu_slab;
@@ -107,6 +121,8 @@ struct kmem_cache {
 
/* Allocation and freeing of slabs */
struct kmem_cache_order_objects min;
+   struct kmem_cache_virtual virtual;
+
gfp_t allocflags;   /* gfp flags to use on each alloc */
int refcount;   /* Refcount for slab cache destroy */
void (*ctor)(void *);
diff --git a/mm/slub.c b/mm/slub.c
index 42e7cc0b4452..4f77e5d4fe6c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4510,8 +4510,20 @@ static int calculate_sizes(struct kmem_cache *s)
return !!oo_objects(s->oo);
 }
 
+static inline void slab_virtual_open(struct kmem_cache *s)
+{
+#ifdef CONFIG_SLAB_VIRTUAL
+   /* WARNING: this stuff will be relocated in bootstrap()! */
+   spin_lock_init(&s->virtual.freed_slabs_lock);
+   INIT_LIST_HEAD(&s->virtual.freed_slabs);
+   INIT_LIST_HEAD(&s->virtual.freed_slabs_min);
+#endif
+}
+
 static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
 {
+   slab_virtual_open(s);
+
s->flags = kmem_cache_flags(s->size, flags, s->name);
 #ifdef CONFIG_SLAB_FREELIST_HARDENED
s->random = get_random_long();
@@ -4994,6 +5006,16 @@ static int slab_memory_callback(struct notifier_block 
*self,
  * that may be pointing to the wrong kmem_cache structure.
  */
 
+static inline void slab_virtual_bootstrap(struct kmem_cache *s, struct 
kmem_cache *static_cache)
+{
+   slab_virtual_open(s);
+
+#ifdef CONFIG_SLAB_VIRTUAL
+   list_splice(&static_cache->virtual.freed_slabs, 
&s->virtual.freed_slabs);
+   list_splice(&static_cache->virtual.freed_slabs_min, 
&s->virtual.freed_slabs_min);
+#endif
+}
+
 static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
 {
int node;
@@ -5001,6 +5023,7 @@ static struct kmem_cache * __init bootstrap(struct 
kmem_cache *static_cache)
struct kmem_cache_node *n;
 
memcpy(s, static_cache, kmem_cache->object_size);
+   slab_virtual_bootstrap(s, static_cache);
 
/*
 * This runs very early, and only the boot processor is supposed to be
-- 
2.42.0.459.ge4e396fd5e-goog



[RFC PATCH 10/14] x86: Create virtual memory region for SLUB

2023-09-15 Thread Matteo Rizzo
From: Jann Horn 

SLAB_VIRTUAL reserves 512 GiB of virtual memory and uses it for both
the struct slab metadata and the actual slab memory. The pointers returned
by kmem_cache_alloc will point into this range of memory.

Signed-off-by: Jann Horn 
Co-developed-by: Matteo Rizzo 
Signed-off-by: Matteo Rizzo 
---
 Documentation/arch/x86/x86_64/mm.rst|  4 ++--
 arch/x86/include/asm/pgtable_64_types.h | 16 
 arch/x86/mm/init_64.c   | 19 +++
 arch/x86/mm/kaslr.c |  9 +
 arch/x86/mm/mm_internal.h   |  4 
 mm/slub.c   |  4 
 security/Kconfig.hardening  |  2 ++
 7 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/Documentation/arch/x86/x86_64/mm.rst 
b/Documentation/arch/x86/x86_64/mm.rst
index 35e5e18c83d0..121179537175 100644
--- a/Documentation/arch/x86/x86_64/mm.rst
+++ b/Documentation/arch/x86/x86_64/mm.rst
@@ -57,7 +57,7 @@ Complete virtual memory map with 4-level page tables
fc00 |   -4TB | fdff |2 TB | ... unused hole
 ||  | | vaddr_end for 
KASLR
fe00 |   -2TB | fe7f |  0.5 TB | cpu_entry_area 
mapping
-   fe80 |   -1.5  TB | feff |  0.5 TB | ... unused hole
+   fe80 |   -1.5  TB | feff |  0.5 TB | SLUB virtual 
memory
ff00 |   -1TB | ff7f |  0.5 TB | %esp fixup 
stacks
ff80 | -512GB | ffee |  444 GB | ... unused hole
ffef |  -68GB | fffe |   64 GB | EFI region 
mapping space
@@ -116,7 +116,7 @@ Complete virtual memory map with 5-level page tables
fc00 |   -4TB | fdff |2 TB | ... unused hole
 ||  | | vaddr_end for 
KASLR
fe00 |   -2TB | fe7f |  0.5 TB | cpu_entry_area 
mapping
-   fe80 |   -1.5  TB | feff |  0.5 TB | ... unused hole
+   fe80 |   -1.5  TB | feff |  0.5 TB | SLUB virtual 
memory
ff00 |   -1TB | ff7f |  0.5 TB | %esp fixup 
stacks
ff80 | -512GB | ffee |  444 GB | ... unused hole
ffef |  -68GB | fffe |   64 GB | EFI region 
mapping space
diff --git a/arch/x86/include/asm/pgtable_64_types.h 
b/arch/x86/include/asm/pgtable_64_types.h
index 38b54b992f32..e1a91eb084c4 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -6,6 +6,7 @@
 
 #ifndef __ASSEMBLY__
 #include 
+#include 
 #include 
 
 /*
@@ -199,6 +200,21 @@ extern unsigned int ptrs_per_p4d;
 #define ESPFIX_PGD_ENTRY   _AC(-2, UL)
 #define ESPFIX_BASE_ADDR   (ESPFIX_PGD_ENTRY << P4D_SHIFT)
 
+#ifdef CONFIG_SLAB_VIRTUAL
+#define SLAB_PGD_ENTRY _AC(-3, UL)
+#define SLAB_BASE_ADDR (SLAB_PGD_ENTRY << P4D_SHIFT)
+#define SLAB_END_ADDR  (SLAB_BASE_ADDR + P4D_SIZE)
+
+/*
+ * We need to define this here because we need it to compute SLAB_META_SIZE
+ * and including slab.h causes a dependency cycle.
+ */
+#define STRUCT_SLAB_SIZE (32 * sizeof(void *))
+#define SLAB_VPAGES ((SLAB_END_ADDR - SLAB_BASE_ADDR) / PAGE_SIZE)
+#define SLAB_META_SIZE ALIGN(SLAB_VPAGES * STRUCT_SLAB_SIZE, PAGE_SIZE)
+#define SLAB_DATA_BASE_ADDR (SLAB_BASE_ADDR + SLAB_META_SIZE)
+#endif /* CONFIG_SLAB_VIRTUAL */
+
 #define CPU_ENTRY_AREA_PGD _AC(-4, UL)
 #define CPU_ENTRY_AREA_BASE(CPU_ENTRY_AREA_PGD << P4D_SHIFT)
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a190aae8ceaf..d716ddfd9880 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1279,16 +1279,19 @@ static void __init register_page_bootmem_info(void)
 }
 
 /*
- * Pre-allocates page-table pages for the vmalloc area in the kernel 
page-table.
+ * Pre-allocates page-table pages for the vmalloc and SLUB areas in the kernel
+ * page-table.
  * Only the level which needs to be synchronized between all page-tables is
  * allocated because the synchronization can be expensive.
  */
-static void __init preallocate_vmalloc_pages(void)
+static void __init preallocate_top_level_entries_range(unsigned long start,
+  unsigned long end)
 {
unsigned long addr;
const char *lvl;
 
-   for (addr = VMALLOC_START; addr <= VMEMORY_END; addr = ALIGN(addr + 1, 
PGDIR_SIZE)) {
+
+   for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
pgd_t *pgd = pgd_offset_k(addr);
p4d_t *p4d;
pud_t *pud;
@@ -1328,6 +1331,14 @@ static void __init preallocate_vmalloc_pages(void)
panic("Failed to pre-allocate %s pages for vmalloc area\n", lvl);
 }
 
+static void __init preallocate_top_level_entries(void)
+{
+   preallocate_top_
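
(The patch is truncated here by the archive.) For a sense of scale, the
constants introduced in the pgtable_64_types.h hunk work out as follows,
assuming 4 KiB pages, a 512 GiB region (P4D_SHIFT = 39) and 8-byte pointers.
This is only a standalone check of the arithmetic, not kernel code:

/* Quick arithmetic check of SLAB_VPAGES / SLAB_META_SIZE. */
#include <stdio.h>

int main(void)
{
	const unsigned long long page_size        = 1ULL << 12;          /* 4 KiB   */
	const unsigned long long region_size      = 1ULL << 39;          /* 512 GiB */
	const unsigned long long struct_slab_size = 32 * sizeof(void *); /* 256 B on 64-bit */

	unsigned long long vpages    = region_size / page_size;
	unsigned long long meta_size = vpages * struct_slab_size;
	unsigned long long data_size = region_size - meta_size;

	printf("virtual pages:   %llu\n", vpages);               /* 134217728 */
	printf("metadata region: %llu GiB\n", meta_size >> 30);  /* 32        */
	printf("data region:     %llu GiB\n", data_size >> 30);  /* 480       */
	return 0;
}

So roughly 32 GiB of the reserved half-terabyte holds the array of struct
slab entries and the remaining 480 GiB is usable slab data.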

[RFC PATCH 11/14] mm/slub: allocate slabs from virtual memory

2023-09-15 Thread Matteo Rizzo
From: Jann Horn 

This is the main implementation of SLAB_VIRTUAL. With SLAB_VIRTUAL
enabled, slab memory is not allocated from the linear map but from a
dedicated region of virtual memory. The code ensures that once a range
of virtual addresses is assigned to a slab cache, that virtual memory is
never reused again except for other slabs in that same cache. This lets
us mitigate some exploits for use-after-free vulnerabilities where the
attacker makes SLUB release a slab page to the page allocator and then
makes it reuse that same page for a different slab cache ("cross-cache
attacks").

With SLAB_VIRTUAL enabled struct slab no longer overlaps struct page but
instead it is allocated from a dedicated region of virtual memory. This
makes it possible to have references to slabs whose physical memory has
been freed.

SLAB_VIRTUAL has a small performance overhead, about 1-2% on kernel
compilation time. We are using 4 KiB pages to map slab pages and the slab
metadata area, instead of the 2 MiB pages that the kernel uses to map
the physmap. We experimented with a version of the patch that uses 2 MiB
pages and we did see some performance improvement but the code also
became much more complicated and ugly because we would need to allocate
and free multiple slabs at once.

In addition to the TLB contention, SLAB_VIRTUAL also adds new locks to
the slow path of the allocator. Lock contention also contributes to the
performance penalty to some extent, and this is more visible on machines
with many CPUs.

Signed-off-by: Jann Horn 
Co-developed-by: Matteo Rizzo 
Signed-off-by: Matteo Rizzo 
---
 arch/x86/include/asm/page_64.h  |  10 +
 arch/x86/include/asm/pgtable_64_types.h |   5 +
 arch/x86/mm/physaddr.c  |  10 +
 include/linux/slab.h|   7 +
 init/main.c |   1 +
 mm/slab.h   | 106 ++
 mm/slab_common.c|   4 +
 mm/slub.c   | 439 +++-
 mm/usercopy.c   |  12 +-
 9 files changed, 587 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index cc6b8e087192..25fb734a2fe6 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -3,6 +3,7 @@
 #define _ASM_X86_PAGE_64_H
 
 #include 
+#include 
 
 #ifndef __ASSEMBLY__
 #include 
@@ -18,10 +19,19 @@ extern unsigned long page_offset_base;
 extern unsigned long vmalloc_base;
 extern unsigned long vmemmap_base;
 
+#ifdef CONFIG_SLAB_VIRTUAL
+unsigned long slab_virt_to_phys(unsigned long x);
+#endif
+
 static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
 {
unsigned long y = x - __START_KERNEL_map;
 
+#ifdef CONFIG_SLAB_VIRTUAL
+   if (is_slab_addr(x))
+   return slab_virt_to_phys(x);
+#endif
+
/* use the carry flag to determine if x was < __START_KERNEL_map */
x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET));
 
diff --git a/arch/x86/include/asm/pgtable_64_types.h 
b/arch/x86/include/asm/pgtable_64_types.h
index e1a91eb084c4..4aae822a6a96 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -213,6 +213,11 @@ extern unsigned int ptrs_per_p4d;
 #define SLAB_VPAGES ((SLAB_END_ADDR - SLAB_BASE_ADDR) / PAGE_SIZE)
 #define SLAB_META_SIZE ALIGN(SLAB_VPAGES * STRUCT_SLAB_SIZE, PAGE_SIZE)
 #define SLAB_DATA_BASE_ADDR (SLAB_BASE_ADDR + SLAB_META_SIZE)
+
+#define is_slab_addr(ptr) ((unsigned long)(ptr) >= SLAB_DATA_BASE_ADDR && \
+   (unsigned long)(ptr) < SLAB_END_ADDR)
+#define is_slab_meta(ptr) ((unsigned long)(ptr) >= SLAB_BASE_ADDR && \
+   (unsigned long)(ptr) < SLAB_DATA_BASE_ADDR)
 #endif /* CONFIG_SLAB_VIRTUAL */
 
 #define CPU_ENTRY_AREA_PGD _AC(-4, UL)
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index fc3f3d3e2ef2..7f1b81c75e4d 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -16,6 +16,11 @@ unsigned long __phys_addr(unsigned long x)
 {
unsigned long y = x - __START_KERNEL_map;
 
+#ifdef CONFIG_SLAB_VIRTUAL
+   if (is_slab_addr(x))
+   return slab_virt_to_phys(x);
+#endif
+
/* use the carry flag to determine if x was < __START_KERNEL_map */
if (unlikely(x > y)) {
x = y + phys_base;
@@ -48,6 +53,11 @@ bool __virt_addr_valid(unsigned long x)
 {
unsigned long y = x - __START_KERNEL_map;
 
+#ifdef CONFIG_SLAB_VIRTUAL
+   if (is_slab_addr(x))
+   return true;
+#endif
+
/* use the carry flag to determine if x was < __START_KERNEL_map */
if (unlikely(x > y)) {
x = y + phys_base;
diff --git a/include/linux/slab.h b/include/linux/slab.h
index a2d82010d269..2180d5170995 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -793,5 +793,12 @@ int slab_dead_cpu(unsigned int cpu);
 #define slab_dead_cpu  NULL
 #endif
 
+#ifdef

[RFC PATCH 12/14] mm/slub: introduce the deallocated_pages sysfs attribute

2023-09-15 Thread Matteo Rizzo
From: Jann Horn 

When SLAB_VIRTUAL is enabled this new sysfs attribute tracks the number
of slab pages whose physical memory has been reclaimed but whose virtual
memory is still allocated to a kmem_cache.

Signed-off-by: Jann Horn 
Co-developed-by: Matteo Rizzo 
Signed-off-by: Matteo Rizzo 
---
 include/linux/slub_def.h |  4 +++-
 mm/slub.c| 18 ++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 693e9bb34edc..eea402d849da 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -88,7 +88,7 @@ struct kmem_cache_cpu {
  */
 struct kmem_cache_virtual {
 #ifdef CONFIG_SLAB_VIRTUAL
-   /* Protects freed_slabs and freed_slabs_min */
+   /* Protects freed_slabs, freed_slabs_min, and nr_free_pages */
spinlock_t freed_slabs_lock;
/*
 * Slabs on this list have virtual memory of size oo allocated to them
@@ -97,6 +97,8 @@ struct kmem_cache_virtual {
struct list_head freed_slabs;
/* Same as freed_slabs but with memory of size min */
struct list_head freed_slabs_min;
+   /* Number of slab pages which got freed */
+   unsigned long nr_freed_pages;
 #endif
 };
 
diff --git a/mm/slub.c b/mm/slub.c
index 66ae60cdadaf..0f7f5bf0b174 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2110,6 +2110,8 @@ static struct slab *get_free_slab(struct kmem_cache *s,
 
if (likely(slab)) {
list_del(&slab->slab_list);
+   WRITE_ONCE(s->virtual.nr_freed_pages,
+   s->virtual.nr_freed_pages - (1UL << slab_order(slab)));
 
spin_unlock_irqrestore(&s->virtual.freed_slabs_lock, flags);
return slab;
@@ -2158,6 +2160,8 @@ static struct slab *alloc_slab_page(struct kmem_cache *s,
/* Rollback: put the struct slab back. */
spin_lock_irqsave(&s->virtual.freed_slabs_lock, flags);
list_add(&slab->slab_list, freed_slabs);
+   WRITE_ONCE(s->virtual.nr_freed_pages,
+   s->virtual.nr_freed_pages + (1UL << slab_order(slab)));
spin_unlock_irqrestore(&s->virtual.freed_slabs_lock, flags);
 
return NULL;
@@ -2438,6 +2442,8 @@ static void slub_tlbflush_worker(struct kthread_work 
*work)
WARN_ON(oo_order(slab->oo) != oo_order(s->min));
list_add(&slab->slab_list, &s->virtual.freed_slabs_min);
}
+   WRITE_ONCE(s->virtual.nr_freed_pages, s->virtual.nr_freed_pages +
+   (1UL << slab_order(slab)));
spin_unlock(&s->virtual.freed_slabs_lock);
}
spin_unlock_irqrestore(&slub_kworker_lock, irq_flags);
@@ -4924,6 +4930,7 @@ static inline void slab_virtual_open(struct kmem_cache *s)
spin_lock_init(&s->virtual.freed_slabs_lock);
INIT_LIST_HEAD(&s->virtual.freed_slabs);
INIT_LIST_HEAD(&s->virtual.freed_slabs_min);
+   s->virtual.nr_freed_pages = 0;
 #endif
 }
 
@@ -6098,6 +6105,14 @@ static ssize_t objects_partial_show(struct kmem_cache 
*s, char *buf)
 }
 SLAB_ATTR_RO(objects_partial);
 
+#ifdef CONFIG_SLAB_VIRTUAL
+static ssize_t deallocated_pages_show(struct kmem_cache *s, char *buf)
+{
+   return sysfs_emit(buf, "%lu\n", READ_ONCE(s->virtual.nr_freed_pages));
+}
+SLAB_ATTR_RO(deallocated_pages);
+#endif /* CONFIG_SLAB_VIRTUAL */
+
 static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
 {
int objects = 0;
@@ -6424,6 +6439,9 @@ static struct attribute *slab_attrs[] = {
&min_partial_attr.attr,
&cpu_partial_attr.attr,
&objects_partial_attr.attr,
+#ifdef CONFIG_SLAB_VIRTUAL
+   &deallocated_pages_attr.attr,
+#endif
&partial_attr.attr,
&cpu_slabs_attr.attr,
&ctor_attr.attr,
-- 
2.42.0.459.ge4e396fd5e-goog
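
A minimal sketch of how the new attribute could be read from userspace once
the series is applied. /sys/kernel/slab/ is where SLUB already exposes its
per-cache attributes; "kmalloc-64" is only an example cache name, and the
file exists only with CONFIG_SLAB_VIRTUAL=y:

/* Read the deallocated_pages counter of one cache. */
#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/sys/kernel/slab/kmalloc-64/deallocated_pages", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("deallocated pages: %s", buf);
	fclose(f);
	return 0;
}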



[RFC PATCH 13/14] mm/slub: sanity-check freepointers

2023-09-15 Thread Matteo Rizzo
From: Jann Horn 

Sanity-check that:
 - non-NULL freepointers point into the slab
 - freepointers look plausibly aligned

Signed-off-by: Jann Horn 
Co-developed-by: Matteo Rizzo 
Signed-off-by: Matteo Rizzo 
---
 lib/slub_kunit.c |  4 
 mm/slab.h|  8 +++
 mm/slub.c| 57 
 3 files changed, 69 insertions(+)

diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c
index d4a3730b08fa..acf8600bd1fd 100644
--- a/lib/slub_kunit.c
+++ b/lib/slub_kunit.c
@@ -45,6 +45,10 @@ static void test_clobber_zone(struct kunit *test)
 #ifndef CONFIG_KASAN
 static void test_next_pointer(struct kunit *test)
 {
+   if (IS_ENABLED(CONFIG_SLAB_VIRTUAL))
+   kunit_skip(test,
+   "incompatible with freepointer corruption detection in 
CONFIG_SLAB_VIRTUAL");
+
struct kmem_cache *s = test_kmem_cache_create("TestSlub_next_ptr_free",
64, SLAB_POISON);
u8 *p = kmem_cache_alloc(s, GFP_KERNEL);
diff --git a/mm/slab.h b/mm/slab.h
index 460c802924bd..8d10a011bdf0 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -79,6 +79,14 @@ struct slab {
 
struct list_head flush_list_elem;
 
+   /*
+* Not in kmem_cache because it depends on whether the allocation is
+* normal order or fallback order.
+* an alternative might be to over-allocate virtual memory for
+* fallback-order pages.
+*/
+   unsigned long align_mask;
+
/* Replaces the page lock */
spinlock_t slab_lock;
 
diff --git a/mm/slub.c b/mm/slub.c
index 0f7f5bf0b174..57474c8a6569 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -392,6 +392,44 @@ static inline freeptr_t freelist_ptr_encode(const struct 
kmem_cache *s,
return (freeptr_t){.v = encoded};
 }
 
+/*
+ * Does some validation of freelist pointers. Without SLAB_VIRTUAL this is
+ * currently a no-op.
+ */
+static inline bool freelist_pointer_corrupted(struct slab *slab, freeptr_t ptr,
+   void *decoded)
+{
+#ifdef CONFIG_SLAB_VIRTUAL
+   /*
+* If the freepointer decodes to 0, use 0 as the slab_base so that
+* the check below always passes (0 & slab->align_mask == 0).
+*/
+   unsigned long slab_base = decoded ? (unsigned long)slab_to_virt(slab)
+   : 0;
+
+   /*
+* This verifies that the SLUB freepointer does not point outside the
+* slab. Since at that point we can basically do it for free, it also
+* checks that the pointer alignment looks vaguely sane.
+* However, we probably don't want the cost of a proper division here,
+* so instead we just do a cheap check whether the bottom bits that are
+* clear in the size are also clear in the pointer.
+* So for kmalloc-32, it does a perfect alignment check, but for
+* kmalloc-192, it just checks that the pointer is a multiple of 32.
+* This should probably be reconsidered - is this a good tradeoff, or
+* should that part be thrown out, or do we want a proper accurate
+* alignment check (and can we make it work with acceptable performance
+* cost compared to the security improvement - probably not)?
+*/
+   return CHECK_DATA_CORRUPTION(
+   ((unsigned long)decoded & slab->align_mask) != slab_base,
+   "bad freeptr (encoded %lx, ptr %p, base %lx, mask %lx",
+   ptr.v, decoded, slab_base, slab->align_mask);
+#else
+   return false;
+#endif
+}
+
 static inline void *freelist_ptr_decode(const struct kmem_cache *s,
freeptr_t ptr, unsigned long ptr_addr,
struct slab *slab)
@@ -403,6 +441,10 @@ static inline void *freelist_ptr_decode(const struct 
kmem_cache *s,
 #else
decoded = (void *)ptr.v;
 #endif
+
+   if (unlikely(freelist_pointer_corrupted(slab, ptr, decoded)))
+   return NULL;
+
return decoded;
 }
 
@@ -2122,6 +2164,21 @@ static struct slab *get_free_slab(struct kmem_cache *s,
if (slab == NULL)
return NULL;
 
+   /*
+* Bits that must be equal to start-of-slab address for all
+* objects inside the slab.
+* For compatibility with pointer tagging (like in HWASAN), this would
+* need to clear the pointer tag bits from the mask.
+*/
+   slab->align_mask = ~((PAGE_SIZE << oo_order(oo)) - 1);
+
+   /*
+* Object alignment bits (must be zero, which is equal to the bits in
+* the start-of-slab address)
+*/
+   if (s->red_left_pad == 0)
+   slab->align_mask |= (1 << (ffs(s->size) - 1)) - 1;
+
return slab;
 }
 
-- 
2.42.0.459.ge4e396fd5e-goog
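
To make the cheap alignment check above concrete, here is a standalone
re-implementation of the same mask construction and comparison, assuming
4 KiB pages and a cache with no red zone padding; the addresses are invented
for the example:

/*
 * align_mask has all bits above the slab size set plus the low bits implied
 * by the object size; a plausible freepointer ANDed with it must give back
 * the slab base address.
 */
#include <stdbool.h>
#include <stdio.h>
#include <strings.h>	/* ffs() */

#define PAGE_SIZE 4096UL

static unsigned long make_align_mask(unsigned int order, unsigned int size)
{
	unsigned long mask = ~((PAGE_SIZE << order) - 1);

	mask |= (1UL << (ffs(size) - 1)) - 1;	/* cheap alignment bits */
	return mask;
}

static bool freeptr_plausible(unsigned long slab_base, unsigned long mask,
			      unsigned long decoded)
{
	if (!decoded)	/* NULL ends the freelist; checked against base 0 */
		return true;
	return (decoded & mask) == slab_base;
}

int main(void)
{
	unsigned long base = 0x7f2a40000000UL;		/* order-0 slab start */
	unsigned long mask = make_align_mask(0, 192);	/* 192-byte objects   */

	printf("inside, aligned    : %d\n", freeptr_plausible(base, mask, base + 0x0c0));
	printf("inside, misaligned : %d\n", freeptr_plausible(base, mask, base + 0x0c1));
	printf("outside the slab   : %d\n", freeptr_plausible(base, mask, base + 0x2000));
	return 0;
}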



[RFC PATCH 14/14] security: add documentation for SLAB_VIRTUAL

2023-09-15 Thread Matteo Rizzo
From: Jann Horn 

Document what SLAB_VIRTUAL is trying to do, how it's implemented, and
why.

Signed-off-by: Jann Horn 
Co-developed-by: Matteo Rizzo 
Signed-off-by: Matteo Rizzo 
---
 Documentation/security/self-protection.rst | 102 +
 1 file changed, 102 insertions(+)

diff --git a/Documentation/security/self-protection.rst 
b/Documentation/security/self-protection.rst
index 910668e665cb..5a5e99e3f244 100644
--- a/Documentation/security/self-protection.rst
+++ b/Documentation/security/self-protection.rst
@@ -314,3 +314,105 @@ To help kill classes of bugs that result in kernel 
addresses being
 written to userspace, the destination of writes needs to be tracked. If
 the buffer is destined for userspace (e.g. seq_file backed ``/proc`` files),
 it should automatically censor sensitive values.
+
+
+Memory Allocator Mitigations
+
+
+Protection against cross-cache attacks (SLAB_VIRTUAL)
+-
+
+SLAB_VIRTUAL is a mitigation that deterministically prevents cross-cache
+attacks.
+
+Linux Kernel use-after-free vulnerabilities are commonly exploited by turning
+them into an object type confusion (having two active pointers of different
+types to the same memory location) using one of the following techniques:
+
+1. Direct object reuse: make the kernel give the victim object back to the slab
+   allocator, then allocate the object again from the same slab cache as a
+   different type. This is only possible if the victim object resides in a slab
+   cache which can contain objects of different types - for example one of the
+   kmalloc caches.
+2. "Cross-cache attack": make the kernel give the victim object back to the 
slab
+   allocator, then make the slab allocator give the page containing the object
+   back to the page allocator, then either allocate the page directly as some
+   other type of page or make the slab allocator allocate it again for a
+   different slab cache and allocate an object from there.
+
+In either case, the important part is that the same virtual address is reused
+for two objects of different types.
+
+The first case can be addressed by separating objects of different types
+into different slab caches. If a slab cache only contains objects of the
+same type, then directly turning a use-after-free into a type confusion is
+impossible as long as the slab page that contains the victim object remains
+assigned to that slab cache. This type of mitigation is easily bypassable
+by cross-cache attacks: if the attacker can make the slab allocator return
+the page containing the victim object to the page allocator and then make
+it use the same page for a different slab cache, type confusion becomes
+possible again. Addressing the first case is therefore only worthwhile if
+cross-cache attacks are also addressed. AUTOSLAB uses a combination of
+probabilistic mitigations for this. SLAB_VIRTUAL addresses the second case
+deterministically by changing the way the slab allocator allocates memory.
+
+Preventing slab virtual address reuse
+~
+
+In theory there is an easy fix against cross-cache attacks: modify the slab
+allocator so that it never gives memory back to the page allocator. In practice
+this would be problematic because physical memory remains permanently assigned
+to a slab cache even if it doesn't contain any active objects. A viable
+cross-cache mitigation must allow the system to reclaim unused physical memory.
+In the current design of the slab allocator there is no way
+to keep a region of virtual memory permanently assigned to a slab cache without
+also permanently reserving physical memory. That is because the virtual
+addresses that the slab allocator uses come from the linear map region, where
+there is a 1:1 correspondence between virtual and physical addresses.
+
+SLAB_VIRTUAL's solution is to create a dedicated virtual memory region that is
+only used for slab memory, and to enforce that once a range of virtual addresses
+is used for a slab cache, it is never reused for any other caches. Using a
+dedicated region of virtual memory lets us reserve ranges of virtual addresses
+to prevent cross-cache attacks and at the same time release physical memory back
+to the system when it's no longer needed. This is what Chromium's PartitionAlloc
+does in userspace
+(https://chromium.googlesource.com/chromium/src/+/354da2514b31df2aa14291199a567e10a7671621/base/allocator/partition_allocator/PartitionAlloc.md).
+
+Implementation
+~~~~~~~~~~~~~~
+
+SLAB_VIRTUAL reserves a region of virtual memory for the slab allocator. All
+pointers returned by the slab allocator point to this region. The region is
+statically partitioned in two sub-regions: the metadata region and the data
+region. The data region is where the actual objects are allocated from. The
+metadata region is an array of struct slab objects, one for each PAGE_SIZE bytes
+in the data region.
+With

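A minimal sketch of the address arithmetic this layout implies (illustrative
only, not code from the patch; slab_data_base and slab_meta_base are assumed
names for the start addresses of the two sub-regions):

 /* assumed sub-region bases, not identifiers from the series */
 static unsigned long slab_data_base, slab_meta_base;

 /*
  * One struct slab of metadata exists per PAGE_SIZE bytes of the data
  * region, so the metadata entry for an address is found by indexing the
  * metadata array with that address's page number within the data region.
  */
 static inline struct slab *sketch_virt_to_slab(unsigned long addr)
 {
         unsigned long idx = (addr - slab_data_base) >> PAGE_SHIFT;

         return &((struct slab *)slab_meta_base)[idx];
 }
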
Re: [PATCH V11 03/17] riscv: Use Zicbop in arch_xchg when available

2023-09-15 Thread Andrew Jones
On Fri, Sep 15, 2023 at 05:22:26AM -0300, Leonardo Bras wrote:
> On Thu, Sep 14, 2023 at 03:47:59PM +0200, Andrew Jones wrote:
> > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guo...@kernel.org wrote:
> > > From: Guo Ren 
...
> > > diff --git a/arch/riscv/include/asm/insn-def.h 
> > > b/arch/riscv/include/asm/insn-def.h
> > > index 6960beb75f32..dc590d331894 100644
> > > --- a/arch/riscv/include/asm/insn-def.h
> > > +++ b/arch/riscv/include/asm/insn-def.h
> > > @@ -134,6 +134,7 @@
> > >  
> > >  #define RV_OPCODE_MISC_MEM   RV_OPCODE(15)
> > >  #define RV_OPCODE_SYSTEM RV_OPCODE(115)
> > > +#define RV_OPCODE_PREFETCH   RV_OPCODE(19)
> > 
> > This should be named RV_OPCODE_OP_IMM and be placed in
> > numerical order with the others, i.e. above SYSTEM.
> > 
> > >  
> > >  #define HFENCE_VVMA(vaddr, asid) \
> > >   INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),  \
> > > @@ -196,4 +197,8 @@
> > >   INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),  \
> > >  RS1(base), SIMM12(4))
> > >  
> > > +#define CBO_prefetchw(base)  \
> > 
> > Please name this 'PREFETCH_w' and it should take an immediate parameter,
> > even if we intend to pass 0 for it.
> 
> It makes sense.
> 
> The mnemonic in the previously mentioned documentation is:
> 
> prefetch.w offset(base)
> 
> So yeah, makes sense to have both offset and base as parameters for 
> CBO_prefetchw (or PREFETCH_w, I have no strong preference).

I have a strong preference :-)

PREFETCH_w is consistent with the naming we already have for e.g.
cbo.clean, which is CBO_clean. The instruction we're picking a name
for now is prefetch.w, not cbo.prefetchw.

> 
> > 
> > > + INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0), \
> > > +RD(x0), RS1(base), RS2(x0))
> > 
> > prefetch.w is not an R-type instruction, it's an S-type. While the bit
> > shifts are the same, the names are different. We need to add S-type
> > names while defining this instruction. 
> 
> That is correct, it is supposed to look like a store instruction (S-type), 
> even though the documentation doesn't explicitly state that.
> 
> Even though it works fine with the R-type definition, code documentation 
> would be wrong, and future changes could break it.
> 
> > Then, this define would be
> > 
> >  #define PREFETCH_w(base, imm) \

I should have suggested 'offset' instead of 'imm' for the second parameter
name.

> >  INSN_S(OPCODE_OP_IMM, FUNC3(6), IMM_11_5(imm), __IMM_4_0(0), \
> > RS1(base), __RS2(3))
> 
> s/OPCODE_OP_IMM/OPCODE_PREFETCH
> 0x4 vs 0x13

There's no major opcode named "PREFETCH" and the spec says that the major
opcode used for prefetch instructions is OP-IMM. That's why we want to
name this OPCODE_OP_IMM. I'm not sure where the 0x4 you're referring to
comes from. A 32-bit instruction has the lowest two bits set (figure 1.1
of the unpriv spec) and table 27.1 of the unpriv spec shows OP-IMM is
0b00100xx, so we have 0b0010011. Keeping the naming of the opcode macros
consistent with the spec also keeps them consistent with the .insn
directive where we could even use the names directly, i.e.

 .insn s OP_IMM, 6, x3, 0(a0)

> > 
> > When the assembler as insn_r I hope it will validate that

I meant insn_s here, which would be the macro for '.insn s'

> > (imm & 0xfe0) == imm

I played with it. It won't do what we want for prefetch, only
what works for s-type instructions in general, i.e. it allows
+/-2047 offsets and fails for everything else. That's good enough.
We can just mask off the low 5 bits here in our macro

 #define PREFETCH_w(base, offset) \
INSN_S(OPCODE_OP_IMM, FUNC3(6), IMM_11_5((offset) & ~0x1f), \
   __IMM_4_0(0), RS1(base), __RS2(3))
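
For illustration, one way such a macro could then be consumed (a sketch, not
the code from this series; a real implementation would additionally gate the
instruction behind runtime Zicbop detection, e.g. with an ALTERNATIVE() nop
fallback, which is omitted here for brevity):

 #define ARCH_HAS_PREFETCHW
 static inline void prefetchw(const void *ptr)
 {
         /* emit "prefetch.w 0(ptr)" using the PREFETCH_w() macro above */
         __asm__ __volatile__(PREFETCH_w(%0, 0) : : "r" (ptr) : "memory");
 }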

Thanks,
drew


Re: [PATCH V11 03/17] riscv: Use Zicbop in arch_xchg when available

2023-09-15 Thread Conor Dooley
On Fri, Sep 15, 2023 at 01:07:40PM +0200, Andrew Jones wrote:
> On Fri, Sep 15, 2023 at 05:22:26AM -0300, Leonardo Bras wrote:
> > On Thu, Sep 14, 2023 at 03:47:59PM +0200, Andrew Jones wrote:
> > > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guo...@kernel.org wrote:
> > > > From: Guo Ren 
> ...
> > > > diff --git a/arch/riscv/include/asm/insn-def.h 
> > > > b/arch/riscv/include/asm/insn-def.h
> > > > index 6960beb75f32..dc590d331894 100644
> > > > --- a/arch/riscv/include/asm/insn-def.h
> > > > +++ b/arch/riscv/include/asm/insn-def.h
> > > > @@ -134,6 +134,7 @@
> > > >  
> > > >  #define RV_OPCODE_MISC_MEM RV_OPCODE(15)
> > > >  #define RV_OPCODE_SYSTEM   RV_OPCODE(115)
> > > > +#define RV_OPCODE_PREFETCH RV_OPCODE(19)
> > > 
> > > This should be named RV_OPCODE_OP_IMM and be placed in
> > > numerical order with the others, i.e. above SYSTEM.
> > > 
> > > >  
> > > >  #define HFENCE_VVMA(vaddr, asid)   \
> > > > INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),  \
> > > > @@ -196,4 +197,8 @@
> > > > INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),  \
> > > >RS1(base), SIMM12(4))
> > > >  
> > > > +#define CBO_prefetchw(base)\
> > > 
> > > Please name this 'PREFETCH_w' and it should take an immediate parameter,
> > > even if we intend to pass 0 for it.
> > 
> > It makes sense.
> > 
> > The mnemonic in the previously mentioned documentation is:
> > 
> > prefetch.w offset(base)
> > 
> > So yeah, makes sense to have both offset and base as parameters for 
> > CBO_prefetchw (or PREFETCH_w, I have no strong preference).
> 
> I have a strong preference :-)
> 
> PREFETCH_w is consistent with the naming we already have for e.g.
> cbo.clean, which is CBO_clean. The instruction we're picking a name
> for now is prefetch.w, not cbo.prefetchw.

btw, the CBO_foo stuff was named that way as we were using them in
alternatives originally as an argument, that manifested as:
"cbo." __stringify(_op) " (a0)\n\t"
That was later changed to
CBO_##_op(a0)
but the then un-needed (AFAICT) capitalisation was kept to avoid
touching the callsites of the alternative. Maybe you remember better
than I do, drew, since the idea was yours & I forgot I even wrote that
patch.
If this isn't being used in a similar manner, then the w has no reason
to be in the odd lowercase form.

Cheers,
Conor.

> 
> > 
> > > 
> > > > +   INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0), \
> > > > +  RD(x0), RS1(base), RS2(x0))
> > > 
> > > prefetch.w is not an R-type instruction, it's an S-type. While the bit
> > > shifts are the same, the names are different. We need to add S-type
> > > names while defining this instruction. 
> > 
> > That is correct, it is supposed to look like a store instruction (S-type), 
> > even though the documentation doesn't explicitly state that.
> > 
> > Even though it works fine with the R-type definition, code documentation 
> > would be wrong, and future changes could break it.
> > 
> > > Then, this define would be
> > > 
> > >  #define PREFETCH_w(base, imm) \
> 
> I should have suggested 'offset' instead of 'imm' for the second parameter
> name.
> 
> > >  INSN_S(OPCODE_OP_IMM, FUNC3(6), IMM_11_5(imm), __IMM_4_0(0), \
> > > RS1(base), __RS2(3))
> > 
> > s/OPCODE_OP_IMM/OPCODE_PREFETCH
> > 0x4 vs 0x13
> 
> There's no major opcode named "PREFETCH" and the spec says that the major
> opcode used for prefetch instructions is OP-IMM. That's why we want to
> name this OPCODE_OP_IMM. I'm not sure where the 0x4 you're referring to
> comes from. A 32-bit instruction has the lowest two bits set (figure 1.1
> of the unpriv spec) and table 27.1 of the unpriv spec shows OP-IMM is
> 0b00100xx, so we have 0b0010011. Keeping the naming of the opcode macros
> consistent with the spec also keeps them consistent with the .insn
> directive where we could even use the names directly, i.e.
> 
>  .insn s OP_IMM, 6, x3, 0(a0)
> 
> > > 
> > > When the assembler as insn_r I hope it will validate that
> 
> I meant insn_s here, which would be the macro for '.insn s'
> 
> > > (imm & 0xfe0) == imm
> 
> I played with it. It won't do what we want for prefetch, only
> what works for s-type instructions in general, i.e. it allows
> +/-2047 offsets and fails for everything else. That's good enough.
> We can just mask off the low 5 bits here in our macro
> 
>  #define PREFETCH_w(base, offset) \
> INSN_S(OPCODE_OP_IMM, FUNC3(6), IMM_11_5((offset) & ~0x1f), \
>__IMM_4_0(0), RS1(base), __RS2(3))
> 
> Thanks,
> drew


signature.asc
Description: PGP signature


Re: [PATCH V11 03/17] riscv: Use Zicbop in arch_xchg when available

2023-09-15 Thread Conor Dooley
Yo,

On Thu, Sep 14, 2023 at 04:47:18PM +0200, Andrew Jones wrote:
> On Thu, Sep 14, 2023 at 04:25:53PM +0200, Andrew Jones wrote:
> > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guo...@kernel.org wrote:
> > > From: Guo Ren 
> > > 
> > > Cache-block prefetch instructions are HINTs to the hardware to
> > > indicate that software intends to perform a particular type of
> > > memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> > > improve the arch_xchg for qspinlock xchg_tail.
> > > 
> > > Signed-off-by: Guo Ren 
> > > Signed-off-by: Guo Ren 
> > > ---
> > >  arch/riscv/Kconfig | 15 +++
> > >  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
> > >  arch/riscv/include/asm/hwcap.h |  1 +
> > >  arch/riscv/include/asm/insn-def.h  |  5 +
> > >  arch/riscv/include/asm/processor.h | 13 +
> > >  arch/riscv/kernel/cpufeature.c |  1 +
> > >  6 files changed, 38 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > index e9ae6fa232c3..2c346fe169c1 100644
> > > --- a/arch/riscv/Kconfig
> > > +++ b/arch/riscv/Kconfig
> > > @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
> > >  
> > >  If you don't know what to do here, say Y.
> > >  
> > > +config RISCV_ISA_ZICBOP
> > 
> > Even if we're not concerned with looping over blocks yet, I think we
> > should introduce zicbop block size DT parsing at the same time we bring
> > zicbop support to the kernel (it's just more copy+paste from zicbom and
> > zicboz). It's a bit annoying that the CMO spec doesn't state that block
> > sizes should be the same for m/z/p. And, the fact that m/z/p are all
> > separate extensions leads us to needing to parse block sizes for all
> > three, despite the fact that in practice they'll probably be the same.
> 
> Although, I saw on a different mailing list that Andrei Warkentin
> interpreted section 2.7 "Software Discovery" of the spec, which states
> 
> """
> The initial set of CMO extensions requires the following information to be
> discovered by software:
> 
> * The size of the cache block for management and prefetch instructions
> * The size of the cache block for zero instructions
> * CBIE support at each privilege level
> 
> Other general cache characteristics may also be specified in the discovery
> mechanism.
> """
> 
> as management and prefetch having the same block size and only zero
> potentially having a different size. That looks like a reasonable
> interpretation to me, too.

TBH, I don't really care what ambiguous wording the spec has used, we
have the opportunity to make better decisions if we please. I hate the
fact that the specs are often not abundantly clear about things like this.

> So, we could maybe proceed with assuming we
> can use zicbom_block_size for prefetch, for now. If a platform comes along
> that interpreted the spec differently, requiring prefetch block size to
> be specified separately, then we'll cross that bridge when we get there.

That said, I think I suggested originally having the zicboz stuff default
to the zicbom size too, so I'd be happy with prefetch stuff working
exclusively that way until someone comes along looking for different sizes.
The binding should be updated though since

  riscv,cbom-block-size:
$ref: /schemas/types.yaml#/definitions/uint32
description:
  The blocksize in bytes for the Zicbom cache operations.

would no longer be a complete description.

While thinking about new wording though, it feels really clunky to describe
it like:
The block size in bytes for the Zicbom cache operations, Zicbop
cache operations will default to this block size where not
explicitly defined.

since there's then no way to actually define the block size if it is
different. Unless you've got some magic wording, I'd rather document
riscv,cbop-block-size, even if we are going to use riscv,cbom-block-size
as the default.

Cheers,
Conor.


signature.asc
Description: PGP signature


Re: [PATCH V11 03/17] riscv: Use Zicbop in arch_xchg when available

2023-09-15 Thread Andrew Jones
On Fri, Sep 15, 2023 at 12:37:50PM +0100, Conor Dooley wrote:
> Yo,
> 
> On Thu, Sep 14, 2023 at 04:47:18PM +0200, Andrew Jones wrote:
> > On Thu, Sep 14, 2023 at 04:25:53PM +0200, Andrew Jones wrote:
> > > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guo...@kernel.org wrote:
> > > > From: Guo Ren 
> > > > 
> > > > Cache-block prefetch instructions are HINTs to the hardware to
> > > > indicate that software intends to perform a particular type of
> > > > memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> > > > improve the arch_xchg for qspinlock xchg_tail.
> > > > 
> > > > Signed-off-by: Guo Ren 
> > > > Signed-off-by: Guo Ren 
> > > > ---
> > > >  arch/riscv/Kconfig | 15 +++
> > > >  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
> > > >  arch/riscv/include/asm/hwcap.h |  1 +
> > > >  arch/riscv/include/asm/insn-def.h  |  5 +
> > > >  arch/riscv/include/asm/processor.h | 13 +
> > > >  arch/riscv/kernel/cpufeature.c |  1 +
> > > >  6 files changed, 38 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > > index e9ae6fa232c3..2c346fe169c1 100644
> > > > --- a/arch/riscv/Kconfig
> > > > +++ b/arch/riscv/Kconfig
> > > > @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
> > > >  
> > > >If you don't know what to do here, say Y.
> > > >  
> > > > +config RISCV_ISA_ZICBOP
> > > 
> > > Even if we're not concerned with looping over blocks yet, I think we
> > > should introduce zicbop block size DT parsing at the same time we bring
> > > zicbop support to the kernel (it's just more copy+paste from zicbom and
> > > zicboz). It's a bit annoying that the CMO spec doesn't state that block
> > > sizes should be the same for m/z/p. And, the fact that m/z/p are all
> > > separate extensions leads us to needing to parse block sizes for all
> > > three, despite the fact that in practice they'll probably be the same.
> > 
> > Although, I saw on a different mailing list that Andrei Warkentin
> > interpreted section 2.7 "Software Discovery" of the spec, which states
> > 
> > """
> > The initial set of CMO extensions requires the following information to be
> > discovered by software:
> > 
> > * The size of the cache block for management and prefetch instructions
> > * The size of the cache block for zero instructions
> > * CBIE support at each privilege level
> > 
> > Other general cache characteristics may also be specified in the discovery
> > mechanism.
> > """
> > 
> > as management and prefetch having the same block size and only zero
> > potentially having a different size. That looks like a reasonable
> > interpretation to me, too.
> 
> TBH, I don't really care what ambiguous wording the spec has used, we
> have the opportunity to make better decisions if we please. I hate the
> fact that the specs are often not abundantly clear about things like this.
> 
> > So, we could maybe proceed with assuming we
> > can use zicbom_block_size for prefetch, for now. If a platform comes along
> > that interpreted the spec differently, requiring prefetch block size to
> > be specified separately, then we'll cross that bridge when we get there.
> 
> That said, I think I suggested originally having the zicboz stuff default
> to the zicbom size too, so I'd be happy with prefetch stuff working
> exclusively that way until someone comes along looking for different sizes.
> The binding should be updated though since
> 
>   riscv,cbom-block-size:
> $ref: /schemas/types.yaml#/definitions/uint32
> description:
>   The blocksize in bytes for the Zicbom cache operations.
> 
> would no longer be a complete description.
> 
> While thinking about new wording though, it feels really clunky to describe
> it like:
>   The block size in bytes for the Zicbom cache operations, Zicbop
>   cache operations will default to this block size where not
>   explicitly defined.
> 
> since there's then no way to actually define the block size if it is
> different. Unless you've got some magic wording, I'd rather document
> riscv,cbop-block-size, even if we are going to use riscv,cbom-block-size
> as the default.
>

Sounds good to me, but if it's documented, then we should probably
implement its parsing. Then, at that point, I wonder if it makes sense to
have the fallback at all, or if it's not better just to require all the
DTs to be explicit (even if redundant).
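
As a rough sketch, the parsing in question could mirror the existing Zicbom
handling along these lines (illustrative only; the riscv_cbop_block_size
variable and the fallback-to-cbom behaviour are assumptions, and the final
form would follow whatever the binding ends up saying):

 /* assumed global, mirroring riscv_cbom_block_size */
 static unsigned int riscv_cbop_block_size;

 static void sketch_cbop_block_size(struct device_node *cpu)
 {
         u32 val;

         /* property name as proposed above; fall back to the Zicbom size */
         if (!of_property_read_u32(cpu, "riscv,cbop-block-size", &val))
                 riscv_cbop_block_size = val;
         else
                 riscv_cbop_block_size = riscv_cbom_block_size;
 }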

Thanks,
drew


Re: [PATCH V11 03/17] riscv: Use Zicbop in arch_xchg when available

2023-09-15 Thread Andrew Jones
On Fri, Sep 15, 2023 at 12:26:20PM +0100, Conor Dooley wrote:
> On Fri, Sep 15, 2023 at 01:07:40PM +0200, Andrew Jones wrote:
> > On Fri, Sep 15, 2023 at 05:22:26AM -0300, Leonardo Bras wrote:
> > > On Thu, Sep 14, 2023 at 03:47:59PM +0200, Andrew Jones wrote:
> > > > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guo...@kernel.org wrote:
> > > > > From: Guo Ren 
> > ...
> > > > > diff --git a/arch/riscv/include/asm/insn-def.h 
> > > > > b/arch/riscv/include/asm/insn-def.h
> > > > > index 6960beb75f32..dc590d331894 100644
> > > > > --- a/arch/riscv/include/asm/insn-def.h
> > > > > +++ b/arch/riscv/include/asm/insn-def.h
> > > > > @@ -134,6 +134,7 @@
> > > > >  
> > > > >  #define RV_OPCODE_MISC_MEM   RV_OPCODE(15)
> > > > >  #define RV_OPCODE_SYSTEM RV_OPCODE(115)
> > > > > +#define RV_OPCODE_PREFETCH   RV_OPCODE(19)
> > > > 
> > > > This should be named RV_OPCODE_OP_IMM and be placed in
> > > > numerical order with the others, i.e. above SYSTEM.
> > > > 
> > > > >  
> > > > >  #define HFENCE_VVMA(vaddr, asid) \
> > > > >   INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),  \
> > > > > @@ -196,4 +197,8 @@
> > > > >   INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),  \
> > > > >  RS1(base), SIMM12(4))
> > > > >  
> > > > > +#define CBO_prefetchw(base)  \
> > > > 
> > > > Please name this 'PREFETCH_w' and it should take an immediate parameter,
> > > > even if we intend to pass 0 for it.
> > > 
> > > It makes sense.
> > > 
> > > The mnemonic in the previously mentioned documentation is:
> > > 
> > > prefetch.w offset(base)
> > > 
> > > So yeah, makes sense to have both offset and base as parameters for 
> > > CBO_prefetchw (or PREFETCH_w, I have no strong preference).
> > 
> > I have a strong preference :-)
> > 
> > PREFETCH_w is consistent with the naming we already have for e.g.
> > cbo.clean, which is CBO_clean. The instruction we're picking a name
> > for now is prefetch.w, not cbo.prefetchw.
> 
> btw, the CBO_foo stuff was named that way as we were using them in
> alternatives originally as an argument, that manifested as:
> "cbo." __stringify(_op) " (a0)\n\t"
> That was later changed to
> CBO_##_op(a0)
> but the then un-needed (AFAICT) capitalisation was kept to avoid
> touching the callsites of the alternative. Maybe you remember better
> than I do, drew, since the idea was yours & I forgot I even wrote that
> patch.

And I forgot anything I may have suggested about it :-)

> If this isn't being used in a similar manner, then the w has no reason
> to be in the odd lowercase form.

Other than to be consistent... However, the CBO_* instructions are not
consistent with the rest of macros. If we don't need lowercase for any
reason, then my preference would be to bite the bullet and change all the
callsites of CBO_* macros and then introduce this new instruction as
PREFETCH_W

Thanks,
drew


Re: [PATCH V11 03/17] riscv: Use Zicbop in arch_xchg when available

2023-09-15 Thread Guo Ren
On Wed, Sep 13, 2023 at 4:50 PM Leonardo Bras  wrote:
>
> On Sun, Sep 10, 2023 at 04:28:57AM -0400, guo...@kernel.org wrote:
> > From: Guo Ren 
> >
> > Cache-block prefetch instructions are HINTs to the hardware to
> > indicate that software intends to perform a particular type of
> > memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> > improve the arch_xchg for qspinlock xchg_tail.
> >
> > Signed-off-by: Guo Ren 
> > Signed-off-by: Guo Ren 
> > ---
> >  arch/riscv/Kconfig | 15 +++
> >  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
> >  arch/riscv/include/asm/hwcap.h |  1 +
> >  arch/riscv/include/asm/insn-def.h  |  5 +
> >  arch/riscv/include/asm/processor.h | 13 +
> >  arch/riscv/kernel/cpufeature.c |  1 +
> >  6 files changed, 38 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index e9ae6fa232c3..2c346fe169c1 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
> >
> >  If you don't know what to do here, say Y.
> >
> > +config RISCV_ISA_ZICBOP
> > + bool "Zicbop extension support for cache block prefetch"
> > + depends on MMU
> > + depends on RISCV_ALTERNATIVE
> > + default y
> > + help
> > +Adds support to dynamically detect the presence of the ZICBOP
> > +extension (Cache Block Prefetch Operations) and enable its
> > +usage.
> > +
> > +The Zicbop extension can be used to prefetch cache block for
> > +read/write/instruction fetch.
> > +
> > +If you don't know what to do here, say Y.
> > +
> >  config TOOLCHAIN_HAS_ZIHINTPAUSE
> >   bool
> >   default y
> > diff --git a/arch/riscv/include/asm/cmpxchg.h 
> > b/arch/riscv/include/asm/cmpxchg.h
> > index 702725727671..56eff7a9d2d2 100644
> > --- a/arch/riscv/include/asm/cmpxchg.h
> > +++ b/arch/riscv/include/asm/cmpxchg.h
> > @@ -11,6 +11,7 @@
> >
> >  #include 
> >  #include 
> > +#include 
> >
> >  #define __arch_xchg_masked(prepend, append, r, p, n) \
> >  ({   \
> > @@ -25,6 +26,7 @@
> >   \
> >   __asm__ __volatile__ (  \
> >  prepend  \
> > +PREFETCHW_ASM(%5)\
> >  "0:  lr.w %0, %2\n"  \
> >  "and  %1, %0, %z4\n" \
> >  "or   %1, %1, %z3\n" \
> > @@ -32,7 +34,7 @@
> >  "bnez %1, 0b\n"  \
> >  append   \
> >  : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))   \
> > -: "rJ" (__newx), "rJ" (~__mask)  \
> > +: "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b) \
> >  : "memory"); \
> >   \
> >   r = (__typeof__(*(p)))((__retx & __mask) >> __s);   \
> > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > index b7b58258f6c7..78b7b8b53778 100644
> > --- a/arch/riscv/include/asm/hwcap.h
> > +++ b/arch/riscv/include/asm/hwcap.h
> > @@ -58,6 +58,7 @@
> >  #define RISCV_ISA_EXT_ZICSR  40
> >  #define RISCV_ISA_EXT_ZIFENCEI   41
> >  #define RISCV_ISA_EXT_ZIHPM  42
> > +#define RISCV_ISA_EXT_ZICBOP 43
> >
> >  #define RISCV_ISA_EXT_MAX64
> >
> > diff --git a/arch/riscv/include/asm/insn-def.h 
> > b/arch/riscv/include/asm/insn-def.h
> > index 6960beb75f32..dc590d331894 100644
> > --- a/arch/riscv/include/asm/insn-def.h
> > +++ b/arch/riscv/include/asm/insn-def.h
> > @@ -134,6 +134,7 @@
> >
> >  #define RV_OPCODE_MISC_MEM   RV_OPCODE(15)
> >  #define RV_OPCODE_SYSTEM RV_OPCODE(115)
> > +#define RV_OPCODE_PREFETCH   RV_OPCODE(19)
> >
> >  #define HFENCE_VVMA(vaddr, asid) \
> >   INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),  \
> > @@ -196,4 +197,8 @@
> >   INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),  \
> >  RS1(base), SIMM12(4))
> >
> > +#define CBO_prefetchw(base)  \
> > + INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0), \
> > +RD(x0), RS1(base), RS2(x0))
> > +
>
> I understand that here you create the instruction via bitfield, following
> the ISA, and this enables using instructions not available on the
> toolchain.
>
> It took me some time to find the document with this instruction, so please
> add this to the commit msg:
>
> https://github.com/riscv/risc

Re: [PATCH V11 03/17] riscv: Use Zicbop in arch_xchg when available

2023-09-15 Thread Conor Dooley

> > If this isn't being used in a similar manner, then the w has no reason
> > to be in the odd lowercase form.
> 
> Other than to be consistent... However, the CBO_* instructions are not
> consistent with the rest of macros. If we don't need lowercase for any
> reason, then my preference would be to bite the bullet and change all the
> callsites of CBO_* macros and then introduce this new instruction as
> PREFETCH_W

Aye, I probably should've done it to begin with. Maybe there was some
other consideration at the time.


signature.asc
Description: PGP signature


Re: [PATCH V11 03/17] riscv: Use Zicbop in arch_xchg when available

2023-09-15 Thread Conor Dooley
On Fri, Sep 15, 2023 at 02:14:40PM +0200, Andrew Jones wrote:
> On Fri, Sep 15, 2023 at 12:37:50PM +0100, Conor Dooley wrote:
> > On Thu, Sep 14, 2023 at 04:47:18PM +0200, Andrew Jones wrote:
> > > On Thu, Sep 14, 2023 at 04:25:53PM +0200, Andrew Jones wrote:
> > > > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guo...@kernel.org wrote:
> > > > > From: Guo Ren 

> > > So, we could maybe proceed with assuming we
> > > can use zicbom_block_size for prefetch, for now. If a platform comes along
> > > that interpreted the spec differently, requiring prefetch block size to
> > > be specified separately, then we'll cross that bridge when we get there.
> > 
> > That said, I think I suggested originally having the zicboz stuff default
> > to the zicbom size too, so I'd be happy with prefetch stuff working
> > exclusively that way until someone comes along looking for different sizes.
> > The binding should be updated though since
> > 
> >   riscv,cbom-block-size:
> > $ref: /schemas/types.yaml#/definitions/uint32
> > description:
> >   The blocksize in bytes for the Zicbom cache operations.
> > 
> > would no longer be a complete description.
> > 
> > While thinking about new wording though, it feels really clunky to describe
> > it like:
> > The block size in bytes for the Zicbom cache operations, Zicbop
> > cache operations will default to this block size where not
> > explicitly defined.
> > 
> > since there's then no way to actually define the block size if it is
> > different. Unless you've got some magic wording, I'd rather document
> > riscv,cbop-block-size, even if we are going to use riscv,cbom-block-size
> > as the default.
> >
> 
> Sounds good to me, but if it's documented, then we should probably
> implement its parsing. Then, at that point, I wonder if it makes sense to
> have the fallback at all, or if it's not better just to require all the
> DTs to be explicit (even if redundant).

Sure, why not I guess.


signature.asc
Description: PGP signature


Re: [RFC PATCH 00/14] Prevent cross-cache attacks in the SLUB allocator

2023-09-15 Thread Dave Hansen
On 9/15/23 03:59, Matteo Rizzo wrote:
> The goal of this patch series is to deterministically prevent cross-cache
> attacks in the SLUB allocator.

What's the cost?


Re: [RFC PATCH 2/2] docs: Update kernel-parameters.txt for signature verification enhancement

2023-09-15 Thread Randy Dunlap



On 9/14/23 04:27, Alessandro Carminati (Red Hat) wrote:
> Update kernel-parameters.txt to reflect new deferred signature
> verification.
> Enhances boot speed by allowing unsigned modules in initrd after
> bootloader check.
> 
> Signed-off-by: Alessandro Carminati (Red Hat) 
> ---
>  Documentation/admin-guide/kernel-parameters.txt | 9 +
>  1 file changed, 9 insertions(+)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> b/Documentation/admin-guide/kernel-parameters.txt
> index 0c38a8af95ce..beec86f0dd05 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -3410,6 +3410,15 @@
>   Note that if CONFIG_MODULE_SIG_FORCE is set, that
>   is always true, so this option does nothing.
>  
> + module_sig_check_wait=
> + This parameter enables delayed activation of module
> + signature checks, deferring the process until userspace
> + triggers it. Once activated, this setting becomes
> + permanent and cannot be reversed. This feature proves
> + valuable for incorporating unsigned modules within
> + initrd, especially after bootloader verification.
> + By employing this option, boot times can be quicker.
> +

Please keep the entries here in alphabetical order.
This new entry should be after module_blacklist, not before it.
Thanks.

>   module_blacklist=  [KNL] Do not load a comma-separated list of
>   modules.  Useful for debugging problem modules.
>  

-- 
~Randy


Re: [RFC PATCH 00/14] Prevent cross-cache attacks in the SLUB allocator

2023-09-15 Thread Lameter, Christopher

On Fri, 15 Sep 2023, Dave Hansen wrote:

> On 9/15/23 03:59, Matteo Rizzo wrote:
> > The goal of this patch series is to deterministically prevent cross-cache
> > attacks in the SLUB allocator.
>
> What's the cost?

The only thing that I see is 1-2% on kernel compilations (and "more on
machines with lots of cores")?


Having a virtualized slab subsystem could enable other things:

- The page order calculation could be simplified since vmalloc can stitch 
arbitrary base pages together to form larger contiguous virtual segments. 
So just use f.e. order 5 or so for all slabs to reduce contention?


- Maybe we could make slab pages movable (if we can ensure that slab 
objects are not touched somehow. At least stop_machine run could be used 
to move batches of slab memory)


- Maybe we can avoid allocating page structs somehow for slab memory? 
Looks like this is taking a step into that direction. The metadata storage 
of the slab allocator could be reworked and optimized better.


Problems:

- Overhead due to more TLB lookups

- Larger amounts of TLBs are used for the OS. Currently we are trying to 
use the maximum mappable TLBs to reduce their numbers. This presumably 
means using 4K TLBs for all slab access.


- Memory may not be physically contiguous which may be required by 
some drivers doing DMA.






Re: [PATCH V11 03/17] riscv: Use Zicbop in arch_xchg when available

2023-09-15 Thread Leonardo Bras
On Fri, Sep 15, 2023 at 01:07:40PM +0200, Andrew Jones wrote:
> On Fri, Sep 15, 2023 at 05:22:26AM -0300, Leonardo Bras wrote:
> > On Thu, Sep 14, 2023 at 03:47:59PM +0200, Andrew Jones wrote:
> > > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guo...@kernel.org wrote:
> > > > From: Guo Ren 
> ...
> > > > diff --git a/arch/riscv/include/asm/insn-def.h 
> > > > b/arch/riscv/include/asm/insn-def.h
> > > > index 6960beb75f32..dc590d331894 100644
> > > > --- a/arch/riscv/include/asm/insn-def.h
> > > > +++ b/arch/riscv/include/asm/insn-def.h
> > > > @@ -134,6 +134,7 @@
> > > >  
> > > >  #define RV_OPCODE_MISC_MEM RV_OPCODE(15)
> > > >  #define RV_OPCODE_SYSTEM   RV_OPCODE(115)
> > > > +#define RV_OPCODE_PREFETCH RV_OPCODE(19)
> > > 
> > > This should be named RV_OPCODE_OP_IMM and be placed in
> > > numerical order with the others, i.e. above SYSTEM.
> > > 
> > > >  
> > > >  #define HFENCE_VVMA(vaddr, asid)   \
> > > > INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),  \
> > > > @@ -196,4 +197,8 @@
> > > > INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),  \
> > > >RS1(base), SIMM12(4))
> > > >  
> > > > +#define CBO_prefetchw(base)\
> > > 
> > > Please name this 'PREFETCH_w' and it should take an immediate parameter,
> > > even if we intend to pass 0 for it.
> > 
> > It makes sense.
> > 
> > The mnemonic in the previously mentioned documentation is:
> > 
> > prefetch.w offset(base)
> > 
> > So yeah, makes sense to have both offset and base as parameters for 
> > CBO_prefetchw (or PREFETCH_w, I have no strong preference).
> 
> I have a strong preference :-)
> 
> PREFETCH_w is consistent with the naming we already have for e.g.
> cbo.clean, which is CBO_clean. The instruction we're picking a name
> for now is prefetch.w, not cbo.prefetchw.
> 
> > 
> > > 
> > > > +   INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0), \
> > > > +  RD(x0), RS1(base), RS2(x0))
> > > 
> > > prefetch.w is not an R-type instruction, it's an S-type. While the bit
> > > shifts are the same, the names are different. We need to add S-type
> > > names while defining this instruction. 
> > 
> > That is correct, it is supposed to look like a store instruction (S-type), 
> > even though the documentation doesn't explicitly state that.
> > 
> > Even though it works fine with the R-type definition, code documentation 
> > would be wrong, and future changes could break it.
> > 
> > > Then, this define would be
> > > 
> > >  #define PREFETCH_w(base, imm) \
> 
> I should have suggested 'offset' instead of 'imm' for the second parameter
> name.
> 
> > >  INSN_S(OPCODE_OP_IMM, FUNC3(6), IMM_11_5(imm), __IMM_4_0(0), \
> > > RS1(base), __RS2(3))
> > 
> > s/OPCODE_OP_IMM/OPCODE_PREFETCH
> > 0x4 vs 0x13
> 
> There's no major opcode named "PREFETCH" and the spec says that the major
> opcode used for prefetch instructions is OP-IMM. That's why we want to
> name this OPCODE_OP_IMM. I'm not sure where the 0x4 you're referring to
> comes from

Oh, you are right.

Sorry about this, I misinterpreted table 24.1 from the 
Unprivileged ISA (20191213). 

Yeap, everything makes sense now, and the define below is not actually
needed:

> > > > +#define RV_OPCODE_PREFETCH RV_OPCODE(19)

Thanks!
Leo


> . A 32-bit instruction has the lowest two bits set (figure 1.1
> of the unpriv spec) and table 27.1 of the unpriv spec shows OP-IMM is
> 0b00100xx, so we have 0b0010011. Keeping the naming of the opcode macros
> consistent with the spec also keeps them consistent with the .insn
> directive where we could even use the names directly, i.e.
> 
>  .insn s OP_IMM, 6, x3, 0(a0)
> 
> > > 
> > > When the assembler as insn_r I hope it will validate that
> 
> I meant insn_s here, which would be the macro for '.insn s'
> 
> > > (imm & 0xfe0) == imm
> 
> I played with it. It won't do what we want for prefetch, only
> what works for s-type instructions in general, i.e. it allows
> +/-2047 offsets and fails for everything else. That's good enough.
> We can just mask off the low 5 bits here in our macro
> 
>  #define PREFETCH_w(base, offset) \
> INSN_S(OPCODE_OP_IMM, FUNC3(6), IMM_11_5((offset) & ~0x1f), \
>__IMM_4_0(0), RS1(base), __RS2(3))
> 
> Thanks,
> drew
> 



Re: [RFC PATCH 01/14] mm/slub: don't try to dereference invalid freepointers

2023-09-15 Thread Kees Cook
On Fri, Sep 15, 2023 at 10:59:20AM +, Matteo Rizzo wrote:
> slab_free_freelist_hook tries to read a freelist pointer from the
> current object even when freeing a single object. This is invalid
> because single objects don't actually contain a freelist pointer when
> they're freed and the memory contains other data. This causes problems
> for checking the integrity of freelist in get_freepointer.
> 
> Signed-off-by: Matteo Rizzo 
> ---
>  mm/slub.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index f7940048138c..a7dae207c2d2 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1820,7 +1820,9 @@ static inline bool slab_free_freelist_hook(struct 
> kmem_cache *s,
>  
>   do {
>   object = next;
> - next = get_freepointer(s, object);
> + /* Single objects don't actually contain a freepointer */
> + if (object != old_tail)
> + next = get_freepointer(s, object);
>  
>   /* If object's reuse doesn't have to be delayed */
>   if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {

I find this loop's logic a bit weird, but, yes, this ends up being
correct and avoids needless work.

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: [RFC PATCH 02/14] mm/slub: add is_slab_addr/is_slab_page helpers

2023-09-15 Thread Kees Cook
On Fri, Sep 15, 2023 at 10:59:21AM +, Matteo Rizzo wrote:
> From: Jann Horn 
> 
> This is refactoring in preparation for adding two different
> implementations (for SLAB_VIRTUAL enabled and disabled).
> 
> virt_to_folio(x) expands to _compound_head(virt_to_page(x)) and
> virt_to_head_page(x) also expands to _compound_head(virt_to_page(x))
> 
> so PageSlab(virt_to_head_page(res)) should be equivalent to
> is_slab_addr(res).

Perhaps add a note that redundant calls to virt_to_folio() will be
removed in following patches?

> 
> Signed-off-by: Jann Horn 
> Co-developed-by: Matteo Rizzo 
> Signed-off-by: Matteo Rizzo 
> ---
>  include/linux/slab.h | 1 +
>  kernel/resource.c| 2 +-
>  mm/slab.h| 9 +
>  mm/slab_common.c | 5 ++---
>  mm/slub.c| 6 +++---
>  5 files changed, 16 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 8228d1276a2f..a2d82010d269 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -793,4 +793,5 @@ int slab_dead_cpu(unsigned int cpu);
>  #define slab_dead_cpuNULL
>  #endif
>  
> +#define is_slab_addr(addr) folio_test_slab(virt_to_folio(addr))
>  #endif   /* _LINUX_SLAB_H */
> diff --git a/kernel/resource.c b/kernel/resource.c
> index b1763b2fd7ef..c829e5f97292 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -158,7 +158,7 @@ static void free_resource(struct resource *res)
>* buddy and trying to be smart and reusing them eventually in
>* alloc_resource() overcomplicates resource handling.
>*/
> - if (res && PageSlab(virt_to_head_page(res)))
> + if (res && is_slab_addr(res))
>   kfree(res);
>  }
>  
> diff --git a/mm/slab.h b/mm/slab.h
> index 799a315695c6..25e41dd6087e 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -169,6 +169,15 @@ static_assert(IS_ALIGNED(offsetof(struct slab, 
> freelist), sizeof(freelist_aba_t)
>   */
>  #define slab_page(s) folio_page(slab_folio(s), 0)
>  
> +/**
> + * is_slab_page - Checks if a page is really a slab page
> + * @s: The slab
> + *
> + * Checks if s points to a slab page.
> + *
> + * Return: true if s points to a slab and false otherwise.
> + */
> +#define is_slab_page(s) folio_test_slab(slab_folio(s))
>  /*
>   * If network-based swap is enabled, sl*b must keep track of whether pages
>   * were allocated from pfmemalloc reserves.
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index e99e821065c3..79102d24f099 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1063,7 +1063,7 @@ void kfree(const void *object)
>   return;
>  
>   folio = virt_to_folio(object);
> - if (unlikely(!folio_test_slab(folio))) {
> + if (unlikely(!is_slab_addr(object))) {
>   free_large_kmalloc(folio, (void *)object);
>   return;
>   }
> @@ -1094,8 +1094,7 @@ size_t __ksize(const void *object)
>   return 0;
>  
>   folio = virt_to_folio(object);
> -
> - if (unlikely(!folio_test_slab(folio))) {
> + if (unlikely(!is_slab_addr(object))) {
>   if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE))
>   return 0;
>   if (WARN_ON(object != folio_address(folio)))

In the above 2 hunks we're doing virt_to_folio() twice, but I see in
patch 4 that these go away.

> diff --git a/mm/slub.c b/mm/slub.c
> index a7dae207c2d2..b69916ab7aa8 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1259,7 +1259,7 @@ static int check_slab(struct kmem_cache *s, struct slab 
> *slab)
>  {
>   int maxobj;
>  
> - if (!folio_test_slab(slab_folio(slab))) {
> + if (!is_slab_page(slab)) {
>   slab_err(s, slab, "Not a valid slab page");
>   return 0;
>   }
> @@ -1454,7 +1454,7 @@ static noinline bool alloc_debug_processing(struct 
> kmem_cache *s,
>   return true;
>  
>  bad:
> - if (folio_test_slab(slab_folio(slab))) {
> + if (is_slab_page(slab)) {
>   /*
>* If this is a slab page then lets do the best we can
>* to avoid issues in the future. Marking all objects
> @@ -1484,7 +1484,7 @@ static inline int free_consistency_checks(struct 
> kmem_cache *s,
>   return 0;
>  
>   if (unlikely(s != slab->slab_cache)) {
> - if (!folio_test_slab(slab_folio(slab))) {
> + if (!is_slab_page(slab)) {
>   slab_err(s, slab, "Attempt to free object(0x%p) outside 
> of slab",
>object);
>   } else if (!slab->slab_cache) {
> -- 
> 2.42.0.459.ge4e396fd5e-goog

This all looks nice and mechanical. :P

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: [RFC PATCH 03/14] mm/slub: move kmem_cache_order_objects to slab.h

2023-09-15 Thread Kees Cook
On Fri, Sep 15, 2023 at 10:59:22AM +, Matteo Rizzo wrote:
> From: Jann Horn 
> 
> This is refactoring for SLAB_VIRTUAL. The implementation needs to know
> the order of the virtual memory region allocated to each slab to know
> how much physical memory to allocate when the slab is reused. We reuse
> kmem_cache_order_objects for this, so we have to move it before struct
> slab.
> 
> Signed-off-by: Jann Horn 

Yay mechanical changes.

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: [RFC PATCH 04/14] mm: use virt_to_slab instead of folio_slab

2023-09-15 Thread Kees Cook
On Fri, Sep 15, 2023 at 10:59:23AM +, Matteo Rizzo wrote:
> From: Jann Horn 
> 
> This is refactoring in preparation for the introduction of SLAB_VIRTUAL
> which does not implement folio_slab.
> 
> With SLAB_VIRTUAL there is no longer a 1:1 correspondence between slabs
> and pages of physical memory used by the slab allocator. There is no way
> to look up the slab which corresponds to a specific page of physical
> memory without iterating over all slabs or over the page tables. Instead
> of doing that, we can look up the slab starting from its virtual address
> which can still be performed cheaply with both SLAB_VIRTUAL enabled and
> disabled.
> 
> Signed-off-by: Jann Horn 

Refactoring continues to track.

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: [RFC PATCH 05/14] mm/slub: create folio_set/clear_slab helpers

2023-09-15 Thread Kees Cook
On Fri, Sep 15, 2023 at 10:59:24AM +, Matteo Rizzo wrote:
> From: Jann Horn 
> 
> This is refactoring in preparation for SLAB_VIRTUAL. Extract this code
> to separate functions so that it's not duplicated in the code that
> allocates and frees page with SLAB_VIRTUAL enabled.
> 
> Signed-off-by: Jann Horn 
> Co-developed-by: Matteo Rizzo 
> Signed-off-by: Matteo Rizzo 
> ---
>  mm/slub.c | 32 ++--
>  1 file changed, 22 insertions(+), 10 deletions(-)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index ad33d9e1601d..9b87afade125 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1849,6 +1849,26 @@ static void *setup_object(struct kmem_cache *s, void 
> *object)
>  /*
>   * Slab allocation and freeing
>   */
> +
> +static void folio_set_slab(struct folio *folio, struct slab *slab)
> +{
> + __folio_set_slab(folio);
> + /* Make the flag visible before any changes to folio->mapping */
> + smp_wmb();
> +
> + if (folio_is_pfmemalloc(folio))
> + slab_set_pfmemalloc(slab);
> +}
> +
> +static void folio_clear_slab(struct folio *folio, struct slab *slab)
> +{
> + __slab_clear_pfmemalloc(slab);
> + folio->mapping = NULL;
> + /* Make the mapping reset visible before clearing the flag */
> + smp_wmb();
> + __folio_clear_slab(folio);
> +}

Perhaps these should be explicitly marked as inlines?

> +
>  static inline struct slab *alloc_slab_page(gfp_t flags, int node,
>   struct kmem_cache_order_objects oo)
>  {
> @@ -1865,11 +1885,7 @@ static inline struct slab *alloc_slab_page(gfp_t 
> flags, int node,
>   return NULL;
>  
>   slab = folio_slab(folio);
> - __folio_set_slab(folio);
> - /* Make the flag visible before any changes to folio->mapping */
> - smp_wmb();
> - if (folio_is_pfmemalloc(folio))
> - slab_set_pfmemalloc(slab);
> + folio_set_slab(folio, slab);
>  
>   return slab;
>  }
> @@ -2067,11 +2083,7 @@ static void __free_slab(struct kmem_cache *s, struct 
> slab *slab)
>   int order = folio_order(folio);
>   int pages = 1 << order;
>  
> - __slab_clear_pfmemalloc(slab);
> - folio->mapping = NULL;
> - /* Make the mapping reset visible before clearing the flag */
> - smp_wmb();
> - __folio_clear_slab(folio);
> + folio_clear_slab(folio, slab);
>   mm_account_reclaimed_pages(pages);
>   unaccount_slab(slab, order, s);
>   __free_pages(&folio->page, order);
> -- 
> 2.42.0.459.ge4e396fd5e-goog

Otherwise this is a straight function extraction.

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: [RFC PATCH 06/14] mm/slub: pass additional args to alloc_slab_page

2023-09-15 Thread Kees Cook
On Fri, Sep 15, 2023 at 10:59:25AM +, Matteo Rizzo wrote:
> From: Jann Horn 
> 
> This is refactoring in preparation for SLAB_VIRTUAL.
> 
> The implementation of SLAB_VIRTUAL needs access to struct kmem_cache in
> alloc_slab_page in order to take unused slabs from the slab freelist,
> which is per-cache.
> 
> In addition to that it passes two different sets of GFP flags.
> meta_gfp_flags is used for the memory backing the metadata region and
> page tables, and gfp_flags for the data memory.
> 
> Signed-off-by: Jann Horn 

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: [RFC PATCH 08/14] security: introduce CONFIG_SLAB_VIRTUAL

2023-09-15 Thread Kees Cook
On Fri, Sep 15, 2023 at 10:59:27AM +, Matteo Rizzo wrote:
> From: Jann Horn 
> 
> SLAB_VIRTUAL is a mitigation for the SLUB allocator which prevents reuse
> of virtual addresses across different slab caches and therefore makes
> some types of use-after-free bugs unexploitable.
> 
> SLAB_VIRTUAL is incompatible with KASAN and we believe it's not worth
> adding support for it. This is because SLAB_VIRTUAL and KASAN are aimed
> at two different use cases: KASAN is meant for catching bugs as early as
> possible in debug/fuzz/testing builds, and it's not meant to be used in
> production. SLAB_VIRTUAL on the other hand is an exploit mitigation that
> doesn't attempt to highlight bugs but instead tries to make them
> unexploitable. It doesn't make sense to enable it in debugging builds or
> during fuzzing, and instead we expect that it will be enabled in
> production kernels.
> 
> SLAB_VIRTUAL is not currently compatible with KFENCE, removing this
> limitation is future work.
> 
> Signed-off-by: Jann Horn 

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: [RFC PATCH 07/14] mm/slub: pass slab pointer to the freeptr decode helper

2023-09-15 Thread Kees Cook
On Fri, Sep 15, 2023 at 10:59:26AM +, Matteo Rizzo wrote:
> From: Jann Horn 
> 
> This is refactoring in preparation for checking freeptrs for corruption
> inside freelist_ptr_decode().
> 
> Signed-off-by: Jann Horn 
> Co-developed-by: Matteo Rizzo 
> Signed-off-by: Matteo Rizzo 
> ---
>  mm/slub.c | 43 +++
>  1 file changed, 23 insertions(+), 20 deletions(-)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index eaa1256aff89..42e7cc0b4452 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -383,7 +383,8 @@ static inline freeptr_t freelist_ptr_encode(const struct 
> kmem_cache *s,
>  }
>  
>  static inline void *freelist_ptr_decode(const struct kmem_cache *s,
> - freeptr_t ptr, unsigned long ptr_addr)
> + freeptr_t ptr, unsigned long ptr_addr,
> + struct slab *slab)

Can't "s" be dropped in these cases, since "s" should be
"slab->kmem_cache"?

>  {
>   void *decoded;
>  
> @@ -395,7 +396,8 @@ static inline void *freelist_ptr_decode(const struct 
> kmem_cache *s,
>   return decoded;
>  }
>  
> -static inline void *get_freepointer(struct kmem_cache *s, void *object)
> +static inline void *get_freepointer(struct kmem_cache *s, void *object,
> + struct slab *slab)
>  {
>   unsigned long ptr_addr;
>   freeptr_t p;
> @@ -403,7 +405,7 @@ static inline void *get_freepointer(struct kmem_cache *s, 
> void *object)
>   object = kasan_reset_tag(object);
>   ptr_addr = (unsigned long)object + s->offset;
>   p = *(freeptr_t *)(ptr_addr);
> - return freelist_ptr_decode(s, p, ptr_addr);
> + return freelist_ptr_decode(s, p, ptr_addr, slab);
>  }
>  
>  #ifndef CONFIG_SLUB_TINY
> @@ -424,18 +426,19 @@ static void prefetch_freepointer(const struct 
> kmem_cache *s, void *object)
>   * get_freepointer_safe() returns initialized memory.
>   */
>  __no_kmsan_checks
> -static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
> +static inline void *get_freepointer_safe(struct kmem_cache *s, void *object,
> +  struct slab *slab)
>  {
>   unsigned long freepointer_addr;
>   freeptr_t p;
>  
>   if (!debug_pagealloc_enabled_static())
> - return get_freepointer(s, object);
> + return get_freepointer(s, object, slab);
>  
>   object = kasan_reset_tag(object);
>   freepointer_addr = (unsigned long)object + s->offset;
>   copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p));
> - return freelist_ptr_decode(s, p, freepointer_addr);
> + return freelist_ptr_decode(s, p, freepointer_addr, slab);
>  }
>  
>  static inline void set_freepointer(struct kmem_cache *s, void *object, void 
> *fp)
> @@ -627,7 +630,7 @@ static void __fill_map(unsigned long *obj_map, struct 
> kmem_cache *s,
>  
>   bitmap_zero(obj_map, slab->objects);
>  
> - for (p = slab->freelist; p; p = get_freepointer(s, p))
> + for (p = slab->freelist; p; p = get_freepointer(s, p, slab))
>   set_bit(__obj_to_index(s, addr, p), obj_map);
>  }
>  
> @@ -937,7 +940,7 @@ static void print_trailer(struct kmem_cache *s, struct 
> slab *slab, u8 *p)
>   print_slab_info(slab);
>  
>   pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
> -p, p - addr, get_freepointer(s, p));
> +p, p - addr, get_freepointer(s, p, slab));
>  
>   if (s->flags & SLAB_RED_ZONE)
>   print_section(KERN_ERR, "Redzone  ", p - s->red_left_pad,
> @@ -1230,7 +1233,7 @@ static int check_object(struct kmem_cache *s, struct 
> slab *slab,
>   return 1;
>  
>   /* Check free pointer validity */
> - if (!check_valid_pointer(s, slab, get_freepointer(s, p))) {
> + if (!check_valid_pointer(s, slab, get_freepointer(s, p, slab))) {
>   object_err(s, slab, p, "Freepointer corrupt");
>   /*
>* No choice but to zap it and thus lose the remainder
> @@ -1298,7 +1301,7 @@ static int on_freelist(struct kmem_cache *s, struct 
> slab *slab, void *search)
>   break;
>   }
>   object = fp;
> - fp = get_freepointer(s, object);
> + fp = get_freepointer(s, object, slab);
>   nr++;
>   }
>  
> @@ -1810,7 +1813,7 @@ static inline bool slab_free_freelist_hook(struct 
> kmem_cache *s,
>   object = next;
>   /* Single objects don't actually contain a freepointer */
>   if (object != old_tail)
> - next = get_freepointer(s, object);
> + next = get_freepointer(s, object, virt_to_slab(object));
>  
>   /* If object's reuse doesn't have to be delayed */
>   if (!slab_free_hook(s, object, slab_want_init_on_free(s))) {
> @@ -2161,7 +2164,7 @@ static void *alloc_single_from_partial(struct 
> kmem_cache *s

Re: [RFC PATCH 09/14] mm/slub: add the slab freelists to kmem_cache

2023-09-15 Thread Kees Cook
On Fri, Sep 15, 2023 at 10:59:28AM +, Matteo Rizzo wrote:
> From: Jann Horn 
> 
> With SLAB_VIRTUAL enabled, unused slabs which still have virtual memory
> allocated to them but no physical memory are kept in a per-cache list so
> that they can be reused later if the cache needs to grow again.
> 
> Signed-off-by: Jann Horn 

Looks appropriately #ifdef'ed...

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: [RFC PATCH 10/14] x86: Create virtual memory region for SLUB

2023-09-15 Thread Kees Cook
On Fri, Sep 15, 2023 at 10:59:29AM +, Matteo Rizzo wrote:
> From: Jann Horn 
> 
> SLAB_VIRTUAL reserves 512 GiB of virtual memory and uses them for both
> struct slab and the actual slab memory. The pointers returned by
> kmem_cache_alloc will point to this range of memory.

I think the 512 GiB limit may be worth mentioning in the Kconfig help
text.

And in the "640K is enough for everything" devil's advocacy, why is 512
GiB enough here? Is there any greater risk of a pathological allocation
pattern breaking a system any more (or less) than is currently possible?
> 
> Signed-off-by: Jann Horn 

But, yes, I'm still a fan, and I think it interacts well here with the
rest of the KASLR initialization:

Reviewed-by: Kees Cook 

Have you tried to make this work on arm64? I imagine it should be
roughly as easy?

-- 
Kees Cook


Re: [RFC PATCH 11/14] mm/slub: allocate slabs from virtual memory

2023-09-15 Thread Kees Cook
On Fri, Sep 15, 2023 at 10:59:30AM +, Matteo Rizzo wrote:
> From: Jann Horn 
> 
> This is the main implementation of SLAB_VIRTUAL. With SLAB_VIRTUAL
> enabled, slab memory is not allocated from the linear map but from a
> dedicated region of virtual memory. The code ensures that once a range
> of virtual addresses is assigned to a slab cache, that virtual memory is
> never reused again except for other slabs in that same cache. This lets
> us mitigate some exploits for use-after-free vulnerabilities where the
> attacker makes SLUB release a slab page to the page allocator and then
> makes it reuse that same page for a different slab cache ("cross-cache
> attacks").
> 
> With SLAB_VIRTUAL enabled struct slab no longer overlaps struct page but
> instead it is allocated from a dedicated region of virtual memory. This
> makes it possible to have references to slabs whose physical memory has
> been freed.
> 
> SLAB_VIRTUAL has a small performance overhead, about 1-2% on kernel
> compilation time. We are using 4 KiB pages to map slab pages and slab
> metadata area, instead of the 2 MiB pages that the kernel uses to map
> the physmap. We experimented with a version of the patch that uses 2 MiB
> pages and we did see some performance improvement but the code also
> became much more complicated and ugly because we would need to allocate
> and free multiple slabs at once.

I think these hints about performance should be also noted in the
Kconfig help.

> In addition to the TLB contention, SLAB_VIRTUAL also adds new locks to
> the slow path of the allocator. Lock contention also contributes to the
> performance penalty to some extent, and this is more visible on machines
> with many CPUs.
> 
> Signed-off-by: Jann Horn 
> Co-developed-by: Matteo Rizzo 
> Signed-off-by: Matteo Rizzo 
> ---
>  arch/x86/include/asm/page_64.h  |  10 +
>  arch/x86/include/asm/pgtable_64_types.h |   5 +
>  arch/x86/mm/physaddr.c  |  10 +
>  include/linux/slab.h|   7 +
>  init/main.c |   1 +
>  mm/slab.h   | 106 ++
>  mm/slab_common.c|   4 +
>  mm/slub.c   | 439 +++-
>  mm/usercopy.c   |  12 +-
>  9 files changed, 587 insertions(+), 7 deletions(-)

Much of this needs review by MM people -- I can't speak well to the
specifics of the implementation. On coding style, I wonder if we can get
away with reducing the amount of #ifdef code by either using "if
(IS_ENABLED(...)) { ... }" style code, or, in the case of the allocation
function, splitting it out into two separate files, one for standard
page allocator, and one for the new virt allocator. But, again, MM
preferences reign. :)
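
For instance, an IS_ENABLED()-style split might look roughly like this
(hypothetical helper name, purely to illustrate the style being suggested,
not code from the series):

 static struct slab *get_slab_for(struct folio *folio)
 {
         if (IS_ENABLED(CONFIG_SLAB_VIRTUAL))
                 return slab_virtual_lookup(folio);      /* assumed helper */

         return folio_slab(folio);
 }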

-- 
Kees Cook


Re: [RFC PATCH 12/14] mm/slub: introduce the deallocated_pages sysfs attribute

2023-09-15 Thread Kees Cook
On Fri, Sep 15, 2023 at 10:59:31AM +, Matteo Rizzo wrote:
> From: Jann Horn 
> 
> When SLAB_VIRTUAL is enabled this new sysfs attribute tracks the number
> of slab pages whose physical memory has been reclaimed but whose virtual
> memory is still allocated to a kmem_cache.
> 
> Signed-off-by: Jann Horn 

Yay stats. :)

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: [RFC PATCH 13/14] mm/slub: sanity-check freepointers

2023-09-15 Thread Kees Cook
On Fri, Sep 15, 2023 at 10:59:32AM +, Matteo Rizzo wrote:
> From: Jann Horn 
> 
> Sanity-check that:
>  - non-NULL freepointers point into the slab
>  - freepointers look plausibly aligned
> 
> Signed-off-by: Jann Horn 
> Co-developed-by: Matteo Rizzo 
> Signed-off-by: Matteo Rizzo 
> ---
>  lib/slub_kunit.c |  4 
>  mm/slab.h|  8 +++
>  mm/slub.c| 57 
>  3 files changed, 69 insertions(+)
> 
> diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c
> index d4a3730b08fa..acf8600bd1fd 100644
> --- a/lib/slub_kunit.c
> +++ b/lib/slub_kunit.c
> @@ -45,6 +45,10 @@ static void test_clobber_zone(struct kunit *test)
>  #ifndef CONFIG_KASAN
>  static void test_next_pointer(struct kunit *test)
>  {
> + if (IS_ENABLED(CONFIG_SLAB_VIRTUAL))
> + kunit_skip(test,
> + "incompatible with freepointer corruption detection in 
> CONFIG_SLAB_VIRTUAL");
> +
>   struct kmem_cache *s = test_kmem_cache_create("TestSlub_next_ptr_free",
>   64, SLAB_POISON);
>   u8 *p = kmem_cache_alloc(s, GFP_KERNEL);
> diff --git a/mm/slab.h b/mm/slab.h
> index 460c802924bd..8d10a011bdf0 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -79,6 +79,14 @@ struct slab {
>  
>   struct list_head flush_list_elem;
>  
> + /*
> +  * Not in kmem_cache because it depends on whether the allocation is
> +  * normal order or fallback order.
> +  * an alternative might be to over-allocate virtual memory for
> +  * fallback-order pages.
> +  */
> + unsigned long align_mask;
> +
>   /* Replaces the page lock */
>   spinlock_t slab_lock;
>  
> diff --git a/mm/slub.c b/mm/slub.c
> index 0f7f5bf0b174..57474c8a6569 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -392,6 +392,44 @@ static inline freeptr_t freelist_ptr_encode(const struct 
> kmem_cache *s,
>   return (freeptr_t){.v = encoded};
>  }
>  
> +/*
> + * Does some validation of freelist pointers. Without SLAB_VIRTUAL this is
> + * currently a no-op.
> + */
> +static inline bool freelist_pointer_corrupted(struct slab *slab, freeptr_t 
> ptr,
> + void *decoded)
> +{
> +#ifdef CONFIG_SLAB_VIRTUAL
> + /*
> +  * If the freepointer decodes to 0, use 0 as the slab_base so that
> +  * the check below always passes (0 & slab->align_mask == 0).
> +  */
> + unsigned long slab_base = decoded ? (unsigned long)slab_to_virt(slab)
> + : 0;
> +
> + /*
> +  * This verifies that the SLUB freepointer does not point outside the
> +  * slab. Since at that point we can basically do it for free, it also
> +  * checks that the pointer alignment looks vaguely sane.
> +  * However, we probably don't want the cost of a proper division here,
> +  * so instead we just do a cheap check whether the bottom bits that are
> +  * clear in the size are also clear in the pointer.
> +  * So for kmalloc-32, it does a perfect alignment check, but for
> +  * kmalloc-192, it just checks that the pointer is a multiple of 32.
> +  * This should probably be reconsidered - is this a good tradeoff, or
> +  * should that part be thrown out, or do we want a proper accurate
> +  * alignment check (and can we make it work with acceptable performance
> +  * cost compared to the security improvement - probably not)?

Is it really that much more expensive to check the alignment exactly?
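
For comparison, just the alignment part (standalone sketch with made-up
helper names, not the patch's code):

	/*
	 * Cheap check as in the patch: bits that are zero in the object
	 * size must also be zero in the offset from the slab base. Exact
	 * for power-of-two sizes, only approximate otherwise.
	 */
	static bool offset_plausibly_aligned(unsigned long base,
					     unsigned long ptr,
					     unsigned long size)
	{
		unsigned long low_mask = (size & -size) - 1;

		return ((ptr - base) & low_mask) == 0;
	}

	/*
	 * Exact check: the offset must be a multiple of the object size,
	 * which needs a division (or a multiply by a precomputed
	 * reciprocal).
	 */
	static bool offset_exactly_aligned(unsigned long base,
					   unsigned long ptr,
					   unsigned long size)
	{
		return (ptr - base) % size == 0;
	}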

> +  */
> + return CHECK_DATA_CORRUPTION(
> + ((unsigned long)decoded & slab->align_mask) != slab_base,
> + "bad freeptr (encoded %lx, ptr %p, base %lx, mask %lx",
> + ptr.v, decoded, slab_base, slab->align_mask);
> +#else
> + return false;
> +#endif
> +}
> +
>  static inline void *freelist_ptr_decode(const struct kmem_cache *s,
>   freeptr_t ptr, unsigned long ptr_addr,
>   struct slab *slab)
> @@ -403,6 +441,10 @@ static inline void *freelist_ptr_decode(const struct 
> kmem_cache *s,
>  #else
>   decoded = (void *)ptr.v;
>  #endif
> +
> + if (unlikely(freelist_pointer_corrupted(slab, ptr, decoded)))
> + return NULL;
> +
>   return decoded;
>  }
>  
> @@ -2122,6 +2164,21 @@ static struct slab *get_free_slab(struct kmem_cache *s,
>   if (slab == NULL)
>   return NULL;
>  
> + /*
> +  * Bits that must be equal to start-of-slab address for all
> +  * objects inside the slab.
> +  * For compatibility with pointer tagging (like in HWASAN), this would
> +  * need to clear the pointer tag bits from the mask.
> +  */
> + slab->align_mask = ~((PAGE_SIZE << oo_order(oo)) - 1);
> +
> + /*
> +  * Object alignment bits (must be zero, which is equal to the bits in
> +  * the start-of-slab address)
> +  */
> + if (s->red_left_pad == 0)
> + slab->align_mask |= (1 << (ffs(s->size) - 1)) - 1

Re: [RFC PATCH 14/14] security: add documentation for SLAB_VIRTUAL

2023-09-15 Thread Kees Cook
On Fri, Sep 15, 2023 at 10:59:33AM +, Matteo Rizzo wrote:
> From: Jann Horn 
> 
> Document what SLAB_VIRTUAL is trying to do, how it's implemented, and
> why.
> 
> Signed-off-by: Jann Horn 
> Co-developed-by: Matteo Rizzo 
> Signed-off-by: Matteo Rizzo 
> ---
>  Documentation/security/self-protection.rst | 102 +
>  1 file changed, 102 insertions(+)
> 
> diff --git a/Documentation/security/self-protection.rst 
> b/Documentation/security/self-protection.rst
> index 910668e665cb..5a5e99e3f244 100644
> --- a/Documentation/security/self-protection.rst
> +++ b/Documentation/security/self-protection.rst
> @@ -314,3 +314,105 @@ To help kill classes of bugs that result in kernel 
> addresses being
>  written to userspace, the destination of writes needs to be tracked. If
>  the buffer is destined for userspace (e.g. seq_file backed ``/proc`` files),
>  it should automatically censor sensitive values.
> +
> +
> +Memory Allocator Mitigations
> +============================
> +
> +Protection against cross-cache attacks (SLAB_VIRTUAL)
> +-----------------------------------------------------
> +
> +SLAB_VIRTUAL is a mitigation that deterministically prevents cross-cache
> +attacks.
> +
> +Linux Kernel use-after-free vulnerabilities are commonly exploited by turning
> +them into an object type confusion (having two active pointers of different
> +types to the same memory location) using one of the following techniques:
> +
> +1. Direct object reuse: make the kernel give the victim object back to the slab
> +   allocator, then allocate the object again from the same slab cache as a
> +   different type. This is only possible if the victim object resides in a slab
> +   cache which can contain objects of different types - for example one of the
> +   kmalloc caches.
> +2. "Cross-cache attack": make the kernel give the victim object back to the slab
> +   allocator, then make the slab allocator give the page containing the object
> +   back to the page allocator, then either allocate the page directly as some
> +   other type of page or make the slab allocator allocate it again for a
> +   different slab cache and allocate an object from there.

I feel like adding a link to
https://googleprojectzero.blogspot.com/2021/10/how-simple-linux-kernel-memory.html
would be nice here, as some folks reading this may not understand how
plausible the second attack can be. :)

> +
> +In either case, the important part is that the same virtual address is reused
> +for two objects of different types.
> +
> +The first case can be addressed by separating objects of different types
> +into different slab caches. If a slab cache only contains objects of the
> +same type then directly turning a use-after-free into a type confusion is
> +impossible as long as the slab page that contains the victim object remains
> +assigned to that slab cache. This type of mitigation is easily bypassable
> +by cross-cache attacks: if the attacker can make the slab allocator return
> +the page containing the victim object to the page allocator and then make
> +it use the same page for a different slab cache, type confusion becomes
> +possible again. Addressing the first case is therefore only worthwhile if
> +cross-cache attacks are also addressed. AUTOSLAB uses a combination of

I think you mean CONFIG_RANDOM_KMALLOC_CACHES, not AUTOSLAB which isn't
upstream.

> +probabilistic mitigations for this. SLAB_VIRTUAL addresses the second case
> +deterministically by changing the way the slab allocator allocates memory.
> +
> +Preventing slab virtual address reuse
> +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> +
> +In theory there is an easy fix against cross-cache attacks: modify the slab
> +allocator so that it never gives memory back to the page allocator. In practice
> +this would be problematic because physical memory remains permanently assigned
> +to a slab cache even if it doesn't contain any active objects. A viable
> +cross-cache mitigation must allow the system to reclaim unused physical memory.
> +In the current design of the slab allocator there is no way
> +to keep a region of virtual memory permanently assigned to a slab cache without
> +also permanently reserving physical memory. That is because the virtual
> +addresses that the slab allocator uses come from the linear map region, where
> +there is a 1:1 correspondence between virtual and physical addresses.
> +
> +SLAB_VIRTUAL's solution is to create a dedicated virtual memory region that is
> +only used for slab memory, and to enforce that once a range of virtual addresses
> +is used for a slab cache, it is never reused for any other caches. Using a
> +dedicated region of virtual memory lets us reserve ranges of virtual addresses
> +to prevent cross-cache attacks and at the same time release physical memory back
> +to the system when it's no longer needed. This is what Chromium's PartitionAlloc
> +does in userspace
> +(https://chromium.goo

Re: [RFC PATCH 10/14] x86: Create virtual memory region for SLUB

2023-09-15 Thread Dave Hansen
On 9/15/23 14:13, Kees Cook wrote:
> On Fri, Sep 15, 2023 at 10:59:29AM +, Matteo Rizzo wrote:
>> From: Jann Horn 
>>
>> SLAB_VIRTUAL reserves 512 GiB of virtual memory and uses them for both
>> struct slab and the actual slab memory. The pointers returned by
>> kmem_cache_alloc will point to this range of memory.
> 
> I think the 512 GiB limit may be worth mentioning in the Kconfig help
> text.

Yes, please.

> And in the "640K is enough for everything" devil's advocacy, why is 512
> GiB enough here? Is there any greater risk of a pathological allocation
> pattern breaking a system any more (or less) than is currently possible?

I have the feeling folks just grabbed the first big-ish chunk they saw
free in the memory map and stole that one.  Not a horrible approach,
mind you, but I suspect it didn't go through the most rigorous sizing
procedure. :)

My laptop memory is ~6% consumed by slab, 90% of which is reclaimable.
If a 64TB system had the same ratio, it would bump into this 512GB
limit.  But it _should_ just reclaim things earlier rather than falling over.
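
(Rough arithmetic on those numbers: 64TiB * 6% is ~3.9TiB of slab,
about 8x the 512GiB region, while the unreclaimable ~10% of it is only
~400GiB.)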

That said, we still have gobs of actual vmalloc() space.  It's ~30TiB in
size and I'm not aware of anyone consuming anywhere near that much.  If
the 512GB fills up somehow, there are other places to steal the space.

One minor concern is that the virtual area is the same size on 4 and
5-level paging systems.  It might be a good idea to pick one of the
holes that actually gets bigger on 5-level systems.



Re: [RFC PATCH 11/14] mm/slub: allocate slabs from virtual memory

2023-09-15 Thread Dave Hansen
On 9/15/23 03:59, Matteo Rizzo wrote:
> + spin_lock_irqsave(&slub_kworker_lock, irq_flags);
> + list_splice_init(&slub_tlbflush_queue, &local_queue);
> + list_for_each_entry(slab, &local_queue, flush_list_elem) {
> + unsigned long start = (unsigned long)slab_to_virt(slab);
> + unsigned long end = start + PAGE_SIZE *
> + (1UL << oo_order(slab->oo));
> +
> + if (start < addr_start)
> + addr_start = start;
> + if (end > addr_end)
> + addr_end = end;
> + }
> + spin_unlock_irqrestore(&slub_kworker_lock, irq_flags);
> +
> + if (addr_start < addr_end)
> + flush_tlb_kernel_range(addr_start, addr_end);

I assume that the TLB flushes in the queue are going to be pretty sparse
on average.

At least on x86, flush_tlb_kernel_range() falls back pretty quickly from
individual address invalidation to just doing a full flush.  It might
not even be worth tracking the address ranges; it may be simpler to just
do a full flush every time.

I'd be really curious to see how often actual ranged flushes are
triggered from this code.  I expect it would be very near zero.
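
Something like this is what I'm thinking of (untested sketch, just
reusing the queue/lock names from the patch):

	static void slub_tlbflush_worker(struct work_struct *work)
	{
		unsigned long irq_flags;
		LIST_HEAD(local_queue);

		spin_lock_irqsave(&slub_kworker_lock, irq_flags);
		list_splice_init(&slub_tlbflush_queue, &local_queue);
		spin_unlock_irqrestore(&slub_kworker_lock, irq_flags);

		/* No range tracking: one global flush covers everything queued. */
		if (!list_empty(&local_queue))
			flush_tlb_all();

		/* ...then hand each queued slab's physical pages back as before... */
	}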


Re: [PATCH V11 03/17] riscv: Use Zicbop in arch_xchg when available

2023-09-15 Thread Conor Dooley
On Fri, Sep 15, 2023 at 01:42:56PM +0100, Conor Dooley wrote:
> 
> > > If this isn't being used in a similar manner, then the w has no reason
> > > to be in the odd lowercase form.
> > 
> > Other than to be consistent... However, the CBO_* instructions are not
> > consistent with the rest of macros. If we don't need lowercase for any
> > reason, then my preference would be to bite the bullet and change all the
> > callsites of CBO_* macros and then introduce this new instruction as
> > PREFETCH_W
> 
> Aye, I probably should've done it to begin with. Maybe there was some
> other consideration at the time.

FWIW, I sent a patch for this earlier today. I figure you saw it, Drew,
but nonetheless:
https://lore.kernel.org/all/20230915-aloe-dollar-99493746@spud/




Re: [PATCH V11 03/17] riscv: Use Zicbop in arch_xchg when available

2023-09-15 Thread Leonardo Bras
On Fri, Sep 15, 2023 at 08:36:31PM +0800, Guo Ren wrote:
> On Wed, Sep 13, 2023 at 4:50 PM Leonardo Bras  wrote:
> >
> > On Sun, Sep 10, 2023 at 04:28:57AM -0400, guo...@kernel.org wrote:
> > > From: Guo Ren 
> > >
> > > Cache-block prefetch instructions are HINTs to the hardware to
> > > indicate that software intends to perform a particular type of
> > > memory access in the near future. Enable ARCH_HAS_PREFETCHW and
> > > improve the arch_xchg for qspinlock xchg_tail.
> > >
> > > Signed-off-by: Guo Ren 
> > > Signed-off-by: Guo Ren 
> > > ---
> > >  arch/riscv/Kconfig | 15 +++
> > >  arch/riscv/include/asm/cmpxchg.h   |  4 +++-
> > >  arch/riscv/include/asm/hwcap.h |  1 +
> > >  arch/riscv/include/asm/insn-def.h  |  5 +
> > >  arch/riscv/include/asm/processor.h | 13 +
> > >  arch/riscv/kernel/cpufeature.c |  1 +
> > >  6 files changed, 38 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > > index e9ae6fa232c3..2c346fe169c1 100644
> > > --- a/arch/riscv/Kconfig
> > > +++ b/arch/riscv/Kconfig
> > > @@ -617,6 +617,21 @@ config RISCV_ISA_ZICBOZ
> > >
> > >  If you don't know what to do here, say Y.
> > >
> > > +config RISCV_ISA_ZICBOP
> > > + bool "Zicbop extension support for cache block prefetch"
> > > + depends on MMU
> > > + depends on RISCV_ALTERNATIVE
> > > + default y
> > > + help
> > > +Adds support to dynamically detect the presence of the ZICBOP
> > > +extension (Cache Block Prefetch Operations) and enable its
> > > +usage.
> > > +
> > > +The Zicbop extension can be used to prefetch cache block for
> > > +read/write/instruction fetch.
> > > +
> > > +If you don't know what to do here, say Y.
> > > +
> > >  config TOOLCHAIN_HAS_ZIHINTPAUSE
> > >   bool
> > >   default y
> > > diff --git a/arch/riscv/include/asm/cmpxchg.h 
> > > b/arch/riscv/include/asm/cmpxchg.h
> > > index 702725727671..56eff7a9d2d2 100644
> > > --- a/arch/riscv/include/asm/cmpxchg.h
> > > +++ b/arch/riscv/include/asm/cmpxchg.h
> > > @@ -11,6 +11,7 @@
> > >
> > >  #include 
> > >  #include 
> > > +#include 
> > >
> > >  #define __arch_xchg_masked(prepend, append, r, p, n) \
> > >  ({   \
> > > @@ -25,6 +26,7 @@
> > >   \
> > >   __asm__ __volatile__ (  \
> > >  prepend  \
> > > +PREFETCHW_ASM(%5)\
> > >  "0:  lr.w %0, %2\n"  \
> > >  "and  %1, %0, %z4\n" \
> > >  "or   %1, %1, %z3\n" \
> > > @@ -32,7 +34,7 @@
> > >  "bnez %1, 0b\n"  \
> > >  append   \
> > >  : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))   \
> > > -: "rJ" (__newx), "rJ" (~__mask)  \
> > > +: "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b) \
> > >  : "memory"); \
> > >   \
> > >   r = (__typeof__(*(p)))((__retx & __mask) >> __s);   \
> > > diff --git a/arch/riscv/include/asm/hwcap.h 
> > > b/arch/riscv/include/asm/hwcap.h
> > > index b7b58258f6c7..78b7b8b53778 100644
> > > --- a/arch/riscv/include/asm/hwcap.h
> > > +++ b/arch/riscv/include/asm/hwcap.h
> > > @@ -58,6 +58,7 @@
> > >  #define RISCV_ISA_EXT_ZICSR  40
> > >  #define RISCV_ISA_EXT_ZIFENCEI   41
> > >  #define RISCV_ISA_EXT_ZIHPM  42
> > > +#define RISCV_ISA_EXT_ZICBOP 43
> > >
> > >  #define RISCV_ISA_EXT_MAX64
> > >
> > > diff --git a/arch/riscv/include/asm/insn-def.h 
> > > b/arch/riscv/include/asm/insn-def.h
> > > index 6960beb75f32..dc590d331894 100644
> > > --- a/arch/riscv/include/asm/insn-def.h
> > > +++ b/arch/riscv/include/asm/insn-def.h
> > > @@ -134,6 +134,7 @@
> > >
> > >  #define RV_OPCODE_MISC_MEM   RV_OPCODE(15)
> > >  #define RV_OPCODE_SYSTEM RV_OPCODE(115)
> > > +#define RV_OPCODE_PREFETCH   RV_OPCODE(19)
> > >
> > >  #define HFENCE_VVMA(vaddr, asid) \
> > >   INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(17),  \
> > > @@ -196,4 +197,8 @@
> > >   INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),  \
> > >  RS1(base), SIMM12(4))
> > >
> > > +#define CBO_prefetchw(base)  \
> > > + INSN_R(OPCODE_PREFETCH, FUNC3(6), FUNC7(0), \
> > > +RD(x0), RS1(base), RS2(x0))
> > > +
> >
> > I underst