from:"IBM"

Re: [PATCH v4 01/13] powerpc/rtas: Add for_each_rtas_function() iterator

2023-11-20 Thread IBM

Nathan Lynch via B4 Relay 
writes:

> From: Nathan Lynch 
>
> Add a convenience macro for iterating over every element of the
> internal function table and convert the one site that can use it. An
> additional user of the macro is anticipated in changes to follow.
>

Reviewed-by: Aneesh Kumar K.V (IBM) 

> Signed-off-by: Nathan Lynch 
> ---
>  arch/powerpc/kernel/rtas.c | 9 +++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index eddc031c4b95..1ad1869e2e96 100644
> --- a/arch/powerpc/kernel/rtas.c
> +++ b/arch/powerpc/kernel/rtas.c
> @@ -454,6 +454,11 @@ static struct rtas_function rtas_function_table[] 
> __ro_after_init = {
>   },
>  };
>  
> +#define for_each_rtas_function(funcp)   \
> + for (funcp = &rtas_function_table[0];   \
> +  funcp < &rtas_function_table[ARRAY_SIZE(rtas_function_table)]; \
> +  ++funcp)
> +
>  /*
>   * Nearly all RTAS calls need to be serialized. All uses of the
>   * default rtas_args block must hold rtas_lock.
> @@ -525,10 +530,10 @@ static DEFINE_XARRAY(rtas_token_to_function_xarray);
>  
>  static int __init rtas_token_to_function_xarray_init(void)
>  {
> + const struct rtas_function *func;
>   int err = 0;
>  
> - for (size_t i = 0; i < ARRAY_SIZE(rtas_function_table); ++i) {
> - const struct rtas_function *func = &rtas_function_table[i];
> + for_each_rtas_function(func) {
>   const s32 token = func->token;
>  
>   if (token == RTAS_UNKNOWN_SERVICE)
>
> -- 
> 2.41.0

Re: [PATCH v4 02/13] powerpc/rtas: Fall back to linear search on failed token->function lookup

2023-11-20 Thread IBM

Nathan Lynch via B4 Relay 
writes:

> From: Nathan Lynch 
>
> Enabling any of the powerpc:rtas_* tracepoints at boot is likely to
> result in an oops on RTAS platforms. For example, booting a QEMU
> pseries model with 'trace_event=powerpc:rtas_input' in the command
> line leads to:
>
>   BUG: Kernel NULL pointer dereference on read at 0x0008
>   Oops: Kernel access of bad area, sig: 7 [#1]
>   NIP [c004231c] do_enter_rtas+0x1bc/0x460
>   LR [c004231c] do_enter_rtas+0x1bc/0x460
>   Call Trace:
> do_enter_rtas+0x1bc/0x460 (unreliable)
> rtas_call+0x22c/0x4a0
> rtas_get_boot_time+0x80/0x14c
> read_persistent_clock64+0x124/0x150
> read_persistent_wall_and_boot_offset+0x28/0x58
> timekeeping_init+0x70/0x348
> start_kernel+0xa0c/0xc1c
> start_here_common+0x1c/0x20
>
> (This is preceded by a warning for the failed lookup in
> rtas_token_to_function().)
>
> This happens when __do_enter_rtas_trace() attempts a token to function
> descriptor lookup before the xarray containing the mappings has been
> set up.
>
> Fall back to linear scan of the table if rtas_token_to_function_xarray
> is empty.
>

Reviewed-by: Aneesh Kumar K.V (IBM) 

> Signed-off-by: Nathan Lynch 
> Fixes: 24098f580e2b ("powerpc/rtas: add tracepoints around RTAS entry")
> ---
>  arch/powerpc/kernel/rtas.c | 18 ++
>  1 file changed, 14 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index 1ad1869e2e96..f0051881348a 100644
> --- a/arch/powerpc/kernel/rtas.c
> +++ b/arch/powerpc/kernel/rtas.c
> @@ -557,11 +557,21 @@ static const struct rtas_function 
> *rtas_token_to_function(s32 token)
>   return NULL;
>  
>   func = xa_load(&rtas_token_to_function_xarray, token);
> + if (func)
> + return func;
> + /*
> +  * Fall back to linear scan in case the reverse mapping hasn't
> +  * been initialized yet.
> +  */
> + if (xa_empty(&rtas_token_to_function_xarray)) {
> + for_each_rtas_function(func) {
> + if (func->token == token)
> + return func;
> + }
> + }
>  
> - if (WARN_ONCE(!func, "unexpected failed lookup for token %d", token))
> - return NULL;
> -
> - return func;
> + WARN_ONCE(true, "unexpected failed lookup for token %d", token);
> + return NULL;
>  }
>  
>  /* This is here deliberately so it's only used in this file */
>
> -- 
> 2.41.0

Re: [PATCH v4 03/13] powerpc/rtas: Add function return status constants

2023-11-20 Thread IBM

Nathan Lynch via B4 Relay 
writes:

> From: Nathan Lynch 
>
> Not all of the generic RTAS function statuses specified in PAPR have
> symbolic constants and descriptions in rtas.h. Fix this, providing a
> little more background, slightly updating the existing wording, and
> improving the formatting.
>

Reviewed-by: Aneesh Kumar K.V (IBM) 

> Signed-off-by: Nathan Lynch 
> ---
>  arch/powerpc/include/asm/rtas.h | 25 +++--
>  1 file changed, 19 insertions(+), 6 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
> index c697c3c74694..b73010583a8d 100644
> --- a/arch/powerpc/include/asm/rtas.h
> +++ b/arch/powerpc/include/asm/rtas.h
> @@ -201,12 +201,25 @@ typedef struct {
>  /* Memory set aside for sys_rtas to use with calls that need a work area. */
>  #define RTAS_USER_REGION_SIZE (64 * 1024)
>  
> -/* RTAS return status codes */
> -#define RTAS_HARDWARE_ERROR  -1/* Hardware Error */
> -#define RTAS_BUSY-2/* RTAS Busy */
> -#define RTAS_INVALID_PARAMETER   -3/* Invalid 
> indicator/domain/sensor etc. */
> -#define RTAS_EXTENDED_DELAY_MIN  9900
> -#define RTAS_EXTENDED_DELAY_MAX  9905
> +/*
> + * Common RTAS function return values, derived from the table "RTAS
> + * Status Word Values" in PAPR+ 7.2.8: "Return Codes". If a function
> + * can return a value in this table then generally it has the meaning
> + * listed here. More extended commentary in the documentation for
> + * rtas_call().
> + *
> + * RTAS functions may use negative and positive numbers not in this
> + * set for function-specific error and success conditions,
> + * respectively.
> + */
> +#define RTAS_SUCCESS 0 /* Success. */
> +#define RTAS_HARDWARE_ERROR -1 /* Hardware or other unspecified 
> error. */
> +#define RTAS_BUSY   -2 /* Retry immediately. */
> +#define RTAS_INVALID_PARAMETER  -3 /* Invalid 
> indicator/domain/sensor etc. */
> +#define RTAS_UNEXPECTED_STATE_CHANGE-7 /* Seems limited to EEH and slot 
> reset. */
> +#define RTAS_EXTENDED_DELAY_MIN   9900 /* Retry after delaying for ~1ms. 
> */
> +#define RTAS_EXTENDED_DELAY_MAX   9905 /* Retry after delaying for 
> ~100s. */
> +#define RTAS_ML_ISOLATION_ERROR  -9000 /* Multi-level isolation error. */
>  
>  /* statuses specific to ibm,suspend-me */
>  #define RTAS_SUSPEND_ABORTED 9000 /* Suspension aborted */
>
> -- 
> 2.41.0

Re: [PATCH v4 04/13] powerpc/rtas: Factor out function descriptor lookup

2023-11-20 Thread IBM

Nathan Lynch via B4 Relay 
writes:

> From: Nathan Lynch 
>
> Move the function descriptor table lookup out of rtas_function_token()
> into a separate routine for use in new code to follow. No functional
> change.
>

Reviewed-by: Aneesh Kumar K.V (IBM) 

> Signed-off-by: Nathan Lynch 
> ---
>  arch/powerpc/kernel/rtas.c | 31 +++
>  1 file changed, 19 insertions(+), 12 deletions(-)
>
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index f0051881348a..1fc0b3fffdd1 100644
> --- a/arch/powerpc/kernel/rtas.c
> +++ b/arch/powerpc/kernel/rtas.c
> @@ -469,29 +469,36 @@ static struct rtas_function rtas_function_table[] 
> __ro_after_init = {
>  static DEFINE_RAW_SPINLOCK(rtas_lock);
>  static struct rtas_args rtas_args;
>  
> -/**
> - * rtas_function_token() - RTAS function token lookup.
> - * @handle: Function handle, e.g. RTAS_FN_EVENT_SCAN.
> - *
> - * Context: Any context.
> - * Return: the token value for the function if implemented by this platform,
> - * otherwise RTAS_UNKNOWN_SERVICE.
> - */
> -s32 rtas_function_token(const rtas_fn_handle_t handle)
> +static struct rtas_function *rtas_function_lookup(const rtas_fn_handle_t 
> handle)
>  {
>   const size_t index = handle.index;
>   const bool out_of_bounds = index >= ARRAY_SIZE(rtas_function_table);
>  
>   if (WARN_ONCE(out_of_bounds, "invalid function index %zu", index))
> - return RTAS_UNKNOWN_SERVICE;
> + return NULL;
>   /*
>* Various drivers attempt token lookups on non-RTAS
>* platforms.
>*/
>   if (!rtas.dev)
> - return RTAS_UNKNOWN_SERVICE;
> + return NULL;
> +
> + return &rtas_function_table[index];
> +}
> +
> +/**
> + * rtas_function_token() - RTAS function token lookup.
> + * @handle: Function handle, e.g. RTAS_FN_EVENT_SCAN.
> + *
> + * Context: Any context.
> + * Return: the token value for the function if implemented by this platform,
> + * otherwise RTAS_UNKNOWN_SERVICE.
> + */
> +s32 rtas_function_token(const rtas_fn_handle_t handle)
> +{
> + const struct rtas_function *func = rtas_function_lookup(handle);
>  
> - return rtas_function_table[index].token;
> + return func ? func->token : RTAS_UNKNOWN_SERVICE;
>  }
>  EXPORT_SYMBOL_GPL(rtas_function_token);
>  
>
> -- 
> 2.41.0

Re: [PATCH v4 05/13] powerpc/rtas: Facilitate high-level call sequences

2023-11-20 Thread IBM

Nathan Lynch via B4 Relay 
writes:

> From: Nathan Lynch 
>
> On RTAS platforms there is a general restriction that the OS must not
> enter RTAS on more than one CPU at a time. This low-level
> serialization requirement is satisfied by holding a spin
> lock (rtas_lock) across most RTAS function invocations.
>
> However, some pseries RTAS functions require multiple successive calls
> to complete a logical operation. Beginning a new call sequence for such a
> function may disrupt any other sequences of that function already in
> progress. Safe and reliable use of these functions effectively
> requires higher-level serialization beyond what is already done at the
> level of RTAS entry and exit.
>
> Where a sequence-based RTAS function is invoked only through
> sys_rtas(), with no in-kernel users, there is no issue as far as the
> kernel is concerned. User space is responsible for appropriately
> serializing its call sequences. (Whether user space code actually
> takes measures to prevent sequence interleaving is another matter.)
> Examples of such functions currently include ibm,platform-dump and
> ibm,get-vpd.
>
> But where a sequence-based RTAS function has both user space and
> in-kernel uesrs, there is a hazard. Even if the in-kernel call sites
> of such a function serialize their sequences correctly, a user of
> sys_rtas() can invoke the same function at any time, potentially
> disrupting a sequence in progress.
>
> So in order to prevent disruption of kernel-based RTAS call sequences,
> they must serialize not only with themselves but also with sys_rtas()
> users, somehow. Preferably without adding global locks or adding more
> function-specific hacks to sys_rtas(). This is a prerequisite for
> adding an in-kernel call sequence of ibm,get-vpd, which is in a change
> to follow.
>
> Note that it has never been feasible for the kernel to prevent
> sys_rtas()-based sequences from being disrupted because control
> returns to user space on every call. sys_rtas()-based users of these
> functions have always been, and continue to be, responsible for
> coordinating their call sequences with other users, even those which
> may invoke the RTAS functions through less direct means than
> sys_rtas(). This is an unavoidable consequence of exposing
> sequence-based RTAS functions through sys_rtas().
>
> * Add new rtas_function_lock() and rtas_function_unlock() APIs for use
>   with sequence-based RTAS functions.
>
> * Add an optional per-function mutex to struct rtas_function. When this
>   member is set, kernel-internal callers of the RTAS function are
>   required to guard their call sequences with rtas_function_lock() and
>   rtas_function_unlock(). This requirement will be enforced in a later
>   change, after all affected call sites are updated.
>
> * Populate the lock members of function table entries where
>   serialization of call sequences is known to be necessary, along with
>   justifying commentary.
>
> * In sys_rtas(), acquire the per-function mutex when it is present.
>
> There should be no perceivable change introduced here except that
> concurrent callers of the same RTAS function via sys_rtas() may block
> on a mutex instead of spinning on rtas_lock. Changes to follow will
> add rtas_function_lock()/unlock() pairs to kernel-based call
> sequences.
>

Can you add an example of the last part. I did look at to find 06 to
find the details 

rtas_function_lock(RTAS_FN_IBM_ACTIVATE_FIRMWARE);

    do {
fwrc = rtas_call(token, 0, 1, NULL);
} while (rtas_busy_delay(fwrc));

rtas_function_unlock(RTAS_FN_IBM_ACTIVATE_FIRMWARE);

Reviewed-by: Aneesh Kumar K.V (IBM) 

>
> Signed-off-by: Nathan Lynch 
> ---
>  arch/powerpc/include/asm/rtas.h |   2 +
>  arch/powerpc/kernel/rtas.c  | 101 
> +++-
>  2 files changed, 101 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
> index b73010583a8d..9a20caba6858 100644
> --- a/arch/powerpc/include/asm/rtas.h
> +++ b/arch/powerpc/include/asm/rtas.h
> @@ -421,6 +421,8 @@ static inline bool rtas_function_implemented(const 
> rtas_fn_handle_t handle)
>  {
>   return rtas_function_token(handle) != RTAS_UNKNOWN_SERVICE;
>  }
> +void rtas_function_lock(rtas_fn_handle_t handle);
> +void rtas_function_unlock(rtas_fn_handle_t handle);
>  extern int rtas_token(const char *service);
>  extern int rtas_service_present(const char *service);
>  extern int rtas_call(int token, int, int, int *, ...);
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index 1fc0b3fffdd1..52f2242d0c28 100644
> --- a/arch/powerpc/kernel/rtas.c
>

Re: [PATCH v4 06/13] powerpc/rtas: Serialize firmware activation sequences

2023-11-20 Thread IBM

Nathan Lynch via B4 Relay 
writes:

> From: Nathan Lynch 
>
> Use the function lock API to prevent interleaving call sequences of
> the ibm,activate-firmware RTAS function, which typically requires
> multiple calls to complete the update. While the spec does not
> specifically prohibit interleaved sequences, there's almost certainly
> no advantage to allowing them.
>

Can we document what is the equivalent thing the userspace does? 

Reviewed-by: Aneesh Kumar K.V (IBM) 

> Signed-off-by: Nathan Lynch 
> ---
>  arch/powerpc/kernel/rtas.c | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index 52f2242d0c28..e38ba05ad613 100644
> --- a/arch/powerpc/kernel/rtas.c
> +++ b/arch/powerpc/kernel/rtas.c
> @@ -1753,10 +1753,14 @@ void rtas_activate_firmware(void)
>   return;
>   }
>  
> + rtas_function_lock(RTAS_FN_IBM_ACTIVATE_FIRMWARE);
> +
>   do {
>   fwrc = rtas_call(token, 0, 1, NULL);
>   } while (rtas_busy_delay(fwrc));
>  
> + rtas_function_unlock(RTAS_FN_IBM_ACTIVATE_FIRMWARE);
> +
>   if (fwrc)
>   pr_err("ibm,activate-firmware failed (%i)\n", fwrc);
>  }
>
> -- 
> 2.41.0

Re: [PATCH v4 07/13] powerpc/rtas: Warn if per-function lock isn't held

2023-11-20 Thread IBM

Nathan Lynch via B4 Relay 
writes:

> From: Nathan Lynch 
>
> If the function descriptor has a populated lock member, then callers
> are required to hold it across calls. Now that the firmware activation
> sequence is appropriately guarded, we can warn when the requirement
> isn't satisfied.
>
> __do_enter_rtas_trace() gets reorganized a bit as a result of
> performing the function descriptor lookup unconditionally now.
>

Reviewed-by: Aneesh Kumar K.V (IBM) 

> Signed-off-by: Nathan Lynch 
> ---
>  arch/powerpc/kernel/rtas.c | 21 +
>  1 file changed, 9 insertions(+), 12 deletions(-)
>
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index e38ba05ad613..deb6289fcf9c 100644
> --- a/arch/powerpc/kernel/rtas.c
> +++ b/arch/powerpc/kernel/rtas.c
> @@ -685,28 +685,25 @@ static void __do_enter_rtas(struct rtas_args *args)
>  
>  static void __do_enter_rtas_trace(struct rtas_args *args)
>  {
> - const char *name = NULL;
> + struct rtas_function *func = 
> rtas_token_to_function(be32_to_cpu(args->token));
>  
> - if (args == &rtas_args)
> - lockdep_assert_held(&rtas_lock);
>   /*
> -  * If the tracepoints that consume the function name aren't
> -  * active, avoid the lookup.
> +  * If there is a per-function lock, it must be held by the
> +  * caller.
>*/
> - if ((trace_rtas_input_enabled() || trace_rtas_output_enabled())) {
> - const s32 token = be32_to_cpu(args->token);
> - const struct rtas_function *func = 
> rtas_token_to_function(token);
> + if (func->lock)
> + WARN_ON(!mutex_is_locked(func->lock));
>  
> - name = func->name;
> - }
> + if (args == &rtas_args)
> + lockdep_assert_held(&rtas_lock);
>  
> - trace_rtas_input(args, name);
> + trace_rtas_input(args, func->name);
>   trace_rtas_ll_entry(args);
>  
>   __do_enter_rtas(args);
>  
>   trace_rtas_ll_exit(args);
> - trace_rtas_output(args, name);
> + trace_rtas_output(args, func->name);
>  }
>  
>  static void do_enter_rtas(struct rtas_args *args)
>
> -- 
> 2.41.0

Re: [PATCH] powerpc: add crtsavres.o to always-y instead of extra-y

2023-11-21 Thread IBM

"Nicholas Piggin"  writes:

> On Tue Nov 21, 2023 at 9:23 AM AEST, Masahiro Yamada wrote:
>> crtsavres.o is linked to modules. However, as explained in commit
>> d0e628cd817f ("kbuild: doc: clarify the difference between extra-y
>> and always-y"), 'make modules' does not build extra-y.
>>
>> For example, the following command fails:
>>
>>   $ make ARCH=powerpc LLVM=1 KBUILD_MODPOST_WARN=1 mrproper ps3_defconfig 
>> modules
>> [snip]
>> LD [M]  arch/powerpc/platforms/cell/spufs/spufs.ko
>>   ld.lld: error: cannot open arch/powerpc/lib/crtsavres.o: No such file or 
>> directory
>>   make[3]: *** [scripts/Makefile.modfinal:56: 
>> arch/powerpc/platforms/cell/spufs/spufs.ko] Error 1
>>   make[2]: *** [Makefile:1844: modules] Error 2
>>   make[1]: *** [/home/masahiro/workspace/linux-kbuild/Makefile:350: 
>> __build_one_by_one] Error 2
>>   make: *** [Makefile:234: __sub-make] Error 2
>>
>
> Thanks. Is this the correct Fixes tag?
>
> Fixes: d0e628cd817f ("powerpc/64: Do not link crtsavres.o in vmlinux")
>

I am finding a different commit ID:

commit baa25b571a168aff5a13bfdc973f1229e2b12b63
Author: Nicholas Piggin 
Date:   Fri May 12 01:56:49 2017 +1000

powerpc/64: Do not link crtsavres.o in vmlinux
 
The 64-bit linker creates save/restore functions on demand with final
links, so vmlinux does not require crtsavres.o.
 

-aneesh

Re: [PATCH v2 12/16] mm/vmemmap optimization: Split hugetlb and devdax vmemmap optimization

2023-06-27 Thread IBM

"Aneesh Kumar K.V"  writes:

> Arm disabled hugetlb vmemmap optimization [1] because hugetlb vmemmap
> optimization includes an update of both the permissions (writeable to
> read-only) and the output address (pfn) of the vmemmap ptes. That is not
> supported without unmapping of pte(marking it invalid) by some
> architectures.
>
> With DAX vmemmap optimization we don't require such pte updates and
> architectures can enable DAX vmemmap optimization while having hugetlb
> vmemmap optimization disabled. Hence split DAX optimization support into a
> different config.
>
> loongarch and riscv don't have devdax support. So the DAX config is not
> enabled for them. With this change, arm64 should be able to select DAX
> optimization
>
> [1] commit 060a2c92d1b6 ("arm64: mm: hugetlb: Disable 
> HUGETLB_PAGE_OPTIMIZE_VMEMMAP")
>
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/loongarch/Kconfig | 2 +-
>  arch/riscv/Kconfig | 2 +-
>  arch/x86/Kconfig   | 3 ++-
>  fs/Kconfig | 2 +-
>  include/linux/mm.h | 2 +-
>  mm/Kconfig | 5 -
>  6 files changed, 10 insertions(+), 6 deletions(-)

what about s390?

git grep "ARCH_WANT_OPTIMIZE_VMEMMAP" .
arch/s390/Kconfig:  select ARCH_WANT_OPTIMIZE_VMEMMAP

> diff --git a/mm/Kconfig b/mm/Kconfig
> index 7672a22647b4..7b388c10baab 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -461,7 +461,10 @@ config SPARSEMEM_VMEMMAP
>  # Select this config option from the architecture Kconfig, if it is preferred
>  # to enable the feature of HugeTLB/dev_dax vmemmap optimization.
>  #
> -config ARCH_WANT_OPTIMIZE_VMEMMAP
> +config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
> + bool
> +
> +config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
>   bool
>  
>  config HAVE_MEMBLOCK_PHYS_MAP
> -- 
> 2.40.1

-ritesh

Re: [PATCH v2 13/16] powerpc/book3s64/mm: Enable transparent pud hugepage

2023-06-27 Thread IBM

"Aneesh Kumar K.V"  writes:

These are just some minor nits in case you are going to send another
revision.

> This is enabled only with radix translation and 1G hugepage size. This will
> be used with devdax device memory with a namespace alignment of 1G.
>
> Anon transparent hugepage is not supported even though we do have helpers
> checking pud_trans_huge(). We should never find that return true. The only
> expected pte bit combination is _PAGE_PTE | _PAGE_DEVMAP.
>
> Some of the helpers are never expected to get called on hash translation
> and hence is marked to call BUG() in such a case.
>
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/include/asm/book3s/64/pgtable.h  | 156 --
>  arch/powerpc/include/asm/book3s/64/radix.h|  37 +
>  .../include/asm/book3s/64/tlbflush-radix.h|   2 +
>  arch/powerpc/include/asm/book3s/64/tlbflush.h |   8 +
>  arch/powerpc/mm/book3s64/pgtable.c|  78 +
>  arch/powerpc/mm/book3s64/radix_pgtable.c  |  28 
>  arch/powerpc/mm/book3s64/radix_tlb.c  |   7 +
>  arch/powerpc/platforms/Kconfig.cputype|   1 +
>  include/trace/events/thp.h|  17 ++
>  9 files changed, 323 insertions(+), 11 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
> b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 4acc9690f599..9a05de007956 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -921,8 +921,29 @@ static inline pud_t pte_pud(pte_t pte)
>  {
>   return __pud_raw(pte_raw(pte));
>  }
> +
> +static inline pte_t *pudp_ptep(pud_t *pud)
> +{
> + return (pte_t *)pud;
> +}
> +
> +#define pud_pfn(pud) pte_pfn(pud_pte(pud))
> +#define pud_dirty(pud)   pte_dirty(pud_pte(pud))
> +#define pud_young(pud)   pte_young(pud_pte(pud))
> +#define pud_mkold(pud)   pte_pud(pte_mkold(pud_pte(pud)))
> +#define pud_wrprotect(pud)   pte_pud(pte_wrprotect(pud_pte(pud)))
> +#define pud_mkdirty(pud) pte_pud(pte_mkdirty(pud_pte(pud)))
> +#define pud_mkclean(pud) pte_pud(pte_mkclean(pud_pte(pud)))
> +#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud)))
> +#define pud_mkwrite(pud) pte_pud(pte_mkwrite(pud_pte(pud)))
>  #define pud_write(pud)   pte_write(pud_pte(pud))
>  
> +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
> +#define pud_soft_dirty(pmd)pte_soft_dirty(pud_pte(pud))
> +#define pud_mksoft_dirty(pmd)  pte_pud(pte_mksoft_dirty(pud_pte(pud)))
> +#define pud_clear_soft_dirty(pmd) pte_pud(pte_clear_soft_dirty(pud_pte(pud)))
> +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
> +
>  static inline int pud_bad(pud_t pud)
>  {
>   if (radix_enabled())
> @@ -1115,15 +1136,24 @@ static inline bool pmd_access_permitted(pmd_t pmd, 
> bool write)
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
> +extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot);
>  extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
>  extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
>  extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
>  pmd_t *pmdp, pmd_t pmd);
> +extern void set_pud_at(struct mm_struct *mm, unsigned long addr,
> +pud_t *pudp, pud_t pud);
> +
>  static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
>   unsigned long addr, pmd_t *pmd)
>  {
>  }
>  
> +static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
> + unsigned long addr, pud_t *pud)
> +{
> +}
> +
>  extern int hash__has_transparent_hugepage(void);
>  static inline int has_transparent_hugepage(void)
>  {
> @@ -1133,6 +1163,14 @@ static inline int has_transparent_hugepage(void)
>  }
>  #define has_transparent_hugepage has_transparent_hugepage
>  
> +static inline int has_transparent_pud_hugepage(void)
> +{
> + if (radix_enabled())
> + return radix__has_transparent_pud_hugepage();
> + return 0;
> +}
> +#define has_transparent_pud_hugepage has_transparent_pud_hugepage
> +
>  static inline unsigned long
>  pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp,
>   unsigned long clr, unsigned long set)
> @@ -1142,6 +1180,16 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned 
> long addr, pmd_t *pmdp,
>   return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set);
>  }
>  
> +static inline unsigned long
> +pud_hugepage_update(struct mm_struct *mm, unsigned long addr, pud_t *pudp,
> + unsigned long clr, unsigned long set)
> +{
> + if (radix_enabled())
> + return radix__pud_hugepage_update(mm, addr, pudp, clr, set);
> + BUG();
> + return pud_val(*pudp);
> +}
> +
>  /*
>   * returns true for pmd migration entries, THP, devmap, hugetlb
>   * But compile time dependent on THP config
> @@ -1151,6 +1199,11 @@ st

Re: [PATCH v2 14/16] powerpc/book3s64/vmemmap: Switch radix to use a different vmemmap handling function

2023-06-27 Thread IBM

"Aneesh Kumar K.V"  writes:

> This is in preparation to update radix to implement vmemmap optimization
> for devdax. Below are the rules w.r.t radix vmemmap mapping
>
> 1. First try to map things using PMD (2M)
> 2. With altmap if altmap cross-boundary check returns true, fall back to
>PAGE_SIZE
> 3. If we can't allocate PMD_SIZE backing memory for vmemmap, fallback to
>PAGE_SIZE
>
> On removing vmemmap mapping, check if every subsection that is using the
> vmemmap area is invalid. If found to be invalid, that implies we can safely
> free the vmemmap area. We don't use the PAGE_UNUSED pattern used by x86
> because with 64K page size, we need to do the above check even at the
> PAGE_SIZE granularity.
>
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/include/asm/book3s/64/radix.h |   2 +
>  arch/powerpc/include/asm/pgtable.h |   3 +
>  arch/powerpc/mm/book3s64/radix_pgtable.c   | 319 +++--
>  arch/powerpc/mm/init_64.c  |  26 +-
>  4 files changed, 319 insertions(+), 31 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
> b/arch/powerpc/include/asm/book3s/64/radix.h
> index 8cdff5a05011..87d4c1e62491 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -332,6 +332,8 @@ extern int __meminit 
> radix__vmemmap_create_mapping(unsigned long start,
>unsigned long phys);
>  int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end,
> int node, struct vmem_altmap *altmap);
> +void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
> +struct vmem_altmap *altmap);
>  extern void radix__vmemmap_remove_mapping(unsigned long start,
>   unsigned long page_size);
>  
> diff --git a/arch/powerpc/include/asm/pgtable.h 
> b/arch/powerpc/include/asm/pgtable.h
> index 9972626ddaf6..6d4cd2ebae6e 100644
> --- a/arch/powerpc/include/asm/pgtable.h
> +++ b/arch/powerpc/include/asm/pgtable.h
> @@ -168,6 +168,9 @@ static inline bool is_ioremap_addr(const void *x)
>  
>  struct seq_file;
>  void arch_report_meminfo(struct seq_file *m);
> +int __meminit vmemmap_populated(unsigned long vmemmap_addr, int 
> vmemmap_map_size);
> +bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
> +unsigned long page_size);
>  #endif /* CONFIG_PPC64 */
>  
>  #endif /* __ASSEMBLY__ */
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
> b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index d7e2dd3d4add..ef886fab643d 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -742,8 +742,57 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
>   p4d_clear(p4d);
>  }
>  
> +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned 
> long end)
> +{
> + unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
> +
> + return !vmemmap_populated(start, PMD_SIZE);
> +}
> +
> +static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned 
> long end)
> +{
> + unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
> +
> + return !vmemmap_populated(start, PAGE_SIZE);
> +
> +}
> +
> +static void __meminit free_vmemmap_pages(struct page *page,
> +  struct vmem_altmap *altmap,
> +  int order)
> +{
> + unsigned int nr_pages = 1 << order;
> +
> + if (altmap) {
> + unsigned long alt_start, alt_end;
> + unsigned long base_pfn = page_to_pfn(page);
> +
> + /*
> +  * with 1G vmemmap mmaping we can have things setup
> +  * such that even though atlmap is specified we never
> +  * used altmap.
> +  */
> + alt_start = altmap->base_pfn;
> + alt_end = altmap->base_pfn + altmap->reserve +
> + altmap->free + altmap->alloc + altmap->align;
> +
> + if (base_pfn >= alt_start && base_pfn < alt_end) {
> + vmem_altmap_free(altmap, nr_pages);
> + return;
> + }
> + }
> +
> + if (PageReserved(page)) {
> + /* allocated from memblock */
> + while (nr_pages--)
> + free_reserved_page(page++);
> + } else
> + free_pages((unsigned long)page_address(page), order);
> +}
> +
>  static void remove_pte_table(pte_t *pte_start, unsigned long addr,
> -  unsigned long end, bool direct)
> +  unsigned long end, bool direct,
> +  struct vmem_altmap *altmap)
>  {
>   unsigned long next, pages = 0;
>   pte_t *pte;
> @@ -757,24 +806,23 @@ static void remove_pte_table(pte_t *pte_start, unsigned 
> long addr,
>   if (!pte_present(*pte))
>

Re: [PATCH RFC 06/12] mm/gup: Drop folio_fast_pin_allowed() in hugepd processing

2023-11-23 Thread IBM

Peter Xu  writes:

> On Thu, Nov 23, 2023 at 06:22:33PM +, Christophe Leroy wrote:
>> > For fast-gup I think the hugepd code is in use, however for walk_page_*
>> > apis hugepd code shouldn't be reached iiuc as we have the hugetlb specific
>> > handling (walk_hugetlb_range()), so anything within walk_pgd_range() to hit
>> > a hugepd can be dead code to me (but note that this "dead code" is good
>> > stuff to me, if one would like to merge hugetlb instead into generic mm).
>> 
>> Not sure what you mean here. What do you mean by "dead code" ?
>> A hugepage directory can be plugged at any page level, from PGD to PMD.
>> So the following bit in walk_pgd_range() is valid and not dead:
>> 
>>  if (is_hugepd(__hugepd(pgd_val(*pgd
>>  err = walk_hugepd_range((hugepd_t *)pgd, addr, next, 
>> walk, PGDIR_SHIFT);
>
> IMHO it boils down to the question on whether hugepd is only used in
> hugetlbfs.  I think I already mentioned that above, but I can be more
> explicit; what I see is that from higher stack in __walk_page_range():
>
>   if (is_vm_hugetlb_page(vma)) {
>   if (ops->hugetlb_entry)
>   err = walk_hugetlb_range(start, end, walk);
>   } else
>   err = walk_pgd_range(start, end, walk);
>
> It means to me as long as the vma is hugetlb, it'll not trigger any code in
> walk_pgd_range(), but only walk_hugetlb_range().  Do you perhaps mean
> hugepd is used outside hugetlbfs?
>

walk_pgd_range also get called from walk_page_range_novma().
IIRC commit e17eae2b839937817d771e2f5d2b30e5e2b81bb7 added the hugepd
details to pagewalk code to handle ptdump.

There is also a desire to use hugepd format in vmap mappings. 
https://lore.kernel.org/linuxppc-dev/cover.1620795204.git.christophe.le...@csgroup.eu

-aneesh

Re: [PATCH v4 06/13] powerpc/rtas: Serialize firmware activation sequences

2023-11-28 Thread IBM

Nathan Lynch  writes:

> "Aneesh Kumar K.V (IBM)"  writes:
>> Nathan Lynch via B4 Relay 
>> writes:
>>
>>>
>>> Use the function lock API to prevent interleaving call sequences of
>>> the ibm,activate-firmware RTAS function, which typically requires
>>> multiple calls to complete the update. While the spec does not
>>> specifically prohibit interleaved sequences, there's almost certainly
>>> no advantage to allowing them.
>>>
>>
>> Can we document what is the equivalent thing the userspace does?
>
> I'm not sure what we would document.
>
> As best I can tell, the activate_firmware command in powerpc-utils does
> not make any effort to protect its use of the ibm,activate-firmware RTAS
> function. The command is not intended to be run manually and I guess
> it's relying on the platform's management console to serialize its
> invocations.
>
> drmgr (also from powerpc-utils) has some dead code for LPM that calls
> ibm,activate-firmware; it should probably be removed. The command uses a
> lock file to serialize all of its executions.
>
> Something that could happen with interleaved ibm,activate-firmware
> sequences is something like this:
>
> 1. Process A initiates an ibm,activate-firmware sequence and receives a
>    "retry" status (-2/990x).
> 2. Process B calls ibm,activate-firmware and receives the "done" status
>(0), concluding the sequence A began.
> 3. Process A, unaware of B, calls ibm,activate-firmware again,
>inadvertently beginning a new sequence.
>

So this patch won't protect us against a parallel userspace invocation.
We can add static bool call_in_progress to track the ongoing
ibm,activate-firmware call from userspace? My only concern is we are
adding locks to protect against parallel calls in the kernel, but at the
same time, we ignore any userspace call regarding the same. We should at
least document this if this is not important to be fixed.

-aneesh

Re: [PATCH v2] powerpc/book3s/hash: Drop _PAGE_PRIVILEGED from PAGE_NONE

2023-12-04 Thread IBM

Michael Ellerman  writes:

> "Aneesh Kumar K.V"  writes:
>> There used to be a dependency on _PAGE_PRIVILEGED with pte_savedwrite.
>> But that got dropped by
>> commit 6a56ccbcf6c6 ("mm/autonuma: use can_change_(pte|pmd)_writable() to 
>> replace savedwrite")
>>
>> With the change in this patch numa fault pte (pte_protnone()) gets mapped as 
>> regular user pte
>> with RWX cleared (no-access) whereas earlier it used to be mapped 
>> _PAGE_PRIVILEGED.
>>
>> Hash fault handling code did get some WARN_ON added because those
>> functions are not expected to get called with _PAGE_READ cleared.
>> commit 18061c17c8ec ("powerpc/mm: Update PROTFAULT handling in the page 
>> fault path")
>> explains the details.
>  
> You say "did get" which makes me think you're talking about the past.
> But I think you're referring to the WARN_ON you are adding in this patch?

That is correct. Will update this as "Hash fault handing code gets some
WARN_ON added in this patch ..." ?
>

>
>> Also revert commit 1abce0580b89 ("powerpc/64s: Fix __pte_needs_flush() false 
>> positive warning")
>
> That could be done separately as a follow-up couldn't it? Would reduce
> the diff size.
>

Will split that to a separate patch.


>> Signed-off-by: Aneesh Kumar K.V 
>> ---
>>  arch/powerpc/include/asm/book3s/64/pgtable.h  | 9 +++--
>>  arch/powerpc/include/asm/book3s/64/tlbflush.h | 9 ++---
>>  arch/powerpc/mm/book3s64/hash_utils.c | 7 +++
>>  3 files changed, 12 insertions(+), 13 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
>> b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> index cb77eddca54b..2cc58ac74080 100644
>> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
>> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> @@ -17,12 +17,6 @@
>>  #define _PAGE_EXEC  0x1 /* execute permission */
>>  #define _PAGE_WRITE 0x2 /* write access allowed */
>>  #define _PAGE_READ  0x4 /* read access allowed */
>> -#define _PAGE_NA_PAGE_PRIVILEGED
>  
>> -#define _PAGE_NAX   _PAGE_EXEC
>> -#define _PAGE_RO_PAGE_READ
>> -#define _PAGE_ROX   (_PAGE_READ | _PAGE_EXEC)
>> -#define _PAGE_RW(_PAGE_READ | _PAGE_WRITE)
>> -#define _PAGE_RWX   (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)
>  
> Those are unrelated I think?
>

If we don't require _PAGE_NA we can fallback to generic version.


>>  #define _PAGE_PRIVILEGED0x8 /* kernel access only */
>>  #define _PAGE_SAO   0x00010 /* Strong access order */
>>  #define _PAGE_NON_IDEMPOTENT0x00020 /* non idempotent memory */
>> @@ -529,6 +523,9 @@ static inline bool pte_user(pte_t pte)
>>  }
>>  
>>  #define pte_access_permitted pte_access_permitted
>> +/*
>> + * execute-only mappings return false
>> + */
>
> That would fit better in the existing comment block inside the function
> I think. Normally this location would be a function description comment.
>

Will move.

>>  static inline bool pte_access_permitted(pte_t pte, bool write)
>>  {
>>  /*
>   ie. here
>
> cheers

Thanks
-aneesh

Re: [PATCH v3] powerpc/pseries/vas: Use usleep_range() to support HCALL delay

2023-12-04 Thread IBM

Haren Myneni  writes:

> VAS allocate, modify and deallocate HCALLs returns
> H_LONG_BUSY_ORDER_1_MSEC or H_LONG_BUSY_ORDER_10_MSEC for busy
> delay and expects OS to reissue HCALL after that delay. But using
> msleep() will often sleep at least 20 msecs even though the
> hypervisor suggests OS reissue these HCALLs after 1 or 10msecs.
> The open and close VAS window functions hold mutex and then issue
> these HCALLs. So these operations can take longer than the
> necessary when multiple threads issue open or close window APIs
> simultaneously.
>
> So instead of msleep(), use usleep_range() to ensure sleep with
> the expected value before issuing HCALL again.
>

Can you summarize if there an user observable impact for the current
code? We have other code paths using msleep(get_longbusy_msec()). Should
we audit those usages?


>
> Signed-off-by: Haren Myneni 
> Suggested-by: Nathan Lynch 
>
> ---
> v1 -> v2:
> - Use usleep_range instead of using RTAS sleep routine as
>   suggested by Nathan
> v2 -> v3:
> - Sleep 10MSecs even for HCALL delay > 10MSecs and the other
>   commit / comemnt changes as suggested by Nathan and Ellerman.
> ---
>  arch/powerpc/platforms/pseries/vas.c | 25 -
>  1 file changed, 24 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/platforms/pseries/vas.c 
> b/arch/powerpc/platforms/pseries/vas.c
> index 71d52a670d95..5cf81c564d4b 100644
> --- a/arch/powerpc/platforms/pseries/vas.c
> +++ b/arch/powerpc/platforms/pseries/vas.c
> @@ -38,7 +38,30 @@ static long hcall_return_busy_check(long rc)
>  {
>   /* Check if we are stalled for some time */
>   if (H_IS_LONG_BUSY(rc)) {
> - msleep(get_longbusy_msecs(rc));
> + unsigned int ms;
> + /*
> +  * Allocate, Modify and Deallocate HCALLs returns
> +  * H_LONG_BUSY_ORDER_1_MSEC or H_LONG_BUSY_ORDER_10_MSEC
> +  * for the long delay. So the sleep time should always
> +  * be either 1 or 10msecs, but in case if the HCALL
> +  * returns the long delay > 10 msecs, clamp the sleep
> +  * time to 10msecs.
> +  */
> + ms = clamp(get_longbusy_msecs(rc), 1, 10);
> +
> + /*
> +  * msleep() will often sleep at least 20 msecs even
> +  * though the hypervisor suggests that the OS reissue
> +  * HCALLs after 1 or 10msecs. Also the delay hint from
> +  * the HCALL is just a suggestion. So OK to pause for
> +  * less time than the hinted delay. Use usleep_range()
> +  * to ensure we don't sleep much longer than actually
> +  * needed.
> +  *
> +  * See Documentation/timers/timers-howto.rst for
> +  * explanation of the range used here.
> +  */
> + usleep_range(ms * 100, ms * 1000);
>   rc = H_BUSY;
>   } else if (rc == H_BUSY) {
>   cond_resched();
> -- 
> 2.26.3

Re: [PATCH 01/12] KVM: PPC: Book3S HV nestedv2: Invalidate RPT before deleting a guest

2023-12-07 Thread IBM

Vaibhav Jain  writes:

> From: Jordan Niethe 
>
> An L0 must invalidate the L2's RPT during H_GUEST_DELETE if this has not
> already been done. This is a slow operation that means H_GUEST_DELETE
> must return H_BUSY multiple times before completing. Invalidating the
> tables before deleting the guest so there is less work for the L0 to do.
>
> Signed-off-by: Jordan Niethe 
> ---
>  arch/powerpc/include/asm/kvm_book3s.h | 1 +
>  arch/powerpc/kvm/book3s_hv.c  | 6 --
>  arch/powerpc/kvm/book3s_hv_nested.c   | 2 +-
>  3 files changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
> b/arch/powerpc/include/asm/kvm_book3s.h
> index 4f527d09c92b..a37736ed3728 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -302,6 +302,7 @@ void kvmhv_nested_exit(void);
>  void kvmhv_vm_nested_init(struct kvm *kvm);
>  long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
>  long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu);
> +void kvmhv_flush_lpid(u64 lpid);
>  void kvmhv_set_ptbl_entry(u64 lpid, u64 dw0, u64 dw1);
>  void kvmhv_release_all_nested(struct kvm *kvm);
>  long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 1ed6ec140701..5543e8490cd9 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -5691,10 +5691,12 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
>   kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
>   }
>  
> - if (kvmhv_is_nestedv2())
> + if (kvmhv_is_nestedv2()) {
> + kvmhv_flush_lpid(kvm->arch.lpid);
>   plpar_guest_delete(0, kvm->arch.lpid);
>

I am not sure I follow the optimization here. I would expect the
hypervisor to kill all the translation caches as part of guest_delete.
What is the benefit of doing a lpid flush outside the guest delete?

> - else
> + } else {
>   kvmppc_free_lpid(kvm->arch.lpid);
> + }
>  
>   kvmppc_free_pimap(kvm);
>  }
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
> b/arch/powerpc/kvm/book3s_hv_nested.c
> index 3b658b8696bc..5c375ec1a3c6 100644
> --- a/arch/powerpc/kvm/book3s_hv_nested.c
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -503,7 +503,7 @@ void kvmhv_nested_exit(void)
>   }
>  }
>  
> -static void kvmhv_flush_lpid(u64 lpid)
> +void kvmhv_flush_lpid(u64 lpid)
>  {
>   long rc;
>  
> -- 
> 2.42.0

Re: [PATCH 09/12] KVM: PPC: Book3S HV nestedv2: Do not call H_COPY_TOFROM_GUEST

2023-12-08 Thread IBM

Vaibhav Jain  writes:

> From: Jordan Niethe 
>
> H_COPY_TOFROM_GUEST is part of the nestedv1 API and so should not be
> called by a nestedv2 host. Do not attempt to call it.
>

May be we should use
firmware_has_feature(FW_FEATURE_H_COPY_TOFROM_GUEST))?

the nestedv2 can end up using the above hcall if it is supported by the
hypervisor right? In its absence we will have to translate the guest ea
using xlate and then use kvm_guest_read to read location using the guest
real address right? That xlate will also involves multiple kvm_guest_read.


> Signed-off-by: Jordan Niethe 
> ---
>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
> b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 916af6c153a5..4a1abb9f7c05 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -40,6 +40,9 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int 
> pid,
>   unsigned long quadrant, ret = n;
>   bool is_load = !!to;
>  
> + if (kvmhv_is_nestedv2())
> + return H_UNSUPPORTED;
> +
>   /* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
>   if (kvmhv_on_pseries())
>   return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
> -- 
> 2.42.0

Re: [PATCH] MAINTAINERS: powerpc: Add Aneesh & Naveen

2023-12-13 Thread IBM

Michael Ellerman  writes:

> Aneesh and Naveen are helping out with some aspects of upstream
> maintenance, add them as reviewers.
>

Acked-by: Aneesh Kumar K.V (IBM) 

> Signed-off-by: Michael Ellerman 
> ---
>  MAINTAINERS | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index ea790149af79..562d048863ee 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -12240,6 +12240,8 @@ LINUX FOR POWERPC (32-BIT AND 64-BIT)
>  M:   Michael Ellerman 
>  R:   Nicholas Piggin 
>  R:   Christophe Leroy 
> +R:   Aneesh Kumar K.V 
> +R:   Naveen N. Rao 
>  L:   linuxppc-dev@lists.ozlabs.org
>  S:   Supported
>  W:   https://github.com/linuxppc/wiki/wiki
> -- 
> 2.43.0

Re: [PATCH 1/2] radix/kfence: map __kfence_pool at page granularity

2024-04-30 Thread IBM

Hari Bathini  writes:

> When KFENCE is enabled, total system memory is mapped at page level
> granularity. But in radix MMU mode, ~3GB additional memory is needed
> to map 100GB of system memory at page level granularity when compared
> to using 2MB direct mapping. This is not desired considering KFENCE is
> designed to be enabled in production kernels [1]. Also, mapping memory
> allocated for KFENCE pool at page granularity seems sufficient enough
> to enable KFENCE support. So, allocate __kfence_pool during bootup and
> map it at page granularity instead of mapping all system memory at
> page granularity.
>
> Without patch:
> # cat /proc/meminfo
> MemTotal:   101201920 kB
>
> With patch:
> # cat /proc/meminfo
> MemTotal:   104483904 kB
>
> All kfence_test.c testcases passed with this patch.
>
> [1] https://lore.kernel.org/all/20201103175841.3495947-2-el...@google.com/
>
> Signed-off-by: Hari Bathini 
> ---
>  arch/powerpc/include/asm/kfence.h|  5 
>  arch/powerpc/mm/book3s64/radix_pgtable.c | 34 ++--
>  arch/powerpc/mm/init_64.c| 14 ++

New at this. But the patch looked interesting, hence my review comments.

>  3 files changed, 45 insertions(+), 8 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kfence.h 
> b/arch/powerpc/include/asm/kfence.h
> index 424ceef82ae6..18ec2b06ba1e 100644
> --- a/arch/powerpc/include/asm/kfence.h
> +++ b/arch/powerpc/include/asm/kfence.h
> @@ -8,6 +8,7 @@
>  #ifndef __ASM_POWERPC_KFENCE_H
>  #define __ASM_POWERPC_KFENCE_H
>  
> +#include 
>  #include 
>  #include 
>  
> @@ -15,6 +16,10 @@
>  #define ARCH_FUNC_PREFIX "."
>  #endif
>  
> +#ifdef CONFIG_KFENCE
> +extern bool kfence_early_init;
> +#endif
> +
>  static inline bool arch_kfence_init_pool(void)
>  {
>   return true;

Shouldn't we return false for !kfence_early_init?
Because otherwise, this patch may break the late init case which your
next patch is fixing, and maybe git bisect will break?


> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
> b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 15e88f1439ec..fccbf92f279b 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -31,6 +31,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  
> @@ -291,9 +292,8 @@ static unsigned long next_boundary(unsigned long addr, 
> unsigned long end)
>   return end;
>  }
>  
> -static int __meminit create_physical_mapping(unsigned long start,
> -  unsigned long end,
> -  int nid, pgprot_t _prot)
> +static int __meminit create_physical_mapping(unsigned long start, unsigned 
> long end, int nid,
> +  pgprot_t _prot, unsigned long 
> mapping_sz_limit)

lines over 80 chars.

>  {
>   unsigned long vaddr, addr, mapping_size = 0;
>   bool prev_exec, exec = false;
> @@ -301,7 +301,10 @@ static int __meminit create_physical_mapping(unsigned 
> long start,
>   int psize;
>   unsigned long max_mapping_size = memory_block_size;
>  
> - if (debug_pagealloc_enabled_or_kfence())
> + if (mapping_sz_limit < max_mapping_size)
> + max_mapping_size = mapping_sz_limit;
> +
> + if (debug_pagealloc_enabled())
>   max_mapping_size = PAGE_SIZE;
>  
>   start = ALIGN(start, PAGE_SIZE);
> @@ -358,6 +361,7 @@ static int __meminit create_physical_mapping(unsigned 
> long start,
>  
>  static void __init radix_init_pgtable(void)
>  {
> + phys_addr_t kfence_pool __maybe_unused;
>   unsigned long rts_field;
>   phys_addr_t start, end;
>   u64 i;
> @@ -365,6 +369,13 @@ static void __init radix_init_pgtable(void)
>   /* We don't support slb for radix */
>   slb_set_size(0);
>  
> +#ifdef CONFIG_KFENCE
> + if (kfence_early_init) {
> + kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);

What if memblock_phys_alloc() failed? error handling?
> + memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);
> + }
> +#endif
> +

Instead of #ifdef CONFIG_KFENCE in the function,
maybe we can define radix_kfence_alloc_pool()? Then we won't need
__maybe_unused too.

>   /*
>* Create the linear mapping
>*/
> @@ -380,10 +391,18 @@ static void __init radix_init_pgtable(void)
>   continue;
>   }
>  
> - WARN_ON(create_physical_mapping(start, end,
> - -1, PAGE_KERNEL));
> + WARN_ON(create_physical_mapping(start, end, -1, PAGE_KERNEL, 
> ~0UL));
>   }
>  
> +#ifdef CONFIG_KFENCE
> + if (kfence_early_init) {
> + create_physical_mapping(kfence_pool, kfence_pool + 
> KFENCE_POOL_SIZE, -1,
> + PAGE_KERNEL, PAGE_SIZE);

Even this can return an error. Maybe WARN_ON_ONCE()? or disabling kfence
for an error?

Re: [PATCH 2/2] radix/kfence: support late __kfence_pool allocation

2024-05-01 Thread IBM

Hari Bathini  writes:

> With commit b33f778bba5ef ("kfence: alloc kfence_pool after system
> startup"), KFENCE pool can be allocated after system startup via the
> page allocator. This can lead to problems as all memory is not mapped
> at page granularity anymore with CONFIG_KFENCE. Address this by direct
> mapping all memory at PMD level and split the mapping for PMD pages
> that overlap with __kfence_pool to page level granularity if and when
> __kfence_pool is allocated after system startup.
>
> Signed-off-by: Hari Bathini 
> ---
>  arch/powerpc/include/asm/book3s/64/radix.h |  2 +
>  arch/powerpc/include/asm/kfence.h  | 14 +-
>  arch/powerpc/mm/book3s64/radix_pgtable.c   | 50 +-
>  3 files changed, 64 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
> b/arch/powerpc/include/asm/book3s/64/radix.h
> index 8f55ff74bb68..0423ddbcf73c 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -340,6 +340,8 @@ extern void radix__vmemmap_remove_mapping(unsigned long 
> start,
>  extern int radix__map_kernel_page(unsigned long ea, unsigned long pa,
>pgprot_t flags, unsigned int psz);
>  
> +extern bool radix_kfence_init_pool(void);
> +
>  static inline unsigned long radix__get_tree_size(void)
>  {
>   unsigned long rts_field;
> diff --git a/arch/powerpc/include/asm/kfence.h 
> b/arch/powerpc/include/asm/kfence.h
> index 18ec2b06ba1e..c5d2fb2f9ecb 100644
> --- a/arch/powerpc/include/asm/kfence.h
> +++ b/arch/powerpc/include/asm/kfence.h
> @@ -18,12 +18,24 @@
>  
>  #ifdef CONFIG_KFENCE
>  extern bool kfence_early_init;
> -#endif
> +
> +static inline bool kfence_alloc_pool_late(void)
> +{
> + return !kfence_early_init;
> +}

Minor nit, but do we need kfence_alloc_pool_late()?
The function name looks confusing. Can we not just use
!kfence_early_init? If not then maybe bool kfence_late_init?

>  
>  static inline bool arch_kfence_init_pool(void)
>  {
> +#ifdef CONFIG_PPC_BOOK3S_64
> + if (radix_enabled())
> + return radix_kfence_init_pool();

Can we directly check...
if (radix_enabled() && !kfence_early_init)
... instead of embedding the check inside radix_kfence_late_init_pool()

> +#endif
> +
>   return true;
>  }
> +#else
> +static inline bool kfence_alloc_pool_late(void) { return false; }
> +#endif
>  
>  #ifdef CONFIG_PPC64
>  static inline bool kfence_protect_page(unsigned long addr, bool protect)
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
> b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index fccbf92f279b..f4374e3e31e1 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -253,6 +253,53 @@ void radix__mark_initmem_nx(void)
>  }
>  #endif /* CONFIG_STRICT_KERNEL_RWX */
>  
> +#ifdef CONFIG_KFENCE
> +static inline int radix_split_pmd_page(pmd_t *pmd, unsigned long addr)
> +{
> + pte_t *pte = pte_alloc_one_kernel(&init_mm);
> + unsigned long pfn = PFN_DOWN(__pa(addr));

Minor nit. Since addr will always be page aligned, so maybe PHYS_PFN() is better
suited. Although it does not matter.

> + int i;
> +
> + if (!pte)
> + return -ENOMEM;
> +
> + for (i = 0; i < PTRS_PER_PTE; i++) {
> + __set_pte_at(&init_mm, addr, pte + i, pfn_pte(pfn + i, 
> PAGE_KERNEL), 0);
> + asm volatile("ptesync": : :"memory");
> + }

Maybe a comment above the loop on why __set_pte_at() is ok for late
kfence init? and why not pte_update()? [1]

[1]: https://lore.kernel.org/linuxppc-dev/87y318wp9r@linux.ibm.com/


> + pmd_populate_kernel(&init_mm, pmd, pte);
> +
> + flush_tlb_kernel_range(addr, addr + PMD_SIZE);
> + return 0;
> +}
> +
> +bool radix_kfence_init_pool(void)
> +{
> + unsigned int page_psize, pmd_psize;
> + unsigned long addr;
> + pmd_t *pmd;
> +
> + if (!kfence_alloc_pool_late())
> + return true;
> +
> + page_psize = shift_to_mmu_psize(PAGE_SHIFT);
> + pmd_psize = shift_to_mmu_psize(PMD_SHIFT);
> + for (addr = (unsigned long)__kfence_pool; is_kfence_address((void 
> *)addr);
> +  addr += PAGE_SIZE) {
> + pmd = pmd_off_k(addr);
> +
> + if (pmd_leaf(*pmd)) {
> + if (radix_split_pmd_page(pmd, addr & PMD_MASK))
> + return false;
> + update_page_count(pmd_psize, -1);
> + update_page_count(page_psize, PTRS_PER_PTE);
> + }
> + }
> +
> + return true;
> +}
> +#endif
> +
>  static inline void __meminit
>  print_mapping(unsigned long start, unsigned long end, unsigned long size, 
> bool exec)
>  {
> @@ -391,7 +438,8 @@ static void __init radix_init_pgtable(void)
>   continue;
>   }
>  
> - WARN_ON(create_physical_mapping(start, end, -1, PAGE_KERNEL, 
> ~0UL));
> + WARN_ON

Re: [PATCH 2/3] powerpc/pseries: Export hardware trace macro dump via debugfs

2024-06-22 Thread IBM



This is a generic review and I haven't looked into the PAPR spec for
htmdump hcall and it's interface.

Madhavan Srinivasan  writes:

> This patch adds debugfs interface to export Hardware Trace Macro (HTM)
> function data in a LPAR. New hypervisor call "H_HTM" has been
> defined to setup, configure, control and dump the HTM data.
> This patch supports only dumping of HTM data in a LPAR.
> New debugfs folder called "htmdump" has been added under
> /sys/kernel/debug/arch path which contains files need to
> pass required parameters for the H_HTM dump function. New Kconfig
> option called "CONFIG_HTMDUMP" has been in platform/pseries for the same.
>
> With patch series applied and booted, list of files in debugfs path
>
> # pwd
> /sys/kernel/debug/powerpc/htmdump
> # ls
> coreindexonchip  htmtype  nodalchipindex  nodeindex  trace
>
> Signed-off-by: Madhavan Srinivasan 
> ---
>  arch/powerpc/platforms/pseries/Kconfig   |   8 ++
>  arch/powerpc/platforms/pseries/Makefile  |   1 +
>  arch/powerpc/platforms/pseries/htmdump.c | 130 +++
>  3 files changed, 139 insertions(+)
>  create mode 100644 arch/powerpc/platforms/pseries/htmdump.c
>
> diff --git a/arch/powerpc/platforms/pseries/Kconfig 
> b/arch/powerpc/platforms/pseries/Kconfig
> index afc0f6a61337..46c0ea605e33 100644
> --- a/arch/powerpc/platforms/pseries/Kconfig
> +++ b/arch/powerpc/platforms/pseries/Kconfig
> @@ -128,6 +128,14 @@ config CMM
> will be reused for other LPARs. The interface allows firmware to
> balance memory across many LPARs.
>
> +config HTMDUMP
> + tristate "PHYP HTM data dumper"

Not sure if we can make machine_device_initcall() as a tristate?
Did we try compiling it as a module?

It we would like to keep this as a module - then why not use module_init
call and then make it depend upon...

depends on PPC_PSERIES && DEBUG_FS (??)

> + default y

and then since this is mostly a debug trace facility, then we need not enable
it by default right?

> + help
> +   Select this option, if you want to enable the kernel debugfs
> +   interface to dump the Hardware Trace Macro (HTM) function data
> +   in the LPAR.
> +
>  config HV_PERF_CTRS
>   bool "Hypervisor supplied PMU events (24x7 & GPCI)"
>   default y
> diff --git a/arch/powerpc/platforms/pseries/Makefile 
> b/arch/powerpc/platforms/pseries/Makefile
> index 7bf506f6b8c8..3f3e3492e436 100644
> --- a/arch/powerpc/platforms/pseries/Makefile
> +++ b/arch/powerpc/platforms/pseries/Makefile
> @@ -19,6 +19,7 @@ obj-$(CONFIG_HVC_CONSOLE)   += hvconsole.o
>  obj-$(CONFIG_HVCS)   += hvcserver.o
>  obj-$(CONFIG_HCALL_STATS)+= hvCall_inst.o
>  obj-$(CONFIG_CMM)+= cmm.o
> +obj-$(CONFIG_HTMDUMP)+= htmdump.o
>  obj-$(CONFIG_IO_EVENT_IRQ)   += io_event_irq.o
>  obj-$(CONFIG_LPARCFG)+= lparcfg.o
>  obj-$(CONFIG_IBMVIO) += vio.o
> diff --git a/arch/powerpc/platforms/pseries/htmdump.c 
> b/arch/powerpc/platforms/pseries/htmdump.c
> new file mode 100644
> index ..540cdb7e069c
> --- /dev/null
> +++ b/arch/powerpc/platforms/pseries/htmdump.c
> @@ -0,0 +1,130 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (C) IBM Corporation, 2024
> + */
> +
> +#define pr_fmt(fmt) "htmdump: " fmt
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 

Do we need all of the above?
e.g. slab, memory_hotplug etc are not needed IMO.

Maybe only?

#include 
#include 
#include 
#include 

#include 
#include 

(module.h depending upon if we make it module_init())


> +
> +/* This enables us to keep track of the memory removed from each node. */
> +struct htmdump_entry {
> + void *buf;
> + struct dentry *dir;
> + char name[16];
> +};
> +
> +static u32 nodeindex = 0;
> +static u32 nodalchipindex = 0;
> +static u32 coreindexonchip = 0;
> +static u32 htmtype = 0;
> +
> +#define BUFFER_SIZE PAGE_SIZE
> +
> +static ssize_t htmdump_read(struct file *filp, char __user *ubuf,
> +  size_t count, loff_t *ppos)
> +{
> + struct htmdump_entry *ent = filp->private_data;
> + unsigned long page, read_size, available;
> + loff_t offset;
> + long rc;
> +
> + page = ALIGN_DOWN(*ppos, BUFFER_SIZE);
> + offset = (*ppos) % BUFFER_SIZE;
> +
> + rc = htm_get_dump_hardware(nodeindex, nodalchipindex, coreindexonchip,
> +htmtype, virt_to_phys(ent->buf), 
> B

Re: [PATCH 3/3] powerpc: Document details on H_HTM hcall

2024-06-22 Thread IBM

Madhavan Srinivasan  writes:

> Add documentation to 'papr_hcalls.rst' describing the
> input, output and return values of the H_HTM hcall as
> per the internal specification.
>
> Signed-off-by: Madhavan Srinivasan 
> ---
>  Documentation/arch/powerpc/papr_hcalls.rst | 11 +++
>  1 file changed, 11 insertions(+)
>
> diff --git a/Documentation/arch/powerpc/papr_hcalls.rst 
> b/Documentation/arch/powerpc/papr_hcalls.rst
> index 80d2c0aadab5..805e1cb9bab9 100644
> --- a/Documentation/arch/powerpc/papr_hcalls.rst
> +++ b/Documentation/arch/powerpc/papr_hcalls.rst
> @@ -289,6 +289,17 @@ to be issued multiple times in order to be completely 
> serviced. The
>  subsequent hcalls to the hypervisor until the hcall is completely serviced
>  at which point H_SUCCESS or other error is returned by the hypervisor.
>  
> +**H_HTM**
> +
> +| Input: flags, target, operation (op), op-param1, op-param2, op-param3
> +| Out: *dumphtmbufferdata*
> +| Return Value: *H_Success,H_Busy,H_LongBusyOrder,H_Partial,H_Parameter,
> +  H_P2,H_P3,H_P4,H_P5,H_P6,H_State,H_Not_Available,H_Authority*
> +
> +H_HTM supports setup, configuration, control and dumping of Hardware Trace
> +Macro (HTM) function and its data. HTM buffer stores tracing data for 
> functions
> +like core instruction, core LLAT and nest.
> +

Minor nit: Maybe the set of debugfs cmds to collect the trace and some
example trace log? If it is not confidential?

>  References
>  ==
>  .. [1] "Power Architecture Platform Reference"
> -- 
> 2.45.2

Re: linux-next: boot warning after merge of the vfs-brauner tree

2024-08-26 Thread IBM

Luis Chamberlain  writes:

> On Mon, Aug 26, 2024 at 02:10:49PM -0700, Darrick J. Wong wrote:
>> On Mon, Aug 26, 2024 at 01:52:54PM -0700, Luis Chamberlain wrote:
>> > On Mon, Aug 26, 2024 at 07:43:20PM +0200, Christophe Leroy wrote:
>> > > 
>> > > 
>> > > Le 26/08/2024 à 17:48, Pankaj Raghav (Samsung) a écrit :
>> > > > On Mon, Aug 26, 2024 at 05:59:31PM +1000, Stephen Rothwell wrote:
>> > > > > Hi all,
>> > > > > 
>> > > > > After merging the vfs-brauner tree, today's linux-next boot test 
>> > > > > (powerpc
>> > > > > pseries_le_defconfig) produced this warning:
>> > > > 
>> > > > iomap dio calls set_memory_ro() on the page that is used for sub block
>> > > > zeroing.
>> > > > 
>> > > > But looking at powerpc code, they don't support set_memory_ro() for
>> > > > memory region that belongs to the kernel(LINEAR_MAP_REGION_ID).
>> > > > 
>> > > > /*
>> > > >   * On hash, the linear mapping is not in the Linux page table so
>> > > >   * apply_to_existing_page_range() will have no effect. If in the 
>> > > > future
>> > > >   * the set_memory_* functions are used on the linear map this will 
>> > > > need
>> > > >   * to be updated.
>> > > >   */
>> > > > if (!radix_enabled()) {
>> > > >  int region = get_region_id(addr);
>> > > > 
>> > > >  if (WARN_ON_ONCE(region != VMALLOC_REGION_ID && region != 
>> > > > IO_REGION_ID))
>> > > >  return -EINVAL;
>> > > > }
>> > > > 
>> > > > We call set_memory_ro() on the zero page as a extra security measure.
>> > > > I don't know much about powerpc, but looking at the comment, is it just
>> > > > adding the following to support it in powerpc:
>> > > > 
>> > > > diff --git a/arch/powerpc/mm/pageattr.c b/arch/powerpc/mm/pageattr.c
>> > > > index ac22bf28086fa..e6e0b40ba6db4 100644
>> > > > --- a/arch/powerpc/mm/pageattr.c
>> > > > +++ b/arch/powerpc/mm/pageattr.c
>> > > > @@ -94,7 +94,9 @@ int change_memory_attr(unsigned long addr, int 
>> > > > numpages, long action)
>> > > >  if (!radix_enabled()) {
>> > > >  int region = get_region_id(addr);
>> > > > -   if (WARN_ON_ONCE(region != VMALLOC_REGION_ID && region 
>> > > > != IO_REGION_ID))
>> > > > +   if (WARN_ON_ONCE(region != VMALLOC_REGION_ID &&
>> > > > +region != IO_REGION_ID &&
>> > > > +region != LINEAR_MAP_REGION_ID))
>> > > >  return -EINVAL;
>> > > >  }
>> > > >   #endif
>> > > 
>> > > By doing this you will just hide the fact that it didn't work.
>> > > 
>> > > See commit 1f9ad21c3b38 ("powerpc/mm: Implement set_memory() routines") 
>> > > for
>> > > details. The linear memory region is not mapped using page tables so
>> > > set_memory_ro() will have no effect on it.
>> > > 
>> > > You can either use vmalloc'ed pages, or do a const static allocation at
>> > > buildtime so that it will be allocated in the kernel static rodata area.
>> > > 
>> > > By the way, your code should check the value returned by set_memory_ro(),
>> > > there is some work in progress to make it mandatory, see
>> > > https://github.com/KSPP/linux/issues/7
>> > 
>> > Our users expect contiguous memory [0] and so we use alloc_pages() here,
>> > so if we're architecture limitted by this I'd rather we just remove the
>> > set_memory_ro() only for PPC, I don't see why other have to skip this.

Looks like not a standard thing to do for kernel linear memory map
region then and maybe few other archs could be ignoring too?

>> 
>> Just drop it, then.
>
> OK sent a patch for that.
>

Thanks for fixing it!

-ritesh

Re: [PATCH] powerpc: Use printk instead of WARN in change_memory_attr

2024-08-27 Thread IBM

Christophe Leroy  writes:

> Le 27/08/2024 à 11:12, Ritesh Harjani (IBM) a écrit :
>> [Vous ne recevez pas souvent de courriers de ritesh.l...@gmail.com. 
>> Découvrez pourquoi ceci est important à 
>> https://aka.ms/LearnAboutSenderIdentification ]
>> 
>> Use pr_warn_once instead of WARN_ON_ONCE as discussed here [1]
>> for printing possible use of set_memory_* on linear map on Hash.
>> 
>> [1]: https://lore.kernel.org/all/877cc2fpi2.fsf@mail.lhotse/#t
>> 
>> Signed-off-by: Ritesh Harjani (IBM) 
>> ---
>>   arch/powerpc/mm/pageattr.c | 5 -
>>   1 file changed, 4 insertions(+), 1 deletion(-)
>> 
>> diff --git a/arch/powerpc/mm/pageattr.c b/arch/powerpc/mm/pageattr.c
>> index ac22bf28086f..c8c2d664c6f3 100644
>> --- a/arch/powerpc/mm/pageattr.c
>> +++ b/arch/powerpc/mm/pageattr.c
>> @@ -94,8 +94,11 @@ int change_memory_attr(unsigned long addr, int numpages, 
>> long action)
>>  if (!radix_enabled()) {
>>  int region = get_region_id(addr);
>> 
>> -   if (WARN_ON_ONCE(region != VMALLOC_REGION_ID && region != 
>> IO_REGION_ID))
>> +   if (region != VMALLOC_REGION_ID && region != IO_REGION_ID) {
>> +   pr_warn_once("%s: possible use of set_memory_* on 
>> linear map on Hash from (%ps)\n",
>> +   __func__, 
>> __builtin_return_address(0));
>
> Is it really only linear map ?
>
> What about "possible user of set_memory_* outside of vmalloc or io region.

"warning: possible user of set_memory_* outside of vmalloc and io region."

I am thinking of adding a word "warning" too. I can make above change and send 
v2.

>
> Maybe a show_stack() would also be worth it ?

IMO, since we have the caller, we need not pollute the dmesg with the
entire call stack. Besides I am not aware of dump_stack_once() style prints.

>
>
> But in principle I think it would be better to keep the WARN_ONCE until 
> we can add __must_check to set_memory_xxx() functions to be sure all 
> callers check the result, as mandated by 
> https://github.com/KSPP/linux/issues/7

Fixing the callers to check the return value is something that need not
depend on this change no?

The intention of this change is to mainly remove the heavy WARN_ON_ONCE
from powerpc specific change_memory_attr() and convert to printk warn.

-ritesh

Re: [RFC v1 01/10] book3s64/hash: Remove kfence support temporarily

2024-09-04 Thread IBM



Sorry for the delayed response. Was pulled into something else.

Christophe Leroy  writes:

> Le 31/07/2024 à 09:56, Ritesh Harjani (IBM) a écrit :
>> [Vous ne recevez pas souvent de courriers de ritesh.l...@gmail.com. 
>> Découvrez pourquoi ceci est important à 
>> https://aka.ms/LearnAboutSenderIdentification ]
>> 
>> Kfence on book3s Hash on pseries is anyways broken. It fails to boot
>> due to RMA size limitation. That is because, kfence with Hash uses
>> debug_pagealloc infrastructure. debug_pagealloc allocates linear map
>> for entire dram size instead of just kfence relevant objects.
>> This means for 16TB of DRAM it will require (16TB >> PAGE_SHIFT)
>> which is 256MB which is half of RMA region on P8.
>> crash kernel reserves 256MB and we also need 2048 * 16KB * 3 for
>> emergency stack and some more for paca allocations.
>> That means there is not enough memory for reserving the full linear map
>> in the RMA region, if the DRAM size is too big (>=16TB)
>> (The issue is seen above 8TB with crash kernel 256 MB reservation).
>> 
>> Now Kfence does not require linear memory map for entire DRAM.
>> It only needs for kfence objects. So this patch temporarily removes the
>> kfence functionality since debug_pagealloc code needs some refactoring.
>> We will bring in kfence on Hash support in later patches.
>> 
>> Signed-off-by: Ritesh Harjani (IBM) 
>> ---
>>   arch/powerpc/include/asm/kfence.h |  5 +
>>   arch/powerpc/mm/book3s64/hash_utils.c | 16 +++-
>>   2 files changed, 16 insertions(+), 5 deletions(-)
>> 
>> diff --git a/arch/powerpc/include/asm/kfence.h 
>> b/arch/powerpc/include/asm/kfence.h
>> index fab124ada1c7..f3a9476a71b3 100644
>> --- a/arch/powerpc/include/asm/kfence.h
>> +++ b/arch/powerpc/include/asm/kfence.h
>> @@ -10,6 +10,7 @@
>> 
>>   #include 
>>   #include 
>> +#include 
>> 
>>   #ifdef CONFIG_PPC64_ELF_ABI_V1
>>   #define ARCH_FUNC_PREFIX "."
>> @@ -25,6 +26,10 @@ static inline void disable_kfence(void)
>> 
>>   static inline bool arch_kfence_init_pool(void)
>>   {
>> +#ifdef CONFIG_PPC64
>> +   if (!radix_enabled())
>> +   return false;
>> +#endif
>
> Avoid #ifdefs whenever possible. Here you can do:
>
>   if (IS_ENABLED(CONFIG_PPC64) && !radix_enabled())
>   return false;
>

Sure. I will change it.

>>  return !kfence_disabled;
>>   }
>>   #endif
>> diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
>> b/arch/powerpc/mm/book3s64/hash_utils.c
>> index 01c3b4b65241..1a1b50735fa0 100644
>> --- a/arch/powerpc/mm/book3s64/hash_utils.c
>> +++ b/arch/powerpc/mm/book3s64/hash_utils.c
>> @@ -431,7 +431,7 @@ int htab_bolt_mapping(unsigned long vstart, unsigned 
>> long vend,
>>  break;
>> 
>>  cond_resched();
>> -   if (debug_pagealloc_enabled_or_kfence() &&
>> +   if (debug_pagealloc_enabled() &&
>>  (paddr >> PAGE_SHIFT) < linear_map_hash_count)
>>  linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 
>> 0x80;
>>  }
>> @@ -814,7 +814,7 @@ static void __init htab_init_page_sizes(void)
>>  bool aligned = true;
>>  init_hpte_page_sizes();
>> 
>> -   if (!debug_pagealloc_enabled_or_kfence()) {
>> +   if (!debug_pagealloc_enabled()) {
>>  /*
>>   * Pick a size for the linear mapping. Currently, we only
>>   * support 16M, 1M and 4K which is the default
>> @@ -1134,7 +1134,7 @@ static void __init htab_initialize(void)
>> 
>>  prot = pgprot_val(PAGE_KERNEL);
>> 
>> -   if (debug_pagealloc_enabled_or_kfence()) {
>> +   if (debug_pagealloc_enabled()) {
>>  linear_map_hash_count = memblock_end_of_DRAM() >> 
>> PAGE_SHIFT;
>>  linear_map_hash_slots = memblock_alloc_try_nid(
>>  linear_map_hash_count, 1, 
>> MEMBLOCK_LOW_LIMIT,
>> @@ -2117,7 +2117,7 @@ void hpt_do_stress(unsigned long ea, unsigned long 
>> hpte_group)
>>  }
>>   }
>> 
>> -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
>> +#if defined(CONFIG_DEBUG_PAGEALLOC)
>
> Use #ifdef
>

Sure. Thanks!

-ritesh

Re: [RFC v1 09/10] book3s64/radix: Refactoring common kfence related functions

2024-09-04 Thread IBM

Christophe Leroy  writes:

> Le 31/07/2024 à 09:56, Ritesh Harjani (IBM) a écrit :
>> [Vous ne recevez pas souvent de courriers de ritesh.l...@gmail.com. 
>> Découvrez pourquoi ceci est important à 
>> https://aka.ms/LearnAboutSenderIdentification ]
>> 
>> Both radix and hash on book3s requires to detect if kfence
>> early init is enabled or not. Hash needs to disable kfence
>> if early init is not enabled because with kfence the linear map is
>> mapped using PAGE_SIZE rather than 16M mapping.
>> We don't support multiple page sizes for slb entry used for kernel
>> linear map in book3s64.
>> 
>> This patch refactors out the common functions required to detect kfence
>> early init is enabled or not.
>> 
>> Signed-off-by: Ritesh Harjani (IBM) 
>> ---
>>   arch/powerpc/include/asm/kfence.h|  2 ++
>>   arch/powerpc/mm/book3s64/radix_pgtable.c | 12 
>>   arch/powerpc/mm/init-common.c| 12 
>>   3 files changed, 14 insertions(+), 12 deletions(-)
>> 
>> diff --git a/arch/powerpc/include/asm/kfence.h 
>> b/arch/powerpc/include/asm/kfence.h
>> index fab124ada1c7..5975688d8de1 100644
>> --- a/arch/powerpc/include/asm/kfence.h
>> +++ b/arch/powerpc/include/asm/kfence.h
>> @@ -15,6 +15,8 @@
>>   #define ARCH_FUNC_PREFIX "."
>>   #endif
>> 
>> +extern bool kfence_early_init;
>> +
>>   #ifdef CONFIG_KFENCE
>>   extern bool kfence_disabled;
>> 
>> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
>> b/arch/powerpc/mm/book3s64/radix_pgtable.c
>> index b0d927009af8..311e2112d782 100644
>> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
>> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
>> @@ -363,18 +363,6 @@ static int __meminit create_physical_mapping(unsigned 
>> long start,
>>   }
>> 
>>   #ifdef CONFIG_KFENCE
>> -static bool __ro_after_init kfence_early_init = 
>> !!CONFIG_KFENCE_SAMPLE_INTERVAL;
>> -
>> -static int __init parse_kfence_early_init(char *arg)
>> -{
>> -   int val;
>> -
>> -   if (get_option(&arg, &val))
>> -   kfence_early_init = !!val;
>> -   return 0;
>> -}
>> -early_param("kfence.sample_interval", parse_kfence_early_init);
>> -
>>   static inline phys_addr_t alloc_kfence_pool(void)
>>   {
>>  phys_addr_t kfence_pool;
>> diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
>> index 21131b96d209..259821a4db62 100644
>> --- a/arch/powerpc/mm/init-common.c
>> +++ b/arch/powerpc/mm/init-common.c
>> @@ -33,6 +33,18 @@ bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP);
>>   bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP);
>>   #ifdef CONFIG_KFENCE
>>   bool __ro_after_init kfence_disabled;
>> +bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL;
>> +static int __init parse_kfence_early_init(char *arg)
>
> If I understand correctly, previously it was only for radix, now it is 
> for every platform including PPC32 ?
>

Ok. I see what you mean. Let me see how can I limit this cmdline parsing
of kfence and/or special case kfence handling to book3s64 only.

>> +{
>> +   int val;
>> +
>> +   if (get_option(&arg, &val))
>> +   kfence_early_init = !!val;
>> +   return 0;
>> +}
>> +early_param("kfence.sample_interval", parse_kfence_early_init);
>> +#else
>> +bool __ro_after_init kfence_early_init;
>
> I don't understand, why do you need that in the #else case ?
>

Yes, I don't like it either. Let me clean this up.
this was required in htab_init_page_sizes(). 

Thanks for pointing out.

-ritesh

Re: [RFC v1 10/10] book3s64/hash: Disable kfence if not early init

2024-09-04 Thread IBM

Christophe Leroy  writes:

> Le 31/07/2024 à 09:56, Ritesh Harjani (IBM) a écrit :
>> [Vous ne recevez pas souvent de courriers de ritesh.l...@gmail.com. 
>> Découvrez pourquoi ceci est important à 
>> https://aka.ms/LearnAboutSenderIdentification ]
>> 
>> Enable kfence on book3s64 hash only when early init is enabled.
>> This is because, kfence could cause the kernel linear map to be mapped
>> at PAGE_SIZE level instead of 16M (which I guess we don't want).
>> 
>> Also currently there is no way to -
>> 1. Make multiple page size entries for the SLB used for kernel linear
>> map.
>> 2. No easy way of getting the hash slot details after the page table
>> mapping for kernel linear setup. So even if kfence allocate the
>> pool in late init, we won't be able to get the hash slot details in
>> kfence linear map.
>> 
>> Thus this patch disables kfence on hash if kfence early init is not
>> enabled.
>> 
>> Signed-off-by: Ritesh Harjani (IBM) 
>> ---
>>   arch/powerpc/mm/book3s64/hash_utils.c | 5 -
>>   1 file changed, 4 insertions(+), 1 deletion(-)
>> 
>> diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
>> b/arch/powerpc/mm/book3s64/hash_utils.c
>> index c66b9921fc7d..759dbcbf1483 100644
>> --- a/arch/powerpc/mm/book3s64/hash_utils.c
>> +++ b/arch/powerpc/mm/book3s64/hash_utils.c
>> @@ -410,6 +410,8 @@ static phys_addr_t kfence_pool;
>> 
>>   static inline void hash_kfence_alloc_pool(void)
>>   {
>> +   if (!kfence_early_init)
>> +   goto err;
>> 
>>  // allocate linear map for kfence within RMA region
>>  linear_map_kf_hash_count = KFENCE_POOL_SIZE >> PAGE_SHIFT;
>> @@ -1074,7 +1076,8 @@ static void __init htab_init_page_sizes(void)
>>  bool aligned = true;
>>  init_hpte_page_sizes();
>> 
>> -   if (!debug_pagealloc_enabled_or_kfence()) {
>> +   if (!debug_pagealloc_enabled() &&
>> +   !(IS_ENABLED(CONFIG_KFENCE) && kfence_early_init)) {
>
> Looks complex, can we do simpler ?
>

Yes, kfence_early_init anyway needs clean up. Will make it simpler.

Thanks for the review!

-ritesh

Re: [RFC v3 1/3] fadump: Refactor and prepare fadump_cma_init for late init

2024-10-14 Thread IBM

Madhavan Srinivasan  writes:

> On 10/11/24 8:30 PM, Ritesh Harjani (IBM) wrote:
>> We anyway don't use any return values from fadump_cma_init(). Since
>> fadump_reserve_mem() from where fadump_cma_init() gets called today,
>> already has the required checks.
>> This patch makes this function return type as void. Let's also handle
>> extra cases like return if fadump_supported is false or dump_active, so
>> that in later patches we can call fadump_cma_init() separately from
>> setup_arch().
>
> Usually patches to this file are posted with title format of
>
> powerpc/fadump:<>

yes. I guess it is good to do it that way (I might have missed it)
Although commit history of oldest few patches to fadump shows..

ebaeb5ae2437 fadump: Convert firmware-assisted cpu state dump data into elf 
notes.
2df173d9e85d fadump: Initialize elfcore header and add PT_LOAD program headers.
3ccc00a7e04f fadump: Register for firmware assisted dump.
eb39c8803d0e fadump: Reserve the memory for firmware assisted dump.

>
>
>> 
>> Acked-by: Hari Bathini 
>> Signed-off-by: Ritesh Harjani (IBM) 
>> ---
>> v2 -> v3: Separated the series into 2 as discussed in v2.
>> [v2]: 
>> https://lore.kernel.org/linuxppc-dev/cover.1728585512.git.ritesh.l...@gmail.com/
>> 
>>  arch/powerpc/kernel/fadump.c | 23 +--
>>  1 file changed, 9 insertions(+), 14 deletions(-)
>> 
>> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
>> index a612e7513a4f..162327d66982 100644
>> --- a/arch/powerpc/kernel/fadump.c
>> +++ b/arch/powerpc/kernel/fadump.c
>> @@ -78,27 +78,23 @@ static struct cma *fadump_cma;
>>   * But for some reason even if it fails we still have the memory reservation
>>   * with us and we can still continue doing fadump.
>>   */
>> -static int __init fadump_cma_init(void)
>> +static void __init fadump_cma_init(void)
>>  {
>>  unsigned long long base, size;
>>  int rc;
>> 
>> -if (!fw_dump.fadump_enabled)
>> -return 0;
>> -
>> +if (!fw_dump.fadump_supported || !fw_dump.fadump_enabled ||
>> +fw_dump.dump_active)
>> +return;
>
> Is these checks even needed here? fadump_reserve_mem() checked for all
> these already, also dont see any other caller for fadump_cma_init(). 
>
>

In the next patch we will move fadump_cma_init() call from within
fadump_reserve_mem() to setup_arch(). Hence we need these extra checks
in fadump_cma_init() as well. I mentioned the same in the commit msg of
this patch too.

>>  /*
>>   * Do not use CMA if user has provided fadump=nocma kernel parameter.
>> - * Return 1 to continue with fadump old behaviour.
>>   */
>> -if (fw_dump.nocma)
>> -return 1;
>> +if (fw_dump.nocma || !fw_dump.boot_memory_size)
>> +return;
>> 
>>  base = fw_dump.reserve_dump_area_start;
>>  size = fw_dump.boot_memory_size;
>> 
>> -if (!size)
>> -return 0;
>
> So this is the only place where we return 0, which in turn will make the
> "ret" in fadump_reserve_mem() as zero forcing to call reserve_crashkernel()
> in early_init_devtree().
>
> we are removing it, becos we know "size" here will never be zero?
>
>

yes. Because we already check if boot_memory_size is less than
bootmem_min in fadump_reserve_mem(). If it is less, then we fail and
disable fadump (fadump_enabled = 0).

So then there is no need to check for !boot_memory_size in here.

fadump_reseve_mem( ) {
<...>
if (!fw_dump.dump_active) {
fw_dump.boot_memory_size =
PAGE_ALIGN(fadump_calculate_reserve_size());

bootmem_min = fw_dump.ops->fadump_get_bootmem_min();
if (fw_dump.boot_memory_size < bootmem_min) {
pr_err("Can't enable fadump with boot memory size 
(0x%lx) less than 0x%llx\n",
   fw_dump.boot_memory_size, bootmem_min);
goto error_out;
}
<...>
}
<...>
error_out:
fw_dump.fadump_enabled = 0;
fw_dump.reserve_dump_area_size = 0;
return 0;
}


Thanks for the review!
-ritesh

Re: [RFC RESEND v2 02/13] powerpc: mm: Fix kfence page fault reporting

2024-10-15 Thread IBM

Christophe Leroy  writes:

> Le 15/10/2024 à 03:33, Ritesh Harjani (IBM) a écrit :
>> copy_from_kernel_nofault() can be called when doing read of /proc/kcore.
>> /proc/kcore can have some unmapped kfence objects which when read via
>> copy_from_kernel_nofault() can cause page faults. Since *_nofault()
>> functions define their own fixup table for handling fault, use that
>> instead of asking kfence to handle such faults.
>> 
>> Hence we search the exception tables for the nip which generated the
>> fault. If there is an entry then we let the fixup table handler handle the
>> page fault by returning an error from within ___do_page_fault().
>> 
>> This can be easily triggered if someone tries to do dd from /proc/kcore.
>> dd if=/proc/kcore of=/dev/null bs=1M
>> 
>> 
>> ===
>> BUG: KFENCE: invalid read in copy_from_kernel_nofault+0xb0/0x1c8
>> Invalid read at 0x4f749d2e:
>>   copy_from_kernel_nofault+0xb0/0x1c8
>>   0xc57f7950
>>   read_kcore_iter+0x41c/0x9ac
>>   proc_reg_read_iter+0xe4/0x16c
>>   vfs_read+0x2e4/0x3b0
>>   ksys_read+0x88/0x154
>>   system_call_exception+0x124/0x340
>>   system_call_common+0x160/0x2c4
>> 
>> BUG: KFENCE: use-after-free read in copy_from_kernel_nofault+0xb0/0x1c8
>> Use-after-free read at 0x8fbb08ad (in kfence-#0):
>>   copy_from_kernel_nofault+0xb0/0x1c8
>>   0xc57f7950
>>   read_kcore_iter+0x41c/0x9ac
>>   proc_reg_read_iter+0xe4/0x16c
>>   vfs_read+0x2e4/0x3b0
>>   ksys_read+0x88/0x154
>>   system_call_exception+0x124/0x340
>>   system_call_common+0x160/0x2c4
>> 
>> Guessing the fix should go back to when we first got kfence on PPC32.
>> 
>> Fixes: 90cbac0e995d ("powerpc: Enable KFENCE for PPC32")
>> Reported-by: Disha Goel 
>> Signed-off-by: Ritesh Harjani (IBM) 
>> ---
>>   arch/powerpc/mm/fault.c | 10 +-
>>   1 file changed, 9 insertions(+), 1 deletion(-)
>> 
>> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
>> index 81c77ddce2e3..fa825198f29f 100644
>> --- a/arch/powerpc/mm/fault.c
>> +++ b/arch/powerpc/mm/fault.c
>> @@ -439,9 +439,17 @@ static int ___do_page_fault(struct pt_regs *regs, 
>> unsigned long address,
>>  /*
>>   * The kernel should never take an execute fault nor should it
>>   * take a page fault to a kernel address or a page fault to a user
>> - * address outside of dedicated places
>> + * address outside of dedicated places.
>> + *
>> + * Rather than kfence reporting false negatives, let the fixup table
>> + * handler handle the page fault by returning SIGSEGV, if the fault
>> + * has come from functions like copy_from_kernel_nofault().
>>   */
>>  if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, 
>> is_write))) {
>> +
>> +if (search_exception_tables(instruction_pointer(regs)))
>> +return SIGSEGV;
>
> This is a heavy operation. It should at least be done only when KFENCE 
> is built-in.
>
> kfence_handle_page_fault() bails out immediately when 
> is_kfence_address() returns false, and is_kfence_address() returns 
> always false when KFENCE is not built-in.
>
> So you could check that before calling the heavy weight 
> search_exception_tables().
>
>   if (is_kfence_address(address) &&
>   !search_exception_tables(instruction_pointer(regs)) &&
>   kfence_handle_page_fault(address, is_write, regs))
>   return 0;
>

Yes, thanks for the input. I agree with above. I will take that in v3.
I will wait for sometime for any review comments on other patches before
spinning a v3, though.

>
>
>  > +  return SIGSEGV;
>
>> +
>>  if (kfence_handle_page_fault(address, is_write, regs))
>>  return 0;
>>   

-ritesh

Re: [RFC v3 1/3] fadump: Refactor and prepare fadump_cma_init for late init

2024-10-18 Thread IBM

Madhavan Srinivasan  writes:

>
> Patchset looks fine to me. 
>
> Reviewed-by: Madhavan Srinivasan  for the series.
>

Thanks Maddy for the reviews! 
I will spin PATCH v4 with these minor suggested changes (No code changes)

-ritesh

Re: [PATCH v3 01/12] powerpc: mm/fault: Fix kfence page fault reporting

2024-10-21 Thread IBM

Michael Ellerman  writes:

> Hi Ritesh,
>
> "Ritesh Harjani (IBM)"  writes:
>> copy_from_kernel_nofault() can be called when doing read of /proc/kcore.
>> /proc/kcore can have some unmapped kfence objects which when read via
>> copy_from_kernel_nofault() can cause page faults. Since *_nofault()
>> functions define their own fixup table for handling fault, use that
>> instead of asking kfence to handle such faults.
>>
>> Hence we search the exception tables for the nip which generated the
>> fault. If there is an entry then we let the fixup table handler handle the
>> page fault by returning an error from within ___do_page_fault().
>>
>> This can be easily triggered if someone tries to do dd from /proc/kcore.
>> dd if=/proc/kcore of=/dev/null bs=1M
>>
>> 
>> ===
>> BUG: KFENCE: invalid read in copy_from_kernel_nofault+0xb0/0x1c8
>> Invalid read at 0x4f749d2e:
>>  copy_from_kernel_nofault+0xb0/0x1c8
>>  0xc57f7950
>>  read_kcore_iter+0x41c/0x9ac
>>  proc_reg_read_iter+0xe4/0x16c
>>  vfs_read+0x2e4/0x3b0
>>  ksys_read+0x88/0x154
>>  system_call_exception+0x124/0x340
>>  system_call_common+0x160/0x2c4
>
> I haven't been able to reproduce this. Can you give some more details on
> the exact machine/kernel-config/setup where you saw this?

w/o this patch I am able to hit this on book3s64 with both Radix and
Hash. I believe these configs should do the job. We should be able to
reproduce it on qemu and/or LPAR or baremetal.

root-> cat .out-ppc/.config |grep -i KFENCE
CONFIG_HAVE_ARCH_KFENCE=y
CONFIG_KFENCE=y
CONFIG_KFENCE_SAMPLE_INTERVAL=100
CONFIG_KFENCE_NUM_OBJECTS=255
# CONFIG_KFENCE_DEFERRABLE is not set
# CONFIG_KFENCE_STATIC_KEYS is not set
CONFIG_KFENCE_STRESS_TEST_FAULTS=0
CONFIG_KFENCE_KUNIT_TEST=y

root-> cat .out-ppc/.config |grep -i KCORE
CONFIG_PROC_KCORE=y

root-> cat .out-ppc/.config |grep -i KUNIT
CONFIG_KFENCE_KUNIT_TEST=y
CONFIG_KUNIT=y
CONFIG_KUNIT_DEFAULT_ENABLED=y


Then doing running dd like below can hit the issue. Maybe let it run for
few mins and see?

~ # dd if=/proc/kcore of=/dev/null bs=1M

Otherwise running this kfence kunit test also can reproduce the same
bug [1]. Above configs have kfence kunit config shown as well which will
run during boot time itself.

[1]: 
https://lore.kernel.org/linuxppc-dev/210e561f7845697a32de44b643393890f180069f.1729272697.git.ritesh.l...@gmail.com/

Note: This was originally reported internally in which the tester was
doing - perf test 'Object code reading'  [2]
[2]: 
https://github.com/torvalds/linux/blob/master/tools/perf/tests/code-reading.c#L737

Thanks for looking into this. Let me know if this helped.

-ritesh

Re: [RFC 1/2] cma: Fix CMA_MIN_ALIGNMENT_BYTES during early_init

2024-10-09 Thread IBM

David Hildenbrand  writes:

> On 08.10.24 15:27, Ritesh Harjani (IBM) wrote:
>> During early init CMA_MIN_ALIGNMENT_BYTES can be PAGE_SIZE,
>> since pageblock_order is still zero and it gets initialized
>> later during paging_init() e.g.
>> paging_init() -> free_area_init() -> set_pageblock_order().
>> 
>> One such use case is -
>> early_setup() -> early_init_devtree() -> fadump_reserve_mem()
>> 
>> This causes CMA memory alignment check to be bypassed in
>> cma_init_reserved_mem(). Then later cma_activate_area() can hit
>> a VM_BUG_ON_PAGE(pfn & ((1 << order) - 1)) if the reserved memory
>> area was not pageblock_order aligned.
>> 
>> Instead of fixing it locally for fadump case on PowerPC, I believe
>> this should be fixed for CMA_MIN_ALIGNMENT_BYTES.
>
> I think we should add a way to catch the usage of 
> CMA_MIN_ALIGNMENT_BYTES before it actually has meaning (before 
> pageblock_order was set)

Maybe by enforcing that the pageblock_order should not be zero where we
do the alignment check then?

i.e. in cma_init_reserved_mem() 

diff --git a/mm/cma.c b/mm/cma.c
index 3e9724716bad..36d753e7a0bf 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -182,6 +182,15 @@ int __init cma_init_reserved_mem(phys_addr_t base, 
phys_addr_t size,
if (!size || !memblock_is_region_reserved(base, size))
return -EINVAL;

+   /*
+* CMA uses CMA_MIN_ALIGNMENT_BYTES as alignment requirement which
+* needs pageblock_order to be initialized. Let's enforce it.
+*/
+   if (!pageblock_order) {
+   pr_err("pageblock_order not yet initialized. Called during 
early boot?\n");
+   return -EINVAL;
+   }
+
/* ensure minimal alignment required by mm core */
if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES))
return -EINVAL;


> and fix the PowerPC usage by reshuffling the 
> code accordingly.

Ok. I will submit a v2 with the above patch incldued.

Thanks for the review!
-ritesh

Re: [PATCH v2] selftests/powerpc: Remove the path after initialization.

2024-10-01 Thread IBM

zhangjiao2  writes:

> From: zhang jiao 
>
> If there were no anamolies noted, then we can
> simply remove the log file and return, 

after the path variable has been initialized.

(minor nit)


>
> Signed-off-by: zhang jiao 
> ---
> v1->v2:
>   Remove the path after initialization.
>
>  tools/testing/selftests/powerpc/mm/tlbie_test.c | 10 +-
>  1 file changed, 5 insertions(+), 5 deletions(-)


Thanks for the fix. Looks good to me. 
Please feel free to add - 

Reviewed-by: Ritesh Harjani (IBM)

Re: [RFC v2 0/4] cma: powerpc fadump fixes

2024-10-11 Thread IBM

Michael Ellerman  writes:

> "Ritesh Harjani (IBM)"  writes:
>> Please find the v2 of cma related powerpc fadump fixes.
>>
>> Patch-1 is a change in mm/cma.c to make sure we return an error if someone 
>> uses
>> cma_init_reserved_mem() before the pageblock_order is initalized.
>>
>> I guess, it's best if Patch-1 goes via mm tree and since rest of the changes
>> are powerpc fadump fixes hence those should go via powerpc tree. Right?
>
> Yes I think that will work.
>
> Because there's no actual dependency on patch 1, correct?

There is no dependency, yes.

>
> Let's see if the mm folks are happy with the approach, and if so you
> should send patch 1 on its own, and patches 2-4 as a separate series.
>
> Then I can take the series (2-4) as fixes, and patch 1 can go via the mm
> tree (probably in next, not as a fix).
>

Sure. Since David has acked patch-1, let me split this into 2 series
as you mentioned above and re-send both seperately, so that it can be
picked up in their respective trees.

Will just do it in sometime. Thanks!

-ritesh


> cheers
>
>> v1 -> v2:
>> =
>> 1. Review comments from David to call fadump_cma_init() after the
>>pageblock_order is initialized. Also to catch usages if someone tries
>>to call cma_init_reserved_mem() before pageblock_order is initialized.
>>
>> [v1]: 
>> https://lore.kernel.org/linuxppc-dev/c1e66d3e69c8d90988c02b84c79db5d9dd93f053.1728386179.git.ritesh.l...@gmail.com/
>>
>> Ritesh Harjani (IBM) (4):
>>   cma: Enforce non-zero pageblock_order during cma_init_reserved_mem()
>>   fadump: Refactor and prepare fadump_cma_init for late init
>>   fadump: Reserve page-aligned boot_memory_size during fadump_reserve_mem
>>   fadump: Move fadump_cma_init to setup_arch() after initmem_init()
>>
>>  arch/powerpc/include/asm/fadump.h  |  7 
>>  arch/powerpc/kernel/fadump.c   | 55 +++---
>>  arch/powerpc/kernel/setup-common.c |  6 ++--
>>  mm/cma.c   |  9 +
>>  4 files changed, 48 insertions(+), 29 deletions(-)
>>
>> --
>> 2.46.0

Re: drivers/nx: Invalid wait context issue when rebooting

2024-10-14 Thread IBM

Vishal Chourasia  writes:

> On Fri, Oct 11, 2024 at 09:37:27PM +1100, Michael Ellerman wrote:
>> 
>> I don't see why of_reconfig_notifier_unregister() needs to be called
>> with the devdata_mutext held, but I haven't looked that closely at it.
>> 
>> So the change below might work.
>> 
>> cheers
>> 
>> diff --git a/drivers/crypto/nx/nx-common-pseries.c 
>> b/drivers/crypto/nx/nx-common-pseries.c
>> index 35f2d0d8507e..a2050c5fb11d 100644
>> --- a/drivers/crypto/nx/nx-common-pseries.c
>> +++ b/drivers/crypto/nx/nx-common-pseries.c
>> @@ -1122,10 +1122,11 @@ static void nx842_remove(struct vio_dev *viodev)
>>  
>>  crypto_unregister_alg(&nx842_pseries_alg);
>>  
>> +of_reconfig_notifier_unregister(&nx842_of_nb);
>> +
>>  spin_lock_irqsave(&devdata_mutex, flags);
>>  old_devdata = rcu_dereference_check(devdata,
>>  lockdep_is_held(&devdata_mutex));
>> -of_reconfig_notifier_unregister(&nx842_of_nb);
>>  RCU_INIT_POINTER(devdata, NULL);
>>  spin_unlock_irqrestore(&devdata_mutex, flags);
>>  synchronize_rcu();
>> 
> With above changes, I see another similar bug, but what's strange is
> swapper does not hold any lock and still this bug is being triggered

Looking at the below stack, it looks like you discovered a new problem
after the above problem was fixed with the above changes.
(So maybe you could submit this fix along with [1])
Also looking at the history of changes, seems the above problem always
existed. Not sure why it wasn't caught earlier then?

[1]: https://lore.kernel.org/linuxppc-dev/zwyqd-w5hehrn...@linux.ibm.com/T/#u

I am not much aware of the below code paths. Nor it is evident from the
stack on why "Invalid wait context". Maybe you can give git bisect a try
for below issue (or can also wait for someone to comment on below stack).
(But you might have to keep the nx-common-pseries driver disabled for git 
bisect to work). 

>
> =
> [ BUG: Invalid wait context ]
> 6.12.0-rc2-fix-invalid-wait-context-00222-g7d2910da7039-dirty #84 Not tainted
> -
> swapper/2/0 is trying to lock:
> c4062128 (&xibm->lock){}-{3:3}, at: xive_spapr_put_ipi+0xb8/0x120
> other info that might help us debug this:
> context-{2:2}
> no locks held by swapper/2/0.
> stack backtrace:
> CPU: 2 UID: 0 PID: 0 Comm: swapper/2 Not tainted 
> 6.12.0-rc2-fix-invalid-wait-context-00222-g7d2910da7039-dirty #84
> Hardware name: IBM,9080-HEX POWER10 (architected) 0x800200 0xf06 
> of:IBM,FW1060.00 (NH1060_012) hv:phyp pSeries
> Call Trace:
> [c4ac3420] [c130d2e4] dump_stack_lvl+0xc8/0x130 (unreliable)
> [c4ac3460] [c0312ca8] __lock_acquire+0xb68/0xf00
> [c4ac3570] [c0313130] lock_acquire.part.0+0xf0/0x2a0
> [c4ac3690] [c13955b8] _raw_spin_lock_irqsave+0x78/0x130
> kexec: waiting for cpu 2 (physical 2) to enter 2 state
> [c4ac36d0] [c0194798] xive_spapr_put_ipi+0xb8/0x120
> [c4ac3710] [c1383728] xive_cleanup_cpu_ipi+0xc8/0xf0
> [c4ac3750] [c13837f4] xive_teardown_cpu+0xa4/0x100
> [c4ac3780] [c01d2cc4] pseries_kexec_cpu_down+0x54/0x1e0
> [c4ac3800] [c0213674] kexec_smp_down+0x124/0x1f0
> [c4ac3890] [c03c9ddc] 
> __flush_smp_call_function_queue+0x28c/0xad0
> [c4ac3950] [c005fb64] smp_ipi_demux_relaxed+0xe4/0xf0
> [c4ac3990] [c00593d8] doorbell_exception+0x108/0x2f0
> [c4ac3a20] [c000a26c] doorbell_super_common_virt+0x28c/0x290
> --- interrupt: a00 at plpar_hcall_norets_notrace+0x18/0x2c
> NIP:  c01bee18 LR: c13867a8 CTR: 
> REGS: c4ac3a50 TRAP: 0a00   Not tainted  
> (6.12.0-rc2-fix-invalid-wait-context-00222-g7d2910da7039-dirty)
> MSR:  8280b033   CR: 22000242  XER: 
> 0001
> CFAR:  IRQMASK: 0
> GPR00:  c4ac3cf0 c1e37600 
> GPR04:   0001dc4f97750361 0001
> GPR08: 00c0 0080 0001dc4f97750554 0080
> GPR12:  c007fffee480  
> GPR16:    
> GPR20:  c2ebf778  0043a215d824
> GPR24:  c0ec0f80 c2ebf778 
> GPR28:  0001 c21a2300 c21a2308
> NIP [c01bee18] plpar_hcall_norets_notrace+0x18/0x2c
> LR [c

Re: [PATCH] powerpc: Use str_enabled_disabled() helper function

2024-10-27 Thread IBM

Thorsten Blum  writes:

> Remove hard-coded strings by using the str_enabled_disabled() helper
> function.
>

Looks like now a days LKP also reports this - e.g. [1] "opportunity for 
str_enabled_disabled(enable)" 

[1]: https://lore.kernel.org/all/202410071601.tfpxoqgw-...@intel.com/

I see more such oppotunities for later improvements in arch/powerpc/
root-> git grep "\"enabled\" : \"disabled\"" arch/powerpc/
arch/powerpc/kernel/secure_boot.c:  pr_info("Secure boot mode %s\n", 
enabled ? "enabled" : "disabled");
arch/powerpc/kernel/secure_boot.c:  pr_info("Trusted boot mode %s\n", 
enabled ? "enabled" : "disabled");
arch/powerpc/platforms/powermac/setup.c:
"enabled" : "disabled");
arch/powerpc/sysdev/mpic.c: flags & HT_MSI_FLAGS_ENABLE ? "enabled" 
: "disabled", addr);

> Signed-off-by: Thorsten Blum 
> ---
>  arch/powerpc/kernel/secure_boot.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
>

For this patch it looks good to me. Please feel free to add - 

Reviewed-by: Ritesh Harjani (IBM) 

> diff --git a/arch/powerpc/kernel/secure_boot.c 
> b/arch/powerpc/kernel/secure_boot.c
> index 9e0efb657f39..3a28795b4ed8 100644
> --- a/arch/powerpc/kernel/secure_boot.c
> +++ b/arch/powerpc/kernel/secure_boot.c
> @@ -5,6 +5,7 @@
>   */
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  static struct device_node *get_ppc_fw_sb_node(void)
> @@ -38,7 +39,7 @@ bool is_ppc_secureboot_enabled(void)
>   of_node_put(node);
>  
>  out:
> - pr_info("Secure boot mode %s\n", enabled ? "enabled" : "disabled");
> + pr_info("Secure boot mode %s\n", str_enabled_disabled(enabled));
>  
>   return enabled;
>  }
> @@ -62,7 +63,7 @@ bool is_ppc_trustedboot_enabled(void)
>   of_node_put(node);
>  
>  out:
> - pr_info("Trusted boot mode %s\n", enabled ? "enabled" : "disabled");
> + pr_info("Trusted boot mode %s\n", str_enabled_disabled(enabled));
>  
>   return enabled;
>  }
> -- 
> 2.47.0

Re: [PATCH v3] mm/kfence: Add a new kunit test test_use_after_free_read_nofault()

2024-11-12 Thread IBM

Marco Elver  writes:

> On Fri, 18 Oct 2024 at 19:46, Ritesh Harjani (IBM)
>  wrote:
>>
>> From: Nirjhar Roy 
>>
>> Faults from copy_from_kernel_nofault() needs to be handled by fixup
>> table and should not be handled by kfence. Otherwise while reading
>> /proc/kcore which uses copy_from_kernel_nofault(), kfence can generate
>> false negatives. This can happen when /proc/kcore ends up reading an
>> unmapped address from kfence pool.
>>
>> Let's add a testcase to cover this case.
>>
>> Co-developed-by: Ritesh Harjani (IBM) 
>> Signed-off-by: Nirjhar Roy 
>> Signed-off-by: Ritesh Harjani (IBM) 
>> ---
>>
>> Will be nice if we can get some feedback on this.
>
> There was some discussion recently how sanitizers should behave around
> these nofault helpers when accessing invalid memory (including freed
> memory):
> https://lore.kernel.org/all/canpmjnmavfzqnczheity9cjiqq9cvn1x7qeeeap_6ykjwko...@mail.gmail.com/
>
> It should be similar for KFENCE, i.e. no report should be generated.
> Definitely a good thing to test.
>
> Tested-by: Marco Elver 
> Reviewed-by: Marco Elver 
>

Gentle ping. Is this going into -next?

-ritesh

>> v2 -> v3:
>> =
>> 1. Separated out this kfence kunit test from the larger powerpc+kfence+v3 
>> series.
>> 2. Dropped RFC tag
>>
>> [v2]: 
>> https://lore.kernel.org/linuxppc-dev/cover.1728954719.git.ritesh.l...@gmail.com
>> [powerpc+kfence+v3]: 
>> https://lore.kernel.org/linuxppc-dev/cover.1729271995.git.ritesh.l...@gmail.com
>>
>>  mm/kfence/kfence_test.c | 17 +
>>  1 file changed, 17 insertions(+)
>>
>> diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
>> index 00fd17285285..f65fb182466d 100644
>> --- a/mm/kfence/kfence_test.c
>> +++ b/mm/kfence/kfence_test.c
>> @@ -383,6 +383,22 @@ static void test_use_after_free_read(struct kunit *test)
>> KUNIT_EXPECT_TRUE(test, report_matches(&expect));
>>  }
>>
>> +static void test_use_after_free_read_nofault(struct kunit *test)
>> +{
>> +   const size_t size = 32;
>> +   char *addr;
>> +   char dst;
>> +   int ret;
>> +
>> +   setup_test_cache(test, size, 0, NULL);
>> +   addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
>> +   test_free(addr);
>> +   /* Use after free with *_nofault() */
>> +   ret = copy_from_kernel_nofault(&dst, addr, 1);
>> +   KUNIT_EXPECT_EQ(test, ret, -EFAULT);
>> +   KUNIT_EXPECT_FALSE(test, report_available());
>> +}
>> +
>>  static void test_double_free(struct kunit *test)
>>  {
>> const size_t size = 32;
>> @@ -780,6 +796,7 @@ static struct kunit_case kfence_test_cases[] = {
>> KFENCE_KUNIT_CASE(test_out_of_bounds_read),
>> KFENCE_KUNIT_CASE(test_out_of_bounds_write),
>> KFENCE_KUNIT_CASE(test_use_after_free_read),
>> +   KFENCE_KUNIT_CASE(test_use_after_free_read_nofault),
>> KFENCE_KUNIT_CASE(test_double_free),
>> KFENCE_KUNIT_CASE(test_invalid_addr_free),
>> KFENCE_KUNIT_CASE(test_corruption),
>> --
>> 2.46.0
>>

Re: [RFC v3 -next] cma: Enforce non-zero pageblock_order during cma_init_reserved_mem()

2024-11-12 Thread IBM

"Ritesh Harjani (IBM)"  writes:

> cma_init_reserved_mem() checks base and size alignment with
> CMA_MIN_ALIGNMENT_BYTES. However, some users might call this during
> early boot when pageblock_order is 0. That means if base and size does
> not have pageblock_order alignment, it can cause functional failures
> during cma activate area.
>
> So let's enforce pageblock_order to be non-zero during
> cma_init_reserved_mem().
>
> Acked-by: David Hildenbrand 
> Signed-off-by: Ritesh Harjani (IBM) 
> ---
> v2 -> v3: Separated the series into 2 as discussed in v2.
> [v2]: 
> https://lore.kernel.org/linuxppc-dev/cover.1728585512.git.ritesh.l...@gmail.com/
>
>  mm/cma.c | 9 +
>  1 file changed, 9 insertions(+)

Gentle ping. Is this going into -next?

-ritesh

>
> diff --git a/mm/cma.c b/mm/cma.c
> index 3e9724716bad..36d753e7a0bf 100644
> --- a/mm/cma.c
> +++ b/mm/cma.c
> @@ -182,6 +182,15 @@ int __init cma_init_reserved_mem(phys_addr_t base, 
> phys_addr_t size,
>   if (!size || !memblock_is_region_reserved(base, size))
>   return -EINVAL;
>
> + /*
> +  * CMA uses CMA_MIN_ALIGNMENT_BYTES as alignment requirement which
> +  * needs pageblock_order to be initialized. Let's enforce it.
> +  */
> + if (!pageblock_order) {
> + pr_err("pageblock_order not yet initialized. Called during 
> early boot?\n");
> + return -EINVAL;
> + }
> +
>   /* ensure minimal alignment required by mm core */
>   if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES))
>   return -EINVAL;
> --
> 2.46.0

Re: [PATCH v2 2/2] fadump: reserve param area if below boot_mem_top

2024-11-12 Thread IBM

Sourabh Jain  writes:

> Hello Ritesh,
>
>
> On 12/11/24 17:23, Ritesh Harjani (IBM) wrote:
>> Ritesh Harjani (IBM)  writes:
>>
>>> Sourabh Jain  writes:
>>>
>>>> Hello Ritesh,
>>>>
>>>>
>>>> On 12/11/24 11:51, Ritesh Harjani (IBM) wrote:
>>>>> Sourabh Jain  writes:
>>>>>
>>>>>> The param area is a memory region where the kernel places additional
>>>>>> command-line arguments for fadump kernel. Currently, the param memory
>>>>>> area is reserved in fadump kernel if it is above boot_mem_top. However,
>>>>>> it should be reserved if it is below boot_mem_top because the fadump
>>>>>> kernel already reserves memory from boot_mem_top to the end of DRAM.
>>>>> did you mean s/reserves/preserves ?
>>>> Yeah, preserves is better.
>>>>
>>>>>> Currently, there is no impact from not reserving param memory if it is
>>>>>> below boot_mem_top, as it is not used after the early boot phase of the
>>>>>> fadump kernel. However, if this changes in the future, it could lead to
>>>>>> issues in the fadump kernel.
>>>>> This will only affect Hash and not radix correct? Because for radix your
>>>>> param_area is somewhere within [memblock_end_of_DRAM() / 2, 
>>>>> memblock_end_of_DRAM()]
>>>>> which is anyway above boot_mem_top so it is anyway preserved as is...
>>>> Yes.
>>>>
>>>>> ... On second thoughts since param_area during normal kernel boot anyway
>>>>> comes from memblock now. And irrespective of where it falls (above or 
>>>>> below
>>>>> boot_mem_top), we anyway append the bootargs to that. So we don't really
>>>>> preserve the original contents :) right?
>>>> Sorry I didn't get it. We append strings from param_area to
>>>> boot_command_line
>>>> not the other way.
>>>>
>>>>
>>> Right. My bad.
>>>
>>>>> So why not just always call for
>>>>> memblock_reserve() on param_area during capture kernel run?
>>>>>
>>>>> Thoughts?
>>>> Yes, there is no harm in calling memblock_reserve regardless of whether
>>>> param_area
>>>> is below or above boot_mem_top. However, calling it when param_area is
>>>> higher than
>>>> boot_mem_top is redundant, as we know fadump preserves memory from
>>>> boot_mem_top
>>>> to the end of DRAM during early boot.
>>> So if we don't reserve the param_area then the kernel may use it for
>>> some other purposes once memory is released to buddy, right. But I guess,
>>> given we anyway copied the param_area in fadump_append_bootargs() during
>>> early boot to cmdline (before parse_early_param()), we anyway don't need
>>> it for later, right?
>>>
>>> In that case we don't need for Hash too (i.e when param_area falls under
>>> boot_mem_top), right? Since we anyway copied the param_area before
>>> parse_early_param() in fadump_append_bootargs. So what is the point in
>>> calling memblock_reserve() on that? Maybe I am missing something, can
>>> you please help explain.
>>>
>> Ok. I think I got it now. You did mention in the changelog -
>>
>> "Currently, there is no impact from not reserving param memory if it is
>> below boot_mem_top, as it is not used after the early boot phase of the
>> fadump kernel. However, if this changes in the future, it could lead to
>> issues in the fadump kernel."
>>
>>
>> So it is not an issue now, since the param area is not used after the
>> contents is copied over. So I think today we anyway don't need to call
>> memblock_reserve() on the param area - but if we are making it future
>> proof then we might as well just call memblock_reserve() on param_area
>> irrespective because otherwise once the kernel starts up it might re-use
>> that area for other purposes. So isn't it better to reserve for fadump
>> use of the param_area for either during early boot or during late kernel
>> boot phase of the capture kernel?
>
> Seems like there is some confusion. Here is the full picture with the 
> current patch:
>
> First kernel boot: Reserve param_area during early boot and let it 
> remain reserved.
>
> First kernel crashed
>
> Fadump/second kernel boot
>
> fadump_reserve_mem() does memblock_reserve() from boot_me

Re: [PATCH v2 2/2] fadump: reserve param area if below boot_mem_top

2024-11-12 Thread IBM

Sourabh Jain  writes:

> Hello Ritesh,
>
>
> On 12/11/24 11:51, Ritesh Harjani (IBM) wrote:
>> Sourabh Jain  writes:
>>
>>> The param area is a memory region where the kernel places additional
>>> command-line arguments for fadump kernel. Currently, the param memory
>>> area is reserved in fadump kernel if it is above boot_mem_top. However,
>>> it should be reserved if it is below boot_mem_top because the fadump
>>> kernel already reserves memory from boot_mem_top to the end of DRAM.
>> did you mean s/reserves/preserves ?
>
> Yeah, preserves is better.
>
>>
>>> Currently, there is no impact from not reserving param memory if it is
>>> below boot_mem_top, as it is not used after the early boot phase of the
>>> fadump kernel. However, if this changes in the future, it could lead to
>>> issues in the fadump kernel.
>> This will only affect Hash and not radix correct? Because for radix your
>> param_area is somewhere within [memblock_end_of_DRAM() / 2, 
>> memblock_end_of_DRAM()]
>> which is anyway above boot_mem_top so it is anyway preserved as is...
>
> Yes.
>
>>
>> ... On second thoughts since param_area during normal kernel boot anyway
>> comes from memblock now. And irrespective of where it falls (above or below
>> boot_mem_top), we anyway append the bootargs to that. So we don't really
>> preserve the original contents :) right?
>
> Sorry I didn't get it. We append strings from param_area to 
> boot_command_line
> not the other way.
>
>

Right. My bad. 

>> So why not just always call for
>> memblock_reserve() on param_area during capture kernel run?
>>
>> Thoughts?
>
> Yes, there is no harm in calling memblock_reserve regardless of whether 
> param_area
> is below or above boot_mem_top. However, calling it when param_area is 
> higher than
> boot_mem_top is redundant, as we know fadump preserves memory from 
> boot_mem_top
> to the end of DRAM during early boot.

So if we don't reserve the param_area then the kernel may use it for
some other purposes once memory is released to buddy, right. But I guess,
given we anyway copied the param_area in fadump_append_bootargs() during
early boot to cmdline (before parse_early_param()), we anyway don't need
it for later, right?

In that case we don't need for Hash too (i.e when param_area falls under
boot_mem_top), right? Since we anyway copied the param_area before
parse_early_param() in fadump_append_bootargs. So what is the point in
calling memblock_reserve() on that? Maybe I am missing something, can
you please help explain.

-ritesh

>
> According to the memblock documentation, when reserving memory regions, 
> the new
> regions can overlap with existing ones, but I don't see any advantage in 
> calling memblock_reserve
> for param_area if it falls above boot_mem_top.
>
> Regardless, I don’t have a strong opinion. If you think we should call 
> memblock_reserve regardless
> of where param_area is placed, I can do that. Please let me know your 
> opinion.
>
> Sourabh Jain
>
>
>
>>
>>> Fixes: 3416c9daa6b1 ("powerpc/fadump: pass additional parameters when 
>>> fadump is active")
>>> Cc: Mahesh Salgaonkar 
>>> Cc: Michael Ellerman 
>>> Acked-by: Hari Bathini 
>>> Signed-off-by: Sourabh Jain 
>>> ---
>>>
>>> Changelog:
>>>
>>> Since v1: 
>>> https://lore.kernel.org/all/20241104083528.99520-1-sourabhj...@linux.ibm.com/
>>>- Include Fixes and Acked-by tag in the commit message
>>>- No functional changes
>>>
>>> ---
>>>   arch/powerpc/kernel/fadump.c | 2 +-
>>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
>>> index 3a2863307863..3f3674060164 100644
>>> --- a/arch/powerpc/kernel/fadump.c
>>> +++ b/arch/powerpc/kernel/fadump.c
>>> @@ -143,7 +143,7 @@ void __init fadump_append_bootargs(void)
>>> if (!fw_dump.dump_active || !fw_dump.param_area_supported || 
>>> !fw_dump.param_area)
>>> return;
>>>   
>>> -   if (fw_dump.param_area >= fw_dump.boot_mem_top) {
>>> +   if (fw_dump.param_area < fw_dump.boot_mem_top) {
>>> if (memblock_reserve(fw_dump.param_area, COMMAND_LINE_SIZE)) {
>>> pr_warn("WARNING: Can't use additional parameters 
>>> area!\n");
>>> fw_dump.param_area = 0;
>>> -- 
>>> 2.46.2

Re: [PATCH v2 2/2] fadump: reserve param area if below boot_mem_top

2024-11-12 Thread IBM

Ritesh Harjani (IBM)  writes:

> Sourabh Jain  writes:
>
>> Hello Ritesh,
>>
>>
>> On 12/11/24 11:51, Ritesh Harjani (IBM) wrote:
>>> Sourabh Jain  writes:
>>>
>>>> The param area is a memory region where the kernel places additional
>>>> command-line arguments for fadump kernel. Currently, the param memory
>>>> area is reserved in fadump kernel if it is above boot_mem_top. However,
>>>> it should be reserved if it is below boot_mem_top because the fadump
>>>> kernel already reserves memory from boot_mem_top to the end of DRAM.
>>> did you mean s/reserves/preserves ?
>>
>> Yeah, preserves is better.
>>
>>>
>>>> Currently, there is no impact from not reserving param memory if it is
>>>> below boot_mem_top, as it is not used after the early boot phase of the
>>>> fadump kernel. However, if this changes in the future, it could lead to
>>>> issues in the fadump kernel.
>>> This will only affect Hash and not radix correct? Because for radix your
>>> param_area is somewhere within [memblock_end_of_DRAM() / 2, 
>>> memblock_end_of_DRAM()]
>>> which is anyway above boot_mem_top so it is anyway preserved as is...
>>
>> Yes.
>>
>>>
>>> ... On second thoughts since param_area during normal kernel boot anyway
>>> comes from memblock now. And irrespective of where it falls (above or below
>>> boot_mem_top), we anyway append the bootargs to that. So we don't really
>>> preserve the original contents :) right?
>>
>> Sorry I didn't get it. We append strings from param_area to 
>> boot_command_line
>> not the other way.
>>
>>
>
> Right. My bad. 
>
>>> So why not just always call for
>>> memblock_reserve() on param_area during capture kernel run?
>>>
>>> Thoughts?
>>
>> Yes, there is no harm in calling memblock_reserve regardless of whether 
>> param_area
>> is below or above boot_mem_top. However, calling it when param_area is 
>> higher than
>> boot_mem_top is redundant, as we know fadump preserves memory from 
>> boot_mem_top
>> to the end of DRAM during early boot.
>
> So if we don't reserve the param_area then the kernel may use it for
> some other purposes once memory is released to buddy, right. But I guess,
> given we anyway copied the param_area in fadump_append_bootargs() during
> early boot to cmdline (before parse_early_param()), we anyway don't need
> it for later, right?
>
> In that case we don't need for Hash too (i.e when param_area falls under
> boot_mem_top), right? Since we anyway copied the param_area before
> parse_early_param() in fadump_append_bootargs. So what is the point in
> calling memblock_reserve() on that? Maybe I am missing something, can
> you please help explain.
>

Ok. I think I got it now. You did mention in the changelog - 

"Currently, there is no impact from not reserving param memory if it is
below boot_mem_top, as it is not used after the early boot phase of the
fadump kernel. However, if this changes in the future, it could lead to
issues in the fadump kernel."


So it is not an issue now, since the param area is not used after the
contents is copied over. So I think today we anyway don't need to call
memblock_reserve() on the param area - but if we are making it future
proof then we might as well just call memblock_reserve() on param_area
irrespective because otherwise once the kernel starts up it might re-use
that area for other purposes. So isn't it better to reserve for fadump
use of the param_area for either during early boot or during late kernel
boot phase of the capture kernel?

But now that I understand I don't have a strong opinion too (since it is
just future proofing). But I would prefer the safer approach of doing
memblock_reserve() always for param_area. So I will leave it upto you
and others. 

>>
>> According to the memblock documentation, when reserving memory regions, 
>> the new
>> regions can overlap with existing ones, but I don't see any advantage in 
>> calling memblock_reserve
>> for param_area if it falls above boot_mem_top.
>>
>> Regardless, I don’t have a strong opinion. If you think we should call 
>> memblock_reserve regardless
>> of where param_area is placed, I can do that. Please let me know your 
>> opinion.
>>
>> Sourabh Jain
>>
>>
>>
>>>
>>>> Fixes: 3416c9daa6b1 ("powerpc/fadump: pass additional parameters when 
>>>> fadump is active")

Not really IIUC, this is not really a fix but a future proofing of

Re: [RFC PATCH] powerpc/tlb: enable arch want batched unmap tlb flush

2024-09-22 Thread IBM

Luming Yu  writes:

> On Sun, Sep 22, 2024 at 04:39:53PM +0530, Ritesh Harjani wrote:
>> Luming Yu  writes:
>> 
>> > From: Yu Luming 
>> >
>> > ppc always do its own tracking for batch tlb. By trivially enabling
>> > the ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH in ppc, ppc arch can re-use
>> > common code in rmap and reduce overhead and do optimization it could not
>> > have without a tlb flushing context at low architecture level.
>> 
>> I looked at this patch and other than the compile failure, this patch
>> still won't optimize anything. The idea of this config is that we want
>> to batch all the tlb flush operation at the end. By returning false from
>> should_defer_flush() (in this patch), we are saying we cannot defer
>> the flush and hence we do tlb flush in the same context of unmap.
> not exactly, as false return implies, we currently do nothing but relying on
> book3S_64's tlb batch implementation which contains a bit of defer 
> optimization
> that we need to use a real benchmark to do some performance characterization.
>
> And I need to get my test bed ready for patch testing first. So I have to
> defer the real optimization in this area.
>> 
>> Anyway, I took a quick look at ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
>> and I have a quick PoC for the same. I will soon post it.
> thanks for picking up the barton for the future collaboration on the
> potential common performance benefits among us for powerpc arch.

Sure Thanks, Luming. 
I have posted this work here [1].

[1]: 
https://lore.kernel.org/linuxppc-dev/cover.1727001426.git.ritesh.l...@gmail.com/
-ritesh

Re: [RFC PATCH] powerpc/tlb: enable arch want batched unmap tlb flush

2024-09-22 Thread IBM

Luming Yu  writes:

> From: Yu Luming 
>
> ppc always do its own tracking for batch tlb. By trivially enabling
> the ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH in ppc, ppc arch can re-use
> common code in rmap and reduce overhead and do optimization it could not
> have without a tlb flushing context at low architecture level.

I looked at this patch and other than the compile failure, this patch
still won't optimize anything. The idea of this config is that we want
to batch all the tlb flush operation at the end. By returning false from
should_defer_flush() (in this patch), we are saying we cannot defer
the flush and hence we do tlb flush in the same context of unmap.

Anyway, I took a quick look at ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
and I have a quick PoC for the same. I will soon post it.

-ritesh

>
> Signed-off-by: Luming Yu 
> ---
>  arch/powerpc/Kconfig|  1 +
>  arch/powerpc/include/asm/tlbbatch.h | 30 +
>  2 files changed, 31 insertions(+)
>  create mode 100644 arch/powerpc/include/asm/tlbbatch.h
>
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index e94e7e4bfd40..e6db84dd014a 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -175,6 +175,7 @@ config PPC
>   select ARCH_WANT_IPC_PARSE_VERSION
>   select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
>   select ARCH_WANT_LD_ORPHAN_WARN
> + select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
>   select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP   if PPC_RADIX_MMU
>   select ARCH_WANTS_MODULES_DATA_IN_VMALLOC   if PPC_BOOK3S_32 || 
> PPC_8xx
>   select ARCH_WEAK_RELEASE_ACQUIRE
> diff --git a/arch/powerpc/include/asm/tlbbatch.h 
> b/arch/powerpc/include/asm/tlbbatch.h
> new file mode 100644
> index ..484628460057
> --- /dev/null
> +++ b/arch/powerpc/include/asm/tlbbatch.h
> @@ -0,0 +1,30 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ARCH_PPC_TLBBATCH_H
> +#define _ARCH_PPC_TLBBATCH_H
> +
> +struct arch_tlbflush_unmap_batch {
> + /*
> + *
> +  */
> +};
> +
> +static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch 
> *batch)
> +{
> +}
> +
> +static inline void arch_tlbbatch_add_pending(struct 
> arch_tlbflush_unmap_batch *batch,
> + struct mm_struct *mm,
> + unsigned long uarddr)
> +{
> +}
> +
> +static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
> +{
> + /*ppc always do tlb flush in batch*/
> + return false;
> +}
> +
> +static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
> +{
> +}
> +#endif /* _ARCH_PPC_TLBBATCH_H */
> -- 
> 2.42.0.windows.2

Re: [PATCH v3] powerpc/pseries/eeh: Fix pseries_eeh_err_inject

2024-09-23 Thread IBM

Guenter Roeck  writes:

> Hi,
>
> On Mon, Sep 09, 2024 at 09:02:20AM -0500, Narayana Murty N wrote:
>> VFIO_EEH_PE_INJECT_ERR ioctl is currently failing on pseries
>> due to missing implementation of err_inject eeh_ops for pseries.
>> This patch implements pseries_eeh_err_inject in eeh_ops/pseries
>> eeh_ops. Implements support for injecting MMIO load/store error
>> for testing from user space.
>> 
>> The check on PCI error type (bus type) code is moved to platform
>> code, since the eeh_pe_inject_err can be allowed to more error
>> types depending on platform requirement. Removal of the check for
>> 'type' in eeh_pe_inject_err() doesn't impact PowerNV as
>> pnv_eeh_err_inject() already has an equivalent check in place.
>> 
>> Signed-off-by: Narayana Murty N 
>> Reviewed-by: Vaibhav Jain 
>> 
>> ---
>>  arch/powerpc/include/asm/eeh.h   |  2 +-
>>  arch/powerpc/kernel/eeh.c|  9 +++--
>>  arch/powerpc/platforms/pseries/eeh_pseries.c | 39 +++-
>>  3 files changed, 44 insertions(+), 6 deletions(-)
>> 
>> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
>> index 91a9fd53254f..317b12fc1fe4 100644
>> --- a/arch/powerpc/include/asm/eeh.h
>> +++ b/arch/powerpc/include/asm/eeh.h
>> @@ -308,7 +308,7 @@ int eeh_pe_reset(struct eeh_pe *pe, int option, bool 
>> include_passed);
>>  int eeh_pe_configure(struct eeh_pe *pe);
>>  int eeh_pe_inject_err(struct eeh_pe *pe, int type, int func,
>>unsigned long addr, unsigned long mask);
>> -
>> +int eeh_pe_inject_mmio_error(struct pci_dev *pdev);
>>  /**
>>   * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
>>   *
>> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
>> index d03f17987fca..49ab11a287a3 100644
>> --- a/arch/powerpc/kernel/eeh.c
>> +++ b/arch/powerpc/kernel/eeh.c
>> @@ -1537,10 +1537,6 @@ int eeh_pe_inject_err(struct eeh_pe *pe, int type, 
>> int func,
>>  if (!eeh_ops || !eeh_ops->err_inject)
>>  return -ENOENT;
>>  
>> -/* Check on PCI error type */
>> -if (type != EEH_ERR_TYPE_32 && type != EEH_ERR_TYPE_64)
>> -return -EINVAL;
>> -
>>  /* Check on PCI error function */
>>  if (func < EEH_ERR_FUNC_MIN || func > EEH_ERR_FUNC_MAX)
>>  return -EINVAL;
>> @@ -1851,6 +1847,11 @@ static const struct file_operations 
>> eeh_dev_break_fops = {
>>  .read   = eeh_debugfs_dev_usage,
>>  };
>>  
>> +int eeh_pe_inject_mmio_error(struct pci_dev *pdev)
>> +{
>> +return eeh_debugfs_break_device(pdev);
>> +}
>> +
>
> The new function, as the context suggests, is only compiled if 
> CONFIG_DEBUG_FS=y.
> However, it is called unconditionally. With CONFIG_DEBUG_FS=n, this results in
>
> powerpc64-linux-ld: arch/powerpc/platforms/pseries/eeh_pseries.o: in function 
> `pseries_eeh_err_inject':
> /opt/buildbot/slave/qemu-ppc64/build/arch/powerpc/platforms/pseries/eeh_pseries.c:814:(.text+0x554):
>  undefined reference to `eeh_pe_inject_mmio_error'
> make[3]: *** 
> [/opt/buildbot/slave/qemu-ppc64/build/scripts/Makefile.vmlinux:34: vmlinux] 
> Error 1
> make[2]: *** [/opt/buildbot/slave/qemu-ppc64/build/Makefile:1157: vmlinux] 
> Error 2
>
> I'll enable CONFIG_DEBUG_FS in my tests and won't report this further,
> but you might want to consider fixing the problem at some point.
>

Yes, this is fixed and picked up in powerpc tree.

https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git/commit/?h=merge&id=3af2e2f68cc6baf0a11f662d30b0bf981f77bfea

-ritesh

Re: [PATCH v2] powerpc/xive: Use cpumask_intersects()

2024-09-27 Thread IBM

Costa Shulyupin  writes:

> Replace `cpumask_any_and(a, b) >= nr_cpu_ids`
> with the more readable `!cpumask_intersects(a, b)`.
>
> Comparison between cpumask_any_and() and cpumask_intersects()
>
> The cpumask_any_and() function expands using FIND_FIRST_BIT(),
> resulting in a loop that iterates through each bit of the bitmask:
>
> for (idx = 0; idx * BITS_PER_LONG < sz; idx++) {
>   val = (FETCH);
>   if (val) {
>   sz = min(idx * BITS_PER_LONG + __ffs(MUNGE(val)), sz);
>   break;
>   }
> }
>
> The cpumask_intersects() function expands using __bitmap_intersects(),
> resulting in that the first loop iterates through each long word of the 
> bitmask,
> and the second through each bit within a long word:
>
> unsigned int k, lim = bits/BITS_PER_LONG;
> for (k = 0; k < lim; ++k)
>   if (bitmap1[k] & bitmap2[k])
>   return true;
>
> if (bits % BITS_PER_LONG)
>   if ((bitmap1[k] & bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits))
>   return true;
>
> Conclusion: cpumask_intersects() is at least as efficient as 
> cpumask_any_and(),
> if not more so, as it typically performs fewer loops and comparisons.
>

I agree with the analysis in above. cpumask_any_and() has to get the
first set bit from the two cpumask for which it also does some
additional calculations like __ffs().

whereas cpumask_intersects() has to only check if any of the bits is set
hence does fewer operations.


Looks good to me. Please feel free to add - 

Reviewed-by: Ritesh Harjani (IBM) 


> Signed-off-by: Costa Shulyupin 
> Reviewed-by: Ming Lei 
>
> ---
>
> v2: add comparison between cpumask_any_and() and cpumask_intersects()
>
> ---
>  arch/powerpc/sysdev/xive/common.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/sysdev/xive/common.c 
> b/arch/powerpc/sysdev/xive/common.c
> index fa01818c1972c..a6c388bdf5d08 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -726,7 +726,7 @@ static int xive_irq_set_affinity(struct irq_data *d,
>   pr_debug("%s: irq %d/0x%x\n", __func__, d->irq, hw_irq);
>  
>   /* Is this valid ? */
> - if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids)
> + if (!cpumask_intersects(cpumask, cpu_online_mask))
>   return -EINVAL;
>  
>   /*
> -- 
> 2.45.0

Re: [PATCH] selftests/powerpc: Rm the unnecessary remove function.

2024-09-27 Thread IBM

zhangjiao2  writes:

> From: zhang jiao 
>
> Path is not initialized before use,
> remove the unnecessary remove function.
>
> Signed-off-by: zhang jiao 
> ---
>  tools/testing/selftests/powerpc/mm/tlbie_test.c | 1 -
>  1 file changed, 1 deletion(-)
>
> diff --git a/tools/testing/selftests/powerpc/mm/tlbie_test.c 
> b/tools/testing/selftests/powerpc/mm/tlbie_test.c
> index 48344a74b212..fd1456d16a7d 100644
> --- a/tools/testing/selftests/powerpc/mm/tlbie_test.c
> +++ b/tools/testing/selftests/powerpc/mm/tlbie_test.c
> @@ -314,7 +314,6 @@ static inline void end_verification_log(unsigned int tid, 
> unsigned nr_anamolies)
>   fclose(f);
>  
>   if (nr_anamolies == 0) {
> - remove(path);
>   return;
>   }

Nice catch. Indeed the path is uninitialized here. 

However, I believe the above "if" block should come after initializing
the path. The idea is if there were no anamolies noted, then we can
simply remove the log file and return.

Something like below. Thoughts?

diff --git a/tools/testing/selftests/powerpc/mm/tlbie_test.c 
b/tools/testing/selftests/powerpc/mm/tlbie_test.c
index 48344a74b212..35f0098399cc 100644
--- a/tools/testing/selftests/powerpc/mm/tlbie_test.c
+++ b/tools/testing/selftests/powerpc/mm/tlbie_test.c
@@ -313,16 +313,16 @@ static inline void end_verification_log(unsigned int tid, 
unsigned nr_anamolies)

fclose(f);

-   if (nr_anamolies == 0) {
-   remove(path);
-   return;
-   }
-
sprintf(logfile, logfilename, tid);
strcpy(path, logdir);
strcat(path, separator);
strcat(path, logfile);

+   if (nr_anamolies == 0) {
+   remove(path);
+   return;
+   }
+
printf("Thread %02d chunk has %d corrupted words. For details check 
%s\n",
tid, nr_anamolies, path);
 }


-ritesh

Re: [PATCH] powerpc: remove dead config options for MPC85xx platform support

2024-09-27 Thread IBM

Lukas Bulwahn  writes:

> From: Lukas Bulwahn 
>
> Commit 384e338a9187 ("powerpc: drop MPC8540_ADS and MPC8560_ADS platform
> support") and commit b751ed04bc5e ("powerpc: drop MPC85xx_CDS platform
> support") removes the platform support for MPC8540_ADS, MPC8560_ADS and
> MPC85xx_CDS in the source tree, but misses to remove the config options in
> the Kconfig file. Hence, these three config options are without any effect
> since then.
>
> Drop these three dead config options.
>

Indeed these looks to be dead config remaining.

> Fixes: 384e338a9187 ("powerpc: drop MPC8540_ADS and MPC8560_ADS platform 
> support")
> Fixes: b751ed04bc5e ("powerpc: drop MPC85xx_CDS platform support")
> Signed-off-by: Lukas Bulwahn 
> ---
>  arch/powerpc/platforms/85xx/Kconfig | 21 -
>  1 file changed, 21 deletions(-)

I couldn't find any relevant reference of MPC8540_ADS, MPC8560_ADS or 
MPC85xx_CDS
after this patch

So please feel free to add - 

Reviewed-by: Ritesh Harjani (IBM) 

>
> diff --git a/arch/powerpc/platforms/85xx/Kconfig 
> b/arch/powerpc/platforms/85xx/Kconfig
> index 9315a3b69d6d..604c1b4b6d45 100644
> --- a/arch/powerpc/platforms/85xx/Kconfig
> +++ b/arch/powerpc/platforms/85xx/Kconfig
> @@ -40,27 +40,6 @@ config BSC9132_QDS
> and dual StarCore SC3850 DSP cores.
> Manufacturer : Freescale Semiconductor, Inc
>  
> -config MPC8540_ADS
> - bool "Freescale MPC8540 ADS"
> - select DEFAULT_UIMAGE
> - help
> -   This option enables support for the MPC 8540 ADS board
> -
> -config MPC8560_ADS
> - bool "Freescale MPC8560 ADS"
> - select DEFAULT_UIMAGE
> - select CPM2
> - help
> -   This option enables support for the MPC 8560 ADS board
> -
> -config MPC85xx_CDS
> - bool "Freescale MPC85xx CDS"
> - select DEFAULT_UIMAGE
> - select PPC_I8259
> - select HAVE_RAPIDIO
> - help
> -   This option enables support for the MPC85xx CDS board
> -
>  config MPC85xx_MDS
>   bool "Freescale MPC8568 MDS / MPC8569 MDS / P1021 MDS"
>   select DEFAULT_UIMAGE
> -- 
> 2.46.1

Re: [PATCH] powerpc/pseries/eeh: move pseries_eeh_err_inject() outside CONFIG_DEBUG_FS block

2024-09-17 Thread IBM

Narayana Murty N  writes:

> Makes pseries_eeh_err_inject() available even when debugfs
> is disabled (CONFIG_DEBUG_FS=n). It moves eeh_debugfs_break_device()
> and eeh_pe_inject_mmio_error() out of the CONFIG_DEBUG_FS block
> and renames it as eeh_break_device().
>
> Reported-by: kernel test robot 
> Closes: 
> https://lore.kernel.org/oe-kbuild-all/202409170509.vwc6jadc-...@intel.com/
> Fixes: b0e2b828dfca ("powerpc/pseries/eeh: Fix pseries_eeh_err_inject")
> Signed-off-by: Narayana Murty N 
> ---
>  arch/powerpc/kernel/eeh.c | 198 +++---
>  1 file changed, 99 insertions(+), 99 deletions(-)

Ok, so in your original patch you implemented eeh_inject ops for pseries
using mmio based eeh error injection (eeh_pe_inject_mmio_error()), which
uses the functions defined under debugfs -> eeh_debugfs_break_device(). 

This was failing when CONFIG_DEBUGFS is not defined, thus referring to
undefined function definition. 

Minor nit below.

>
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index 49ab11a287a3..0fe25e907ea6 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -1574,6 +1574,104 @@ static int proc_eeh_show(struct seq_file *m, void *v)
>  }
>  #endif /* CONFIG_PROC_FS */
>  
> +static int eeh_break_device(struct pci_dev *pdev)
> +{
> + struct resource *bar = NULL;
> + void __iomem *mapped;
> + u16 old, bit;
> + int i, pos;
> +
> + /* Do we have an MMIO BAR to disable? */
> + for (i = 0; i <= PCI_STD_RESOURCE_END; i++) {
> + struct resource *r = &pdev->resource[i];
> +
> + if (!r->flags || !r->start)
> + continue;
> + if (r->flags & IORESOURCE_IO)
> + continue;
> + if (r->flags & IORESOURCE_UNSET)
> + continue;
> +
> + bar = r;
> + break;
> + }
> +
> + if (!bar) {
> + pci_err(pdev, "Unable to find Memory BAR to cause EEH with\n");
> + return -ENXIO;
> + }
> +
> + pci_err(pdev, "Going to break: %pR\n", bar);
> +
> + if (pdev->is_virtfn) {
> +#ifndef CONFIG_PCI_IOV
> + return -ENXIO;
> +#else
> + /*
> +  * VFs don't have a per-function COMMAND register, so the best
> +  * we can do is clear the Memory Space Enable bit in the PF's
> +  * SRIOV control reg.
> +  *
> +  * Unfortunately, this requires that we have a PF (i.e doesn't
> +  * work for a passed-through VF) and it has the potential side
> +  * effect of also causing an EEH on every other VF under the
> +  * PF. Oh well.
> +  */
> + pdev = pdev->physfn;
> + if (!pdev)
> + return -ENXIO; /* passed through VFs have no PF */
> +
> + pos  = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
> + pos += PCI_SRIOV_CTRL;
> + bit  = PCI_SRIOV_CTRL_MSE;
> +#endif /* !CONFIG_PCI_IOV */
> + } else {
> + bit = PCI_COMMAND_MEMORY;
> + pos = PCI_COMMAND;
> + }
> +
> + /*
> +  * Process here is:
> +  *
> +  * 1. Disable Memory space.
> +  *
> +  * 2. Perform an MMIO to the device. This should result in an error
> +  *(CA  / UR) being raised by the device which results in an EEH
> +  *PE freeze. Using the in_8() accessor skips the eeh detection hook
> +  *so the freeze hook so the EEH Detection machinery won't be
> +  *triggered here. This is to match the usual behaviour of EEH
> +  *where the HW will asynchronously freeze a PE and it's up to
> +  *the kernel to notice and deal with it.
> +  *
> +  * 3. Turn Memory space back on. This is more important for VFs
> +  *since recovery will probably fail if we don't. For normal
> +  *the COMMAND register is reset as a part of re-initialising
> +  *the device.
> +  *
> +  * Breaking stuff is the point so who cares if it's racy ;)
> +  */
> + pci_read_config_word(pdev, pos, &old);
> +
> + mapped = ioremap(bar->start, PAGE_SIZE);
> + if (!mapped) {
> + pci_err(pdev, "Unable to map MMIO BAR %pR\n", bar);
> + return -ENXIO;
> + }
> +
> + pci_write_config_word(pdev, pos, old & ~bit);
> + in_8(mapped);
> + pci_write_config_word(pdev, pos, old);
> +
> + iounmap(mapped);
> +
> + return 0;
> +}
> +
> +int eeh_pe_inject_mmio_error(struct pci_dev *pdev)
> +{
> + return eeh_break_device(pdev);
> +}
> +

Why have an extra eeh_pe_inject_mmio_error() function which only calls
eeh_break_device()?

Maybe we can rename eeh_break_device() to eeh_mmio_break_device() and use
this function itself at both call sites?

-ritesh

Re: [RFC v2 02/13] powerpc: mm: Fix kfence page fault reporting

2024-09-18 Thread IBM

Christophe Leroy  writes:

> Le 19/09/2024 à 04:56, Ritesh Harjani (IBM) a écrit :
>> copy_from_kernel_nofault() can be called when doing read of /proc/kcore.
>> /proc/kcore can have some unmapped kfence objects which when read via
>> copy_from_kernel_nofault() can cause page faults. Since *_nofault()
>> functions define their own fixup table for handling fault, use that
>> instead of asking kfence to handle such faults.
>> 
>> Hence we search the exception tables for the nip which generated the
>> fault. If there is an entry then we let the fixup table handler handle the
>> page fault by returning an error from within ___do_page_fault().
>
> Searching the exception table is a heavy operation and all has been done 
> in the past to minimise the number of times it is called, see for 
> instance commit cbd7e6ca0210 ("powerpc/fault: Avoid heavy 
> search_exception_tables() verification")

This should not cause latency in user page fault paths. We call
search_exception_tables() only when there is a page fault for kernel
address (which isn't that common right) which otherwise kfence will handle.

>
> Also, by trying to hide false positives you also hide real ones. For 

I believe these should be false negatives. If kernel functions provides an
exception table to handle such a fault, then shouldn't it be handled via
fixup table provided rather then via kfence?

> instance if csum_partial_copy_generic() is using a kfence protected 
> area, it will now go undetected.

I can go and look into usages of csum_partial_copy_generic(). But can
you please expand more here on what you meant? 

... so if a fault occurs for above case, this patch will just let the
fixup table handle that fault rather than kfence reporting it and
returning 0.

The issue we see here is when unmapped kfence addresses get accessed via
*_nofault() variants which causes kfence to report a false negative
(this happens when we use read /proc/kcore or tools like perf read that)

This is because as per my understanding copy_from_kernel_nofault()
should return -EFAULT from it's fixup table if a fault occurs...
whereas with kfence it will report the warning and will return 0 after
kfence handled the fault.

I see other archs too calling fixup_table() in their fault handling
routine before allowing kfence to handle the fault. 

>
> IIUC, here your problem is limited to copy_from_kernel_nofault(). You 
> should handle the root cause, not its effects. For that, you could 
> perform additional verifications in copy_from_kernel_nofault_allowed().

Sorry, why make copy_from_kernel_nofault() as a special case for powerpc?
I don't see any other arch making copy_from_kernel_nofault() as a
special case. Shouldn't Kernel faults be handled via fixup_table(), if
it is supplied, before kfence handling it?
(maybe I am missing something)

-ritesh

Re: [PATCH v3 2/3] powerpc/pseries: Export hardware trace macro dump via debugfs

2024-09-20 Thread IBM

Madhavan Srinivasan  writes:

> This patch adds debugfs interface to export Hardware Trace Macro (HTM)
> function data in a LPAR. New hypervisor call "H_HTM" has been
> defined to setup, configure, control and dump the HTM data.
> This patch supports only dumping of HTM data in a LPAR.
> New debugfs folder called "htmdump" has been added under
> /sys/kernel/debug/arch path which contains files need to
> pass required parameters for the H_HTM dump function. New Kconfig
> option called "CONFIG_HTMDUMP" has been in platform/pseries for the same.
>
> With this module loaded, list of files in debugfs path
>
> /sys/kernel/debug/powerpc/htmdump
> coreindexonchip  htmtype  nodalchipindex  nodeindex  trace
>
> Signed-off-by: Madhavan Srinivasan 
> ---
> Changelog v2:
> - Made driver as modules based on review comments
> Changelog v1:
> - Changed from tristate to bool with dependency flags
> - Trimmed the include headers
>
>  arch/powerpc/platforms/pseries/Kconfig   |   9 ++
>  arch/powerpc/platforms/pseries/Makefile  |   1 +
>  arch/powerpc/platforms/pseries/htmdump.c | 130 +++
>  3 files changed, 140 insertions(+)
>  create mode 100644 arch/powerpc/platforms/pseries/htmdump.c
>
> diff --git a/arch/powerpc/platforms/pseries/Kconfig 
> b/arch/powerpc/platforms/pseries/Kconfig
> index afc0f6a61337..a66be66d690e 100644
> --- a/arch/powerpc/platforms/pseries/Kconfig
> +++ b/arch/powerpc/platforms/pseries/Kconfig
> @@ -128,6 +128,15 @@ config CMM
> will be reused for other LPARs. The interface allows firmware to
> balance memory across many LPARs.
>  
> +config HTMDUMP
> + tristate "PHYP HTM data dumper"
> + depends on PPC_PSERIES && DEBUG_FS
> + default m
> + help
> +   Select this option, if you want to enable the kernel debugfs
> +   interface to dump the Hardware Trace Macro (HTM) function data
> +   in the LPAR.
> +
>  config HV_PERF_CTRS
>   bool "Hypervisor supplied PMU events (24x7 & GPCI)"
>   default y
> diff --git a/arch/powerpc/platforms/pseries/Makefile 
> b/arch/powerpc/platforms/pseries/Makefile
> index 7bf506f6b8c8..3f3e3492e436 100644
> --- a/arch/powerpc/platforms/pseries/Makefile
> +++ b/arch/powerpc/platforms/pseries/Makefile
> @@ -19,6 +19,7 @@ obj-$(CONFIG_HVC_CONSOLE)   += hvconsole.o
>  obj-$(CONFIG_HVCS)   += hvcserver.o
>  obj-$(CONFIG_HCALL_STATS)+= hvCall_inst.o
>  obj-$(CONFIG_CMM)+= cmm.o
> +obj-$(CONFIG_HTMDUMP)+= htmdump.o
>  obj-$(CONFIG_IO_EVENT_IRQ)   += io_event_irq.o
>  obj-$(CONFIG_LPARCFG)+= lparcfg.o
>  obj-$(CONFIG_IBMVIO) += vio.o
> diff --git a/arch/powerpc/platforms/pseries/htmdump.c 
> b/arch/powerpc/platforms/pseries/htmdump.c
> new file mode 100644
> index ..54c28525c4a7
> --- /dev/null
> +++ b/arch/powerpc/platforms/pseries/htmdump.c
> @@ -0,0 +1,130 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (C) IBM Corporation, 2024
> + */
> +
> +#define pr_fmt(fmt) "htmdump: " fmt
> +
> +#include 
> +
> +#include 
> +#include 
> +
> +/* This enables us to keep track of the memory removed from each node. */
> +struct htmdump_entry {
> + void *buf;
> + struct dentry *dir;
> + char name[16];
> +};
>

How does dir and name gets used?
It isn't that obvious, so maybe a comment will be gr8.

> +static u32 nodeindex;
> +static u32 nodalchipindex;
> +static u32 coreindexonchip;
> +static u32 htmtype;
> +static struct dentry *htmdump_debugfs_dir;
> +static struct htmdump_entry *ent;
> +
> +#define BUFFER_SIZE PAGE_SIZE
> +
> +static ssize_t htmdump_read(struct file *filp, char __user *ubuf,
> +  size_t count, loff_t *ppos)
> +{
> + struct htmdump_entry *ent = filp->private_data;
> + unsigned long page, read_size, available;
> + loff_t offset;
> + long rc;
> +
> + page = ALIGN_DOWN(*ppos, BUFFER_SIZE);
> + offset = (*ppos) % BUFFER_SIZE;
> +
> + rc = htm_get_dump_hardware(nodeindex, nodalchipindex, coreindexonchip,
> +htmtype, virt_to_phys(ent->buf), 
> BUFFER_SIZE, page);
> +
> + switch (rc) {
> + case H_SUCCESS:
> + case H_PARTIAL:
> + break;
> + case H_NOT_AVAILABLE:
> + return 0;

Minor nits for error returns here...

Is returning 0 correct here? Maybe it is (since 0 means no data read),
but wanted to confirm if we should return -ENODATA, or -ENODEV 
(not sure what does H_NOT_AVAILABLE here means)

#define ENODATA 61  /* No data avai

Re: [PATCH] powerpc/kvm: Fix typo in the kvm functions

2024-09-20 Thread IBM

Kajol Jain  writes:

> Fix typo in the following kvm function names from:
>
> kmvhv_counters_tracepoint_regfunc -> kvmhv_counters_tracepoint_regfunc
> kmvhv_counters_tracepoint_unregfunc -> kvmhv_counters_tracepoint_unregfunc

Gr8 spotting!
It took sometime to realize k[mv] and k[vm] is the change :) 

>
> Fixes: e1f288d2f9c6 ("KVM: PPC: Book3S HV nestedv2: Add support for reading 
> VPA counters for pseries guests")

Right. This commit added the registration and unregistration helpers
for TRACE_EVEN_FN_COND tracepoint which mainly collects the
observability stats for nested guest on pseries.

> Reported-by: Madhavan Srinivasan 
> Signed-off-by: Kajol Jain 
> ---
>  arch/powerpc/include/asm/kvm_book3s_64.h | 4 ++--
>  arch/powerpc/kvm/book3s_hv.c | 4 ++--
>  arch/powerpc/kvm/trace_hv.h  | 2 +-
>  3 files changed, 5 insertions(+), 5 deletions(-)

Not an expert in kvm area. But the change looks very straight forward to
me. Searching for "kmv" string in arch/powerpc/ after applying this
patch indeed resulted in zero hits. 

Please feel free to add - 

Reviewed-by: Ritesh Harjani (IBM)

Re: [RFC v2 03/13] book3s64/hash: Remove kfence support temporarily

2024-09-18 Thread IBM

Christophe Leroy  writes:

> Le 19/09/2024 à 04:56, Ritesh Harjani (IBM) a écrit :
>> Kfence on book3s Hash on pseries is anyways broken. It fails to boot
>> due to RMA size limitation. That is because, kfence with Hash uses
>> debug_pagealloc infrastructure. debug_pagealloc allocates linear map
>> for entire dram size instead of just kfence relevant objects.
>> This means for 16TB of DRAM it will require (16TB >> PAGE_SHIFT)
>> which is 256MB which is half of RMA region on P8.
>> crash kernel reserves 256MB and we also need 2048 * 16KB * 3 for
>> emergency stack and some more for paca allocations.
>> That means there is not enough memory for reserving the full linear map
>> in the RMA region, if the DRAM size is too big (>=16TB)
>> (The issue is seen above 8TB with crash kernel 256 MB reservation).
>> 
>> Now Kfence does not require linear memory map for entire DRAM.
>> It only needs for kfence objects. So this patch temporarily removes the
>> kfence functionality since debug_pagealloc code needs some refactoring.
>> We will bring in kfence on Hash support in later patches.
>> 
>> Signed-off-by: Ritesh Harjani (IBM) 
>> ---
>>   arch/powerpc/include/asm/kfence.h |  5 +
>>   arch/powerpc/mm/book3s64/hash_utils.c | 16 +++-
>>   2 files changed, 16 insertions(+), 5 deletions(-)
>> 
>> diff --git a/arch/powerpc/include/asm/kfence.h 
>> b/arch/powerpc/include/asm/kfence.h
>> index fab124ada1c7..f3a9476a71b3 100644
>> --- a/arch/powerpc/include/asm/kfence.h
>> +++ b/arch/powerpc/include/asm/kfence.h
>> @@ -10,6 +10,7 @@
>>   
>>   #include 
>>   #include 
>> +#include 
>>   
>>   #ifdef CONFIG_PPC64_ELF_ABI_V1
>>   #define ARCH_FUNC_PREFIX "."
>> @@ -25,6 +26,10 @@ static inline void disable_kfence(void)
>>   
>>   static inline bool arch_kfence_init_pool(void)
>>   {
>> +#ifdef CONFIG_PPC64
>> +if (!radix_enabled())
>
> No need for a #ifdef here, you can just do:
>
>   if (IS_ENABLED(CONFIG_PPC64) && !radix_enabled())
>   return false;
>
>

This special radix handling is anyway dropped in later pacthes. 
So I didn't bother changing it here.

>> +return false;
>> +#endif
>>  return !kfence_disabled;
>
> But why not just set kfence_disabled to true by calling disable_kfence() 
> from one of the powerpc init functions ?
>

This patch is only temporarily disabling kfence support for only Hash.
This special Hash handling gets removed in patch-10 which brings back
kfence support.

-ritesh

Re: [PATCH] powerpc/pseries/eeh: move pseries_eeh_err_inject() outside CONFIG_DEBUG_FS block

2024-09-19 Thread IBM

Vaibhav Jain  writes:

> Hi Ritesh,
>
> Thanks for looking into this patch. My responses your review inline
> below:
>
> Ritesh Harjani (IBM)  writes:
>
>> Narayana Murty N  writes:
>>
>>> Makes pseries_eeh_err_inject() available even when debugfs
>>> is disabled (CONFIG_DEBUG_FS=n). It moves eeh_debugfs_break_device()
>>> and eeh_pe_inject_mmio_error() out of the CONFIG_DEBUG_FS block
>>> and renames it as eeh_break_device().
>>>
>>> Reported-by: kernel test robot 
>>> Closes: 
>>> https://lore.kernel.org/oe-kbuild-all/202409170509.vwc6jadc-...@intel.com/
>>> Fixes: b0e2b828dfca ("powerpc/pseries/eeh: Fix pseries_eeh_err_inject")
>>> Signed-off-by: Narayana Murty N 
>>> ---
>>>  arch/powerpc/kernel/eeh.c | 198 +++---
>>>  1 file changed, 99 insertions(+), 99 deletions(-)
>>
>> Ok, so in your original patch you implemented eeh_inject ops for pseries
>> using mmio based eeh error injection (eeh_pe_inject_mmio_error()), which
>> uses the functions defined under debugfs -> eeh_debugfs_break_device(). 
>>
>> This was failing when CONFIG_DEBUGFS is not defined, thus referring to
>> undefined function definition. 
>>
>> Minor nit below.
>>
>>>
>>> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
>>> index 49ab11a287a3..0fe25e907ea6 100644
>>> --- a/arch/powerpc/kernel/eeh.c
>>> +++ b/arch/powerpc/kernel/eeh.c
>>> @@ -1574,6 +1574,104 @@ static int proc_eeh_show(struct seq_file *m, void 
>>> *v)
>>>  }
>>>  #endif /* CONFIG_PROC_FS */
>>>  
>>> +static int eeh_break_device(struct pci_dev *pdev)
>>> +{
>>> +   struct resource *bar = NULL;
>>> +   void __iomem *mapped;
>>> +   u16 old, bit;
>>> +   int i, pos;
>>> +
>>> +   /* Do we have an MMIO BAR to disable? */
>>> +   for (i = 0; i <= PCI_STD_RESOURCE_END; i++) {
>>> +   struct resource *r = &pdev->resource[i];
>>> +
>>> +   if (!r->flags || !r->start)
>>> +   continue;
>>> +   if (r->flags & IORESOURCE_IO)
>>> +   continue;
>>> +   if (r->flags & IORESOURCE_UNSET)
>>> +   continue;
>>> +
>>> +   bar = r;
>>> +   break;
>>> +   }
>>> +
>>> +   if (!bar) {
>>> +   pci_err(pdev, "Unable to find Memory BAR to cause EEH with\n");
>>> +   return -ENXIO;
>>> +   }
>>> +
>>> +   pci_err(pdev, "Going to break: %pR\n", bar);
>>> +
>>> +   if (pdev->is_virtfn) {
>>> +#ifndef CONFIG_PCI_IOV
>>> +   return -ENXIO;
>>> +#else
>>> +   /*
>>> +* VFs don't have a per-function COMMAND register, so the best
>>> +* we can do is clear the Memory Space Enable bit in the PF's
>>> +* SRIOV control reg.
>>> +*
>>> +* Unfortunately, this requires that we have a PF (i.e doesn't
>>> +* work for a passed-through VF) and it has the potential side
>>> +* effect of also causing an EEH on every other VF under the
>>> +* PF. Oh well.
>>> +*/
>>> +   pdev = pdev->physfn;
>>> +   if (!pdev)
>>> +   return -ENXIO; /* passed through VFs have no PF */
>>> +
>>> +   pos  = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
>>> +   pos += PCI_SRIOV_CTRL;
>>> +   bit  = PCI_SRIOV_CTRL_MSE;
>>> +#endif /* !CONFIG_PCI_IOV */
>>> +   } else {
>>> +   bit = PCI_COMMAND_MEMORY;
>>> +   pos = PCI_COMMAND;
>>> +   }
>>> +
>>> +   /*
>>> +* Process here is:
>>> +*
>>> +* 1. Disable Memory space.
>>> +*
>>> +* 2. Perform an MMIO to the device. This should result in an error
>>> +*(CA  / UR) being raised by the device which results in an EEH
>>> +*PE freeze. Using the in_8() accessor skips the eeh detection hook
>>> +*so the freeze hook so the EEH Detection machinery won't be
>>> +*triggered here. This is to match the usual behaviour of EEH
>>> +*where the HW will asynchronously freeze a PE and it's up to
>>>

Re: [RFC v2 01/13] mm/kfence: Add a new kunit test test_use_after_free_read_nofault()

2024-10-02 Thread IBM



Hello Kasan/kfence-devs, 

Wanted your inputs on this kfence kunit test [PATCH-1] and it's respective
powerpc fix [Patch-2]. The commit msgs has a good description of it. I
see that the same problem was noticed on s390 as well [1] a while ago.
So that makes me believe that maybe we should have a kunit test for the
same to make sure all architectures handles this properly. 

Thoughts?

[1]: https://lore.kernel.org/all/20230213183858.1473681-1-...@linux.ibm.com/

-ritesh


"Ritesh Harjani (IBM)"  writes:

> From: Nirjhar Roy 
>
> Faults from copy_from_kernel_nofault() needs to be handled by fixup
> table and should not be handled by kfence. Otherwise while reading
> /proc/kcore which uses copy_from_kernel_nofault(), kfence can generate
> false negatives. This can happen when /proc/kcore ends up reading an
> unmapped address from kfence pool.
>
> Let's add a testcase to cover this case.
>
> Co-developed-by: Ritesh Harjani (IBM) 
> Signed-off-by: Ritesh Harjani (IBM) 
> Signed-off-by: Nirjhar Roy 
> Cc: kasan-...@googlegroups.com
> Cc: Alexander Potapenko 
> Cc: linux...@kvack.org
> ---
>  mm/kfence/kfence_test.c | 17 +
>  1 file changed, 17 insertions(+)
>
> diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
> index 00fd17285285..f65fb182466d 100644
> --- a/mm/kfence/kfence_test.c
> +++ b/mm/kfence/kfence_test.c
> @@ -383,6 +383,22 @@ static void test_use_after_free_read(struct kunit *test)
>   KUNIT_EXPECT_TRUE(test, report_matches(&expect));
>  }
>
> +static void test_use_after_free_read_nofault(struct kunit *test)
> +{
> + const size_t size = 32;
> + char *addr;
> + char dst;
> + int ret;
> +
> + setup_test_cache(test, size, 0, NULL);
> + addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
> + test_free(addr);
> + /* Use after free with *_nofault() */
> + ret = copy_from_kernel_nofault(&dst, addr, 1);
> + KUNIT_EXPECT_EQ(test, ret, -EFAULT);
> + KUNIT_EXPECT_FALSE(test, report_available());
> +}
> +
>  static void test_double_free(struct kunit *test)
>  {
>   const size_t size = 32;
> @@ -780,6 +796,7 @@ static struct kunit_case kfence_test_cases[] = {
>   KFENCE_KUNIT_CASE(test_out_of_bounds_read),
>   KFENCE_KUNIT_CASE(test_out_of_bounds_write),
>   KFENCE_KUNIT_CASE(test_use_after_free_read),
> + KFENCE_KUNIT_CASE(test_use_after_free_read_nofault),
>   KFENCE_KUNIT_CASE(test_double_free),
>   KFENCE_KUNIT_CASE(test_invalid_addr_free),
>   KFENCE_KUNIT_CASE(test_corruption),
> --
> 2.46.0

Re: [PATCH] selftests/mount_setattr: Fix failures on 64K PAGE_SIZE kernels

2024-11-16 Thread IBM

Michael Ellerman  writes:

> Currently the mount_setattr_test fails on machines with a 64K PAGE_SIZE,
> with errors such as:
>
>   #  RUN   mount_setattr_idmapped.invalid_fd_negative ...
>   mkfs.ext4: No space left on device while writing out and closing file system
>   # mount_setattr_test.c:1055:invalid_fd_negative:Expected system("mkfs.ext4 
> -q /mnt/C/ext4.img") (256) == 0 (0)
>   # invalid_fd_negative: Test terminated by assertion
>   #  FAIL  mount_setattr_idmapped.invalid_fd_negative
>   not ok 12 mount_setattr_idmapped.invalid_fd_negative
>
> The code creates a 100,000 byte tmpfs:
>
>   ASSERT_EQ(mount("testing", "/mnt", "tmpfs", MS_NOATIME | MS_NODEV,
>   "size=10,mode=700"), 0);
>
> And then a little later creates a 2MB ext4 filesystem in that tmpfs:
>
>   ASSERT_EQ(ftruncate(img_fd, 1024 * 2048), 0);
>   ASSERT_EQ(system("mkfs.ext4 -q /mnt/C/ext4.img"), 0);
>
> At first glance it seems like that should never work, after all 2MB is
> larger than 100,000 bytes. However the filesystem image doesn't actually
> occupy 2MB on "disk" (actually RAM, due to tmpfs). On 4K kernels the
> ext4.img uses ~84KB of actual space (according to du), which just fits.
>
> However on 64K PAGE_SIZE kernels the ext4.img takes at least 256KB,
> which is too large to fit in the tmpfs, hence the errors.

Even though ext4 will use by default 4k blocksize on both kernels but
the backing file here belongs to tmpfs (RAM) which operates at 64k page.
Hence the difference in the size of the image between 4k and 64k kernels
looks expected.

# 64k kernel
du -sh /run/ext4
256K/run/ext4

# 4k kernel
du -sh /run/ext4
84K /run/ext4

>
> It seems fraught to rely on the ext4.img taking less space on disk than
> the allocated size, so instead create the tmpfs with a size of 2MB. With
> that all 21 tests pass on 64K PAGE_SIZE kernels.

That looks like the right thing to do here. 

Please feel free to add - 

Reviewed-by: Ritesh Harjani (IBM) 

>
> Fixes: 01eadc8dd96d ("tests: add mount_setattr() selftests")
> Signed-off-by: Michael Ellerman 
> ---
>  tools/testing/selftests/mount_setattr/mount_setattr_test.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c 
> b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
> index 68801e1a9ec2..70f65eb320a7 100644
> --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c
> +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c
> @@ -1026,7 +1026,7 @@ FIXTURE_SETUP(mount_setattr_idmapped)
>   "size=10,mode=700"), 0);
>  
>   ASSERT_EQ(mount("testing", "/mnt", "tmpfs", MS_NOATIME | MS_NODEV,
> - "size=10,mode=700"), 0);
> + "size=2m,mode=700"), 0);
>  
>   ASSERT_EQ(mkdir("/mnt/A", 0777), 0);
>  
> -- 
> 2.47.0

Re: [bug report from fstests] WARNING: CPU: 1 PID: 0 at arch/powerpc/mm/mmu_context.c:106 switch_mm_irqs_off+0x220/0x270

2024-11-16 Thread IBM

Zorro Lang  writes:

> Hi,
>
> Recently fstests generic/650 hit a kernel warning on ppc64le [1] with
> xfs (default mkfs option). My latest test on mainline linux v6.12-rc6+
> with HEAD=da4373fbcf006deda90e5e6a87c499e0ff747572 .

I tried this on KVM pseries machine type, but I am unable to hit it. 
Let me try that on an actual LPAR and confirm whether I can still hit
this or not. Then maybe we can see if we could get a git bisect log of it.

Thanks for reporting it.

-ritesh


>
> Thanks,
> Zorro
>
>
> [1]
> FSTYP -- xfs (debug)
> PLATFORM  -- Linux/ppc64le rdma-cert-03-lp10 6.12.0-rc6+ #1 SMP Sat Nov  
> 9 13:18:41 EST 2024
> MKFS_OPTIONS  -- -f -m 
> crc=1,finobt=1,rmapbt=1,reflink=1,inobtcount=1,bigtime=1 /dev/sda3
> MOUNT_OPTIONS -- -o context=system_u:object_r:root_t:s0 /dev/sda3 
> /mnt/xfstests/scratch
>
> generic/650   _check_dmesg: something found in dmesg (see 
> /var/lib/xfstests/results//generic/650.dmesg)
>
>
> HINT: You _MAY_ be missing kernel fix:
>   ecd49f7a36fb xfs: fix per-cpu CIL structure aggregation racing with 
> dying cpus
>
> Ran: generic/650
> Failures: generic/650
> Failed 1 of 1 tests
>
>
> # cat /var/lib/xfstests/results//generic/650.dmesg
> [16630.359077] run fstests generic/650 at 2024-11-09 18:03:21
> [16631.058519] [ cut here ]
> [16631.058531] WARNING: CPU: 1 PID: 0 at arch/powerpc/mm/mmu_context.c:106 
> switch_mm_irqs_off+0x220/0x270
> [16631.058542] Modules linked in: overlay dm_zero dm_log_writes dm_thin_pool 
> dm_persistent_data dm_bio_prison dm_snapshot dm_bufio ext4 mbcache jbd2 
> dm_flakey bonding tls rfkill sunrpc ibmveth pseries_rng vmx_crypto sg dm_mod 
> fuse loop nfnetlink xfs sd_mod nvme nvme_core ibmvscsi scsi_transport_srp 
> nvme_auth [last unloaded: scsi_debug]
> [16631.058617] CPU: 1 UID: 0 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G  
>   W  6.12.0-rc6+ #1
> [16631.058623] Tainted: [W]=WARN
> [16631.058625] Hardware name: IBM,9009-22G POWER9 (architected) 0x4e0203 
> 0xf05 of:IBM,FW950.11 (VL950_075) hv:phyp pSeries
> [16631.058629] NIP:  c00b02c0 LR: c00b0220 CTR: 
> c0152a20
> [16631.058633] REGS: c8bd7ad0 TRAP: 0700   Tainted: GW
>(6.12.0-rc6+)
> [16631.058637] MSR:  82823033   CR: 
> 2800440a  XER: 2004
> [16631.058660] CFAR: c00b0230 IRQMASK: 3 
>GPR00: c00b027c c8bd7d70 c2616800 
> c0016131a900 
>GPR04:  000a  
>  
>GPR08:   0001 
>  
>GPR12: c0152a20 cffcf300  
> 1ef31820 
>GPR16:    
>  
>GPR20:    
> 0001 
>GPR24: 0001 c89acb00 c4fafcb0 
>  
>GPR28: c4f24880 c0016131a900 0001 
> c4f25180 
> [16631.058740] NIP [c00b02c0] switch_mm_irqs_off+0x220/0x270
> [16631.058746] LR [c00b0220] switch_mm_irqs_off+0x180/0x270
> [16631.058751] Call Trace:
> [16631.058754] [c8bd7d70] [c00b027c] 
> switch_mm_irqs_off+0x1dc/0x270 (unreliable)
> [16631.058763] [c8bd7de0] [c02572f8] 
> idle_task_exit+0x118/0x1b0
> [16631.058771] [c8bd7e40] [c0152a70] 
> pseries_cpu_offline_self+0x50/0x150
> [16631.058780] [c8bd7eb0] [c0078678] 
> arch_cpu_idle_dead+0x68/0x7c
> [16631.058787] [c8bd7ee0] [c029f504] do_idle+0x1c4/0x290
> [16631.058793] [c8bd7f40] [c029fa90] 
> cpu_startup_entry+0x60/0x70
> [16631.058800] [c8bd7f70] [c007825c] 
> start_secondary+0x44c/0x480
> [16631.058807] [c8bd7fe0] [c000e258] 
> start_secondary_prolog+0x10/0x14
> [16631.058815] Code: 3884 387c00f8 487f7a51 6000 813c00f8 7129000a 
> 4182ff40 2c3d 4182ff38 7c0004ac 4bb8 4b30 <0fe0> 4b70 
> 6000 6000 
> [16631.058848] irq event stamp: 15169028
> [16631.058850] hardirqs last  enabled at (15169027): [] 
> tick_nohz_idle_exit+0x1c0/0x3b0
> [16631.058858] hardirqs last disabled at (15169028): [] 
> __schedule+0x5ac/0xc30
> [16631.058865] softirqs last  enabled at (15169016): [] 
> handle_softirqs+0x578/0x620
> [16631.058871] softirqs last disabled at (15168957): [] 
> do_softirq_own_stack+0x6c/0x90
> [16631.058878] ---[ end trace  ]---
> [16631.814920] NOHZ

Re: [PATCH] powerpc/book3s64/hugetlb: Fix disabling hugetlb when fadump is active

2024-12-16 Thread IBM

Sourabh Jain  writes:

> Commit 8597538712eb ("powerpc/fadump: Do not use hugepages when fadump
> is active") disabled hugetlb support when fadump is active by returning
> early from hugetlbpage_init():arch/powerpc/mm/hugetlbpage.c and not
> populating hpage_shift/HPAGE_SHIFT.
>
> Later, commit 2354ad252b66 ("powerpc/mm: Update default hugetlb size
> early") moved the allocation of hpage_shift/HPAGE_SHIFT to early boot,
> which inadvertently re-enabled hugetlb support when fadump is active.
>
> Fix this by implementing hugepages_supported() on powerpc. This ensures
> that disabling hugetlb for the fadump kernel is independent of
> hpage_shift/HPAGE_SHIFT.
>

Thanks for describing the history of the changes clearly.

> Fixes: 2354ad252b66 ("powerpc/mm: Update default hugetlb size early")
> CC: Aneesh Kumar K.V 
> CC: Hari Bathini 
> CC: Madhavan Srinivasan 
> Cc: Mahesh Salgaonkar 
> Cc: Michael Ellerman 
> CC: Ritesh Harjani (IBM) 
> Signed-off-by: Sourabh Jain 
> ---
>
> Note: Even with this fix included, it is possible to enable gigantic
> pages in the fadump kernel. IIUC, gigantic pages were never disabled
> for the fadump kernel.
>
> Currently, gigantic pages are allocated during early boot as long as
> the respective hstate is supported by the architecture.
>
> I will introduce some changes in the generic hugetlb code to allow the
> architecture to decide on supporting gigantic pages on the go. Bringing
> gigantic page allocation under hugepages_supported() does work for
> powerpc but I need verify the impact on other architectures.
>
> Regarding the Fixes tag: This patch fixes a bug inadvertently introduced
> by the commit mentioned under Fixes tag in the commit message. Feel free
> to remove the tag if it is unnecessary.
>
> ---
>  arch/powerpc/include/asm/hugetlb.h | 9 +
>  1 file changed, 9 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/hugetlb.h 
> b/arch/powerpc/include/asm/hugetlb.h
> index 18a3028ac3b6..f294e57663b0 100644
> --- a/arch/powerpc/include/asm/hugetlb.h
> +++ b/arch/powerpc/include/asm/hugetlb.h
> @@ -15,6 +15,15 @@
>  
>  extern bool hugetlb_disabled;
>  
> +static inline int hugepages_supported(void)
> +{
> + if (hugetlb_disabled)
> + return 0;
> +
> + return HPAGE_SHIFT != 0;
> +}
> +#define hugepages_supported hugepages_supported
> +

In include/linux/hugetlb.h

#ifndef hugepages_supported
/*
 * Some platform decide whether they support huge pages at boot
 * time. Some of them, such as powerpc, set HPAGE_SHIFT to 0
 * when there is no such support
 */
#define hugepages_supported() (HPAGE_SHIFT != 0)
#endif

The above comment is not entirely correct after this change 2354ad252b66
("powerpc/mm: Update default hugetlb size early), because we anyway go
ahead and initialize HPAGE_SHIFT even when hugetlb_disabled is true. But
nevertheless - we can fix the comment later. I see there are few other
cleanups which could be clubbed too.

fadump when the capture kernel is active would like to disable hugetlb page
allocation (to avoid OOMs) hence it uses hugetlb_disabled flag to mark
it disabled. As you correctly pointed out, the change in question moved
initialization of HPAGE_SHIFT to early boot as it was required to set the
pageblock_order properly (especially for radix 2M huge pagesize).

Now earlier generic hugepages_supported() was only checking for
HPAGE_SHIFT != 0. This patch will now check for both, hugetlb_disabled
should be false and HPAGE_SHIFT should not be 0. Only then hugetlb will
go and allocate hugepages in hugetlb_init().  

So, the change looks good to me. Thanks for catching and fixing that. 
I hope we can add a testcase to cover this scenario as the problematic
patch was added long ago - but we only noticed the problem now. Quick
qn, was this caught due to any OOM? Or was it an observation?

The patch looks good to me. So please feel free to add - 
Reviewed-by: Ritesh Harjani (IBM) 

-ritesh

Re: [PATCH] powerpc/book3s64/hugetlb: Fix disabling hugetlb when fadump is active

2024-12-16 Thread IBM

Ritesh Harjani (IBM)  writes:

> Sourabh Jain  writes:
>
>> Commit 8597538712eb ("powerpc/fadump: Do not use hugepages when fadump
>> is active") disabled hugetlb support when fadump is active by returning
>> early from hugetlbpage_init():arch/powerpc/mm/hugetlbpage.c and not
>> populating hpage_shift/HPAGE_SHIFT.
>>
>> Later, commit 2354ad252b66 ("powerpc/mm: Update default hugetlb size
>> early") moved the allocation of hpage_shift/HPAGE_SHIFT to early boot,
>> which inadvertently re-enabled hugetlb support when fadump is active.
>>
>> Fix this by implementing hugepages_supported() on powerpc. This ensures
>> that disabling hugetlb for the fadump kernel is independent of
>> hpage_shift/HPAGE_SHIFT.
>>
>
> Thanks for describing the history of the changes clearly.
>
>> Fixes: 2354ad252b66 ("powerpc/mm: Update default hugetlb size early")
>> CC: Aneesh Kumar K.V 
>> CC: Hari Bathini 
>> CC: Madhavan Srinivasan 
>> Cc: Mahesh Salgaonkar 
>> Cc: Michael Ellerman 
>> CC: Ritesh Harjani (IBM) 
>> Signed-off-by: Sourabh Jain 
>> ---
>>
>> Note: Even with this fix included, it is possible to enable gigantic
>> pages in the fadump kernel. IIUC, gigantic pages were never disabled
>> for the fadump kernel.
>>
>> Currently, gigantic pages are allocated during early boot as long as
>> the respective hstate is supported by the architecture.
>>
>> I will introduce some changes in the generic hugetlb code to allow the
>> architecture to decide on supporting gigantic pages on the go. Bringing
>> gigantic page allocation under hugepages_supported() does work for
>> powerpc but I need verify the impact on other architectures.
>>
>> Regarding the Fixes tag: This patch fixes a bug inadvertently introduced
>> by the commit mentioned under Fixes tag in the commit message. Feel free
>> to remove the tag if it is unnecessary.
>>
>> ---
>>  arch/powerpc/include/asm/hugetlb.h | 9 +
>>  1 file changed, 9 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/hugetlb.h 
>> b/arch/powerpc/include/asm/hugetlb.h
>> index 18a3028ac3b6..f294e57663b0 100644
>> --- a/arch/powerpc/include/asm/hugetlb.h
>> +++ b/arch/powerpc/include/asm/hugetlb.h
>> @@ -15,6 +15,15 @@
>>  
>>  extern bool hugetlb_disabled;
>>  
>> +static inline int hugepages_supported(void)

I guess we may as well make it's return type as bool.

>> +{
>> +if (hugetlb_disabled)
>> +return 0;
>> +
>> +return HPAGE_SHIFT != 0;
>> +}
>> +#define hugepages_supported hugepages_supported
>> +

-ritesh

Re: [PATCH 2/3] selftest/powerpc/ptrace/ptrace-pkey: Remove duplicate macros

2024-12-16 Thread IBM

Madhavan Srinivasan  writes:

> ptrace-pkey.c test has macros defined which
> are part of "pkeys.h" header file. Remove those
> duplicates and include "pkeys.h"
>
> Signed-off-by: Madhavan Srinivasan 
> ---
>  .../testing/selftests/powerpc/ptrace/ptrace-pkey.c | 14 +-
>  1 file changed, 1 insertion(+), 13 deletions(-)
>

Similar to previous patch. Cleanup looks good to me. 

Please feel free to add - 
Reviewed-by: Ritesh Harjani (IBM)  

-ritesh

Re: [PATCH 3/3] selftest/powerpc/ptrace: Cleanup duplicate macro definitions

2024-12-16 Thread IBM

Madhavan Srinivasan  writes:

> Both core-pkey.c and ptrace-pkey.c tests have similar macro
> definitions, move them to "pkeys.h" and remove the macro
> definitions from the C file.
>
> Signed-off-by: Madhavan Srinivasan 
> ---
>  tools/testing/selftests/powerpc/include/pkeys.h  | 8 
>  tools/testing/selftests/powerpc/ptrace/core-pkey.c   | 8 
>  tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c | 8 
>  3 files changed, 8 insertions(+), 16 deletions(-)
>
> diff --git a/tools/testing/selftests/powerpc/include/pkeys.h 
> b/tools/testing/selftests/powerpc/include/pkeys.h
> index 51729d9a7111..3a0129467de6 100644
> --- a/tools/testing/selftests/powerpc/include/pkeys.h
> +++ b/tools/testing/selftests/powerpc/include/pkeys.h
> @@ -35,10 +35,18 @@
>  #define __NR_pkey_alloc  384
>  #define __NR_pkey_free   385
>  
> +#ifndef NT_PPC_PKEY
> +#define NT_PPC_PKEY  0x110
> +#endif
> +
>  #define PKEY_BITS_PER_PKEY   2
>  #define NR_PKEYS 32
>  #define PKEY_BITS_MASK   ((1UL << PKEY_BITS_PER_PKEY) - 1)
>  
> +#define AMR_BITS_PER_PKEY 2
> +#define PKEY_REG_BITS (sizeof(u64) * 8)
> +#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY))
> +
>  inline unsigned long pkeyreg_get(void)
>  {
>   return mfspr(SPRN_AMR);
> diff --git a/tools/testing/selftests/powerpc/ptrace/core-pkey.c 
> b/tools/testing/selftests/powerpc/ptrace/core-pkey.c
> index 31c9bf6d95db..f8ff05e5bf6e 100644
> --- a/tools/testing/selftests/powerpc/ptrace/core-pkey.c
> +++ b/tools/testing/selftests/powerpc/ptrace/core-pkey.c
> @@ -18,18 +18,10 @@
>  #include "child.h"
>  #include "pkeys.h"
>  
> -#ifndef NT_PPC_PKEY
> -#define NT_PPC_PKEY  0x110
> -#endif
> -
>  #ifndef PKEY_DISABLE_EXECUTE
>  #define PKEY_DISABLE_EXECUTE 0x4
>  #endif

We could remove this as well right. Since pkeys.h already has this
permission defines i.e.
PKEY_DISABLE_[ACCESS|WRITE|EXECUTE] 

>  
> -#define AMR_BITS_PER_PKEY 2
> -#define PKEY_REG_BITS (sizeof(u64) * 8)
> -#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY))
> -
>  #define CORE_FILE_LIMIT  (5 * 1024 * 1024)   /* 5 MB should be 
> enough */
>  
>  static const char core_pattern_file[] = "/proc/sys/kernel/core_pattern";
> diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c 
> b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
> index 6893ed096457..5d528d0ea9d1 100644
> --- a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
> +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
> @@ -9,18 +9,10 @@
>  #include "child.h"
>  #include "pkeys.h"
>  
> -#ifndef NT_PPC_PKEY
> -#define NT_PPC_PKEY  0x110
> -#endif
> -
>  #ifndef PKEY_DISABLE_EXECUTE
>  #define PKEY_DISABLE_EXECUTE 0x4
>  #endif
>  

Same here. This can be cleaned up, no? Since pkeys already has this defined.


-ritesh

> -#define AMR_BITS_PER_PKEY 2
> -#define PKEY_REG_BITS (sizeof(u64) * 8)
> -#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY))
> -
>  static const char user_read[] = "[User Read (Running)]";
>  static const char user_write[] = "[User Write (Running)]";
>  static const char ptrace_read_running[] = "[Ptrace Read (Running)]";
> -- 
> 2.47.0

Re: [PATCH v2 3/3] selftest/powerpc/ptrace: Cleanup duplicate macro definitions

2024-12-16 Thread IBM

Madhavan Srinivasan  writes:

> Both core-pkey.c and ptrace-pkey.c tests have
> similar macro definitions, move them to "pkeys.h"
> and remove the macro definitions from the C file.
>
> Signed-off-by: Madhavan Srinivasan 
> ---
> Changelog v1:
>  - Removed additional macros pointed out by Ritesh
>which are duplicates and are avilable in "pkeys.h"

Thanks! The changes looks good to me. 

Please feel free to add - 
Reviewed-by: Ritesh Harjani (IBM) 


Gave a quick run on my lpar too - 

# selftests: powerpc/ptrace: core-pkey
# test: core_pkey
# [User Write (Running)] AMR: 3cff pkey1: 4 pkey2: 5 pkey3: 
6
# success: core_pkey
ok 9 selftests: powerpc/ptrace: core-pkey
# selftests: powerpc/ptrace: ptrace-pkey
# test: ptrace_pkey
# [User Write (Running)] AMR: 3cff pkey1: 4 pkey2: 5 pkey3: 
6
# success: ptrace_pkey
ok 13 selftests: powerpc/ptrace: ptrace-pkey


-ritesh

>
>  tools/testing/selftests/powerpc/include/pkeys.h  |  8 
>  tools/testing/selftests/powerpc/ptrace/core-pkey.c   | 12 
>  tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c | 12 
>  3 files changed, 8 insertions(+), 24 deletions(-)
>
> diff --git a/tools/testing/selftests/powerpc/include/pkeys.h 
> b/tools/testing/selftests/powerpc/include/pkeys.h
> index 51729d9a7111..3a0129467de6 100644
> --- a/tools/testing/selftests/powerpc/include/pkeys.h
> +++ b/tools/testing/selftests/powerpc/include/pkeys.h
> @@ -35,10 +35,18 @@
>  #define __NR_pkey_alloc  384
>  #define __NR_pkey_free   385
>  
> +#ifndef NT_PPC_PKEY
> +#define NT_PPC_PKEY  0x110
> +#endif
> +
>  #define PKEY_BITS_PER_PKEY   2
>  #define NR_PKEYS 32
>  #define PKEY_BITS_MASK   ((1UL << PKEY_BITS_PER_PKEY) - 1)
>  
> +#define AMR_BITS_PER_PKEY 2
> +#define PKEY_REG_BITS (sizeof(u64) * 8)
> +#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY))
> +
>  inline unsigned long pkeyreg_get(void)
>  {
>   return mfspr(SPRN_AMR);
> diff --git a/tools/testing/selftests/powerpc/ptrace/core-pkey.c 
> b/tools/testing/selftests/powerpc/ptrace/core-pkey.c
> index 31c9bf6d95db..f061434af452 100644
> --- a/tools/testing/selftests/powerpc/ptrace/core-pkey.c
> +++ b/tools/testing/selftests/powerpc/ptrace/core-pkey.c
> @@ -18,18 +18,6 @@
>  #include "child.h"
>  #include "pkeys.h"
>  
> -#ifndef NT_PPC_PKEY
> -#define NT_PPC_PKEY  0x110
> -#endif
> -
> -#ifndef PKEY_DISABLE_EXECUTE
> -#define PKEY_DISABLE_EXECUTE 0x4
> -#endif
> -
> -#define AMR_BITS_PER_PKEY 2
> -#define PKEY_REG_BITS (sizeof(u64) * 8)
> -#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY))
> -
>  #define CORE_FILE_LIMIT  (5 * 1024 * 1024)   /* 5 MB should be 
> enough */
>  
>  static const char core_pattern_file[] = "/proc/sys/kernel/core_pattern";
> diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c 
> b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
> index 6893ed096457..fc633014424f 100644
> --- a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
> +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
> @@ -9,18 +9,6 @@
>  #include "child.h"
>  #include "pkeys.h"
>  
> -#ifndef NT_PPC_PKEY
> -#define NT_PPC_PKEY  0x110
> -#endif
> -
> -#ifndef PKEY_DISABLE_EXECUTE
> -#define PKEY_DISABLE_EXECUTE 0x4
> -#endif
> -
> -#define AMR_BITS_PER_PKEY 2
> -#define PKEY_REG_BITS (sizeof(u64) * 8)
> -#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY))
> -
>  static const char user_read[] = "[User Read (Running)]";
>  static const char user_write[] = "[User Write (Running)]";
>  static const char ptrace_read_running[] = "[Ptrace Read (Running)]";
> -- 
> 2.47.0

Re: [PATCH] powerpc/pseries/eeh: Fix get PE state translation

2024-11-21 Thread IBM

Vaibhav Jain  writes:

> Hi Ritesh,
>
> Thanks for looking into this patch. My responses on behalf of Narayana
> below:
>
> "Ritesh Harjani (IBM)"  writes:
>
>> Narayana Murty N  writes:
>>
>>> The PE Reset State "0" obtained from RTAS calls
>>> ibm_read_slot_reset_[state|state2] indicates that
>>> the Reset is deactivated and the PE is not in the MMIO
>>> Stopped or DMA Stopped state.
>>>
>>> With PE Reset State "0", the MMIO and DMA is allowed for
>>> the PE.
>>
>> Looking at the PAPR spec - I do agree that it states the same. i.e.
>> The "0" Initial PE state means the "Not Reset", "Load/Store allowed" &
>> "DMA allowed" (Normal Operations). 
>>
>>> The function pseries_eeh_get_state() is currently
>>> not indicating that to the caller because of  which the
>>> drivers are unable to resume the MMIO and DMA activity.
>>
>> It's new to me, but could you help explain the user visible effect
>> of what gets broken. Since this looks like pseries_eeh_get_state() has
>> always been like this when it got first implemented.
>> Is there also a unit test somewhere which you are testing?
> Without this patch a userspace process performing VFIO EEH-Recovery wont
> get the correct indication that EEH recovery is completed. Test code at
> [2] has an example test case that uses VFIO to inject an EEH error on to
> a pci-device and then waits on it to reach 'EEH_PE_STATE_NORMAL' state
> . That state is never reached without this patch.
>
> [2] :
> https://github.com/nnmwebmin/vfio-ppc-tests/commit/006d8fdc41a4
>

Right. Thanks for helping with that test code. It's much clearer now. So
after the error inject and/or the PE hot reset, the PE is never reaching
it's normal state. That is due to this kernel bug in the pseries EEH
handling, where it fails to advertise the MMIO & DMA enabled capability
flag back to the caller. This therefore can cause the userspace VFIO
driver to incorrectly assume that MMIO/DMA operations cannot be done. 

>>
>> IIUC eeh_pe_get_state() was implemented[1] for supporting EEH for VFIO PCI
>> devices. i.e. the VFIO_EEH_PE_GET_STATE operation of VFIO EEH PE ioctl op
>> uses pseries_eeh_get_state() helper to query PE state on pseries LPAR.
>> So are you suggesting that EEH functionality for VFIO PCI device was
>> never enabled/tested before on pseries?
> VFIO-EEH had been broken for pseries for a quite some time and was
> recently fixed in kernel. So this issue was probably not discovered
> until recently when we started testing with userspace VFIO.
>

ohk right, then maybe we might have started testing it after the eeh
error inject op was implemented for pseries here [1].

[1]: 
https://lore.kernel.org/linuxppc-dev/20240909140220.529333-1-nnmli...@linux.ibm.com/#t

>>
>> [1]: 
>> https://lore.kernel.org/all/1402364517-28561-3-git-send-email-gws...@linux.vnet.ibm.com/
>>
>> Checking the powernv side of implementation I do see that it does
>> enables the EEH_STATE_[MMIO|DMA]_ENABLED flags in the result mask for
>> the callers. So doing the same for pseries eeh get state implementation
>> does look like the right thing to do here IMO.
>>
>>> The patch fixes that by reflecting what is actually allowed.
>>
>> You say this is "fixes" so I am also assuming you are also looking for
>> stable backports of this? If yes - could you please also add the "Fixes"
>> tag and cc stable?
> Yes, agree will re-send adding the fixes tag.
>

Yes and maybe let's also add some more context & information to the
commit message from this discussion.

-ritesh

>>
>> -ritesh
>>
>>>
>>> Signed-off-by: Narayana Murty N 
>>> ---
>>>  arch/powerpc/platforms/pseries/eeh_pseries.c | 6 --
>>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c 
>>> b/arch/powerpc/platforms/pseries/eeh_pseries.c
>>> index 1893f66371fa..b12ef382fec7 100644
>>> --- a/arch/powerpc/platforms/pseries/eeh_pseries.c
>>> +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
>>> @@ -580,8 +580,10 @@ static int pseries_eeh_get_state(struct eeh_pe *pe, 
>>> int *delay)
>>>  
>>> switch(rets[0]) {
>>> case 0:
>>> -   result = EEH_STATE_MMIO_ACTIVE |
>>> -EEH_STATE_DMA_ACTIVE;
>>> +   result = EEH_STATE_MMIO_ACTIVE  |
>>> +EEH_STATE_DMA_ACTIVE   |
>>> +EEH_STATE_MMIO_ENABLED |
>>> +EEH_STATE_DMA_ENABLED;
>>> break;
>>> case 1:
>>> result = EEH_STATE_RESET_ACTIVE |
>>> -- 
>>> 2.45.2
>>
>
> -- 
> Cheers
> ~ Vaibhav

Re: [RESEND PATCH] powerpc: Use str_on_off() helper in check_cache_coherency()

2024-12-22 Thread IBM

Thorsten Blum  writes:

> Remove hard-coded strings by using the str_on_off() helper function.
>
> Signed-off-by: Thorsten Blum 
> ---
>  arch/powerpc/kernel/setup-common.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/kernel/setup-common.c 
> b/arch/powerpc/kernel/setup-common.c
> index 6fa179448c33..f7d7a93f07fc 100644
> --- a/arch/powerpc/kernel/setup-common.c
> +++ b/arch/powerpc/kernel/setup-common.c
> @@ -834,8 +834,8 @@ static int __init check_cache_coherency(void)
>   if (devtree_coherency != KERNEL_COHERENCY) {
>   printk(KERN_ERR
>   "kernel coherency:%s != device tree_coherency:%s\n",
> - KERNEL_COHERENCY ? "on" : "off",
> - devtree_coherency ? "on" : "off");
> + str_on_off(KERNEL_COHERENCY),
> + str_on_off(devtree_coherency));
>   BUG();
>   }


Looks good to me. Please feel free to add - 
Reviewed-by: Ritesh Harjani (IBM) 

-ritesh

Re: [PATCH] powerpc/64s: Rewrite __real_pte() as a static inline

2025-01-11 Thread IBM

Christophe Leroy  writes:

> Rewrite __real_pte() as a static inline in order to avoid
> following warning/error when building with 4k page size:
>
> CC  arch/powerpc/mm/book3s64/hash_tlb.o
>   arch/powerpc/mm/book3s64/hash_tlb.c: In function 'hpte_need_flush':
>   arch/powerpc/mm/book3s64/hash_tlb.c:49:16: error: variable 'offset' set 
> but not used [-Werror=unused-but-set-variable]
>  49 | int i, offset;
> |^~
>   cc1: all warnings being treated as errors
>
> Reported-by: kernel test robot 
> Closes: 
> https://lore.kernel.org/oe-kbuild-all/202501081741.ayfwybsq-...@intel.com/

Great. Why not fix the other warning as well which is reported in above
link, which is...  

--
   arch/powerpc/mm/book3s64/hash_native.c: In function 
'native_flush_hash_range':
>> arch/powerpc/mm/book3s64/hash_native.c:786:29: warning: variable 'index' set 
>> but not used [-Wunused-but-set-variable]
 786 | unsigned long hash, index, hidx, shift, slot;
--

...similar to how we fixed this warning by making the macro as static
inline? That means something like this (not tested)?

-#define __rpte_to_hidx(r,index)(pte_val(__rpte_to_pte(r)) >> 
H_PAGE_F_GIX_SHIFT)
+static inline unsigned long __rpte_to_hidx(real_pte_t r, unsigned long index)
+{
+   return pte_val(__rpte_to_pte(r)) >> H_PAGE_F_GIX_SHIFT;
+}


-ritesh


> Fixes: ff31e105464d ("powerpc/mm/hash64: Store the slot information at the 
> right offset for hugetlb")
> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/include/asm/book3s/64/hash-4k.h | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h 
> b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> index c3efacab4b94..a7a68ba9c71b 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> @@ -77,7 +77,10 @@
>  /*
>   * With 4K page size the real_pte machinery is all nops.
>   */
> -#define __real_pte(e, p, o)  ((real_pte_t){(e)})
> +static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep, int offset)
> +{
> + return (real_pte_t){pte};
> +}
>  #define __rpte_to_pte(r) ((r).pte)
>  #define __rpte_to_hidx(r,index)  (pte_val(__rpte_to_pte(r)) >> 
> H_PAGE_F_GIX_SHIFT)
>  
> -- 
> 2.47.0

Re: [PATCH v2] powerpc/64s: Rewrite __real_pte() and __rpte_to_hidx() as static inline

2025-01-12 Thread IBM

Christophe Leroy  writes:

> Rewrite __real_pte() and __rpte_to_hidx() as static inline in order to
> avoid following warnings/errors when building with 4k page size:
>
> CC  arch/powerpc/mm/book3s64/hash_tlb.o
>   arch/powerpc/mm/book3s64/hash_tlb.c: In function 'hpte_need_flush':
>   arch/powerpc/mm/book3s64/hash_tlb.c:49:16: error: variable 'offset' set 
> but not used [-Werror=unused-but-set-variable]
>  49 | int i, offset;
> |^~
>
> CC  arch/powerpc/mm/book3s64/hash_native.o
>   arch/powerpc/mm/book3s64/hash_native.c: In function 
> 'native_flush_hash_range':
>   arch/powerpc/mm/book3s64/hash_native.c:782:29: error: variable 'index' 
> set but not used [-Werror=unused-but-set-variable]
> 782 | unsigned long hash, index, hidx, shift, slot;
> | ^
>
> Reported-by: kernel test robot 
> Closes: 
> https://lore.kernel.org/oe-kbuild-all/202501081741.ayfwybsq-...@intel.com/
> Fixes: ff31e105464d ("powerpc/mm/hash64: Store the slot information at the 
> right offset for hugetlb")
> Signed-off-by: Christophe Leroy 
> ---
> v2: Also inline __rpte_to_hidx() for the same reason

Thanks for addressing the other warning too in v2. I also tested the
changes on my system and this fixes both the reported warnings.

The changes looks good to me. Please feel free to add - 

Reviewed-by: Ritesh Harjani (IBM) 


> ---
>  arch/powerpc/include/asm/book3s/64/hash-4k.h | 12 ++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h 
> b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> index c3efacab4b94..aa90a048f319 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> @@ -77,9 +77,17 @@
>  /*
>   * With 4K page size the real_pte machinery is all nops.
>   */
> -#define __real_pte(e, p, o)  ((real_pte_t){(e)})
> +static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep, int offset)
> +{
> + return (real_pte_t){pte};
> +}
> +
>  #define __rpte_to_pte(r) ((r).pte)
> -#define __rpte_to_hidx(r,index)  (pte_val(__rpte_to_pte(r)) >> 
> H_PAGE_F_GIX_SHIFT)
> +
> +static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long 
> index)
> +{
> + return pte_val(__rpte_to_pte(rpte)) >> H_PAGE_F_GIX_SHIFT;
> +}
>  
>  #define pte_iterate_hashed_subpages(rpte, psize, va, index, shift)   \
>   do { \
> -- 
> 2.47.0

Re: [PATCH] fadump: Use str_yes_no() helper in fadump_show_config()

2025-01-13 Thread IBM

Thorsten Blum  writes:

> Remove hard-coded strings by using the str_yes_no() helper function.
>
> Signed-off-by: Thorsten Blum 
> ---
>  arch/powerpc/kernel/fadump.c | 6 ++
>  1 file changed, 2 insertions(+), 4 deletions(-)


In fadump.c file we have implicit include of string_choices.h i.e. 

include/linux/seq_file.h -> linux/string_helpers.h -> 
linux/string_choices.h 

Directly having string_choices include could be better. 
#include 

However no hard preferences. The patch functionally looks correct to me. 

Please feel free to add - 
Reviewed-by: Ritesh Harjani (IBM) 


>
> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
> index 4b371c738213..8c531533dd3e 100644
> --- a/arch/powerpc/kernel/fadump.c
> +++ b/arch/powerpc/kernel/fadump.c
> @@ -289,10 +289,8 @@ static void __init fadump_show_config(void)
>   if (!fw_dump.fadump_supported)
>   return;
>  
> - pr_debug("Fadump enabled: %s\n",
> - (fw_dump.fadump_enabled ? "yes" : "no"));
> - pr_debug("Dump Active   : %s\n",
> - (fw_dump.dump_active ? "yes" : "no"));
> + pr_debug("Fadump enabled: %s\n", 
> str_yes_no(fw_dump.fadump_enabled));
> + pr_debug("Dump Active   : %s\n", str_yes_no(fw_dump.dump_active));
>   pr_debug("Dump section sizes:\n");
>   pr_debug("CPU state data size: %lx\n", fw_dump.cpu_state_data_size);
>   pr_debug("HPTE region size   : %lx\n", fw_dump.hpte_region_size);
> -- 
> 2.47.1

Re: [RFC PATCH] powerpc: Add check to select PPC_RADIX_BROADCAST_TLBIE

2025-04-08 Thread IBM

Christophe Leroy  writes:

> Le 07/04/2025 à 21:10, Ritesh Harjani (IBM) a écrit :
>> Madhavan Srinivasan  writes:
>> 
>>> Commit 3d45a3d0d2e6 ("powerpc: Define config option for processors with 
>>> broadcast TLBIE")
>> 
>> We may need to add above to Fixes tag as well, no?
>> 
>>> added a config option PPC_RADIX_BROADCAST_TLBIE to support processors with
>>> broadcast TLBIE. Since this option is relevant only for RADIX_MMU, add
>>> a check as a dependency to enable PPC_RADIX_BROADCAST_TLBIE in both
>>> powernv and pseries configs. This fixes the unmet config dependency
>>> warning reported
>>>
>>> WARNING: unmet direct dependencies detected for 
>>> PPC_RADIX_BROADCAST_TLBIE
>>>   Depends on [n]: PPC_RADIX_MMU [=n]
>>>   Selected by [y]:
>>>   - PPC_PSERIES [=y] && PPC64 [=y] && PPC_BOOK3S [=y]
>>>
>>> Reported-by: kernel test robot 
>>> Closes: 
>>> https://lore.kernel.org/oe-kbuild-all/202504051857.jrqxm60c-...@intel.com/
>>> Signed-off-by: Madhavan Srinivasan 
>> 
>> It's a bit strange that even though PPC_RADIX_BROADCAST_TLBIE adds
>> PPC_RADIX_MMU as a dependency where is it defined, we still have to add
>> an extra check for the same dependency to enable this for any platform.
>
> That's expected, see 

Yes. I had figured that out. 

> https://docs.kernel.org/kbuild/kconfig-language.html#menu-attributes :

Ok! Good to see we have this properly documented as well. Thanks for
pointing out the documentation link.

>
> select should be used with care. select will force a symbol to a value 
> without visiting the dependencies. By abusing select you are able to 
> select a symbol FOO even if FOO depends on BAR that is not set. In 
> general use select only for non-visible symbols (no prompts anywhere) 
> and for symbols with no dependencies. That will limit the usefulness but 
> on the other hand avoid the illegal configurations all over.
>
> Christophe
>

Make sense. Thanks! for adding the details.

-ritesh

Re: kexec failing with KVM on Power8 baremetal host

2025-04-07 Thread IBM

Stefan Berger  writes:

> I bisected Linux between 6.13.0 and 6.12.0 due to failing kexec on a 
> Power8 baremetal host on 6.13.0:
>
> 8fec58f503b296af87ffca3898965e3054f2b616 is the first bad commit
> commit 8fec58f503b296af87ffca3898965e3054f2b616
> Author: Ritesh Harjani (IBM) 
> Date:   Fri Oct 18 22:59:50 2024 +0530
>
>  book3s64/hash: Add kfence functionality
>
>  Now that linear map functionality of debug_pagealloc is made generic,
>  enable kfence to use this generic infrastructure.
>
>  1. Define kfence related linear map variables.
> - u8 *linear_map_kf_hash_slots;
> - unsigned long linear_map_kf_hash_count;
> - DEFINE_RAW_SPINLOCK(linear_map_kf_hash_lock);
>  2. The linear map size allocated in RMA region is quite small
> (KFENCE_POOL_SIZE >> PAGE_SHIFT) which is 512 bytes by default.
>  3. kfence pool memory is reserved using memblock_phys_alloc() which has
> can come from anywhere.
> (default 255 objects => ((1+255) * 2) << PAGE_SHIFT = 32MB)
>  4. The hash slot information for kfence memory gets added in linear map
> in hash_linear_map_add_slot() (which also adds for debug_pagealloc).
>
>  Reported-by: Pavithra Prakash 
>  Signed-off-by: Ritesh Harjani (IBM) 
>  Signed-off-by: Michael Ellerman 
>  Link: 
> https://patch.msgid.link/5c2b61941b344077a2b8654dab46efa0322af3af.1729271995.git.ritesh.l...@gmail.com
>
>   arch/powerpc/include/asm/kfence.h |   5 ---
>   arch/powerpc/mm/book3s64/hash_utils.c | 162 
> +++--
>   2 files changed, 149 insertions(+), 18 deletions(-)
>
>
> Reverting part of this patch by applying the following changes to 6.13.0 
> resolves the issue:

Sorry for the delay in getting back to this, we have been going back and
forth on a few other work priorities. Nevertheless, Aboorva (cc'd)
helped in analyzing & has root caused the reported problem. Let me
summarize the findings. Aboorva, please add if I missed any details in
here:

1. The issue reported on this is not related to the above mentioned
patch. The issue can also be reproduced on v6.12 or older kernel where
we didn't have this series (with CONFIG_KFENCE=y).

2. The issue is happening during kexec_sequence(), i.e.
kexec_copy_flush() -> copy_segments -> copy_page(dest, addr). Note that
the dest address in copy_page() is obtained during kexec load time.

The root cause of the issue is - that the dest address in above
copy_page() function is falling into a kfence region, which is causing
the page fault.

Now, as per the kexec_sequence(), we are not supposed to take a page
fault during that path after the local_paca->data_offset is changed to
some poison value. We do disable the MMU for most other cases except
for Hash on Pseries. That is also the reason why the issue is only seen
on Hash on Pseries and not on Radix. 

On debugging further, we found that kexec on ppc tries to find the
memory region using "for_each_mem_range_rev()" rather than using
"for_each_free_mem_range_reverse()". i.e. 

- Ƒ __locate_mem_hole_top_down
  - Ƒ locate_mem_hole_top_down_ppc64
- Ƒ arch_kexec_locate_mem_hole
  - Ƒ kexec_add_buffer
- Ƒ kexec_purgatory_setup_kbuf
  - Ƒ kexec_load_purgatory
+ Ƒ elf64_load

IMO, it should be something like below diff... so that we could avoid using
a region which is in use by someone else e.g. kfence.

diff --git a/arch/powerpc/kexec/file_load_64.c 
b/arch/powerpc/kexec/file_load_64.c
index dc65c1391157..771b7dbaae0b 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -66,7 +66,7 @@ static int __locate_mem_hole_top_down(struct kexec_buf *kbuf,
phys_addr_t start, end;
u64 i;

-   for_each_mem_range_rev(i, &start, &end) {
+   for_each_free_mem_range_reverse(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, 
&end, NULL) {
/*
 * memblock uses [start, end) convention while it is
 * [start, end] here. Fix the off-by-one to have the
@@ -165,7 +165,7 @@ static int __locate_mem_hole_bottom_up(struct kexec_buf 
*kbuf,
phys_addr_t start, end;
u64 i;

-   for_each_mem_range(i, &start, &end) {
+   for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, 
NULL) {
/*
 * memblock uses [start, end) convention while it is
 * [start, end] here. Fix the off-by-one to have the

3. In the latest 6.15-rc1 release, it looks like these custom arch
specific functions "__locate_mem_hole_top_down()" etc. has been removed
[1] (For some other reason though).  Aboorva, also verified that the
issue is not seen on v6.15-rc1 anymore. 

[

Re: [BUG][powerpc] OOPs: Kernel access of bad area during zram swap write - kswapd0 crash

2025-04-20 Thread IBM



++ linux-mm

Misbah Anjum N  writes:

> Bug Description:
> When running Avocado-VT based functional tests on KVM guest, the system 
> encounters a
> kernel panic and crash during memory reclaim activity when zram is 
> actively used for
> swap. The crash occurs in the kswapd0 kernel thread during what appears 
> to be a write
> operation to zram.
>
>
> Steps to Reproduce:
> 1. Compile Upstream Kernel on LPAR
> 2. Compile Qemu, Libvirt for KVM Guest
> 3. Run Functional tests on KVM guest using Avocado-VT Regression Bucket
>  a. Clone: git clone https://github.com/lop-devops/tests.git
>  b. Setup: python3 avocado-setup.py --bootstrap --enable-kvm 
> --install-deps
>  c. Add guest in folder: tests/data/avocado-vt/images/
>  d. Run: python3 avocado-setup.py --run-suite guest_regression 
> --guest-os\
>   --only-filter 'virtio_scsi virtio_net qcow2'\
>  --no-download
>
> The bug is reproducible when Avocado-VT Regression bucket is executed 
> which
> consists of series of functional tp-libvirt tests performed on the KVM 
> guest in the
> following order: cpu, memory, network, storage and hotplug (disk, change 
> media,
> libvirt_mem), etc.
> Whilst execution, the system crashes during test:
> io-github-autotest-libvirt.libvirt_mem.positive_test.mem_basic.cold_plug_discard
> Note: This does not appear to be caused by a single test, but by 
> cumulative
> operations during the test sequence.
>
>
> Environment Details:
>  Kernel: 6.15.0-rc1-g521d54901f98
>  Reproducible with: 6.15.0-rc2-gf3a2e2a79c9d


Looks like the issue is happening on 6.15-rc2. Did git bisect revealed a
faulty commit?


>  Platform: IBM POWER10 LPAR (ppc64le)
>  Distro: Fedora42
>  RAM: 64GB
>  CPUs: 80
>  Qemu: 9.2.93 (v10.0.0-rc3-10-g8bdd3a0308)
>  Libvirt: 11.3.0
>
>
> System Memory State:
>  # free -mh
>  totalusedfree  shared  
> buff/cache   available
>  Mem:61Gi   3.0Gi25Gi11Mi
> 33Gi58Gi
>  Swap:  8.0Gi  0B   8.0Gi
>  # zramctl
>  NAME   ALGORITHM DISKSIZE DATA COMPR TOTAL STREAMS 
> MOUNTPOINT
>  /dev/zram0 lzo-rle 8G  64K  222B  128K [SWAP]
>  # swapon --show
>  NAME   TYPE  SIZE USED PRIO
>  /dev/zram0 partition   8G   0B  100
>
>
> Call Trace:
> [180060.602200] BUG: Unable to handle kernel data access on read at 
> 0xc0080a1b
> [180060.602219] Faulting instruction address: 0xc0175670
> [180060.602224] Oops: Kernel access of bad area, sig: 11 [#1]
> [180060.602227] LE PAGE_SIZE=64K MMU=Radix  SMP NR_CPUS=2048 NUMA 
> pSeries
> [180060.602232] Modules linked in: dm_thin_pool dm_persistent_data 
> vmw_vsock_virtio_transport_common vsock zram xfs dm_service_time sd_mod 
> [180060.602345] CPU: 68 UID: 0 PID: 465 Comm: kswapd0 Kdump: loaded Not 
> tainted
> 6.15.0-rc1-g521d54901f98 #1 VOLUNTARY
> [180060.602351] Hardware name: IBM,9080-HEX POWER10 (architected) 
> 0x800200 0xf06 of:IBM,FW1060.21
> (NH1060_078) hv:phyp pSeries
> [180060.602355] NIP:  c0175670 LR: c06d96b4 CTR: 
> 01fffc05
> [180060.602358] REGS: c000a5a56da0 TRAP: 0300   Not tainted  
> (6.15.0-rc1-g521d54901f98)
> [180060.602362] MSR:  82009033   CR: 
> 44042880  XER: 20040001
> [180060.602370] CFAR: c01756c8 DAR: c0080a1b DSISR: 
> 4000 IRQMASK: 0
<...>
>
>
> Crash Utility Output:
> # crash /home/kvmci/linux/vmlinux vmcore
> crash 8.0.6-4.fc42
>
>KERNEL: /home/kvmci/linux/vmlinux  [TAINTED]
>  DUMPFILE: vmcore  [PARTIAL DUMP]
>  CPUS: 80
>  DATE: Wed Dec 31 18:00:00 CST 1969
>UPTIME: 2 days, 02:01:00
> LOAD AVERAGE: 0.72, 0.66, 0.64
> TASKS: 1249
>  NODENAME: ***
>   RELEASE: 6.15.0-rc1-g521d54901f98
>   VERSION: #1 SMP Wed Apr  9 05:13:03 CDT 2025
>   MACHINE: ppc64le  (3450 Mhz)
>MEMORY: 64 GB
> PANIC: "Oops: Kernel access of bad area, sig: 11 [#1]" (check log 
> for details)
>   PID: 465
>   COMMAND: "kswapd0"
>  TASK: c6067d80  [THREAD_INFO: c6067d80]
>   CPU: 68
> STATE: TASK_RUNNING (PANIC)
>
> crash> bt
> PID: 465  TASK: c6067d80  CPU: 68   COMMAND: "kswapd0"
>   R0:  0e00R1:  c000a5a57040R2:  c17a8100
>   R3:  c00d34cefd00R4:  c0080a1affe8R5:  fffa
>   R6:  01ffR7:  03ff2cb33000R8:  8000
&g

Re: early soft lockup in 6.15-rc2 on PowerNV

2025-04-16 Thread IBM

Dan Horák  writes:

> Hi,
>
> after updating to Fedora built 6.15-rc2 kernel from 6.14 I am getting a
> soft lockup early in the boot and NVME related timeout/crash later
> (could it be related?). I am first checking if this is a known issue
> as I have not started bisecting yet.
>
> [2.866399] Memory: 63016960K/67108864K available (25152K kernel code, 
> 4416K rwdata, 24000K rodata, 9792K init, 1796K bss, 476160K reserved, 
> 3356672K cma-reserved)
> [2.874121] devtmpfs: initialized
> [   24.037685] watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [swapper/0:1]
> [   24.037690] CPU#0 Utilization every 4s during lockup:
> [   24.037692]#1: 101% system,  0% softirq, 0% hardirq,   
>   0% idle
> [   24.037697]#2: 100% system,  0% softirq, 0% hardirq,   
>   0% idle
> [   24.037701]#3: 100% system,  0% softirq, 0% hardirq,   
>   0% idle
> [   24.037704]#4: 101% system,  0% softirq, 0% hardirq,   
>   0% idle
> [   24.037707]#5: 100% system,  0% softirq, 0% hardirq,   
>   0% idle
> [   24.037711] Modules linked in:
> [   24.037716] CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 
> 6.15.0-0.rc2.22.fc43.ppc64le #1 VOLUNTARY 
> [   24.037722] Hardware name: T2P9D01 REV 1.00 POWER9 0x4e1202 
> opal:skiboot-bc106a0 PowerNV
> [   24.037725] NIP:  c308a72c LR: c308a7d0 CTR: 
> c18012c0
> [   24.037729] REGS: c00026637a50 TRAP: 0900   Not tainted  
> (6.15.0-0.rc2.22.fc43.ppc64le)
> [   24.037733] MSR:  92009033   CR: 
> 48000828  XER: 
> [   24.037750] CFAR:  IRQMASK: 0 
> [   24.037750] GPR00: c308a7d0 c00026637cf0 c25baa00 
> 0040 
> [   24.037750] GPR04: c0002007ff390b00 0001  
> c0002007ff3a0b00 
> [   24.037750] GPR08: 002007ff 0012d092  
>  
> [   24.037750] GPR12:  c3fb c0011320 
>  
> [   24.037750] GPR16:    
>  
> [   24.037750] GPR20:    
>  
> [   24.037750] GPR24:    
>  
> [   24.037750] GPR28:  c3f10be0 c19efaf8 
> 00037940 
> [   24.037806] NIP [c308a72c] memory_dev_init+0xb4/0x194
> [   24.037815] LR [c308a7d0] memory_dev_init+0x158/0x194
> [   24.037820] Call Trace:
> [   24.037822] [c00026637cf0] [c308a7d0] 
> memory_dev_init+0x158/0x194 (unreliable)
> [   24.037830] [c00026637d70] [c3089bd0] driver_init+0x74/0xa0
> [   24.037836] [c00026637d90] [c300f628] 
> kernel_init_freeable+0x204/0x288
> [   24.037843] [c00026637df0] [c0011344] kernel_init+0x2c/0x1b8
> [   24.037849] [c00026637e50] [c000debc] 
> ret_from_kernel_user_thread+0x14/0x1c
> [   24.037855] --- interrupt: 0 at 0x0
> [   24.037858] Code: 7c651b78 40820010 3fa20195 3bbd61e0 4880 3c62ff89 
> 389e00c8 3863e510 4bf7a625 6000 39290001 7c284840 <41800088> 792aaac2 
> 7c2a2840 4080ffec 
> [   48.045039] watchdog: BUG: soft lockup - CPU#0 stuck for 44s! [swapper/0:1]
> [   48.045043] CPU#0 Utilization every 4s during lockup:
> [   48.045045]#1: 101% system,  0% softirq, 0% hardirq,   
>   0% idle
> [   48.045049]#2: 100% system,  0% softirq, 0% hardirq,   
>   0% idle
> [   48.045053]#3: 100% system,  0% softirq, 0% hardirq,   
>   0% idle
> [   48.045056]#4: 101% system,  0% softirq, 0% hardirq,   
>   0% idle
> [   48.045059]#5: 100% system,  0% softirq, 0% hardirq,   
>   0% idle
> [   48.045063] Modules linked in:
> [   48.045067] CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Tainted: G L  
>--  ---  6.15.0-0.rc2.22.fc43.ppc64le #1 VOLUNTARY 
> [   48.045073] Tainted: [L]=SOFTLOCKUP
> [   48.045075] Hardware name: T2P9D01 REV 1.00 POWER9 0x4e1202 
> opal:skiboot-bc106a0 PowerNV
> [   48.045077] NIP:  c308a72c LR: c308a7d0 CTR: 
> c18012c0
> [   48.045081] REGS: c00026637a50 TRAP: 0900   Tainted: G L   
>   --  ---   (6.15.0-0.rc2.22.fc43.ppc64le)
> [   48.045085] MSR:  92009033   CR: 
> 48000828  XER: 
> [   48.045100] CFAR:  IRQMASK: 0 
> [   48.045100] GPR00: c308a7d0 c00026637cf0 c25baa00 
> 0040 
> [   48.045100] GPR04: c0002007ff390b00 0001  
> c0002007ff3a0b00 
> [   48.045100] GPR08: 002007ff 000a65fd  
>  
> [   48.045100] GPR12:  c3fb c0011320 
>  
> [   48.045100] GPR16:    
>  
> [   48.0

Re: [PATCH for-next 1/2] book3s64/radix : Handle error conditions properly in radix_vmemmap_populate

2025-05-05 Thread IBM

Donet Tom  writes:

> Error conditions are not handled properly if altmap is not present
> and PMD_SIZE vmemmap_alloc_block_buf fails.
>
> In this patch, if vmemmap_alloc_block_buf fails in the non-altmap
> case, we will fall back to the base mapping.
>

We are trying to create mappings for vmemmap area. In this, we first try
to allocate pmd entry using vmemmap_alloc_block_buf() of PMD_SIZE. If we
couldn't allocate, we should definitely fallback to base page mapping. 

Looks good to me. Feel free to add:
Reviewed-by: Ritesh Harjani (IBM) 


-ritesh

> Signed-off-by: Donet Tom 
> ---
>  arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
> b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 9f764bc42b8c..3d67aee8c8ca 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -1173,7 +1173,7 @@ int __meminit radix__vmemmap_populate(unsigned long 
> start, unsigned long end, in
>   vmemmap_set_pmd(pmd, p, node, addr, next);
>   pr_debug("PMD_SIZE vmemmap mapping\n");
>   continue;
> - } else if (altmap) {
> + } else {
>   /*
>* A vmemmap block allocation can fail due to
>* alignment requirements and we trying to align
> -- 
> 2.48.1

Re: [PATCH] powerpc/iommu: Memory leak in TCE table userspace view

2025-05-02 Thread IBM

Gaurav Batra  writes:

> When a device is opened by a userspace driver, via VFIO interface, DMA
> window is created. This DMA window has TCE Table and a corresponding
> data for userview of
> TCE table.
>
> When the userspace driver closes the device, all the above infrastructure
> is free'ed and the device control given back to kernel. Both DMA window
> and TCE table is getting free'ed. But due to a code bug, userview of the
> TCE table is not getting free'ed. This is resulting in a memory leak.
>
> Befow is the information from KMEMLEAK
>
> unreferenced object 0xc00822af (size 16777216):
>   comm "senlib_unit_tes", pid 9346, jiffies 4294983174
>   hex dump (first 32 bytes):
> 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
> 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
>   backtrace (crc 0):
> kmemleak_vmalloc+0xc8/0x1a0
> __vmalloc_node_range+0x284/0x340
> vzalloc+0x58/0x70
> spapr_tce_create_table+0x4b0/0x8d0
> tce_iommu_create_table+0xcc/0x170 [vfio_iommu_spapr_tce]
> tce_iommu_create_window+0x144/0x2f0 [vfio_iommu_spapr_tce]
> tce_iommu_ioctl.part.0+0x59c/0xc90 [vfio_iommu_spapr_tce]
> vfio_fops_unl_ioctl+0x88/0x280 [vfio]
> sys_ioctl+0xf4/0x160
> system_call_exception+0x164/0x310
> system_call_vectored_common+0xe8/0x278
> unreferenced object 0xc00823b0 (size 4194304):
>   comm "senlib_unit_tes", pid 9351, jiffies 4294984116
>   hex dump (first 32 bytes):
> 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
> 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
>   backtrace (crc 0):
> kmemleak_vmalloc+0xc8/0x1a0
> __vmalloc_node_range+0x284/0x340
> vzalloc+0x58/0x70
> spapr_tce_create_table+0x4b0/0x8d0
> tce_iommu_create_table+0xcc/0x170 [vfio_iommu_spapr_tce]
> tce_iommu_create_window+0x144/0x2f0 [vfio_iommu_spapr_tce]
> tce_iommu_create_default_window+0x88/0x120 [vfio_iommu_spapr_tce]
> tce_iommu_ioctl.part.0+0x57c/0xc90 [vfio_iommu_spapr_tce]
> vfio_fops_unl_ioctl+0x88/0x280 [vfio]
> sys_ioctl+0xf4/0x160
> system_call_exception+0x164/0x310
> system_call_vectored_common+0xe8/0x278
>
> Fixes: f431a8cde7f1 ("powerpc/iommu: Reimplement the iommu_table_group_ops 
> for pSeries")
> Signed-off-by: Gaurav Batra 
> ---
>  arch/powerpc/platforms/pseries/iommu.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/platforms/pseries/iommu.c 
> b/arch/powerpc/platforms/pseries/iommu.c
> index d6ebc19fb99c..eec333dd2e59 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -197,7 +197,7 @@ static void tce_iommu_userspace_view_free(struct 
> iommu_table *tbl)
>
>  static void tce_free_pSeries(struct iommu_table *tbl)
>  {
> - if (!tbl->it_userspace)
> + if (tbl->it_userspace)
>   tce_iommu_userspace_view_free(tbl);
>  }

Gr8 catch. That clearly looks like a miss in the original code.

vfree() can be called even directly and it says no operation is
performed if addr passed to vfree is NULL. However I don't really see
any value add in doing that except maybe we can kill tce_free_pSeries()
function. But vfree() still does few checks in there. So we may as well
check for a non-null address before calling vfree().

nitpick: I might have re-pharsed the commit msg as:
 powerpc/pseries/iommu: Fix kmemleak in TCE table userspace view

The patch looks good to me purely from the kmemleak bug perspective.
So feel free to take: 
Reviewed-by: Ritesh Harjani (IBM) 


-ritesh

Re: [PATCH v4] powerpc/hugetlb: Disable gigantic hugepages if fadump is active

2025-03-01 Thread IBM

Sourabh Jain  writes:

> The fadump kernel boots with limited memory solely to collect the kernel
> core dump. Having gigantic hugepages in the fadump kernel is of no use.

Sure got it.

> Many times, the fadump kernel encounters OOM (Out of Memory) issues if
> gigantic hugepages are allocated.
>
> To address this, disable gigantic hugepages if fadump is active by
> returning early from arch_hugetlb_valid_size() using
> hugepages_supported(). When fadump is active, the global variable
> hugetlb_disabled is set to true, which is later used by the
> PowerPC-specific hugepages_supported() function to determine hugepage
> support.
>
> Returning early from arch_hugetlb_vali_size() not only disables
> gigantic hugepages but also avoids unnecessary hstate initialization for
> every hugepage size supported by the platform.
>
> kernel logs related to hugepages with this patch included:
> kernel argument passed: hugepagesz=1G hugepages=1
>
> First kernel: gigantic hugepage got allocated
> ==
>
> dmesg | grep -i "hugetlb"
> -
> HugeTLB: registered 1.00 GiB page size, pre-allocated 1 pages
> HugeTLB: 0 KiB vmemmap can be freed for a 1.00 GiB page
> HugeTLB: registered 2.00 MiB page size, pre-allocated 0 pages
> HugeTLB: 0 KiB vmemmap can be freed for a 2.00 MiB page
>
> $ cat /proc/meminfo | grep -i "hugetlb"
> -
> Hugetlb: 1048576 kB

Was this tested with patch [1] in your local tree?

[1]: 
https://web.git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git/commit/?id=d629d7a8efc33

IIUC, this patch [1] disables the boot time allocation of hugepages.
Isn't it also disabling the boot time allocation for gigantic huge pages
passed by the cmdline params like hugepagesz=1G and hugepages=2 ?


> HugeTLB: registered 1.00 GiB page size, pre-allocated 1 pages
This print comes from report_hugepages(). The only place from where
report_hugepages() gets called is hugetlb_init(). hugetlb_init() is what
is responsible for hugepages & gigantic hugepage allocations of the
passed kernel cmdline params.

But hugetlb_init() already checks for hugepages_supported() in the very
beginning. So I am not sure whether we need this extra patch to disable
gigantic hugepages allocation by the kernel cmdline params like
hugepagesz=1G and hugepages=2 type of options.

Hence I was wondering if you had this patch [1] in your tree when you were
testing this?

But I may be missing something. Could you please help clarify on whether
we really need this patch to disable gigantic hugetlb page allocations?

>
> Fadump kernel: gigantic hugepage not allocated
> ===
>
> dmesg | grep -i "hugetlb"
> -
> [0.00] HugeTLB: unsupported hugepagesz=1G
> [0.00] HugeTLB: hugepages=1 does not follow a valid hugepagesz, 
> ignoring
> [0.706375] HugeTLB support is disabled!
> [0.773530] hugetlbfs: disabling because there are no supported hugepage 
> sizes
>
> $ cat /proc/meminfo | grep -i "hugetlb"
> --
> 
>
> Cc: Hari Bathini 
> Cc: Madhavan Srinivasan 
> Cc: Mahesh Salgaonkar 
> Cc: Michael Ellerman 
> Cc: Ritesh Harjani (IBM)" 

I guess the extra " in the above was not adding me in the cc list.
Hence I missed to see this patch early.

-ritesh


> Reviewed-by: Christophe Leroy 
> Signed-off-by: Sourabh Jain 
> ---
> Changelog:
>
> v1:
> https://lore.kernel.org/all/20250121150419.1342794-1-sourabhj...@linux.ibm.com/
>
> v2:
> https://lore.kernel.org/all/20250124103220.111303-1-sourabhj...@linux.ibm.com/
>  - disable gigantic hugepage in arch code, arch_hugetlb_valid_size()
>
> v3:
> https://lore.kernel.org/all/20250125104928.1-1-sourabhj...@linux.ibm.com/
>  - Do not modify the initialization of the shift variable
>
> v4:
> - Update commit message to include how hugepages_supported() detects
>   hugepages support when fadump is active
> - Add Reviewed-by tag
> - NO functional change
>
> ---
>  arch/powerpc/mm/hugetlbpage.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> index 6b043180220a..88cfd182db4e 100644
> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -138,6 +138,9 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
>   int shift = __ffs(size);
>   int mmu_psize;
>
> + if (!hugepages_supported())
> + return false;
> +
>   /* Check that it is a page size supported by the hardware and
>* that it fits within pagetable and slice limits. */
>   if (size <= PAGE_SIZE || !is_power_of_2(size))
> --
> 2.48.1

Re: [PATCH v4] powerpc/hugetlb: Disable gigantic hugepages if fadump is active

2025-03-03 Thread IBM

Sourabh Jain  writes:

> Hello Ritesh,
>
> Thanks for the review.
>
> On 02/03/25 12:05, Ritesh Harjani (IBM) wrote:
>> Sourabh Jain  writes:
>>
>>> The fadump kernel boots with limited memory solely to collect the kernel
>>> core dump. Having gigantic hugepages in the fadump kernel is of no use.
>> Sure got it.
>>
>>> Many times, the fadump kernel encounters OOM (Out of Memory) issues if
>>> gigantic hugepages are allocated.
>>>
>>> To address this, disable gigantic hugepages if fadump is active by
>>> returning early from arch_hugetlb_valid_size() using
>>> hugepages_supported(). When fadump is active, the global variable
>>> hugetlb_disabled is set to true, which is later used by the
>>> PowerPC-specific hugepages_supported() function to determine hugepage
>>> support.
>>>
>>> Returning early from arch_hugetlb_vali_size() not only disables
>>> gigantic hugepages but also avoids unnecessary hstate initialization for
>>> every hugepage size supported by the platform.
>>>
>>> kernel logs related to hugepages with this patch included:
>>> kernel argument passed: hugepagesz=1G hugepages=1
>>>
>>> First kernel: gigantic hugepage got allocated
>>> ==
>>>
>>> dmesg | grep -i "hugetlb"
>>> -
>>> HugeTLB: registered 1.00 GiB page size, pre-allocated 1 pages
>>> HugeTLB: 0 KiB vmemmap can be freed for a 1.00 GiB page
>>> HugeTLB: registered 2.00 MiB page size, pre-allocated 0 pages
>>> HugeTLB: 0 KiB vmemmap can be freed for a 2.00 MiB page
>>>
>>> $ cat /proc/meminfo | grep -i "hugetlb"
>>> -
>>> Hugetlb: 1048576 kB
>> Was this tested with patch [1] in your local tree?
>>
>> [1]: 
>> https://web.git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git/commit/?id=d629d7a8efc33
>>
>> IIUC, this patch [1] disables the boot time allocation of hugepages.
>> Isn't it also disabling the boot time allocation for gigantic huge pages
>> passed by the cmdline params like hugepagesz=1G and hugepages=2 ?
>
> Yes, I had the patch [1] in my tree.
>
> My understanding is that gigantic pages are allocated before normal huge 
> pages.
>
> In hugepages_setup() in hugetlb.c, we have:
>
>      if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
>      hugetlb_hstate_alloc_pages(parsed_hstate);
>
> I believe the above code allocates memory for gigantic pages, and 
> hugetlb_init() is
> called later because it is a subsys_initcall.
>
> So, by the time the kernel reaches hugetlb_init(), the gigantic pages 
> are already
> allocated. Isn't that right?
>
> Please let me know your opinion.

Yes, you are right. We are allocating hugepages from memblock, however
this isn't getting advertized anywhere. i.e. there is no way one can
know from any user interface on whether hugepages were allocated or not.
i.e. for fadump kernel when hugepagesz= and hugepages= params are
passed, though it will allocate gigantic pages, it won't advertize this
in meminfo or anywhere else. This was adding the confusion when I tested
this (which wasn't clear from the commit msg either).

And I guess this is happening during fadump kernel because of our patch
[1], which added a check to see whether hugetlb_disabled is true in
hugepages_supported(). Due to this hugetlb_init() is now not doing the
rest of the initialization for those gigantic pages which were allocated
due to cmdline options from hugepages_setup().

[1]: 
https://lore.kernel.org/linuxppc-dev/20241202054310.928610-1-sourabhj...@linux.ibm.com/

Now as we know from below that fadump can set hugetlb_disabled call in 
early_setup().
i.e. fadump can mark hugetlb_disabled to true in 
early_setup() -> early_init_devtree() -> fadump_reserve_mem()

And hugepages_setup() and hugepagesz_setup() gets called late in
start_kernel() -> parse_args() 

And we already check for hugepages_supported() in all necessary calls in
mm/hugetlb.c. So IMO, this check should go in mm/hugetlb.c in
hugepagesz_setup() and hugepages_setup(). Because otherwise every arch
implementation will end up duplicating this by adding
hugepages_supported() check in their arch implementation of
arch_hugetlb_valid_size().

e.g. references of hugepages_supported() checks in mm/hugetlb.c

mm/hugetlb.c hugetlb_show_meminfo_node 4959 if (!hugepages_supported())
mm/hugetlb.c hugetlb_report_node_meminfo 4943 if (!hugepages_supported())  
mm/hugetlb.c hugetlb_report_meminfo 4914 if (!hugepages_supported())   
mm/hugetlb.c hugetlb_overco

Re: [PATCH v3 1/2] book3s64/radix: Fix compile errors when CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP=n

2025-03-10 Thread IBM

Christophe Leroy  writes:

> Le 10/03/2025 à 13:44, Donet Tom a écrit :
>> From: "Ritesh Harjani (IBM)" 
>> 
>> Fix compile errors when CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP=n
>
> I don't understand your patch.
>
> As far as I can see, CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP is selected 
> when CONFIG_PPC_RADIX_MMU is selected, and radix_pgtable.o is built only 
> when CONFIG_PPC_RADIX_MMU is selected. So when radix_pgtable.o is built 
> CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP will always be selected.
>
> Can you clarify what the problem is ?
>

You are right CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP always gets enabled
by default for radix mmu. However, when we forcefully wanted to test the
!vmemmap_can_optimize() path in radix, we forcefully removed the support
of ARCH_WANT_OPTIMIZE_DAX_VMEMMAP from arch/powerpc/Kconfig (by making
the kernel change). That is when we were facing the compilation errors
due to duplicate definition of vmemmap_can_optimize(). 

The other one is defined in include/linux/mm.h under #ifdef
CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP and #else.

So it is only a good to have patch.

-ritesh

> Christophe
>
>> 
>> Signed-off-by: Ritesh Harjani (IBM) 
>> Signed-off-by: Donet Tom 
>> ---
>>   arch/powerpc/mm/book3s64/radix_pgtable.c | 3 ++-
>>   1 file changed, 2 insertions(+), 1 deletion(-)
>> 
>> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
>> b/arch/powerpc/mm/book3s64/radix_pgtable.c
>> index 311e2112d782..bd6916419472 100644
>> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
>> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
>> @@ -976,7 +976,7 @@ int __meminit radix__vmemmap_create_mapping(unsigned 
>> long start,
>>  return 0;
>>   }
>>   
>> -
>> +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
>>   bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap 
>> *pgmap)
>>   {
>>  if (radix_enabled())
>> @@ -984,6 +984,7 @@ bool vmemmap_can_optimize(struct vmem_altmap *altmap, 
>> struct dev_pagemap *pgmap)
>>   
>>  return false;
>>   }
>> +#endif
>>   
>>   int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
>>  unsigned long addr, unsigned long next)

Re: vmalloc_node_range for size 4198400 failed: Address range restricted to 0xf1000000 - 0xf5110000 (kernel 6.14-rc4, ppc32)

2025-02-26 Thread IBM

Erhard Furtner  writes:

> Greetings!
>
> At boot with a KASAN-enabled v6.14-rc4 kernel on my PowerMac G4 DP I get:
>
> [...]
> vmalloc_node_range for size 4198400 failed: Address range restricted to 
> 0xf100 - 0xf511
> swapon: vmalloc error: size 4194304, vm_struct allocation failed, 
> mode:0xdc0(GFP_KERNEL|__GFP_ZERO), 
> nodemask=(null),cpuset=openrc.swap,mems_allowed=0

Did we exhaust the vmalloc area completely?


> CPU: 0 UID: 0 PID: 870 Comm: swapon Tainted: GW  
> 6.14.0-rc4-PMacG4 #6
> Tainted: [W]=WARN
> Hardware name: PowerMac3,6 7455 0x80010303 PowerMac
> Call Trace:
> [f2ffb9d0] [c14cfd88] dump_stack_lvl+0x70/0x8c (unreliable)
> [f2ffb9f0] [c04fb9b8] warn_alloc+0x154/0x2b8
> [f2ffbab0] [c04de94c] __vmalloc_node_range_noprof+0x154/0x958
> [f2ffbb80] [c04df23c] __vmalloc_node_noprof+0xec/0xf4
> [f2ffbbc0] [c0558524] swap_cgroup_swapon+0x70/0x198
> [f2ffbbf0] [c051e8d8] sys_swapon+0x1838/0x3624
> [f2ffbce0] [c001e574] system_call_exception+0x2dc/0x420

Since only the swapon failed, I think you might still have the console
up right? So this is mostly a vmalloc allocation failure report?


> [f2ffbf30] [c00291ac] ret_from_syscall+0x0/0x2c
> --- interrupt: c00 at 0x2612ec
> NIP:  002612ec LR: 00534108 CTR: 001e8310
> REGS: f2ffbf40 TRAP: 0c00   Tainted: GW   (6.14.0-rc4-PMacG4)
> MSR:  d032   CR: 24002444  XER: 
>
> GPR00: 0057 afe3ef20 a7a95540 01b2bdd0  24002444 fe5ff7e1 
> 00247c24 
> GPR08: d032 fa89 01b2d568 001e8310 24002448 0054fe14 02921154 
>  
> GPR16:  00534b50 afe3f0ac afe3f0b0   0055001c 
> afe3f0d0 
> GPR24: afe3f0b0 0003  1000 01b2bdd0 0002 005579ec 
> 01b2d570 
> NIP [002612ec] 0x2612ec
> LR [00534108] 0x534108
> --- interrupt: c00
> Mem-Info:
> active_anon:1989 inactive_anon:0 isolated_anon:0
>  active_file:6407 inactive_file:5879 isolated_file:0
>  unevictable:0 dirty:0 writeback:0
>  slab_reclaimable:1538 slab_unreclaimable:22927
>  mapped:2753 shmem:107 pagetables:182
>  sec_pagetables:0 bounce:0
>  kernel_misc_reclaimable:0
>  free:433110 free_pcp:472 free_cma:0
> Node 0 active_anon:7972kB inactive_anon:0kB active_file:25652kB 
> inactive_file:23496kB unevictable:0kB isolated(anon):0kB isolated(file):0kB 
> mapped:10908kB dirty:0kB writeback:0kB shmem:464kB writeback_tmp:0kB 
> kernel_stack:1568kB pagetables:724kB sec_pagetables:0kB all_unreclaimable? no
> DMA free:591772kB boost:0kB min:3380kB low:4224kB high:5068kB 
> reserved_highatomic:0KB active_anon:0kB inactive_anon:0kB active_file:4kB 
> inactive_file:11056kB unevictable:0kB writepending:0kB present:786432kB 
> managed:716492kB mlocked:0kB bounce:0kB free_pcp:1680kB local_pcp:1180kB 
> free_cma:0kB
> lowmem_reserve[]: 0 0 1184 0
> DMA: 127*4kB (UE) 66*8kB (UME) 37*16kB (UE) 78*32kB (UME) 10*64kB (UE) 
> 4*128kB (UME) 3*256kB (UM) 6*512kB (UM) 5*1024kB (ME) 4*2048kB (M) 139*4096kB 
> (M) = 591772kB
> 12404 total pagecache pages
> 0 pages in swap cache
> Free swap  = 0kB
> Total swap = 0kB
> 524288 pages RAM
> 327680 pages HighMem/MovableOnly
> 42061 pages reserved

Though above are mainly the physical mem info printed, but vmalloc can
also fail sometimes (e.g. this report), it is nice if we can print how
much of vmalloc space is free out of vmalloc total in show_mem() here.

Maybe linux-mm can tell if we should add this diff change for future?

diff --git a/mm/show_mem.c b/mm/show_mem.c
index 43afb56abbd3..b3af59fced02 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 

 #include "internal.h"
 #include "swap.h"
@@ -416,6 +417,8 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, 
int max_zone_idx)
printk("%lu pages RAM\n", total);
printk("%lu pages HighMem/MovableOnly\n", highmem);
printk("%lu pages reserved\n", reserved);
+   printk("%lu pages Vmalloc Total\n", (unsigned long)VMALLOC_TOTAL >> 
PAGE_SHIFT);
+   printk("%lu pages Vmalloc Used\n", vmalloc_nr_pages());
 #ifdef CONFIG_CMA
printk("%lu pages cma reserved\n", totalcma_pages);
 #endif


But meanwhile below data can give more details about the vmalloc area.

1. cat /proc/vmallocinfo   
2. cat /proc/meminfo   


-ritesh

> Memory allocations:
> 85.3 MiB 6104 mm/slub.c:2423 func:alloc_slab_page
> 38.5 MiB 9862 mm/readahead.c:187 func:ractl_alloc_folio
> 9.47 MiB 2425 mm/filemap.c:1970 func:__filemap_get_folio
> 7.96 MiB 2037 mm/kasan/shadow.c:304 func:kasan_populate_vmalloc_pte
> 7.87 MiB 2125 mm/execmem.c:44 func:execmem_vmalloc
> 5.01 MiB 1283 mm/memory.c:1063 func:folio_prealloc
> 4.00 MiB1 fs/btrfs/zstd.c:366 [btrfs] func:zstd_alloc_workspace
> 3.86 MiB  247 lib/stackdepot.c:627 func:stack_depot_save_flags
> 3.62 MiB  412 mm/slub.c:2425 func:alloc_slab_page
> 3.09 MiB18430 fs/kernfs/dir.c:624 func:__kernfs_new_node
> couldn't allocate enough memo

Re: [PATCH v4] powerpc/hugetlb: Disable gigantic hugepages if fadump is active

2025-03-05 Thread IBM

Sourabh Jain  writes:

> Hello Ritesh,
>
>
> On 04/03/25 10:27, Ritesh Harjani (IBM) wrote:
>> Sourabh Jain  writes:
>>
>>> Hello Ritesh,
>>>
>>> Thanks for the review.
>>>
>>> On 02/03/25 12:05, Ritesh Harjani (IBM) wrote:
>>>> Sourabh Jain  writes:
>>>>
>>>>> The fadump kernel boots with limited memory solely to collect the kernel
>>>>> core dump. Having gigantic hugepages in the fadump kernel is of no use.
>>>> Sure got it.
>>>>
>>>>> Many times, the fadump kernel encounters OOM (Out of Memory) issues if
>>>>> gigantic hugepages are allocated.
>>>>>
>>>>> To address this, disable gigantic hugepages if fadump is active by
>>>>> returning early from arch_hugetlb_valid_size() using
>>>>> hugepages_supported(). When fadump is active, the global variable
>>>>> hugetlb_disabled is set to true, which is later used by the
>>>>> PowerPC-specific hugepages_supported() function to determine hugepage
>>>>> support.
>>>>>
>>>>> Returning early from arch_hugetlb_vali_size() not only disables
>>>>> gigantic hugepages but also avoids unnecessary hstate initialization for
>>>>> every hugepage size supported by the platform.
>>>>>
>>>>> kernel logs related to hugepages with this patch included:
>>>>> kernel argument passed: hugepagesz=1G hugepages=1
>>>>>
>>>>> First kernel: gigantic hugepage got allocated
>>>>> ==
>>>>>
>>>>> dmesg | grep -i "hugetlb"
>>>>> -
>>>>> HugeTLB: registered 1.00 GiB page size, pre-allocated 1 pages
>>>>> HugeTLB: 0 KiB vmemmap can be freed for a 1.00 GiB page
>>>>> HugeTLB: registered 2.00 MiB page size, pre-allocated 0 pages
>>>>> HugeTLB: 0 KiB vmemmap can be freed for a 2.00 MiB page
>>>>>
>>>>> $ cat /proc/meminfo | grep -i "hugetlb"
>>>>> -
>>>>> Hugetlb: 1048576 kB
>>>> Was this tested with patch [1] in your local tree?
>>>>
>>>> [1]: 
>>>> https://web.git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git/commit/?id=d629d7a8efc33
>>>>
>>>> IIUC, this patch [1] disables the boot time allocation of hugepages.
>>>> Isn't it also disabling the boot time allocation for gigantic huge pages
>>>> passed by the cmdline params like hugepagesz=1G and hugepages=2 ?
>>> Yes, I had the patch [1] in my tree.
>>>
>>> My understanding is that gigantic pages are allocated before normal huge
>>> pages.
>>>
>>> In hugepages_setup() in hugetlb.c, we have:
>>>
>>>       if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
>>>       hugetlb_hstate_alloc_pages(parsed_hstate);
>>>
>>> I believe the above code allocates memory for gigantic pages, and
>>> hugetlb_init() is
>>> called later because it is a subsys_initcall.
>>>
>>> So, by the time the kernel reaches hugetlb_init(), the gigantic pages
>>> are already
>>> allocated. Isn't that right?
>>>
>>> Please let me know your opinion.
>> Yes, you are right. We are allocating hugepages from memblock, however
>> this isn't getting advertized anywhere. i.e. there is no way one can
>> know from any user interface on whether hugepages were allocated or not.
>> i.e. for fadump kernel when hugepagesz= and hugepages= params are
>> passed, though it will allocate gigantic pages, it won't advertize this
>> in meminfo or anywhere else. This was adding the confusion when I tested
>> this (which wasn't clear from the commit msg either).
>>
>> And I guess this is happening during fadump kernel because of our patch
>> [1], which added a check to see whether hugetlb_disabled is true in
>> hugepages_supported(). Due to this hugetlb_init() is now not doing the
>> rest of the initialization for those gigantic pages which were allocated
>> due to cmdline options from hugepages_setup().
>>
>> [1]: 
>> https://lore.kernel.org/linuxppc-dev/20241202054310.928610-1-sourabhj...@linux.ibm.com/
>>
>> Now as we know from below that fadump can set hugetlb_disabled call in 
>> early_setup().
>> i.e. fadump can mark hugetlb_disabled to

Re: [PATCH] KVM: PPC: Book3S HV: Add H_VIRT mapping for tracing exits

2025-05-17 Thread IBM

Gautam Menghani  writes:

> The macro kvm_trace_symbol_exit is used for providing the mappings
> for the trap vectors and their names. Add mapping for H_VIRT so that
> trap reason is displayed as string instead of a vector number when using
> the kvm_guest_exit tracepoint.
>

trace_kvm_guest_exit(vcpu) gets called on kvm exit and vcpu->arch.trap
carries the trap value whose values are defined in
arch/powerpc/include/asm/kvm_asm.h

i.e.
#define BOOK3S_INTERRUPT_H_VIRT 0xea0

kvm_trace_symbol_exit provides these mappings for book3s HV & PR.
The change looks good to me. Please feel free to add:

Reviewed-by: Ritesh Harjani (IBM) 

> Signed-off-by: Gautam Menghani 
> ---
>  arch/powerpc/kvm/trace_book3s.h | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/arch/powerpc/kvm/trace_book3s.h b/arch/powerpc/kvm/trace_book3s.h
> index 372a82fa2de3..9260ddbd557f 100644
> --- a/arch/powerpc/kvm/trace_book3s.h
> +++ b/arch/powerpc/kvm/trace_book3s.h
> @@ -25,6 +25,7 @@
>   {0xe00, "H_DATA_STORAGE"}, \
>   {0xe20, "H_INST_STORAGE"}, \
>   {0xe40, "H_EMUL_ASSIST"}, \
> + {0xea0, "H_VIRT"}, \
>   {0xf00, "PERFMON"}, \
>   {0xf20, "ALTIVEC"}, \
>   {0xf40, "VSX"}
> --
> 2.49.0

[PATCH] powerpc: declare unmodified attribute_group usages const

2022-02-28 Thread IBM IMAP

>From ec1a16a15a86c6224cc0129ab3c2ae9f69f2c7c5 Mon Sep 17 00:00:00 2001
From: Rohan McLure 
Date: Mon, 28 Feb 2022 10:19:19 +1100
Subject: [PATCH] powerpc: declare unmodified attribute_group usages
const
To: linuxppc-dev@lists.ozlabs.org

Inspired by (bd75b4ef4977: Constify static attribute_group structs),
accepted by linux-next, reported:
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20220210202805.7750-4-rikard.falkeb...@gmail.com/

Nearly all singletons of type struct attribute_group are never
modified, and so
are candidates for being const. Declare them as const.

Signed-off-by: Rohan McLure 
---
 arch/powerpc/perf/generic-compat-pmu.c  | 4 ++--
 arch/powerpc/perf/hv-24x7.c | 6 +++---
 arch/powerpc/perf/hv-gpci.c | 8 
 arch/powerpc/perf/imc-pmu.c | 6 +++---
 arch/powerpc/perf/isa207-common.c   | 2 +-
 arch/powerpc/perf/power10-pmu.c | 6 +++---
 arch/powerpc/perf/power7-pmu.c  | 4 ++--
 arch/powerpc/perf/power8-pmu.c  | 4 ++--
 arch/powerpc/perf/power9-pmu.c  | 6 +++---
 arch/powerpc/platforms/cell/cbe_thermal.c   | 4 ++--
 arch/powerpc/platforms/powernv/opal-core.c  | 2 +-
 arch/powerpc/platforms/powernv/opal-dump.c  | 2 +-
 arch/powerpc/platforms/powernv/opal-flash.c | 2 +-
 arch/powerpc/platforms/pseries/papr_scm.c   | 2 +-
 arch/powerpc/platforms/pseries/power.c  | 2 +-
 15 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/perf/generic-compat-pmu.c
b/arch/powerpc/perf/generic-compat-pmu.c
index b6e25f75109d..f3db88aee4dd 100644
--- a/arch/powerpc/perf/generic-compat-pmu.c
+++ b/arch/powerpc/perf/generic-compat-pmu.c
@@ -130,7 +130,7 @@ static struct attribute
*generic_compat_events_attr[] = {
NULL
 };
 
-static struct attribute_group generic_compat_pmu_events_group = {
+static const struct attribute_group generic_compat_pmu_events_group =
{
.name = "events",
.attrs = generic_compat_events_attr,
 };
@@ -146,7 +146,7 @@ static struct attribute
*generic_compat_pmu_format_attr[] = {
NULL,
 };
 
-static struct attribute_group generic_compat_pmu_format_group = {
+static const struct attribute_group generic_compat_pmu_format_group =
{
.name = "format",
.attrs = generic_compat_pmu_format_attr,
 };
diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 1e8aa934e37e..12c1777187fc 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -204,7 +204,7 @@ static struct attribute *format_attrs[] = {
NULL,
 };
 
-static struct attribute_group format_group = {
+static const struct attribute_group format_group = {
.name = "format",
.attrs = format_attrs,
 };
@@ -1148,7 +1148,7 @@ static struct attribute *cpumask_attrs[] = {
NULL,
 };
 
-static struct attribute_group cpumask_attr_group = {
+static const struct attribute_group cpumask_attr_group = {
.attrs = cpumask_attrs,
 };
 
@@ -1162,7 +1162,7 @@ static struct attribute *if_attrs[] = {
NULL,
 };
 
-static struct attribute_group if_group = {
+static const struct attribute_group if_group = {
.name = "interface",
.bin_attrs = if_bin_attrs,
.attrs = if_attrs,
diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c
index c756228a081f..5eb60ed5b5e8 100644
--- a/arch/powerpc/perf/hv-gpci.c
+++ b/arch/powerpc/perf/hv-gpci.c
@@ -65,12 +65,12 @@ static struct attribute *format_attrs[] = {
NULL,
 };
 
-static struct attribute_group format_group = {
+static const struct attribute_group format_group = {
.name = "format",
.attrs = format_attrs,
 };
 
-static struct attribute_group event_group = {
+static const struct attribute_group event_group = {
.name  = "events",
.attrs = hv_gpci_event_attrs,
 };
@@ -126,11 +126,11 @@ static struct attribute *cpumask_attrs[] = {
NULL,
 };
 
-static struct attribute_group cpumask_attr_group = {
+static const struct attribute_group cpumask_attr_group = {
.attrs = cpumask_attrs,
 };
 
-static struct attribute_group interface_group = {
+static const struct attribute_group interface_group = {
.name = "interface",
.attrs = interface_attrs,
 };
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index e106909ff9c3..70981a321036 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -71,7 +71,7 @@ static struct attribute *imc_format_attrs[] = {
NULL,
 };
 
-static struct attribute_group imc_format_group = {
+static const struct attribute_group imc_format_group = {
.name = "format",
.attrs = imc_format_attrs,
 };
@@ -90,7 +90,7 @@ static struct attribute *trace_imc_format_attrs[] = {
NULL,
 };
 
-static struct attribute_group trace_imc_format_group = {
+static const struct attribute_group trace_imc_format_group = {
 .name = "format",
 .attrs = trace_imc_format_attrs,
 };
@@ -125

[PATCH] powerpc/ptdump: Fix walk_vmemmap to also print first vmemmap entry

2024-04-17 Thread Ritesh Harjani (IBM)

walk_vmemmap() was skipping the first vmemmap entry pointed by
vmemmap_list pointer itself. This patch fixes that.

With this we should see the vmemmap entry at 0xc00c for hash
which wasn't getting printed on doing

"cat /sys/kernel/debug/kernel_hash_pagetable"

Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/mm/ptdump/hashpagetable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c 
b/arch/powerpc/mm/ptdump/hashpagetable.c
index 9a601587836b..a6baa6166d94 100644
--- a/arch/powerpc/mm/ptdump/hashpagetable.c
+++ b/arch/powerpc/mm/ptdump/hashpagetable.c
@@ -491,7 +491,7 @@ static void walk_vmemmap(struct pg_state *st)
 * Traverse the vmemmaped memory and dump pages that are in the hash
 * pagetable.
 */
-   while (ptr->list) {
+   while (ptr) {
hpte_find(st, ptr->virt_addr, mmu_vmemmap_psize);
ptr = ptr->list;
}
--
2.44.0

Re: [PATCH 2/4] fs: define a firmware security filesystem named fwsecurityfs

2022-11-19 Thread Ritesh Harjani (IBM)

Hello Nayna, 

On 22/11/09 03:10PM, Nayna wrote:
> 
> On 11/9/22 08:46, Greg Kroah-Hartman wrote:
> > On Sun, Nov 06, 2022 at 04:07:42PM -0500, Nayna Jain wrote:
> > > securityfs is meant for Linux security subsystems to expose policies/logs
> > > or any other information. However, there are various firmware security
> > > features which expose their variables for user management via the kernel.
> > > There is currently no single place to expose these variables. Different
> > > platforms use sysfs/platform specific filesystem(efivarfs)/securityfs
> > > interface as they find it appropriate. Thus, there is a gap in kernel
> > > interfaces to expose variables for security features.
> > > 
> > > Define a firmware security filesystem (fwsecurityfs) to be used by
> > > security features enabled by the firmware. These variables are platform
> > > specific. This filesystem provides platforms a way to implement their
> > >   own underlying semantics by defining own inode and file operations.
> > > 
> > > Similar to securityfs, the firmware security filesystem is recommended
> > > to be exposed on a well known mount point /sys/firmware/security.
> > > Platforms can define their own directory or file structure under this 
> > > path.
> > > 
> > > Example:
> > > 
> > > # mount -t fwsecurityfs fwsecurityfs /sys/firmware/security
> > Why not juset use securityfs in /sys/security/firmware/ instead?  Then
> > you don't have to create a new filesystem and convince userspace to
> > mount it in a specific location?

I am also curious to know on why not use securityfs, given the similarity
between the two. :)
More specifics on that below...

> 
> From man 5 sysfs page:
> 
> /sys/firmware: This subdirectory contains interfaces for viewing and
> manipulating firmware-specific objects and attributes.
> 
> /sys/kernel: This subdirectory contains various files and subdirectories
> that provide information about the running kernel.
> 
> The security variables which are being exposed via fwsecurityfs are managed
> by firmware, stored in firmware managed space and also often consumed by
> firmware for enabling various security features.

That's ok. As I see it users of securityfs can define their own fileops
(like how you are doing in fwsecurityfs).
See securityfs_create_file() & securityfs_create_symlink(), can accept the fops
& iops. Except maybe securityfs_create_dir(), that could be since there might
not be a usecase for it. But do you also need it in your case is the question to
ask.

> 
> From git commit b67dbf9d4c1987c370fd18fdc4cf9d8aaea604c2, the purpose of
> securityfs(/sys/kernel/security) is to provide a common place for all kernel
> LSMs. The idea of

Which was then seperated out by commit,
da31894ed7b654e2 ("securityfs: do not depend on CONFIG_SECURITY").

securityfs now has a seperate CONFIG_SECURITYFS config option. In fact I was 
even
thinking of why shouldn't we move security/inode.c into fs/securityfs/inode.c .
fs/* is a common place for all filesystems. Users of securityfs can call it's 
exported kernel APIs to create files/dirs/symlinks.

If we move security/inode.c to fs/security/inode.c, then...
...below call within securityfs_init() should be moved into some lsm sepecific 
file.

#ifdef CONFIG_SECURITY
static struct dentry *lsm_dentry;
static ssize_t lsm_read(struct file *filp, char __user *buf, size_t count,
loff_t *ppos)
{
return simple_read_from_buffer(buf, count, ppos, lsm_names,
strlen(lsm_names));
}

static const struct file_operations lsm_ops = {
.read = lsm_read,
.llseek = generic_file_llseek,
};
#endif

securityfs_init()

#ifdef CONFIG_SECURITY
lsm_dentry = securityfs_create_file("lsm", 0444, NULL, NULL,
&lsm_ops);
#endif

So why not move it? Maybe others, can comment more on whether it's a good idea 
to move security/inode.c into fs/security/inode.c? 
This should then help others identify securityfs filesystem in fs/security/ 
for everyone to notice and utilize for their use?

> fwsecurityfs(/sys/firmware/security) is to similarly provide a common place
> for all firmware security objects.
> 
> /sys/firmware already exists. The patch now defines a new /security
> directory in it for firmware security features. Using /sys/kernel/security
> would mean scattering firmware objects in multiple places and confusing the
> purpose of /sys/kernel and /sys/firmware.

We can also think of it this way that, all security related exports should 
happen via /sys/kernel/security/. Then /sys/kernel/security/firmware/ becomes 
the security related firmware exports.

If you see find /sys -iname firmware, I am sure you will find other firmware
specifics directories related to other specific subsystems
(e.g. 
root@qemu:/home/qemu# find /sys -iname firmware
/sys/devices/ndbus0/nmem0/firmware
/sys/devices/ndbus0/firmware
/sys/firmware
)

But it could be, I am not an expert here, although I was thinking a

[RFC v1 00/10] book3s64/hash: Improve kfence support

2024-07-31 Thread Ritesh Harjani (IBM)

Kfence on book3s64 Hash is broken. Kfence depends upon debug_pagealloc
infrastructure on Hash. debug_pagealloc allocates a linear map based on the size
of the DRAM i.e. 1 byte for every 64k page. That means for a 16TB DRAM, it will
need 256MB memory for linear map. Memory for linear map on pseries comes from
RMA region which has size limitation. On P8 RMA is 512MB, in which we also
fit crash kernel at 256MB, paca allocations and emergency stacks.
That means there is not enough memory in the RMA region for the linear map
based on DRAM size (required by debug_pagealloc).

Now kfence only requires memory for it's kfence objects. kfence by default
requires only (255 + 1) * 2 i.e. 32 MB for 64k pagesize.

This patch series removes the direct dependency of kfence on debug_pagealloc
infrastructure. We separate the Hash kernel linear map functions to take
linear map array as a parameter so that it can support debug_pagealloc and
kfence individually. That means we don't need to keep the linear map region of
size DRAM_SIZE >> PAGE_SHIFT anymore for kfence.

Hence, the current patch series solves the boot failure problem when kfence is
enabled by optimizing the memory it requires for linear map within RMA region.

On radix we don't have this problem because no SLB and no RMA region size
limitation.

Testing:

The patch series is still undergoing some testing. However, given that it's in
good shape, I wanted to send out for review.
Note: It passes kfence kunit tests.
  
  [   48.715649][T1] # kfence: pass:23 fail:0 skip:2 total:25
  [   48.716697][T1] # Totals: pass:23 fail:0 skip:2 total:25
  [   48.717842][T1] ok 1 kfence


TODOs: (for future patches)
===
However, there is still another problem which IMO makes kfence not suitable to
be enabled by default on production kernels with Hash MMU i.e.
When kfence is enabled the kernel linear map uses PAGE_SIZE mapping rather than
16MB mapping as in the original case. Correct me if I am wrong, but 
theoretically
at least this could cause TLB pressure in certain cases, which makes it not
really suitable to be enabled by default on production kernels on Hash.

This is because on P8 book3s64, we don't support mapping multiple pagesizes
(MPSS) within the kernel linear map segment. Is this understanding correct?


Ritesh Harjani (IBM) (10):
  book3s64/hash: Remove kfence support temporarily
  book3s64/hash: Refactor kernel linear map related calls
  book3s64/hash: Add hash_debug_pagealloc_add_slot() function
  book3s64/hash: Add hash_debug_pagealloc_alloc_slots() function
  book3s64/hash: Refactor hash__kernel_map_pages() function
  book3s64/hash: Make kernel_map_linear_page() generic
  book3s64/hash: Disable debug_pagealloc if it requires more memory
  book3s64/hash: Add kfence functionality
  book3s64/radix: Refactoring common kfence related functions
  book3s64/hash: Disable kfence if not early init

 arch/powerpc/include/asm/kfence.h|   2 +
 arch/powerpc/mm/book3s64/hash_utils.c| 364 +--
 arch/powerpc/mm/book3s64/radix_pgtable.c |  12 -
 arch/powerpc/mm/init-common.c|  12 +
 4 files changed, 286 insertions(+), 104 deletions(-)

--
2.45.2

[RFC v1 01/10] book3s64/hash: Remove kfence support temporarily

2024-07-31 Thread Ritesh Harjani (IBM)

Kfence on book3s Hash on pseries is anyways broken. It fails to boot
due to RMA size limitation. That is because, kfence with Hash uses
debug_pagealloc infrastructure. debug_pagealloc allocates linear map
for entire dram size instead of just kfence relevant objects.
This means for 16TB of DRAM it will require (16TB >> PAGE_SHIFT)
which is 256MB which is half of RMA region on P8.
crash kernel reserves 256MB and we also need 2048 * 16KB * 3 for
emergency stack and some more for paca allocations.
That means there is not enough memory for reserving the full linear map
in the RMA region, if the DRAM size is too big (>=16TB)
(The issue is seen above 8TB with crash kernel 256 MB reservation).

Now Kfence does not require linear memory map for entire DRAM.
It only needs for kfence objects. So this patch temporarily removes the
kfence functionality since debug_pagealloc code needs some refactoring.
We will bring in kfence on Hash support in later patches.

Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/include/asm/kfence.h |  5 +
 arch/powerpc/mm/book3s64/hash_utils.c | 16 +++-
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/kfence.h 
b/arch/powerpc/include/asm/kfence.h
index fab124ada1c7..f3a9476a71b3 100644
--- a/arch/powerpc/include/asm/kfence.h
+++ b/arch/powerpc/include/asm/kfence.h
@@ -10,6 +10,7 @@
 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_PPC64_ELF_ABI_V1
 #define ARCH_FUNC_PREFIX "."
@@ -25,6 +26,10 @@ static inline void disable_kfence(void)
 
 static inline bool arch_kfence_init_pool(void)
 {
+#ifdef CONFIG_PPC64
+   if (!radix_enabled())
+   return false;
+#endif
return !kfence_disabled;
 }
 #endif
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 01c3b4b65241..1a1b50735fa0 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -431,7 +431,7 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long 
vend,
break;
 
cond_resched();
-   if (debug_pagealloc_enabled_or_kfence() &&
+   if (debug_pagealloc_enabled() &&
(paddr >> PAGE_SHIFT) < linear_map_hash_count)
linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80;
}
@@ -814,7 +814,7 @@ static void __init htab_init_page_sizes(void)
bool aligned = true;
init_hpte_page_sizes();
 
-   if (!debug_pagealloc_enabled_or_kfence()) {
+   if (!debug_pagealloc_enabled()) {
/*
 * Pick a size for the linear mapping. Currently, we only
 * support 16M, 1M and 4K which is the default
@@ -1134,7 +1134,7 @@ static void __init htab_initialize(void)
 
prot = pgprot_val(PAGE_KERNEL);
 
-   if (debug_pagealloc_enabled_or_kfence()) {
+   if (debug_pagealloc_enabled()) {
linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
linear_map_hash_slots = memblock_alloc_try_nid(
linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
@@ -2117,7 +2117,7 @@ void hpt_do_stress(unsigned long ea, unsigned long 
hpte_group)
}
 }
 
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
+#if defined(CONFIG_DEBUG_PAGEALLOC)
 static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
 
 static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
@@ -2191,7 +2191,13 @@ int hash__kernel_map_pages(struct page *page, int 
numpages, int enable)
local_irq_restore(flags);
return 0;
 }
-#endif /* CONFIG_DEBUG_PAGEALLOC || CONFIG_KFENCE */
+#else /* CONFIG_DEBUG_PAGEALLOC */
+int hash__kernel_map_pages(struct page *page, int numpages,
+int enable)
+{
+   return 0;
+}
+#endif /* CONFIG_DEBUG_PAGEALLOC */
 
 void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
-- 
2.45.2

[RFC v1 02/10] book3s64/hash: Refactor kernel linear map related calls

2024-07-31 Thread Ritesh Harjani (IBM)

This just brings all linear map related handling at one place instead of
having those functions scattered in hash_utils file.
Makes it easy for review.

No functionality changes in this patch.

Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 164 +-
 1 file changed, 82 insertions(+), 82 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 1a1b50735fa0..b6ae955971bf 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -273,6 +273,88 @@ void hash__tlbiel_all(unsigned int action)
WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
 }
 
+#if defined(CONFIG_DEBUG_PAGEALLOC)
+static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
+
+static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
+{
+   unsigned long hash;
+   unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
+   unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
+   unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), 
HPTE_USE_KERNEL_KEY);
+   long ret;
+
+   hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
+
+   /* Don't create HPTE entries for bad address */
+   if (!vsid)
+   return;
+
+   if (linear_map_hash_slots[lmi] & 0x80)
+   return;
+
+   ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
+   HPTE_V_BOLTED,
+   mmu_linear_psize, mmu_kernel_ssize);
+
+   BUG_ON (ret < 0);
+   raw_spin_lock(&linear_map_hash_lock);
+   BUG_ON(linear_map_hash_slots[lmi] & 0x80);
+   linear_map_hash_slots[lmi] = ret | 0x80;
+   raw_spin_unlock(&linear_map_hash_lock);
+}
+
+static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
+{
+   unsigned long hash, hidx, slot;
+   unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
+   unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
+
+   hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
+   raw_spin_lock(&linear_map_hash_lock);
+   if (!(linear_map_hash_slots[lmi] & 0x80)) {
+   raw_spin_unlock(&linear_map_hash_lock);
+   return;
+   }
+   hidx = linear_map_hash_slots[lmi] & 0x7f;
+   linear_map_hash_slots[lmi] = 0;
+   raw_spin_unlock(&linear_map_hash_lock);
+   if (hidx & _PTEIDX_SECONDARY)
+   hash = ~hash;
+   slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+   slot += hidx & _PTEIDX_GROUP_IX;
+   mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize,
+mmu_linear_psize,
+mmu_kernel_ssize, 0);
+}
+
+int hash__kernel_map_pages(struct page *page, int numpages, int enable)
+{
+   unsigned long flags, vaddr, lmi;
+   int i;
+
+   local_irq_save(flags);
+   for (i = 0; i < numpages; i++, page++) {
+   vaddr = (unsigned long)page_address(page);
+   lmi = __pa(vaddr) >> PAGE_SHIFT;
+   if (lmi >= linear_map_hash_count)
+   continue;
+   if (enable)
+   kernel_map_linear_page(vaddr, lmi);
+   else
+   kernel_unmap_linear_page(vaddr, lmi);
+   }
+   local_irq_restore(flags);
+   return 0;
+}
+#else /* CONFIG_DEBUG_PAGEALLOC */
+int hash__kernel_map_pages(struct page *page, int numpages,
+int enable)
+{
+   return 0;
+}
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
 /*
  * 'R' and 'C' update notes:
  *  - Under pHyp or KVM, the updatepp path will not set C, thus it *will*
@@ -2117,88 +2199,6 @@ void hpt_do_stress(unsigned long ea, unsigned long 
hpte_group)
}
 }
 
-#if defined(CONFIG_DEBUG_PAGEALLOC)
-static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
-
-static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
-{
-   unsigned long hash;
-   unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
-   unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
-   unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), 
HPTE_USE_KERNEL_KEY);
-   long ret;
-
-   hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
-
-   /* Don't create HPTE entries for bad address */
-   if (!vsid)
-   return;
-
-   if (linear_map_hash_slots[lmi] & 0x80)
-   return;
-
-   ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
-   HPTE_V_BOLTED,
-   mmu_linear_psize, mmu_kernel_ssize);
-
-   BUG_ON (ret < 0);
-   raw_spin_lock(&linear_map_hash_lock);
-   BUG_ON(linear_map_hash_

[RFC v1 03/10] book3s64/hash: Add hash_debug_pagealloc_add_slot() function

2024-07-31 Thread Ritesh Harjani (IBM)

This adds hash_debug_pagealloc_add_slot() function instead of open
coding that in htab_bolt_mapping(). This is required since we will be
separating kfence functionality to not depend upon debug_pagealloc.

No functionality change in this patch.

Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index b6ae955971bf..47b40b9b49d6 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -328,6 +328,14 @@ static void kernel_unmap_linear_page(unsigned long vaddr, 
unsigned long lmi)
 mmu_kernel_ssize, 0);
 }
 
+static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, int slot)
+{
+   if (!debug_pagealloc_enabled())
+   return;
+   if ((paddr >> PAGE_SHIFT) < linear_map_hash_count)
+   linear_map_hash_slots[paddr >> PAGE_SHIFT] = slot | 0x80;
+}
+
 int hash__kernel_map_pages(struct page *page, int numpages, int enable)
 {
unsigned long flags, vaddr, lmi;
@@ -353,6 +361,7 @@ int hash__kernel_map_pages(struct page *page, int numpages,
 {
return 0;
 }
+static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, int slot) 
{}
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
 /*
@@ -513,9 +522,7 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long 
vend,
break;
 
cond_resched();
-   if (debug_pagealloc_enabled() &&
-   (paddr >> PAGE_SHIFT) < linear_map_hash_count)
-   linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80;
+   hash_debug_pagealloc_add_slot(paddr, ret);
}
return ret < 0 ? ret : 0;
 }
-- 
2.45.2

[RFC v1 04/10] book3s64/hash: Add hash_debug_pagealloc_alloc_slots() function

2024-07-31 Thread Ritesh Harjani (IBM)

This adds hash_debug_pagealloc_alloc_slots() function instead of open
coding that in htab_initialize(). This is required since we will be
separating the kfence functionality to not depend upon debug_pagealloc.

Now that everything required for debug_pagealloc is under a #ifdef
config. Bring in linear_map_hash_slots and linear_map_hash_count
variables under the same config too.

Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 29 ---
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 47b40b9b49d6..6af47b996e79 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -123,8 +123,6 @@ EXPORT_SYMBOL_GPL(mmu_slb_size);
 #ifdef CONFIG_PPC_64K_PAGES
 int mmu_ci_restrictions;
 #endif
-static u8 *linear_map_hash_slots;
-static unsigned long linear_map_hash_count;
 struct mmu_hash_ops mmu_hash_ops;
 EXPORT_SYMBOL(mmu_hash_ops);
 
@@ -274,6 +272,8 @@ void hash__tlbiel_all(unsigned int action)
 }
 
 #if defined(CONFIG_DEBUG_PAGEALLOC)
+static u8 *linear_map_hash_slots;
+static unsigned long linear_map_hash_count;
 static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
 
 static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
@@ -328,6 +328,19 @@ static void kernel_unmap_linear_page(unsigned long vaddr, 
unsigned long lmi)
 mmu_kernel_ssize, 0);
 }
 
+static inline void hash_debug_pagealloc_alloc_slots(void)
+{
+   if (!debug_pagealloc_enabled())
+   return;
+   linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
+   linear_map_hash_slots = memblock_alloc_try_nid(
+   linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
+   ppc64_rma_size, NUMA_NO_NODE);
+   if (!linear_map_hash_slots)
+   panic("%s: Failed to allocate %lu bytes max_addr=%pa\n",
+ __func__, linear_map_hash_count, &ppc64_rma_size);
+}
+
 static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, int slot)
 {
if (!debug_pagealloc_enabled())
@@ -361,6 +374,7 @@ int hash__kernel_map_pages(struct page *page, int numpages,
 {
return 0;
 }
+static inline void hash_debug_pagealloc_alloc_slots(void) {}
 static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, int slot) 
{}
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
@@ -1223,16 +1237,7 @@ static void __init htab_initialize(void)
 
prot = pgprot_val(PAGE_KERNEL);
 
-   if (debug_pagealloc_enabled()) {
-   linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
-   linear_map_hash_slots = memblock_alloc_try_nid(
-   linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
-   ppc64_rma_size, NUMA_NO_NODE);
-   if (!linear_map_hash_slots)
-   panic("%s: Failed to allocate %lu bytes max_addr=%pa\n",
- __func__, linear_map_hash_count, &ppc64_rma_size);
-   }
-
+   hash_debug_pagealloc_alloc_slots();
/* create bolted the linear mapping in the hash table */
for_each_mem_range(i, &base, &end) {
size = end - base;
-- 
2.45.2

[RFC v1 05/10] book3s64/hash: Refactor hash__kernel_map_pages() function

2024-07-31 Thread Ritesh Harjani (IBM)

This refactors hash__kernel_map_pages() function to call
hash_debug_pagealloc_map_pages(). This will come useful when we will add
kfence support.

No functionality changes in this patch.

Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 6af47b996e79..b96bbb0025fb 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -349,7 +349,8 @@ static inline void 
hash_debug_pagealloc_add_slot(phys_addr_t paddr, int slot)
linear_map_hash_slots[paddr >> PAGE_SHIFT] = slot | 0x80;
 }
 
-int hash__kernel_map_pages(struct page *page, int numpages, int enable)
+static int hash_debug_pagealloc_map_pages(struct page *page, int numpages,
+ int enable)
 {
unsigned long flags, vaddr, lmi;
int i;
@@ -368,6 +369,12 @@ int hash__kernel_map_pages(struct page *page, int 
numpages, int enable)
local_irq_restore(flags);
return 0;
 }
+
+int hash__kernel_map_pages(struct page *page, int numpages, int enable)
+{
+   return hash_debug_pagealloc_map_pages(page, numpages, enable);
+}
+
 #else /* CONFIG_DEBUG_PAGEALLOC */
 int hash__kernel_map_pages(struct page *page, int numpages,
 int enable)
-- 
2.45.2

[RFC v1 06/10] book3s64/hash: Make kernel_map_linear_page() generic

2024-07-31 Thread Ritesh Harjani (IBM)

Currently kernel_map_linear_page() function assumes to be working on
linear_map_hash_slots array. But since in later patches we need a
separate linear map array for kfence, hence make
kernel_map_linear_page() take a linear map array and lock in it's
function argument.

This is needed to separate out kfence from debug_pagealloc
infrastructure.

Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 47 ++-
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index b96bbb0025fb..3f3eaf0a254b 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -272,11 +272,8 @@ void hash__tlbiel_all(unsigned int action)
 }
 
 #if defined(CONFIG_DEBUG_PAGEALLOC)
-static u8 *linear_map_hash_slots;
-static unsigned long linear_map_hash_count;
-static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
-
-static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
+static void kernel_map_linear_page(unsigned long vaddr, unsigned long idx,
+  u8 *slots, raw_spinlock_t *lock)
 {
unsigned long hash;
unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
@@ -290,7 +287,7 @@ static void kernel_map_linear_page(unsigned long vaddr, 
unsigned long lmi)
if (!vsid)
return;
 
-   if (linear_map_hash_slots[lmi] & 0x80)
+   if (slots[idx] & 0x80)
return;
 
ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
@@ -298,36 +295,40 @@ static void kernel_map_linear_page(unsigned long vaddr, 
unsigned long lmi)
mmu_linear_psize, mmu_kernel_ssize);
 
BUG_ON (ret < 0);
-   raw_spin_lock(&linear_map_hash_lock);
-   BUG_ON(linear_map_hash_slots[lmi] & 0x80);
-   linear_map_hash_slots[lmi] = ret | 0x80;
-   raw_spin_unlock(&linear_map_hash_lock);
+   raw_spin_lock(lock);
+   BUG_ON(slots[idx] & 0x80);
+   slots[idx] = ret | 0x80;
+   raw_spin_unlock(lock);
 }
 
-static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
+static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long idx,
+u8 *slots, raw_spinlock_t *lock)
 {
-   unsigned long hash, hidx, slot;
+   unsigned long hash, hslot, slot;
unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
 
hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
-   raw_spin_lock(&linear_map_hash_lock);
-   if (!(linear_map_hash_slots[lmi] & 0x80)) {
-   raw_spin_unlock(&linear_map_hash_lock);
+   raw_spin_lock(lock);
+   if (!(slots[idx] & 0x80)) {
+   raw_spin_unlock(lock);
return;
}
-   hidx = linear_map_hash_slots[lmi] & 0x7f;
-   linear_map_hash_slots[lmi] = 0;
-   raw_spin_unlock(&linear_map_hash_lock);
-   if (hidx & _PTEIDX_SECONDARY)
+   hslot = slots[idx] & 0x7f;
+   slots[idx] = 0;
+   raw_spin_unlock(lock);
+   if (hslot & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-   slot += hidx & _PTEIDX_GROUP_IX;
+   slot += hslot & _PTEIDX_GROUP_IX;
mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize,
 mmu_linear_psize,
 mmu_kernel_ssize, 0);
 }
 
+static u8 *linear_map_hash_slots;
+static unsigned long linear_map_hash_count;
+static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
 static inline void hash_debug_pagealloc_alloc_slots(void)
 {
if (!debug_pagealloc_enabled())
@@ -362,9 +363,11 @@ static int hash_debug_pagealloc_map_pages(struct page 
*page, int numpages,
if (lmi >= linear_map_hash_count)
continue;
if (enable)
-   kernel_map_linear_page(vaddr, lmi);
+   kernel_map_linear_page(vaddr, lmi,
+   linear_map_hash_slots, &linear_map_hash_lock);
else
-   kernel_unmap_linear_page(vaddr, lmi);
+   kernel_unmap_linear_page(vaddr, lmi,
+   linear_map_hash_slots, &linear_map_hash_lock);
}
local_irq_restore(flags);
return 0;
-- 
2.45.2

[RFC v1 07/10] book3s64/hash: Disable debug_pagealloc if it requires more memory

2024-07-31 Thread Ritesh Harjani (IBM)

Make size of the linear map to be allocated in RMA region to be of
ppc64_rma_size / 4. If debug_pagealloc requires more memory than that
then do not allocate any memory and disable debug_pagealloc.

Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 3f3eaf0a254b..906cd167180a 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -331,9 +331,19 @@ static unsigned long linear_map_hash_count;
 static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
 static inline void hash_debug_pagealloc_alloc_slots(void)
 {
+   unsigned long max_hash_count = (ppc64_rma_size / 4) >> PAGE_SHIFT;
+
if (!debug_pagealloc_enabled())
return;
linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
+   if (unlikely(linear_map_hash_count > max_hash_count)) {
+   pr_info("linear map size (%llu) greater than 4 times RMA region 
(%llu). Disabling debug_pagealloc\n",
+   ((u64)linear_map_hash_count << PAGE_SHIFT),
+   ppc64_rma_size);
+   linear_map_hash_count = 0;
+   return;
+   }
+
linear_map_hash_slots = memblock_alloc_try_nid(
linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
ppc64_rma_size, NUMA_NO_NODE);
@@ -344,7 +354,7 @@ static inline void hash_debug_pagealloc_alloc_slots(void)
 
 static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, int slot)
 {
-   if (!debug_pagealloc_enabled())
+   if (!debug_pagealloc_enabled() || !linear_map_hash_count)
return;
if ((paddr >> PAGE_SHIFT) < linear_map_hash_count)
linear_map_hash_slots[paddr >> PAGE_SHIFT] = slot | 0x80;
@@ -356,6 +366,9 @@ static int hash_debug_pagealloc_map_pages(struct page 
*page, int numpages,
unsigned long flags, vaddr, lmi;
int i;
 
+   if (!debug_pagealloc_enabled() || !linear_map_hash_count)
+   return 0;
+
local_irq_save(flags);
for (i = 0; i < numpages; i++, page++) {
vaddr = (unsigned long)page_address(page);
-- 
2.45.2

[RFC v1 08/10] book3s64/hash: Add kfence functionality

2024-07-31 Thread Ritesh Harjani (IBM)

Now that linear map functionality of debug_pagealloc is made generic,
enable kfence to use this generic infrastructure.

1. Define kfence related linear map variables.
   - u8 *linear_map_kf_hash_slots;
   - unsigned long linear_map_kf_hash_count;
   - DEFINE_RAW_SPINLOCK(linear_map_kf_hash_lock);
2. The linear map size allocated in RMA region is quite small
   (KFENCE_POOL_SIZE >> PAGE_SHIFT) which is 512 bytes by default.
3. kfence pool memory is reserved using memblock_phys_alloc() which has
   can come from anywhere.
   (default 255 objects => ((1+255) * 2) << PAGE_SHIFT = 32MB)
4. The hash slot information for kfence memory gets added in linear map
   in hash_linear_map_add_slot() (which also adds for debug_pagealloc).

Reported-by: Pavithra Prakash 
Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/include/asm/kfence.h |   5 -
 arch/powerpc/mm/book3s64/hash_utils.c | 162 +++---
 2 files changed, 149 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/kfence.h 
b/arch/powerpc/include/asm/kfence.h
index f3a9476a71b3..fab124ada1c7 100644
--- a/arch/powerpc/include/asm/kfence.h
+++ b/arch/powerpc/include/asm/kfence.h
@@ -10,7 +10,6 @@
 
 #include 
 #include 
-#include 
 
 #ifdef CONFIG_PPC64_ELF_ABI_V1
 #define ARCH_FUNC_PREFIX "."
@@ -26,10 +25,6 @@ static inline void disable_kfence(void)
 
 static inline bool arch_kfence_init_pool(void)
 {
-#ifdef CONFIG_PPC64
-   if (!radix_enabled())
-   return false;
-#endif
return !kfence_disabled;
 }
 #endif
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 906cd167180a..c66b9921fc7d 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -66,6 +67,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -271,7 +273,7 @@ void hash__tlbiel_all(unsigned int action)
WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
 }
 
-#if defined(CONFIG_DEBUG_PAGEALLOC)
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
 static void kernel_map_linear_page(unsigned long vaddr, unsigned long idx,
   u8 *slots, raw_spinlock_t *lock)
 {
@@ -325,11 +327,13 @@ static void kernel_unmap_linear_page(unsigned long vaddr, 
unsigned long idx,
 mmu_linear_psize,
 mmu_kernel_ssize, 0);
 }
+#endif
 
+#if defined(CONFIG_DEBUG_PAGEALLOC)
 static u8 *linear_map_hash_slots;
 static unsigned long linear_map_hash_count;
 static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
-static inline void hash_debug_pagealloc_alloc_slots(void)
+static void hash_debug_pagealloc_alloc_slots(void)
 {
unsigned long max_hash_count = (ppc64_rma_size / 4) >> PAGE_SHIFT;
 
@@ -352,7 +356,8 @@ static inline void hash_debug_pagealloc_alloc_slots(void)
  __func__, linear_map_hash_count, &ppc64_rma_size);
 }
 
-static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, int slot)
+static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr,
+   int slot)
 {
if (!debug_pagealloc_enabled() || !linear_map_hash_count)
return;
@@ -386,20 +391,148 @@ static int hash_debug_pagealloc_map_pages(struct page 
*page, int numpages,
return 0;
 }
 
-int hash__kernel_map_pages(struct page *page, int numpages, int enable)
+#else /* CONFIG_DEBUG_PAGEALLOC */
+static inline void hash_debug_pagealloc_alloc_slots(void) {}
+static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, int slot) 
{}
+static int __maybe_unused
+hash_debug_pagealloc_map_pages(struct page *page, int numpages, int enable)
 {
-   return hash_debug_pagealloc_map_pages(page, numpages, enable);
+   return 0;
 }
+#endif /* CONFIG_DEBUG_PAGEALLOC */
 
-#else /* CONFIG_DEBUG_PAGEALLOC */
-int hash__kernel_map_pages(struct page *page, int numpages,
-int enable)
+#if defined(CONFIG_KFENCE)
+static u8 *linear_map_kf_hash_slots;
+static unsigned long linear_map_kf_hash_count;
+static DEFINE_RAW_SPINLOCK(linear_map_kf_hash_lock);
+
+static phys_addr_t kfence_pool;
+
+static inline void hash_kfence_alloc_pool(void)
+{
+
+   // allocate linear map for kfence within RMA region
+   linear_map_kf_hash_count = KFENCE_POOL_SIZE >> PAGE_SHIFT;
+   linear_map_kf_hash_slots = memblock_alloc_try_nid(
+   linear_map_kf_hash_count, 1,
+   MEMBLOCK_LOW_LIMIT, ppc64_rma_size,
+   NUMA_NO_NODE);
+   if (!linear_map_kf_hash_slots) {
+   pr_err("%s: memblock for linear map (%lu) failed\n", __func__,
+

[RFC v1 09/10] book3s64/radix: Refactoring common kfence related functions

2024-07-31 Thread Ritesh Harjani (IBM)

Both radix and hash on book3s requires to detect if kfence
early init is enabled or not. Hash needs to disable kfence
if early init is not enabled because with kfence the linear map is
mapped using PAGE_SIZE rather than 16M mapping.
We don't support multiple page sizes for slb entry used for kernel
linear map in book3s64.

This patch refactors out the common functions required to detect kfence
early init is enabled or not.

Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/include/asm/kfence.h|  2 ++
 arch/powerpc/mm/book3s64/radix_pgtable.c | 12 
 arch/powerpc/mm/init-common.c| 12 
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/kfence.h 
b/arch/powerpc/include/asm/kfence.h
index fab124ada1c7..5975688d8de1 100644
--- a/arch/powerpc/include/asm/kfence.h
+++ b/arch/powerpc/include/asm/kfence.h
@@ -15,6 +15,8 @@
 #define ARCH_FUNC_PREFIX "."
 #endif
 
+extern bool kfence_early_init;
+
 #ifdef CONFIG_KFENCE
 extern bool kfence_disabled;
 
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index b0d927009af8..311e2112d782 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -363,18 +363,6 @@ static int __meminit create_physical_mapping(unsigned long 
start,
 }
 
 #ifdef CONFIG_KFENCE
-static bool __ro_after_init kfence_early_init = 
!!CONFIG_KFENCE_SAMPLE_INTERVAL;
-
-static int __init parse_kfence_early_init(char *arg)
-{
-   int val;
-
-   if (get_option(&arg, &val))
-   kfence_early_init = !!val;
-   return 0;
-}
-early_param("kfence.sample_interval", parse_kfence_early_init);
-
 static inline phys_addr_t alloc_kfence_pool(void)
 {
phys_addr_t kfence_pool;
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index 21131b96d209..259821a4db62 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -33,6 +33,18 @@ bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP);
 bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP);
 #ifdef CONFIG_KFENCE
 bool __ro_after_init kfence_disabled;
+bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL;
+static int __init parse_kfence_early_init(char *arg)
+{
+   int val;
+
+   if (get_option(&arg, &val))
+   kfence_early_init = !!val;
+   return 0;
+}
+early_param("kfence.sample_interval", parse_kfence_early_init);
+#else
+bool __ro_after_init kfence_early_init;
 #endif
 
 static int __init parse_nosmep(char *p)
-- 
2.45.2

[RFC v1 10/10] book3s64/hash: Disable kfence if not early init

2024-07-31 Thread Ritesh Harjani (IBM)

Enable kfence on book3s64 hash only when early init is enabled.
This is because, kfence could cause the kernel linear map to be mapped
at PAGE_SIZE level instead of 16M (which I guess we don't want).

Also currently there is no way to -
1. Make multiple page size entries for the SLB used for kernel linear
   map.
2. No easy way of getting the hash slot details after the page table
   mapping for kernel linear setup. So even if kfence allocate the
   pool in late init, we won't be able to get the hash slot details in
   kfence linear map.

Thus this patch disables kfence on hash if kfence early init is not
enabled.

Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index c66b9921fc7d..759dbcbf1483 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -410,6 +410,8 @@ static phys_addr_t kfence_pool;

 static inline void hash_kfence_alloc_pool(void)
 {
+   if (!kfence_early_init)
+   goto err;

// allocate linear map for kfence within RMA region
linear_map_kf_hash_count = KFENCE_POOL_SIZE >> PAGE_SHIFT;
@@ -1074,7 +1076,8 @@ static void __init htab_init_page_sizes(void)
bool aligned = true;
init_hpte_page_sizes();

-   if (!debug_pagealloc_enabled_or_kfence()) {
+   if (!debug_pagealloc_enabled() &&
+   !(IS_ENABLED(CONFIG_KFENCE) && kfence_early_init)) {
/*
 * Pick a size for the linear mapping. Currently, we only
 * support 16M, 1M and 4K which is the default
--
2.45.2

[PATCH] powerpc: Use printk instead of WARN in change_memory_attr

2024-08-27 Thread Ritesh Harjani (IBM)

Use pr_warn_once instead of WARN_ON_ONCE as discussed here [1]
for printing possible use of set_memory_* on linear map on Hash.

[1]: https://lore.kernel.org/all/877cc2fpi2.fsf@mail.lhotse/#t

Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/mm/pageattr.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/pageattr.c b/arch/powerpc/mm/pageattr.c
index ac22bf28086f..c8c2d664c6f3 100644
--- a/arch/powerpc/mm/pageattr.c
+++ b/arch/powerpc/mm/pageattr.c
@@ -94,8 +94,11 @@ int change_memory_attr(unsigned long addr, int numpages, 
long action)
if (!radix_enabled()) {
int region = get_region_id(addr);

-   if (WARN_ON_ONCE(region != VMALLOC_REGION_ID && region != 
IO_REGION_ID))
+   if (region != VMALLOC_REGION_ID && region != IO_REGION_ID) {
+   pr_warn_once("%s: possible use of set_memory_* on 
linear map on Hash from (%ps)\n",
+   __func__, __builtin_return_address(0));
return -EINVAL;
+   }
}
 #endif

--
2.39.2

[RFC RESEND v2 09/13] book3s64/hash: Disable debug_pagealloc if it requires more memory

2024-10-14 Thread Ritesh Harjani (IBM)

Make size of the linear map to be allocated in RMA region to be of
ppc64_rma_size / 4. If debug_pagealloc requires more memory than that
then do not allocate any memory and disable debug_pagealloc.

Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index cc2eaa97982c..cffbb6499ac4 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -331,9 +331,19 @@ static unsigned long linear_map_hash_count;
 static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
 static inline void hash_debug_pagealloc_alloc_slots(void)
 {
+   unsigned long max_hash_count = ppc64_rma_size / 4;
+
if (!debug_pagealloc_enabled())
return;
linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
+   if (unlikely(linear_map_hash_count > max_hash_count)) {
+   pr_info("linear map size (%llu) greater than 4 times RMA region 
(%llu). Disabling debug_pagealloc\n",
+   ((u64)linear_map_hash_count << PAGE_SHIFT),
+   ppc64_rma_size);
+   linear_map_hash_count = 0;
+   return;
+   }
+
linear_map_hash_slots = memblock_alloc_try_nid(
linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
ppc64_rma_size, NUMA_NO_NODE);
@@ -344,7 +354,7 @@ static inline void hash_debug_pagealloc_alloc_slots(void)
 
 static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, int slot)
 {
-   if (!debug_pagealloc_enabled())
+   if (!debug_pagealloc_enabled() || !linear_map_hash_count)
return;
if ((paddr >> PAGE_SHIFT) < linear_map_hash_count)
linear_map_hash_slots[paddr >> PAGE_SHIFT] = slot | 0x80;
@@ -356,6 +366,9 @@ static int hash_debug_pagealloc_map_pages(struct page 
*page, int numpages,
unsigned long flags, vaddr, lmi;
int i;
 
+   if (!debug_pagealloc_enabled() || !linear_map_hash_count)
+   return 0;
+
local_irq_save(flags);
for (i = 0; i < numpages; i++, page++) {
vaddr = (unsigned long)page_address(page);
-- 
2.46.0

[RFC RESEND v2 10/13] book3s64/hash: Add kfence functionality

2024-10-14 Thread Ritesh Harjani (IBM)

Now that linear map functionality of debug_pagealloc is made generic,
enable kfence to use this generic infrastructure.

1. Define kfence related linear map variables.
   - u8 *linear_map_kf_hash_slots;
   - unsigned long linear_map_kf_hash_count;
   - DEFINE_RAW_SPINLOCK(linear_map_kf_hash_lock);
2. The linear map size allocated in RMA region is quite small
   (KFENCE_POOL_SIZE >> PAGE_SHIFT) which is 512 bytes by default.
3. kfence pool memory is reserved using memblock_phys_alloc() which has
   can come from anywhere.
   (default 255 objects => ((1+255) * 2) << PAGE_SHIFT = 32MB)
4. The hash slot information for kfence memory gets added in linear map
   in hash_linear_map_add_slot() (which also adds for debug_pagealloc).

Reported-by: Pavithra Prakash 
Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/include/asm/kfence.h |   5 -
 arch/powerpc/mm/book3s64/hash_utils.c | 162 +++---
 2 files changed, 149 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/kfence.h 
b/arch/powerpc/include/asm/kfence.h
index f3a9476a71b3..fab124ada1c7 100644
--- a/arch/powerpc/include/asm/kfence.h
+++ b/arch/powerpc/include/asm/kfence.h
@@ -10,7 +10,6 @@
 
 #include 
 #include 
-#include 
 
 #ifdef CONFIG_PPC64_ELF_ABI_V1
 #define ARCH_FUNC_PREFIX "."
@@ -26,10 +25,6 @@ static inline void disable_kfence(void)
 
 static inline bool arch_kfence_init_pool(void)
 {
-#ifdef CONFIG_PPC64
-   if (!radix_enabled())
-   return false;
-#endif
return !kfence_disabled;
 }
 #endif
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index cffbb6499ac4..53e6f3a524eb 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -66,6 +67,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -271,7 +273,7 @@ void hash__tlbiel_all(unsigned int action)
WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
 }
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
 static void kernel_map_linear_page(unsigned long vaddr, unsigned long idx,
   u8 *slots, raw_spinlock_t *lock)
 {
@@ -325,11 +327,13 @@ static void kernel_unmap_linear_page(unsigned long vaddr, 
unsigned long idx,
 mmu_linear_psize,
 mmu_kernel_ssize, 0);
 }
+#endif
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
 static u8 *linear_map_hash_slots;
 static unsigned long linear_map_hash_count;
 static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
-static inline void hash_debug_pagealloc_alloc_slots(void)
+static void hash_debug_pagealloc_alloc_slots(void)
 {
unsigned long max_hash_count = ppc64_rma_size / 4;
 
@@ -352,7 +356,8 @@ static inline void hash_debug_pagealloc_alloc_slots(void)
  __func__, linear_map_hash_count, &ppc64_rma_size);
 }
 
-static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, int slot)
+static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr,
+   int slot)
 {
if (!debug_pagealloc_enabled() || !linear_map_hash_count)
return;
@@ -386,20 +391,148 @@ static int hash_debug_pagealloc_map_pages(struct page 
*page, int numpages,
return 0;
 }
 
-int hash__kernel_map_pages(struct page *page, int numpages, int enable)
+#else /* CONFIG_DEBUG_PAGEALLOC */
+static inline void hash_debug_pagealloc_alloc_slots(void) {}
+static inline void hash_debug_pagealloc_add_slot(phys_addr_t paddr, int slot) 
{}
+static int __maybe_unused
+hash_debug_pagealloc_map_pages(struct page *page, int numpages, int enable)
 {
-   return hash_debug_pagealloc_map_pages(page, numpages, enable);
+   return 0;
 }
+#endif /* CONFIG_DEBUG_PAGEALLOC */
 
-#else /* CONFIG_DEBUG_PAGEALLOC */
-int hash__kernel_map_pages(struct page *page, int numpages,
-int enable)
+#ifdef CONFIG_KFENCE
+static u8 *linear_map_kf_hash_slots;
+static unsigned long linear_map_kf_hash_count;
+static DEFINE_RAW_SPINLOCK(linear_map_kf_hash_lock);
+
+static phys_addr_t kfence_pool;
+
+static inline void hash_kfence_alloc_pool(void)
+{
+
+   // allocate linear map for kfence within RMA region
+   linear_map_kf_hash_count = KFENCE_POOL_SIZE >> PAGE_SHIFT;
+   linear_map_kf_hash_slots = memblock_alloc_try_nid(
+   linear_map_kf_hash_count, 1,
+   MEMBLOCK_LOW_LIMIT, ppc64_rma_size,
+   NUMA_NO_NODE);
+   if (!linear_map_kf_hash_slots) {
+   pr_err("%s: memblock for linear map (%lu) failed\n", __func__,
+   linear_map_kf_hash_count);
+   goto

[RFC RESEND v2 11/13] book3s64/radix: Refactoring common kfence related functions

2024-10-14 Thread Ritesh Harjani (IBM)

Both radix and hash on book3s requires to detect if kfence
early init is enabled or not. Hash needs to disable kfence
if early init is not enabled because with kfence the linear map is
mapped using PAGE_SIZE rather than 16M mapping.
We don't support multiple page sizes for slb entry used for kernel
linear map in book3s64.

This patch refactors out the common functions required to detect kfence
early init is enabled or not.

Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/include/asm/kfence.h|  8 ++--
 arch/powerpc/mm/book3s64/pgtable.c   | 13 +
 arch/powerpc/mm/book3s64/radix_pgtable.c | 12 
 arch/powerpc/mm/init-common.c|  1 +
 4 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/kfence.h 
b/arch/powerpc/include/asm/kfence.h
index fab124ada1c7..1f7cab58ab2c 100644
--- a/arch/powerpc/include/asm/kfence.h
+++ b/arch/powerpc/include/asm/kfence.h
@@ -15,7 +15,7 @@
 #define ARCH_FUNC_PREFIX "."
 #endif
 
-#ifdef CONFIG_KFENCE
+extern bool kfence_early_init;
 extern bool kfence_disabled;
 
 static inline void disable_kfence(void)
@@ -27,7 +27,11 @@ static inline bool arch_kfence_init_pool(void)
 {
return !kfence_disabled;
 }
-#endif
+
+static inline bool kfence_early_init_enabled(void)
+{
+   return IS_ENABLED(CONFIG_KFENCE) && kfence_early_init;
+}
 
 #ifdef CONFIG_PPC64
 static inline bool kfence_protect_page(unsigned long addr, bool protect)
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index 5a4a75369043..374542528080 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -37,6 +37,19 @@ EXPORT_SYMBOL(__pmd_frag_nr);
 unsigned long __pmd_frag_size_shift;
 EXPORT_SYMBOL(__pmd_frag_size_shift);
 
+#ifdef CONFIG_KFENCE
+extern bool kfence_early_init;
+static int __init parse_kfence_early_init(char *arg)
+{
+   int val;
+
+   if (get_option(&arg, &val))
+   kfence_early_init = !!val;
+   return 0;
+}
+early_param("kfence.sample_interval", parse_kfence_early_init);
+#endif
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
  * This is called when relaxing access to a hugepage. It's also called in the 
page
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index b0d927009af8..311e2112d782 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -363,18 +363,6 @@ static int __meminit create_physical_mapping(unsigned long 
start,
 }
 
 #ifdef CONFIG_KFENCE
-static bool __ro_after_init kfence_early_init = 
!!CONFIG_KFENCE_SAMPLE_INTERVAL;
-
-static int __init parse_kfence_early_init(char *arg)
-{
-   int val;
-
-   if (get_option(&arg, &val))
-   kfence_early_init = !!val;
-   return 0;
-}
-early_param("kfence.sample_interval", parse_kfence_early_init);
-
 static inline phys_addr_t alloc_kfence_pool(void)
 {
phys_addr_t kfence_pool;
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index 2978fcbe307e..745097554bea 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -33,6 +33,7 @@ bool disable_kuep = !IS_ENABLED(CONFIG_PPC_KUEP);
 bool disable_kuap = !IS_ENABLED(CONFIG_PPC_KUAP);
 #ifdef CONFIG_KFENCE
 bool __ro_after_init kfence_disabled;
+bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL;
 #endif
 
 static int __init parse_nosmep(char *p)
-- 
2.46.0

[RFC RESEND v2 13/13] book3s64/hash: Early detect debug_pagealloc size requirement

2024-10-14 Thread Ritesh Harjani (IBM)

Add hash_supports_debug_pagealloc() helper to detect whether
debug_pagealloc can be supported on hash or not. This checks for both,
whether debug_pagealloc config is enabled and the linear map should
fit within rma_size/4 region size.

This can then be used early during htab_init_page_sizes() to decide
linear map pagesize if hash supports either debug_pagealloc or
kfence.

Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 25 +
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index b6da25719e37..3ffc98b3deb1 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -329,25 +329,26 @@ static void kernel_unmap_linear_page(unsigned long vaddr, 
unsigned long idx,
 }
 #endif
 
+static inline bool hash_supports_debug_pagealloc(void)
+{
+   unsigned long max_hash_count = ppc64_rma_size / 4;
+   unsigned long linear_map_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
+
+   if (!debug_pagealloc_enabled() || linear_map_count > max_hash_count)
+   return false;
+   return true;
+}
+
 #ifdef CONFIG_DEBUG_PAGEALLOC
 static u8 *linear_map_hash_slots;
 static unsigned long linear_map_hash_count;
 static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
 static void hash_debug_pagealloc_alloc_slots(void)
 {
-   unsigned long max_hash_count = ppc64_rma_size / 4;
-
-   if (!debug_pagealloc_enabled())
-   return;
-   linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
-   if (unlikely(linear_map_hash_count > max_hash_count)) {
-   pr_info("linear map size (%llu) greater than 4 times RMA region 
(%llu). Disabling debug_pagealloc\n",
-   ((u64)linear_map_hash_count << PAGE_SHIFT),
-   ppc64_rma_size);
-   linear_map_hash_count = 0;
+   if (!hash_supports_debug_pagealloc())
return;
-   }
 
+   linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT;
linear_map_hash_slots = memblock_alloc_try_nid(
linear_map_hash_count, 1, MEMBLOCK_LOW_LIMIT,
ppc64_rma_size, NUMA_NO_NODE);
@@ -1076,7 +1077,7 @@ static void __init htab_init_page_sizes(void)
bool aligned = true;
init_hpte_page_sizes();
 
-   if (!debug_pagealloc_enabled() && !kfence_early_init_enabled()) {
+   if (!hash_supports_debug_pagealloc() && !kfence_early_init_enabled()) {
/*
 * Pick a size for the linear mapping. Currently, we only
 * support 16M, 1M and 4K which is the default
-- 
2.46.0

[RFC RESEND v2 12/13] book3s64/hash: Disable kfence if not early init

2024-10-14 Thread Ritesh Harjani (IBM)

Enable kfence on book3s64 hash only when early init is enabled.
This is because, kfence could cause the kernel linear map to be mapped
at PAGE_SIZE level instead of 16M (which I guess we don't want).

Also currently there is no way to -
1. Make multiple page size entries for the SLB used for kernel linear
   map.
2. No easy way of getting the hash slot details after the page table
   mapping for kernel linear setup. So even if kfence allocate the
   pool in late init, we won't be able to get the hash slot details in
   kfence linear map.

Thus this patch disables kfence on hash if kfence early init is not
enabled.

Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 53e6f3a524eb..b6da25719e37 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -410,6 +410,8 @@ static phys_addr_t kfence_pool;
 
 static inline void hash_kfence_alloc_pool(void)
 {
+   if (!kfence_early_init_enabled())
+   goto err;
 
// allocate linear map for kfence within RMA region
linear_map_kf_hash_count = KFENCE_POOL_SIZE >> PAGE_SHIFT;
@@ -1074,7 +1076,7 @@ static void __init htab_init_page_sizes(void)
bool aligned = true;
init_hpte_page_sizes();
 
-   if (!debug_pagealloc_enabled_or_kfence()) {
+   if (!debug_pagealloc_enabled() && !kfence_early_init_enabled()) {
/*
 * Pick a size for the linear mapping. Currently, we only
 * support 16M, 1M and 4K which is the default
-- 
2.46.0

[RFC RESEND v2 00/13] powerpc/kfence: Improve kfence support

2024-10-14 Thread Ritesh Harjani (IBM)

Resending v2 for review comments.

This patch series addresses following to improve kfence support on Powerpc.

1. Usage of copy_from_kernel_nofault() within kernel, such as read from
   /proc/kcore can cause kfence to report false negatives.

   This is similar to what was reported on s390. [1]
   [1]: https://lore.kernel.org/all/20230213183858.1473681-1-...@linux.ibm.com/

   Hence this series adds patch-1 as a kfence kunit test to detect
   copy_from_kernel_nofault() case. I assume the same might be needed for all
   other archs as well (Please correct if this understanding is wrong).

   Patch-2, thus adds a fix to handle this case in ___do_page_fault() for
   powerpc.

2. (book3s64) Kfence depends upon debug_pagealloc infrastructure on Hash.
   debug_pagealloc allocates a linear map based on the size of the DRAM i.e.
   1 byte for every 64k page. That means for a 16TB DRAM, it will need 256MB
   memory for linear map. Memory for linear map on pseries comes from
   RMA region which has size limitation. On P8 RMA is 512MB, in which we also
   fit crash kernel at 256MB, paca allocations and emergency stacks.
   That means there is not enough memory in the RMA region for the linear map
   based on DRAM size (required by debug_pagealloc).

   Now kfence only requires memory for it's kfence objects. kfence by default
   requires only (255 + 1) * 2 i.e. 32 MB for 64k pagesize.

Summary of patches
==
This series in Patch-1 adds a kfence kunit testcase to detect
copy_from_kernel_nofault() case. I assume the same should be needed for all
other archs as well.

Patch-2 adds a fix to handle this false negatives from 
copy_from_kernel_nofault().

Patch[3-9] removes the direct dependency of kfence on debug_pagealloc
infrastructure. We make Hash kernel linear map functions to take linear map 
array
as a parameter so that it can support debug_pagealloc and kfence individually.
That means we don't need to keep the size of the linear map to be
DRAM_SIZE >> PAGE_SHIFT anymore for kfence.

Patch-10: Adds kfence support with above (abstracted out) kernel linear map
infrastructure. With it, this also fixes, the boot failure problem when kfence
gets enabled on Hash with >=16TB of RAM.

Patch-11 & Patch-12: Ensure late initialization of kfence is disabled for both
Hash and Radix due to linear mapping size limiations. Commit gives more
description.

Patch-13: Early detects if debug_pagealloc cannot be enabled (due to RMA size
limitation) so that the linear mapping size can be set correctly during init.

Testing:

It passes kfence kunit tests with Hash and Radix.
[   44.355173][T1] # kfence: pass:27 fail:0 skip:0 total:27
[   44.358631][T1] # Totals: pass:27 fail:0 skip:0 total:27
[   44.365570][T1] ok 1 kfence


Future TODO:

When kfence on Hash gets enabled, the kernel linear map uses PAGE_SIZE mapping
rather than 16MB mapping. This should be improved in future.

v1 -> v2:
=
1. Added a kunit testcase patch-1.
2. Fixed a false negative with copy_from_kernel_nofault() in patch-2.
3. Addressed review comments from Christophe Leroy.
4. Added patch-13.


Nirjhar Roy (1):
  mm/kfence: Add a new kunit test test_use_after_free_read_nofault()

Ritesh Harjani (IBM) (12):
  powerpc: mm: Fix kfence page fault reporting
  book3s64/hash: Remove kfence support temporarily
  book3s64/hash: Refactor kernel linear map related calls
  book3s64/hash: Add hash_debug_pagealloc_add_slot() function
  book3s64/hash: Add hash_debug_pagealloc_alloc_slots() function
  book3s64/hash: Refactor hash__kernel_map_pages() function
  book3s64/hash: Make kernel_map_linear_page() generic
  book3s64/hash: Disable debug_pagealloc if it requires more memory
  book3s64/hash: Add kfence functionality
  book3s64/radix: Refactoring common kfence related functions
  book3s64/hash: Disable kfence if not early init
  book3s64/hash: Early detect debug_pagealloc size requirement

 arch/powerpc/include/asm/kfence.h|   8 +-
 arch/powerpc/mm/book3s64/hash_utils.c| 364 +--
 arch/powerpc/mm/book3s64/pgtable.c   |  13 +
 arch/powerpc/mm/book3s64/radix_pgtable.c |  12 -
 arch/powerpc/mm/fault.c  |  10 +-
 arch/powerpc/mm/init-common.c|   1 +
 mm/kfence/kfence_test.c  |  17 ++
 7 files changed, 318 insertions(+), 107 deletions(-)

--
2.46.0

[RFC RESEND v2 08/13] book3s64/hash: Make kernel_map_linear_page() generic

2024-10-14 Thread Ritesh Harjani (IBM)

Currently kernel_map_linear_page() function assumes to be working on
linear_map_hash_slots array. But since in later patches we need a
separate linear map array for kfence, hence make
kernel_map_linear_page() take a linear map array and lock in it's
function argument.

This is needed to separate out kfence from debug_pagealloc
infrastructure.

Signed-off-by: Ritesh Harjani (IBM) 
---
 arch/powerpc/mm/book3s64/hash_utils.c | 47 ++-
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index da9b089c8e8b..cc2eaa97982c 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -272,11 +272,8 @@ void hash__tlbiel_all(unsigned int action)
 }
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
-static u8 *linear_map_hash_slots;
-static unsigned long linear_map_hash_count;
-static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
-
-static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
+static void kernel_map_linear_page(unsigned long vaddr, unsigned long idx,
+  u8 *slots, raw_spinlock_t *lock)
 {
unsigned long hash;
unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
@@ -290,7 +287,7 @@ static void kernel_map_linear_page(unsigned long vaddr, 
unsigned long lmi)
if (!vsid)
return;
 
-   if (linear_map_hash_slots[lmi] & 0x80)
+   if (slots[idx] & 0x80)
return;
 
ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
@@ -298,36 +295,40 @@ static void kernel_map_linear_page(unsigned long vaddr, 
unsigned long lmi)
mmu_linear_psize, mmu_kernel_ssize);
 
BUG_ON (ret < 0);
-   raw_spin_lock(&linear_map_hash_lock);
-   BUG_ON(linear_map_hash_slots[lmi] & 0x80);
-   linear_map_hash_slots[lmi] = ret | 0x80;
-   raw_spin_unlock(&linear_map_hash_lock);
+   raw_spin_lock(lock);
+   BUG_ON(slots[idx] & 0x80);
+   slots[idx] = ret | 0x80;
+   raw_spin_unlock(lock);
 }
 
-static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
+static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long idx,
+u8 *slots, raw_spinlock_t *lock)
 {
-   unsigned long hash, hidx, slot;
+   unsigned long hash, hslot, slot;
unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
 
hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
-   raw_spin_lock(&linear_map_hash_lock);
-   if (!(linear_map_hash_slots[lmi] & 0x80)) {
-   raw_spin_unlock(&linear_map_hash_lock);
+   raw_spin_lock(lock);
+   if (!(slots[idx] & 0x80)) {
+   raw_spin_unlock(lock);
return;
}
-   hidx = linear_map_hash_slots[lmi] & 0x7f;
-   linear_map_hash_slots[lmi] = 0;
-   raw_spin_unlock(&linear_map_hash_lock);
-   if (hidx & _PTEIDX_SECONDARY)
+   hslot = slots[idx] & 0x7f;
+   slots[idx] = 0;
+   raw_spin_unlock(lock);
+   if (hslot & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-   slot += hidx & _PTEIDX_GROUP_IX;
+   slot += hslot & _PTEIDX_GROUP_IX;
mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize,
 mmu_linear_psize,
 mmu_kernel_ssize, 0);
 }
 
+static u8 *linear_map_hash_slots;
+static unsigned long linear_map_hash_count;
+static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
 static inline void hash_debug_pagealloc_alloc_slots(void)
 {
if (!debug_pagealloc_enabled())
@@ -362,9 +363,11 @@ static int hash_debug_pagealloc_map_pages(struct page 
*page, int numpages,
if (lmi >= linear_map_hash_count)
continue;
if (enable)
-   kernel_map_linear_page(vaddr, lmi);
+   kernel_map_linear_page(vaddr, lmi,
+   linear_map_hash_slots, &linear_map_hash_lock);
else
-   kernel_unmap_linear_page(vaddr, lmi);
+   kernel_unmap_linear_page(vaddr, lmi,
+   linear_map_hash_slots, &linear_map_hash_lock);
}
local_irq_restore(flags);
return 0;
-- 
2.46.0

1 2 >

1 - 100 of 168 matches

Mail list logo