[PATCH] powerpc/xmon: Fix array_size.cocci warning

2022-11-13 Thread wangkailong
Fix the following coccicheck warning:

arch/powerpc/xmon/ppc-opc.c:957:67-68: WARNING: Use ARRAY_SIZE
arch/powerpc/xmon/ppc-opc.c:7280:24-25: WARNING: Use ARRAY_SIZE
arch/powerpc/xmon/ppc-opc.c:6972:25-26: WARNING: Use ARRAY_SIZE
arch/powerpc/xmon/ppc-opc.c:7211:21-22: WARNING: Use ARRAY_SIZE

Signed-off-by: KaiLong Wang 
---
 arch/powerpc/xmon/ppc-opc.c | 13 -
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/xmon/ppc-opc.c b/arch/powerpc/xmon/ppc-opc.c
index 0774d711453e..ad64c2709757 100644
--- a/arch/powerpc/xmon/ppc-opc.c
+++ b/arch/powerpc/xmon/ppc-opc.c
@@ -954,9 +954,7 @@ const struct powerpc_operand powerpc_operands[] =
   { 0xff, 11, NULL, NULL, PPC_OPERAND_SIGNOPT },
 };
 
-const unsigned int num_powerpc_operands = (sizeof (powerpc_operands)
-  / sizeof (powerpc_operands[0]));
-
+const unsigned int num_powerpc_operands = ARRAY_SIZE(powerpc_operands);
 /* The functions used to insert and extract complicated operands.  */
 
 /* The ARX, ARY, RX and RY operands are alternate encodings of GPRs.  */
@@ -6968,8 +6966,7 @@ const struct powerpc_opcode powerpc_opcodes[] = {
 {"fcfidu.",XRC(63,974,1),  XRA_MASK, POWER7|PPCA2, PPCVLE, {FRT, 
FRB}},
 };
 
-const int powerpc_num_opcodes =
-  sizeof (powerpc_opcodes) / sizeof (powerpc_opcodes[0]);
+const int powerpc_num_opcodes = ARRAY_SIZE(powerpc_opcodes);
 
 /* The VLE opcode table.
 
@@ -7207,8 +7204,7 @@ const struct powerpc_opcode vle_opcodes[] = {
 {"se_bl",  BD8(58,0,1),BD8_MASK,   PPCVLE, 0,  {B8}},
 };
 
-const int vle_num_opcodes =
-  sizeof (vle_opcodes) / sizeof (vle_opcodes[0]);
+const int vle_num_opcodes = ARRAY_SIZE(vle_opcodes);
 
 /* The macro table.  This is only used by the assembler.  */
 
@@ -7276,5 +7272,4 @@ const struct powerpc_macro powerpc_macros[] = {
 {"e_clrlslwi",4, PPCVLE, "e_rlwinm %0,%1,%3,(%2)-(%3),31-(%3)"},
 };
 
-const int powerpc_num_macros =
-  sizeof (powerpc_macros) / sizeof (powerpc_macros[0]);
+const int powerpc_num_macros = ARRAY_SIZE(powerpc_macros);
-- 
2.25.1


[PATCH] macintosh/mac_hid.c: don't load by default

2022-11-13 Thread Thomas Weißschuh
There should be no need to automatically load this driver on *all*
machines with a keyboard.

This driver is of very limited utility and has to be enabled by the user
explicitly anyway.
Furthermore its own header comment has deprecated it for 17 years.

Fixes: 99b089c3c38a ("Input: Mac button emulation - implement as an input 
filter")
Signed-off-by: Thomas Weißschuh 
---
 drivers/macintosh/mac_hid.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c
index d8c4d5664145..d01d28890db4 100644
--- a/drivers/macintosh/mac_hid.c
+++ b/drivers/macintosh/mac_hid.c
@@ -149,8 +149,6 @@ static const struct input_device_id mac_hid_emumouse_ids[] 
= {
{ },
 };
 
-MODULE_DEVICE_TABLE(input, mac_hid_emumouse_ids);
-
 static struct input_handler mac_hid_emumouse_handler = {
.filter = mac_hid_emumouse_filter,
.connect= mac_hid_emumouse_connect,

base-commit: fef7fd48922d11b22620e19f9c9101647bfe943d
-- 
2.38.1



Re: Writing not working to CPLD/FPGA.

2022-11-13 Thread Christophe Leroy
Le 11/11/2022 à 15:27, Steven J. Hill a écrit :
> On 11/11/22 02:53, Christophe Leroy wrote:
>>
>> First of all, kernel 3.12 is prehistoric. Have you tried with latest
>> kernel, or at least with one of the long term support releases (see
>> https://www.kernel.org/category/releases.html) ?
>>
> It is what my customer wants. For this project, upgrading the kernel is 
> not an option. I am using the IO accessor out_be32() along with a BAT:

That's maybe not an option for your customer, but it is a good option 
for yourself to find out what the problem is. If recent kernels don't 
have the problem, you can then perform a 'git bisect' in order to find 
out which commit fixed the problem. Once that is done, you may backport 
the fixing commit to 3.12 for your customer.

> 
>     setbat(6, 0xe5000, 0xe500, 4096*1024, PAGE_KERNEL_NCG);

Have you tried with ioremap() instead of setbat() ?

Also, what other BATs do you have in your setup ? Maybe you have some 
overlapping BATs.

Christophe



Re: [RFC PATCH 1/3] powerpc/bpf: implement bpf_arch_text_copy

2022-11-13 Thread Christophe Leroy
Le 10/11/2022 à 19:43, Hari Bathini a écrit :
> bpf_arch_text_copy is used to dump JITed binary to RX page, allowing
> multiple BPF programs to share the same page. Using patch_instruction
> to implement it.

Using patch_instruction() is nice for a quick implementation, but it is 
probably suboptimal. Due to the amount of data to be copied, it is worth 
a dedicated function that maps a RW copy of the page to be updated then 
does the copy at once with memcpy() then unmaps the page.

> 
> Signed-off-by: Hari Bathini 
> ---
>   arch/powerpc/net/bpf_jit_comp.c | 39 -
>   1 file changed, 38 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
> index 43e634126514..7383e0effad2 100644
> --- a/arch/powerpc/net/bpf_jit_comp.c
> +++ b/arch/powerpc/net/bpf_jit_comp.c
> @@ -13,9 +13,12 @@
>   #include 
>   #include 
>   #include 
> -#include 
> +#include 
>   #include 
>   
> +#include 
> +#include 
> +
>   #include "bpf_jit.h"
>   
>   static void bpf_jit_fill_ill_insns(void *area, unsigned int size)
> @@ -23,6 +26,35 @@ static void bpf_jit_fill_ill_insns(void *area, unsigned 
> int size)
>   memset32(area, BREAKPOINT_INSTRUCTION, size / 4);
>   }
>   
> +/*
> + * Patch 'len' bytes of instructions from opcode to addr, one instruction
> + * at a time. Returns addr on success. ERR_PTR(-EINVAL), otherwise.
> + */
> +static void *bpf_patch_instructions(void *addr, void *opcode, size_t len)
> +{
> + void *ret = ERR_PTR(-EINVAL);
> + size_t patched = 0;
> + u32 *inst = opcode;
> + u32 *start = addr;
> +
> + if (WARN_ON_ONCE(core_kernel_text((unsigned long)addr)))
> + return ret;
> +
> + mutex_lock(&text_mutex);
> + while (patched < len) {
> + if (patch_instruction(start++, ppc_inst(*inst)))
> + goto error;
> +
> + inst++;
> + patched += 4;
> + }
> +
> + ret = addr;
> +error:
> + mutex_unlock(&text_mutex);
> + return ret;
> +}
> +
>   /* Fix updated addresses (for subprog calls, ldimm64, et al) during extra 
> pass */
>   static int bpf_jit_fixup_addresses(struct bpf_prog *fp, u32 *image,
>  struct codegen_context *ctx, u32 *addrs)
> @@ -357,3 +389,8 @@ int bpf_add_extable_entry(struct bpf_prog *fp, u32 
> *image, int pass, struct code
>   ctx->exentry_idx++;
>   return 0;
>   }
> +
> +void *bpf_arch_text_copy(void *dst, void *src, size_t len)
> +{
> + return bpf_patch_instructions(dst, src, len);
> +}

I can't see the added value of having two functions when the first one 
just calls the second one and is the only user of it. Why not have 
implemented bpf_patch_instructions() directly inside bpf_arch_text_copy() ?

By the way, it can be nice to have two functions, but split them 
differently, to avoid the goto: etc 

I also prefer using for loops instead of while loops.

It could have looked like below (untested):

/*
 * Copy 'len' bytes of instructions from 'opcode' to 'addr', one 32-bit
 * instruction at a time, using patch_instruction().
 * Returns addr on success, ERR_PTR(-EINVAL) as soon as one patch fails.
 */
static void *bpf_patch_instructions(void *addr, void *opcode, size_t len)
{
	u32 *limit = addr + len;
	u32 *src, *dst;

	for (src = opcode, dst = addr; dst < limit; src++, dst++) {
		if (patch_instruction(dst, ppc_inst(*src)))
			return ERR_PTR(-EINVAL);
	}

	return addr;
}

/*
 * Copy 'len' bytes of instructions from 'src' to 'dst' with text_mutex held.
 * Refuses, with a one-time warning, when 'dst' is core kernel text and
 * returns ERR_PTR(-EINVAL) in that case; otherwise returns whatever
 * bpf_patch_instructions() returns.
 */
void *bpf_arch_text_copy(void *dst, void *src, size_t len)
{
	void *ret;

	if (WARN_ON_ONCE(core_kernel_text((unsigned long)dst)))
		return ERR_PTR(-EINVAL);

	mutex_lock(&text_mutex);

	ret = bpf_patch_instructions(dst, src, len);

	mutex_unlock(&text_mutex);

	return ret;
}




Re: [RFC PATCH 2/3] powerpc/bpf: implement bpf_arch_text_invalidate for bpf_prog_pack

2022-11-13 Thread Christophe Leroy
Le 10/11/2022 à 19:43, Hari Bathini a écrit :
> Implement bpf_arch_text_invalidate and use it to fill unused part of
> the bpf_prog_pack with trap instructions when a BPF program is freed.

Same here, although patch_instruction() is nice for a first try, it is 
not the solution in the long run. Same as with previous patch, it should 
just map the necessary page by allocating a vma area then mapping the 
associated physical pages over it using map_kernel_page(), then use 
bpf_jit_fill_ill_insns() over that page.

> 
> Signed-off-by: Hari Bathini 
> ---
>   arch/powerpc/net/bpf_jit_comp.c | 32 
>   1 file changed, 32 insertions(+)
> 
> diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
> index 7383e0effad2..f925755cd249 100644
> --- a/arch/powerpc/net/bpf_jit_comp.c
> +++ b/arch/powerpc/net/bpf_jit_comp.c
> @@ -26,6 +26,33 @@ static void bpf_jit_fill_ill_insns(void *area, unsigned 
> int size)
>   memset32(area, BREAKPOINT_INSTRUCTION, size / 4);
>   }
>   
> +/*
> + * Patch 'len' bytes with trap instruction at addr, one instruction
> + * at a time. Returns addr on success. ERR_PTR(-EINVAL), otherwise.
> + */
> +static void *bpf_patch_ill_insns(void *addr, size_t len)
> +{
> + void *ret = ERR_PTR(-EINVAL);
> + size_t patched = 0;
> + u32 *start = addr;
> +
> + if (WARN_ON_ONCE(core_kernel_text((unsigned long)addr)))
> + return ret;
> +
> + mutex_lock(&text_mutex);
> + while (patched < len) {
> + if (patch_instruction(start++, ppc_inst(PPC_RAW_TRAP(

Use BREAKPOINT_INSTRUCTION instead of PPC_RAW_TRAP()

> + goto error;
> +
> + patched += 4;
> + }
> +
> + ret = addr;
> +error:
> + mutex_unlock(&text_mutex);
> + return ret;
> +}
> +
>   /*
>* Patch 'len' bytes of instructions from opcode to addr, one instruction
>* at a time. Returns addr on success. ERR_PTR(-EINVAL), otherwise.
> @@ -394,3 +421,8 @@ void *bpf_arch_text_copy(void *dst, void *src, size_t len)
>   {
>   return bpf_patch_instructions(dst, src, len);
>   }
> +
> +int bpf_arch_text_invalidate(void *dst, size_t len)
> +{
> + return IS_ERR(bpf_patch_ill_insns(dst, len));
> +}


The exact same split between bpf_arch_text_invalidate() and 
bpf_patch_ill_insns() as previous patch could be done here.



Re: [RFC PATCH 3/3] powerpc/bpf: use bpf_jit_binary_pack_[alloc|finalize|free]

2022-11-13 Thread Christophe Leroy
Le 10/11/2022 à 19:43, Hari Bathini a écrit :
> Use bpf_jit_binary_pack_alloc in powerpc jit. The jit engine first
> writes the program to the rw buffer. When the jit is done, the program
> is copied to the final location with bpf_jit_binary_pack_finalize.
> With multiple jit_subprogs, bpf_jit_free is called on some subprograms
> that haven't got bpf_jit_binary_pack_finalize() yet. Implement custom
> bpf_jit_free() like in commit 1d5f82d9dd47 ("bpf, x86: fix freeing of
> not-finalized bpf_prog_pack") to call bpf_jit_binary_pack_finalize(),
> if necessary. While here, correct the misnomer powerpc64_jit_data to
> powerpc_jit_data as it is meant for both ppc32 and ppc64.

This patch looks heavy compared to x86 commit 1022a5498f6f.

I didn't look into details, is there really a need to carry that 
rw_image over all functions you changed ?

As far as I can see, ok you need it for EMIT macro. But then some of the 
functions that use EMIT will now use rw_image instead of image, so why do 
they need both image and rw_image ?

Maybe you'd have less churn if you leave image, and add a ro_image 
wherever necessary but not everywhere.


> 
> Signed-off-by: Hari Bathini 
> ---
>   arch/powerpc/net/bpf_jit.h|  18 +++--
>   arch/powerpc/net/bpf_jit_comp.c   | 123 +-
>   arch/powerpc/net/bpf_jit_comp32.c |  26 +++
>   arch/powerpc/net/bpf_jit_comp64.c |  32 
>   4 files changed, 128 insertions(+), 71 deletions(-)
> 
> diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
> index f925755cd249..c4c1f7a21d89 100644
> --- a/arch/powerpc/net/bpf_jit_comp.c
> +++ b/arch/powerpc/net/bpf_jit_comp.c
> @@ -181,22 +183,25 @@ bool bpf_jit_needs_zext(void)
>   
>   struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
>   {
> - u32 proglen;
> - u32 alloclen;
> - u8 *image = NULL;
> - u32 *code_base;
> - u32 *addrs;
> - struct powerpc64_jit_data *jit_data;
> - struct codegen_context cgctx;
> - int pass;
> - int flen;
> + struct bpf_binary_header *rw_header = NULL;
> + struct powerpc_jit_data *jit_data;
>   struct bpf_binary_header *bpf_hdr;
> + struct codegen_context cgctx;
>   struct bpf_prog *org_fp = fp;
>   struct bpf_prog *tmp_fp;
>   bool bpf_blinded = false;
>   bool extra_pass = false;
> + u8 *rw_image = NULL;
> + u32 *rw_code_base;
> + u8 *image = NULL;
>   u32 extable_len;
> + u32 *code_base;
>   u32 fixup_len;
> + u32 alloclen;
> + u32 proglen;
> + u32 *addrs;
> + int pass;
> + int flen;

Why so many changes here, a lot of items seem to only have moved 
without any modification. Why that churn ?

>   
>   if (!fp->jit_requested)
>   return org_fp;
> @@ -227,6 +232,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
>   image = jit_data->image;
>   bpf_hdr = jit_data->header;
>   proglen = jit_data->proglen;
> + rw_header = jit_data->rw_header;
> + rw_image = (void *)rw_header + ((void *)image - (void 
> *)bpf_hdr);
>   extra_pass = true;
>   goto skip_init_ctx;
>   }
> @@ -244,7 +251,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
>   cgctx.stack_size = round_up(fp->aux->stack_depth, 16);
>   
>   /* Scouting faux-generate pass 0 */
> - if (bpf_jit_build_body(fp, 0, &cgctx, addrs, 0)) {
> + if (bpf_jit_build_body(fp, 0, 0, &cgctx, addrs, 0)) {

Some of the 0s in this call are pointers. You should use NULL instead.
This comment applies to several other lines you have changed.

>   /* We hit something illegal or unsupported. */
>   fp = org_fp;
>   goto out_addrs;
> @@ -259,7 +266,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
>*/
>   if (cgctx.seen & SEEN_TAILCALL || 
> !is_offset_in_branch_range((long)cgctx.idx * 4)) {
>   cgctx.idx = 0;
> - if (bpf_jit_build_body(fp, 0, &cgctx, addrs, 0)) {
> + if (bpf_jit_build_body(fp, 0, 0, &cgctx, addrs, 0)) {

0 ==> NULL

>   fp = org_fp;
>   goto out_addrs;
>   }
> @@ -271,9 +278,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
>* update ctgtx.idx as it pretends to output instructions, then we can
>* calculate total size from idx.
>*/
> - bpf_jit_build_prologue(0, &cgctx);
> + bpf_jit_build_prologue(0, 0, &cgctx);
>   addrs[fp->len] = cgctx.idx * 4;
> - bpf_jit_build_epilogue(0, &cgctx);
> + bpf_jit_build_epilogue(0, 0, &cgctx);

0 ==> NULL

>   
>   fixup_len = fp->aux->num_exentries * BPF_FIXUP_LEN * 4;
>   extable_len = fp->aux->num_exentries * sizeof(struct 
> exception_table_entry);
> @@ -337,17 +348,26 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog 
> *fp)
>   
>   #ifdef CONFIG_PPC64_ELF_ABI_V1
>   /* Function descriptor n

Re: [PATCH] powerpc/kernel: fix repeated words in comments

2022-11-13 Thread Christophe Leroy
Le 12/11/2022 à 08:58, wangjianli a écrit :
> Delete the redundant word 'the'.
> 
> Signed-off-by: wangjianli 
> ---
>   arch/powerpc/kernel/process.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index ab786da8c30b..6fa4ddec6c11 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -281,7 +281,7 @@ void enable_kernel_altivec(void)
>   EXPORT_SYMBOL(enable_kernel_altivec);
>   
>   /*
> - * Make sure the VMX/Altivec register state in the
> + * Make sure the VMX/Altivec register state in

There are more unnecessary 'the' in this sentence.

I think it would read better as:

Make sure VMX/Altivec register state in thread_struct is up to date for 
task tsk.

>* the thread_struct is up to date for task tsk.
>*/
>   void flush_altivec_to_thread(struct task_struct *tsk)



[PATCH v2 00/17] powerpc: alternate queued spinlock implementation

2022-11-13 Thread Nicholas Piggin
This replaces the generic queued spinlock code (like s390 does) with
our own implementation. There is an extra shim patch 1a to get the
series to apply.

Generic PV qspinlock code is causing latency / starvation regressions on
large systems that are resulting in hard lockups reported (mostly in
pathological cases).  The generic qspinlock code has a number of issues
important for powerpc hardware and hypervisors that aren't easily solved
without changing code that would impact other architectures. Follow
s390's lead and implement our own for now.

Issues for powerpc using generic qspinlocks:
- The previous lock value should not be loaded with simple loads, and
  need not be passed around from previous loads or cmpxchg results,
  because powerpc uses ll/sc-style atomics which can perform more
  complex operations that do not require this. powerpc implementations
  tend to prefer loads use larx for improved coherency performance.
- The queueing process should absolutely minimise the number of stores
  to the lock word to reduce exclusive coherency probes, important for
  large system scalability. The pending logic is counter productive
  here.
- Non-atomic unlock for paravirt locks is important (atomic instructions
  tend to still be more expensive than on x86 CPUs).
- Yielding to the lock owner is important in the oversubscribed paravirt
  case, which requires storing the owner CPU in the lock word.
- More control of lock stealing for the paravirt case is important to
  keep latency down on large systems.
- The lock acquisition operation should always be made with a special
  variant of atomic instructions with the lock hint bit set, including
  (especially) in the queueing paths. This is more a matter of adding
  more arch lock helpers so not an insurmountable problem for generic
  code.

So far this still has some work to test and tune performance. It does
improve some of the latency and starvation issues, it also has some
throughput regressions in some cases, but I already left it too long
since Jordan's really nice review including two subtle bugs found, so
I'm posting the current state of things...

Since v1:
- Change most 'if (cond) return 1 ; return 0;'
- Bug fix: was testing count == MAX, but reentrant NMIs could bring that
  > MAX and crash.
- Fix missing memory barrier lost in asm conversion patch.
- Separate the release barrier in publish_tail from the acquire barrier
  in get_tail_qnode.
- Moving a few minor things into their logically correct change.
- Make encode_tail_cpu take a cpu argument to match get_tail_cpu.
- Rename get_tail_cpu to decode_tail_cpu to match encode_tail_cpu.
- Rename lock_set_locked to set_locked.
- IS_ENABLED(x) ? 1 : 0 -> IS_ENABLED(x)
- Fix some comments inside inline asm.
- Change tunable names to lowercase.
- Consolidate asm for trylock_clear_tail_cpu and trylock_with_tail_cpu
- Restructure steal/wait loops to be more readable
- Count a failed cmpxchg as an iteration in steal/wait loops to avoid
  theoretical livelock/latency concern.

Nicholas Piggin (17):
  powerpc/qspinlock: powerpc qspinlock implementation
  powerpc/qspinlock: add mcs queueing for contended waiters
  powerpc/qspinlock: use a half-word store to unlock to avoid larx/stcx.
  powerpc/qspinlock: convert atomic operations to assembly
  powerpc/qspinlock: allow new waiters to steal the lock before queueing
  powerpc/qspinlock: theft prevention to control latency
  powerpc/qspinlock: store owner CPU in lock word
  powerpc/qspinlock: paravirt yield to lock owner
  powerpc/qspinlock: implement option to yield to previous node
  powerpc/qspinlock: allow stealing when head of queue yields
  powerpc/qspinlock: allow propagation of yield CPU down the queue
  powerpc/qspinlock: add ability to prod new queue head CPU
  powerpc/qspinlock: trylock and initial lock attempt may steal
  powerpc/qspinlock: use spin_begin/end API
  powerpc/qspinlock: reduce remote node steal spins
  powerpc/qspinlock: allow indefinite spinning on a preempted owner
  powerpc/qspinlock: provide accounting and options for sleepy locks

 arch/powerpc/Kconfig   |1 -
 arch/powerpc/include/asm/qspinlock.h   |  133 ++-
 arch/powerpc/include/asm/qspinlock_types.h |   70 ++
 arch/powerpc/include/asm/spinlock_types.h  |2 +-
 arch/powerpc/lib/Makefile  |4 +-
 arch/powerpc/lib/qspinlock.c   | 1008 
 6 files changed, 1174 insertions(+), 44 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock_types.h
 create mode 100644 arch/powerpc/lib/qspinlock.c

-- 
2.37.2



[PATCH v2 01a/17] powerpc/qspinlock: prepare powerpc qspinlock implementation

2022-11-13 Thread Nicholas Piggin
This is a merge placeholder with a conflicting series of patches to
generic qspinlocks. Not intended to be standalone, this should be
applied before patch 1.

diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index bcf95ce0964f..813a8c3405ad 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -4,7 +4,6 @@ generated-y += syscall_table_64.h
 generated-y += syscall_table_spu.h
 generic-y += export.h
 generic-y += kvm_types.h
-generic-y += mcs_spinlock.h
 generic-y += qrwlock.h
 generic-y += vtime.h
 generic-y += early_ioremap.h
diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
index b676c4fb90fd..39c1c7f80579 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -7,42 +7,32 @@
 
 #define _Q_PENDING_LOOPS   (1 << 9) /* not tuned */
 
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
-extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
-extern void __pv_queued_spin_unlock(struct qspinlock *lock);
+void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+void __pv_queued_spin_unlock(struct qspinlock *lock);
 
-static __always_inline void queued_spin_lock_slowpath(struct qspinlock *lock, 
u32 val)
+static __always_inline void queued_spin_lock(struct qspinlock *lock)
 {
-   if (!is_shared_processor())
-   native_queued_spin_lock_slowpath(lock, val);
+   u32 val = 0;
+
+   if (likely(arch_atomic_try_cmpxchg_lock(&lock->val, &val, 
_Q_LOCKED_VAL)))
+   return;
+
+   if (!IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) || !is_shared_processor())
+   queued_spin_lock_slowpath(lock, val);
else
__pv_queued_spin_lock_slowpath(lock, val);
 }
+#define queued_spin_lock queued_spin_lock
 
-#define queued_spin_unlock queued_spin_unlock
 static inline void queued_spin_unlock(struct qspinlock *lock)
 {
-   if (!is_shared_processor())
+   if (!IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) || !is_shared_processor())
smp_store_release(&lock->locked, 0);
else
__pv_queued_spin_unlock(lock);
 }
-
-#else
-extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
-#endif
-
-static __always_inline void queued_spin_lock(struct qspinlock *lock)
-{
-   u32 val = 0;
-
-   if (likely(arch_atomic_try_cmpxchg_lock(&lock->val, &val, 
_Q_LOCKED_VAL)))
-   return;
-
-   queued_spin_lock_slowpath(lock, val);
-}
-#define queued_spin_lock queued_spin_lock
+#define queued_spin_unlock queued_spin_unlock
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 #define SPIN_THRESHOLD (1<<15) /* not tuned */
@@ -63,13 +53,6 @@ static __always_inline void pv_kick(int cpu)
prod_cpu(cpu);
 }
 
-extern void __pv_init_lock_hash(void);
-
-static inline void pv_spinlocks_init(void)
-{
-   __pv_init_lock_hash();
-}
-
 #endif
 
 /*
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h 
b/arch/powerpc/include/asm/qspinlock_paravirt.h
deleted file mode 100644
index 6b60e7736a47..
--- a/arch/powerpc/include/asm/qspinlock_paravirt.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _ASM_POWERPC_QSPINLOCK_PARAVIRT_H
-#define _ASM_POWERPC_QSPINLOCK_PARAVIRT_H
-
-EXPORT_SYMBOL(__pv_queued_spin_unlock);
-
-#endif /* _ASM_POWERPC_QSPINLOCK_PARAVIRT_H */
diff --git a/arch/powerpc/include/asm/spinlock.h 
b/arch/powerpc/include/asm/spinlock.h
index bd75872a6334..7dafca8e3f02 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -13,7 +13,7 @@
 /* See include/linux/spinlock.h */
 #define smp_mb__after_spinlock()   smp_mb()
 
-#ifndef CONFIG_PARAVIRT_SPINLOCKS
+#ifndef CONFIG_PPC_QUEUED_SPINLOCKS
 static inline void pv_spinlocks_init(void) { }
 #endif
 


[PATCH v2 01/17] powerpc/qspinlock: powerpc qspinlock implementation

2022-11-13 Thread Nicholas Piggin
Add a powerpc specific implementation of queued spinlocks. This is the
build framework with a very simple (non-queued) spinlock implementation
to begin with. Later changes add queueing, and other features and
optimisations one-at-a-time. It is done this way to more easily see how
the queued spinlocks are built, and to make performance and correctness
bisects more useful.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/Kconfig   |  1 -
 arch/powerpc/include/asm/qspinlock.h   | 76 +-
 arch/powerpc/include/asm/qspinlock_types.h | 13 
 arch/powerpc/include/asm/spinlock_types.h  |  2 +-
 arch/powerpc/lib/Makefile  |  4 +-
 arch/powerpc/lib/qspinlock.c   | 17 +
 6 files changed, 66 insertions(+), 47 deletions(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock_types.h
 create mode 100644 arch/powerpc/lib/qspinlock.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 2ca5418457ed..1d5b4f280feb 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -155,7 +155,6 @@ config PPC
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
select ARCH_USE_MEMTEST
select ARCH_USE_QUEUED_RWLOCKS  if PPC_QUEUED_SPINLOCKS
-   select ARCH_USE_QUEUED_SPINLOCKSif PPC_QUEUED_SPINLOCKS
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
select ARCH_WANT_IPC_PARSE_VERSION
select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
index 39c1c7f80579..b1443aab2145 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -2,66 +2,54 @@
 #ifndef _ASM_POWERPC_QSPINLOCK_H
 #define _ASM_POWERPC_QSPINLOCK_H
 
-#include 
-#include 
+#include 
+#include 
+#include 
 
-#define _Q_PENDING_LOOPS   (1 << 9) /* not tuned */
-
-void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
-void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
-void __pv_queued_spin_unlock(struct qspinlock *lock);
-
-static __always_inline void queued_spin_lock(struct qspinlock *lock)
+static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
 {
-   u32 val = 0;
+   return atomic_read(&lock->val);
+}
 
-   if (likely(arch_atomic_try_cmpxchg_lock(&lock->val, &val, 
_Q_LOCKED_VAL)))
-   return;
+static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
+{
+   return !atomic_read(&lock.val);
+}
 
-   if (!IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) || !is_shared_processor())
-   queued_spin_lock_slowpath(lock, val);
-   else
-   __pv_queued_spin_lock_slowpath(lock, val);
+static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
+{
+   return 0;
 }
-#define queued_spin_lock queued_spin_lock
 
-static inline void queued_spin_unlock(struct qspinlock *lock)
+static __always_inline int queued_spin_trylock(struct qspinlock *lock)
 {
-   if (!IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) || !is_shared_processor())
-   smp_store_release(&lock->locked, 0);
-   else
-   __pv_queued_spin_unlock(lock);
+   return atomic_cmpxchg_acquire(&lock->val, 0, 1) == 0;
 }
-#define queued_spin_unlock queued_spin_unlock
 
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-#define SPIN_THRESHOLD (1<<15) /* not tuned */
+void queued_spin_lock_slowpath(struct qspinlock *lock);
 
-static __always_inline void pv_wait(u8 *ptr, u8 val)
+static __always_inline void queued_spin_lock(struct qspinlock *lock)
 {
-   if (*ptr != val)
-   return;
-   yield_to_any();
-   /*
-* We could pass in a CPU here if waiting in the queue and yield to
-* the previous CPU in the queue.
-*/
+   if (!queued_spin_trylock(lock))
+   queued_spin_lock_slowpath(lock);
 }
 
-static __always_inline void pv_kick(int cpu)
+static inline void queued_spin_unlock(struct qspinlock *lock)
 {
-   prod_cpu(cpu);
+   atomic_set_release(&lock->val, 0);
 }
 
-#endif
+#define arch_spin_is_locked(l) queued_spin_is_locked(l)
+#define arch_spin_is_contended(l)  queued_spin_is_contended(l)
+#define arch_spin_value_unlocked(l)queued_spin_value_unlocked(l)
+#define arch_spin_lock(l)  queued_spin_lock(l)
+#define arch_spin_trylock(l)   queued_spin_trylock(l)
+#define arch_spin_unlock(l)queued_spin_unlock(l)
 
-/*
- * Queued spinlocks rely heavily on smp_cond_load_relaxed() to busy-wait,
- * which was found to have performance problems if implemented with
- * the preferred spin_begin()/spin_end() SMT priority pattern. Use the
- * generic version instead.
- */
-
-#include 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void pv_spinlocks_init(void);
+#else
+static inline void pv_spinlocks_init(void) { }
+#endif
 
 #endif /* _ASM_POWERPC_QSPINLOCK_H */
diff --git a/arch/powerpc/include/asm/qspinlock_types.h 
b/arch/powerpc/include/asm

[PATCH v2 02/17] powerpc/qspinlock: add mcs queueing for contended waiters

2022-11-13 Thread Nicholas Piggin
This forms the basis of the qspinlock slow path.

Like generic qspinlocks and unlike the vanilla MCS algorithm, the lock
owner does not participate in the queue, only waiters. The first waiter
spins on the lock word, then when the lock is released it takes
ownership and unqueues the next waiter. This is how qspinlocks can be
implemented with the spinlock API -- lock owners don't need a node, only
waiters do.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/qspinlock.h   |  10 +-
 arch/powerpc/include/asm/qspinlock_types.h |  21 +++
 arch/powerpc/lib/qspinlock.c   | 180 -
 3 files changed, 205 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
index b1443aab2145..300c7d2ebe2e 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -18,12 +18,12 @@ static __always_inline int 
queued_spin_value_unlocked(struct qspinlock lock)
 
 static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
 {
-   return 0;
+   return !!(atomic_read(&lock->val) & _Q_TAIL_CPU_MASK);
 }
 
 static __always_inline int queued_spin_trylock(struct qspinlock *lock)
 {
-   return atomic_cmpxchg_acquire(&lock->val, 0, 1) == 0;
+   return atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0;
 }
 
 void queued_spin_lock_slowpath(struct qspinlock *lock);
@@ -36,7 +36,11 @@ static __always_inline void queued_spin_lock(struct 
qspinlock *lock)
 
 static inline void queued_spin_unlock(struct qspinlock *lock)
 {
-   atomic_set_release(&lock->val, 0);
+   for (;;) {
+   int val = atomic_read(&lock->val);
+   if (atomic_cmpxchg_release(&lock->val, val, val & 
~_Q_LOCKED_VAL) == val)
+   return;
+   }
 }
 
 #define arch_spin_is_locked(l) queued_spin_is_locked(l)
diff --git a/arch/powerpc/include/asm/qspinlock_types.h 
b/arch/powerpc/include/asm/qspinlock_types.h
index 59606bc0c774..9630e714c70d 100644
--- a/arch/powerpc/include/asm/qspinlock_types.h
+++ b/arch/powerpc/include/asm/qspinlock_types.h
@@ -10,4 +10,25 @@ typedef struct qspinlock {
 
 #define__ARCH_SPIN_LOCK_UNLOCKED   { .val = ATOMIC_INIT(0) }
 
+/*
+ * Bitfields in the atomic value:
+ *
+ * 0: locked bit
+ * 16-31: tail cpu (+1)
+ */
+#define_Q_SET_MASK(type)   (((1U << _Q_ ## type ## _BITS) - 1)\
+ << _Q_ ## type ## _OFFSET)
+#define _Q_LOCKED_OFFSET   0
+#define _Q_LOCKED_BITS 1
+#define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED)
+#define _Q_LOCKED_VAL  (1U << _Q_LOCKED_OFFSET)
+
+#define _Q_TAIL_CPU_OFFSET 16
+#define _Q_TAIL_CPU_BITS   (32 - _Q_TAIL_CPU_OFFSET)
+#define _Q_TAIL_CPU_MASK   _Q_SET_MASK(TAIL_CPU)
+
+#if CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)
+#error "qspinlock does not support such large CONFIG_NR_CPUS"
+#endif
+
 #endif /* _ASM_POWERPC_QSPINLOCK_TYPES_H */
diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index 1c669b5b4607..f3c3d5128bd5 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -1,12 +1,186 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
+#include 
+#include 
+#include 
 #include 
-#include 
+#include 
+#include 
 #include 
 
-void queued_spin_lock_slowpath(struct qspinlock *lock)
+#define MAX_NODES  4
+
+struct qnode {
+   struct qnode*next;
+   struct qspinlock *lock;
+   u8  locked; /* 1 if lock acquired */
+};
+
+struct qnodes {
+   int count;
+   struct qnode nodes[MAX_NODES];
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
+
+static inline int encode_tail_cpu(int cpu)
+{
+   return (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+}
+
+static inline int decode_tail_cpu(int val)
+{
+   return (val >> _Q_TAIL_CPU_OFFSET) - 1;
+}
+
+/* Take the lock by setting the bit, no other CPUs may concurrently lock it. */
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+   atomic_or(_Q_LOCKED_VAL, &lock->val);
+   __atomic_acquire_fence();
+}
+
+/* Take lock, clearing tail, cmpxchg with val (which must not be locked) */
+static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int 
val)
+{
+   int newval = _Q_LOCKED_VAL;
+
+   BUG_ON(val & _Q_LOCKED_VAL);
+
+   return atomic_cmpxchg_acquire(&lock->val, val, newval) == val;
+}
+
+/*
+ * Publish our tail, replacing previous tail. Return previous value.
+ *
+ * This provides a release barrier for publishing node, this pairs with the
+ * acquire barrier in get_tail_qnode() when the next CPU finds this tail
+ * value.
+ */
+static __always_inline int publish_tail_cpu(struct qspinlock *lock, int tail)
+{
+   for (;;) {
+   int val = atomic_read(&lock->val);
+   int newval = (val & ~_Q_TAIL_CPU_MASK) | tail;
+   int old;
+
+   old = atomic_cmpxchg_releas

[PATCH v2 03/17] powerpc/qspinlock: use a half-word store to unlock to avoid larx/stcx.

2022-11-13 Thread Nicholas Piggin
The first 16 bits of the lock are only modified by the owner, and other
modifications always use atomic operations on the entire 32 bits, so
unlocks can use plain stores on the 16 bits. This is the same kind of
optimisation done by core qspinlock code.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/qspinlock.h   |  6 +-
 arch/powerpc/include/asm/qspinlock_types.h | 19 +--
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
index 300c7d2ebe2e..7bc254c55705 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -36,11 +36,7 @@ static __always_inline void queued_spin_lock(struct 
qspinlock *lock)
 
 static inline void queued_spin_unlock(struct qspinlock *lock)
 {
-   for (;;) {
-   int val = atomic_read(&lock->val);
-   if (atomic_cmpxchg_release(&lock->val, val, val & 
~_Q_LOCKED_VAL) == val)
-   return;
-   }
+   smp_store_release(&lock->locked, 0);
 }
 
 #define arch_spin_is_locked(l) queued_spin_is_locked(l)
diff --git a/arch/powerpc/include/asm/qspinlock_types.h 
b/arch/powerpc/include/asm/qspinlock_types.h
index 9630e714c70d..3425dab42576 100644
--- a/arch/powerpc/include/asm/qspinlock_types.h
+++ b/arch/powerpc/include/asm/qspinlock_types.h
@@ -3,12 +3,27 @@
 #define _ASM_POWERPC_QSPINLOCK_TYPES_H
 
 #include 
+#include 
 
 typedef struct qspinlock {
-   atomic_t val;
+   union {
+   atomic_t val;
+
+#ifdef __LITTLE_ENDIAN
+   struct {
+   u16 locked;
+   u8  reserved[2];
+   };
+#else
+   struct {
+   u8  reserved[2];
+   u16 locked;
+   };
+#endif
+   };
 } arch_spinlock_t;
 
-#define__ARCH_SPIN_LOCK_UNLOCKED   { .val = ATOMIC_INIT(0) }
+#define__ARCH_SPIN_LOCK_UNLOCKED   { { .val = ATOMIC_INIT(0) } }
 
 /*
  * Bitfields in the atomic value:
-- 
2.37.2



[PATCH v2 04/17] powerpc/qspinlock: convert atomic operations to assembly

2022-11-13 Thread Nicholas Piggin
This uses more optimal ll/sc style access patterns (rather than
cmpxchg), and also sets the EH=1 lock hint on those operations
which acquire ownership of the lock.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/qspinlock.h   | 24 +--
 arch/powerpc/include/asm/qspinlock_types.h |  6 +-
 arch/powerpc/lib/qspinlock.c   | 81 +++---
 3 files changed, 77 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
index 7bc254c55705..7d300e6883a8 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -2,28 +2,42 @@
 #ifndef _ASM_POWERPC_QSPINLOCK_H
 #define _ASM_POWERPC_QSPINLOCK_H
 
-#include 
 #include 
 #include 
 
 static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
 {
-   return atomic_read(&lock->val);
+   return READ_ONCE(lock->val);
 }
 
 static __always_inline int queued_spin_value_unlocked(struct qspinlock lock)
 {
-   return !atomic_read(&lock.val);
+   return !lock.val;
 }
 
 static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
 {
-   return !!(atomic_read(&lock->val) & _Q_TAIL_CPU_MASK);
+   return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK);
 }
 
 static __always_inline int queued_spin_trylock(struct qspinlock *lock)
 {
-   return atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0;
+   u32 prev;
+
+   asm volatile(
+"1:lwarx   %0,0,%1,%3  # queued_spin_trylock   \n"
+"  cmpwi   0,%0,0  \n"
+"  bne-2f  \n"
+"  stwcx.  %2,0,%1 \n"
+"  bne-1b  \n"
+"\t"   PPC_ACQUIRE_BARRIER "   \n"
+"2:\n"
+   : "=&r" (prev)
+   : "r" (&lock->val), "r" (_Q_LOCKED_VAL),
+ "i" (IS_ENABLED(CONFIG_PPC64))
+   : "cr0", "memory");
+
+   return likely(prev == 0);
 }
 
 void queued_spin_lock_slowpath(struct qspinlock *lock);
diff --git a/arch/powerpc/include/asm/qspinlock_types.h 
b/arch/powerpc/include/asm/qspinlock_types.h
index 3425dab42576..210adf05b235 100644
--- a/arch/powerpc/include/asm/qspinlock_types.h
+++ b/arch/powerpc/include/asm/qspinlock_types.h
@@ -7,7 +7,7 @@
 
 typedef struct qspinlock {
union {
-   atomic_t val;
+   u32 val;
 
 #ifdef __LITTLE_ENDIAN
struct {
@@ -23,10 +23,10 @@ typedef struct qspinlock {
};
 } arch_spinlock_t;
 
-#define__ARCH_SPIN_LOCK_UNLOCKED   { { .val = ATOMIC_INIT(0) } }
+#define__ARCH_SPIN_LOCK_UNLOCKED   { { .val = 0 } }
 
 /*
- * Bitfields in the atomic value:
+ * Bitfields in the lock word:
  *
  * 0: locked bit
  * 16-31: tail cpu (+1)
diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index f3c3d5128bd5..6c58c24af5a0 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-#include 
 #include 
 #include 
 #include 
@@ -22,31 +21,56 @@ struct qnodes {
 
 static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
 
-static inline int encode_tail_cpu(int cpu)
+static inline u32 encode_tail_cpu(int cpu)
 {
return (cpu + 1) << _Q_TAIL_CPU_OFFSET;
 }
 
-static inline int decode_tail_cpu(int val)
+static inline int decode_tail_cpu(u32 val)
 {
return (val >> _Q_TAIL_CPU_OFFSET) - 1;
 }
 
-/* Take the lock by setting the bit, no other CPUs may concurrently lock it. */
+/* Take the lock by setting the lock bit, no other CPUs will touch it. */
 static __always_inline void set_locked(struct qspinlock *lock)
 {
-   atomic_or(_Q_LOCKED_VAL, &lock->val);
-   __atomic_acquire_fence();
+   u32 prev, tmp;
+
+   asm volatile(
+"1:lwarx   %0,0,%2,%4  # set_locked\n"
+"  or  %1,%0,%3\n"
+"  stwcx.  %1,0,%2 \n"
+"  bne-1b  \n"
+"\t"   PPC_ACQUIRE_BARRIER "   \n"
+   : "=&r" (prev), "=&r" (tmp)
+   : "r" (&lock->val), "i" (_Q_LOCKED_VAL),
+ "i" (IS_ENABLED(CONFIG_PPC64))
+   : "cr0", "memory");
+
+   BUG_ON(prev & _Q_LOCKED_VAL);
 }
 
-/* Take lock, clearing tail, cmpxchg with val (which must not be locked) */
-static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, int 
val)
+/* Take lock, clearing tail, cmpxchg with old (which must not be locked) */
+static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, u32 
old)
 {
-   int newval = _Q_LOCKED_VAL;
-
-   BUG_ON(val & _Q

[PATCH v2 05/17] powerpc/qspinlock: allow new waiters to steal the lock before queueing

2022-11-13 Thread Nicholas Piggin
Allow new waiters a number of spins on the lock word before queueing,
which particularly helps paravirt performance when physical CPUs are
oversubscribed.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/lib/qspinlock.c | 159 ++-
 1 file changed, 140 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index 6c58c24af5a0..872d4628a44d 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -19,8 +19,17 @@ struct qnodes {
struct qnode nodes[MAX_NODES];
 };
 
+/* Tuning parameters */
+static int steal_spins __read_mostly = (1<<5);
+static bool maybe_stealers __read_mostly = true;
+
 static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
 
+static __always_inline int get_steal_spins(void)
+{
+   return steal_spins;
+}
+
 static inline u32 encode_tail_cpu(int cpu)
 {
return (cpu + 1) << _Q_TAIL_CPU_OFFSET;
@@ -50,15 +59,14 @@ static __always_inline void set_locked(struct qspinlock 
*lock)
BUG_ON(prev & _Q_LOCKED_VAL);
 }
 
-/* Take lock, clearing tail, cmpxchg with old (which must not be locked) */
-static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, u32 
old)
+static __always_inline u32 __trylock_cmpxchg(struct qspinlock *lock, u32 old, 
u32 new)
 {
u32 prev;
 
BUG_ON(old & _Q_LOCKED_VAL);
 
asm volatile(
-"1:lwarx   %0,0,%1,%4  # trylock_clear_tail_cpu\n"
+"1:lwarx   %0,0,%1,%4  # __trylock_cmpxchg \n"
 "  cmpw0,%0,%2 \n"
 "  bne-2f  \n"
 "  stwcx.  %3,0,%1 \n"
@@ -66,13 +74,27 @@ static __always_inline int trylock_clear_tail_cpu(struct 
qspinlock *lock, u32 ol
 "\t"   PPC_ACQUIRE_BARRIER "   \n"
 "2:\n"
: "=&r" (prev)
-   : "r" (&lock->val), "r"(old), "r" (_Q_LOCKED_VAL),
+   : "r" (&lock->val), "r"(old), "r" (new),
  "i" (IS_ENABLED(CONFIG_PPC64))
: "cr0", "memory");
 
return likely(prev == old);
 }
 
+/* Take lock, clearing tail, cmpxchg with old (which must not be locked) */
+static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, u32 
val)
+{
+   return __trylock_cmpxchg(lock, val, _Q_LOCKED_VAL);
+}
+
+/* Take lock, preserving tail, cmpxchg with val (which must not be locked) */
+static __always_inline int trylock_with_tail_cpu(struct qspinlock *lock, u32 
val)
+{
+   u32 newval = _Q_LOCKED_VAL | (val & _Q_TAIL_CPU_MASK);
+
+   return __trylock_cmpxchg(lock, val, newval);
+}
+
 /*
  * Publish our tail, replacing previous tail. Return previous value.
  *
@@ -122,6 +144,30 @@ static struct qnode *get_tail_qnode(struct qspinlock 
*lock, u32 val)
BUG();
 }
 
+static inline bool try_to_steal_lock(struct qspinlock *lock)
+{
+   int iters = 0;
+
+   if (!maybe_stealers)
+   return false;
+
+   /* Attempt to steal the lock */
+   do {
+   u32 val = READ_ONCE(lock->val);
+
+   if (unlikely(!(val & _Q_LOCKED_VAL))) {
+   if (trylock_with_tail_cpu(lock, val))
+   return true;
+   } else {
+   cpu_relax();
+   }
+
+   iters++;
+   } while (iters < get_steal_spins());
+
+   return false;
+}
+
 static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
 {
struct qnodes *qnodesp;
@@ -171,25 +217,49 @@ static inline void queued_spin_lock_mcs_queue(struct 
qspinlock *lock)
smp_rmb(); /* acquire barrier for the mcs lock */
}
 
-   /* We're at the head of the waitqueue, wait for the lock. */
-   for (;;) {
-   val = READ_ONCE(lock->val);
-   if (!(val & _Q_LOCKED_VAL))
-   break;
+   if (!maybe_stealers) {
+   /* We're at the head of the waitqueue, wait for the lock. */
+   for (;;) {
+   val = READ_ONCE(lock->val);
+   if (!(val & _Q_LOCKED_VAL))
+   break;
 
-   cpu_relax();
-   }
+   cpu_relax();
+   }
+
+   /* If we're the last queued, must clean up the tail. */
+   if ((val & _Q_TAIL_CPU_MASK) == tail) {
+   if (trylock_clear_tail_cpu(lock, val))
+   goto release;
+   /* Another waiter must have enqueued. */
+   }
+
+   /* We must be the owner, just set the lock bit and acquire */
+   set_locked(lock);
+   } else {
+again:
+   /* We're at the head of the waitqueue, wait for the lock. */
+  

[PATCH v2 06/17] powerpc/qspinlock: theft prevention to control latency

2022-11-13 Thread Nicholas Piggin
Give the queue head the ability to stop stealers. After a number of
spins without successfully acquiring the lock, the queue head employs
this, which will assure it is the next owner.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/qspinlock_types.h | 10 -
 arch/powerpc/lib/qspinlock.c   | 52 ++
 2 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/qspinlock_types.h 
b/arch/powerpc/include/asm/qspinlock_types.h
index 210adf05b235..8b20f5e22bba 100644
--- a/arch/powerpc/include/asm/qspinlock_types.h
+++ b/arch/powerpc/include/asm/qspinlock_types.h
@@ -29,7 +29,8 @@ typedef struct qspinlock {
  * Bitfields in the lock word:
  *
  * 0: locked bit
- * 16-31: tail cpu (+1)
+ *16: must queue bit
+ * 17-31: tail cpu (+1)
  */
 #define_Q_SET_MASK(type)   (((1U << _Q_ ## type ## _BITS) - 1)\
  << _Q_ ## type ## _OFFSET)
@@ -38,7 +39,12 @@ typedef struct qspinlock {
 #define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED)
 #define _Q_LOCKED_VAL  (1U << _Q_LOCKED_OFFSET)
 
-#define _Q_TAIL_CPU_OFFSET 16
+#define _Q_MUST_Q_OFFSET   16
+#define _Q_MUST_Q_BITS 1
+#define _Q_MUST_Q_MASK _Q_SET_MASK(MUST_Q)
+#define _Q_MUST_Q_VAL  (1U << _Q_MUST_Q_OFFSET)
+
+#define _Q_TAIL_CPU_OFFSET 17
 #define _Q_TAIL_CPU_BITS   (32 - _Q_TAIL_CPU_OFFSET)
 #define _Q_TAIL_CPU_MASK   _Q_SET_MASK(TAIL_CPU)
 
diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index 872d4628a44d..8f437b0768a5 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -22,6 +22,7 @@ struct qnodes {
 /* Tuning parameters */
 static int steal_spins __read_mostly = (1<<5);
 static bool maybe_stealers __read_mostly = true;
+static int head_spins __read_mostly = (1<<8);
 
 static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
 
@@ -30,6 +31,11 @@ static __always_inline int get_steal_spins(void)
return steal_spins;
 }
 
+static __always_inline int get_head_spins(void)
+{
+   return head_spins;
+}
+
 static inline u32 encode_tail_cpu(int cpu)
 {
return (cpu + 1) << _Q_TAIL_CPU_OFFSET;
@@ -120,6 +126,22 @@ static __always_inline u32 publish_tail_cpu(struct 
qspinlock *lock, u32 tail)
return prev;
 }
 
+static __always_inline u32 set_mustq(struct qspinlock *lock)
+{
+   u32 prev;
+
+   asm volatile(
+"1:lwarx   %0,0,%1 # set_mustq \n"
+"  or  %0,%0,%2\n"
+"  stwcx.  %0,0,%1 \n"
+"  bne-1b  \n"
+   : "=&r" (prev)
+   : "r" (&lock->val), "r" (_Q_MUST_Q_VAL)
+   : "cr0", "memory");
+
+   return prev;
+}
+
 static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
 {
int cpu = decode_tail_cpu(val);
@@ -155,6 +177,9 @@ static inline bool try_to_steal_lock(struct qspinlock *lock)
do {
u32 val = READ_ONCE(lock->val);
 
+   if (val & _Q_MUST_Q_VAL)
+   break;
+
if (unlikely(!(val & _Q_LOCKED_VAL))) {
if (trylock_with_tail_cpu(lock, val))
return true;
@@ -237,6 +262,9 @@ static inline void queued_spin_lock_mcs_queue(struct 
qspinlock *lock)
/* We must be the owner, just set the lock bit and acquire */
set_locked(lock);
} else {
+   int iters = 0;
+   bool mustq = false;
+
 again:
/* We're at the head of the waitqueue, wait for the lock. */
for (;;) {
@@ -245,6 +273,13 @@ static inline void queued_spin_lock_mcs_queue(struct 
qspinlock *lock)
break;
 
cpu_relax();
+
+   iters++;
+   if (!mustq && iters >= get_head_spins()) {
+   mustq = true;
+   set_mustq(lock);
+   val |= _Q_MUST_Q_VAL;
+   }
}
 
/* If we're the last queued, must clean up the tail. */
@@ -332,9 +367,26 @@ static int steal_spins_get(void *data, u64 *val)
 
 DEFINE_SIMPLE_ATTRIBUTE(fops_steal_spins, steal_spins_get, steal_spins_set, 
"%llu\n");
 
+static int head_spins_set(void *data, u64 val)
+{
+   head_spins = val;
+
+   return 0;
+}
+
+static int head_spins_get(void *data, u64 *val)
+{
+   *val = head_spins;
+
+   return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_head_spins, head_spins_get, head_spins_set, 
"%llu\n");
+
 static __init int spinlock_debugfs_init(void)
 {
debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, 
&fops_steal_spins);
+   debugfs_create_file("qspl_head_spins", 0600, arch_debugfs_dir, NULL, 
&fops

[PATCH v2 07/17] powerpc/qspinlock: store owner CPU in lock word

2022-11-13 Thread Nicholas Piggin
Store the owner CPU number in the lock word so it may be yielded to,
as powerpc's paravirtualised simple spinlocks do.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/qspinlock.h   |  9 -
 arch/powerpc/include/asm/qspinlock_types.h | 10 ++
 arch/powerpc/lib/qspinlock.c   |  9 ++---
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
index 7d300e6883a8..3eff2d875bb6 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -20,8 +20,15 @@ static __always_inline int queued_spin_is_contended(struct 
qspinlock *lock)
return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK);
 }
 
+static __always_inline u32 queued_spin_encode_locked_val(void)
+{
+   /* XXX: make this use lock value in paca like simple spinlocks? */
+   return _Q_LOCKED_VAL | (smp_processor_id() << _Q_OWNER_CPU_OFFSET);
+}
+
 static __always_inline int queued_spin_trylock(struct qspinlock *lock)
 {
+   u32 new = queued_spin_encode_locked_val();
u32 prev;
 
asm volatile(
@@ -33,7 +40,7 @@ static __always_inline int queued_spin_trylock(struct 
qspinlock *lock)
 "\t"   PPC_ACQUIRE_BARRIER "   \n"
 "2:\n"
: "=&r" (prev)
-   : "r" (&lock->val), "r" (_Q_LOCKED_VAL),
+   : "r" (&lock->val), "r" (new),
  "i" (IS_ENABLED(CONFIG_PPC64))
: "cr0", "memory");
 
diff --git a/arch/powerpc/include/asm/qspinlock_types.h 
b/arch/powerpc/include/asm/qspinlock_types.h
index 8b20f5e22bba..35f9525381e6 100644
--- a/arch/powerpc/include/asm/qspinlock_types.h
+++ b/arch/powerpc/include/asm/qspinlock_types.h
@@ -29,6 +29,8 @@ typedef struct qspinlock {
  * Bitfields in the lock word:
  *
  * 0: locked bit
+ *  1-14: lock holder cpu
+ *15: unused bit
  *16: must queue bit
  * 17-31: tail cpu (+1)
  */
@@ -39,6 +41,14 @@ typedef struct qspinlock {
 #define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED)
 #define _Q_LOCKED_VAL  (1U << _Q_LOCKED_OFFSET)
 
+#define _Q_OWNER_CPU_OFFSET1
+#define _Q_OWNER_CPU_BITS  14
+#define _Q_OWNER_CPU_MASK  _Q_SET_MASK(OWNER_CPU)
+
+#if CONFIG_NR_CPUS > (1U << _Q_OWNER_CPU_BITS)
+#error "qspinlock does not support such large CONFIG_NR_CPUS"
+#endif
+
 #define _Q_MUST_Q_OFFSET   16
 #define _Q_MUST_Q_BITS 1
 #define _Q_MUST_Q_MASK _Q_SET_MASK(MUST_Q)
diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index 8f437b0768a5..b25a52251cb3 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -49,6 +49,7 @@ static inline int decode_tail_cpu(u32 val)
 /* Take the lock by setting the lock bit, no other CPUs will touch it. */
 static __always_inline void set_locked(struct qspinlock *lock)
 {
+   u32 new = queued_spin_encode_locked_val();
u32 prev, tmp;
 
asm volatile(
@@ -58,7 +59,7 @@ static __always_inline void set_locked(struct qspinlock *lock)
 "  bne-1b  \n"
 "\t"   PPC_ACQUIRE_BARRIER "   \n"
: "=&r" (prev), "=&r" (tmp)
-   : "r" (&lock->val), "i" (_Q_LOCKED_VAL),
+   : "r" (&lock->val), "r" (new),
  "i" (IS_ENABLED(CONFIG_PPC64))
: "cr0", "memory");
 
@@ -90,13 +91,15 @@ static __always_inline u32 __trylock_cmpxchg(struct 
qspinlock *lock, u32 old, u3
 /* Take lock, clearing tail, cmpxchg with old (which must not be locked) */
 static __always_inline int trylock_clear_tail_cpu(struct qspinlock *lock, u32 
val)
 {
-   return __trylock_cmpxchg(lock, val, _Q_LOCKED_VAL);
+   u32 newval = queued_spin_encode_locked_val();
+
+   return __trylock_cmpxchg(lock, val, newval);
 }
 
 /* Take lock, preserving tail, cmpxchg with val (which must not be locked) */
 static __always_inline int trylock_with_tail_cpu(struct qspinlock *lock, u32 
val)
 {
-   u32 newval = _Q_LOCKED_VAL | (val & _Q_TAIL_CPU_MASK);
+   u32 newval = queued_spin_encode_locked_val() | (val & _Q_TAIL_CPU_MASK);
 
return __trylock_cmpxchg(lock, val, newval);
 }
-- 
2.37.2



[PATCH v2 08/17] powerpc/qspinlock: paravirt yield to lock owner

2022-11-13 Thread Nicholas Piggin
Waiters spinning on the lock word should yield to the lock owner if the
vCPU is preempted. This improves performance when the hypervisor has
oversubscribed physical CPUs.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/lib/qspinlock.c | 101 ++-
 1 file changed, 88 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index b25a52251cb3..d81d72125034 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define MAX_NODES  4
 
@@ -24,14 +25,16 @@ static int steal_spins __read_mostly = (1<<5);
 static bool maybe_stealers __read_mostly = true;
 static int head_spins __read_mostly = (1<<8);
 
+static bool pv_yield_owner __read_mostly = true;
+
 static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
 
-static __always_inline int get_steal_spins(void)
+static __always_inline int get_steal_spins(bool paravirt)
 {
return steal_spins;
 }
 
-static __always_inline int get_head_spins(void)
+static __always_inline int get_head_spins(bool paravirt)
 {
return head_spins;
 }
@@ -46,6 +49,11 @@ static inline int decode_tail_cpu(u32 val)
return (val >> _Q_TAIL_CPU_OFFSET) - 1;
 }
 
+static inline int get_owner_cpu(u32 val)
+{
+   return (val & _Q_OWNER_CPU_MASK) >> _Q_OWNER_CPU_OFFSET;
+}
+
 /* Take the lock by setting the lock bit, no other CPUs will touch it. */
 static __always_inline void set_locked(struct qspinlock *lock)
 {
@@ -169,7 +177,45 @@ static struct qnode *get_tail_qnode(struct qspinlock 
*lock, u32 val)
BUG();
 }
 
-static inline bool try_to_steal_lock(struct qspinlock *lock)
+static __always_inline void yield_to_locked_owner(struct qspinlock *lock, u32 
val, bool paravirt)
+{
+   int owner;
+   u32 yield_count;
+
+   BUG_ON(!(val & _Q_LOCKED_VAL));
+
+   if (!paravirt)
+   goto relax;
+
+   if (!pv_yield_owner)
+   goto relax;
+
+   owner = get_owner_cpu(val);
+   yield_count = yield_count_of(owner);
+
+   if ((yield_count & 1) == 0)
+   goto relax; /* owner vcpu is running */
+
+   /*
+* Read the lock word after sampling the yield count. On the other side
+* there may a wmb because the yield count update is done by the
+* hypervisor preemption and the value update by the OS, however this
+* ordering might reduce the chance of out of order accesses and
+* improve the heuristic.
+*/
+   smp_rmb();
+
+   if (READ_ONCE(lock->val) == val) {
+   yield_to_preempted(owner, yield_count);
+   /* Don't relax if we yielded. Maybe we should? */
+   return;
+   }
+relax:
+   cpu_relax();
+}
+
+
+static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool 
paravirt)
 {
int iters = 0;
 
@@ -187,16 +233,16 @@ static inline bool try_to_steal_lock(struct qspinlock 
*lock)
if (trylock_with_tail_cpu(lock, val))
return true;
} else {
-   cpu_relax();
+   yield_to_locked_owner(lock, val, paravirt);
}
 
iters++;
-   } while (iters < get_steal_spins());
+   } while (iters < get_steal_spins(paravirt));
 
return false;
 }
 
-static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock)
+static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, 
bool paravirt)
 {
struct qnodes *qnodesp;
struct qnode *next, *node;
@@ -252,7 +298,7 @@ static inline void queued_spin_lock_mcs_queue(struct 
qspinlock *lock)
if (!(val & _Q_LOCKED_VAL))
break;
 
-   cpu_relax();
+   yield_to_locked_owner(lock, val, paravirt);
}
 
/* If we're the last queued, must clean up the tail. */
@@ -275,10 +321,10 @@ static inline void queued_spin_lock_mcs_queue(struct 
qspinlock *lock)
if (!(val & _Q_LOCKED_VAL))
break;
 
-   cpu_relax();
+   yield_to_locked_owner(lock, val, paravirt);
 
iters++;
-   if (!mustq && iters >= get_head_spins()) {
+   if (!mustq && iters >= get_head_spins(paravirt)) {
mustq = true;
set_mustq(lock);
val |= _Q_MUST_Q_VAL;
@@ -317,10 +363,20 @@ static inline void queued_spin_lock_mcs_queue(struct 
qspinlock *lock)
 
 void queued_spin_lock_slowpath(struct qspinlock *lock)
 {
-   if (try_to_steal_lock(lock))
-   return;
-
-   queued_spin_lock_mcs_queue(lock);
+   /*
+* This looks funny, but it induces the compiler to inline both
+  

[PATCH v2 09/17] powerpc/qspinlock: implement option to yield to previous node

2022-11-13 Thread Nicholas Piggin
Queued waiters which are not at the head of the queue don't spin on
the lock word but their qnode lock word, waiting for the previous queued
CPU to release them. Add an option which allows these waiters to yield
to the previous CPU if its vCPU is preempted.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/lib/qspinlock.c | 46 +++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index d81d72125034..272467c99b90 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -26,6 +26,7 @@ static bool maybe_stealers __read_mostly = true;
 static int head_spins __read_mostly = (1<<8);
 
 static bool pv_yield_owner __read_mostly = true;
+static bool pv_yield_prev __read_mostly = true;
 
 static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
 
@@ -214,6 +215,32 @@ static __always_inline void yield_to_locked_owner(struct 
qspinlock *lock, u32 va
cpu_relax();
 }
 
+static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode 
*node, u32 val, bool paravirt)
+{
+   int prev_cpu = decode_tail_cpu(val);
+   u32 yield_count;
+
+   if (!paravirt)
+   goto relax;
+
+   if (!pv_yield_prev)
+   goto relax;
+
+   yield_count = yield_count_of(prev_cpu);
+   if ((yield_count & 1) == 0)
+   goto relax; /* owner vcpu is running */
+
+   smp_rmb(); /* See yield_to_locked_owner comment */
+
+   if (!node->locked) {
+   yield_to_preempted(prev_cpu, yield_count);
+   return;
+   }
+
+relax:
+   cpu_relax();
+}
+
 
 static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool 
paravirt)
 {
@@ -286,7 +313,7 @@ static __always_inline void 
queued_spin_lock_mcs_queue(struct qspinlock *lock, b
 
/* Wait for mcs node lock to be released */
while (!node->locked)
-   cpu_relax();
+   yield_to_prev(lock, node, old, paravirt);
 
smp_rmb(); /* acquire barrier for the mcs lock */
}
@@ -458,12 +485,29 @@ static int pv_yield_owner_get(void *data, u64 *val)
 
 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_owner, pv_yield_owner_get, 
pv_yield_owner_set, "%llu\n");
 
+static int pv_yield_prev_set(void *data, u64 val)
+{
+   pv_yield_prev = !!val;
+
+   return 0;
+}
+
+static int pv_yield_prev_get(void *data, u64 *val)
+{
+   *val = pv_yield_prev;
+
+   return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, 
pv_yield_prev_set, "%llu\n");
+
 static __init int spinlock_debugfs_init(void)
 {
debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, 
&fops_steal_spins);
debugfs_create_file("qspl_head_spins", 0600, arch_debugfs_dir, NULL, 
&fops_head_spins);
if (is_shared_processor()) {
debugfs_create_file("qspl_pv_yield_owner", 0600, 
arch_debugfs_dir, NULL, &fops_pv_yield_owner);
+   debugfs_create_file("qspl_pv_yield_prev", 0600, 
arch_debugfs_dir, NULL, &fops_pv_yield_prev);
}
 
return 0;
-- 
2.37.2



[PATCH v2 10/17] powerpc/qspinlock: allow stealing when head of queue yields

2022-11-13 Thread Nicholas Piggin
If the head of queue is preventing stealing but it finds the owner vCPU
is preempted, it will yield its cycles to the owner which could cause it
to become preempted. Add an option to re-allow stealers before yielding,
and disallow them again after returning from the yield.

Disable this option by default for now, i.e., no logical change.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/lib/qspinlock.c | 61 +---
 1 file changed, 57 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index 272467c99b90..6b54b4628991 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -26,6 +26,7 @@ static bool maybe_stealers __read_mostly = true;
 static int head_spins __read_mostly = (1<<8);
 
 static bool pv_yield_owner __read_mostly = true;
+static bool pv_yield_allow_steal __read_mostly = false;
 static bool pv_yield_prev __read_mostly = true;
 
 static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
@@ -154,6 +155,22 @@ static __always_inline u32 set_mustq(struct qspinlock 
*lock)
return prev;
 }
 
+static __always_inline u32 clear_mustq(struct qspinlock *lock)
+{
+   u32 prev;
+
+   asm volatile(
+"1:lwarx   %0,0,%1 # clear_mustq   \n"
+"  andc%0,%0,%2\n"
+"  stwcx.  %0,0,%1 \n"
+"  bne-1b  \n"
+   : "=&r" (prev)
+   : "r" (&lock->val), "r" (_Q_MUST_Q_VAL)
+   : "cr0", "memory");
+
+   return prev;
+}
+
 static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
 {
int cpu = decode_tail_cpu(val);
@@ -178,7 +195,7 @@ static struct qnode *get_tail_qnode(struct qspinlock *lock, 
u32 val)
BUG();
 }
 
-static __always_inline void yield_to_locked_owner(struct qspinlock *lock, u32 
val, bool paravirt)
+static __always_inline void __yield_to_locked_owner(struct qspinlock *lock, 
u32 val, bool paravirt, bool mustq)
 {
int owner;
u32 yield_count;
@@ -207,7 +224,11 @@ static __always_inline void yield_to_locked_owner(struct 
qspinlock *lock, u32 va
smp_rmb();
 
if (READ_ONCE(lock->val) == val) {
+   if (mustq)
+   clear_mustq(lock);
yield_to_preempted(owner, yield_count);
+   if (mustq)
+   set_mustq(lock);
/* Don't relax if we yielded. Maybe we should? */
return;
}
@@ -215,6 +236,21 @@ static __always_inline void yield_to_locked_owner(struct 
qspinlock *lock, u32 va
cpu_relax();
 }
 
+static __always_inline void yield_to_locked_owner(struct qspinlock *lock, u32 
val, bool paravirt)
+{
+   __yield_to_locked_owner(lock, val, paravirt, false);
+}
+
+static __always_inline void yield_head_to_locked_owner(struct qspinlock *lock, 
u32 val, bool paravirt)
+{
+   bool mustq = false;
+
+   if ((val & _Q_MUST_Q_VAL) && pv_yield_allow_steal)
+   mustq = true;
+
+   __yield_to_locked_owner(lock, val, paravirt, mustq);
+}
+
 static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode 
*node, u32 val, bool paravirt)
 {
int prev_cpu = decode_tail_cpu(val);
@@ -230,7 +266,7 @@ static __always_inline void yield_to_prev(struct qspinlock 
*lock, struct qnode *
if ((yield_count & 1) == 0)
goto relax; /* owner vcpu is running */
 
-   smp_rmb(); /* See yield_to_locked_owner comment */
+   smp_rmb(); /* See __yield_to_locked_owner comment */
 
if (!node->locked) {
yield_to_preempted(prev_cpu, yield_count);
@@ -325,7 +361,7 @@ static __always_inline void 
queued_spin_lock_mcs_queue(struct qspinlock *lock, b
if (!(val & _Q_LOCKED_VAL))
break;
 
-   yield_to_locked_owner(lock, val, paravirt);
+   yield_head_to_locked_owner(lock, val, paravirt);
}
 
/* If we're the last queued, must clean up the tail. */
@@ -348,7 +384,7 @@ static __always_inline void 
queued_spin_lock_mcs_queue(struct qspinlock *lock, b
if (!(val & _Q_LOCKED_VAL))
break;
 
-   yield_to_locked_owner(lock, val, paravirt);
+   yield_head_to_locked_owner(lock, val, paravirt);
 
iters++;
if (!mustq && iters >= get_head_spins(paravirt)) {
@@ -485,6 +521,22 @@ static int pv_yield_owner_get(void *data, u64 *val)
 
 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_owner, pv_yield_owner_get, 
pv_yield_owner_set, "%llu\n");
 
+static int pv_yield_allow_steal_set(void *data, u64 val)
+{
+   pv_yield_allow_steal = !!val;
+
+   return 0;
+}
+
+static int pv_yield_allow_steal_get(void *data, u64 *val)
+{

[PATCH v2 11/17] powerpc/qspinlock: allow propagation of yield CPU down the queue

2022-11-13 Thread Nicholas Piggin
Having all CPUs poll the lock word for the owner CPU that should be
yielded to defeats most of the purpose of using MCS queueing for
scalability. Yet it may be desirable for queued waiters to yield
to a preempted owner.

s390 addresses this problem by having queued waiters sample the lock
word to find the owner much less frequently. In this approach, the
waiters never sample it directly, but the queue head propagates the
owner CPU back to the next waiter if it ever finds the owner has
been preempted. Queued waiters then subsequently propagate the owner
CPU back to the next waiter, and so on.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/lib/qspinlock.c | 82 
 1 file changed, 82 insertions(+)

diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index 6b54b4628991..f07843b4c497 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -12,6 +12,7 @@
 struct qnode {
struct qnode*next;
struct qspinlock *lock;
+   int yield_cpu;
u8  locked; /* 1 if lock acquired */
 };
 
@@ -28,6 +29,7 @@ static int head_spins __read_mostly = (1<<8);
 static bool pv_yield_owner __read_mostly = true;
 static bool pv_yield_allow_steal __read_mostly = false;
 static bool pv_yield_prev __read_mostly = true;
+static bool pv_yield_propagate_owner __read_mostly = true;
 
 static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
 
@@ -251,14 +253,67 @@ static __always_inline void 
yield_head_to_locked_owner(struct qspinlock *lock, u
__yield_to_locked_owner(lock, val, paravirt, mustq);
 }
 
+static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, 
int *set_yield_cpu, bool paravirt)
+{
+   struct qnode *next;
+   int owner;
+
+   if (!paravirt)
+   return;
+   if (!pv_yield_propagate_owner)
+   return;
+
+   owner = get_owner_cpu(val);
+   if (*set_yield_cpu == owner)
+   return;
+
+   next = READ_ONCE(node->next);
+   if (!next)
+   return;
+
+   if (vcpu_is_preempted(owner)) {
+   next->yield_cpu = owner;
+   *set_yield_cpu = owner;
+   } else if (*set_yield_cpu != -1) {
+   next->yield_cpu = owner;
+   *set_yield_cpu = owner;
+   }
+}
+
 static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode 
*node, u32 val, bool paravirt)
 {
int prev_cpu = decode_tail_cpu(val);
u32 yield_count;
+   int yield_cpu;
 
if (!paravirt)
goto relax;
 
+   if (!pv_yield_propagate_owner)
+   goto yield_prev;
+
+   yield_cpu = READ_ONCE(node->yield_cpu);
+   if (yield_cpu == -1) {
+   /* Propagate back the -1 CPU */
+   if (node->next && node->next->yield_cpu != -1)
+   node->next->yield_cpu = yield_cpu;
+   goto yield_prev;
+   }
+
+   yield_count = yield_count_of(yield_cpu);
+   if ((yield_count & 1) == 0)
+   goto yield_prev; /* owner vcpu is running */
+
+   smp_rmb();
+
+   if (yield_cpu == node->yield_cpu) {
+   if (node->next && node->next->yield_cpu != yield_cpu)
+   node->next->yield_cpu = yield_cpu;
+   yield_to_preempted(yield_cpu, yield_count);
+   return;
+   }
+
+yield_prev:
if (!pv_yield_prev)
goto relax;
 
@@ -331,6 +386,7 @@ static __always_inline void 
queued_spin_lock_mcs_queue(struct qspinlock *lock, b
node = &qnodesp->nodes[idx];
node->next = NULL;
node->lock = lock;
+   node->yield_cpu = -1;
node->locked = 0;
 
tail = encode_tail_cpu(smp_processor_id());
@@ -351,16 +407,23 @@ static __always_inline void 
queued_spin_lock_mcs_queue(struct qspinlock *lock, b
while (!node->locked)
yield_to_prev(lock, node, old, paravirt);
 
+   /* Clear out stale propagated yield_cpu */
+   if (paravirt && pv_yield_propagate_owner && node->yield_cpu != 
-1)
+   node->yield_cpu = -1;
+
smp_rmb(); /* acquire barrier for the mcs lock */
}
 
if (!maybe_stealers) {
+   int set_yield_cpu = -1;
+
/* We're at the head of the waitqueue, wait for the lock. */
for (;;) {
val = READ_ONCE(lock->val);
if (!(val & _Q_LOCKED_VAL))
break;
 
+   propagate_yield_cpu(node, val, &set_yield_cpu, 
paravirt);
yield_head_to_locked_owner(lock, val, paravirt);
}
 
@@ -374,6 +437,7 @@ static __always_inline void 
queued_spin_lock_mcs_queue(struct qspinlock *lock, b
/* We must be the owner, just set the lock bit and acquire */
set_locked(lock);
} els

[PATCH v2 12/17] powerpc/qspinlock: add ability to prod new queue head CPU

2022-11-13 Thread Nicholas Piggin
After the head of the queue acquires the lock, it releases the
next waiter in the queue to become the new head. Add an option
to prod the new head if its vCPU was preempted. This may only
have an effect if queue waiters are yielding.

Disable this option by default for now, i.e., no logical change.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/lib/qspinlock.c | 29 -
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index f07843b4c497..51123240da8e 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -12,6 +12,7 @@
 struct qnode {
struct qnode*next;
struct qspinlock *lock;
+   int cpu;
int yield_cpu;
u8  locked; /* 1 if lock acquired */
 };
@@ -30,6 +31,7 @@ static bool pv_yield_owner __read_mostly = true;
 static bool pv_yield_allow_steal __read_mostly = false;
 static bool pv_yield_prev __read_mostly = true;
 static bool pv_yield_propagate_owner __read_mostly = true;
+static bool pv_prod_head __read_mostly = false;
 
 static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
 
@@ -386,6 +388,7 @@ static __always_inline void 
queued_spin_lock_mcs_queue(struct qspinlock *lock, b
node = &qnodesp->nodes[idx];
node->next = NULL;
node->lock = lock;
+   node->cpu = smp_processor_id();
node->yield_cpu = -1;
node->locked = 0;
 
@@ -483,7 +486,14 @@ static __always_inline void 
queued_spin_lock_mcs_queue(struct qspinlock *lock, b
 * this store to locked. The corresponding barrier is the smp_rmb()
 * acquire barrier for mcs lock, above.
 */
-   WRITE_ONCE(next->locked, 1);
+   if (paravirt && pv_prod_head) {
+   int next_cpu = next->cpu;
+   WRITE_ONCE(next->locked, 1);
+   if (vcpu_is_preempted(next_cpu))
+   prod_cpu(next_cpu);
+   } else {
+   WRITE_ONCE(next->locked, 1);
+   }
 
 release:
qnodesp->count--; /* release the node */
@@ -634,6 +644,22 @@ static int pv_yield_propagate_owner_get(void *data, u64 
*val)
 
 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, 
pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n");
 
+static int pv_prod_head_set(void *data, u64 val)
+{
+   pv_prod_head = !!val;
+
+   return 0;
+}
+
+static int pv_prod_head_get(void *data, u64 *val)
+{
+   *val = pv_prod_head;
+
+   return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_pv_prod_head, pv_prod_head_get, pv_prod_head_set, 
"%llu\n");
+
 static __init int spinlock_debugfs_init(void)
 {
debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, 
&fops_steal_spins);
@@ -643,6 +669,7 @@ static __init int spinlock_debugfs_init(void)
debugfs_create_file("qspl_pv_yield_allow_steal", 0600, 
arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal);
debugfs_create_file("qspl_pv_yield_prev", 0600, 
arch_debugfs_dir, NULL, &fops_pv_yield_prev);
debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, 
arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner);
+   debugfs_create_file("qspl_pv_prod_head", 0600, 
arch_debugfs_dir, NULL, &fops_pv_prod_head);
}
 
return 0;
-- 
2.37.2



[PATCH v2 13/17] powerpc/qspinlock: trylock and initial lock attempt may steal

2022-11-13 Thread Nicholas Piggin
This gives trylock slightly more strength, and it also gives most
of the benefit of passing 'val' back through the slowpath without
the complexity.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/qspinlock.h | 44 +++-
 arch/powerpc/lib/qspinlock.c |  9 ++
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
index 3eff2d875bb6..56638175e49b 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -5,6 +5,15 @@
 #include 
 #include 
 
+/*
+ * The trylock itself may steal. This makes trylocks slightly stronger, and
+ * might make spin locks slightly more efficient when stealing.
+ *
+ * This is compile-time, so if true then there may always be stealers, so the
+ * nosteal paths become unused.
+ */
+#define _Q_SPIN_TRY_LOCK_STEAL 1
+
 static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
 {
return READ_ONCE(lock->val);
@@ -26,11 +35,12 @@ static __always_inline u32 
queued_spin_encode_locked_val(void)
return _Q_LOCKED_VAL | (smp_processor_id() << _Q_OWNER_CPU_OFFSET);
 }
 
-static __always_inline int queued_spin_trylock(struct qspinlock *lock)
+static __always_inline int __queued_spin_trylock_nosteal(struct qspinlock 
*lock)
 {
u32 new = queued_spin_encode_locked_val();
u32 prev;
 
+   /* Trylock succeeds only when unlocked and no queued nodes */
asm volatile(
 "1:lwarx   %0,0,%1,%3  # queued_spin_trylock   \n"
 "  cmpwi   0,%0,0  \n"
@@ -47,6 +57,38 @@ static __always_inline int queued_spin_trylock(struct 
qspinlock *lock)
return likely(prev == 0);
 }
 
+static __always_inline int __queued_spin_trylock_steal(struct qspinlock *lock)
+{
+   u32 new = queued_spin_encode_locked_val();
+   u32 prev, tmp;
+
+   /* Trylock may get ahead of queued nodes if it finds unlocked */
+   asm volatile(
+"1:lwarx   %0,0,%2,%5  # queued_spin_trylock   \n"
+"  andc.   %1,%0,%4\n"
+"  bne-2f  \n"
+"  and %1,%0,%4\n"
+"  or  %1,%1,%3\n"
+"  stwcx.  %1,0,%2 \n"
+"  bne-1b  \n"
+"\t"   PPC_ACQUIRE_BARRIER "   \n"
+"2:\n"
+   : "=&r" (prev), "=&r" (tmp)
+   : "r" (&lock->val), "r" (new), "r" (_Q_TAIL_CPU_MASK),
+ "i" (IS_ENABLED(CONFIG_PPC64))
+   : "cr0", "memory");
+
+   return likely(!(prev & ~_Q_TAIL_CPU_MASK));
+}
+
+static __always_inline int queued_spin_trylock(struct qspinlock *lock)
+{
+   if (!_Q_SPIN_TRY_LOCK_STEAL)
+   return __queued_spin_trylock_nosteal(lock);
+   else
+   return __queued_spin_trylock_steal(lock);
+}
+
 void queued_spin_lock_slowpath(struct qspinlock *lock);
 
 static __always_inline void queued_spin_lock(struct qspinlock *lock)
diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index 51123240da8e..830a90a66f5f 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -24,7 +24,11 @@ struct qnodes {
 
 /* Tuning parameters */
 static int steal_spins __read_mostly = (1<<5);
+#if _Q_SPIN_TRY_LOCK_STEAL == 1
+static const bool maybe_stealers = true;
+#else
 static bool maybe_stealers __read_mostly = true;
+#endif
 static int head_spins __read_mostly = (1<<8);
 
 static bool pv_yield_owner __read_mostly = true;
@@ -527,6 +531,10 @@ void pv_spinlocks_init(void)
 #include 
 static int steal_spins_set(void *data, u64 val)
 {
+#if _Q_SPIN_TRY_LOCK_STEAL == 1
+   /* MAYBE_STEAL remains true */
+   steal_spins = val;
+#else
static DEFINE_MUTEX(lock);
 
/*
@@ -551,6 +559,7 @@ static int steal_spins_set(void *data, u64 val)
steal_spins = val;
}
mutex_unlock(&lock);
+#endif
 
return 0;
 }
-- 
2.37.2



[PATCH v2 14/17] powerpc/qspinlock: use spin_begin/end API

2022-11-13 Thread Nicholas Piggin
Use the spin_begin/spin_cpu_relax/spin_end APIs in qspinlock, which helps
to prevent threads issuing a lot of expensive priority nops which may not
have much effect due to immediately executing low then medium priority.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/lib/qspinlock.c | 41 
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index 830a90a66f5f..ea8886e2922b 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -203,6 +203,7 @@ static struct qnode *get_tail_qnode(struct qspinlock *lock, 
u32 val)
BUG();
 }
 
+/* Called inside spin_begin() */
 static __always_inline void __yield_to_locked_owner(struct qspinlock *lock, 
u32 val, bool paravirt, bool mustq)
 {
int owner;
@@ -222,6 +223,8 @@ static __always_inline void __yield_to_locked_owner(struct 
qspinlock *lock, u32
if ((yield_count & 1) == 0)
goto relax; /* owner vcpu is running */
 
+   spin_end();
+
/*
 * Read the lock word after sampling the yield count. On the other side
 * there may a wmb because the yield count update is done by the
@@ -237,18 +240,22 @@ static __always_inline void 
__yield_to_locked_owner(struct qspinlock *lock, u32
yield_to_preempted(owner, yield_count);
if (mustq)
set_mustq(lock);
+   spin_begin();
/* Don't relax if we yielded. Maybe we should? */
return;
}
+   spin_begin();
 relax:
-   cpu_relax();
+   spin_cpu_relax();
 }
 
+/* Called inside spin_begin() */
 static __always_inline void yield_to_locked_owner(struct qspinlock *lock, u32 
val, bool paravirt)
 {
__yield_to_locked_owner(lock, val, paravirt, false);
 }
 
+/* Called inside spin_begin() */
 static __always_inline void yield_head_to_locked_owner(struct qspinlock *lock, 
u32 val, bool paravirt)
 {
bool mustq = false;
@@ -286,6 +293,7 @@ static __always_inline void propagate_yield_cpu(struct 
qnode *node, u32 val, int
}
 }
 
+/* Called inside spin_begin() */
 static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode 
*node, u32 val, bool paravirt)
 {
int prev_cpu = decode_tail_cpu(val);
@@ -310,14 +318,18 @@ static __always_inline void yield_to_prev(struct 
qspinlock *lock, struct qnode *
if ((yield_count & 1) == 0)
goto yield_prev; /* owner vcpu is running */
 
+   spin_end();
+
smp_rmb();
 
if (yield_cpu == node->yield_cpu) {
if (node->next && node->next->yield_cpu != yield_cpu)
node->next->yield_cpu = yield_cpu;
yield_to_preempted(yield_cpu, yield_count);
+   spin_begin();
return;
}
+   spin_begin();
 
 yield_prev:
if (!pv_yield_prev)
@@ -327,15 +339,19 @@ static __always_inline void yield_to_prev(struct 
qspinlock *lock, struct qnode *
if ((yield_count & 1) == 0)
goto relax; /* owner vcpu is running */
 
+   spin_end();
+
smp_rmb(); /* See __yield_to_locked_owner comment */
 
if (!node->locked) {
yield_to_preempted(prev_cpu, yield_count);
+   spin_begin();
return;
}
+   spin_begin();
 
 relax:
-   cpu_relax();
+   spin_cpu_relax();
 }
 
 
@@ -347,6 +363,8 @@ static __always_inline bool try_to_steal_lock(struct 
qspinlock *lock, bool parav
return false;
 
/* Attempt to steal the lock */
+   spin_begin();
+
do {
u32 val = READ_ONCE(lock->val);
 
@@ -354,8 +372,10 @@ static __always_inline bool try_to_steal_lock(struct 
qspinlock *lock, bool parav
break;
 
if (unlikely(!(val & _Q_LOCKED_VAL))) {
+   spin_end();
if (trylock_with_tail_cpu(lock, val))
return true;
+   spin_begin();
} else {
yield_to_locked_owner(lock, val, paravirt);
}
@@ -363,6 +383,8 @@ static __always_inline bool try_to_steal_lock(struct 
qspinlock *lock, bool parav
iters++;
} while (iters < get_steal_spins(paravirt));
 
+   spin_end();
+
return false;
 }
 
@@ -411,8 +433,10 @@ static __always_inline void 
queued_spin_lock_mcs_queue(struct qspinlock *lock, b
WRITE_ONCE(prev->next, node);
 
/* Wait for mcs node lock to be released */
+   spin_begin();
while (!node->locked)
yield_to_prev(lock, node, old, paravirt);
+   spin_end();
 
/* Clear out stale propagated yield_cpu */
if (paravirt && pv_yield_propagate_owner && node->yield_cpu != 
-1)
@@ -425,6 +449,7 @@ static __

[PATCH v2 15/17] powerpc/qspinlock: reduce remote node steal spins

2022-11-13 Thread Nicholas Piggin
Allow for a reduction in the number of times a CPU from a different
node than the owner can attempt to steal the lock before queueing.
This could bias the transfer behaviour of the lock across the
machine and reduce NUMA crossings.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/lib/qspinlock.c | 43 +---
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index ea8886e2922b..a1c832a52d26 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -24,6 +25,7 @@ struct qnodes {
 
 /* Tuning parameters */
 static int steal_spins __read_mostly = (1<<5);
+static int remote_steal_spins __read_mostly = (1<<2);
 #if _Q_SPIN_TRY_LOCK_STEAL == 1
 static const bool maybe_stealers = true;
 #else
@@ -44,6 +46,11 @@ static __always_inline int get_steal_spins(bool paravirt)
return steal_spins;
 }
 
+static __always_inline int get_remote_steal_spins(bool paravirt)
+{
+   return remote_steal_spins;
+}
+
 static __always_inline int get_head_spins(bool paravirt)
 {
return head_spins;
@@ -354,10 +361,24 @@ static __always_inline void yield_to_prev(struct 
qspinlock *lock, struct qnode *
spin_cpu_relax();
 }
 
+static __always_inline bool steal_break(u32 val, int iters, bool paravirt)
+{
+   if (iters >= get_steal_spins(paravirt))
+   return true;
+
+   if (IS_ENABLED(CONFIG_NUMA) &&
+   (iters >= get_remote_steal_spins(paravirt))) {
+   int cpu = get_owner_cpu(val);
+   if (numa_node_id() != cpu_to_node(cpu))
+   return true;
+   }
+   return false;
+}
 
 static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool 
paravirt)
 {
int iters = 0;
+   u32 val;
 
if (!maybe_stealers)
return false;
@@ -366,8 +387,7 @@ static __always_inline bool try_to_steal_lock(struct 
qspinlock *lock, bool parav
spin_begin();
 
do {
-   u32 val = READ_ONCE(lock->val);
-
+   val = READ_ONCE(lock->val);
if (val & _Q_MUST_Q_VAL)
break;
 
@@ -381,7 +401,7 @@ static __always_inline bool try_to_steal_lock(struct 
qspinlock *lock, bool parav
}
 
iters++;
-   } while (iters < get_steal_spins(paravirt));
+   } while (!steal_break(val, iters, paravirt));
 
spin_end();
 
@@ -606,6 +626,22 @@ static int steal_spins_get(void *data, u64 *val)
 
 DEFINE_SIMPLE_ATTRIBUTE(fops_steal_spins, steal_spins_get, steal_spins_set, 
"%llu\n");
 
+static int remote_steal_spins_set(void *data, u64 val)
+{
+   remote_steal_spins = val;
+
+   return 0;
+}
+
+static int remote_steal_spins_get(void *data, u64 *val)
+{
+   *val = remote_steal_spins;
+
+   return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_remote_steal_spins, remote_steal_spins_get, 
remote_steal_spins_set, "%llu\n");
+
 static int head_spins_set(void *data, u64 val)
 {
head_spins = val;
@@ -705,6 +741,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_pv_prod_head, 
pv_prod_head_get, pv_prod_head_set, "
 static __init int spinlock_debugfs_init(void)
 {
debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, 
&fops_steal_spins);
+   debugfs_create_file("qspl_remote_steal_spins", 0600, arch_debugfs_dir, 
NULL, &fops_remote_steal_spins);
debugfs_create_file("qspl_head_spins", 0600, arch_debugfs_dir, NULL, 
&fops_head_spins);
if (is_shared_processor()) {
debugfs_create_file("qspl_pv_yield_owner", 0600, 
arch_debugfs_dir, NULL, &fops_pv_yield_owner);
-- 
2.37.2



[PATCH v2 16/17] powerpc/qspinlock: allow indefinite spinning on a preempted owner

2022-11-13 Thread Nicholas Piggin
Provide an option that holds off queueing indefinitely while the lock
owner is preempted. This could reduce queueing latencies for very
overcommitted vcpu situations.

This is disabled by default.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/lib/qspinlock.c | 74 
 1 file changed, 59 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index a1c832a52d26..7e6ab1f30d50 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -35,6 +35,7 @@ static int head_spins __read_mostly = (1<<8);
 
 static bool pv_yield_owner __read_mostly = true;
 static bool pv_yield_allow_steal __read_mostly = false;
+static bool pv_spin_on_preempted_owner __read_mostly = false;
 static bool pv_yield_prev __read_mostly = true;
 static bool pv_yield_propagate_owner __read_mostly = true;
 static bool pv_prod_head __read_mostly = false;
@@ -210,11 +211,12 @@ static struct qnode *get_tail_qnode(struct qspinlock 
*lock, u32 val)
BUG();
 }
 
-/* Called inside spin_begin() */
-static __always_inline void __yield_to_locked_owner(struct qspinlock *lock, 
u32 val, bool paravirt, bool mustq)
+/* Called inside spin_begin(). Returns whether or not the vCPU was preempted. 
*/
+static __always_inline bool __yield_to_locked_owner(struct qspinlock *lock, 
u32 val, bool paravirt, bool mustq)
 {
int owner;
u32 yield_count;
+   bool preempted = false;
 
BUG_ON(!(val & _Q_LOCKED_VAL));
 
@@ -232,6 +234,8 @@ static __always_inline void __yield_to_locked_owner(struct 
qspinlock *lock, u32
 
spin_end();
 
+   preempted = true;
+
/*
 * Read the lock word after sampling the yield count. On the other side
 * there may a wmb because the yield count update is done by the
@@ -248,29 +252,32 @@ static __always_inline void 
__yield_to_locked_owner(struct qspinlock *lock, u32
if (mustq)
set_mustq(lock);
spin_begin();
+
/* Don't relax if we yielded. Maybe we should? */
-   return;
+   return preempted;
}
spin_begin();
 relax:
spin_cpu_relax();
+
+   return preempted;
 }
 
-/* Called inside spin_begin() */
-static __always_inline void yield_to_locked_owner(struct qspinlock *lock, u32 
val, bool paravirt)
+/* Called inside spin_begin(). Returns whether or not the vCPU was preempted. 
*/
+static __always_inline bool yield_to_locked_owner(struct qspinlock *lock, u32 
val, bool paravirt)
 {
-   __yield_to_locked_owner(lock, val, paravirt, false);
+   return __yield_to_locked_owner(lock, val, paravirt, false);
 }
 
-/* Called inside spin_begin() */
-static __always_inline void yield_head_to_locked_owner(struct qspinlock *lock, 
u32 val, bool paravirt)
+/* Called inside spin_begin(). Returns whether or not the vCPU was preempted. 
*/
+static __always_inline bool yield_head_to_locked_owner(struct qspinlock *lock, 
u32 val, bool paravirt)
 {
bool mustq = false;
 
if ((val & _Q_MUST_Q_VAL) && pv_yield_allow_steal)
mustq = true;
 
-   __yield_to_locked_owner(lock, val, paravirt, mustq);
+   return __yield_to_locked_owner(lock, val, paravirt, mustq);
 }
 
 static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, 
int *set_yield_cpu, bool paravirt)
@@ -380,13 +387,16 @@ static __always_inline bool try_to_steal_lock(struct 
qspinlock *lock, bool parav
int iters = 0;
u32 val;
 
-   if (!maybe_stealers)
+   if (!maybe_stealers) {
+   /* XXX: should spin_on_preempted_owner do anything here? */
return false;
+   }
 
/* Attempt to steal the lock */
spin_begin();
-
do {
+   bool preempted = false;
+
val = READ_ONCE(lock->val);
if (val & _Q_MUST_Q_VAL)
break;
@@ -397,10 +407,23 @@ static __always_inline bool try_to_steal_lock(struct 
qspinlock *lock, bool parav
return true;
spin_begin();
} else {
-   yield_to_locked_owner(lock, val, paravirt);
+   preempted = yield_to_locked_owner(lock, val, paravirt);
}
 
-   iters++;
+   if (preempted) {
+   if (!pv_spin_on_preempted_owner)
+   iters++;
+   /*
+* pv_spin_on_preempted_owner don't increase iters
+* while the owner is preempted -- we won't interfere
+* with it by definition. This could introduce some
+* latency issue if we continually observe preempted
+* owners, but hopefully that's a rare corner case of
+* a badly oversubscribed system.
+   

[PATCH v2 17/17] powerpc/qspinlock: provide accounting and options for sleepy locks

2022-11-13 Thread Nicholas Piggin
Finding the owner or a queued waiter on a lock with a preempted vcpu
is indicative of an oversubscribed guest causing the lock to get into
trouble. Provide some options to detect this situation and have new
CPUs avoid queueing for a longer time (more steal iterations) to
minimise the problems caused by vcpu preemption on the queue.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/qspinlock_types.h |   7 +-
 arch/powerpc/lib/qspinlock.c   | 244 +++--
 2 files changed, 232 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/qspinlock_types.h 
b/arch/powerpc/include/asm/qspinlock_types.h
index 35f9525381e6..4fbcc8a4230b 100644
--- a/arch/powerpc/include/asm/qspinlock_types.h
+++ b/arch/powerpc/include/asm/qspinlock_types.h
@@ -30,7 +30,7 @@ typedef struct qspinlock {
  *
  * 0: locked bit
  *  1-14: lock holder cpu
- *15: unused bit
+ *15: lock owner or queuer vcpus observed to be preempted bit
  *16: must queue bit
  * 17-31: tail cpu (+1)
  */
@@ -49,6 +49,11 @@ typedef struct qspinlock {
 #error "qspinlock does not support such large CONFIG_NR_CPUS"
 #endif
 
+#define _Q_SLEEPY_OFFSET   15
+#define _Q_SLEEPY_BITS 1
+#define _Q_SLEEPY_MASK _Q_SET_MASK(SLEEPY_OWNER)
+#define _Q_SLEEPY_VAL  (1U << _Q_SLEEPY_OFFSET)
+
 #define _Q_MUST_Q_OFFSET   16
 #define _Q_MUST_Q_BITS 1
 #define _Q_MUST_Q_MASK _Q_SET_MASK(MUST_Q)
diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index 7e6ab1f30d50..36afdfde41aa 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -36,25 +37,56 @@ static int head_spins __read_mostly = (1<<8);
 static bool pv_yield_owner __read_mostly = true;
 static bool pv_yield_allow_steal __read_mostly = false;
 static bool pv_spin_on_preempted_owner __read_mostly = false;
+static bool pv_sleepy_lock __read_mostly = true;
+static bool pv_sleepy_lock_sticky __read_mostly = false;
+static u64 pv_sleepy_lock_interval_ns __read_mostly = 0;
+static int pv_sleepy_lock_factor __read_mostly = 256;
 static bool pv_yield_prev __read_mostly = true;
 static bool pv_yield_propagate_owner __read_mostly = true;
 static bool pv_prod_head __read_mostly = false;
 
 static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
+static DEFINE_PER_CPU_ALIGNED(u64, sleepy_lock_seen_clock);
 
-static __always_inline int get_steal_spins(bool paravirt)
+static __always_inline bool recently_sleepy(void)
 {
-   return steal_spins;
+   /* pv_sleepy_lock is true when this is called */
+   if (pv_sleepy_lock_interval_ns) {
+   u64 seen = this_cpu_read(sleepy_lock_seen_clock);
+
+   if (seen) {
+   u64 delta = sched_clock() - seen;
+   if (delta < pv_sleepy_lock_interval_ns)
+   return true;
+   this_cpu_write(sleepy_lock_seen_clock, 0);
+   }
+   }
+
+   return false;
 }
 
-static __always_inline int get_remote_steal_spins(bool paravirt)
+static __always_inline int get_steal_spins(bool paravirt, bool sleepy)
 {
-   return remote_steal_spins;
+   if (paravirt && sleepy)
+   return steal_spins * pv_sleepy_lock_factor;
+   else
+   return steal_spins;
 }
 
-static __always_inline int get_head_spins(bool paravirt)
+static __always_inline int get_remote_steal_spins(bool paravirt, bool sleepy)
 {
-   return head_spins;
+   if (paravirt && sleepy)
+   return remote_steal_spins * pv_sleepy_lock_factor;
+   else
+   return remote_steal_spins;
+}
+
+static __always_inline int get_head_spins(bool paravirt, bool sleepy)
+{
+   if (paravirt && sleepy)
+   return head_spins * pv_sleepy_lock_factor;
+   else
+   return head_spins;
 }
 
 static inline u32 encode_tail_cpu(int cpu)
@@ -187,6 +219,56 @@ static __always_inline u32 clear_mustq(struct qspinlock 
*lock)
return prev;
 }
 
+static __always_inline bool try_set_sleepy(struct qspinlock *lock, u32 old)
+{
+   u32 prev;
+   u32 new = old | _Q_SLEEPY_VAL;
+
+   BUG_ON(!(old & _Q_LOCKED_VAL));
+   BUG_ON(old & _Q_SLEEPY_VAL);
+
+   asm volatile(
+"1:lwarx   %0,0,%1 # try_set_sleepy\n"
+"  cmpw0,%0,%2 \n"
+"  bne-2f  \n"
+"  stwcx.  %3,0,%1 \n"
+"  bne-1b  \n"
+"2:\n"
+   : "=&r" (prev)
+   : "r" (&lock->val), "r"(old), "r" (new)
+   : "cr0", "memory");
+
+   return likely(prev == old);
+}
+
+static __always_inline void seen_sleepy_owner(stru

Re: [PATCH v5 2/2] arm64: support batched/deferred tlb shootdown during page reclamation

2022-11-13 Thread Anshuman Khandual



On 10/28/22 13:42, Yicong Yang wrote:
> +static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
> +{
> + /*
> +  * TLB batched flush is proved to be beneficial for systems with large
> +  * number of CPUs, especially system with more than 8 CPUs. TLB shutdown
> +  * is cheap on small systems which may not need this feature. So use
> +  * a threshold for enabling this to avoid potential side effects on
> +  * these platforms.
> +  */
> + if (num_online_cpus() <= CONFIG_ARM64_NR_CPUS_FOR_BATCHED_TLB)
> + return false;
> +
> +#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI
> + if (unlikely(this_cpu_has_cap(ARM64_WORKAROUND_REPEAT_TLBI)))
> + return false;
> +#endif

should_defer_flush() is immediately followed by set_tlb_ubc_flush_pending() 
which calls
arch_tlbbatch_add_mm(), triggering the actual TLBI flush via 
__flush_tlb_page_nosync().
It should be okay to check capability with this_cpu_has_cap() as the entire 
call chain
here is executed on the same cpu. But just wondering if cpus_have_const_cap() 
would be
simpler, consistent, and also cost effective ?

Regardless, a comment is needed before the #ifdef block explaining why it does 
not make
sense to defer/batch when __tlbi()/__tlbi_user() implementation will execute 
'dsb(ish)'
between two TLBI instructions to work around the erratum.

> +
> + return true;
> +}
> +
> +static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch 
> *batch,
> + struct mm_struct *mm,
> + unsigned long uaddr)
> +{
> + __flush_tlb_page_nosync(mm, uaddr);
> +}
> +
> +static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch 
> *batch)
> +{
> + dsb(ish);
> +}


[PATCH linux next v2] scsi: ibmvfc: use sysfs_emit() to instead of scnprintf()

2022-11-13 Thread yang.yang29
From: Xu Panda 

Replace the open-coded snprintf(buf, PAGE_SIZE, ...) calls with sysfs_emit() to simplify the code.

---
change for v2
 - align code
---
Signed-off-by: Xu Panda 
Signed-off-by: Yang Yang 
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 20 
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 1a0c0b7289d2..841e47c94b12 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -3411,8 +3411,7 @@ static ssize_t ibmvfc_show_host_partition_name(struct 
device *dev,
struct Scsi_Host *shost = class_to_shost(dev);
struct ibmvfc_host *vhost = shost_priv(shost);

-   return snprintf(buf, PAGE_SIZE, "%s\n",
-   vhost->login_buf->resp.partition_name);
+   return sysfs_emit(buf, "%s\n", vhost->login_buf->resp.partition_name);
 }

 static ssize_t ibmvfc_show_host_device_name(struct device *dev,
@@ -3421,8 +3420,7 @@ static ssize_t ibmvfc_show_host_device_name(struct device 
*dev,
struct Scsi_Host *shost = class_to_shost(dev);
struct ibmvfc_host *vhost = shost_priv(shost);

-   return snprintf(buf, PAGE_SIZE, "%s\n",
-   vhost->login_buf->resp.device_name);
+   return sysfs_emit(buf, "%s\n", vhost->login_buf->resp.device_name);
 }

 static ssize_t ibmvfc_show_host_loc_code(struct device *dev,
@@ -3431,8 +3429,7 @@ static ssize_t ibmvfc_show_host_loc_code(struct device 
*dev,
struct Scsi_Host *shost = class_to_shost(dev);
struct ibmvfc_host *vhost = shost_priv(shost);

-   return snprintf(buf, PAGE_SIZE, "%s\n",
-   vhost->login_buf->resp.port_loc_code);
+   return sysfs_emit(buf, "%s\n", vhost->login_buf->resp.port_loc_code);
 }

 static ssize_t ibmvfc_show_host_drc_name(struct device *dev,
@@ -3441,8 +3438,7 @@ static ssize_t ibmvfc_show_host_drc_name(struct device 
*dev,
struct Scsi_Host *shost = class_to_shost(dev);
struct ibmvfc_host *vhost = shost_priv(shost);

-   return snprintf(buf, PAGE_SIZE, "%s\n",
-   vhost->login_buf->resp.drc_name);
+   return sysfs_emit(buf, "%s\n", vhost->login_buf->resp.drc_name);
 }

 static ssize_t ibmvfc_show_host_npiv_version(struct device *dev,
@@ -3450,7 +3446,7 @@ static ssize_t ibmvfc_show_host_npiv_version(struct 
device *dev,
 {
struct Scsi_Host *shost = class_to_shost(dev);
struct ibmvfc_host *vhost = shost_priv(shost);
-   return snprintf(buf, PAGE_SIZE, "%d\n", 
be32_to_cpu(vhost->login_buf->resp.version));
+   return sysfs_emit(buf, "%d\n", 
be32_to_cpu(vhost->login_buf->resp.version));
 }

 static ssize_t ibmvfc_show_host_capabilities(struct device *dev,
@@ -3458,7 +3454,7 @@ static ssize_t ibmvfc_show_host_capabilities(struct 
device *dev,
 {
struct Scsi_Host *shost = class_to_shost(dev);
struct ibmvfc_host *vhost = shost_priv(shost);
-   return snprintf(buf, PAGE_SIZE, "%llx\n", 
be64_to_cpu(vhost->login_buf->resp.capabilities));
+   return sysfs_emit(buf, "%llx\n", 
be64_to_cpu(vhost->login_buf->resp.capabilities));
 }

 /**
@@ -3479,7 +3475,7 @@ static ssize_t ibmvfc_show_log_level(struct device *dev,
int len;

spin_lock_irqsave(shost->host_lock, flags);
-   len = snprintf(buf, PAGE_SIZE, "%d\n", vhost->log_level);
+   len = sysfs_emit(buf, "%d\n", vhost->log_level);
spin_unlock_irqrestore(shost->host_lock, flags);
return len;
 }
@@ -3517,7 +3513,7 @@ static ssize_t ibmvfc_show_scsi_channels(struct device 
*dev,
int len;

spin_lock_irqsave(shost->host_lock, flags);
-   len = snprintf(buf, PAGE_SIZE, "%d\n", vhost->client_scsi_channels);
+   len = sysfs_emit(buf, "%d\n", vhost->client_scsi_channels);
spin_unlock_irqrestore(shost->host_lock, flags);
return len;
 }
-- 
2.15.2


Re: [PATCH v5 02/16] powerpc: Override __ALIGN and __ALIGN_STR macros

2022-11-13 Thread Sathvika Vasireddy

Hi Peter,

On 03/11/22 14:18, Peter Zijlstra wrote:

On Wed, Nov 02, 2022 at 12:35:07PM +0000, Christophe Leroy wrote:


Le 28/10/2022 à 16:33, Sathvika Vasireddy a écrit :

In a subsequent patch, we would want to annotate powerpc assembly functions
with SYM_FUNC_START_LOCAL macro. This macro depends on __ALIGN macro.

The default expansion of __ALIGN macro is:
  #define __ALIGN  .align 4,0x90

So, override __ALIGN and __ALIGN_STR macros to use the same alignment as
that of the existing _GLOBAL macro. Also, do not pad with 0x90, because
repeated 0x90s are not a nop or trap on powerpc.

By the way, do we know what the instruction 0x90909090 is on powerpc?
Is it a valid instruction or not?

Please also look at the version that's in tip/x86/core (and next). This
stuff should be gone now.

include/linux/linkage.h now reads like:

#ifndef __ALIGN
#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT
#define __ALIGN_STR __stringify(__ALIGN)
#endif


Since the above-mentioned changes are not yet part of the powerpc/merge 
branch, I am retaining this patch for this merge cycle and will post a 
cleanup patch (to move to using FUNCTION_ALIGNMENT_4B) after the next -rc1.


Thanks,
Sathvika


Re: [PATCH linux next v2] scsi: ibmvfc: use sysfs_emit() to instead of scnprintf()

2022-11-13 Thread Christophe Leroy


Le 14/11/2022 à 04:38, yang.yan...@zte.com.cn a écrit :
> [Vous ne recevez pas souvent de courriers de yang.yan...@zte.com.cn. 
> Découvrez pourquoi ceci est important à 
> https://aka.ms/LearnAboutSenderIdentification ]
> 
> From: Xu Panda 
> 
> Replace the open-code with sysfs_emit() to simplify the code.
> 
> ---
> change for v2
>   - align code
> ---
> Signed-off-by: Xu Panda 
> Signed-off-by: Yang Yang 

Reviewed-by: Christophe Leroy 

> ---
>   drivers/scsi/ibmvscsi/ibmvfc.c | 20 
>   1 file changed, 8 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
> index 1a0c0b7289d2..841e47c94b12 100644
> --- a/drivers/scsi/ibmvscsi/ibmvfc.c
> +++ b/drivers/scsi/ibmvscsi/ibmvfc.c
> @@ -3411,8 +3411,7 @@ static ssize_t ibmvfc_show_host_partition_name(struct 
> device *dev,
>  struct Scsi_Host *shost = class_to_shost(dev);
>  struct ibmvfc_host *vhost = shost_priv(shost);
> 
> -   return snprintf(buf, PAGE_SIZE, "%s\n",
> -   vhost->login_buf->resp.partition_name);
> +   return sysfs_emit(buf, "%s\n", vhost->login_buf->resp.partition_name);
>   }
> 
>   static ssize_t ibmvfc_show_host_device_name(struct device *dev,
> @@ -3421,8 +3420,7 @@ static ssize_t ibmvfc_show_host_device_name(struct 
> device *dev,
>  struct Scsi_Host *shost = class_to_shost(dev);
>  struct ibmvfc_host *vhost = shost_priv(shost);
> 
> -   return snprintf(buf, PAGE_SIZE, "%s\n",
> -   vhost->login_buf->resp.device_name);
> +   return sysfs_emit(buf, "%s\n", vhost->login_buf->resp.device_name);
>   }
> 
>   static ssize_t ibmvfc_show_host_loc_code(struct device *dev,
> @@ -3431,8 +3429,7 @@ static ssize_t ibmvfc_show_host_loc_code(struct device 
> *dev,
>  struct Scsi_Host *shost = class_to_shost(dev);
>  struct ibmvfc_host *vhost = shost_priv(shost);
> 
> -   return snprintf(buf, PAGE_SIZE, "%s\n",
> -   vhost->login_buf->resp.port_loc_code);
> +   return sysfs_emit(buf, "%s\n", vhost->login_buf->resp.port_loc_code);
>   }
> 
>   static ssize_t ibmvfc_show_host_drc_name(struct device *dev,
> @@ -3441,8 +3438,7 @@ static ssize_t ibmvfc_show_host_drc_name(struct device 
> *dev,
>  struct Scsi_Host *shost = class_to_shost(dev);
>  struct ibmvfc_host *vhost = shost_priv(shost);
> 
> -   return snprintf(buf, PAGE_SIZE, "%s\n",
> -   vhost->login_buf->resp.drc_name);
> +   return sysfs_emit(buf, "%s\n", vhost->login_buf->resp.drc_name);
>   }
> 
>   static ssize_t ibmvfc_show_host_npiv_version(struct device *dev,
> @@ -3450,7 +3446,7 @@ static ssize_t ibmvfc_show_host_npiv_version(struct 
> device *dev,
>   {
>  struct Scsi_Host *shost = class_to_shost(dev);
>  struct ibmvfc_host *vhost = shost_priv(shost);
> -   return snprintf(buf, PAGE_SIZE, "%d\n", 
> be32_to_cpu(vhost->login_buf->resp.version));
> +   return sysfs_emit(buf, "%d\n", 
> be32_to_cpu(vhost->login_buf->resp.version));
>   }
> 
>   static ssize_t ibmvfc_show_host_capabilities(struct device *dev,
> @@ -3458,7 +3454,7 @@ static ssize_t ibmvfc_show_host_capabilities(struct 
> device *dev,
>   {
>  struct Scsi_Host *shost = class_to_shost(dev);
>  struct ibmvfc_host *vhost = shost_priv(shost);
> -   return snprintf(buf, PAGE_SIZE, "%llx\n", 
> be64_to_cpu(vhost->login_buf->resp.capabilities));
> +   return sysfs_emit(buf, "%llx\n", 
> be64_to_cpu(vhost->login_buf->resp.capabilities));
>   }
> 
>   /**
> @@ -3479,7 +3475,7 @@ static ssize_t ibmvfc_show_log_level(struct device *dev,
>  int len;
> 
>  spin_lock_irqsave(shost->host_lock, flags);
> -   len = snprintf(buf, PAGE_SIZE, "%d\n", vhost->log_level);
> +   len = sysfs_emit(buf, "%d\n", vhost->log_level);
>  spin_unlock_irqrestore(shost->host_lock, flags);
>  return len;
>   }
> @@ -3517,7 +3513,7 @@ static ssize_t ibmvfc_show_scsi_channels(struct device 
> *dev,
>  int len;
> 
>  spin_lock_irqsave(shost->host_lock, flags);
> -   len = snprintf(buf, PAGE_SIZE, "%d\n", vhost->client_scsi_channels);
> +   len = sysfs_emit(buf, "%d\n", vhost->client_scsi_channels);
>  spin_unlock_irqrestore(shost->host_lock, flags);
>  return len;
>   }
> --
> 2.15.2


[PATCH] powerpc/mce: log the error for all unrecoverable errors

2022-11-13 Thread Ganesh Goudar
machine_check_log_err() is not called for all unrecoverable errors,
so some of these errors are never logged.

Raise irq work in save_mce_event() for unrecoverable errors, so that
the error is logged from the MCE event handling block in the timer
handler.

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/mce.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 6c5d30fba766..a1cb2172eb7b 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -131,6 +131,13 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (mce->error_type == MCE_ERROR_TYPE_UE)
mce->u.ue_error.ignore_event = mce_err->ignore_event;
 
+   /*
+* Raise irq work, So that we don't miss to log the error for
+* unrecoverable errors.
+*/
+   if (mce->disposition == MCE_DISPOSITION_NOT_RECOVERED)
+   mce_irq_work_queue();
+
if (!addr)
return;
 
@@ -235,7 +242,6 @@ static void machine_check_ue_event(struct 
machine_check_event *evt)
   evt, sizeof(*evt));
 
/* Queue work to process this event later. */
-   mce_irq_work_queue();
 }
 
 /*
-- 
2.37.1