[PATCH 2/3] powerpc: replace #include <asm/export.h> with #include <linux/export.h>

2023-08-06 Thread Masahiro Yamada
Commit ddb5cdbafaaa ("kbuild: generate KSYMTAB entries by modpost")
deprecated <asm-generic/export.h>, which is now a wrapper of <linux/export.h>.

Replace #include <asm/export.h> with #include <linux/export.h>.

After all the <asm/export.h> lines are converted, <asm/export.h> and
<asm-generic/export.h> will be removed.

Signed-off-by: Masahiro Yamada 
---

 arch/powerpc/kernel/epapr_hcalls.S  | 2 +-
 arch/powerpc/kernel/fpu.S   | 2 +-
 arch/powerpc/kernel/misc.S  | 2 +-
 arch/powerpc/kernel/misc_32.S   | 2 +-
 arch/powerpc/kernel/misc_64.S   | 2 +-
 arch/powerpc/kernel/tm.S| 2 +-
 arch/powerpc/kernel/trace/ftrace_low.S  | 2 +-
 arch/powerpc/kernel/ucall.S | 2 +-
 arch/powerpc/kernel/vector.S| 2 +-
 arch/powerpc/kvm/book3s_64_entry.S  | 2 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 +-
 arch/powerpc/kvm/tm.S   | 2 +-
 arch/powerpc/lib/checksum_32.S  | 2 +-
 arch/powerpc/lib/checksum_64.S  | 2 +-
 arch/powerpc/lib/copy_32.S  | 2 +-
 arch/powerpc/lib/copy_mc_64.S   | 2 +-
 arch/powerpc/lib/copypage_64.S  | 2 +-
 arch/powerpc/lib/copyuser_64.S  | 2 +-
 arch/powerpc/lib/hweight_64.S   | 2 +-
 arch/powerpc/lib/mem_64.S   | 2 +-
 arch/powerpc/lib/memcmp_32.S| 2 +-
 arch/powerpc/lib/memcmp_64.S| 2 +-
 arch/powerpc/lib/memcpy_64.S| 2 +-
 arch/powerpc/lib/string.S   | 2 +-
 arch/powerpc/lib/string_32.S| 2 +-
 arch/powerpc/lib/string_64.S| 2 +-
 arch/powerpc/lib/strlen_32.S| 2 +-
 arch/powerpc/mm/book3s32/hash_low.S | 2 +-
 arch/powerpc/sysdev/dcr-low.S   | 2 +-
 29 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/kernel/epapr_hcalls.S 
b/arch/powerpc/kernel/epapr_hcalls.S
index 033116e465d0..1a9b5ae8ccb2 100644
--- a/arch/powerpc/kernel/epapr_hcalls.S
+++ b/arch/powerpc/kernel/epapr_hcalls.S
@@ -3,6 +3,7 @@
  * Copyright (C) 2012 Freescale Semiconductor, Inc.
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -12,7 +13,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #ifndef CONFIG_PPC64
 /* epapr_ev_idle() was derived from e500_idle() */
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index f71f2bbd4de6..6a9acfb690c9 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -9,6 +9,7 @@
  *Copyright (C) 1997 Dan Malek (dma...@jlc.net).
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -18,7 +19,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index fb7de3543c03..29e1440d14cc 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -10,11 +10,11 @@
  *
  * setjmp/longjmp code by Paul Mackerras.
  */
+#include 
 #include 
 #include 
 #include 
 #include 
-#include 
 
.text
 
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index daf8f87d2372..2eabb15687a6 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -8,6 +8,7 @@
  *
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -22,7 +23,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
.text
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index 2c9ac70aaf0c..1a8cdafd68e8 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -9,6 +9,7 @@
  * PPC64 updates by Dave Engebretsen (engeb...@us.ibm.com)
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -23,7 +24,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
.text
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 9feab5e0485b..a9cd6507163a 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -6,13 +6,13 @@
  * Copyright 2012 Matt Evans & Michael Neuling, IBM Corporation.
  */
 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
-#include 
 #include 
 
 #ifdef CONFIG_VSX
diff --git a/arch/powerpc/kernel/trace/ftrace_low.S 
b/arch/powerpc/kernel/trace/ftrace_low.S
index 294d1e05958a..5e271f87f799 100644
--- a/arch/powerpc/kernel/trace/ftrace_low.S
+++ b/arch/powerpc/kernel/trace/ftrace_low.S
@@ -3,12 +3,12 @@
  * Split from entry_64.S
  */
 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
-#include 
 
 #ifdef CONFIG_PPC64
 .pushsection ".tramp.ftrace.text","aw",@progbits;
diff --git a/arch/powerpc/kernel/ucall.S b/arch/powerpc/kernel/ucall.S
index 07296bc39166..80a1f9a4300a 100644
--- a/arch/powerpc/kernel/ucall.S
+++ b/arch/powerpc/kernel/ucall.S
@@ -5,8 +5,8 @@
  * Copyright 2019, IBM Corporation.
  *
  */
+#include 
 #include 
-#include 
 
 _GLOBAL(ucall_norets)
 EXPORT_SYMBOL_GPL(ucall_norets)
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index fcc0ad6d9c7b..4094e4c4c77a 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -1,4 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#include 

[PATCH 1/3] powerpc: remove unneeded #include <asm/export.h>

2023-08-06 Thread Masahiro Yamada
There is no EXPORT_SYMBOL line there, hence #include <asm/export.h>
is unneeded.

Signed-off-by: Masahiro Yamada 
---

 arch/powerpc/kernel/entry_32.S  | 1 -
 arch/powerpc/kernel/head_40x.S  | 1 -
 arch/powerpc/kernel/head_44x.S  | 1 -
 arch/powerpc/kernel/head_64.S   | 1 -
 arch/powerpc/kernel/head_85xx.S | 1 -
 arch/powerpc/kernel/head_8xx.S  | 1 -
 arch/powerpc/kernel/head_book3s_32.S| 1 -
 arch/powerpc/kernel/trace/ftrace_64_pg.S| 1 -
 arch/powerpc/kernel/trace/ftrace_mprofile.S | 1 -
 9 files changed, 9 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index fe27d41f9a3d..9692acb0361f 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -29,7 +29,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S
index 3f68a1624646..b32e7b2ebdcf 100644
--- a/arch/powerpc/kernel/head_40x.S
+++ b/arch/powerpc/kernel/head_40x.S
@@ -38,7 +38,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #include "head_32.h"
 
diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index 63a85c16fef4..a3197c9f721c 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -35,7 +35,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include "head_booke.h"
 
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 6440b1bb332a..4690c219bfa4 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -40,7 +40,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #ifdef CONFIG_PPC_BOOK3S
 #include 
diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S
index fdbee1093e2b..97e9ea0c7297 100644
--- a/arch/powerpc/kernel/head_85xx.S
+++ b/arch/powerpc/kernel/head_85xx.S
@@ -40,7 +40,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include "head_booke.h"
 
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index a79751e05781..647b0b445e89 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -29,7 +29,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
diff --git a/arch/powerpc/kernel/head_book3s_32.S 
b/arch/powerpc/kernel/head_book3s_32.S
index c51f28b5abc0..6764b98ca360 100644
--- a/arch/powerpc/kernel/head_book3s_32.S
+++ b/arch/powerpc/kernel/head_book3s_32.S
@@ -31,7 +31,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.S 
b/arch/powerpc/kernel/trace/ftrace_64_pg.S
index 6708e24db0ab..cdbcb5a0783b 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_pg.S
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.S
@@ -8,7 +8,6 @@
 #include 
 #include 
 #include 
-#include 
 
 _GLOBAL_TOC(ftrace_caller)
lbz r3, PACA_FTRACE_ENABLED(r13)
diff --git a/arch/powerpc/kernel/trace/ftrace_mprofile.S 
b/arch/powerpc/kernel/trace/ftrace_mprofile.S
index 1f7d86de1538..600406716d66 100644
--- a/arch/powerpc/kernel/trace/ftrace_mprofile.S
+++ b/arch/powerpc/kernel/trace/ftrace_mprofile.S
@@ -8,7 +8,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
-- 
2.39.2



[PATCH 3/3] powerpc: remove <asm/export.h>

2023-08-06 Thread Masahiro Yamada
All *.S files under arch/powerpc/ have been converted to include
<linux/export.h> instead of <asm/export.h>.

Remove <asm/export.h>.

Signed-off-by: Masahiro Yamada 
---

 arch/powerpc/include/asm/Kbuild | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 419319c4963c..61a8dcd7 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -3,7 +3,6 @@ generated-y += syscall_table_32.h
 generated-y += syscall_table_64.h
 generated-y += syscall_table_spu.h
 generic-y += agp.h
-generic-y += export.h
 generic-y += kvm_types.h
 generic-y += mcs_spinlock.h
 generic-y += qrwlock.h
-- 
2.39.2



Re: [PATCH net-next v2 10/10] net: fs_enet: Use cpm_muram_xxx() functions instead of cpm_dpxxx() macros

2023-08-06 Thread Simon Horman
On Fri, Aug 04, 2023 at 03:30:20PM +0200, Christophe Leroy wrote:
> cpm_dpxxx() macros are now always referring to cpm_muram_xxx() fonctions

nit: fonctions -> functions

Thanks Christophe,

This minor nit notwithstanding, this series looks good to me.
I'll send a reviewed-by tag for the whole series in response
to the cover letter.

...


Re: [PATCH net-next v2 00/10] net: fs_enet: Driver cleanup

2023-08-06 Thread Simon Horman
On Fri, Aug 04, 2023 at 03:30:10PM +0200, Christophe Leroy wrote:
> Over the years, platform and driver initialisation have evolved into
> more generic ways, and driver or platform specific stuff has gone
> away, leaving stale objects behind.
> 
> This series aims at cleaning all that up for fs_enet ethernet driver.
> 
> Changes in v2:
> - Remove a trailing whitespace in the old struct moved in patch 7.
> - Include powerpc people and list that I forgot when sending v1
> (and Rob as expected by Patchwork for patch 6, not sure why)

Thanks, this looks good to me.

For the series,

Reviewed-by: Simon Horman 



Re: [PATCH] floppy: ERROR: that open brace { should be on the previous line

2023-08-06 Thread Bagas Sanjaya
On Fri, Aug 04, 2023 at 04:05:09PM +, Christophe Leroy wrote:
> Hello,
> 
> Le 20/07/2023 à 12:17, zhangyongle...@208suo.com a écrit :
> > [Vous ne recevez pas souvent de courriers de zhangyongle...@208suo.com. 
> > D?couvrez pourquoi ceci est important ? 
> > https://aka.ms/LearnAboutSenderIdentification ]
> > 
> > Fix twoce occurrences of the checkpatch.pl error:
> > ERROR: that open brace { should be on the previous line
> 
> 
> Can you please explain the purpose of those changes ? Do you use some 
> tools that get disturbed by such cosmetic errors ? Otherwise what is 
> your reason ?

Hi,

208suo.com people do checkpatch fixes (that is, they run
scripts/checkpatch.pl -f <file> then try to make the script
happy). Steven warned them not to submit such patches again [1] but
they keep spamming maintainers with checkpatch patches (ignoring the review
warning). I voiced this concern when reviewing one of their patches and
Jani replied that such one-way interaction with kernel community is
detrimental [2].

The exact same situation happened last year involving developers from
cdjrlc.com domain. They also did trivial patches, including mostly
(and notoriously known for) redundant word stripping. While some of these
patches were accepted, others were not with reviews requesting changes in
v2, yet they also ignored reviews. In fact, in the early waves of 208suo.com
patches, they used the same email infra as 208suo.com people and they sent
patches as HTML emails (which were rejected by mailing lists obviously)
so that the latter people have to send their patches on their behalf
(but corrupted since 208suo.com people used Roundcube instead of
git-send-email(1)).

Regarding 208suo.com's mail infra, after I pointed out this [3], they
changed the infra so that patches sent didn't get corrupted. Thus, they did
listen in regard of tooling and infra changes, but they deliberately
don't answer code reviews.

Thanks.

[1]: https://lore.kernel.org/lkml/20230720134501.01f9f...@gandalf.local.home/
[2]: https://lore.kernel.org/lkml/87cz07vvwu@intel.com/
[3]: https://lore.kernel.org/lkml/zjk7sc4i+mk98k%...@debian.me/

> 
> We don't accept such standelone minor cosmetic changes at the first 
> place because it looks like a waste of time.

PS: And in fact, complicating stable backports...

-- 
An old man doll... just what I always wanted! - Clara


signature.asc
Description: PGP signature


[PATCH v2] powerpc: Use shared font data

2023-08-06 Thread linux
From: "Dr. David Alan Gilbert" 

PowerPC has a 'btext' font used for the console which is almost identical
to the shared font_sun8x16, so use it rather than duplicating the data.

They were actually identical until about a decade ago when
   commit bcfbeecea11c ("drivers: console: font_: Change a glyph from
"broken bar" to "vertical line"")

which changed the | in the shared font to be a solid
bar rather than a broken bar.  That's the only difference.

This was originally spotted by PMD which noticed that sparc does
the same thing with the same data, and they also share a bunch
of functions to manipulate the data.  I've previously posted a near
identical patch for sparc.

One difference I notice in PowerPC is that there are a bunch of compile
options for the .c files for the early code to avoid a bunch of security
compilation features;  it's not clear to me if this is a problem for
this font data.

Tested very lightly with a boot without FS in qemu.

v2
  Added 'select FONT_SUPPORT' (to stop modconfig causing the font to be
   linked into a module rather than the main kernel)
  Added 'select FONTS' to satisfy requirements in lib/fonts

Signed-off-by: Dr. David Alan Gilbert 
---
 arch/powerpc/Kconfig.debug  |   3 +
 arch/powerpc/kernel/btext.c | 360 +---
 2 files changed, 9 insertions(+), 354 deletions(-)

diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 2a54fadbeaf51..521c4baf30e88 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -147,6 +147,9 @@ config BDI_SWITCH
 config BOOTX_TEXT
bool "Support for early boot text console (BootX or OpenFirmware only)"
depends on PPC_BOOK3S
+   select FONT_SUN8x16
+   select FONT_SUPPORT
+   select FONTS
help
  Say Y here to see progress messages from the boot firmware in text
  mode. Requires either BootX or Open Firmware.
diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c
index 19e46fd623b0d..7f63f1cdc6c39 100644
--- a/arch/powerpc/kernel/btext.c
+++ b/arch/powerpc/kernel/btext.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -41,10 +42,6 @@ static unsigned char *logicalDisplayBase __force_data;
 
 unsigned long disp_BAT[2] __initdata = {0, 0};
 
-#define cmapsz (16*256)
-
-static unsigned char vga_font[cmapsz];
-
 static int boot_text_mapped __force_data;
 
 extern void rmci_on(void);
@@ -407,7 +404,7 @@ static unsigned int expand_bits_16[4] = {
 };
 
 
-static void draw_byte_32(unsigned char *font, unsigned int *base, int rb)
+static void draw_byte_32(const unsigned char *font, unsigned int *base, int rb)
 {
int l, bits;
int fg = 0xUL;
@@ -428,7 +425,7 @@ static void draw_byte_32(unsigned char *font, unsigned int 
*base, int rb)
}
 }
 
-static inline void draw_byte_16(unsigned char *font, unsigned int *base, int 
rb)
+static inline void draw_byte_16(const unsigned char *font, unsigned int *base, 
int rb)
 {
int l, bits;
int fg = 0xUL;
@@ -446,7 +443,7 @@ static inline void draw_byte_16(unsigned char *font, 
unsigned int *base, int rb)
}
 }
 
-static inline void draw_byte_8(unsigned char *font, unsigned int *base, int rb)
+static inline void draw_byte_8(const unsigned char *font, unsigned int *base, 
int rb)
 {
int l, bits;
int fg = 0x0F0F0F0FUL;
@@ -465,7 +462,8 @@ static inline void draw_byte_8(unsigned char *font, 
unsigned int *base, int rb)
 static noinline void draw_byte(unsigned char c, long locX, long locY)
 {
unsigned char *base = calc_base(locX << 3, locY << 4);
-   unsigned char *font = &vga_font[((unsigned int)c) * 16];
+   unsigned int font_index = c * 16;
+   const unsigned char *font   = font_sun_8x16.data + font_index;
int rb  = dispDeviceRowBytes;
 
rmci_maybe_on();
@@ -583,349 +581,3 @@ void __init udbg_init_btext(void)
 */
udbg_putc = btext_drawchar;
 }
-
-static unsigned char vga_font[cmapsz] = {
-0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0x81, 0xa5, 0x81, 0x81, 0xbd,
-0x99, 0x81, 0x81, 0x7e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7e, 0xff,
-0xdb, 0xff, 0xff, 0xc3, 0xe7, 0xff, 0xff, 0x7e, 0x00, 0x00, 0x00, 0x00,
-0x00, 0x00, 0x00, 0x00, 0x6c, 0xfe, 0xfe, 0xfe, 0xfe, 0x7c, 0x38, 0x10,
-0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x38, 0x7c, 0xfe,
-0x7c, 0x38, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18,
-0x3c, 0x3c, 0xe7, 0xe7, 0xe7, 0x18, 0x18, 0x3c, 0x00, 0x00, 0x00, 0x00,
-0x00, 0x00, 0x00, 0x18, 0x3c, 0x7e, 0xff, 0xff, 0x7e, 0x18, 0x18, 0x3c,
-0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x3c,
-0x3c, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
-0xff, 0xff, 0xe7, 0xc3, 0xc3, 0xe7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-0x00, 0x00

[PATCH v3 0/6] KVM: PPC: Nested APIv2 guest support

2023-08-06 Thread Jordan Niethe
A nested-HV API for PAPR has been developed based on the KVM-specific
nested-HV API that is upstream in Linux/KVM and QEMU. The PAPR API had
to break compatibility to accommodate implementation in other
hypervisors and partitioning firmware. The existing KVM-specific API
will be known as the Nested APIv1 and the PAPR API will be known as the
Nested APIv2. 

The control flow and interrupt processing between L0, L1, and L2 in
the Nested APIv2 are conceptually unchanged. Where Nested APIv1 is almost
stateless, the Nested APIv2 is stateful, with the L1 registering L2 virtual
machines and vCPUs with the L0. Supervisor-privileged register switching
duty is now the responsibility of the L0, which holds canonical L2
register state and handles all switching. This new register handling
motivates the "getters and setters" wrappers to assist in syncing the
L2's state in the L1 and the L0.

Broadly, the new hcalls will be used for creating and managing guests
by a regular partition in the following way:

  - L1 and L0 negotiate capabilities with
H_GUEST_{G,S}ET_CAPABILITIES

  - L1 requests the L0 create a L2 with
H_GUEST_CREATE and receives a handle to use in future hcalls

  - L1 requests the L0 create a L2 vCPU with
H_GUEST_CREATE_VCPU

  - L1 sets up the L2 using H_GUEST_SET and the
H_GUEST_VCPU_RUN input buffer

  - L1 requests the L0 runs the L2 vCPU using H_GUEST_VCPU_RUN

  - L2 returns to L1 with an exit reason and L1 reads the
H_GUEST_VCPU_RUN output buffer populated by the L0

  - L1 handles the exit using H_GET_STATE if necessary

  - L1 reruns L2 vCPU with H_GUEST_VCPU_RUN

  - L1 frees the L2 in the L0 with H_GUEST_DELETE

Further details are available in Documentation/powerpc/kvm-nested.rst.

This series adds KVM support for using this hcall interface as a regular
PAPR partition, i.e. the L1. It does not add support for running as the
L0.

The new hcalls have been implemented in the spapr qemu model for
testing.

This is available at https://github.com/planetharsh/qemu/tree/upstream-0714-kop

There are scripts available to assist in setting up an environment for
testing nested guests at https://github.com/iamjpn/kvm-powervm-test

A tree with this series is available at
https://github.com/iamjpn/linux/tree/features/kvm-nestedv2-v3

Thanks to Amit Machhiwal, Kautuk Consul, Vaibhav Jain, Michael Neuling,
Shivaprasad Bhat, Harsh Prateek Bora, Paul Mackerras and Nicholas
Piggin. 

Change overview in v3:
  - KVM: PPC: Use getters and setters for vcpu register state
  - Do not add a helper for pvr
  - Use an expression when declaring variable in case
  - Squash in all getters and setters
  - Pass vector registers by reference
  - KVM: PPC: Rename accessor generator macros
  - New to series
  - KVM: PPC: Add helper library for Guest State Buffers
  - Use EXPORT_SYMBOL_GPL()
  - Use the kvmppc namespace
  - Move kvmppc_gsb_reset() out of kvmppc_gsm_fill_info()
  - Comments for GSID elements
  - Pass vector elements by reference
  - Remove generic put and get functions
  - KVM: PPC: Book3s HV: Hold LPIDs in an unsigned long
  - New to series
  - KVM: PPC: Add support for nestedv2 guests
  - Use EXPORT_SYMBOL_GPL()
  - Change to kvmhv_nestedv2 namespace
  - Make kvmhv_enable_nested() return -ENODEV on NESTEDv2 L1 hosts
  - s/kvmhv_on_papr/kvmhv_is_nestedv2/
  - mv book3s_hv_papr.c book3s_hv_nestedv2.c
  - Handle shared regs without a guest state id in the same wrapper
  - Use a static key for API version
  - Add a positive test for NESTEDv1
  - Give the amor a static value
  - s/struct kvmhv_nestedv2_host/struct kvmhv_nestedv2_io/
  - Propagate failure in kvmhv_vcpu_entry_nestedv2()
  - WARN if getters and setters fail
  - Propagate failure from kvmhv_nestedv2_parse_output()
  - Replace delay with sleep in plpar_guest_{create,delete,create_vcpu}()
  - Add logical PVR handling
  - Replace kvmppc_gse_{get,put} with specific version
  - docs: powerpc: Document nested KVM on POWER
  - Fix typos


Change overview in v2:
  - Rebase on top of kvm ppc prefix instruction support
  - Make documentation an individual patch
  - Move guest state buffer files from arch/powerpc/lib/ to
arch/powerpc/kvm/
  - Use kunit for testing guest state buffer
  - Fix some build errors
  - Change HEIR element from 4 bytes to 8 bytes

Previous revisions:

  - v1: 
https://lore.kernel.org/linuxppc-dev/20230508072332.2937883-1-...@linux.vnet.ibm.com/
  - v2: 
https://lore.kernel.org/linuxppc-dev/20230605064848.12319-1-...@linux.vnet.ibm.com/
 

Jordan Niethe (5):
  KVM: PPC: Use getters and setters for vcpu register state
  KVM: PPC: Rename accessor generator macros
  KVM: PPC: Add helper library for Guest State Buffers
  KVM: PPC: Book3s HV: Hold LPIDs in an unsigned long
  KVM: PPC: Add support for nestedv2 guests

Michael Neuling (1):
  docs: powerpc: Document nested KVM on POWER

 Documentation/powerpc/

[PATCH v3 1/6] KVM: PPC: Use getters and setters for vcpu register state

2023-08-06 Thread Jordan Niethe
There are already some getter and setter functions used for accessing
vcpu register state, e.g. kvmppc_get_pc(). There are also more
complicated examples that are generated by macros like
kvmppc_get_sprg0() which are generated by the SHARED_SPRNG_WRAPPER()
macro.

In the new PAPR "Nestedv2" API for nested guest partitions the L1 is
required to communicate with the L0 to modify and read nested guest
state.

Prepare to support this by replacing direct accesses to vcpu register
state with wrapper functions. Follow the existing pattern of using
macros to generate individual wrappers. These wrappers will
be augmented for supporting Nestedv2 guests later.

Signed-off-by: Gautam Menghani 
Signed-off-by: Jordan Niethe 
---
v3:
  - Do not add a helper for pvr
  - Use an expression when declaring variable in case
  - Squash in all getters and setters
  - Gautam: Pass vector registers by reference
---
 arch/powerpc/include/asm/kvm_book3s.h  | 123 +-
 arch/powerpc/include/asm/kvm_booke.h   |  10 ++
 arch/powerpc/kvm/book3s.c  |  38 ++---
 arch/powerpc/kvm/book3s_64_mmu_hv.c|   4 +-
 arch/powerpc/kvm/book3s_64_mmu_radix.c |   9 +-
 arch/powerpc/kvm/book3s_64_vio.c   |   4 +-
 arch/powerpc/kvm/book3s_hv.c   | 220 +
 arch/powerpc/kvm/book3s_hv.h   |  58 +++
 arch/powerpc/kvm/book3s_hv_builtin.c   |  10 +-
 arch/powerpc/kvm/book3s_hv_p9_entry.c  |   4 +-
 arch/powerpc/kvm/book3s_hv_ras.c   |   5 +-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c|   8 +-
 arch/powerpc/kvm/book3s_hv_rm_xics.c   |   4 +-
 arch/powerpc/kvm/book3s_xive.c |   9 +-
 arch/powerpc/kvm/emulate_loadstore.c   |   2 +-
 arch/powerpc/kvm/powerpc.c |  76 -
 16 files changed, 395 insertions(+), 189 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index bbf5e2c5fe09..1a7e837ea2d5 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -392,6 +392,16 @@ static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
return vcpu->arch.regs.nip;
 }
 
+static inline void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 val)
+{
+   vcpu->arch.pid = val;
+}
+
+static inline u32 kvmppc_get_pid(struct kvm_vcpu *vcpu)
+{
+   return vcpu->arch.pid;
+}
+
 static inline u64 kvmppc_get_msr(struct kvm_vcpu *vcpu);
 static inline bool kvmppc_need_byteswap(struct kvm_vcpu *vcpu)
 {
@@ -403,10 +413,121 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu 
*vcpu)
return vcpu->arch.fault_dar;
 }
 
+static inline u64 kvmppc_get_fpr(struct kvm_vcpu *vcpu, int i)
+{
+   return vcpu->arch.fp.fpr[i][TS_FPROFFSET];
+}
+
+static inline void kvmppc_set_fpr(struct kvm_vcpu *vcpu, int i, u64 val)
+{
+   vcpu->arch.fp.fpr[i][TS_FPROFFSET] = val;
+}
+
+static inline u64 kvmppc_get_fpscr(struct kvm_vcpu *vcpu)
+{
+   return vcpu->arch.fp.fpscr;
+}
+
+static inline void kvmppc_set_fpscr(struct kvm_vcpu *vcpu, u64 val)
+{
+   vcpu->arch.fp.fpscr = val;
+}
+
+
+static inline u64 kvmppc_get_vsx_fpr(struct kvm_vcpu *vcpu, int i, int j)
+{
+   return vcpu->arch.fp.fpr[i][j];
+}
+
+static inline void kvmppc_set_vsx_fpr(struct kvm_vcpu *vcpu, int i, int j,
+ u64 val)
+{
+   vcpu->arch.fp.fpr[i][j] = val;
+}
+
+#ifdef CONFIG_VSX
+static inline void kvmppc_get_vsx_vr(struct kvm_vcpu *vcpu, int i, vector128 
*v)
+{
+   *v =  vcpu->arch.vr.vr[i];
+}
+
+static inline void kvmppc_set_vsx_vr(struct kvm_vcpu *vcpu, int i,
+vector128 *val)
+{
+   vcpu->arch.vr.vr[i] = *val;
+}
+
+static inline u32 kvmppc_get_vscr(struct kvm_vcpu *vcpu)
+{
+   return vcpu->arch.vr.vscr.u[3];
+}
+
+static inline void kvmppc_set_vscr(struct kvm_vcpu *vcpu, u32 val)
+{
+   vcpu->arch.vr.vscr.u[3] = val;
+}
+#endif
+
+#define KVMPPC_BOOK3S_VCPU_ACCESSOR_SET(reg, size) \
+static inline void kvmppc_set_##reg(struct kvm_vcpu *vcpu, u##size val)
\
+{  \
+   \
+   vcpu->arch.reg = val;   \
+}
+
+#define KVMPPC_BOOK3S_VCPU_ACCESSOR_GET(reg, size) \
+static inline u##size kvmppc_get_##reg(struct kvm_vcpu *vcpu)  \
+{  \
+   return vcpu->arch.reg;  \
+}
+
+#define KVMPPC_BOOK3S_VCPU_ACCESSOR(reg, size) \
+   KVMPPC_BOOK3S_VCPU_ACCESSOR_SET(reg, size)  \
+   KVMPPC_BOOK3S_VCPU_ACCESSOR_GET(reg, size)  \
+
+KVMPPC_BOOK3S_VCPU_ACCESSOR(tar, 64)
+KVMPPC_BOOK3S_VCPU_ACCESSOR(ebbhr, 64)
+KVMPPC_BOOK3S_VCPU_ACCESSOR(ebbrr, 64)
+KVMPPC_BOOK3S_VCPU_ACCESSOR(bescr, 64)
+KVMPPC_BOOK3S_VCPU_ACCESSOR(ic, 64)
+K

[PATCH v3 2/6] KVM: PPC: Rename accessor generator macros

2023-08-06 Thread Jordan Niethe
More "wrapper" style accessor generating macros will be introduced for
the nestedv2 guest support. Rename the existing macros with more
descriptive names now so there is a consistent naming convention.

Signed-off-by: Jordan Niethe 
---
v3:
  - New to series
---
 arch/powerpc/include/asm/kvm_ppc.h | 60 +++---
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index d16d80ad2ae4..b66084a81dd0 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -927,19 +927,19 @@ static inline bool kvmppc_shared_big_endian(struct 
kvm_vcpu *vcpu)
 #endif
 }
 
-#define SPRNG_WRAPPER_GET(reg, bookehv_spr)\
+#define KVMPPC_BOOKE_HV_SPRNG_ACESSOR_GET(reg, bookehv_spr)\
 static inline ulong kvmppc_get_##reg(struct kvm_vcpu *vcpu)\
 {  \
return mfspr(bookehv_spr);  \
 }  \
 
-#define SPRNG_WRAPPER_SET(reg, bookehv_spr)\
+#define KVMPPC_BOOKE_HV_SPRNG_ACESSOR_SET(reg, bookehv_spr)\
 static inline void kvmppc_set_##reg(struct kvm_vcpu *vcpu, ulong val)  \
 {  \
mtspr(bookehv_spr, val);
\
 }  \
 
-#define SHARED_WRAPPER_GET(reg, size)  \
+#define KVMPPC_VCPU_SHARED_REGS_ACESSOR_GET(reg, size) \
 static inline u##size kvmppc_get_##reg(struct kvm_vcpu *vcpu)  \
 {  \
if (kvmppc_shared_big_endian(vcpu)) \
@@ -948,7 +948,7 @@ static inline u##size kvmppc_get_##reg(struct kvm_vcpu 
*vcpu)   \
   return le##size##_to_cpu(vcpu->arch.shared->reg);\
 }  \
 
-#define SHARED_WRAPPER_SET(reg, size)  \
+#define KVMPPC_VCPU_SHARED_REGS_ACESSOR_SET(reg, size) \
 static inline void kvmppc_set_##reg(struct kvm_vcpu *vcpu, u##size val)
\
 {  \
if (kvmppc_shared_big_endian(vcpu)) \
@@ -957,36 +957,36 @@ static inline void kvmppc_set_##reg(struct kvm_vcpu 
*vcpu, u##size val)   \
   vcpu->arch.shared->reg = cpu_to_le##size(val);   \
 }  \
 
-#define SHARED_WRAPPER(reg, size)  \
-   SHARED_WRAPPER_GET(reg, size)   \
-   SHARED_WRAPPER_SET(reg, size)   \
+#define KVMPPC_VCPU_SHARED_REGS_ACESSOR(reg, size) 
\
+   KVMPPC_VCPU_SHARED_REGS_ACESSOR_GET(reg, size)  
\
+   KVMPPC_VCPU_SHARED_REGS_ACESSOR_SET(reg, size)  
\
 
-#define SPRNG_WRAPPER(reg, bookehv_spr)
\
-   SPRNG_WRAPPER_GET(reg, bookehv_spr) \
-   SPRNG_WRAPPER_SET(reg, bookehv_spr) \
+#define KVMPPC_BOOKE_HV_SPRNG_ACESSOR(reg, bookehv_spr)
\
+   KVMPPC_BOOKE_HV_SPRNG_ACESSOR_GET(reg, bookehv_spr) 
\
+   KVMPPC_BOOKE_HV_SPRNG_ACESSOR_SET(reg, bookehv_spr) 
\
 
 #ifdef CONFIG_KVM_BOOKE_HV
 
-#define SHARED_SPRNG_WRAPPER(reg, size, bookehv_spr)   \
-   SPRNG_WRAPPER(reg, bookehv_spr) \
+#define KVMPPC_BOOKE_HV_SPRNG_OR_VCPU_SHARED_REGS_ACCESSOR(reg, size, 
bookehv_spr) \
+   KVMPPC_BOOKE_HV_SPRNG_ACESSOR(reg, bookehv_spr) \
 
 #else
 
-#define SHARED_SPRNG_WRAPPER(reg, size, bookehv_spr)   \
-   SHARED_WRAPPER(reg, size)   \
+#define KVMPPC_BOOKE_HV_SPRNG_OR_VCPU_SHARED_REGS_ACCESSOR(reg, size, 
bookehv_spr) \
+   KVMPPC_VCPU_SHARED_REGS_ACESSOR(reg, size)  \
 
 #endif
 
-SHARED_WRAPPER(critical, 64)
-SHARED_SPRNG_WRAPPER(sprg0, 64, SPRN_GSPRG0)
-SHARED_SPRNG_WRAPPER(sprg1, 64, SPRN_GSPRG1)
-SHARED_SPRNG_WRAPPER(sprg2, 64, SPRN_GSPRG2)
-SHARED_SPRNG_WRAPPER(sprg3, 64, SPRN_GSPRG3)
-SHARED_SPRNG_WRAPPER(srr0, 64, SPRN_GSRR0)
-SHARED_SPRNG_WRAPPER(srr1, 64, SPRN_GSRR1)
-SHARED_SPRNG_WRAPPER(dar, 64, SPRN_GDEAR)
-SHARED_SPRNG_WRAPPER(esr, 64, SPRN_GESR)
-SHARED_WRAPPER_GET(msr, 64)
+KVMPPC_VCPU_SHARED_REGS_ACESSOR(critical, 64)

[PATCH v3 3/6] KVM: PPC: Add helper library for Guest State Buffers

2023-08-06 Thread Jordan Niethe
The PAPR "Nestedv2" guest API introduces the concept of a Guest State
Buffer for communication about L2 guests between L1 and L0 hosts.

In the new API, the L0 manages the L2 on behalf of the L1. This means
that if the L1 needs to change L2 state (e.g. GPRs, SPRs, partition
table...), it must request the L0 perform the modification. If the
nested host needs to read L2 state, likewise this request must
go through the L0.

The Guest State Buffer is a Type-Length-Value style data format defined
in the PAPR which assigns all relevant partition state a unique
identity. Unlike a typical TLV format the length is redundant as the
length of each identity is fixed but is included for checking
correctness.

A guest state buffer consists of an element count followed by a stream
of elements, where elements are composed of an ID number, data length,
then the data:

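  Header:

   <---4 bytes--->
  +----------------+-----
  | Element Count  | Elements...
  +----------------+-----

  Element:

   <----2 bytes---> <-2 bytes-> <-Length bytes->
  +----------------+-----------+----------------+
  | Guest State ID |  Length   |  Data          |
  +----------------+-----------+----------------+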

Guest State IDs have other attributes defined in the PAPR such as
whether they are per thread or per guest, or read-only.

Introduce a library for using guest state buffers. This includes support
for actions such as creating buffers, adding elements to buffers,
reading the value of elements and parsing buffers. This will be used
later by the nestedv2 guest support.

Signed-off-by: Jordan Niethe 
---
v2:
  - Add missing #ifdef CONFIG_VSXs
  - Move files from lib/ to kvm/
  - Guard compilation on CONFIG_KVM_BOOK3S_HV_POSSIBLE
  - Use kunit for guest state buffer tests
  - Add configuration option for the tests
  - Use macros for contiguous id ranges like GPRs
  - Add some missing EXPORTs to functions
  - HEIR element is a double word not a word
v3:
  - Use EXPORT_SYMBOL_GPL()
  - Use the kvmppc namespace
  - Move kvmppc_gsb_reset() out of kvmppc_gsm_fill_info()
  - Comments for GSID elements
  - Pass vector elements by reference
  - Remove generic put and get functions
---
 arch/powerpc/Kconfig.debug|  12 +
 arch/powerpc/include/asm/guest-state-buffer.h | 904 ++
 arch/powerpc/kvm/Makefile |   3 +
 arch/powerpc/kvm/guest-state-buffer.c | 571 +++
 arch/powerpc/kvm/test-guest-state-buffer.c| 328 +++
 5 files changed, 1818 insertions(+)
 create mode 100644 arch/powerpc/include/asm/guest-state-buffer.h
 create mode 100644 arch/powerpc/kvm/guest-state-buffer.c
 create mode 100644 arch/powerpc/kvm/test-guest-state-buffer.c

diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 2a54fadbeaf5..339c3a5f56f1 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -82,6 +82,18 @@ config MSI_BITMAP_SELFTEST
bool "Run self-tests of the MSI bitmap code"
depends on DEBUG_KERNEL
 
+config GUEST_STATE_BUFFER_TEST
+   def_tristate n
+   prompt "Enable Guest State Buffer unit tests"
+   depends on KUNIT
+   depends on KVM_BOOK3S_HV_POSSIBLE
+   default KUNIT_ALL_TESTS
+   help
+ The Guest State Buffer is a data format specified in the PAPR.
+ It is used by hcalls to communicate the state of L2 guests between
+ the L1 and L0 hypervisors. Enable unit tests for the library
+ used to create and use guest state buffers.
+
 config PPC_IRQ_SOFT_MASK_DEBUG
bool "Include extra checks for powerpc irq soft masking"
depends on PPC64
diff --git a/arch/powerpc/include/asm/guest-state-buffer.h 
b/arch/powerpc/include/asm/guest-state-buffer.h
new file mode 100644
index ..aaefe1075fc4
--- /dev/null
+++ b/arch/powerpc/include/asm/guest-state-buffer.h
@@ -0,0 +1,904 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Interface based on include/net/netlink.h
+ */
+#ifndef _ASM_POWERPC_GUEST_STATE_BUFFER_H
+#define _ASM_POWERPC_GUEST_STATE_BUFFER_H
+
+#include 
+#include 
+#include 
+
+/**
+ * Guest State Buffer Constants
+ **/
+/* Element without a value and any length */
+#define KVMPPC_GSID_BLANK  0x
+/* Size required for the L0's internal VCPU representation */
+#define KVMPPC_GSID_HOST_STATE_SIZE0x0001
+ /* Minimum size for the H_GUEST_RUN_VCPU output buffer */
+#define KVMPPC_GSID_RUN_OUTPUT_MIN_SIZE0x0002
+ /* "Logical" PVR value as defined in the PAPR */
+#define KVMPPC_GSID_LOGICAL_PVR0x0003
+ /* L0 relative timebase offset */
+#define KVMPPC_GSID_TB_OFFSET  0x0004
+ /* Partition Scoped Page Table Info */
+#define KVMPPC_GSID_PARTITION_TABLE0x0005
+ /* Process Table Info */
+#define KVMPPC_GSID_PROCESS_TABLE   

[PATCH v3 4/6] KVM: PPC: Book3s HV: Hold LPIDs in an unsigned long

2023-08-06 Thread Jordan Niethe
The LPID register is 32 bits long. The host keeps the lpids for each
guest in an unsigned word in struct kvm_arch. Currently, LPIDs are already
limited by mmu_lpid_bits and KVM_MAX_NESTED_GUESTS_SHIFT.

The nestedv2 API returns a 64 bit "Guest ID" to be used by the L1 host
for each L2 guest. This value is used as an lpid, e.g. it is the
parameter used by H_RPT_INVALIDATE. To minimize needless special casing
it makes sense to keep this "Guest ID" in struct kvm_arch::lpid.

This means that struct kvm_arch::lpid is too small so prepare for this
and make it an unsigned long. This is not a problem for the KVM-HV and
nestedv1 cases as their lpid values are already limited to valid ranges
so in those contexts the lpid can be used as an unsigned word safely as
needed.

In the PAPR, the H_RPT_INVALIDATE pid/lpid parameter is already
specified as an unsigned long so change pseries_rpt_invalidate() to
match that.  Update the callers of pseries_rpt_invalidate() to also take
an unsigned long if they take an lpid value.

Signed-off-by: Jordan Niethe 
---
v3:
  - New to series
---
 arch/powerpc/include/asm/kvm_book3s.h | 10 +-
 arch/powerpc/include/asm/kvm_book3s_64.h  |  2 +-
 arch/powerpc/include/asm/kvm_host.h   |  2 +-
 arch/powerpc/include/asm/plpar_wrappers.h |  4 ++--
 arch/powerpc/kvm/book3s_64_mmu_hv.c   |  2 +-
 arch/powerpc/kvm/book3s_64_mmu_radix.c| 22 +++---
 arch/powerpc/kvm/book3s_hv_nested.c   |  4 ++--
 arch/powerpc/kvm/book3s_xive.c|  4 ++--
 8 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 1a7e837ea2d5..98d4870ec4b3 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -191,14 +191,14 @@ extern int kvmppc_mmu_radix_translate_table(struct 
kvm_vcpu *vcpu, gva_t eaddr,
 extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *gpte, bool data, bool iswrite);
 extern void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
-   unsigned int pshift, unsigned int lpid);
+   unsigned int pshift, unsigned long lpid);
 extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
unsigned int shift,
const struct kvm_memory_slot *memslot,
-   unsigned int lpid);
+   unsigned long lpid);
 extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested,
bool writing, unsigned long gpa,
-   unsigned int lpid);
+   unsigned long lpid);
 extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
unsigned long gpa,
struct kvm_memory_slot *memslot,
@@ -207,7 +207,7 @@ extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu 
*vcpu,
 extern int kvmppc_init_vm_radix(struct kvm *kvm);
 extern void kvmppc_free_radix(struct kvm *kvm);
 extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
- unsigned int lpid);
+ unsigned long lpid);
 extern int kvmppc_radix_init(void);
 extern void kvmppc_radix_exit(void);
 extern void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
@@ -300,7 +300,7 @@ void kvmhv_nested_exit(void);
 void kvmhv_vm_nested_init(struct kvm *kvm);
 long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
 long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu);
-void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
+void kvmhv_set_ptbl_entry(unsigned long lpid, u64 dw0, u64 dw1);
 void kvmhv_release_all_nested(struct kvm *kvm);
 long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
 long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index d49065af08e9..9fc3ad3990f7 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -624,7 +624,7 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
 
 extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 unsigned long gpa, unsigned int level,
-unsigned long mmu_seq, unsigned int lpid,
+unsigned long mmu_seq, unsigned long lpid,
 unsigned long *rmapp, struct rmap_nested **n_rmap);
 extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
   struct rmap_nested **n_rmap);
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 14ee0dece853..67dd3e749cac 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/

[PATCH v3 5/6] KVM: PPC: Add support for nestedv2 guests

2023-08-06 Thread Jordan Niethe
A series of hcalls have been added to the PAPR which allow a regular
guest partition to create and manage guest partitions of its own. KVM
already had an interface that allowed this on powernv platforms. This
existing interface will now be called "nestedv1". The newly added PAPR
interface will be called "nestedv2".  PHYP will support the nestedv2
interface. At this time the host side of the nestedv2 interface has not
been implemented on powernv but there is no technical reason why it
could not be added.

The nestedv1 interface is still supported.

Add support to KVM to utilize these hcalls to enable running nested
guests as a pseries guest on PHYP.

Overview of the new hcall usage:

- L1 and L0 negotiate capabilities with
  H_GUEST_{G,S}ET_CAPABILITIES()

- L1 requests the L0 create a L2 with
  H_GUEST_CREATE() and receives a handle to use in future hcalls

- L1 requests the L0 create a L2 vCPU with
  H_GUEST_CREATE_VCPU()

- L1 sets up the L2 using H_GUEST_SET and the
  H_GUEST_VCPU_RUN input buffer

- L1 requests the L0 runs the L2 vCPU using H_GUEST_VCPU_RUN()

- L2 returns to L1 with an exit reason and L1 reads the
  H_GUEST_VCPU_RUN output buffer populated by the L0

- L1 handles the exit using H_GET_STATE if necessary

- L1 reruns L2 vCPU with H_GUEST_VCPU_RUN

- L1 frees the L2 in the L0 with H_GUEST_DELETE()

Support for the new API is determined by trying
H_GUEST_GET_CAPABILITIES. On a successful return, use the nestedv2
interface.

Use the vcpu register state setters for tracking modified guest state
elements and copy the thread wide values into the H_GUEST_VCPU_RUN input
buffer immediately before running a L2. The guest wide
elements can not be added to the input buffer so send them with a
separate H_GUEST_SET call if necessary.

Make the vcpu register getter load the corresponding value from the real
host with H_GUEST_GET. To avoid unnecessarily calling H_GUEST_GET, track
which values have already been loaded between H_GUEST_VCPU_RUN calls. If
an element is present in the H_GUEST_VCPU_RUN output buffer it also does
not need to be loaded again.

Signed-off-by: Vaibhav Jain 
Signed-off-by: Gautam Menghani 
Signed-off-by: Kautuk Consul 
Signed-off-by: Amit Machhiwal 
Signed-off-by: Jordan Niethe 
---
v2:
  - Declare op structs as static
  - Gautam: Use expressions in switch case with local variables
  - Do not use the PVR for the LOGICAL PVR ID
  - Kautuk: Handle emul_inst as now a double word, init correctly
  - Use new GPR(), etc macros
  - Amit: Determine PAPR nested capabilities from cpu features
v3:
  - Use EXPORT_SYMBOL_GPL()
  - Change to kvmhv_nestedv2 namespace
  - Make kvmhv_enable_nested() return -ENODEV on NESTEDv2 L1 hosts
  - s/kvmhv_on_papr/kvmhv_is_nestedv2/
  - mv book3s_hv_papr.c book3s_hv_nestedv2.c
  - Handle shared regs without a guest state id in the same wrapper
  - Vaibhav: Use a static key for API version
  - Add a positive test for NESTEDv1
  - Give the amor a static value
  - s/struct kvmhv_nestedv2_host/struct kvmhv_nestedv2_io/
  - Propagate failure in kvmhv_vcpu_entry_nestedv2()
  - WARN if getters and setters fail
  - Propagate failure from kvmhv_nestedv2_parse_output()
  - Replace delay with sleep in plpar_guest_{create,delete,create_vcpu}()
  - Amit: Add logical PVR handling
  - Replace kvmppc_gse_{get,put} with specific version
---
 arch/powerpc/include/asm/guest-state-buffer.h |  91 ++
 arch/powerpc/include/asm/hvcall.h |  30 +
 arch/powerpc/include/asm/kvm_book3s.h | 136 ++-
 arch/powerpc/include/asm/kvm_book3s_64.h  |   6 +
 arch/powerpc/include/asm/kvm_host.h   |  20 +
 arch/powerpc/include/asm/kvm_ppc.h|  96 +-
 arch/powerpc/include/asm/plpar_wrappers.h | 188 
 arch/powerpc/kvm/Makefile |   1 +
 arch/powerpc/kvm/book3s_hv.c  | 136 ++-
 arch/powerpc/kvm/book3s_hv.h  |  72 +-
 arch/powerpc/kvm/book3s_hv_nested.c   |  38 +-
 arch/powerpc/kvm/book3s_hv_nestedv2.c | 985 ++
 arch/powerpc/kvm/emulate_loadstore.c  |   4 +-
 arch/powerpc/kvm/guest-state-buffer.c |  50 +
 14 files changed, 1757 insertions(+), 96 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_hv_nestedv2.c

diff --git a/arch/powerpc/include/asm/guest-state-buffer.h 
b/arch/powerpc/include/asm/guest-state-buffer.h
index aaefe1075fc4..808149f31576 100644
--- a/arch/powerpc/include/asm/guest-state-buffer.h
+++ b/arch/powerpc/include/asm/guest-state-buffer.h
@@ -5,6 +5,7 @@
 #ifndef _ASM_POWERPC_GUEST_STATE_BUFFER_H
 #define _ASM_POWERPC_GUEST_STATE_BUFFER_H
 
+#include "asm/hvcall.h"
 #include 
 #include 
 #include 
@@ -313,6 +314,8 @@ struct kvmppc_gs_buff *kvmppc_gsb_new(size_t size, unsigned 
long guest_id,
  unsigned long vcpu_id, gfp_t flags);
 void kvmppc_gsb_free(struct kvmppc_gs_buff *gsb);
 void *kvmppc_gsb_put(struct kvmppc_gs_buff *gsb, size_t size);
+int kvmppc_gsb_send(struct kvmppc_gs_buff *gsb, un

[PATCH v3 6/6] docs: powerpc: Document nested KVM on POWER

2023-08-06 Thread Jordan Niethe
From: Michael Neuling 

Document support for nested KVM on POWER using the existing API as well
as the new PAPR API. This includes the new HCALL interface and how it is
used by KVM.

Signed-off-by: Michael Neuling 
Signed-off-by: Jordan Niethe 
---
v2:
  - Separated into individual patch
v3:
  - Fix typos
---
 Documentation/powerpc/index.rst  |   1 +
 Documentation/powerpc/kvm-nested.rst | 636 +++
 2 files changed, 637 insertions(+)
 create mode 100644 Documentation/powerpc/kvm-nested.rst

diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst
index d33b554ca7ba..23e449994c2a 100644
--- a/Documentation/powerpc/index.rst
+++ b/Documentation/powerpc/index.rst
@@ -26,6 +26,7 @@ powerpc
 isa-versions
 kaslr-booke32
 mpc52xx
+kvm-nested
 papr_hcalls
 pci_iov_resource_on_powernv
 pmu-ebb
diff --git a/Documentation/powerpc/kvm-nested.rst 
b/Documentation/powerpc/kvm-nested.rst
new file mode 100644
index ..8b37981dc3d9
--- /dev/null
+++ b/Documentation/powerpc/kvm-nested.rst
@@ -0,0 +1,636 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================
+Nested KVM on POWER
+===================
+
+Introduction
+============
+
+This document explains how a guest operating system can act as a
+hypervisor and run nested guests through the use of hypercalls, if the
+hypervisor has implemented them. The terms L0, L1, and L2 are used to
+refer to different software entities. L0 is the hypervisor mode entity
+that would normally be called the "host" or "hypervisor". L1 is a
+guest virtual machine that is directly run under L0 and is initiated
+and controlled by L0. L2 is a guest virtual machine that is initiated
+and controlled by L1 acting as a hypervisor.
+
+Existing API
+============
+
+Linux/KVM has had support for Nesting as an L0 or L1 since 2018
+
+The L0 code was added::
+
+   commit 8e3f5fc1045dc49fd175b978c5457f5f51e7a2ce
+   Author: Paul Mackerras 
+   Date:   Mon Oct 8 16:31:03 2018 +1100
+   KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization
+
+The L1 code was added::
+
+   commit 360cae313702cdd0b90f82c261a8302fecef030a
+   Author: Paul Mackerras 
+   Date:   Mon Oct 8 16:31:04 2018 +1100
+   KVM: PPC: Book3S HV: Nested guest entry via hypercall
+
+This API works primarily using a single hcall h_enter_nested(). This
+call is made by the L1 to tell the L0 to start an L2 vCPU with the given
+state. The L0 then starts this L2 and runs until an L2 exit condition
+is reached. Once the L2 exits, the state of the L2 is given back to
+the L1 by the L0. The full L2 vCPU state is always transferred from
+and to L1 when the L2 is run. The L0 doesn't keep any state on the L2
+vCPU (except in the short sequence in the L0 on L1 -> L2 entry and L2
+-> L1 exit).
+
+The only state kept by the L0 is the partition table. The L1 registers
+its partition table using the h_set_partition_table() hcall. All
+other state held by the L0 about the L2s is cached state (such as
+shadow page tables).
+
+The L1 may run any L2 or vCPU without first informing the L0. It
+simply starts the vCPU using h_enter_nested(). The creation of L2s and
+vCPUs is done implicitly whenever h_enter_nested() is called.
+
+In this document, we call this existing API the v1 API.
+
+New PAPR API
+============
+
+The new PAPR API changes from the v1 API such that creating the L2 and
+associated vCPUs is explicit. In this document, we call this the v2
+API.
+
+h_enter_nested() is replaced with H_GUEST_VCPU_RUN().  Before this can
+be called the L1 must explicitly create the L2 using h_guest_create()
+and any associated vCPUs using h_guest_create_vCPU(). Getting
+and setting vCPU state can also be performed using h_guest_{g|s}et
+hcall.
+
+The basic execution flow for an L1 to create an L2, run it, and
+delete it is:
+
+- L1 and L0 negotiate capabilities with H_GUEST_{G,S}ET_CAPABILITIES()
+  (normally at L1 boot time).
+
+- L1 requests the L0 create an L2 with H_GUEST_CREATE() and receives a token
+
+- L1 requests the L0 create an L2 vCPU with H_GUEST_CREATE_VCPU()
+
+- L1 and L0 communicate the vCPU state using the H_GUEST_{G,S}ET() hcall
+
+- L1 requests the L0 runs the vCPU using the H_GUEST_VCPU_RUN() hcall
+
+- L1 deletes L2 with H_GUEST_DELETE()
+
+More details of the individual hcalls follow:
+
+HCALL Details
+=============
+
+This documentation is provided to give an overall understanding of the
+API. It doesn't aim to provide all the details required to implement
+an L1 or L0. Latest version of PAPR can be referred to for more details.
+
+All these HCALLs are made by the L1 to the L0.
+
+H_GUEST_GET_CAPABILITIES()
+--------------------------
+
+This is called to get the capabilities of the L0 nested
+hypervisor. This includes capabilities such as the CPU versions (eg
+POWER9, POWER10) that are supported as L2s::
+
+  H_GUEST_GET_CAPABILITIES(uint64 flags)
+
+  Parameters:
+Input:
+  flags: Reserv

Re: [PATCH v3 10/13] mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock()

2023-08-06 Thread Qi Zheng




On 2023/8/6 11:55, Hugh Dickins wrote:

On Thu, 3 Aug 2023, Qi Zheng wrote:

On 2023/7/12 12:42, Hugh Dickins wrote:

Bring collapse_and_free_pmd() back into collapse_pte_mapped_thp().
It does need mmap_read_lock(), but it does not need mmap_write_lock(),
nor vma_start_write() nor i_mmap lock nor anon_vma lock.  All racing
paths are relying on pte_offset_map_lock() and pmd_lock(), so use those.

...

@@ -1681,47 +1634,76 @@ int collapse_pte_mapped_thp(struct mm_struct *mm,
unsigned long addr,
   
 if (pte_none(ptent))

continue;
-   page = vm_normal_page(vma, addr, ptent);
-   if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+   /*
+* We dropped ptl after the first scan, to do the
mmu_notifier:
+* page lock stops more PTEs of the hpage being faulted in,
but
+* does not stop write faults COWing anon copies from existing
+* PTEs; and does not stop those being swapped out or
migrated.
+*/
+   if (!pte_present(ptent)) {
+   result = SCAN_PTE_NON_PRESENT;
goto abort;
+   }
+   page = vm_normal_page(vma, addr, ptent);
+   if (hpage + i != page)
+   goto abort;
+
+   /*
+* Must clear entry, or a racing truncate may re-remove it.
+* TLB flush can be left until pmdp_collapse_flush() does it.
+* PTE dirty? Shmem page is already dirty; file is read-only.
+*/
+   pte_clear(mm, addr, pte);


This is not a non-present PTE entry, so we should call ptep_clear() to let
page_table_check track the PTE clearing operation, right? Otherwise it
may lead to false positives?


You are right: thanks a lot for catching that: fix patch follows.


With fix patch:

Reviewed-by: Qi Zheng 

Thanks.



Hugh


[PATCH 2/2] ocxl: use pci_find_next_dvsec_capability() to simplify the code

2023-08-06 Thread Xiongfeng Wang
The PCI core added pci_find_next_dvsec_capability() to query the next DVSEC.
We can use that core API to simplify the code. Also remove the unused
macros.

Signed-off-by: Xiongfeng Wang 
---
 arch/powerpc/platforms/powernv/ocxl.c | 20 ++--
 drivers/misc/ocxl/config.c| 21 ++---
 include/misc/ocxl-config.h|  4 
 3 files changed, 8 insertions(+), 37 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/ocxl.c 
b/arch/powerpc/platforms/powernv/ocxl.c
index 629067781cec..8dbc1a9535fc 100644
--- a/arch/powerpc/platforms/powernv/ocxl.c
+++ b/arch/powerpc/platforms/powernv/ocxl.c
@@ -71,29 +71,13 @@ static DEFINE_MUTEX(links_list_lock);
  * the AFUs, by pro-rating if needed.
  */
 
-static int find_dvsec_from_pos(struct pci_dev *dev, int dvsec_id, int pos)
-{
-   int vsec = pos;
-   u16 vendor, id;
-
-   while ((vsec = pci_find_next_ext_capability(dev, vsec,
-   OCXL_EXT_CAP_ID_DVSEC))) {
-   pci_read_config_word(dev, vsec + OCXL_DVSEC_VENDOR_OFFSET,
-   &vendor);
-   pci_read_config_word(dev, vsec + OCXL_DVSEC_ID_OFFSET, &id);
-   if (vendor == PCI_VENDOR_ID_IBM && id == dvsec_id)
-   return vsec;
-   }
-   return 0;
-}
-
 static int find_dvsec_afu_ctrl(struct pci_dev *dev, u8 afu_idx)
 {
int vsec = 0;
u8 idx;
 
-   while ((vsec = find_dvsec_from_pos(dev, OCXL_DVSEC_AFU_CTRL_ID,
-  vsec))) {
+   while ((vsec = pci_find_next_dvsec_capability(dev, vsec,
+   PCI_VENDOR_ID_IBM, OCXL_DVSEC_AFU_CTRL_ID))) {
pci_read_config_byte(dev, vsec + OCXL_DVSEC_AFU_CTRL_AFU_IDX,
&idx);
if (idx == afu_idx)
diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c
index 92ab49705f64..6c0fca32e6db 100644
--- a/drivers/misc/ocxl/config.c
+++ b/drivers/misc/ocxl/config.c
@@ -39,23 +39,14 @@ static int find_dvsec(struct pci_dev *dev, int dvsec_id)
 static int find_dvsec_afu_ctrl(struct pci_dev *dev, u8 afu_idx)
 {
int vsec = 0;
-   u16 vendor, id;
u8 idx;
 
-   while ((vsec = pci_find_next_ext_capability(dev, vsec,
-   OCXL_EXT_CAP_ID_DVSEC))) {
-   pci_read_config_word(dev, vsec + OCXL_DVSEC_VENDOR_OFFSET,
-   &vendor);
-   pci_read_config_word(dev, vsec + OCXL_DVSEC_ID_OFFSET, &id);
-
-   if (vendor == PCI_VENDOR_ID_IBM &&
-   id == OCXL_DVSEC_AFU_CTRL_ID) {
-   pci_read_config_byte(dev,
-   vsec + OCXL_DVSEC_AFU_CTRL_AFU_IDX,
-   &idx);
-   if (idx == afu_idx)
-   return vsec;
-   }
+   while ((vsec = pci_find_next_dvsec_capability(dev, vsec,
+   PCI_VENDOR_ID_IBM, OCXL_DVSEC_AFU_CTRL_ID))) {
+   pci_read_config_byte(dev, vsec + OCXL_DVSEC_AFU_CTRL_AFU_IDX,
+&idx);
+   if (idx == afu_idx)
+   return vsec;
}
return 0;
 }
diff --git a/include/misc/ocxl-config.h b/include/misc/ocxl-config.h
index ccfd3b463517..40cf1b143170 100644
--- a/include/misc/ocxl-config.h
+++ b/include/misc/ocxl-config.h
@@ -10,10 +10,6 @@
  * It follows the specification for opencapi 3.0
  */
 
-#define OCXL_EXT_CAP_ID_DVSEC 0x23
-
-#define OCXL_DVSEC_VENDOR_OFFSET  0x4
-#define OCXL_DVSEC_ID_OFFSET  0x8
 #define OCXL_DVSEC_TL_ID  0xF000
 #define   OCXL_DVSEC_TL_BACKOFF_TIMERS  0x10
 #define   OCXL_DVSEC_TL_RECV_CAP0x18
-- 
2.20.1



[PATCH 1/2] PCI: Add pci_find_next_dvsec_capability to find next designated VSEC

2023-08-06 Thread Xiongfeng Wang
Some devices may have several DVSEC(Designated Vendor-Specific Extended
Capability) entries with the same DVSEC ID. Add
pci_find_next_dvsec_capability() to find them all.

Signed-off-by: Xiongfeng Wang 
---
 drivers/pci/pci.c   | 37 +
 include/linux/pci.h |  2 ++
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 60230da957e0..3455ca7306ae 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -749,35 +749,48 @@ u16 pci_find_vsec_capability(struct pci_dev *dev, u16 
vendor, int cap)
 EXPORT_SYMBOL_GPL(pci_find_vsec_capability);
 
 /**
- * pci_find_dvsec_capability - Find DVSEC for vendor
+ * pci_find_next_dvsec_capability - Find next DVSEC for vendor
  * @dev: PCI device to query
+ * @start: address at which to start looking (0 to start at beginning of list)
  * @vendor: Vendor ID to match for the DVSEC
  * @dvsec: Designated Vendor-specific capability ID
  *
- * If DVSEC has Vendor ID @vendor and DVSEC ID @dvsec return the capability
- * offset in config space; otherwise return 0.
+ * Returns the address of the next DVSEC if the DVSEC has Vendor ID @vendor and
+ * DVSEC ID @dvsec; otherwise return 0. DVSEC can occur several times with the
+ * same DVSEC ID for some devices, and this provides a way to find them all.
  */
-u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16 dvsec)
+u16 pci_find_next_dvsec_capability(struct pci_dev *dev, u16 start, u16 vendor,
+  u16 dvsec)
 {
-   int pos;
+   u16 pos = start;
 
-   pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DVSEC);
-   if (!pos)
-   return 0;
-
-   while (pos) {
+   while ((pos = pci_find_next_ext_capability(dev, pos,
+ PCI_EXT_CAP_ID_DVSEC))) {
u16 v, id;
 
pci_read_config_word(dev, pos + PCI_DVSEC_HEADER1, &v);
pci_read_config_word(dev, pos + PCI_DVSEC_HEADER2, &id);
if (vendor == v && dvsec == id)
return pos;
-
-   pos = pci_find_next_ext_capability(dev, pos, 
PCI_EXT_CAP_ID_DVSEC);
}
 
return 0;
 }
+EXPORT_SYMBOL_GPL(pci_find_next_dvsec_capability);
+
+/**
+ * pci_find_dvsec_capability - Find DVSEC for vendor
+ * @dev: PCI device to query
+ * @vendor: Vendor ID to match for the DVSEC
+ * @dvsec: Designated Vendor-specific capability ID
+ *
+ * If DVSEC has Vendor ID @vendor and DVSEC ID @dvsec return the capability
+ * offset in config space; otherwise return 0.
+ */
+u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16 dvsec)
+{
+   return pci_find_next_dvsec_capability(dev, 0, vendor, dvsec);
+}
 EXPORT_SYMBOL_GPL(pci_find_dvsec_capability);
 
 /**
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c69a2cc1f412..82bb905daf72 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1168,6 +1168,8 @@ u16 pci_find_next_ext_capability(struct pci_dev *dev, u16 
pos, int cap);
 struct pci_bus *pci_find_next_bus(const struct pci_bus *from);
 u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap);
 u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16 dvsec);
+u16 pci_find_next_dvsec_capability(struct pci_dev *dev, u16 start, u16 vendor,
+  u16 dvsec);
 
 u64 pci_get_dsn(struct pci_dev *dev);
 
-- 
2.20.1



[PATCH 0/2] introduce pci_find_next_dvsec_capability() to simplify the code

2023-08-06 Thread Xiongfeng Wang
Some devices may have several DVSEC(Designated Vendor-Specific Extended
Capability) entries with the same DVSEC ID. Introduce
pci_find_next_dvsec_capability() to simplify the code.

Xiongfeng Wang (2):
  PCI: Add pci_find_next_dvsec_capability to find next designated VSEC
  ocxl: use pci_find_next_dvsec_capability() to simplify the code

 arch/powerpc/platforms/powernv/ocxl.c | 20 ++-
 drivers/misc/ocxl/config.c| 21 +--
 drivers/pci/pci.c | 37 ++-
 include/linux/pci.h   |  2 ++
 include/misc/ocxl-config.h|  4 ---
 5 files changed, 35 insertions(+), 49 deletions(-)

-- 
2.20.1



Re: [RFC PATCH] cxl: Use pci_find_vsec_capability() to simplify the code

2023-08-06 Thread Andrew Donnellan
On Fri, 2023-08-04 at 15:56 +0800, Xiongfeng Wang wrote:
> PCI core add pci_find_vsec_capability() to query VSEC. We can use
> that
> core API to simplify the code.
> 
> The only logical change is that pci_find_vsec_capability check the
> Vendor ID before finding the VSEC.
> 
> PCI spec rev 5.0 says in 7.9.5.2 Vendor-Specific Header:
>   VSEC ID - This field is a vendor-defined ID number that indicates
> the
>   nature and format of the VSEC structure
>   Software must qualify the Vendor ID before interpreting this field.
> 
> Signed-off-by: Xiongfeng Wang 

LGTM

The cxl driver doesn't currently bind to any devices that don't have an
IBM vendor ID, and it's very unlikely to in future. If that ever
changes, this will of course need to be updated accordingly.

Reviewed-by: Andrew Donnellan 

> ---
>  drivers/misc/cxl/pci.c | 12 ++--
>  1 file changed, 2 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
> index 0ff944860dda..f3108977755d 100644
> --- a/drivers/misc/cxl/pci.c
> +++ b/drivers/misc/cxl/pci.c
> @@ -150,16 +150,8 @@ static inline resource_size_t p2_size(struct
> pci_dev *dev)
>  
>  static int find_cxl_vsec(struct pci_dev *dev)
>  {
> -   int vsec = 0;
> -   u16 val;
> -
> -   while ((vsec = pci_find_next_ext_capability(dev, vsec,
> PCI_EXT_CAP_ID_VNDR))) {
> -   pci_read_config_word(dev, vsec + 0x4, &val);
> -   if (val == CXL_PCI_VSEC_ID)
> -   return vsec;
> -   }
> -   return 0;
> -
> +   return pci_find_vsec_capability(dev, PCI_VENDOR_ID_IBM,
> +   CXL_PCI_VSEC_ID);
>  }
>  
>  static void dump_cxl_config_space(struct pci_dev *dev)

-- 
Andrew DonnellanOzLabs, ADL Canberra
a...@linux.ibm.com   IBM Australia Limited


Re: [PATCH 1/2] PCI: Add pci_find_next_dvsec_capability to find next designated VSEC

2023-08-06 Thread Andrew Donnellan
On Mon, 2023-08-07 at 11:18 +0800, Xiongfeng Wang wrote:
> Some devices may have several DVSEC(Designated Vendor-Specific
> Extended
> Capability) entries with the same DVSEC ID. Add
> pci_find_next_dvsec_capability() to find them all.
> 
> Signed-off-by: Xiongfeng Wang 
> 

Reviewed-by: Andrew Donnellan 

> ---
>  drivers/pci/pci.c   | 37 +
>  include/linux/pci.h |  2 ++
>  2 files changed, 27 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index 60230da957e0..3455ca7306ae 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -749,35 +749,48 @@ u16 pci_find_vsec_capability(struct pci_dev
> *dev, u16 vendor, int cap)
>  EXPORT_SYMBOL_GPL(pci_find_vsec_capability);
>  
>  /**
> - * pci_find_dvsec_capability - Find DVSEC for vendor
> + * pci_find_next_dvsec_capability - Find next DVSEC for vendor
>   * @dev: PCI device to query
> + * @start: address at which to start looking (0 to start at
> beginning of list)
>   * @vendor: Vendor ID to match for the DVSEC
>   * @dvsec: Designated Vendor-specific capability ID
>   *
> - * If DVSEC has Vendor ID @vendor and DVSEC ID @dvsec return the
> capability
> - * offset in config space; otherwise return 0.
> + * Returns the address of the next DVSEC if the DVSEC has Vendor ID
> @vendor and
> + * DVSEC ID @dvsec; otherwise return 0. DVSEC can occur several
> times with the
> + * same DVSEC ID for some devices, and this provides a way to find
> them all.
>   */
> -u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16
> dvsec)
> +u16 pci_find_next_dvsec_capability(struct pci_dev *dev, u16 start,
> u16 vendor,
> +  u16 dvsec)
>  {
> -   int pos;
> +   u16 pos = start;
>  
> -   pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DVSEC);
> -   if (!pos)
> -   return 0;
> -
> -   while (pos) {
> +   while ((pos = pci_find_next_ext_capability(dev, pos,
> +
> PCI_EXT_CAP_ID_DVSEC))) {
> u16 v, id;
>  
> pci_read_config_word(dev, pos + PCI_DVSEC_HEADER1,
> &v);
> pci_read_config_word(dev, pos + PCI_DVSEC_HEADER2,
> &id);
> if (vendor == v && dvsec == id)
> return pos;
> -
> -   pos = pci_find_next_ext_capability(dev, pos,
> PCI_EXT_CAP_ID_DVSEC);
> }
>  
> return 0;
>  }
> +EXPORT_SYMBOL_GPL(pci_find_next_dvsec_capability);
> +
> +/**
> + * pci_find_dvsec_capability - Find DVSEC for vendor
> + * @dev: PCI device to query
> + * @vendor: Vendor ID to match for the DVSEC
> + * @dvsec: Designated Vendor-specific capability ID
> + *
> + * If DVSEC has Vendor ID @vendor and DVSEC ID @dvsec return the
> capability
> + * offset in config space; otherwise return 0.
> + */
> +u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16
> dvsec)
> +{
> +   return pci_find_next_dvsec_capability(dev, 0, vendor, dvsec);
> +}
>  EXPORT_SYMBOL_GPL(pci_find_dvsec_capability);
>  
>  /**
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index c69a2cc1f412..82bb905daf72 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -1168,6 +1168,8 @@ u16 pci_find_next_ext_capability(struct pci_dev
> *dev, u16 pos, int cap);
>  struct pci_bus *pci_find_next_bus(const struct pci_bus *from);
>  u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int
> cap);
>  u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16
> dvsec);
> +u16 pci_find_next_dvsec_capability(struct pci_dev *dev, u16 start,
> u16 vendor,
> +  u16 dvsec);
>  
>  u64 pci_get_dsn(struct pci_dev *dev);
>  

-- 
Andrew DonnellanOzLabs, ADL Canberra
a...@linux.ibm.com   IBM Australia Limited


[PATCH] perf test: Fix parse-events tests to skip parametrized events

2023-08-06 Thread Athira Rajeev
Testcase "Parsing of all PMU events from sysfs" parses events for
all PMUs, and not just cpu. In case of powerpc, the PowerVM
environment supports events from hv_24x7 and hv_gpci PMU which
is of example format like below:

- hv_24x7/CPM_ADJUNCT_INST,domain=?,core=?/
- hv_gpci/event,partition_id=?/

The value for "?" needs to be filled in depending on system
configuration. It is better to skip these parametrized events
in this test as it is done in:
'commit b50d691e50e6 ("perf test: Fix "all PMU test" to skip
parametrized events")' which handled a similar instance with
"all PMU test".

Fix parse-events test to skip parametrized events since
it needs proper setup of the parameters.

Signed-off-by: Athira Rajeev 
---
 tools/perf/tests/parse-events.c | 32 
 1 file changed, 32 insertions(+)

diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index b2f82847e4c3..605373c7d005 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -2504,7 +2504,11 @@ static int test__pmu_events(struct test_suite *test 
__maybe_unused, int subtest
while ((pmu = perf_pmus__scan(pmu)) != NULL) {
struct stat st;
char path[PATH_MAX];
+   char pmu_event[PATH_MAX + 256];
+   char *buf = NULL;
+   FILE *file;
struct dirent *ent;
+   size_t len = 0;
DIR *dir;
int err;
 
@@ -2528,11 +2532,39 @@ static int test__pmu_events(struct test_suite *test 
__maybe_unused, int subtest
struct evlist_test e = { .name = NULL, };
char name[2 * NAME_MAX + 1 + 12 + 3];
int test_ret;
+   int skip = 0;
 
/* Names containing . are special and cannot be used 
directly */
if (strchr(ent->d_name, '.'))
continue;
 
+   /* exclude parametrized ones (name contains '?') */
+   snprintf(pmu_event, PATH_MAX + 256, "%s%s", path, 
ent->d_name);
+   file = fopen(pmu_event, "r");
+   if (!file) {
+   pr_debug("can't open pmu event file for 
'%s'\n", ent->d_name);
+   ret = combine_test_results(ret, TEST_FAIL);
+   continue;
+   }
+
+   if (getline(&buf, &len, file) < 0) {
+   pr_debug(" pmu event: %s is a null event\n", 
ent->d_name);
+   ret = combine_test_results(ret, TEST_FAIL);
+   continue;
+   }
+
+   if (strchr(buf, '?'))
+   skip = 1;
+
+   free(buf);
+   buf = NULL;
+   fclose(file);
+
+   if (skip == 1) {
+   pr_debug("skipping parametrized PMU event: %s 
which contains ?\n", pmu_event);
+   continue;
+   }
+
snprintf(name, sizeof(name), "%s/event=%s/u", 
pmu->name, ent->d_name);
 
e.name  = name;
-- 
2.31.1



[PATCH] tools/perf: Fix bpf__probe to set bpf_prog_type type only if differs from the desired one

2023-08-06 Thread Athira Rajeev
The test "BPF prologue generation" fails as below:

   Writing event: p:perf_bpf_probe/func _text+10423200 f_mode=+20(%gpr3):x32 
offset=%gpr4:s64 orig=%gpr5:s32
   In map_prologue, ntevs=1
   mapping[0]=0
   libbpf: prog 'bpf_func__null_lseek': BPF program load failed: Permission 
denied
   libbpf: prog 'bpf_func__null_lseek': -- BEGIN PROG LOAD LOG --
   btf_vmlinux is malformed
   reg type unsupported for arg#0 function bpf_func__null_lseek#5
   0: R1=ctx(off=0,imm=0) R10=fp0
   ;
   0: (57) r3 &= 2
   R3 !read_ok
   processed 1 insns (limit 100) max_states_per_insn 0 total_states 0 
peak_states 0 mark_read 0
   -- END PROG LOAD LOG --
   libbpf: prog 'bpf_func__null_lseek': failed to load: -13
   libbpf: failed to load object '[bpf_prologue_test]'
   bpf: load objects failed: err=-13: (Permission denied)
   Failed to add events selected by BPF

This failure occurs after this commit:
commit d6e6286a12e7 ("libbpf: disassociate section handler
on explicit bpf_program__set_type() call")

With this change, the libbpf SEC_DEF handler, which is determined
initially based on the program's SEC(), is set to NULL. The change
is made because sec_def is not valid when user sets the program
type with bpf_program__set_type function. This commit also fixed
bpf_prog_test_load() helper in selftests/bpf to force-set program
type only if it differs from the desired one.

The "bpf__probe" function in util/bpf-loader.c, also calls
bpf_program__set_type to set bpf_prog_type. Add similar fix in
here as well to avoid setting sec_def to NULL.

Reported-by: Sachin Sant 
Signed-off-by: Athira Rajeev 
---
 tools/perf/util/bpf-loader.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index 44cde27d6389..b8e0b430e302 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -870,12 +870,14 @@ int bpf__probe(struct bpf_object *obj)
goto out;
}
 
-   if (priv->is_tp) {
+   if (priv->is_tp && bpf_program__type(prog) != 
BPF_PROG_TYPE_TRACEPOINT) {
bpf_program__set_type(prog, BPF_PROG_TYPE_TRACEPOINT);
continue;
}
 
-   bpf_program__set_type(prog, BPF_PROG_TYPE_KPROBE);
+   if (bpf_program__type(prog) != BPF_PROG_TYPE_KPROBE)
+   bpf_program__set_type(prog, BPF_PROG_TYPE_KPROBE);
+
pev = &priv->pev;
 
err = convert_perf_probe_events(pev, 1);
-- 
2.39.3



Re: [PATCH 2/2] ocxl: use pci_find_next_dvsec_capability() to simplify the code

2023-08-06 Thread Andrew Donnellan
On Mon, 2023-08-07 at 11:18 +0800, Xiongfeng Wang wrote:
> PCI core add pci_find_next_dvsec_capability() to query the next
> DVSEC.
> We can use that core API to simplify the code. Also remove the unused
> macros.
> 
> Signed-off-by: Xiongfeng Wang 

Reviewed-by: Andrew Donnellan 


-- 
Andrew Donnellan              OzLabs, ADL Canberra
a...@linux.ibm.com   IBM Australia Limited


Re: [PATCH 1/1] perf tests task_analyzer: Check perf build options for libtraceevent support

2023-08-06 Thread Aditya Gupta
On Thu, Aug 03, 2023 at 12:52:50PM +0530, Athira Rajeev wrote:
> 
> 
> > On 03-Aug-2023, at 8:33 AM, Aditya Gupta  wrote:
> > 
> > Hi Arnaldo,
> > I am working on a patch for 'perf version --has', and will send a patch next
> 
> Hi Aditya,
> 
> I believe, it will be “perf build —has” option. And not “perf version —has”  ?

Oh okay. I misread. Thanks for pointing it out Athira.

Thanks,
Aditya G



Re: [PATCH] tools/perf: Fix bpf__probe to set bpf_prog_type type only if differs from the desired one

2023-08-06 Thread Sachin Sant



> On 07-Aug-2023, at 10:22 AM, Athira Rajeev  
> wrote:
> 
> The test "BPF prologue generation" fails as below:
> 
>   Writing event: p:perf_bpf_probe/func _text+10423200 f_mode=+20(%gpr3):x32 
> offset=%gpr4:s64 orig=%gpr5:s32
>   In map_prologue, ntevs=1
>   mapping[0]=0
>   libbpf: prog 'bpf_func__null_lseek': BPF program load failed: Permission 
> denied
>   libbpf: prog 'bpf_func__null_lseek': -- BEGIN PROG LOAD LOG --
>   btf_vmlinux is malformed
>   reg type unsupported for arg#0 function bpf_func__null_lseek#5
>   0: R1=ctx(off=0,imm=0) R10=fp0
>   ;
>   0: (57) r3 &= 2
>   R3 !read_ok
>   processed 1 insns (limit 100) max_states_per_insn 0 total_states 0 
> peak_states 0 mark_read 0
>   -- END PROG LOAD LOG --
>   libbpf: prog 'bpf_func__null_lseek': failed to load: -13
>   libbpf: failed to load object '[bpf_prologue_test]'
>   bpf: load objects failed: err=-13: (Permission denied)
>   Failed to add events selected by BPF
> 
> This fails occurs after this commit:
> commit d6e6286a12e7 ("libbpf: disassociate section handler
> on explicit bpf_program__set_type() call")'
> 
> With this change, SEC_DEF handler libbpf which is determined
> initially based on program's SEC() is set to NULL. The change
> is made because sec_def is not valid when user sets the program
> type with bpf_program__set_type function. This commit also fixed
> bpf_prog_test_load() helper in selftests/bpf to force-set program
> type only if it differs from the desired one.
> 
> The "bpf__probe" function in util/bpf-loader.c, also calls
> bpf_program__set_type to set bpf_prog_type. Add similar fix in
> here as well to avoid setting sec_def to NULL.
> 
> Reported-by: Sachin Sant 
> Signed-off-by: Athira Rajeev 
> ---

Thanks Athira for the fix.
With this patch applied perf BPF prologue sub test works correctly.

 42: BPF filter :
 42.1: Basic BPF filtering: Ok
 42.2: BPF pinning  : Ok
 42.3: BPF prologue generation  : Ok

Tested-by: Sachin Sant 

Can you please use the above-mentioned id (without vnet) in the Reported-by?

- Sachin



Re: [PATCH] perf test: Fix parse-events tests to skip parametrized events

2023-08-06 Thread Sachin Sant



> On 07-Aug-2023, at 10:20 AM, Athira Rajeev  
> wrote:
> 
> Testcase "Parsing of all PMU events from sysfs" parse events for
> all PMUs, and not just cpu. In case of powerpc, the PowerVM
> environment supports events from hv_24x7 and hv_gpci PMU which
> is of example format like below:
> 
> - hv_24x7/CPM_ADJUNCT_INST,domain=?,core=?/
> - hv_gpci/event,partition_id=?/
> 
> The value for "?" needs to be filled in depending on system
> configuration. It is better to skip these parametrized events
> in this test as it is done in:
> 'commit b50d691e50e6 ("perf test: Fix "all PMU test" to skip
> parametrized events")' which handled a simialr instance with
> "all PMU test".
> 
> Fix parse-events test to skip parametrized events since
> it needs proper setup of the parameters.
> 
> Signed-off-by: Athira Rajeev 
> —

Thanks Athira for the fix. With this fix applied the reported problem
is fixed.

6.1: Test event parsing: Ok
  6.2: Parsing of all PMU events from sysfs : Ok
  6.3: Parsing of given PMU events from sysfs: Ok

Tested-by: Sachin Sant 

- Sachin


[PATCH v8 1/2] powerpc/rtas: Rename rtas_error_rc to rtas_generic_errno

2023-08-06 Thread Mahesh Salgaonkar
The rtas_generic_errno() function will convert the generic rtas return codes
into errno. Also, #define descriptive names for rtas return codes and use
them instead of numeric values.

Signed-off-by: Mahesh Salgaonkar 
---

(no changes since v7)

Change in V7:
- Until v6 there was only one patch with subject "PCI hotplug: rpaphp:
  Error out on busy status from get-sensor-state". Starting from v7, adding
  this new patch to introduce rtas_generic_errno() to handle generic rtas
  error codes.
  https://lore.kernel.org/all/20220429162545.GA79541@bhelgaas/
---
 arch/powerpc/include/asm/rtas.h |   10 +++
 arch/powerpc/kernel/rtas.c  |   53 ---
 2 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 3abe15ac79db1..5572a0a2f6e18 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -202,7 +202,9 @@ typedef struct {
 #define RTAS_USER_REGION_SIZE (64 * 1024)
 
 /* RTAS return status codes */
-#define RTAS_BUSY  -2/* RTAS Busy */
+#define RTAS_HARDWARE_ERROR(-1)  /* Hardware Error */
+#define RTAS_BUSY  (-2)  /* RTAS Busy */
+#define RTAS_INVALID_PARAMETER (-3)  /* Invalid indicator/domain/sensor etc. */
 #define RTAS_EXTENDED_DELAY_MIN9900
 #define RTAS_EXTENDED_DELAY_MAX9905
 
@@ -212,6 +214,11 @@ typedef struct {
 #define RTAS_THREADS_ACTIVE -9005 /* Multiple processor threads active */
 #define RTAS_OUTSTANDING_COPROC -9006 /* Outstanding coprocessor operations */
 
+/* statuses specific to get-sensor-state */
+#define RTAS_SLOT_UNISOLATED   (-9000)
+#define RTAS_SLOT_NOT_UNISOLATED   (-9001)
+#define RTAS_SLOT_NOT_USABLE   (-9002)
+
 /* RTAS event classes */
 #define RTAS_INTERNAL_ERROR0x8000 /* set bit 0 */
 #define RTAS_EPOW_WARNING  0x4000 /* set bit 1 */
@@ -425,6 +432,7 @@ extern int rtas_set_indicator(int indicator, int index, int 
new_value);
 extern int rtas_set_indicator_fast(int indicator, int index, int new_value);
 extern void rtas_progress(char *s, unsigned short hex);
 int rtas_ibm_suspend_me(int *fw_status);
+int rtas_generic_errno(int rtas_rc);
 
 struct rtc_time;
 extern time64_t rtas_get_boot_time(void);
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index c087320ff..80b6099e8ce20 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1330,33 +1330,34 @@ bool __ref rtas_busy_delay(int status)
 }
 EXPORT_SYMBOL_GPL(rtas_busy_delay);
 
-static int rtas_error_rc(int rtas_rc)
+int rtas_generic_errno(int rtas_rc)
 {
int rc;
 
switch (rtas_rc) {
-   case -1:/* Hardware Error */
-   rc = -EIO;
-   break;
-   case -3:/* Bad indicator/domain/etc */
-   rc = -EINVAL;
-   break;
-   case -9000: /* Isolation error */
-   rc = -EFAULT;
-   break;
-   case -9001: /* Outstanding TCE/PTE */
-   rc = -EEXIST;
-   break;
-   case -9002: /* No usable slot */
-   rc = -ENODEV;
-   break;
-   default:
-   pr_err("%s: unexpected error %d\n", __func__, rtas_rc);
-   rc = -ERANGE;
-   break;
+   case RTAS_HARDWARE_ERROR:   /* Hardware Error */
+   rc = -EIO;
+   break;
+   case RTAS_INVALID_PARAMETER:/* Bad indicator/domain/etc */
+   rc = -EINVAL;
+   break;
+   case RTAS_SLOT_UNISOLATED:  /* Isolation error */
+   rc = -EFAULT;
+   break;
+   case RTAS_SLOT_NOT_UNISOLATED:  /* Outstanding TCE/PTE */
+   rc = -EEXIST;
+   break;
+   case RTAS_SLOT_NOT_USABLE:  /* No usable slot */
+   rc = -ENODEV;
+   break;
+   default:
+   pr_err("%s: unexpected error %d\n", __func__, rtas_rc);
+   rc = -ERANGE;
+   break;
}
return rc;
 }
+EXPORT_SYMBOL(rtas_generic_errno);
 
 int rtas_get_power_level(int powerdomain, int *level)
 {
@@ -1370,7 +1371,7 @@ int rtas_get_power_level(int powerdomain, int *level)
udelay(1);
 
if (rc < 0)
-   return rtas_error_rc(rc);
+   return rtas_generic_errno(rc);
return rc;
 }
 EXPORT_SYMBOL_GPL(rtas_get_power_level);
@@ -1388,7 +1389,7 @@ int rtas_set_power_level(int powerdomain, int level, int 
*setlevel)
} while (rtas_busy_delay(rc));
 
if (rc < 0)
-   return rtas_error_rc(rc);
+   return rtas_generic_errno(rc);
return rc;
 }
 EXPORT_SYMBOL_GPL(rtas_set_power_level);
@@ -1406,7

[PATCH v8 2/2] PCI: rpaphp: Error out on busy status from get-sensor-state

2023-08-06 Thread Mahesh Salgaonkar
When a certain PHB HW failure causes pHyp to recover the PHB, it marks the PE
state as temporarily unavailable until recovery is complete. This also
triggers an EEH handler in Linux which needs to notify drivers, and perform
recovery. But before notifying the driver about the PCI error it uses
get_adapter_status()->rpaphp_get_sensor_state()->rtas_call(get-sensor-state)
operation of the hotplug_slot to determine if the slot contains a device or
not. If the slot is empty, the recovery is skipped entirely.

eeh_event_handler()
  ->eeh_handle_normal_event()
->eeh_slot_presence_check()
  ->get_adapter_status()
->rpaphp_get_sensor_state()
  ->rtas_get_sensor()
->rtas_call(get-sensor-state)

However on certain PHB failures, the RTAS call rtas_call(get-sensor-state)
returns extended busy error (9902) until PHB is recovered by pHyp. Once PHB
is recovered, the rtas_call(get-sensor-state) returns success with correct
presence status. The RTAS call interface rtas_get_sensor() loops over the
RTAS call on extended delay return code (9902) until the return value is
either success (0) or error (-1). This causes the EEH handler to get stuck
for ~6 seconds before it could notify that the PCI error has been detected
and stop any active operations. Hence with running I/O traffic, during this
6 seconds, the network driver continues its operation and hits a timeout
(netdev watchdog).


[52732.244731] DEBUG: ibm_read_slot_reset_state2()
[52732.244762] DEBUG: ret = 0, rets[0]=5, rets[1]=1, rets[2]=4000, rets[3]=>
[52732.244798] DEBUG: in eeh_slot_presence_check
[52732.244804] DEBUG: error state check
[52732.244807] DEBUG: Is slot hotpluggable
[52732.244810] DEBUG: hotpluggable ops ?
[52732.244953] DEBUG: Calling ops->get_adapter_status
[52732.244958] DEBUG: calling rpaphp_get_sensor_state
[52736.564262] [ cut here ]
[52736.564299] NETDEV WATCHDOG: enP64p1s0f3 (tg3): transmit queue 0 timed o>
[52736.564324] WARNING: CPU: 1442 PID: 0 at net/sched/sch_generic.c:478 dev>
[...]
[52736.564505] NIP [c0c32368] dev_watchdog+0x438/0x440
[52736.564513] LR [c0c32364] dev_watchdog+0x434/0x440


On timeouts, the network driver starts dumping debug information to the console
(e.g. the bnx2x driver calls bnx2x_panic_dump()), and goes into its recovery path
while pHyp is still recovering the PHB. As part of recovery, the driver tries to
reset the device and it keeps failing since every PCI read/write returns
ff's. And when EEH recovery kicks-in, the driver is unable to recover the
device. This impacts the ssh connection and leads to the system being
inaccessible. To get the NIC working again it needs a reboot or re-assign
the I/O adapter from HMC.

[ 9531.168587] EEH: Beginning: 'slot_reset'
[ 9531.168601] PCI 0013:01:00.0#1: EEH: Invoking bnx2x->slot_reset()
[...]
[ 9614.110094] bnx2x: [bnx2x_func_stop:9129(enP19p1s0f0)]FUNC_STOP ramrod 
failed. Running a dry transaction
[ 9614.110300] bnx2x: [bnx2x_igu_int_disable:902(enP19p1s0f0)]BUG! Proper val 
not read from IGU!
[ 9629.178067] bnx2x: [bnx2x_fw_command:3055(enP19p1s0f0)]FW failed to respond!
[ 9629.178085] bnx2x 0013:01:00.0 enP19p1s0f0: bc 7.10.4
[ 9629.178091] bnx2x: [bnx2x_fw_dump_lvl:789(enP19p1s0f0)]Cannot dump MCP info 
while in PCI error
[ 9644.241813] bnx2x: [bnx2x_io_slot_reset:14245(enP19p1s0f0)]IO slot reset --> 
driver unload
[...]
[ 9644.241819] PCI 0013:01:00.0#1: EEH: bnx2x driver reports: 'disconnect'
[ 9644.241823] PCI 0013:01:00.1#1: EEH: Invoking bnx2x->slot_reset()
[ 9644.241827] bnx2x: [bnx2x_io_slot_reset:14229(enP19p1s0f1)]IO slot reset 
initializing...
[ 9644.241916] bnx2x 0013:01:00.1: enabling device (0140 -> 0142)
[ 9644.258604] bnx2x: [bnx2x_io_slot_reset:14245(enP19p1s0f1)]IO slot reset --> 
driver unload
[ 9644.258612] PCI 0013:01:00.1#1: EEH: bnx2x driver reports: 'disconnect'
[ 9644.258615] EEH: Finished:'slot_reset' with aggregate recovery 
state:'disconnect'
[ 9644.258620] EEH: Unable to recover from failure from PHB#13-PE#1.
[ 9644.261811] EEH: Beginning: 'error_detected(permanent failure)'
[...]
[ 9644.261823] EEH: Finished:'error_detected(permanent failure)'

Hence, it becomes important to inform driver about the PCI error detection
as early as possible, so that driver is aware of PCI error and waits for
EEH handler's next action for successful recovery.

Current implementation uses rtas_get_sensor() API which blocks the slot
check state until RTAS call returns success. To avoid this, fix the PCI
hotplug driver (rpaphp) to return an error (-EBUSY) if the slot presence
state can not be detected immediately while PE is in EEH recovery state.
Change rpaphp_get_sensor_state() to invoke rtas_call(get-sensor-state)
directly only if the respective PE is in EEH recovery state, and take
actions based on RTAS return status. This way EEH handler will not be
blocked on rpaphp_get_sensor_state() and can immediately notify driver
about the PCI error and stop any active operati