date:20250114

Re: [PATCH v2 09/18] riscv: vdso: Switch to generic storage implementation

2025-01-14 Thread Thomas Weißschuh

On Mon, Jan 13, 2025 at 07:48:15PM +, Conor Dooley wrote:
> On Fri, Jan 10, 2025 at 04:23:48PM +0100, Thomas Weißschuh wrote:
> > The generic storage implementation provides the same features as the
> > custom one. However it can be shared between architectures, making
> > maintenance easier.
> > 
> > Co-developed-by: Nam Cao 
> > Signed-off-by: Nam Cao 
> > Signed-off-by: Thomas Weißschuh 
> 
> For rv64, nommu:
>   LD  vmlinux
> ld.lld: error: undefined symbol: vmf_insert_pfn
> >>> referenced by datastore.c
> >>>   lib/vdso/datastore.o:(vvar_fault) in archive vmlinux.a
> 
> ld.lld: error: undefined symbol: _install_special_mapping
> >>> referenced by datastore.c
> >>>   lib/vdso/datastore.o:(vdso_install_vvar_mapping) in archive 
> >>> vmlinux.a
> 
> Later patches in the series don't make it build again.
> rv32 builds now though, so thanks for fixing that.

Thanks for the report.
Can you try to diff below?

I'm adding rv64 and arm nommu configs to my test matrix and
doublechecking all kconfig dependencies.


diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 335cbbd4dddb..583c55910612 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -52,7 +52,7 @@ config RISCV
select ARCH_HAS_SYSCALL_WRAPPER
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UBSAN
-   select ARCH_HAS_VDSO_ARCH_DATA
+   select ARCH_HAS_VDSO_ARCH_DATA if GENERIC_VDSO_DATA_STORE
select ARCH_KEEP_MEMBLOCK if ACPI
select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if 64BIT && MMU
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
@@ -115,7 +115,7 @@ config RISCV
select GENERIC_SCHED_CLOCK
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL if MMU && 64BIT
-   select GENERIC_VDSO_DATA_STORE
+   select GENERIC_VDSO_DATA_STORE if MMU
select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO
select HARDIRQS_SW_RESEND
select HAS_IOPORT if MMU

[PATCH 1/2] ASoC: fsl_micfil: Add i.MX943 platform support

2025-01-14 Thread Shengjiu Wang

On i.MX943, the FIFO data address is changed to 0x20~0x3c,
compared to previous version, there is a minus 4 offset,
so add a new regmap configuration for it.
And the bit width of CICOSR is changed to 5 bits, from bit
16th to 20th in REG_MICFIL_CTRL2 register, so update its
definition header file.

Signed-off-by: Shengjiu Wang 
---
 sound/soc/fsl/fsl_micfil.c | 98 ++
 sound/soc/fsl/fsl_micfil.h |  2 +-
 2 files changed, 78 insertions(+), 22 deletions(-)

diff --git a/sound/soc/fsl/fsl_micfil.c b/sound/soc/fsl/fsl_micfil.c
index e908cfb594ab..1075598a6647 100644
--- a/sound/soc/fsl/fsl_micfil.c
+++ b/sound/soc/fsl/fsl_micfil.c
@@ -89,6 +89,7 @@ struct fsl_micfil_soc_data {
bool use_verid;
bool volume_sx;
u64  formats;
+   int  fifo_offset;
 };
 
 static struct fsl_micfil_soc_data fsl_micfil_imx8mm = {
@@ -98,6 +99,7 @@ static struct fsl_micfil_soc_data fsl_micfil_imx8mm = {
.dataline =  0xf,
.formats = SNDRV_PCM_FMTBIT_S16_LE,
.volume_sx = true,
+   .fifo_offset = 0,
 };
 
 static struct fsl_micfil_soc_data fsl_micfil_imx8mp = {
@@ -107,6 +109,7 @@ static struct fsl_micfil_soc_data fsl_micfil_imx8mp = {
.dataline =  0xf,
.formats = SNDRV_PCM_FMTBIT_S32_LE,
.volume_sx = false,
+   .fifo_offset = 0,
 };
 
 static struct fsl_micfil_soc_data fsl_micfil_imx93 = {
@@ -118,12 +121,26 @@ static struct fsl_micfil_soc_data fsl_micfil_imx93 = {
.use_edma = true,
.use_verid = true,
.volume_sx = false,
+   .fifo_offset = 0,
+};
+
+static struct fsl_micfil_soc_data fsl_micfil_imx943 = {
+   .imx = true,
+   .fifos = 8,
+   .fifo_depth = 32,
+   .dataline =  0xf,
+   .formats = SNDRV_PCM_FMTBIT_S32_LE,
+   .use_edma = true,
+   .use_verid = true,
+   .volume_sx = false,
+   .fifo_offset = -4,
 };
 
 static const struct of_device_id fsl_micfil_dt_ids[] = {
{ .compatible = "fsl,imx8mm-micfil", .data = &fsl_micfil_imx8mm },
{ .compatible = "fsl,imx8mp-micfil", .data = &fsl_micfil_imx8mp },
{ .compatible = "fsl,imx93-micfil", .data = &fsl_micfil_imx93 },
+   { .compatible = "fsl,imx943-micfil", .data = &fsl_micfil_imx943 },
{}
 };
 MODULE_DEVICE_TABLE(of, fsl_micfil_dt_ids);
@@ -793,7 +810,7 @@ static int fsl_micfil_hw_params(struct snd_pcm_substream 
*substream,
ret = regmap_update_bits(micfil->regmap, REG_MICFIL_CTRL2,
 MICFIL_CTRL2_CLKDIV | MICFIL_CTRL2_CICOSR,
 FIELD_PREP(MICFIL_CTRL2_CLKDIV, clk_div) |
-FIELD_PREP(MICFIL_CTRL2_CICOSR, 16 - osr));
+FIELD_PREP(MICFIL_CTRL2_CICOSR, 32 - osr));
 
/* Configure CIC OSR in VADCICOSR */
regmap_update_bits(micfil->regmap, REG_MICFIL_VAD0_CTRL1,
@@ -932,9 +949,39 @@ static const struct reg_default fsl_micfil_reg_defaults[] 
= {
{REG_MICFIL_VAD0_ZCD,   0x0004},
 };
 
+static const struct reg_default fsl_micfil_reg_defaults_v2[] = {
+   {REG_MICFIL_CTRL1,  0x},
+   {REG_MICFIL_CTRL2,  0x},
+   {REG_MICFIL_STAT,   0x},
+   {REG_MICFIL_FIFO_CTRL,  0x001F},
+   {REG_MICFIL_FIFO_STAT,  0x},
+   {REG_MICFIL_DATACH0 - 0x4,  0x},
+   {REG_MICFIL_DATACH1 - 0x4,  0x},
+   {REG_MICFIL_DATACH2 - 0x4,  0x},
+   {REG_MICFIL_DATACH3 - 0x4,  0x},
+   {REG_MICFIL_DATACH4 - 0x4,  0x},
+   {REG_MICFIL_DATACH5 - 0x4,  0x},
+   {REG_MICFIL_DATACH6 - 0x4,  0x},
+   {REG_MICFIL_DATACH7 - 0x4,  0x},
+   {REG_MICFIL_DC_CTRL,0x},
+   {REG_MICFIL_OUT_CTRL,   0x},
+   {REG_MICFIL_OUT_STAT,   0x},
+   {REG_MICFIL_VAD0_CTRL1, 0x},
+   {REG_MICFIL_VAD0_CTRL2, 0x000A},
+   {REG_MICFIL_VAD0_STAT,  0x},
+   {REG_MICFIL_VAD0_SCONFIG,   0x},
+   {REG_MICFIL_VAD0_NCONFIG,   0x8000},
+   {REG_MICFIL_VAD0_NDATA, 0x},
+   {REG_MICFIL_VAD0_ZCD,   0x0004},
+};
+
 static bool fsl_micfil_readable_reg(struct device *dev, unsigned int reg)
 {
struct fsl_micfil *micfil = dev_get_drvdata(dev);
+   int ofs = micfil->soc->fifo_offset;
+
+   if (reg >= (REG_MICFIL_DATACH0 + ofs) && reg <= (REG_MICFIL_DATACH7 + 
ofs))
+   return true;
 
switch (reg) {
case REG_MICFIL_CTRL1:
@@ -942,14 +989,6 @@ static bool fsl_micfil_readable_reg(struct device *dev, 
unsigned int reg)
case REG_MICFIL_STAT:
case REG_MICFIL_FIFO_CTRL:
case REG_MICFIL_FIFO_STAT:
-   case REG_MICFIL_DATACH0:
-   case REG_MICFIL_DATACH1:
-   case REG_MICFIL_DATACH2:
-   case REG_MICFIL_DATACH3

[PATCH 2/2] ASoC: dt-bindings: fsl,micfil: Add compatible string for i.MX943 platform

2025-01-14 Thread Shengjiu Wang

Add compatible string "fsl,imx943-micfil" for i.MX943 platform.
The definition of register map and some register bit map is
different on the i.MX943 platform.

Signed-off-by: Shengjiu Wang 
---
 Documentation/devicetree/bindings/sound/fsl,micfil.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/sound/fsl,micfil.yaml 
b/Documentation/devicetree/bindings/sound/fsl,micfil.yaml
index c1e9803fc113..c47b7a097490 100644
--- a/Documentation/devicetree/bindings/sound/fsl,micfil.yaml
+++ b/Documentation/devicetree/bindings/sound/fsl,micfil.yaml
@@ -25,6 +25,7 @@ properties:
   - fsl,imx8mm-micfil
   - fsl,imx8mp-micfil
   - fsl,imx93-micfil
+  - fsl,imx943-micfil
 
   reg:
 maxItems: 1
-- 
2.34.1

[PATCH 0/2] ASoC: fsl: Support micfil on i.MX943

2025-01-14 Thread Shengjiu Wang

On i.MX943, the FIFO data address is changed and the bit width
of CICOSR is changed.
Add a new compatible string and update driver for these changes.

Shengjiu Wang (2):
  ASoC: fsl_micfil: Add i.MX943 platform support
  ASoC: dt-bindings: fsl,micfil: Add compatible string for i.MX943
platform

 .../devicetree/bindings/sound/fsl,micfil.yaml |  1 +
 sound/soc/fsl/fsl_micfil.c| 98 +++
 sound/soc/fsl/fsl_micfil.h|  2 +-
 3 files changed, 79 insertions(+), 22 deletions(-)

-- 
2.34.1

Re: [PATCH v2 1/7] powerpc: properly negate error in syscall_set_return_value()

2025-01-14 Thread Alexey Gladkov

On Mon, Jan 13, 2025 at 07:10:54PM +0200, Dmitry V. Levin wrote:
> Bring syscall_set_return_value() in sync with syscall_get_error(),
> and let upcoming ptrace/set_syscall_info selftest pass on powerpc.
> 
> This reverts commit 1b1a3702a65c ("powerpc: Don't negate error in
> syscall_set_return_value()").
> 
> Signed-off-by: Dmitry V. Levin 
> ---
>  arch/powerpc/include/asm/syscall.h | 6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/syscall.h 
> b/arch/powerpc/include/asm/syscall.h
> index 3dd36c5e334a..422d7735ace6 100644
> --- a/arch/powerpc/include/asm/syscall.h
> +++ b/arch/powerpc/include/asm/syscall.h
> @@ -82,7 +82,11 @@ static inline void syscall_set_return_value(struct 
> task_struct *task,
>*/
>   if (error) {
>   regs->ccr |= 0x1000L;
> - regs->gpr[3] = error;
> + /*
> +  * In case of an error regs->gpr[3] contains
> +  * a positive ERRORCODE.
> +  */
> + regs->gpr[3] = -error;

After this change the syscall_get_error() will return positive value if
the system call failed. Since syscall_get_error() still believes
regs->gpr[3] is still positive in case !trap_is_scv().

Or am I missing something?

It looks like the selftest you mentioned in the commit message doesn't
check the !trap_is_scv() branch.

>   } else {
>   regs->ccr &= ~0x1000L;
>   regs->gpr[3] = val;
> -- 
> ldv

-- 
Rgrds, legion

[powerpc:merge] BUILD SUCCESS 9295162062634a11f4cf8c700da9265b0747ad04

2025-01-14 Thread kernel test robot

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
merge
branch HEAD: 9295162062634a11f4cf8c700da9265b0747ad04  Automatic merge of 
'next' into merge (2025-01-13 18:04)

elapsed time: 1443m

configs tested: 121
configs skipped: 3

The following configs have been built successfully.
More configs may be tested in the coming days.

tested configs:
alpha allnoconfiggcc-14.2.0
alphaallyesconfiggcc-14.2.0
arc  allmodconfiggcc-13.2.0
arc   allnoconfiggcc-13.2.0
arc  allyesconfiggcc-13.2.0
arc   randconfig-001-20250113gcc-13.2.0
arc   randconfig-002-20250113gcc-13.2.0
arm  allmodconfiggcc-14.2.0
arm   allnoconfigclang-17
arm  allyesconfiggcc-14.2.0
armdove_defconfiggcc-14.2.0
arm   randconfig-001-20250113gcc-14.2.0
arm   randconfig-002-20250113gcc-14.2.0
arm   randconfig-003-20250113gcc-14.2.0
arm   randconfig-004-20250113clang-20
arm s3c6400_defconfiggcc-14.2.0
arm64 allnoconfiggcc-14.2.0
arm64 randconfig-001-20250113gcc-14.2.0
arm64 randconfig-002-20250113clang-18
arm64 randconfig-003-20250113clang-20
arm64 randconfig-004-20250113clang-20
csky  allnoconfiggcc-14.2.0
csky  randconfig-001-20250113gcc-14.2.0
csky  randconfig-002-20250113gcc-14.2.0
hexagon  allmodconfigclang-20
hexagon   allnoconfigclang-20
hexagon  allyesconfigclang-18
hexagon   randconfig-001-20250113clang-20
hexagon   randconfig-002-20250113clang-20
i386 allmodconfiggcc-12
i386  allnoconfiggcc-12
i386 allyesconfiggcc-12
i386buildonly-randconfig-001-20250113clang-19
i386buildonly-randconfig-002-20250113gcc-12
i386buildonly-randconfig-003-20250113clang-19
i386buildonly-randconfig-004-20250113clang-19
i386buildonly-randconfig-005-20250113clang-19
i386buildonly-randconfig-006-20250113clang-19
i386defconfigclang-19
loongarchallmodconfiggcc-14.2.0
loongarch allnoconfiggcc-14.2.0
loongarch randconfig-001-20250113gcc-14.2.0
loongarch randconfig-002-20250113gcc-14.2.0
m68k allmodconfiggcc-14.2.0
m68k  allnoconfiggcc-14.2.0
m68k allyesconfiggcc-14.2.0
m68kmvme16x_defconfiggcc-14.2.0
microblaze   allmodconfiggcc-14.2.0
microblazeallnoconfiggcc-14.2.0
microblaze   allyesconfiggcc-14.2.0
mips  allnoconfiggcc-14.2.0
mips   ip28_defconfiggcc-14.2.0
mips   mtx1_defconfigclang-20
mips   sb1250_swarm_defconfiggcc-14.2.0
nios2 allnoconfiggcc-14.2.0
nios2 randconfig-001-20250113gcc-14.2.0
nios2 randconfig-002-20250113gcc-14.2.0
openrisc  allnoconfiggcc-14.2.0
openrisc allyesconfiggcc-14.2.0
openriscdefconfiggcc-14.2.0
parisc   allmodconfiggcc-14.2.0
pariscallnoconfiggcc-14.2.0
parisc   allyesconfiggcc-14.2.0
parisc  defconfiggcc-14.2.0
pariscrandconfig-001-20250113gcc-14.2.0
pariscrandconfig-002-20250113gcc-14.2.0
powerpc   allnoconfiggcc-14.2.0
powerpc  allyesconfigclang-16
powerpc   motionpro_defconfigclang-17
powerpcmvme5100_defconfiggcc-14.2.0
powerpc  ppc64e_defconfiggcc-14.2.0
powerpc   randconfig-001-20250113clang-18
powerpc   randconfig-002-20250113gcc-14.2.0
powerpc   randconfig-003-20250113clang-20
powerpcsocrates_defconfiggcc-14.2.0
powerpc64 randconfig-001-20250113clang-20
powerpc64 randconfig-002-20250113gcc-14.2.0
powerpc64 randconfig-003-20250113gcc-14.2.0
ris

[PATCH] cpufreq: Use str_enable_disable-like helpers

2025-01-14 Thread Krzysztof Kozlowski

Replace ternary (condition ? "enable" : "disable") syntax with helpers
from string_choices.h because:
1. Simple function call with one argument is easier to read.  Ternary
   operator has three arguments and with wrapping might lead to quite
   long code.
2. Is slightly shorter thus also easier to read.
3. It brings uniformity in the text - same string.
4. Allows deduping by the linker, which results in a smaller binary
   file.

Signed-off-by: Krzysztof Kozlowski 
---
 drivers/cpufreq/cpufreq.c | 7 ---
 drivers/cpufreq/powernv-cpufreq.c | 3 ++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 418236fef172..fba62124b56a 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -602,12 +603,12 @@ static ssize_t store_boost(struct kobject *kobj, struct 
kobj_attribute *attr,
 
if (cpufreq_boost_trigger_state(enable)) {
pr_err("%s: Cannot %s BOOST!\n",
-  __func__, enable ? "enable" : "disable");
+  __func__, str_enable_disable(enable));
return -EINVAL;
}
 
pr_debug("%s: cpufreq BOOST %s\n",
-__func__, enable ? "enabled" : "disabled");
+__func__, str_enable_disable(enable));
 
return count;
 }
@@ -2812,7 +2813,7 @@ int cpufreq_boost_trigger_state(int state)
write_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
pr_err("%s: Cannot %s BOOST\n",
-  __func__, state ? "enable" : "disable");
+  __func__, str_enable_disable(state));
 
return ret;
 }
diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index 8de759247771..ae79d909943b 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -281,7 +282,7 @@ static int init_powernv_pstates(void)
pr_info("cpufreq pstate min 0x%x nominal 0x%x max 0x%x\n", pstate_min,
pstate_nominal, pstate_max);
pr_info("Workload Optimized Frequency is %s in the platform\n",
-   (powernv_pstate_info.wof_enabled) ? "enabled" : "disabled");
+   str_enabled_disabled(powernv_pstate_info.wof_enabled));
 
pstate_ids = of_get_property(power_mgt, "ibm,pstate-ids", &len_ids);
if (!pstate_ids) {
-- 
2.43.0

Re: [PATCH v4 1/6] elf: Define note name macros

2025-01-14 Thread Dave Martin

Hi,

On Sat, Jan 11, 2025 at 02:48:44PM +0900, Akihiko Odaki wrote:
> elf.h had a comment saying:
> > Notes used in ET_CORE. Architectures export some of the arch register
> > sets using the corresponding note types via the PTRACE_GETREGSET and
> > PTRACE_SETREGSET requests.
> > The note name for these types is "LINUX", except NT_PRFPREG that is
> > named "CORE".
> 
> However, NT_PRSTATUS is also named "CORE". It is also unclear what
> "these types" refers to.
> 
> To fix these problems, define a name for each note type. The added
> definitions are macros so the kernel and userspace can directly refer to
> them.

I guess another motivation is to move towards having elf.h as the
single point of definition for the note names, so that obscure and
duplicate logic for determining note names can be removed.

(Could be worth adding a note on this is the patch is respun, but this
would be overkill otherwise...)


> Signed-off-by: Akihiko Odaki 
> Acked-by: Baoquan He 
> ---
>  include/uapi/linux/elf.h | 86 
> ++--
>  1 file changed, 83 insertions(+), 3 deletions(-)
> 
> diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
> index b44069d29cec..343f5c40d03a 100644
> --- a/include/uapi/linux/elf.h
> +++ b/include/uapi/linux/elf.h
> @@ -368,99 +368,179 @@ typedef struct elf64_shdr {
>  #define ELF_OSABI ELFOSABI_NONE
>  #endif
>  
> +/* Note definitions: NN_ defines names. NT_ defines types. */
> +
>  /*
>   * Notes used in ET_CORE. Architectures export some of the arch register sets
>   * using the corresponding note types via the PTRACE_GETREGSET and
>   * PTRACE_SETREGSET requests.
> - * The note name for these types is "LINUX", except NT_PRFPREG that is named
> - * "CORE".
>   */
> +#define NN_PRSTATUS  "CORE"
>  #define NT_PRSTATUS  1
> +#define NN_PRFPREG   "CORE"
>  #define NT_PRFPREG   2
> +#define NN_PRPSINFO  "CORE"
>  #define NT_PRPSINFO  3
> +#define NN_TASKSTRUCT"CORE"
>  #define NT_TASKSTRUCT4
> +#define NN_AUXV  "CORE"
>  #define NT_AUXV  6
>  /*
>   * Note to userspace developers: size of NT_SIGINFO note may increase
>   * in the future to accomodate more fields, don't assume it is fixed!
>   */
> +#define NN_SIGINFO  "CORE"
>  #define NT_SIGINFO  0x53494749
> +#define NN_FILE "CORE"
>  #define NT_FILE 0x46494c45
> +#define NN_PRXFPREG "LINUX"
>  #define NT_PRXFPREG 0x46e62b7f  /* copied from 
> gdb5.1/include/elf/common.h */

[...]

> +#define NN_LOONGARCH_HW_WATCH"LINUX"
>  #define NT_LOONGARCH_HW_WATCH0xa06   /* LoongArch hardware 
> watchpoint registers */
>  
> -/* Note types with note name "GNU" */
> +/* Note used in other file types. */

For this, it may make sense to be vaguer, e.g.

/* Other notes */

(We don't dump NT_GNU_PROPERTY_TYPE_0 in coredumps, but I don't think
it would be nonsensical to do so.)

> +#define NN_GNU_PROPERTY_TYPE_0   "GNU"
>  #define NT_GNU_PROPERTY_TYPE_0   5

[...]

With or without those changes,

Reviewed-by: Dave Martin

Re: [PATCH v4 3/6] powerpc: Use note name macros

2025-01-14 Thread Dave Martin

Hi,

On Sat, Jan 11, 2025 at 02:48:46PM +0900, Akihiko Odaki wrote:
> Use note name macros to match with the userspace's expectation.

Note, there seems to be some manual encoding of some arch-specific
notes in arch/powerpc/boot/addnote.c, but since this is all hard-coded
and the note types are not taken from elf.h anyway, I think this is
probably out of scope for this series.  I haven't tried to understand
fully what that code is doing.

Anyway:

Reviewed-by: Dave Martin 

> 
> Acked-by: Baoquan He 
> Signed-off-by: Akihiko Odaki 
> ---
>  arch/powerpc/kernel/fadump.c   | 2 +-
>  arch/powerpc/platforms/powernv/opal-core.c | 8 
>  2 files changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
> index 4b371c738213..d44349fe8e2b 100644
> --- a/arch/powerpc/kernel/fadump.c
> +++ b/arch/powerpc/kernel/fadump.c
> @@ -751,7 +751,7 @@ u32 *__init fadump_regs_to_elf_notes(u32 *buf, struct 
> pt_regs *regs)
>* prstatus.pr_pid = 
>*/
>   elf_core_copy_regs(&prstatus.pr_reg, regs);
> - buf = append_elf_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
> + buf = append_elf_note(buf, NN_PRSTATUS, NT_PRSTATUS,
> &prstatus, sizeof(prstatus));
>   return buf;
>  }
> diff --git a/arch/powerpc/platforms/powernv/opal-core.c 
> b/arch/powerpc/platforms/powernv/opal-core.c
> index c9a9b759cc92..a379ff86c120 100644
> --- a/arch/powerpc/platforms/powernv/opal-core.c
> +++ b/arch/powerpc/platforms/powernv/opal-core.c
> @@ -149,7 +149,7 @@ static Elf64_Word *__init auxv_to_elf64_notes(Elf64_Word 
> *buf,
>   /* end of vector */
>   bufp[idx++] = cpu_to_be64(AT_NULL);
>  
> - buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_AUXV,
> + buf = append_elf64_note(buf, NN_AUXV, NT_AUXV,
>   oc_conf->auxv_buf, AUXV_DESC_SZ);
>   return buf;
>  }
> @@ -252,7 +252,7 @@ static Elf64_Word * __init 
> opalcore_append_cpu_notes(Elf64_Word *buf)
>* crashing CPU's prstatus.
>*/
>   first_cpu_note = buf;
> - buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
> + buf = append_elf64_note(buf, NN_PRSTATUS, NT_PRSTATUS,
>   &prstatus, sizeof(prstatus));
>  
>   for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) {
> @@ -279,7 +279,7 @@ static Elf64_Word * __init 
> opalcore_append_cpu_notes(Elf64_Word *buf)
>   fill_prstatus(&prstatus, thread_pir, ®s);
>  
>   if (thread_pir != oc_conf->crashing_cpu) {
> - buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME,
> + buf = append_elf64_note(buf, NN_PRSTATUS,
>   NT_PRSTATUS, &prstatus,
>   sizeof(prstatus));
>   } else {
> @@ -287,7 +287,7 @@ static Elf64_Word * __init 
> opalcore_append_cpu_notes(Elf64_Word *buf)
>* Add crashing CPU as the first NT_PRSTATUS note for
>* GDB to process the core file appropriately.
>*/
> - append_elf64_note(first_cpu_note, CRASH_CORE_NOTE_NAME,
> + append_elf64_note(first_cpu_note, NN_PRSTATUS,
> NT_PRSTATUS, &prstatus,
> sizeof(prstatus));
>   }
> 
> -- 
> 2.47.1
> 
>

Re: [PATCH v4 2/6] binfmt_elf: Use note name macros

2025-01-14 Thread Dave Martin

On Sat, Jan 11, 2025 at 02:48:45PM +0900, Akihiko Odaki wrote:
> Use note name macros to match with the userspace's expectation.
> 
> Signed-off-by: Akihiko Odaki 
> Acked-by: Baoquan He 

Reviewed-by: Dave Martin 

> ---
>  fs/binfmt_elf.c   | 21 ++---
>  fs/binfmt_elf_fdpic.c |  8 
>  2 files changed, 14 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 106f0e8af177..5b4a92e5e508 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -762,8 +762,7 @@ static int parse_elf_property(const char *data, size_t 
> *off, size_t datasz,
>  }
>  
>  #define NOTE_DATA_SZ SZ_1K
> -#define GNU_PROPERTY_TYPE_0_NAME "GNU"
> -#define NOTE_NAME_SZ (sizeof(GNU_PROPERTY_TYPE_0_NAME))
> +#define NOTE_NAME_SZ (sizeof(NN_GNU_PROPERTY_TYPE_0))
>  
>  static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr,
>   struct arch_elf_state *arch)
> @@ -800,7 +799,7 @@ static int parse_elf_properties(struct file *f, const 
> struct elf_phdr *phdr,
>   if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 ||
>   note.nhdr.n_namesz != NOTE_NAME_SZ ||
>   strncmp(note.data + sizeof(note.nhdr),
> - GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr)))
> + NN_GNU_PROPERTY_TYPE_0, n - sizeof(note.nhdr)))
>   return -ENOEXEC;
>  
>   off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ,
> @@ -1603,14 +1602,14 @@ static void fill_auxv_note(struct memelfnote *note, 
> struct mm_struct *mm)
>   do
>   i += 2;
>   while (auxv[i - 2] != AT_NULL);
> - fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
> + fill_note(note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv);
>  }
>  
>  static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t 
> *csigdata,
>   const kernel_siginfo_t *siginfo)
>  {
>   copy_siginfo_to_external(csigdata, siginfo);
> - fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
> + fill_note(note, NN_SIGINFO, NT_SIGINFO, sizeof(*csigdata), csigdata);
>  }
>  
>  /*
> @@ -1706,7 +1705,7 @@ static int fill_files_note(struct memelfnote *note, 
> struct coredump_params *cprm
>   }
>  
>   size = name_curpos - (char *)data;
> - fill_note(note, "CORE", NT_FILE, size, data);
> + fill_note(note, NN_FILE, NT_FILE, size, data);
>   return 0;
>  }
>  
> @@ -1767,7 +1766,7 @@ static int fill_thread_core_info(struct 
> elf_thread_core_info *t,
>   regset_get(t->task, &view->regsets[0],
>  sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
>  
> - fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
> + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS,
> PRSTATUS_SIZE, &t->prstatus);
>   info->size += notesize(&t->notes[0]);
>  
> @@ -1801,7 +1800,7 @@ static int fill_thread_core_info(struct 
> elf_thread_core_info *t,
>   if (is_fpreg)
>   SET_PR_FPVALID(&t->prstatus);
>  
> - fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX",
> + fill_note(&t->notes[note_iter], is_fpreg ? NN_PRFPREG : "LINUX",
> note_type, ret, data);
>  
>   info->size += notesize(&t->notes[note_iter]);
> @@ -1821,7 +1820,7 @@ static int fill_thread_core_info(struct 
> elf_thread_core_info *t,
>   fill_prstatus(&t->prstatus.common, p, signr);
>   elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
>  
> - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
> + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus),
> &(t->prstatus));
>   info->size += notesize(&t->notes[0]);
>  
> @@ -1832,7 +1831,7 @@ static int fill_thread_core_info(struct 
> elf_thread_core_info *t,
>   }
>  
>   t->prstatus.pr_fpvalid = 1;
> - fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
> + fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(*fpu), fpu);
>   info->size += notesize(&t->notes[1]);
>  
>   return 1;
> @@ -1852,7 +1851,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
>   psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
>   if (!psinfo)
>   return 0;
> - fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
> + fill_note(&info->psinfo, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), 
> psinfo);
>  
>  #ifdef CORE_DUMP_USE_REGSET
>   view = task_user_regset_view(dump_task);
> diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
> index f1a7c4875c4a..96bd9d2f23d7 100644
> --- a/fs/binfmt_elf_fdpic.c
> +++ b/fs/binfmt_elf_fdpic.c
> @@ -1397,7 +1397,7 @@ static struct elf_thread_status 
> *elf_dump_thread_status(long signr, struct task_
>   regset_get(p, &view->regsets[0],
>  sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
>  
> - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->

Re: [PATCH v4 4/6] crash: Use note name macros

2025-01-14 Thread Dave Martin

On Sat, Jan 11, 2025 at 02:48:47PM +0900, Akihiko Odaki wrote:
> Use note name macros to match with the userspace's expectation.
> 
> Signed-off-by: Akihiko Odaki 
> Acked-by: Baoquan He 

Reviewed-by: Dave Martin 

> ---
>  fs/proc/kcore.c | 12 ++--
>  include/linux/vmcore_info.h |  2 +-
>  kernel/crash_core.c |  2 +-
>  3 files changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
> index e376f48c4b8b..e5612313b8b4 100644
> --- a/fs/proc/kcore.c
> +++ b/fs/proc/kcore.c
> @@ -34,8 +34,6 @@
>  #include 
>  #include "internal.h"
>  
> -#define CORE_STR "CORE"
> -
>  #ifndef ELF_CORE_EFLAGS
>  #define ELF_CORE_EFLAGS  0
>  #endif
> @@ -119,7 +117,9 @@ static size_t get_kcore_size(int *nphdr, size_t 
> *phdrs_len, size_t *notes_len,
>  
>   *phdrs_len = *nphdr * sizeof(struct elf_phdr);
>   *notes_len = (4 * sizeof(struct elf_note) +
> -   3 * ALIGN(sizeof(CORE_STR), 4) +
> +   ALIGN(sizeof(NN_PRSTATUS), 4) +
> +   ALIGN(sizeof(NN_PRPSINFO), 4) +
> +   ALIGN(sizeof(NN_TASKSTRUCT), 4) +
> VMCOREINFO_NOTE_NAME_BYTES +
> ALIGN(sizeof(struct elf_prstatus), 4) +
> ALIGN(sizeof(struct elf_prpsinfo), 4) +
> @@ -444,11 +444,11 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, 
> struct iov_iter *iter)
>   goto out;
>   }
>  
> - append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus,
> + append_kcore_note(notes, &i, NN_PRSTATUS, NT_PRSTATUS, 
> &prstatus,
> sizeof(prstatus));
> - append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo,
> + append_kcore_note(notes, &i, NN_PRPSINFO, NT_PRPSINFO, 
> &prpsinfo,
> sizeof(prpsinfo));
> - append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current,
> + append_kcore_note(notes, &i, NN_TASKSTRUCT, NT_TASKSTRUCT, 
> current,
> arch_task_struct_size);
>   /*
>* vmcoreinfo_size is mostly constant after init time, but it
> diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h
> index e1dec1a6a749..1672801fd98c 100644
> --- a/include/linux/vmcore_info.h
> +++ b/include/linux/vmcore_info.h
> @@ -8,7 +8,7 @@
>  
>  #define CRASH_CORE_NOTE_NAME"CORE"
>  #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
> -#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4)
> +#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4)
>  #define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4)
>  
>  /*
> diff --git a/kernel/crash_core.c b/kernel/crash_core.c
> index 078fe5bc5a74..335b8425dd4b 100644
> --- a/kernel/crash_core.c
> +++ b/kernel/crash_core.c
> @@ -436,7 +436,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
>   memset(&prstatus, 0, sizeof(prstatus));
>   prstatus.common.pr_pid = current->pid;
>   elf_core_copy_regs(&prstatus.pr_reg, regs);
> - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
> + buf = append_elf_note(buf, NN_PRSTATUS, NT_PRSTATUS,
> &prstatus, sizeof(prstatus));
>   final_note(buf);
>  }
> 
> -- 
> 2.47.1
> 
>

Re: [PATCH v4 5/6] s390/crash: Use note name macros

2025-01-14 Thread Dave Martin

Hi,

On Sat, Jan 11, 2025 at 02:48:48PM +0900, Akihiko Odaki wrote:
> Use note name macros to match with the userspace's expectation.
> 
> Signed-off-by: Akihiko Odaki 
> Acked-by: Heiko Carstens 

Reviewed-by: Dave Martin 

(I'm not in a position to test this, though.)

> ---
>  arch/s390/kernel/crash_dump.c | 62 
> ---
>  1 file changed, 23 insertions(+), 39 deletions(-)
> 
> diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
> index cd0c93a8fb8b..022f4f198edf 100644
> --- a/arch/s390/kernel/crash_dump.c
> +++ b/arch/s390/kernel/crash_dump.c
> @@ -248,15 +248,6 @@ bool is_kdump_kernel(void)
>  }
>  EXPORT_SYMBOL_GPL(is_kdump_kernel);
>  
> -static const char *nt_name(Elf64_Word type)
> -{
> - const char *name = "LINUX";
> -
> - if (type == NT_PRPSINFO || type == NT_PRSTATUS || type == NT_PRFPREG)
> - name = KEXEC_CORE_NOTE_NAME;
> - return name;
> -}
> -
>  /*
>   * Initialize ELF note
>   */
> @@ -281,10 +272,8 @@ static void *nt_init_name(void *buf, Elf64_Word type, 
> void *desc, int d_len,
>   return PTR_ADD(buf, len);
>  }
>  
> -static inline void *nt_init(void *buf, Elf64_Word type, void *desc, int 
> d_len)
> -{
> - return nt_init_name(buf, type, desc, d_len, nt_name(type));
> -}
> +#define nt_init(buf, type, desc) \
> + nt_init_name(buf, NT_ ## type, &(desc), sizeof(desc), NN_ ## type)
>  
>  /*
>   * Calculate the size of ELF note
> @@ -300,10 +289,7 @@ static size_t nt_size_name(int d_len, const char *name)
>   return size;
>  }
>  
> -static inline size_t nt_size(Elf64_Word type, int d_len)
> -{
> - return nt_size_name(d_len, nt_name(type));
> -}
> +#define nt_size(type, desc) nt_size_name(sizeof(desc), NN_ ## type)
>  
>  /*
>   * Fill ELF notes for one CPU with save area registers
> @@ -324,18 +310,16 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, 
> struct save_area *sa)
>   memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc));
>   memcpy(&nt_fpregset.fprs, &sa->fprs, sizeof(sa->fprs));
>   /* Create ELF notes for the CPU */
> - ptr = nt_init(ptr, NT_PRSTATUS, &nt_prstatus, sizeof(nt_prstatus));
> - ptr = nt_init(ptr, NT_PRFPREG, &nt_fpregset, sizeof(nt_fpregset));
> - ptr = nt_init(ptr, NT_S390_TIMER, &sa->timer, sizeof(sa->timer));
> - ptr = nt_init(ptr, NT_S390_TODCMP, &sa->todcmp, sizeof(sa->todcmp));
> - ptr = nt_init(ptr, NT_S390_TODPREG, &sa->todpreg, sizeof(sa->todpreg));
> - ptr = nt_init(ptr, NT_S390_CTRS, &sa->ctrs, sizeof(sa->ctrs));
> - ptr = nt_init(ptr, NT_S390_PREFIX, &sa->prefix, sizeof(sa->prefix));
> + ptr = nt_init(ptr, PRSTATUS, nt_prstatus);
> + ptr = nt_init(ptr, PRFPREG, nt_fpregset);
> + ptr = nt_init(ptr, S390_TIMER, sa->timer);
> + ptr = nt_init(ptr, S390_TODCMP, sa->todcmp);
> + ptr = nt_init(ptr, S390_TODPREG, sa->todpreg);
> + ptr = nt_init(ptr, S390_CTRS, sa->ctrs);
> + ptr = nt_init(ptr, S390_PREFIX, sa->prefix);
>   if (cpu_has_vx()) {
> - ptr = nt_init(ptr, NT_S390_VXRS_HIGH,
> -   &sa->vxrs_high, sizeof(sa->vxrs_high));
> - ptr = nt_init(ptr, NT_S390_VXRS_LOW,
> -   &sa->vxrs_low, sizeof(sa->vxrs_low));
> + ptr = nt_init(ptr, S390_VXRS_HIGH, sa->vxrs_high);
> + ptr = nt_init(ptr, S390_VXRS_LOW, sa->vxrs_low);
>   }
>   return ptr;
>  }
> @@ -348,16 +332,16 @@ static size_t get_cpu_elf_notes_size(void)
>   struct save_area *sa = NULL;
>   size_t size;
>  
> - size =  nt_size(NT_PRSTATUS, sizeof(struct elf_prstatus));
> - size +=  nt_size(NT_PRFPREG, sizeof(elf_fpregset_t));
> - size +=  nt_size(NT_S390_TIMER, sizeof(sa->timer));
> - size +=  nt_size(NT_S390_TODCMP, sizeof(sa->todcmp));
> - size +=  nt_size(NT_S390_TODPREG, sizeof(sa->todpreg));
> - size +=  nt_size(NT_S390_CTRS, sizeof(sa->ctrs));
> - size +=  nt_size(NT_S390_PREFIX, sizeof(sa->prefix));
> + size =  nt_size(PRSTATUS, struct elf_prstatus);
> + size += nt_size(PRFPREG, elf_fpregset_t);
> + size += nt_size(S390_TIMER, sa->timer);
> + size += nt_size(S390_TODCMP, sa->todcmp);
> + size += nt_size(S390_TODPREG, sa->todpreg);
> + size += nt_size(S390_CTRS, sa->ctrs);
> + size += nt_size(S390_PREFIX, sa->prefix);
>   if (cpu_has_vx()) {
> - size += nt_size(NT_S390_VXRS_HIGH, sizeof(sa->vxrs_high));
> - size += nt_size(NT_S390_VXRS_LOW, sizeof(sa->vxrs_low));
> + size += nt_size(S390_VXRS_HIGH, sa->vxrs_high);
> + size += nt_size(S390_VXRS_LOW, sa->vxrs_low);
>   }
>  
>   return size;
> @@ -373,7 +357,7 @@ static void *nt_prpsinfo(void *ptr)
>   memset(&prpsinfo, 0, sizeof(prpsinfo));
>   prpsinfo.pr_sname = 'R';
>   strcpy(prpsinfo.pr_fname, "vmlinux");
> - return nt_init(ptr, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo));
> + return nt_init(ptr, PRPSINFO, prpsinfo);
>  }
>

Re: [PATCH v4 6/6] crash: Remove KEXEC_CORE_NOTE_NAME

2025-01-14 Thread Dave Martin

Hi,

On Sat, Jan 11, 2025 at 02:48:49PM +0900, Akihiko Odaki wrote:
> KEXEC_CORE_NOTE_NAME is no longer used.
> 
> Signed-off-by: Akihiko Odaki 
> Acked-by: Baoquan He 

Reviewed-by: Dave Martin 

> ---
>  include/linux/kexec.h   | 2 --
>  include/linux/vmcore_info.h | 1 -
>  2 files changed, 3 deletions(-)
> 
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index f0e9f8eda7a3..c840431eadda 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -68,8 +68,6 @@ extern note_buf_t __percpu *crash_notes;
>  #define KEXEC_CRASH_MEM_ALIGN PAGE_SIZE
>  #endif
>  
> -#define KEXEC_CORE_NOTE_NAME CRASH_CORE_NOTE_NAME
> -
>  /*
>   * This structure is used to hold the arguments that are used when loading
>   * kernel binaries.
> diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h
> index 1672801fd98c..37e003ae5262 100644
> --- a/include/linux/vmcore_info.h
> +++ b/include/linux/vmcore_info.h
> @@ -6,7 +6,6 @@
>  #include 
>  #include 
>  
> -#define CRASH_CORE_NOTE_NAME"CORE"
>  #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
>  #define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4)
>  #define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4)
> 
> -- 
> 2.47.1
> 
>

Re: [PATCH v2 09/18] riscv: vdso: Switch to generic storage implementation

2025-01-14 Thread Conor Dooley

On Tue, Jan 14, 2025 at 09:40:11AM +0100, Thomas Weißschuh wrote:
> On Mon, Jan 13, 2025 at 07:48:15PM +, Conor Dooley wrote:
> > On Fri, Jan 10, 2025 at 04:23:48PM +0100, Thomas Weißschuh wrote:
> > > The generic storage implementation provides the same features as the
> > > custom one. However it can be shared between architectures, making
> > > maintenance easier.
> > > 
> > > Co-developed-by: Nam Cao 
> > > Signed-off-by: Nam Cao 
> > > Signed-off-by: Thomas Weißschuh 
> > 
> > For rv64, nommu:
> >   LD  vmlinux
> > ld.lld: error: undefined symbol: vmf_insert_pfn
> > >>> referenced by datastore.c
> > >>>   lib/vdso/datastore.o:(vvar_fault) in archive vmlinux.a
> > 
> > ld.lld: error: undefined symbol: _install_special_mapping
> > >>> referenced by datastore.c
> > >>>   lib/vdso/datastore.o:(vdso_install_vvar_mapping) in 
> > >>> archive vmlinux.a
> > 
> > Later patches in the series don't make it build again.
> > rv32 builds now though, so thanks for fixing that.
> 
> Thanks for the report.
> Can you try to diff below?

Ye, that resolved it. Thanks!


signature.asc
Description: PGP signature

Re: [PATCH v6 12/26] mm/memory: Enhance insert_page_into_pte_locked() to create writable mappings

2025-01-14 Thread David Hildenbrand


On 10.01.25 07:00, Alistair Popple wrote:

In preparation for using insert_page() for DAX, enhance
insert_page_into_pte_locked() to handle establishing writable
mappings.  Recall that DAX returns VM_FAULT_NOPAGE after installing a
PTE which bypasses the typical set_pte_range() in finish_fault.

Signed-off-by: Alistair Popple 
Suggested-by: Dan Williams 

---

Changes for v5:
  - Minor comment/formatting fixes suggested by David Hildenbrand

Changes since v2:

  - New patch split out from "mm/memory: Add dax_insert_pfn"
---
  mm/memory.c | 37 +
  1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 06bb29e..8531acb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2126,19 +2126,40 @@ static int validate_page_before_insert(struct 
vm_area_struct *vma,
  }
  
  static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,

-   unsigned long addr, struct page *page, pgprot_t prot)
+   unsigned long addr, struct page *page,
+   pgprot_t prot, bool mkwrite)
  {
struct folio *folio = page_folio(page);
+   pte_t entry = ptep_get(pte);
pte_t pteval;

>

Just drop "entry" and reuse "pteval"; even saves you from one bug below :)

pte_t pteval = ptep_get(pte);


-   if (!pte_none(ptep_get(pte)))
-   return -EBUSY;
+   if (!pte_none(entry)) {
+   if (!mkwrite)
+   return -EBUSY;
+
+   /* see insert_pfn(). */
+   if (pte_pfn(entry) != page_to_pfn(page)) {
+   WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
+   return -EFAULT;
+   }
+   entry = maybe_mkwrite(entry, vma);
+   entry = pte_mkyoung(entry);
+   if (ptep_set_access_flags(vma, addr, pte, entry, 1))
+   update_mmu_cache(vma, addr, pte);
+   return 0;
+   }
+
/* Ok, finally just insert the thing.. */
pteval = mk_pte(page, prot);
if (unlikely(is_zero_folio(folio))) {
pteval = pte_mkspecial(pteval);
} else {
folio_get(folio);
+   entry = mk_pte(page, prot);


we already do "pteval = mk_pte(page, prot);" above?

And I think your change here does not do what you want, because you
modify the new "entry" but we do

set_pte_at(vma->vm_mm, addr, pte, pteval);

below ...


+   if (mkwrite) {
+   entry = pte_mkyoung(entry);
+   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+   }


So again, better just reuse pteval :)

--
Cheers,

David / dhildenb

Re: [PATCH v6 13/26] mm/memory: Add vmf_insert_page_mkwrite()

2025-01-14 Thread David Hildenbrand


On 10.01.25 07:00, Alistair Popple wrote:

Currently to map a DAX page the DAX driver calls vmf_insert_pfn. This
creates a special devmap PTE entry for the pfn but does not take a
reference on the underlying struct page for the mapping. This is
because DAX page refcounts are treated specially, as indicated by the
presence of a devmap entry.

To allow DAX page refcounts to be managed the same as normal page
refcounts introduce vmf_insert_page_mkwrite(). This will take a
reference on the underlying page much the same as vmf_insert_page,
except it also permits upgrading an existing mapping to be writable if
requested/possible.

Signed-off-by: Alistair Popple 

---

Updates from v2:

  - Rename function to make not DAX specific

  - Split the insert_page_into_pte_locked() change into a separate
patch.

Updates from v1:

  - Re-arrange code in insert_page_into_pte_locked() based on comments
from Jan Kara.

  - Call mkdrity/mkyoung for the mkwrite case, also suggested by Jan.
---
  include/linux/mm.h |  2 ++
  mm/memory.c| 36 
  2 files changed, 38 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e790298..f267b06 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3620,6 +3620,8 @@ int vm_map_pages(struct vm_area_struct *vma, struct page 
**pages,
unsigned long num);
  int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
unsigned long num);
+vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
+   bool write);
  vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn);
  vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
diff --git a/mm/memory.c b/mm/memory.c
index 8531acb..c60b819 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2624,6 +2624,42 @@ static vm_fault_t __vm_insert_mixed(struct 
vm_area_struct *vma,
return VM_FAULT_NOPAGE;
  }
  
+vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,

+   bool write)
+{
+   struct vm_area_struct *vma = vmf->vma;
+   pgprot_t pgprot = vma->vm_page_prot;
+   unsigned long pfn = page_to_pfn(page);
+   unsigned long addr = vmf->address;
+   int err;
+
+   if (addr < vma->vm_start || addr >= vma->vm_end)
+   return VM_FAULT_SIGBUS;
+
+   track_pfn_insert(vma, &pgprot, pfn_to_pfn_t(pfn));


I think I raised this before: why is this track_pfn_insert() in here? It 
only ever does something to VM_PFNMAP mappings, and that cannot possibly 
be the case here (nothing in VM_PFNMAP is refcounted, ever)?



+
+   if (!pfn_modify_allowed(pfn, pgprot))
+   return VM_FAULT_SIGBUS;


Why is that required? Why are we messing so much with PFNs? :)

Note that x86 does in there

/* If it's real memory always allow */
if (pfn_valid(pfn))
return true;

See below, when would we ever have a "struct page *" but !pfn_valid() ?



+
+   /*
+* We refcount the page normally so make sure pfn_valid is true.
+*/
+   if (!pfn_valid(pfn))
+   return VM_FAULT_SIGBUS;


Somebody gave us a "struct page", how could the pfn ever by invalid (not 
have a struct page)?


I think all of the above regarding PFNs should be dropped -- unless I am 
missing something important.



+
+   if (WARN_ON(is_zero_pfn(pfn) && write))
+   return VM_FAULT_SIGBUS;


is_zero_page() if you already have the "page". But note that in 
validate_page_before_insert() we do have a check that allows for 
conditional insertion of the shared zeropage.


So maybe this hunk is also not required.


+
+   err = insert_page(vma, addr, page, pgprot, write);
+   if (err == -ENOMEM)
+   return VM_FAULT_OOM;
+   if (err < 0 && err != -EBUSY)
+   return VM_FAULT_SIGBUS;
+
+   return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite);






--
Cheers,

David / dhildenb

Re: [PATCH v6 15/26] huge_memory: Add vmf_insert_folio_pud()

2025-01-14 Thread David Hildenbrand


On 10.01.25 07:00, Alistair Popple wrote:

Currently DAX folio/page reference counts are managed differently to
normal pages. To allow these to be managed the same as normal pages
introduce vmf_insert_folio_pud. This will map the entire PUD-sized folio
and take references as it would for a normally mapped page.

This is distinct from the current mechanism, vmf_insert_pfn_pud, which
simply inserts a special devmap PUD entry into the page table without
holding a reference to the page for the mapping.

Signed-off-by: Alistair Popple 


[...]


+/**
+ * vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry
+ * @vmf: Structure describing the fault
+ * @folio: folio to insert
+ * @write: whether it's a write fault
+ *
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, 
bool write)
+{
+   struct vm_area_struct *vma = vmf->vma;
+   unsigned long addr = vmf->address & PUD_MASK;
+   pud_t *pud = vmf->pud;
+   struct mm_struct *mm = vma->vm_mm;
+   spinlock_t *ptl;
+
+   if (addr < vma->vm_start || addr >= vma->vm_end)
+   return VM_FAULT_SIGBUS;
+
+   if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
+   return VM_FAULT_SIGBUS;
+
+   ptl = pud_lock(mm, pud);
+   if (pud_none(*vmf->pud)) {
+   folio_get(folio);
+   folio_add_file_rmap_pud(folio, &folio->page, vma);
+   add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
+   }
+   insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)), 
write);


This looks scary at first (inserting something when not taking a 
reference), but insert_pfn_pud() seems to handle that. A comment here 
would have been nice.


It's weird, though, that if there is already something else, that we 
only WARN but don't actually return an error. So ...



+   spin_unlock(ptl);
+
+   return VM_FAULT_NOPAGE;


I assume always returning VM_FAULT_NOPAGE, even when something went 
wrong, is the right thing to do?


Apart from that LGTM.


--
Cheers,

David / dhildenb

Re: [PATCH v6 16/26] huge_memory: Add vmf_insert_folio_pmd()

2025-01-14 Thread David Hildenbrand


+vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, 
bool write)
+{
+   struct vm_area_struct *vma = vmf->vma;
+   unsigned long addr = vmf->address & PMD_MASK;
+   struct mm_struct *mm = vma->vm_mm;
+   spinlock_t *ptl;
+   pgtable_t pgtable = NULL;
+
+   if (addr < vma->vm_start || addr >= vma->vm_end)
+   return VM_FAULT_SIGBUS;
+
+   if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
+   return VM_FAULT_SIGBUS;
+
+   if (arch_needs_pgtable_deposit()) {
+   pgtable = pte_alloc_one(vma->vm_mm);
+   if (!pgtable)
+   return VM_FAULT_OOM;
+   }


This is interesting and nasty at the same time (only to make ppc64 boo3s 
with has tables happy). But it seems to be the right thing to do.



+
+   ptl = pmd_lock(mm, vmf->pmd);
+   if (pmd_none(*vmf->pmd)) {
+   folio_get(folio);
+   folio_add_file_rmap_pmd(folio, &folio->page, vma);
+   add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
+   }
+   insert_pfn_pmd(vma, addr, vmf->pmd, pfn_to_pfn_t(folio_pfn(folio)),
+  vma->vm_page_prot, write, pgtable);
+   spin_unlock(ptl);
+   if (pgtable)
+   pte_free(mm, pgtable);


Ehm, are you unconditionally freeing the pgtable, even if consumed by 
insert_pfn_pmd() ?


Note that setting pgtable to NULL in insert_pfn_pmd() when consumed will 
not be visible here.


You'd have to pass a pointer to the ... pointer (&pgtable).

... unless I am missing something, staring at the diff.

--
Cheers,

David / dhildenb

Re: [PATCH v6 19/26] proc/task_mmu: Mark devdax and fsdax pages as always unpinned

2025-01-14 Thread David Hildenbrand


On 14.01.25 03:28, Dan Williams wrote:

Alistair Popple wrote:

The procfs mmu files such as smaps and pagemap currently ignore devdax and
fsdax pages because these pages are considered special. A future change
will start treating these as normal pages, meaning they can be exposed via
smaps and pagemap.

The only difference is that devdax and fsdax pages can never be pinned for
DMA via FOLL_LONGTERM, so add an explicit check in pte_is_pinned() to
reflect that.


I don't understand this patch.



This whole pte_is_pinned() should likely be ripped out (and I have a 
patch here to do that for a long time).


But that's a different discussion.



pin_user_pages() is also used for Direct-I/O page pinning, so the
comment about FOLL_LONGTERM is wrong, and I otherwise do not understand
what goes wrong if the only pte_is_pinned() user correctly detects the
pin state?


Yes, this patch should likely just be dropped.

Even if folio_maybe_dma_pinned() == true because of "false positives", 
it will behave just like other order-0 pages with false positives, and 
only affect soft-dirty tracking ... which nobody should be caring about 
here at all.


We would always detect the PTE as soft-dirty because we we never
pte_wrprotect(old_pte)

Yes, nobody should care.

--
Cheers,

David / dhildenb

[PATCH v9 0/8] PCI: Consolidate TLP Log reading and printing

2025-01-14 Thread Ilpo Järvinen

This series has the remaining patches of the AER & DPC TLP Log handling
consolidation and now includes a few minor improvements to the earlier
accepted TLP Logging code.

v9:
- Added patch to define header logging register sizes.

v8:
- Added missing parameter to kerneldoc.
- Dropped last patch due to conflict with the pci_printk() cleanup
  series (will move the patch into that series).

v7:
- Explain in commit message reasoning why eetlp_prefix_max stores Max
  End-End TLP Prefixes value instead of limiting it by the bridge/RP
  imposed limits
- Take account TLP Prefix Log Present flag.
- Align PCI_ERR_CAP_* flags in pci_regs.h
- Add EE_PREFIX_STR define to be able to take its sizeof() for output
  char[] sizing.

v6:
- Preserve "AER:"/"DPC:" prefix on the printed TLP line
- New patch to add "AER:" also  on other lines of the AER error dump

v5:
- Fix build with AER=y and DPC=n
- Match kerneldoc and function parameter name

v4:
- Added patches:
- Remove EXPORT of pcie_read_tlp_log()
- Moved code to pcie/tlp.c and build only with AER enabled
- Match variables in prototype and function
- int -> unsigned int conversion
- eetlp_prefix_max into own patch
- struct pcie_tlp_log param consistently called "log" within tlp.c
- Moved function prototypes into drivers/pci/pci.h
- Describe AER/DPC differences more clearly in one commit message

v3:
- Small rewording in a commit message

v2:
- Don't add EXPORT()s
- Don't include igxbe changes
- Don't use pr_cont() as it's incompatible with pci_err() and according
  to Andy Shevchenko should not be used in the first place


Ilpo Järvinen (8):
  PCI: Don't expose pcie_read_tlp_log() outside of PCI subsystem
  PCI: Move TLP Log handling to own file
  PCI: Add defines for TLP Header/Prefix log sizes
  PCI: Make pcie_read_tlp_log() signature same
  PCI: Use unsigned int i in pcie_read_tlp_log()
  PCI: Store # of supported End-End TLP Prefixes
  PCI: Add TLP Prefix reading into pcie_read_tlp_log()
  PCI: Create helper to print TLP Header and Prefix Log

 drivers/pci/ats.c |   2 +-
 drivers/pci/pci.c |  28 -
 drivers/pci/pci.h |   9 +++
 drivers/pci/pcie/Makefile |   2 +-
 drivers/pci/pcie/aer.c|  15 ++---
 drivers/pci/pcie/dpc.c|  22 +++
 drivers/pci/pcie/tlp.c| 115 ++
 drivers/pci/probe.c   |  14 +++--
 drivers/pci/quirks.c  |   6 +-
 include/linux/aer.h   |  12 +++-
 include/linux/pci.h   |   2 +-
 include/uapi/linux/pci_regs.h |  11 ++--
 12 files changed, 172 insertions(+), 66 deletions(-)
 create mode 100644 drivers/pci/pcie/tlp.c


base-commit: 40384c840ea1944d7c5a392e8975ed088ecf0b37
-- 
2.39.5

[PATCH v9 2/8] PCI: Move TLP Log handling to own file

2025-01-14 Thread Ilpo Järvinen

TLP Log is PCIe feature and is processed only by AER and DPC.
Configwise, DPC depends AER being enabled. In lack of better place, the
TLP Log handling code was initially placed into pci.c but it can be
easily placed in a separate file.

Move TLP Log handling code to own file under pcie/ subdirectory and
include it only when AER is enabled.

Signed-off-by: Ilpo Järvinen 
Reviewed-by: Jonathan Cameron 
Reviewed-by: Yazen Ghannam 
---
 drivers/pci/pci.c | 27 ---
 drivers/pci/pci.h |  2 +-
 drivers/pci/pcie/Makefile |  2 +-
 drivers/pci/pcie/tlp.c| 39 +++
 4 files changed, 41 insertions(+), 29 deletions(-)
 create mode 100644 drivers/pci/pcie/tlp.c

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e0fdc9d10f91..02cd4c7eb80b 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1099,33 +1099,6 @@ static void pci_enable_acs(struct pci_dev *dev)
pci_write_config_word(dev, pos + PCI_ACS_CTRL, caps.ctrl);
 }
 
-/**
- * pcie_read_tlp_log - read TLP Header Log
- * @dev: PCIe device
- * @where: PCI Config offset of TLP Header Log
- * @tlp_log: TLP Log structure to fill
- *
- * Fill @tlp_log from TLP Header Log registers, e.g., AER or DPC.
- *
- * Return: 0 on success and filled TLP Log structure, <0 on error.
- */
-int pcie_read_tlp_log(struct pci_dev *dev, int where,
- struct pcie_tlp_log *tlp_log)
-{
-   int i, ret;
-
-   memset(tlp_log, 0, sizeof(*tlp_log));
-
-   for (i = 0; i < 4; i++) {
-   ret = pci_read_config_dword(dev, where + i * 4,
-   &tlp_log->dw[i]);
-   if (ret)
-   return pcibios_err_to_errno(ret);
-   }
-
-   return 0;
-}
-
 /**
  * pci_restore_bars - restore a device's BAR values (e.g. after wake-up)
  * @dev: PCI device to have its BARs restored
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 8a60fc9e7786..55fcf3bac4f7 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -549,9 +549,9 @@ struct aer_err_info {
 
 int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
-#endif /* CONFIG_PCIEAER */
 
 int pcie_read_tlp_log(struct pci_dev *dev, int where, struct pcie_tlp_log 
*log);
+#endif /* CONFIG_PCIEAER */
 
 #ifdef CONFIG_PCIEPORTBUS
 /* Cached RCEC Endpoint Association */
diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 53ccab62314d..173829aa02e6 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -7,7 +7,7 @@ pcieportdrv-y   := portdrv.o rcec.o
 obj-$(CONFIG_PCIEPORTBUS)  += pcieportdrv.o bwctrl.o
 
 obj-y  += aspm.o
-obj-$(CONFIG_PCIEAER)  += aer.o err.o
+obj-$(CONFIG_PCIEAER)  += aer.o err.o tlp.o
 obj-$(CONFIG_PCIEAER_INJECT)   += aer_inject.o
 obj-$(CONFIG_PCIE_PME) += pme.o
 obj-$(CONFIG_PCIE_DPC) += dpc.o
diff --git a/drivers/pci/pcie/tlp.c b/drivers/pci/pcie/tlp.c
new file mode 100644
index ..3f053cc62290
--- /dev/null
+++ b/drivers/pci/pcie/tlp.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCIe TLP Log handling
+ *
+ * Copyright (C) 2024 Intel Corporation
+ */
+
+#include 
+#include 
+#include 
+
+#include "../pci.h"
+
+/**
+ * pcie_read_tlp_log - read TLP Header Log
+ * @dev: PCIe device
+ * @where: PCI Config offset of TLP Header Log
+ * @tlp_log: TLP Log structure to fill
+ *
+ * Fill @tlp_log from TLP Header Log registers, e.g., AER or DPC.
+ *
+ * Return: 0 on success and filled TLP Log structure, <0 on error.
+ */
+int pcie_read_tlp_log(struct pci_dev *dev, int where,
+ struct pcie_tlp_log *tlp_log)
+{
+   int i, ret;
+
+   memset(tlp_log, 0, sizeof(*tlp_log));
+
+   for (i = 0; i < 4; i++) {
+   ret = pci_read_config_dword(dev, where + i * 4,
+   &tlp_log->dw[i]);
+   if (ret)
+   return pcibios_err_to_errno(ret);
+   }
+
+   return 0;
+}
-- 
2.39.5

[PATCH v9 7/8] PCI: Add TLP Prefix reading into pcie_read_tlp_log()

2025-01-14 Thread Ilpo Järvinen

pcie_read_tlp_log() handles only 4 Header Log DWORDs but TLP Prefix Log
(PCIe r6.1 secs 7.8.4.12 & 7.9.14.13) may also be present.

Generalize pcie_read_tlp_log() and struct pcie_tlp_log to handle also
TLP Prefix Log. The relevant registers are formatted identically in AER
and DPC Capability, but has these variations:

a) The offsets of TLP Prefix Log registers vary.
b) DPC RP PIO TLP Prefix Log register can be < 4 DWORDs.
c) AER TLP Prefix Log Present (PCIe r6.1 sec 7.8.4.7) can indicate
   Prefix Log is not present.

Therefore callers must pass the offset of the TLP Prefix Log register
and the entire length to pcie_read_tlp_log() to be able to read the
correct number of TLP Prefix DWORDs from the correct offset.

Signed-off-by: Ilpo Järvinen 
Reviewed-by: Jonathan Cameron 
---
 drivers/pci/pci.h |  5 +++-
 drivers/pci/pcie/aer.c|  5 +++-
 drivers/pci/pcie/dpc.c| 13 -
 drivers/pci/pcie/tlp.c| 52 +++
 include/linux/aer.h   |  1 +
 include/uapi/linux/pci_regs.h | 10 ---
 6 files changed, 68 insertions(+), 18 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 55fcf3bac4f7..7797b2544d00 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -550,7 +550,9 @@ struct aer_err_info {
 int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
 
-int pcie_read_tlp_log(struct pci_dev *dev, int where, struct pcie_tlp_log 
*log);
+int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
+ unsigned int tlp_len, struct pcie_tlp_log *log);
+unsigned int aer_tlp_log_len(struct pci_dev *dev, u32 aercc);
 #endif /* CONFIG_PCIEAER */
 
 #ifdef CONFIG_PCIEPORTBUS
@@ -569,6 +571,7 @@ void pci_dpc_init(struct pci_dev *pdev);
 void dpc_process_error(struct pci_dev *pdev);
 pci_ers_result_t dpc_reset_link(struct pci_dev *pdev);
 bool pci_dpc_recovered(struct pci_dev *pdev);
+unsigned int dpc_tlp_log_len(struct pci_dev *dev);
 #else
 static inline void pci_save_dpc_state(struct pci_dev *dev) { }
 static inline void pci_restore_dpc_state(struct pci_dev *dev) { }
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 80c5ba8d8296..656dbf1ac45b 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1248,7 +1248,10 @@ int aer_get_device_error_info(struct pci_dev *dev, 
struct aer_err_info *info)
 
if (info->status & AER_LOG_TLP_MASKS) {
info->tlp_header_valid = 1;
-   pcie_read_tlp_log(dev, aer + PCI_ERR_HEADER_LOG, 
&info->tlp);
+   pcie_read_tlp_log(dev, aer + PCI_ERR_HEADER_LOG,
+ aer + PCI_ERR_PREFIX_LOG,
+ aer_tlp_log_len(dev, aercc),
+ &info->tlp);
}
}
 
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 0674d8c89bfa..0aa20bc58697 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -190,7 +190,7 @@ pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
 static void dpc_process_rp_pio_error(struct pci_dev *pdev)
 {
u16 cap = pdev->dpc_cap, dpc_status, first_error;
-   u32 status, mask, sev, syserr, exc, log, prefix;
+   u32 status, mask, sev, syserr, exc, log;
struct pcie_tlp_log tlp_log;
int i;
 
@@ -217,20 +217,19 @@ static void dpc_process_rp_pio_error(struct pci_dev *pdev)
 
if (pdev->dpc_rp_log_size < PCIE_STD_NUM_TLP_HEADERLOG)
goto clear_status;
-   pcie_read_tlp_log(pdev, cap + PCI_EXP_DPC_RP_PIO_HEADER_LOG, &tlp_log);
+   pcie_read_tlp_log(pdev, cap + PCI_EXP_DPC_RP_PIO_HEADER_LOG,
+ cap + PCI_EXP_DPC_RP_PIO_TLPPREFIX_LOG,
+ dpc_tlp_log_len(pdev), &tlp_log);
pci_err(pdev, "TLP Header: %#010x %#010x %#010x %#010x\n",
tlp_log.dw[0], tlp_log.dw[1], tlp_log.dw[2], tlp_log.dw[3]);
+   for (i = 0; i < pdev->dpc_rp_log_size - PCIE_STD_NUM_TLP_HEADERLOG - 1; 
i++)
+   pci_err(pdev, "TLP Prefix Header: dw%d, %#010x\n", i, 
tlp_log.prefix[i]);
 
if (pdev->dpc_rp_log_size < PCIE_STD_NUM_TLP_HEADERLOG + 1)
goto clear_status;
pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_IMPSPEC_LOG, &log);
pci_err(pdev, "RP PIO ImpSpec Log %#010x\n", log);
 
-   for (i = 0; i < pdev->dpc_rp_log_size - PCIE_STD_NUM_TLP_HEADERLOG - 1; 
i++) {
-   pci_read_config_dword(pdev,
-   cap + PCI_EXP_DPC_RP_PIO_TLPPREFIX_LOG + i * 4, 
&prefix);
-   pci_err(pdev, "TLP Prefix Header: dw%d, %#010x\n", i, prefix);
-   }
  clear_status:
pci_write_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_STATUS, status);
 }
diff --git a/drivers/pci/pcie/tlp.c b/drivers/pci/pcie/tlp.c
index d7ad99f466b9..2f029deebc33 100644

[PATCH v9 4/8] PCI: Make pcie_read_tlp_log() signature same

2025-01-14 Thread Ilpo Järvinen

pcie_read_tlp_log()'s prototype and function signature diverged due to
changes made while applying.

Make the parameters of pcie_read_tlp_log() named identically.

Signed-off-by: Ilpo Järvinen 
Reviewed-by: Jonathan Cameron 
Reviewed-by: Yazen Ghannam 
---
 drivers/pci/pcie/tlp.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/pcie/tlp.c b/drivers/pci/pcie/tlp.c
index 4cc76bd1867a..f0cfe7e39078 100644
--- a/drivers/pci/pcie/tlp.c
+++ b/drivers/pci/pcie/tlp.c
@@ -15,22 +15,21 @@
  * pcie_read_tlp_log - read TLP Header Log
  * @dev: PCIe device
  * @where: PCI Config offset of TLP Header Log
- * @tlp_log: TLP Log structure to fill
+ * @log: TLP Log structure to fill
  *
- * Fill @tlp_log from TLP Header Log registers, e.g., AER or DPC.
+ * Fill @log from TLP Header Log registers, e.g., AER or DPC.
  *
  * Return: 0 on success and filled TLP Log structure, <0 on error.
  */
 int pcie_read_tlp_log(struct pci_dev *dev, int where,
- struct pcie_tlp_log *tlp_log)
+ struct pcie_tlp_log *log)
 {
int i, ret;
 
-   memset(tlp_log, 0, sizeof(*tlp_log));
+   memset(log, 0, sizeof(*log));
 
for (i = 0; i < PCIE_STD_NUM_TLP_HEADERLOG; i++) {
-   ret = pci_read_config_dword(dev, where + i * 4,
-   &tlp_log->dw[i]);
+   ret = pci_read_config_dword(dev, where + i * 4, &log->dw[i]);
if (ret)
return pcibios_err_to_errno(ret);
}
-- 
2.39.5

[PATCH v9 3/8] PCI: Add defines for TLP Header/Prefix log sizes

2025-01-14 Thread Ilpo Järvinen

Add defines for AER and DPC capabilities TLP Header Logging register
sizes (PCIe r6.2, sec 7.8.4 / 7.9.14) and replace literals with them.

Suggested-by: Yazen Ghannam 
Signed-off-by: Ilpo Järvinen 
---
 drivers/pci/pcie/dpc.c | 10 ++
 drivers/pci/pcie/tlp.c |  2 +-
 drivers/pci/quirks.c   |  6 --
 include/linux/aer.h|  9 -
 4 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 2b6ef7efa3c1..0674d8c89bfa 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -215,18 +215,18 @@ static void dpc_process_rp_pio_error(struct pci_dev *pdev)
first_error == i ? " (First)" : "");
}
 
-   if (pdev->dpc_rp_log_size < 4)
+   if (pdev->dpc_rp_log_size < PCIE_STD_NUM_TLP_HEADERLOG)
goto clear_status;
pcie_read_tlp_log(pdev, cap + PCI_EXP_DPC_RP_PIO_HEADER_LOG, &tlp_log);
pci_err(pdev, "TLP Header: %#010x %#010x %#010x %#010x\n",
tlp_log.dw[0], tlp_log.dw[1], tlp_log.dw[2], tlp_log.dw[3]);
 
-   if (pdev->dpc_rp_log_size < 5)
+   if (pdev->dpc_rp_log_size < PCIE_STD_NUM_TLP_HEADERLOG + 1)
goto clear_status;
pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_IMPSPEC_LOG, &log);
pci_err(pdev, "RP PIO ImpSpec Log %#010x\n", log);
 
-   for (i = 0; i < pdev->dpc_rp_log_size - 5; i++) {
+   for (i = 0; i < pdev->dpc_rp_log_size - PCIE_STD_NUM_TLP_HEADERLOG - 1; 
i++) {
pci_read_config_dword(pdev,
cap + PCI_EXP_DPC_RP_PIO_TLPPREFIX_LOG + i * 4, 
&prefix);
pci_err(pdev, "TLP Prefix Header: dw%d, %#010x\n", i, prefix);
@@ -404,7 +404,9 @@ void pci_dpc_init(struct pci_dev *pdev)
if (!pdev->dpc_rp_log_size) {
pdev->dpc_rp_log_size =
FIELD_GET(PCI_EXP_DPC_RP_PIO_LOG_SIZE, cap);
-   if (pdev->dpc_rp_log_size < 4 || pdev->dpc_rp_log_size > 9) {
+   if (pdev->dpc_rp_log_size < PCIE_STD_NUM_TLP_HEADERLOG ||
+   pdev->dpc_rp_log_size > PCIE_STD_NUM_TLP_HEADERLOG + 1 +
+   PCIE_STD_MAX_TLP_PREFIXLOG) {
pci_err(pdev, "RP PIO log size %u is invalid\n",
pdev->dpc_rp_log_size);
pdev->dpc_rp_log_size = 0;
diff --git a/drivers/pci/pcie/tlp.c b/drivers/pci/pcie/tlp.c
index 3f053cc62290..4cc76bd1867a 100644
--- a/drivers/pci/pcie/tlp.c
+++ b/drivers/pci/pcie/tlp.c
@@ -28,7 +28,7 @@ int pcie_read_tlp_log(struct pci_dev *dev, int where,
 
memset(tlp_log, 0, sizeof(*tlp_log));
 
-   for (i = 0; i < 4; i++) {
+   for (i = 0; i < PCIE_STD_NUM_TLP_HEADERLOG; i++) {
ret = pci_read_config_dword(dev, where + i * 4,
&tlp_log->dw[i]);
if (ret)
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 76f4df75b08a..84487615e1d1 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -12,6 +12,7 @@
  * file, where their drivers can use them.
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -6233,8 +6234,9 @@ static void dpc_log_size(struct pci_dev *dev)
return;
 
if (FIELD_GET(PCI_EXP_DPC_RP_PIO_LOG_SIZE, val) == 0) {
-   pci_info(dev, "Overriding RP PIO Log Size to 4\n");
-   dev->dpc_rp_log_size = 4;
+   pci_info(dev, "Overriding RP PIO Log Size to %d\n",
+PCIE_STD_NUM_TLP_HEADERLOG);
+   dev->dpc_rp_log_size = PCIE_STD_NUM_TLP_HEADERLOG;
}
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x461f, dpc_log_size);
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 190a0a2061cd..4ef6515c3205 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -16,10 +16,17 @@
 #define AER_CORRECTABLE2
 #define DPC_FATAL  3
 
+/*
+ * AER and DPC capabilities TLP Logging register sizes (PCIe r6.2, sec 7.8.4
+ * & 7.9.14).
+ */
+#define PCIE_STD_NUM_TLP_HEADERLOG 4
+#define PCIE_STD_MAX_TLP_PREFIXLOG 4
+
 struct pci_dev;
 
 struct pcie_tlp_log {
-   u32 dw[4];
+   u32 dw[PCIE_STD_NUM_TLP_HEADERLOG];
 };
 
 struct aer_capability_regs {
-- 
2.39.5

[PATCH v9 5/8] PCI: Use unsigned int i in pcie_read_tlp_log()

2025-01-14 Thread Ilpo Järvinen

Loop variable i counting from 0 upwards does not need to be signed so
make it unsigned int.

Signed-off-by: Ilpo Järvinen 
Reviewed-by: Jonathan Cameron 
---
 drivers/pci/pcie/tlp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pcie/tlp.c b/drivers/pci/pcie/tlp.c
index f0cfe7e39078..d7ad99f466b9 100644
--- a/drivers/pci/pcie/tlp.c
+++ b/drivers/pci/pcie/tlp.c
@@ -24,7 +24,8 @@
 int pcie_read_tlp_log(struct pci_dev *dev, int where,
  struct pcie_tlp_log *log)
 {
-   int i, ret;
+   unsigned int i;
+   int ret;
 
memset(log, 0, sizeof(*log));
 
-- 
2.39.5

[PATCH v9 6/8] PCI: Store # of supported End-End TLP Prefixes

2025-01-14 Thread Ilpo Järvinen

eetlp_prefix_path in the struct pci_dev tells if End-End TLP Prefixes
are supported by the path or not, the value is only calculated if
CONFIG_PCI_PASID is set.

The Max End-End TLP Prefixes field in the Device Capabilities Register
2 also tells how many (1-4) End-End TLP Prefixes are supported (PCIe
r6.2 sec 7.5.3.15). The number of supported End-End Prefixes is useful
for reading correct number of DWORDs from TLP Prefix Log register in AER
capability (PCIe r6.2 sec 7.8.4.12).

Replace eetlp_prefix_path with eetlp_prefix_max and determine the
number of supported End-End Prefixes regardless of CONFIG_PCI_PASID so
that an upcoming commit generalizing TLP Prefix Log register reading
does not have to read extra DWORDs for End-End Prefixes that never will
be there.

The value stored into eetlp_prefix_max is directly derived from
device's Max End-End TLP Prefixes and does not consider limitations
imposed by bridges or the Root Port beyond supported/not supported
flags. This is intentional for two reasons:

  1) PCIe r6.2 spec sections r6.1 2.2.10.4 & 6.2.4.4 indicate that TLP
  is handled malformed only if the number of prefixes exceed the number
  of Max End-End TLP Prefixes, which seems to be the case even if the
  device could never receive that many prefixes due to smaller maximum
  imposed by a bridge or the Root Port. If TLP parsing is later added,
  this distinction is significant in interpreting what is logged by the
  TLP Prefix Log registers and the value matching to the Malformed TLP
  threshold is going to be more useful.

  2) TLP Prefix handling happens autonomously on a low layer and the
  value in eetlp_prefix_max is not programmed anywhere by the kernel
  (i.e., there is no limiter OS can control to prevent sending more
  than n TLP Prefixes).

Signed-off-by: Ilpo Järvinen 
Reviewed-by: Jonathan Cameron 
Reviewed-by: Yazen Ghannam 
---
 drivers/pci/ats.c |  2 +-
 drivers/pci/probe.c   | 14 +-
 include/linux/pci.h   |  2 +-
 include/uapi/linux/pci_regs.h |  1 +
 4 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index 6afff1f1b143..c6b266c772c8 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -410,7 +410,7 @@ int pci_enable_pasid(struct pci_dev *pdev, int features)
if (WARN_ON(pdev->pasid_enabled))
return -EBUSY;
 
-   if (!pdev->eetlp_prefix_path && !pdev->pasid_no_tlp)
+   if (!pdev->eetlp_prefix_max && !pdev->pasid_no_tlp)
return -EINVAL;
 
if (!pasid)
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 2e81ab0f5a25..381c22e3ccdb 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2251,8 +2251,8 @@ static void pci_configure_relaxed_ordering(struct pci_dev 
*dev)
 
 static void pci_configure_eetlp_prefix(struct pci_dev *dev)
 {
-#ifdef CONFIG_PCI_PASID
struct pci_dev *bridge;
+   unsigned int eetlp_max;
int pcie_type;
u32 cap;
 
@@ -2264,15 +2264,19 @@ static void pci_configure_eetlp_prefix(struct pci_dev 
*dev)
return;
 
pcie_type = pci_pcie_type(dev);
+
+   eetlp_max = FIELD_GET(PCI_EXP_DEVCAP2_EE_PREFIX_MAX, cap);
+   /* 00b means 4 */
+   eetlp_max = eetlp_max ?: 4;
+
if (pcie_type == PCI_EXP_TYPE_ROOT_PORT ||
pcie_type == PCI_EXP_TYPE_RC_END)
-   dev->eetlp_prefix_path = 1;
+   dev->eetlp_prefix_max = eetlp_max;
else {
bridge = pci_upstream_bridge(dev);
-   if (bridge && bridge->eetlp_prefix_path)
-   dev->eetlp_prefix_path = 1;
+   if (bridge && bridge->eetlp_prefix_max)
+   dev->eetlp_prefix_max = eetlp_max;
}
-#endif
 }
 
 static void pci_configure_serr(struct pci_dev *dev)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index db9b47ce3eef..21be5a1edf1a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -407,7 +407,7 @@ struct pci_dev {
   supported from root to here */
 #endif
unsigned intpasid_no_tlp:1; /* PASID works without TLP 
Prefix */
-   unsigned inteetlp_prefix_path:1;/* End-to-End TLP Prefix */
+   unsigned inteetlp_prefix_max:3; /* Max # of End-End TLP 
Prefixes, 0=not supported */
 
pci_channel_state_t error_state;/* Current connectivity state */
struct device   dev;/* Generic device interface */
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 1601c7ed5fab..14a6306c4ce1 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -665,6 +665,7 @@
 #define  PCI_EXP_DEVCAP2_OBFF_MSG  0x0004 /* New message signaling */
 #define  PCI_EXP_DEVCAP2_OBFF_WAKE 0x0008 /* Re-use WAKE# for OBFF */
 #define  PCI_EXP_DEVCAP2_EE_PREFIX 0x0020 /* End-End TLP Prefix */
+#define  PCI_EXP_DEVCAP2_EE

[PATCH v9 8/8] PCI: Create helper to print TLP Header and Prefix Log

2025-01-14 Thread Ilpo Järvinen

Add pcie_print_tlp_log() helper to print TLP Header and Prefix Log.
Print End-End Prefixes only if they are non-zero.

Consolidate the few places which currently print TLP using custom
formatting.

The first attempt used pr_cont() instead of building a string first but
it turns out pr_cont() is not compatible with pci_err() and prints on a
separate line. When I asked about this, Andy Shevchenko suggested
pr_cont() should not be used in the first place (to eventually get rid
of it) so pr_cont() is now replaced with building the string first.

Signed-off-by: Ilpo Järvinen 
Reviewed-by: Jonathan Cameron 
---
 drivers/pci/pci.h  |  2 ++
 drivers/pci/pcie/aer.c | 10 ++
 drivers/pci/pcie/dpc.c |  5 +
 drivers/pci/pcie/tlp.c | 34 ++
 4 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 7797b2544d00..797ad43a7683 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -553,6 +553,8 @@ void aer_print_error(struct pci_dev *dev, struct 
aer_err_info *info);
 int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2,
  unsigned int tlp_len, struct pcie_tlp_log *log);
 unsigned int aer_tlp_log_len(struct pci_dev *dev, u32 aercc);
+void pcie_print_tlp_log(const struct pci_dev *dev,
+   const struct pcie_tlp_log *log, const char *pfx);
 #endif /* CONFIG_PCIEAER */
 
 #ifdef CONFIG_PCIEPORTBUS
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 656dbf1ac45b..ece8cb88d110 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -665,12 +665,6 @@ static void pci_rootport_aer_stats_incr(struct pci_dev 
*pdev,
}
 }
 
-static void __print_tlp_header(struct pci_dev *dev, struct pcie_tlp_log *t)
-{
-   pci_err(dev, "  TLP Header: %08x %08x %08x %08x\n",
-   t->dw[0], t->dw[1], t->dw[2], t->dw[3]);
-}
-
 static void __aer_print_error(struct pci_dev *dev,
  struct aer_err_info *info)
 {
@@ -725,7 +719,7 @@ void aer_print_error(struct pci_dev *dev, struct 
aer_err_info *info)
__aer_print_error(dev, info);
 
if (info->tlp_header_valid)
-   __print_tlp_header(dev, &info->tlp);
+   pcie_print_tlp_log(dev, &info->tlp, dev_fmt("  "));
 
 out:
if (info->id && info->error_dev_num > 1 && info->id == id)
@@ -797,7 +791,7 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity,
aer->uncor_severity);
 
if (tlp_header_valid)
-   __print_tlp_header(dev, &aer->header_log);
+   pcie_print_tlp_log(dev, &aer->header_log, dev_fmt("  "));
 
trace_aer_event(dev_name(&dev->dev), (status & ~mask),
aer_severity, tlp_header_valid, &aer->header_log);
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index 0aa20bc58697..242cabd5eeeb 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -220,10 +220,7 @@ static void dpc_process_rp_pio_error(struct pci_dev *pdev)
pcie_read_tlp_log(pdev, cap + PCI_EXP_DPC_RP_PIO_HEADER_LOG,
  cap + PCI_EXP_DPC_RP_PIO_TLPPREFIX_LOG,
  dpc_tlp_log_len(pdev), &tlp_log);
-   pci_err(pdev, "TLP Header: %#010x %#010x %#010x %#010x\n",
-   tlp_log.dw[0], tlp_log.dw[1], tlp_log.dw[2], tlp_log.dw[3]);
-   for (i = 0; i < pdev->dpc_rp_log_size - PCIE_STD_NUM_TLP_HEADERLOG - 1; 
i++)
-   pci_err(pdev, "TLP Prefix Header: dw%d, %#010x\n", i, 
tlp_log.prefix[i]);
+   pcie_print_tlp_log(pdev, &tlp_log, dev_fmt(""));
 
if (pdev->dpc_rp_log_size < PCIE_STD_NUM_TLP_HEADERLOG + 1)
goto clear_status;
diff --git a/drivers/pci/pcie/tlp.c b/drivers/pci/pcie/tlp.c
index 2f029deebc33..7eb88d1b37b7 100644
--- a/drivers/pci/pcie/tlp.c
+++ b/drivers/pci/pcie/tlp.c
@@ -6,6 +6,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 
@@ -79,3 +80,36 @@ int pcie_read_tlp_log(struct pci_dev *dev, int where, int 
where2,
 
return 0;
 }
+
+#define EE_PREFIX_STR " E-E Prefixes:"
+
+/**
+ * pcie_print_tlp_log - Print TLP Header / Prefix Log contents
+ * @dev: PCIe device
+ * @log: TLP Log structure
+ * @pfx: String prefix
+ *
+ * Prints TLP Header and Prefix Log information held by @log.
+ */
+void pcie_print_tlp_log(const struct pci_dev *dev,
+   const struct pcie_tlp_log *log, const char *pfx)
+{
+   char buf[11 * (PCIE_STD_NUM_TLP_HEADERLOG + ARRAY_SIZE(log->prefix)) +
+sizeof(EE_PREFIX_STR)];
+   unsigned int i;
+   int len;
+
+   len = scnprintf(buf, sizeof(buf), "%#010x %#010x %#010x %#010x",
+   log->dw[0], log->dw[1], log->dw[2], log->dw[3]);
+
+   if (log->prefix[0])
+   len += scnprintf(buf + len, sizeof(buf) - len, EE_PREFIX_STR);
+   for (i = 0; i < ARRAY_SIZE(log->prefix); i++) {
+   if (!log->prefix[i])
+   break;
+

Re: [PATCH v2 1/7] powerpc: properly negate error in syscall_set_return_value()

2025-01-14 Thread Dmitry V. Levin

On Mon, Jan 13, 2025 at 06:34:44PM +0100, Christophe Leroy wrote:
> Le 13/01/2025 à 18:10, Dmitry V. Levin a écrit :
> > Bring syscall_set_return_value() in sync with syscall_get_error(),
> > and let upcoming ptrace/set_syscall_info selftest pass on powerpc.
> > 
> > This reverts commit 1b1a3702a65c ("powerpc: Don't negate error in
> > syscall_set_return_value()").
> 
> There is a clear detailed explanation in that commit of why it needs to 
> be done.
> 
> If you think that commit is wrong you have to explain why with at least 
> the same level of details.

OK, please have a look whether this explanation is clear and detailed enough:

===
powerpc: properly negate error in syscall_set_return_value()

When syscall_set_return_value() is used to set an error code, the caller
specifies it as a negative value in -ERRORCODE form.

In !trap_is_scv case the error code is traditionally stored as follows:
gpr[3] contains a positive ERRORCODE, and ccr has 0x1000 flag set.
Here are a few examples to illustrate this convention.  The first one
is from syscall_get_error():
/*
 * If the system call failed,
 * regs->gpr[3] contains a positive ERRORCODE.
 */
return (regs->ccr & 0x1000UL) ? -regs->gpr[3] : 0;

The second example is from regs_return_value():
if (is_syscall_success(regs))
return regs->gpr[3];
else
return -regs->gpr[3];

The third example is from check_syscall_restart():
regs->result = -EINTR;
regs->gpr[3] = EINTR;
regs->ccr |= 0x1000;

Compared with these examples, the failure of syscall_set_return_value()
to assign a positive ERRORCODE into regs->gpr[3] is clearly visible:
/*
 * In the general case it's not obvious that we must deal with
 * CCR here, as the syscall exit path will also do that for us.
 * However there are some places, eg. the signal code, which
 * check ccr to decide if the value in r3 is actually an error.
 */
if (error) {
regs->ccr |= 0x1000L;
regs->gpr[3] = error;
} else {
regs->ccr &= ~0x1000L;
regs->gpr[3] = val;
}

This fix brings syscall_set_return_value() in sync with syscall_get_error()
and lets upcoming ptrace/set_syscall_info selftest pass on powerpc.

Fixes: 1b1a3702a65c ("powerpc: Don't negate error in 
syscall_set_return_value()").
===

-- 
ldv

Re: [kvm-unit-tests PATCH v1 1/5] configure: Document that the architecture name 'aarch64' is also supported

2025-01-14 Thread Alexandru Elisei

Hi Drew,

On Mon, Jan 13, 2025 at 04:01:58PM +0100, Andrew Jones wrote:
> On Fri, Jan 10, 2025 at 01:58:44PM +, Alexandru Elisei wrote:
> > $arch, on arm64, defaults to 'aarch64', and later in the script is replaced
> > by 'arm64'. Intentional or not, document that the name 'aarch64' is also
> > supported when configuring for the arm64 architecture. This has been the
> > case since the initial commit that added support for the arm64
> > architecture, commit 39ac3f8494be ("arm64: initial drop").
> > 
> > The help text for --arch changes from*:
> > 
> >--arch=ARCHarchitecture to compile for (aarch64). ARCH can 
> > be one of:
> >arm, arm64, i386, ppc64, riscv32, riscv64, 
> > s390x, x86_64
> > 
> > to:
> > 
> > --arch=ARCHarchitecture to compile for (aarch64). ARCH can 
> > be one of:
> >arm, arm64/aarch64, i386, ppc64, riscv32, 
> > riscv64, s390x, x86_64
> > 
> > *Worth pointing out that the default architecture is 'aarch64', even though
> > the rest of the help text doesn't have it as one of the supported
> > architectures.
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> >  configure | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/configure b/configure
> > index 86cf1da36467..5b0a2d7f39c0 100755
> > --- a/configure
> > +++ b/configure
> > @@ -47,7 +47,7 @@ usage() {
> >  
> > Options include:
> > --arch=ARCHarchitecture to compile for ($arch). ARCH 
> > can be one of:
> > -  arm, arm64, i386, ppc64, riscv32, riscv64, 
> > s390x, x86_64
> > +  arm, arm64/aarch64, i386, ppc64, riscv32, 
> > riscv64, s390x, x86_64
> > --processor=PROCESSOR  processor to compile for ($arch)
> > --target=TARGETtarget platform that the tests will be 
> > running on (qemu or
> >kvmtool, default is qemu) (arm/arm64 only)
> > -- 
> > 2.47.1
> >
> 
> I'd prefer to support --arch=aarch64, but then always refer to it as only
> arm64 everywhere else. We need to support arch=aarch64 since that's what
> 'uname -m' returns, but I don't think we need to change the help text for
> it. If we don't want to trust our users to figure out arm64==aarch64,

I sincerely dislike the fact that in the help text the default architecture on
arm64 is not among the list of supported architectures.

> then we can do something like
> 
> @@ -216,12 +197,12 @@ while [[ $optno -le $argc ]]; do
> werror=
> ;;
> --help)
> -   usage
> +   do_help=1
> ;;
> *)
> echo "Unknown option '$opt'"
> echo
> -   usage
> +   do_help=1
> ;;
>  esac
>  done
> 
> And then only do
> 
>  if [ $do_help ]; then
> usage
>  fi
> 
> after $arch and other variables have had a chance to be converted.

That still doesn't work if displaying the help text on an arm64 board:
$arch=aarch64 if compiling natively, because that's what uname -m prints, and
$arch gets converted to 'arm64' later in the script. We could move the
conversion before calling usage, but at that point I wonder if it wouldn't be
better to never set $arch to 'aarch64' in the first place.

If you don't want to modify the help text to say that aarch64 is supported, even
though it's displayed as the default architecture on arm64, we could modify
$arch to never be set to 'aarch64', i.e:

diff --git a/configure b/configure
index 86cf1da36467..1362b68dd68b 100755
--- a/configure
+++ b/configure
@@ -15,8 +15,8 @@ objdump=objdump
 readelf=readelf
 ar=ar
 addr2line=addr2line
-arch=$(uname -m | sed -e 
's/i.86/i386/;s/arm64/aarch64/;s/arm.*/arm/;s/ppc64.*/ppc64/')
-host=$arch
+host=$(uname -m | sed -e 
's/i.86/i386/;s/arm64/aarch64/;s/arm.*/arm/;s/ppc64.*/ppc64/')
+arch=$(echo $host | sed -e 's/aarch64/arm64/')
 cross_prefix=
 endian=""
 pretty_print_stacks=yes

and keep the conversion from aarch64 to arm64 where it is, and still keep it
undocumented, just in case someone is using that.

($host still needs to be aarch64, because that's the name of the qemu
executable).

Thanks,
Alex

[PATCH v9 1/8] PCI: Don't expose pcie_read_tlp_log() outside of PCI subsystem

2025-01-14 Thread Ilpo Järvinen

pcie_read_tlp_log() was exposed by the commit 0a5a46a6a61b ("PCI/AER:
Generalize TLP Header Log reading") but this is now considered a
mistake. No drivers outside of PCI subsystem should build their own
diagnostic logging but should rely on PCI core doing it for them.

There's currently one driver (ixgbe) doing it independently which was
the initial reason why the export was added but it was decided by the
PCI maintainer that it's something that should be eliminated.

Remove the unwanted EXPORT of pcie_read_tlp_log() and remove it from
include/linux/aer.h.

Link: https://lore.kernel.org/all/20240322193011.GA701027@bhelgaas/
Signed-off-by: Ilpo Järvinen 
Reviewed-by: Jonathan Cameron 
Reviewed-by: Yazen Ghannam 
---
 drivers/pci/pci.c   | 1 -
 drivers/pci/pci.h   | 4 
 include/linux/aer.h | 2 --
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 0b29ec6e8e5e..e0fdc9d10f91 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1125,7 +1125,6 @@ int pcie_read_tlp_log(struct pci_dev *dev, int where,
 
return 0;
 }
-EXPORT_SYMBOL_GPL(pcie_read_tlp_log);
 
 /**
  * pci_restore_bars - restore a device's BAR values (e.g. after wake-up)
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 2e40fc63ba31..8a60fc9e7786 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -4,6 +4,8 @@
 
 #include 
 
+struct pcie_tlp_log;
+
 /* Number of possible devfns: 0.0 to 1f.7 inclusive */
 #define MAX_NR_DEVFNS 256
 
@@ -549,6 +551,8 @@ int aer_get_device_error_info(struct pci_dev *dev, struct 
aer_err_info *info);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
 #endif /* CONFIG_PCIEAER */
 
+int pcie_read_tlp_log(struct pci_dev *dev, int where, struct pcie_tlp_log 
*log);
+
 #ifdef CONFIG_PCIEPORTBUS
 /* Cached RCEC Endpoint Association */
 struct rcec_ea {
diff --git a/include/linux/aer.h b/include/linux/aer.h
index 4b97f38f3fcf..190a0a2061cd 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -37,8 +37,6 @@ struct aer_capability_regs {
u16 uncor_err_source;
 };
 
-int pcie_read_tlp_log(struct pci_dev *dev, int where, struct pcie_tlp_log 
*log);
-
 #if defined(CONFIG_PCIEAER)
 int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
 int pcie_aer_is_native(struct pci_dev *dev);
-- 
2.39.5

Re: [PATCH v2 1/7] powerpc: properly negate error in syscall_set_return_value()

2025-01-14 Thread Dmitry V. Levin

On Tue, Jan 14, 2025 at 02:00:16PM +0100, Alexey Gladkov wrote:
> On Mon, Jan 13, 2025 at 07:10:54PM +0200, Dmitry V. Levin wrote:
> > Bring syscall_set_return_value() in sync with syscall_get_error(),
> > and let upcoming ptrace/set_syscall_info selftest pass on powerpc.
> > 
> > This reverts commit 1b1a3702a65c ("powerpc: Don't negate error in
> > syscall_set_return_value()").
> > 
> > Signed-off-by: Dmitry V. Levin 
> > ---
> >  arch/powerpc/include/asm/syscall.h | 6 +-
> >  1 file changed, 5 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/powerpc/include/asm/syscall.h 
> > b/arch/powerpc/include/asm/syscall.h
> > index 3dd36c5e334a..422d7735ace6 100644
> > --- a/arch/powerpc/include/asm/syscall.h
> > +++ b/arch/powerpc/include/asm/syscall.h
> > @@ -82,7 +82,11 @@ static inline void syscall_set_return_value(struct 
> > task_struct *task,
> >  */
> > if (error) {
> > regs->ccr |= 0x1000L;
> > -   regs->gpr[3] = error;
> > +   /*
> > +* In case of an error regs->gpr[3] contains
> > +* a positive ERRORCODE.
> > +*/
> > +   regs->gpr[3] = -error;
> 
> After this change the syscall_get_error() will return positive value if
> the system call failed. Since syscall_get_error() still believes
> regs->gpr[3] is still positive in case !trap_is_scv().
> 
> Or am I missing something?

syscall_get_error() does the following in case of !trap_is_scv():

/*
 * If the system call failed,
 * regs->gpr[3] contains a positive ERRORCODE.
 */
return (regs->ccr & 0x1000UL) ? -regs->gpr[3] : 0;

That is, in !trap_is_scv() case it assumes that regs->gpr[3] is positive
and is going to return a negative value (-ERRORCODE).

> It looks like the selftest you mentioned in the commit message doesn't
> check the !trap_is_scv() branch.

The selftest is architecture-agnostic, it just executes syscalls and
checks whether the data returned by PTRACE_GET_SYSCALL_INFO meets
expectations.  Do you mean that syscall() is not good enough for syscall
invocation from coverage perspective on powerpc?

See also commit d72500f99284 ("powerpc/64s/syscall: Fix ptrace syscall
info with scv syscalls").


-- 
ldv

Re: [PATCH v6 10/26] mm/mm_init: Move p2pdma page refcount initialisation to p2pdma

2025-01-14 Thread David Hildenbrand


On 10.01.25 07:00, Alistair Popple wrote:

Currently ZONE_DEVICE page reference counts are initialised by core
memory management code in __init_zone_device_page() as part of the
memremap() call which driver modules make to obtain ZONE_DEVICE
pages. This initialises page refcounts to 1 before returning them to
the driver.

This was presumably done because it drivers had a reference of sorts
on the page. It also ensured the page could always be mapped with
vm_insert_page() for example and would never get freed (ie. have a
zero refcount), freeing drivers of manipulating page reference counts.

However it complicates figuring out whether or not a page is free from
the mm perspective because it is no longer possible to just look at
the refcount. Instead the page type must be known and if GUP is used a
secondary pgmap reference is also sometimes needed.

To simplify this it is desirable to remove the page reference count
for the driver, so core mm can just use the refcount without having to
account for page type or do other types of tracking. This is possible
because drivers can always assume the page is valid as core kernel
will never offline or remove the struct page.

This means it is now up to drivers to initialise the page refcount as
required. P2PDMA uses vm_insert_page() to map the page, and that
requires a non-zero reference count when initialising the page so set
that when the page is first mapped.

Signed-off-by: Alistair Popple 
Reviewed-by: Dan Williams 



LGTM

Acked-by: David Hildenbrand 

--
Cheers,

David / dhildenb

Re: [PATCH v2 1/7] powerpc: properly negate error in syscall_set_return_value()

2025-01-14 Thread Alexey Gladkov

On Tue, Jan 14, 2025 at 03:48:44PM +0200, Dmitry V. Levin wrote:
> On Tue, Jan 14, 2025 at 02:00:16PM +0100, Alexey Gladkov wrote:
> > On Mon, Jan 13, 2025 at 07:10:54PM +0200, Dmitry V. Levin wrote:
> > > Bring syscall_set_return_value() in sync with syscall_get_error(),
> > > and let upcoming ptrace/set_syscall_info selftest pass on powerpc.
> > > 
> > > This reverts commit 1b1a3702a65c ("powerpc: Don't negate error in
> > > syscall_set_return_value()").
> > > 
> > > Signed-off-by: Dmitry V. Levin 
> > > ---
> > >  arch/powerpc/include/asm/syscall.h | 6 +-
> > >  1 file changed, 5 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/arch/powerpc/include/asm/syscall.h 
> > > b/arch/powerpc/include/asm/syscall.h
> > > index 3dd36c5e334a..422d7735ace6 100644
> > > --- a/arch/powerpc/include/asm/syscall.h
> > > +++ b/arch/powerpc/include/asm/syscall.h
> > > @@ -82,7 +82,11 @@ static inline void syscall_set_return_value(struct 
> > > task_struct *task,
> > >*/
> > >   if (error) {
> > >   regs->ccr |= 0x1000L;
> > > - regs->gpr[3] = error;
> > > + /*
> > > +  * In case of an error regs->gpr[3] contains
> > > +  * a positive ERRORCODE.
> > > +  */
> > > + regs->gpr[3] = -error;
> > 
> > After this change the syscall_get_error() will return positive value if
> > the system call failed. Since syscall_get_error() still believes
> > regs->gpr[3] is still positive in case !trap_is_scv().
> > 
> > Or am I missing something?
> 
> syscall_get_error() does the following in case of !trap_is_scv():
> 
> /*
>  * If the system call failed,
>  * regs->gpr[3] contains a positive ERRORCODE.
>  */
> return (regs->ccr & 0x1000UL) ? -regs->gpr[3] : 0;
> 
> That is, in !trap_is_scv() case it assumes that regs->gpr[3] is positive
> and is going to return a negative value (-ERRORCODE).

Yeah. Now I see it.

if (trap_is_scv(regs)) {
regs->result = -EINTR;
regs->gpr[3] = -EINTR;
} else {
regs->result = -EINTR;
regs->gpr[3] = EINTR;
regs->ccr |= 0x1000;
}

Two different APIs imply gpr[3] with a different sign.

You can add:

Reviewed-by: Alexey Gladkov 

> > It looks like the selftest you mentioned in the commit message doesn't
> > check the !trap_is_scv() branch.
> 
> The selftest is architecture-agnostic, it just executes syscalls and
> checks whether the data returned by PTRACE_GET_SYSCALL_INFO meets
> expectations.  Do you mean that syscall() is not good enough for syscall
> invocation from coverage perspective on powerpc?
> 
> See also commit d72500f99284 ("powerpc/64s/syscall: Fix ptrace syscall
> info with scv syscalls").
> 
> 
> -- 
> ldv

-- 
Rgrds, legion

Re: [PATCH v6 08/26] fs/dax: Remove PAGE_MAPPING_DAX_SHARED mapping flag

2025-01-14 Thread David Hildenbrand


On 10.01.25 07:00, Alistair Popple wrote:

PAGE_MAPPING_DAX_SHARED is the same as PAGE_MAPPING_ANON. This isn't
currently a problem because FS DAX pages are treated
specially. However a future change will make FS DAX pages more like
normal pages, so folio_test_anon() must not return true for a FS DAX
page.


Yes, very nice to see PAGE_MAPPING_DAX_SHARED go!

--
Cheers,

David / dhildenb

Re: [PATCH v6 11/26] mm: Allow compound zone device pages

2025-01-14 Thread David Hildenbrand


On 10.01.25 07:00, Alistair Popple wrote:

Zone device pages are used to represent various type of device memory
managed by device drivers. Currently compound zone device pages are
not supported. This is because MEMORY_DEVICE_FS_DAX pages are the only
user of higher order zone device pages and have their own page
reference counting.

A future change will unify FS DAX reference counting with normal page
reference counting rules and remove the special FS DAX reference
counting. Supporting that requires compound zone device pages.

Supporting compound zone device pages requires compound_head() to
distinguish between head and tail pages whilst still preserving the
special struct page fields that are specific to zone device pages.

A tail page is distinguished by having bit zero being set in
page->compound_head, with the remaining bits pointing to the head
page. For zone device pages page->compound_head is shared with
page->pgmap.

The page->pgmap field is common to all pages within a memory section.
Therefore pgmap is the same for both head and tail pages and can be
moved into the folio and we can use the standard scheme to find
compound_head from a tail page.


The more relevant thing is that the pgmap field must be common to all 
pages in a folio, even if a folio exceeds memory sections (e.g., 128 MiB 
on x86_64 where we have 1 GiB folios).


> > Signed-off-by: Alistair Popple 

Reviewed-by: Jason Gunthorpe 
Reviewed-by: Dan Williams 

---

Changes for v4:
  - Fix build breakages reported by kernel test robot

Changes since v2:

  - Indentation fix
  - Rename page_dev_pagemap() to page_pgmap()
  - Rename folio _unused field to _unused_pgmap_compound_head
  - s/WARN_ON/VM_WARN_ON_ONCE_PAGE/

Changes since v1:

  - Move pgmap to the folio as suggested by Matthew Wilcox
---


[...]


  static inline bool folio_is_device_coherent(const struct folio *folio)
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 29919fa..61899ec 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -205,8 +205,8 @@ struct migrate_vma {
unsigned long   end;
  
  	/*

-* Set to the owner value also stored in page->pgmap->owner for
-* migrating out of device private memory. The flags also need to
+* Set to the owner value also stored in page_pgmap(page)->owner
+* for migrating out of device private memory. The flags also need to
 * be set to MIGRATE_VMA_SELECT_DEVICE_PRIVATE.
 * The caller should always set this field when using mmu notifier
 * callbacks to avoid device MMU invalidations for device private
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index df8f515..54b59b8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -129,8 +129,11 @@ struct page {
unsigned long compound_head;/* Bit zero is set */
};
struct {/* ZONE_DEVICE pages */
-   /** @pgmap: Points to the hosting device page map. */
-   struct dev_pagemap *pgmap;
+   /*
+* The first word is used for compound_head or folio
+* pgmap
+*/
+   void *_unused_pgmap_compound_head;
void *zone_device_data;
/*
 * ZONE_DEVICE private pages are counted as being
@@ -299,6 +302,7 @@ typedef struct {
   * @_refcount: Do not access this member directly.  Use folio_ref_count()
   *to find how many references there are to this folio.
   * @memcg_data: Memory Control Group data.
+ * @pgmap: Metadata for ZONE_DEVICE mappings
   * @virtual: Virtual address in the kernel direct map.
   * @_last_cpupid: IDs of last CPU and last process that accessed the folio.
   * @_entire_mapcount: Do not use directly, call folio_entire_mapcount().
@@ -337,6 +341,7 @@ struct folio {
/* private: */
};
/* public: */
+   struct dev_pagemap *pgmap;


Agreed, that should work.

Acked-by: David Hildenbrand 

--
Cheers,

David / dhildenb

[PATCH] selftests: livepatch: handle PRINTK_CALLER in check_result()

2025-01-14 Thread Madhavan Srinivasan

Some arch configs (like ppc64) enable CONFIG_PRINTK_CALLER, which
adds the caller id as part of the dmesg. Due to this, even though
the expected vs observed are same, end testcase results are failed.

 -% insmod test_modules/test_klp_livepatch.ko
 -livepatch: enabling patch 'test_klp_livepatch'
 -livepatch: 'test_klp_livepatch': initializing patching transition
 -livepatch: 'test_klp_livepatch': starting patching transition
 -livepatch: 'test_klp_livepatch': completing patching transition
 -livepatch: 'test_klp_livepatch': patching complete
 -% echo 0 > /sys/kernel/livepatch/test_klp_livepatch/enabled
 -livepatch: 'test_klp_livepatch': initializing unpatching transition
 -livepatch: 'test_klp_livepatch': starting unpatching transition
 -livepatch: 'test_klp_livepatch': completing unpatching transition
 -livepatch: 'test_klp_livepatch': unpatching complete
 -% rmmod test_klp_livepatch
 +[   T3659] % insmod test_modules/test_klp_livepatch.ko
 +[   T3682] livepatch: enabling patch 'test_klp_livepatch'
 +[   T3682] livepatch: 'test_klp_livepatch': initializing patching transition
 +[   T3682] livepatch: 'test_klp_livepatch': starting patching transition
 +[T826] livepatch: 'test_klp_livepatch': completing patching transition
 +[T826] livepatch: 'test_klp_livepatch': patching complete
 +[   T3659] % echo 0 > /sys/kernel/livepatch/test_klp_livepatch/enabled
 +[   T3659] livepatch: 'test_klp_livepatch': initializing unpatching transition
 +[   T3659] livepatch: 'test_klp_livepatch': starting unpatching transition
 +[T789] livepatch: 'test_klp_livepatch': completing unpatching transition
 +[T789] livepatch: 'test_klp_livepatch': unpatching complete
 +[   T3659] % rmmod test_klp_livepatch

  ERROR: livepatch kselftest(s) failed
 not ok 1 selftests: livepatch: test-livepatch.sh # exit=1

Currently the check_result() handles the "[time]" removal from
the dmesg. Enhance the check to handle removal of "[Tid]" also.

Signed-off-by: Madhavan Srinivasan 
---
 tools/testing/selftests/livepatch/functions.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/livepatch/functions.sh 
b/tools/testing/selftests/livepatch/functions.sh
index e5d06fb40233..a1730c1864a4 100644
--- a/tools/testing/selftests/livepatch/functions.sh
+++ b/tools/testing/selftests/livepatch/functions.sh
@@ -306,7 +306,8 @@ function check_result {
result=$(dmesg | awk -v last_dmesg="$LAST_DMESG" 'p; $0 == last_dmesg { 
p=1 }' | \
 grep -e 'livepatch:' -e 'test_klp' | \
 grep -v '\(tainting\|taints\) kernel' | \
-sed 's/^\[[ 0-9.]*\] //')
+sed 's/^\[[ 0-9.]*\] //' | \
+sed 's/^\[[ ]*T[0-9]*\] //')
 
if [[ "$expect" == "$result" ]] ; then
echo "ok"
-- 
2.47.0

Re: [kvm-unit-tests PATCH v1 2/5] configure: Display the default processor for arm and arm64

2025-01-14 Thread Alexandru Elisei

Hi Drew,

On Mon, Jan 13, 2025 at 04:11:06PM +0100, Andrew Jones wrote:
> On Fri, Jan 10, 2025 at 01:58:45PM +, Alexandru Elisei wrote:
> > The help text for the --processor option displays the architecture name as
> > the default processor type. But the default for arm is cortex-a15, and for
> > arm64 is cortex-a57. Teach configure to display the correct default
> > processor type for these two architectures.
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> >  configure | 30 ++
> >  1 file changed, 22 insertions(+), 8 deletions(-)
> > 
> > diff --git a/configure b/configure
> > index 5b0a2d7f39c0..138840c3f76d 100755
> > --- a/configure
> > +++ b/configure
> > @@ -5,6 +5,24 @@ if [ -z "${BASH_VERSINFO[0]}" ] || [ "${BASH_VERSINFO[0]}" 
> > -lt 4 ] ; then
> >  exit 1
> >  fi
> >  
> > +function get_default_processor()
> > +{
> > +local arch="$1"
> > +
> > +case "$arch" in
> > +"arm")
> > +default_processor="cortex-a15"
> > +;;
> > +"arm64" | "aarch64")
> > +default_processor="cortex-a57"
> > +;;
> > +*)
> > +default_processor=$arch
> > +esac
> > +
> > +echo "$default_processor"
> > +}
> > +
> >  srcdir=$(cd "$(dirname "$0")"; pwd)
> >  prefix=/usr/local
> >  cc=gcc
> > @@ -33,6 +51,7 @@ page_size=
> >  earlycon=
> >  efi=
> >  efi_direct=
> > +default_processor=$(get_default_processor $arch)
> >  
> >  # Enable -Werror by default for git repositories only (i.e. developer 
> > builds)
> >  if [ -e "$srcdir"/.git ]; then
> > @@ -48,7 +67,7 @@ usage() {
> > Options include:
> > --arch=ARCHarchitecture to compile for ($arch). ARCH 
> > can be one of:
> >arm, arm64/aarch64, i386, ppc64, riscv32, 
> > riscv64, s390x, x86_64
> > -   --processor=PROCESSOR  processor to compile for ($arch)
> > +   --processor=PROCESSOR  processor to compile for ($default_processor)
> > --target=TARGETtarget platform that the tests will be 
> > running on (qemu or
> >kvmtool, default is qemu) (arm/arm64 only)
> > --cross-prefix=PREFIX  cross compiler prefix
> > @@ -283,13 +302,8 @@ else
> >  fi
> >  fi
> >  
> > -[ -z "$processor" ] && processor="$arch"
> > -
> > -if [ "$processor" = "arm64" ]; then
> > -processor="cortex-a57"
> > -elif [ "$processor" = "arm" ]; then
> > -processor="cortex-a15"
> > -fi
> > +# $arch will have changed when cross-compiling.
> > +[ -z "$processor" ] && processor=$(get_default_processor $arch)
> 
> The fact that $arch and $processor are wrong until they've had a chance to

$processor is never wrong. $processor is unset until either the user sets it
with --processor, or until this line. This patch introduces $default_processor
only for the purpose of having an accurate help text, it doesn't change when and
how $processor is assigned.

> be converted might be another reason for the $do_help idea. But it'll
> always be fragile since another change that does some sort of conversion
> could end up getting added after the '[ $do_help ] && usage' someday.

configure needs to distinguish between:

1. The user not having specified --processor when doing ./configure.
2. The user having set --processor.

If 1, then kvm-unit-tests can use the default $processor value for $arch,
which could have also been specified by the user.

If 2, then kvm-unit-tests should not touch $processor because that's what the
user wants.

Do you see something wrong with that reasoning?

Also, I don't understand why you say it's fragile, since configure doesn't
touch $processor until this point (and unless the user sets it, of course).

Thanks,
Alex

Re: [PATCH v6 16/26] huge_memory: Add vmf_insert_folio_pmd()

2025-01-14 Thread Dan Williams

David Hildenbrand wrote:
> > +vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, 
> > bool write)
> > +{
> > +   struct vm_area_struct *vma = vmf->vma;
> > +   unsigned long addr = vmf->address & PMD_MASK;
> > +   struct mm_struct *mm = vma->vm_mm;
> > +   spinlock_t *ptl;
> > +   pgtable_t pgtable = NULL;
> > +
> > +   if (addr < vma->vm_start || addr >= vma->vm_end)
> > +   return VM_FAULT_SIGBUS;
> > +
> > +   if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
> > +   return VM_FAULT_SIGBUS;
> > +
> > +   if (arch_needs_pgtable_deposit()) {
> > +   pgtable = pte_alloc_one(vma->vm_mm);
> > +   if (!pgtable)
> > +   return VM_FAULT_OOM;
> > +   }
> 
> This is interesting and nasty at the same time (only to make ppc64 boo3s 
> with has tables happy). But it seems to be the right thing to do.
> 
> > +
> > +   ptl = pmd_lock(mm, vmf->pmd);
> > +   if (pmd_none(*vmf->pmd)) {
> > +   folio_get(folio);
> > +   folio_add_file_rmap_pmd(folio, &folio->page, vma);
> > +   add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
> > +   }
> > +   insert_pfn_pmd(vma, addr, vmf->pmd, pfn_to_pfn_t(folio_pfn(folio)),
> > +  vma->vm_page_prot, write, pgtable);
> > +   spin_unlock(ptl);
> > +   if (pgtable)
> > +   pte_free(mm, pgtable);
> 
> Ehm, are you unconditionally freeing the pgtable, even if consumed by 
> insert_pfn_pmd() ?
> 
> Note that setting pgtable to NULL in insert_pfn_pmd() when consumed will 
> not be visible here.
> 
> You'd have to pass a pointer to the ... pointer (&pgtable).
> 
> ... unless I am missing something, staring at the diff.

In fact I glazed over the fact that this has been commented on before
and assumed it was fixed:

http://lore.kernel.org/66f61ce4da80_964f229...@dwillia2-xfh.jf.intel.com.notmuch

So, yes, insert_pfn_pmd needs to take &pgtable to report back if the
allocation got consumed.

Good catch.

Re: [kvm-unit-tests PATCH v1 4/5] arm/arm64: Add support for --processor=max

2025-01-14 Thread Alexandru Elisei

Hi Drew,

On Mon, Jan 13, 2025 at 04:21:45PM +0100, Andrew Jones wrote:
> On Fri, Jan 10, 2025 at 01:58:47PM +, Alexandru Elisei wrote:
> > For arm64, newer architecture features are supported only on newer CPUs.
> > Instead of expecting the user to know which CPU model supports which
> > feature when using the TCG accelerator for qemu, let's make it easier and
> > add support for the --processor 'max' value.
> > 
> > The --processor value is passed to the compiler's -mcpu argument and to
> > qemu's -cpu argument. 'max' is a special value that only qemu understands -
> > it means that all CPU features that qemu implements are supported by the
> > guest CPU, and passing it to the compiler causes a build error. So omit the
> > -mcpu argument when $PROCESSOR=max.
> > 
> > This affects only the TCG accelerator; when using KVM or HVF,
> > kvm-unit-tests sets the cpu model to 'host'.
> > 
> > Note that using --processor=max with a 32 bit compiler will cause a build
> > error: the CPU model that the compiler defaults to when the -mcpu argument
> > is missing lacks support for some of the instructions that kvm-unit-tests
> > uses. The solution in the case is to specify a CPU model for the compiler
> > using --cflags:
> > 
> >   ./configure --arch=arm --processor=max --cflags=-mcpu=
> > 
> > This patch doesn't introduce a regression for arm when --processor=max is
> > used, it's only the error that changes: from an unknown processor type to
> > using instructions that are not available on the processor.
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> >  arm/Makefile.common | 2 ++
> >  configure   | 5 -
> >  2 files changed, 6 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arm/Makefile.common b/arm/Makefile.common
> > index a5d97bcf477a..b757250dc9ae 100644
> > --- a/arm/Makefile.common
> > +++ b/arm/Makefile.common
> > @@ -25,7 +25,9 @@ AUXFLAGS ?= 0x0
> >  # stack.o relies on frame pointers.
> >  KEEP_FRAME_POINTER := y
> >  
> > +ifneq ($(PROCESSOR),max)
> >  CFLAGS += -mcpu=$(PROCESSOR)
> > +endif
> >  CFLAGS += -std=gnu99
> >  CFLAGS += -ffreestanding
> >  CFLAGS += -O2
> > diff --git a/configure b/configure
> > index 138840c3f76d..46964d36a7d8 100755
> > --- a/configure
> > +++ b/configure
> > @@ -67,7 +67,10 @@ usage() {
> > Options include:
> > --arch=ARCHarchitecture to compile for ($arch). ARCH 
> > can be one of:
> >arm, arm64/aarch64, i386, ppc64, riscv32, 
> > riscv64, s390x, x86_64
> > -   --processor=PROCESSOR  processor to compile for ($default_processor)
> > +   --processor=PROCESSOR  processor to compile for 
> > ($default_processor). For arm and arm64, the
> > +  value 'max' is special and it will be passed 
> > directly to
> > +  qemu, bypassing the compiler. In this case, 
> > --cflags can be
> > +  used to compile for a specific processor.
> > --target=TARGETtarget platform that the tests will be 
> > running on (qemu or
> >kvmtool, default is qemu) (arm/arm64 only)
> > --cross-prefix=PREFIX  cross compiler prefix
> > -- 
> > 2.47.1
> >
> 
> I don't think we want to overload processor this way. While mcpu and QEMU
> could both understand the same cpu names, then it was mostly fine
> (although it probably shouldn't have been overloaded before either). Now
> that we want one name for compiling and another for running, then I think
> we need another configure parameter, something like --qemu-cpu.

I agree, that's a better approach than overloading --processor. I'll try that
for the next iteration.

Thanks,
Alex

Re: [PATCH 0/2] ASoC: fsl: Support micfil on i.MX943

2025-01-14 Thread Mark Brown

On Tue, 14 Jan 2025 18:27:18 +0800, Shengjiu Wang wrote:
> On i.MX943, the FIFO data address is changed and the bit width
> of CICOSR is changed.
> Add a new compatible string and update driver for these changes.
> 
> Shengjiu Wang (2):
>   ASoC: fsl_micfil: Add i.MX943 platform support
>   ASoC: dt-bindings: fsl,micfil: Add compatible string for i.MX943
> platform
> 
> [...]

Applied to

   https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git for-next

Thanks!

[1/2] ASoC: fsl_micfil: Add i.MX943 platform support
  commit: eab69050450ba63a4edb17d3d1a8654d2a130786
[2/2] ASoC: dt-bindings: fsl,micfil: Add compatible string for i.MX943 platform
  commit: 3927c51e49c1a45785334dc578f0b29c685619ec

All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.

You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.

If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.

Please add any relevant lists and maintainers to the CCs when replying
to this mail.

Thanks,
Mark

Re: [kvm-unit-tests PATCH v1 5/5] configure: arm64: Make 'max' the default for --processor

2025-01-14 Thread Alexandru Elisei

Hi Drew,

On Mon, Jan 13, 2025 at 04:29:21PM +0100, Andrew Jones wrote:
> On Fri, Jan 10, 2025 at 01:58:48PM +, Alexandru Elisei wrote:
> > Newer architecture features are supported by qemu TCG on newer CPUs. When
> > writing a test for such architecture features, it is necessary to pass the
> > correct -cpu argument to qemu. Make it easier on users and test authors
> > alike by making 'max' the default value for --processor. The 'max' CPU
> > model contains all the features of the cortex-a57 CPU (the old default), so
> > no regression should be possible.
> > 
> > A side effect is that, by default, the compiler will not receive a -mcpu
> > argument for compiling the code. The expectation is that this is fine,
> > since support for -mcpu=$PROCESSOR has only been added for arm64 in the
> > last commit.
> > 
> > The default for arm (cortex-a15) has been kept unchanged, because passing
> > --processor=max will cause compilation to break. If the user wants the qemu
> > CPU model to be 'max', the user will also have to supply a suitable compile
> > CPU target via --cflags=-mcpu= configure option.
> > 
> > Signed-off-by: Alexandru Elisei 
> > ---
> >  configure | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/configure b/configure
> > index 46964d36a7d8..3ab0ec208e10 100755
> > --- a/configure
> > +++ b/configure
> > @@ -14,7 +14,7 @@ function get_default_processor()
> >  default_processor="cortex-a15"
> >  ;;
> >  "arm64" | "aarch64")
> > -default_processor="cortex-a57"
> > +default_processor="max"
> >  ;;
> >  *)
> >  default_processor=$arch
> > -- 
> > 2.47.1
> >
> 
> Another reason to introduce a new parameter (qemu_cpu) is that we can also
> change arm32 to 'max', reducing divergence between arm32 and arm64.

That sounds very sensible, I'll do that for the next iteration.

Thanks,
Alex

Re: [RFC PATCH 0/2] sched/fair: introduce new scheduler group type group_parked

2025-01-14 Thread Tobias Huschle





On 10/12/2024 21:24, Shrikanth Hegde wrote:

On 12/9/24 13:35, Tobias Huschle wrote:

[...]


It was happening with 100% stress-ng case. I was wondering since i dont 
have no-hz full enabled.
I found out the reason why and one way to do is to trigger active load 
balance if there are any parked cpus
in the group. That probably needs a IS_ENABLED check not to hurt the 
regular case.


Also, I gave a try to include arch_cpu_parked in idle_cpu and friends. 
It seems to working for me.

I will attach the code below. It simplifies code quite a bit.

Also, I am thinking to rely on active balance codepath more than the 
regular pull model.
so this would be akin to asym packing codepaths. The below code does 
that too.


Feel free to take the bits as necessary if it works.



Thanks a lot for your your comments and proposals. I was working through 
them and have a v2 almost ready. I'll be offline for the next 4 weeks 
though and will post my v2 once I'm back.



[...]

Re: [kvm-unit-tests PATCH v1 1/5] configure: Document that the architecture name 'aarch64' is also supported

2025-01-14 Thread Andrew Jones

On Tue, Jan 14, 2025 at 05:03:20PM +, Alexandru Elisei wrote:
...
> diff --git a/configure b/configure
> index 86cf1da36467..1362b68dd68b 100755
> --- a/configure
> +++ b/configure
> @@ -15,8 +15,8 @@ objdump=objdump
>  readelf=readelf
>  ar=ar
>  addr2line=addr2line
> -arch=$(uname -m | sed -e 
> 's/i.86/i386/;s/arm64/aarch64/;s/arm.*/arm/;s/ppc64.*/ppc64/')
> -host=$arch
> +host=$(uname -m | sed -e 
> 's/i.86/i386/;s/arm64/aarch64/;s/arm.*/arm/;s/ppc64.*/ppc64/')
> +arch=$(echo $host | sed -e 's/aarch64/arm64/')

Sure, or avoid the second sed and just do

host=$(...)
arch=$host
[ "$arch" = "aarch64" ] && arch="arm64"

Thanks,
drew

Re: [PATCH] cpufreq: Use str_enable_disable-like helpers

2025-01-14 Thread Krzysztof Kozlowski

On 14/01/2025 11:56, Krzysztof Kozlowski wrote:
>   if (cpufreq_boost_trigger_state(enable)) {
>   pr_err("%s: Cannot %s BOOST!\n",
> -__func__, enable ? "enable" : "disable");
> +__func__, str_enable_disable(enable));
>   return -EINVAL;
>   }
>  
>   pr_debug("%s: cpufreq BOOST %s\n",
> -  __func__, enable ? "enabled" : "disabled");
> +  __func__, str_enable_disable(enable));
This needs fix - enabled, not enable. V2.

Best regards,
Krzysztof

Re: [PATCH v9 0/8] PCI: Consolidate TLP Log reading and printing

2025-01-14 Thread Bjorn Helgaas

On Tue, Jan 14, 2025 at 07:08:32PM +0200, Ilpo Järvinen wrote:
> This series has the remaining patches of the AER & DPC TLP Log handling
> consolidation and now includes a few minor improvements to the earlier
> accepted TLP Logging code.
> 
> v9:
> - Added patch to define header logging register sizes.
> 
> v8:
> - Added missing parameter to kerneldoc.
> - Dropped last patch due to conflict with the pci_printk() cleanup
>   series (will move the patch into that series).
> 
> v7:
> - Explain in commit message reasoning why eetlp_prefix_max stores Max
>   End-End TLP Prefixes value instead of limiting it by the bridge/RP
>   imposed limits
> - Take account TLP Prefix Log Present flag.
> - Align PCI_ERR_CAP_* flags in pci_regs.h
> - Add EE_PREFIX_STR define to be able to take its sizeof() for output
>   char[] sizing.
> 
> v6:
> - Preserve "AER:"/"DPC:" prefix on the printed TLP line
> - New patch to add "AER:" also  on other lines of the AER error dump
> 
> v5:
> - Fix build with AER=y and DPC=n
> - Match kerneldoc and function parameter name
> 
> v4:
> - Added patches:
>   - Remove EXPORT of pcie_read_tlp_log()
>   - Moved code to pcie/tlp.c and build only with AER enabled
>   - Match variables in prototype and function
>   - int -> unsigned int conversion
>   - eetlp_prefix_max into own patch
> - struct pcie_tlp_log param consistently called "log" within tlp.c
> - Moved function prototypes into drivers/pci/pci.h
> - Describe AER/DPC differences more clearly in one commit message
> 
> v3:
> - Small rewording in a commit message
> 
> v2:
> - Don't add EXPORT()s
> - Don't include igxbe changes
> - Don't use pr_cont() as it's incompatible with pci_err() and according
>   to Andy Shevchenko should not be used in the first place
> 
> 
> Ilpo Järvinen (8):
>   PCI: Don't expose pcie_read_tlp_log() outside of PCI subsystem
>   PCI: Move TLP Log handling to own file
>   PCI: Add defines for TLP Header/Prefix log sizes
>   PCI: Make pcie_read_tlp_log() signature same
>   PCI: Use unsigned int i in pcie_read_tlp_log()
>   PCI: Store # of supported End-End TLP Prefixes
>   PCI: Add TLP Prefix reading into pcie_read_tlp_log()
>   PCI: Create helper to print TLP Header and Prefix Log
> 
>  drivers/pci/ats.c |   2 +-
>  drivers/pci/pci.c |  28 -
>  drivers/pci/pci.h |   9 +++
>  drivers/pci/pcie/Makefile |   2 +-
>  drivers/pci/pcie/aer.c|  15 ++---
>  drivers/pci/pcie/dpc.c|  22 +++
>  drivers/pci/pcie/tlp.c| 115 ++
>  drivers/pci/probe.c   |  14 +++--
>  drivers/pci/quirks.c  |   6 +-
>  include/linux/aer.h   |  12 +++-
>  include/linux/pci.h   |   2 +-
>  include/uapi/linux/pci_regs.h |  11 ++--
>  12 files changed, 172 insertions(+), 66 deletions(-)
>  create mode 100644 drivers/pci/pcie/tlp.c

Applied to pci/err for v6.14, thanks!

Re: [PATCH v6 08/26] fs/dax: Remove PAGE_MAPPING_DAX_SHARED mapping flag

2025-01-14 Thread Alistair Popple

On Mon, Jan 13, 2025 at 04:52:34PM -0800, Dan Williams wrote:
> Alistair Popple wrote:
> > PAGE_MAPPING_DAX_SHARED is the same as PAGE_MAPPING_ANON. 
> 
> I think a bit a bit more detail is warranted, how about?
> 
> The page ->mapping pointer can have magic values like
> PAGE_MAPPING_DAX_SHARED and PAGE_MAPPING_ANON for page owner specific
> usage. In fact, PAGE_MAPPING_DAX_SHARED and PAGE_MAPPING_ANON alias the
> same value.

Massaged it slightly but sounds good.

> > This isn't currently a problem because FS DAX pages are treated
> > specially.
> 
> s/are treated specially/are never seen by the anonymous mapping code and
> vice versa/
> 
> > However a future change will make FS DAX pages more like
> > normal pages, so folio_test_anon() must not return true for a FS DAX
> > page.
> > 
> > We could explicitly test for a FS DAX page in folio_test_anon(),
> > etc. however the PAGE_MAPPING_DAX_SHARED flag isn't actually
> > needed. Instead we can use the page->mapping field to implicitly track
> > the first mapping of a page. If page->mapping is non-NULL it implies
> > the page is associated with a single mapping at page->index. If the
> > page is associated with a second mapping clear page->mapping and set
> > page->share to 1.
> > 
> > This is possible because a shared mapping implies the file-system
> > implements dax_holder_operations which makes the ->mapping and
> > ->index, which is a union with ->share, unused.
> > 
> > The page is considered shared when page->mapping == NULL and
> > page->share > 0 or page->mapping != NULL, implying it is present in at
> > least one address space. This also makes it easier for a future change
> > to detect when a page is first mapped into an address space which
> > requires special handling.
> > 
> > Signed-off-by: Alistair Popple 
> > ---
> >  fs/dax.c   | 45 +--
> >  include/linux/page-flags.h |  6 +-
> >  2 files changed, 29 insertions(+), 22 deletions(-)
> > 
> > diff --git a/fs/dax.c b/fs/dax.c
> > index 4e49cc4..d35dbe1 100644
> > --- a/fs/dax.c
> > +++ b/fs/dax.c
> > @@ -351,38 +351,41 @@ static unsigned long dax_end_pfn(void *entry)
> > for (pfn = dax_to_pfn(entry); \
> > pfn < dax_end_pfn(entry); pfn++)
> >  
> > +/*
> > + * A DAX page is considered shared if it has no mapping set and ->share 
> > (which
> > + * shares the ->index field) is non-zero. Note this may return false even 
> > if the
> > + * page is shared between multiple files but has not yet actually been 
> > mapped
> > + * into multiple address spaces.
> > + */
> >  static inline bool dax_page_is_shared(struct page *page)
> >  {
> > -   return page->mapping == PAGE_MAPPING_DAX_SHARED;
> > +   return !page->mapping && page->share;
> >  }
> >  
> >  /*
> > - * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
> > - * refcount.
> > + * Increase the page share refcount, warning if the page is not marked as 
> > shared.
> >   */
> >  static inline void dax_page_share_get(struct page *page)
> >  {
> > -   if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
> > -   /*
> > -* Reset the index if the page was already mapped
> > -* regularly before.
> > -*/
> > -   if (page->mapping)
> > -   page->share = 1;
> > -   page->mapping = PAGE_MAPPING_DAX_SHARED;
> > -   }
> > +   WARN_ON_ONCE(!page->share);
> > +   WARN_ON_ONCE(page->mapping);
> 
> Given the only caller of this function is dax_associate_entry() it seems
> like overkill to check that a function only a few lines away manipulated
> ->mapping correctly.

Good call.

> I don't see much reason for dax_page_share_get() to exist after your
> changes.
> 
> Perhaps all that is needed is a dax_make_shared() helper that does the
> initial fiddling of '->mapping = NULL' and '->share = 1'?

Ok. I was going to make the argument that dax_make_shared() was overkill as
well, but as noted below it's a good place to put the comment describing how
this all works so have done that.

> > page->share++;
> >  }
> >  
> >  static inline unsigned long dax_page_share_put(struct page *page)
> >  {
> > +   WARN_ON_ONCE(!page->share);
> > return --page->share;
> >  }
> >  
> >  /*
> > - * When it is called in dax_insert_entry(), the shared flag will indicate 
> > that
> > - * whether this entry is shared by multiple files.  If so, set the 
> > page->mapping
> > - * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
> > + * When it is called in dax_insert_entry(), the shared flag will indicate
> > + * whether this entry is shared by multiple files. If the page has not
> > + * previously been associated with any mappings the ->mapping and ->index
> > + * fields will be set. If it has already been associated with a mapping
> > + * the mapping will be cleared and the share count set. It's then up to the
> > + * file-system to track which mappings contain which pages, ie. by 
> > implementing
>

Re: [PATCH v6 13/26] mm/memory: Add vmf_insert_page_mkwrite()

2025-01-14 Thread Alistair Popple

On Tue, Jan 14, 2025 at 05:15:54PM +0100, David Hildenbrand wrote:
> On 10.01.25 07:00, Alistair Popple wrote:
> > Currently to map a DAX page the DAX driver calls vmf_insert_pfn. This
> > creates a special devmap PTE entry for the pfn but does not take a
> > reference on the underlying struct page for the mapping. This is
> > because DAX page refcounts are treated specially, as indicated by the
> > presence of a devmap entry.
> > 
> > To allow DAX page refcounts to be managed the same as normal page
> > refcounts introduce vmf_insert_page_mkwrite(). This will take a
> > reference on the underlying page much the same as vmf_insert_page,
> > except it also permits upgrading an existing mapping to be writable if
> > requested/possible.
> > 
> > Signed-off-by: Alistair Popple 
> > 
> > ---
> > 
> > Updates from v2:
> > 
> >   - Rename function to make not DAX specific
> > 
> >   - Split the insert_page_into_pte_locked() change into a separate
> > patch.
> > 
> > Updates from v1:
> > 
> >   - Re-arrange code in insert_page_into_pte_locked() based on comments
> > from Jan Kara.
> > 
> >   - Call mkdrity/mkyoung for the mkwrite case, also suggested by Jan.
> > ---
> >   include/linux/mm.h |  2 ++
> >   mm/memory.c| 36 
> >   2 files changed, 38 insertions(+)
> > 
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index e790298..f267b06 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -3620,6 +3620,8 @@ int vm_map_pages(struct vm_area_struct *vma, struct 
> > page **pages,
> > unsigned long num);
> >   int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
> > unsigned long num);
> > +vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
> > +   bool write);
> >   vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
> > unsigned long pfn);
> >   vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long 
> > addr,
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 8531acb..c60b819 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -2624,6 +2624,42 @@ static vm_fault_t __vm_insert_mixed(struct 
> > vm_area_struct *vma,
> > return VM_FAULT_NOPAGE;
> >   }
> > +vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
> > +   bool write)
> > +{
> > +   struct vm_area_struct *vma = vmf->vma;
> > +   pgprot_t pgprot = vma->vm_page_prot;
> > +   unsigned long pfn = page_to_pfn(page);
> > +   unsigned long addr = vmf->address;
> > +   int err;
> > +
> > +   if (addr < vma->vm_start || addr >= vma->vm_end)
> > +   return VM_FAULT_SIGBUS;
> > +
> > +   track_pfn_insert(vma, &pgprot, pfn_to_pfn_t(pfn));
> 
> I think I raised this before: why is this track_pfn_insert() in here? It
> only ever does something to VM_PFNMAP mappings, and that cannot possibly be
> the case here (nothing in VM_PFNMAP is refcounted, ever)?

Yes, I also had deja vu reading this comment and a vague recollection of fixing
them too. Your comments[1] were for vmf_insert_folio_pud() though which exlains
why I neglected to do the same clean-up here even though I should have so thanks
for pointing them out.

[1] - 
https://lore.kernel.org/linux-mm/ee19854f-fa1f-4207-9176-3c7b79bcc...@redhat.com/

> 
> > +
> > +   if (!pfn_modify_allowed(pfn, pgprot))
> > +   return VM_FAULT_SIGBUS;
> 
> Why is that required? Why are we messing so much with PFNs? :)
> 
> Note that x86 does in there
> 
>   /* If it's real memory always allow */
>   if (pfn_valid(pfn))
>   return true;
> 
> See below, when would we ever have a "struct page *" but !pfn_valid() ?
> 
> 
> > +
> > +   /*
> > +* We refcount the page normally so make sure pfn_valid is true.
> > +*/
> > +   if (!pfn_valid(pfn))
> > +   return VM_FAULT_SIGBUS;
> 
> Somebody gave us a "struct page", how could the pfn ever by invalid (not
> have a struct page)?
> 
> I think all of the above regarding PFNs should be dropped -- unless I am
> missing something important.
> 
> > +
> > +   if (WARN_ON(is_zero_pfn(pfn) && write))
> > +   return VM_FAULT_SIGBUS;
> 
> is_zero_page() if you already have the "page". But note that in
> validate_page_before_insert() we do have a check that allows for conditional
> insertion of the shared zeropage.
> 
> So maybe this hunk is also not required.

Yes, also not required. I have removed the above hunks as well because we don't
need any of this pfn stuff. Again it's just a hangover from an earlier version
of the series when I was passing pfn's rather than pages here.

> > +
> > +   err = insert_page(vma, addr, page, pgprot, write);
> > +   if (err == -ENOMEM)
> > +   return VM_FAULT_OOM;
> > +   if (err < 0 && err != -EBUSY)
> > +   return VM_FAULT_SIGBUS;
> > +
> > +   return VM_FAULT_NOPAGE;
> > +}
> > +EXPORT_SYMBOL_

Re: [PATCH v6 15/26] huge_memory: Add vmf_insert_folio_pud()

2025-01-14 Thread Alistair Popple

On Tue, Jan 14, 2025 at 05:22:15PM +0100, David Hildenbrand wrote:
> On 10.01.25 07:00, Alistair Popple wrote:
> > Currently DAX folio/page reference counts are managed differently to
> > normal pages. To allow these to be managed the same as normal pages
> > introduce vmf_insert_folio_pud. This will map the entire PUD-sized folio
> > and take references as it would for a normally mapped page.
> > 
> > This is distinct from the current mechanism, vmf_insert_pfn_pud, which
> > simply inserts a special devmap PUD entry into the page table without
> > holding a reference to the page for the mapping.
> > 
> > Signed-off-by: Alistair Popple 
> 
> [...]
> 
> > +/**
> > + * vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry
> > + * @vmf: Structure describing the fault
> > + * @folio: folio to insert
> > + * @write: whether it's a write fault
> > + *
> > + * Return: vm_fault_t value.
> > + */
> > +vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, 
> > bool write)
> > +{
> > +   struct vm_area_struct *vma = vmf->vma;
> > +   unsigned long addr = vmf->address & PUD_MASK;
> > +   pud_t *pud = vmf->pud;
> > +   struct mm_struct *mm = vma->vm_mm;
> > +   spinlock_t *ptl;
> > +
> > +   if (addr < vma->vm_start || addr >= vma->vm_end)
> > +   return VM_FAULT_SIGBUS;
> > +
> > +   if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
> > +   return VM_FAULT_SIGBUS;
> > +
> > +   ptl = pud_lock(mm, pud);
> > +   if (pud_none(*vmf->pud)) {
> > +   folio_get(folio);
> > +   folio_add_file_rmap_pud(folio, &folio->page, vma);
> > +   add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
> > +   }
> > +   insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)), 
> > write);
> 
> This looks scary at first (inserting something when not taking a reference),
> but insert_pfn_pud() seems to handle that. A comment here would have been
> nice.

Indeed, I will add one.
 
> It's weird, though, that if there is already something else, that we only
> WARN but don't actually return an error. So ...

Note we only WARN when there is already a mapping there and we're trying to
upgrade it to writeable. This just mimics the logic which currently exists in
insert_pfn() and insert_pfn_pmd().

The comment in insert_pfn() sheds more light:

/*
 * For read faults on private mappings the PFN passed
 * in may not match the PFN we have mapped if the
 * mapped PFN is a writeable COW page.  In the mkwrite
 * case we are creating a writable PTE for a shared
 * mapping and we expect the PFNs to match. If they
 * don't match, we are likely racing with block
 * allocation and mapping invalidation so just skip the
 * update.
 */

> > +   spin_unlock(ptl);
> > +
> > +   return VM_FAULT_NOPAGE;
> 
> I assume always returning VM_FAULT_NOPAGE, even when something went wrong,
> is the right thing to do?

Yes, I think so. I guess in the WARN case we could return something like
VM_FAULT_SIGBUS to kill the application, but the existing vmf_insert_*()
functions don't currently do that so I think that would be a separate clean-up.

> Apart from that LGTM.
> 
> 
> -- 
> Cheers,
> 
> David / dhildenb
>

[PATCH v5 6/6] crash: Remove KEXEC_CORE_NOTE_NAME

2025-01-14 Thread Akihiko Odaki

KEXEC_CORE_NOTE_NAME is no longer used.

Signed-off-by: Akihiko Odaki 
Acked-by: Baoquan He 
Reviewed-by: Dave Martin 
---
 include/linux/kexec.h   | 2 --
 include/linux/vmcore_info.h | 1 -
 2 files changed, 3 deletions(-)

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index f0e9f8eda7a3..c840431eadda 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -68,8 +68,6 @@ extern note_buf_t __percpu *crash_notes;
 #define KEXEC_CRASH_MEM_ALIGN PAGE_SIZE
 #endif
 
-#define KEXEC_CORE_NOTE_NAME   CRASH_CORE_NOTE_NAME
-
 /*
  * This structure is used to hold the arguments that are used when loading
  * kernel binaries.
diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h
index 1672801fd98c..37e003ae5262 100644
--- a/include/linux/vmcore_info.h
+++ b/include/linux/vmcore_info.h
@@ -6,7 +6,6 @@
 #include 
 #include 
 
-#define CRASH_CORE_NOTE_NAME  "CORE"
 #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
 #define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4)
 #define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4)

-- 
2.47.1

Re: [PATCH v6 12/26] mm/memory: Enhance insert_page_into_pte_locked() to create writable mappings

2025-01-14 Thread Alistair Popple

On Mon, Jan 13, 2025 at 05:08:31PM -0800, Dan Williams wrote:
> Alistair Popple wrote:
> > In preparation for using insert_page() for DAX, enhance
> > insert_page_into_pte_locked() to handle establishing writable
> > mappings.  Recall that DAX returns VM_FAULT_NOPAGE after installing a
> > PTE which bypasses the typical set_pte_range() in finish_fault.
> > 
> > Signed-off-by: Alistair Popple 
> > Suggested-by: Dan Williams 
> > 
> > ---
> > 
> > Changes for v5:
> >  - Minor comment/formatting fixes suggested by David Hildenbrand
> > 
> > Changes since v2:
> > 
> >  - New patch split out from "mm/memory: Add dax_insert_pfn"
> > ---
> >  mm/memory.c | 37 +
> >  1 file changed, 29 insertions(+), 8 deletions(-)
> > 
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 06bb29e..8531acb 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -2126,19 +2126,40 @@ static int validate_page_before_insert(struct 
> > vm_area_struct *vma,
> >  }
> >  
> >  static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t 
> > *pte,
> > -   unsigned long addr, struct page *page, pgprot_t prot)
> > +   unsigned long addr, struct page *page,
> > +   pgprot_t prot, bool mkwrite)
> >  {
> > struct folio *folio = page_folio(page);
> > +   pte_t entry = ptep_get(pte);
> > pte_t pteval;
> >  
> > -   if (!pte_none(ptep_get(pte)))
> > -   return -EBUSY;
> > +   if (!pte_none(entry)) {
> > +   if (!mkwrite)
> > +   return -EBUSY;
> > +
> > +   /* see insert_pfn(). */
> > +   if (pte_pfn(entry) != page_to_pfn(page)) {
> > +   WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
> > +   return -EFAULT;
> > +   }
> > +   entry = maybe_mkwrite(entry, vma);
> > +   entry = pte_mkyoung(entry);
> > +   if (ptep_set_access_flags(vma, addr, pte, entry, 1))
> > +   update_mmu_cache(vma, addr, pte);
> > +   return 0;
> > +   }
> 
> This hunk feels like it is begging to be unified with insert_pfn() after
> pfn_t dies. Perhaps a TODO to remember to come back and unify them, or
> you can go append that work to your pfn_t removal series?

No one has complained about removing pfn_t so I do intend to clean that series
up once this has all been merged somewhere, so I will just go append this
work there.

> Other than that you can add:
> 
> Reviewed-by: Dan Williams

Re: [PATCH v6 08/26] fs/dax: Remove PAGE_MAPPING_DAX_SHARED mapping flag

2025-01-14 Thread Dan Williams

Alistair Popple wrote:
[..]
> > How does this case happen? I don't think any page would ever enter with
> > both ->mapping and ->share set, right?
> 
> Sigh. You're right - it can't. This patch series is getting a litte bit large
> and unweildy with all the prerequisite bugfixes and cleanups. Obviously I 
> fixed
> this when developing the main fs dax count fixup but forgot to rebase the fix
> further back in the series.

I assumed as much when I got to that patch.

> Anyway I have fixed that now, thanks.

You deserve a large helping of grace for waking and then slaying this
old dragon.

[PATCH v5 1/6] elf: Define note name macros

2025-01-14 Thread Akihiko Odaki

elf.h had a comment saying:
> Notes used in ET_CORE. Architectures export some of the arch register
> sets using the corresponding note types via the PTRACE_GETREGSET and
> PTRACE_SETREGSET requests.
> The note name for these types is "LINUX", except NT_PRFPREG that is
> named "CORE".

However, NT_PRSTATUS is also named "CORE". It is also unclear what
"these types" refers to.

To fix these problems, define a name for each note type. The added
definitions are macros so the kernel and userspace can directly refer to
them to remove their duplicate definitions of note names.

Signed-off-by: Akihiko Odaki 
Acked-by: Baoquan He 
---
 include/uapi/linux/elf.h | 89 +---
 1 file changed, 84 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index b44069d29cec..592507aa9b3a 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -368,101 +368,180 @@ typedef struct elf64_shdr {
 #define ELF_OSABI ELFOSABI_NONE
 #endif
 
+/* Note definitions: NN_ defines names. NT_ defines types. */
+
+#define NN_GNU_PROPERTY_TYPE_0 "GNU"
+#define NT_GNU_PROPERTY_TYPE_0 5
+
 /*
  * Notes used in ET_CORE. Architectures export some of the arch register sets
  * using the corresponding note types via the PTRACE_GETREGSET and
  * PTRACE_SETREGSET requests.
- * The note name for these types is "LINUX", except NT_PRFPREG that is named
- * "CORE".
  */
+#define NN_PRSTATUS"CORE"
 #define NT_PRSTATUS1
+#define NN_PRFPREG "CORE"
 #define NT_PRFPREG 2
+#define NN_PRPSINFO"CORE"
 #define NT_PRPSINFO3
+#define NN_TASKSTRUCT  "CORE"
 #define NT_TASKSTRUCT  4
+#define NN_AUXV"CORE"
 #define NT_AUXV6
 /*
  * Note to userspace developers: size of NT_SIGINFO note may increase
  * in the future to accomodate more fields, don't assume it is fixed!
  */
+#define NN_SIGINFO  "CORE"
 #define NT_SIGINFO  0x53494749
+#define NN_FILE "CORE"
 #define NT_FILE 0x46494c45
+#define NN_PRXFPREG "LINUX"
 #define NT_PRXFPREG 0x46e62b7f  /* copied from 
gdb5.1/include/elf/common.h */
+#define NN_PPC_VMX "LINUX"
 #define NT_PPC_VMX 0x100   /* PowerPC Altivec/VMX registers */
+#define NN_PPC_SPE "LINUX"
 #define NT_PPC_SPE 0x101   /* PowerPC SPE/EVR registers */
+#define NN_PPC_VSX "LINUX"
 #define NT_PPC_VSX 0x102   /* PowerPC VSX registers */
+#define NN_PPC_TAR "LINUX"
 #define NT_PPC_TAR 0x103   /* Target Address Register */
+#define NN_PPC_PPR "LINUX"
 #define NT_PPC_PPR 0x104   /* Program Priority Register */
+#define NN_PPC_DSCR"LINUX"
 #define NT_PPC_DSCR0x105   /* Data Stream Control Register */
+#define NN_PPC_EBB "LINUX"
 #define NT_PPC_EBB 0x106   /* Event Based Branch Registers */
+#define NN_PPC_PMU "LINUX"
 #define NT_PPC_PMU 0x107   /* Performance Monitor Registers */
+#define NN_PPC_TM_CGPR "LINUX"
 #define NT_PPC_TM_CGPR 0x108   /* TM checkpointed GPR Registers */
+#define NN_PPC_TM_CFPR "LINUX"
 #define NT_PPC_TM_CFPR 0x109   /* TM checkpointed FPR Registers */
+#define NN_PPC_TM_CVMX "LINUX"
 #define NT_PPC_TM_CVMX 0x10a   /* TM checkpointed VMX Registers */
+#define NN_PPC_TM_CVSX "LINUX"
 #define NT_PPC_TM_CVSX 0x10b   /* TM checkpointed VSX Registers */
+#define NN_PPC_TM_SPR  "LINUX"
 #define NT_PPC_TM_SPR  0x10c   /* TM Special Purpose Registers */
+#define NN_PPC_TM_CTAR "LINUX"
 #define NT_PPC_TM_CTAR 0x10d   /* TM checkpointed Target Address 
Register */
+#define NN_PPC_TM_CPPR "LINUX"
 #define NT_PPC_TM_CPPR 0x10e   /* TM checkpointed Program Priority 
Register */
+#define NN_PPC_TM_CDSCR"LINUX"
 #define NT_PPC_TM_CDSCR0x10f   /* TM checkpointed Data Stream 
Control Register */
+#define NN_PPC_PKEY"LINUX"
 #define NT_PPC_PKEY0x110   /* Memory Protection Keys registers */
+#define NN_PPC_DEXCR   "LINUX"
 #define NT_PPC_DEXCR   0x111   /* PowerPC DEXCR registers */
+#define NN_PPC_HASHKEYR"LINUX"
 #define NT_PPC_HASHKEYR0x112   /* PowerPC HASHKEYR register */
+#define NN_386_TLS "LINUX"
 #define NT_386_TLS 0x200   /* i386 TLS slots (struct user_desc) */
+#define NN_386_IOPERM  "LINUX"
 #define NT_386_IOPERM  0x201   /* x86 io permission bitmap (1=deny) */
+#define NN_X86_XSTATE  "LINUX"
 #define NT_X86_XSTATE  0x202   /* x86 extended state using xsave */
 /* Old binutils treats 0x203 as a CET state */
+#define NN_X86_SHSTK   "LINUX"
 #define NT_X86_SHSTK   0x204   /* x86 SHSTK state */
+#define NN_X86_XSAVE_LAYOUT"LINUX"
 #define NT_X86_XSAVE_LAYOUT0x205   /* XSAVE layout description */
+#define NN_S390_HIGH_GPRS  "LINUX"
 #define NT_S390_HIGH_GPRS  0x300   /* s390 upper register halves */
+#define NN_S390_TIMER  "LINUX"
 #define NT_S390_TIMER

[PATCH v5 0/6] elf: Define note name macros

2025-01-14 Thread Akihiko Odaki

elf.h had a comment saying:
> Notes used in ET_CORE. Architectures export some of the arch register
> sets using the corresponding note types via the PTRACE_GETREGSET and
> PTRACE_SETREGSET requests.
> The note name for these types is "LINUX", except NT_PRFPREG that is
> named "CORE".

However, NT_PRSTATUS is also named "CORE". It is also unclear what
"these types" refers to.

To fix these problems, define a name for each note type. The added
definitions are macros so the kernel and userspace can directly refer to
them.

For userspace program developers
---
While the main purpose of new macros is documentation, they are also
hoped to be useful for userspace programs. Please check patch
"elf: Define note name macros" and if you have a suggestion to make it
more convenient for you, please share.

I added the Binutils mailing list to the CC as it contains code to parse
dumps. I'm also planning to share this series on LLVM Discourse.

Signed-off-by: Akihiko Odaki 
---
Changes in v5:
- Noted that patch "elf: Define note name macros" allows removing
  duplicate definitions of note names.
- Reordered NT_GNU_PROPERTY_TYPE_0 to precede notes used in ET_CORE to
  show its not specific to ET_CORE.
- Link to v4: 
https://lore.kernel.org/r/20250111-elf-v4-0-b3841fa0d...@daynix.com

Changes in v4:
- s/powwerpc/powerpc/
- s/NT_INIT/nt_init/g s/NT_SIZE/nt_size/g
- Removed parentheses that have little value.
- Fixed the code alignment in get_cpu_elf_notes_size().
- Link to v3: 
https://lore.kernel.org/r/20250107-elf-v3-0-99cb505b1...@daynix.com

Changes in v3:
- Added patch "s390/crash: Use note name macros".
- Changed to interleave note name and type macros.
- Described NN_ and NT_ macros.
- Link to v2: 
https://lore.kernel.org/r/20250104-elf-v2-0-77dc2e06d...@daynix.com

Changes in v2:
- Added a macro definition for each note type instead of trying to
  describe in a comment.
- Link to v1: 
https://lore.kernel.org/r/20241225-elf-v1-1-79e940350...@daynix.com

---
Akihiko Odaki (6):
  elf: Define note name macros
  binfmt_elf: Use note name macros
  powerpc: Use note name macros
  crash: Use note name macros
  s390/crash: Use note name macros
  crash: Remove KEXEC_CORE_NOTE_NAME

 arch/powerpc/kernel/fadump.c   |  2 +-
 arch/powerpc/platforms/powernv/opal-core.c |  8 +--
 arch/s390/kernel/crash_dump.c  | 62 -
 fs/binfmt_elf.c| 21 ---
 fs/binfmt_elf_fdpic.c  |  8 +--
 fs/proc/kcore.c| 12 ++--
 include/linux/kexec.h  |  2 -
 include/linux/vmcore_info.h|  3 +-
 include/uapi/linux/elf.h   | 89 --
 kernel/crash_core.c|  2 +-
 10 files changed, 134 insertions(+), 75 deletions(-)
---
base-commit: a32e14f8aef69b42826cf0998b068a43d486a9e9
change-id: 20241210-elf-b80ea3949c39

Best regards,
-- 
Akihiko Odaki

[PATCH v5 2/6] binfmt_elf: Use note name macros

2025-01-14 Thread Akihiko Odaki

Use note name macros to match with the userspace's expectation.

Signed-off-by: Akihiko Odaki 
Acked-by: Baoquan He 
Reviewed-by: Dave Martin 
---
 fs/binfmt_elf.c   | 21 ++---
 fs/binfmt_elf_fdpic.c |  8 
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 106f0e8af177..5b4a92e5e508 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -762,8 +762,7 @@ static int parse_elf_property(const char *data, size_t 
*off, size_t datasz,
 }
 
 #define NOTE_DATA_SZ SZ_1K
-#define GNU_PROPERTY_TYPE_0_NAME "GNU"
-#define NOTE_NAME_SZ (sizeof(GNU_PROPERTY_TYPE_0_NAME))
+#define NOTE_NAME_SZ (sizeof(NN_GNU_PROPERTY_TYPE_0))
 
 static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr,
struct arch_elf_state *arch)
@@ -800,7 +799,7 @@ static int parse_elf_properties(struct file *f, const 
struct elf_phdr *phdr,
if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 ||
note.nhdr.n_namesz != NOTE_NAME_SZ ||
strncmp(note.data + sizeof(note.nhdr),
-   GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr)))
+   NN_GNU_PROPERTY_TYPE_0, n - sizeof(note.nhdr)))
return -ENOEXEC;
 
off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ,
@@ -1603,14 +1602,14 @@ static void fill_auxv_note(struct memelfnote *note, 
struct mm_struct *mm)
do
i += 2;
while (auxv[i - 2] != AT_NULL);
-   fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
+   fill_note(note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv);
 }
 
 static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t 
*csigdata,
const kernel_siginfo_t *siginfo)
 {
copy_siginfo_to_external(csigdata, siginfo);
-   fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
+   fill_note(note, NN_SIGINFO, NT_SIGINFO, sizeof(*csigdata), csigdata);
 }
 
 /*
@@ -1706,7 +1705,7 @@ static int fill_files_note(struct memelfnote *note, 
struct coredump_params *cprm
}
 
size = name_curpos - (char *)data;
-   fill_note(note, "CORE", NT_FILE, size, data);
+   fill_note(note, NN_FILE, NT_FILE, size, data);
return 0;
 }
 
@@ -1767,7 +1766,7 @@ static int fill_thread_core_info(struct 
elf_thread_core_info *t,
regset_get(t->task, &view->regsets[0],
   sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
 
-   fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
+   fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS,
  PRSTATUS_SIZE, &t->prstatus);
info->size += notesize(&t->notes[0]);
 
@@ -1801,7 +1800,7 @@ static int fill_thread_core_info(struct 
elf_thread_core_info *t,
if (is_fpreg)
SET_PR_FPVALID(&t->prstatus);
 
-   fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX",
+   fill_note(&t->notes[note_iter], is_fpreg ? NN_PRFPREG : "LINUX",
  note_type, ret, data);
 
info->size += notesize(&t->notes[note_iter]);
@@ -1821,7 +1820,7 @@ static int fill_thread_core_info(struct 
elf_thread_core_info *t,
fill_prstatus(&t->prstatus.common, p, signr);
elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
 
-   fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+   fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus),
  &(t->prstatus));
info->size += notesize(&t->notes[0]);
 
@@ -1832,7 +1831,7 @@ static int fill_thread_core_info(struct 
elf_thread_core_info *t,
}
 
t->prstatus.pr_fpvalid = 1;
-   fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
+   fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(*fpu), fpu);
info->size += notesize(&t->notes[1]);
 
return 1;
@@ -1852,7 +1851,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
if (!psinfo)
return 0;
-   fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+   fill_note(&info->psinfo, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), 
psinfo);
 
 #ifdef CORE_DUMP_USE_REGSET
view = task_user_regset_view(dump_task);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index f1a7c4875c4a..96bd9d2f23d7 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1397,7 +1397,7 @@ static struct elf_thread_status 
*elf_dump_thread_status(long signr, struct task_
regset_get(p, &view->regsets[0],
   sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
 
-   fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+   fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus),
  &t->prstatus);
t->num_notes++;
*sz += notesize(&t->notes[0]);
@@ -1415,7 +141

[PATCH v5 5/6] s390/crash: Use note name macros

2025-01-14 Thread Akihiko Odaki

Use note name macros to match with the userspace's expectation.

Signed-off-by: Akihiko Odaki 
Acked-by: Heiko Carstens 
Reviewed-by: Dave Martin 
---
 arch/s390/kernel/crash_dump.c | 62 ---
 1 file changed, 23 insertions(+), 39 deletions(-)

diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index cd0c93a8fb8b..022f4f198edf 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -248,15 +248,6 @@ bool is_kdump_kernel(void)
 }
 EXPORT_SYMBOL_GPL(is_kdump_kernel);
 
-static const char *nt_name(Elf64_Word type)
-{
-   const char *name = "LINUX";
-
-   if (type == NT_PRPSINFO || type == NT_PRSTATUS || type == NT_PRFPREG)
-   name = KEXEC_CORE_NOTE_NAME;
-   return name;
-}
-
 /*
  * Initialize ELF note
  */
@@ -281,10 +272,8 @@ static void *nt_init_name(void *buf, Elf64_Word type, void 
*desc, int d_len,
return PTR_ADD(buf, len);
 }
 
-static inline void *nt_init(void *buf, Elf64_Word type, void *desc, int d_len)
-{
-   return nt_init_name(buf, type, desc, d_len, nt_name(type));
-}
+#define nt_init(buf, type, desc) \
+   nt_init_name(buf, NT_ ## type, &(desc), sizeof(desc), NN_ ## type)
 
 /*
  * Calculate the size of ELF note
@@ -300,10 +289,7 @@ static size_t nt_size_name(int d_len, const char *name)
return size;
 }
 
-static inline size_t nt_size(Elf64_Word type, int d_len)
-{
-   return nt_size_name(d_len, nt_name(type));
-}
+#define nt_size(type, desc) nt_size_name(sizeof(desc), NN_ ## type)
 
 /*
  * Fill ELF notes for one CPU with save area registers
@@ -324,18 +310,16 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, 
struct save_area *sa)
memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc));
memcpy(&nt_fpregset.fprs, &sa->fprs, sizeof(sa->fprs));
/* Create ELF notes for the CPU */
-   ptr = nt_init(ptr, NT_PRSTATUS, &nt_prstatus, sizeof(nt_prstatus));
-   ptr = nt_init(ptr, NT_PRFPREG, &nt_fpregset, sizeof(nt_fpregset));
-   ptr = nt_init(ptr, NT_S390_TIMER, &sa->timer, sizeof(sa->timer));
-   ptr = nt_init(ptr, NT_S390_TODCMP, &sa->todcmp, sizeof(sa->todcmp));
-   ptr = nt_init(ptr, NT_S390_TODPREG, &sa->todpreg, sizeof(sa->todpreg));
-   ptr = nt_init(ptr, NT_S390_CTRS, &sa->ctrs, sizeof(sa->ctrs));
-   ptr = nt_init(ptr, NT_S390_PREFIX, &sa->prefix, sizeof(sa->prefix));
+   ptr = nt_init(ptr, PRSTATUS, nt_prstatus);
+   ptr = nt_init(ptr, PRFPREG, nt_fpregset);
+   ptr = nt_init(ptr, S390_TIMER, sa->timer);
+   ptr = nt_init(ptr, S390_TODCMP, sa->todcmp);
+   ptr = nt_init(ptr, S390_TODPREG, sa->todpreg);
+   ptr = nt_init(ptr, S390_CTRS, sa->ctrs);
+   ptr = nt_init(ptr, S390_PREFIX, sa->prefix);
if (cpu_has_vx()) {
-   ptr = nt_init(ptr, NT_S390_VXRS_HIGH,
- &sa->vxrs_high, sizeof(sa->vxrs_high));
-   ptr = nt_init(ptr, NT_S390_VXRS_LOW,
- &sa->vxrs_low, sizeof(sa->vxrs_low));
+   ptr = nt_init(ptr, S390_VXRS_HIGH, sa->vxrs_high);
+   ptr = nt_init(ptr, S390_VXRS_LOW, sa->vxrs_low);
}
return ptr;
 }
@@ -348,16 +332,16 @@ static size_t get_cpu_elf_notes_size(void)
struct save_area *sa = NULL;
size_t size;
 
-   size =  nt_size(NT_PRSTATUS, sizeof(struct elf_prstatus));
-   size +=  nt_size(NT_PRFPREG, sizeof(elf_fpregset_t));
-   size +=  nt_size(NT_S390_TIMER, sizeof(sa->timer));
-   size +=  nt_size(NT_S390_TODCMP, sizeof(sa->todcmp));
-   size +=  nt_size(NT_S390_TODPREG, sizeof(sa->todpreg));
-   size +=  nt_size(NT_S390_CTRS, sizeof(sa->ctrs));
-   size +=  nt_size(NT_S390_PREFIX, sizeof(sa->prefix));
+   size =  nt_size(PRSTATUS, struct elf_prstatus);
+   size += nt_size(PRFPREG, elf_fpregset_t);
+   size += nt_size(S390_TIMER, sa->timer);
+   size += nt_size(S390_TODCMP, sa->todcmp);
+   size += nt_size(S390_TODPREG, sa->todpreg);
+   size += nt_size(S390_CTRS, sa->ctrs);
+   size += nt_size(S390_PREFIX, sa->prefix);
if (cpu_has_vx()) {
-   size += nt_size(NT_S390_VXRS_HIGH, sizeof(sa->vxrs_high));
-   size += nt_size(NT_S390_VXRS_LOW, sizeof(sa->vxrs_low));
+   size += nt_size(S390_VXRS_HIGH, sa->vxrs_high);
+   size += nt_size(S390_VXRS_LOW, sa->vxrs_low);
}
 
return size;
@@ -373,7 +357,7 @@ static void *nt_prpsinfo(void *ptr)
memset(&prpsinfo, 0, sizeof(prpsinfo));
prpsinfo.pr_sname = 'R';
strcpy(prpsinfo.pr_fname, "vmlinux");
-   return nt_init(ptr, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo));
+   return nt_init(ptr, PRPSINFO, prpsinfo);
 }
 
 /*
@@ -589,7 +573,7 @@ static size_t get_elfcorehdr_size(int phdr_count)
/* PT_NOTES */
size += sizeof(Elf64_Phdr);
/* nt_prpsinfo */
-   size += nt_size(NT_PRPSINFO, sizeof(struct elf_prpsinfo));
+

[PATCH v5 4/6] crash: Use note name macros

2025-01-14 Thread Akihiko Odaki

Use note name macros to match with the userspace's expectation.

Signed-off-by: Akihiko Odaki 
Acked-by: Baoquan He 
Reviewed-by: Dave Martin 
---
 fs/proc/kcore.c | 12 ++--
 include/linux/vmcore_info.h |  2 +-
 kernel/crash_core.c |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e376f48c4b8b..e5612313b8b4 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -34,8 +34,6 @@
 #include 
 #include "internal.h"
 
-#define CORE_STR "CORE"
-
 #ifndef ELF_CORE_EFLAGS
 #define ELF_CORE_EFLAGS0
 #endif
@@ -119,7 +117,9 @@ static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, 
size_t *notes_len,
 
*phdrs_len = *nphdr * sizeof(struct elf_phdr);
*notes_len = (4 * sizeof(struct elf_note) +
- 3 * ALIGN(sizeof(CORE_STR), 4) +
+ ALIGN(sizeof(NN_PRSTATUS), 4) +
+ ALIGN(sizeof(NN_PRPSINFO), 4) +
+ ALIGN(sizeof(NN_TASKSTRUCT), 4) +
  VMCOREINFO_NOTE_NAME_BYTES +
  ALIGN(sizeof(struct elf_prstatus), 4) +
  ALIGN(sizeof(struct elf_prpsinfo), 4) +
@@ -444,11 +444,11 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct 
iov_iter *iter)
goto out;
}
 
-   append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus,
+   append_kcore_note(notes, &i, NN_PRSTATUS, NT_PRSTATUS, 
&prstatus,
  sizeof(prstatus));
-   append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo,
+   append_kcore_note(notes, &i, NN_PRPSINFO, NT_PRPSINFO, 
&prpsinfo,
  sizeof(prpsinfo));
-   append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current,
+   append_kcore_note(notes, &i, NN_TASKSTRUCT, NT_TASKSTRUCT, 
current,
  arch_task_struct_size);
/*
 * vmcoreinfo_size is mostly constant after init time, but it
diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h
index e1dec1a6a749..1672801fd98c 100644
--- a/include/linux/vmcore_info.h
+++ b/include/linux/vmcore_info.h
@@ -8,7 +8,7 @@
 
 #define CRASH_CORE_NOTE_NAME  "CORE"
 #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
-#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4)
+#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4)
 #define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4)
 
 /*
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 078fe5bc5a74..335b8425dd4b 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -436,7 +436,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
memset(&prstatus, 0, sizeof(prstatus));
prstatus.common.pr_pid = current->pid;
elf_core_copy_regs(&prstatus.pr_reg, regs);
-   buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+   buf = append_elf_note(buf, NN_PRSTATUS, NT_PRSTATUS,
  &prstatus, sizeof(prstatus));
final_note(buf);
 }

-- 
2.47.1

[PATCH v5 3/6] powerpc: Use note name macros

2025-01-14 Thread Akihiko Odaki

Use note name macros to match with the userspace's expectation.

Signed-off-by: Akihiko Odaki 
Acked-by: Baoquan He 
Reviewed-by: Dave Martin 
---
 arch/powerpc/kernel/fadump.c   | 2 +-
 arch/powerpc/platforms/powernv/opal-core.c | 8 
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 4b371c738213..d44349fe8e2b 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -751,7 +751,7 @@ u32 *__init fadump_regs_to_elf_notes(u32 *buf, struct 
pt_regs *regs)
 * prstatus.pr_pid = 
 */
elf_core_copy_regs(&prstatus.pr_reg, regs);
-   buf = append_elf_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
+   buf = append_elf_note(buf, NN_PRSTATUS, NT_PRSTATUS,
  &prstatus, sizeof(prstatus));
return buf;
 }
diff --git a/arch/powerpc/platforms/powernv/opal-core.c 
b/arch/powerpc/platforms/powernv/opal-core.c
index c9a9b759cc92..a379ff86c120 100644
--- a/arch/powerpc/platforms/powernv/opal-core.c
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -149,7 +149,7 @@ static Elf64_Word *__init auxv_to_elf64_notes(Elf64_Word 
*buf,
/* end of vector */
bufp[idx++] = cpu_to_be64(AT_NULL);
 
-   buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_AUXV,
+   buf = append_elf64_note(buf, NN_AUXV, NT_AUXV,
oc_conf->auxv_buf, AUXV_DESC_SZ);
return buf;
 }
@@ -252,7 +252,7 @@ static Elf64_Word * __init 
opalcore_append_cpu_notes(Elf64_Word *buf)
 * crashing CPU's prstatus.
 */
first_cpu_note = buf;
-   buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
+   buf = append_elf64_note(buf, NN_PRSTATUS, NT_PRSTATUS,
&prstatus, sizeof(prstatus));
 
for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) {
@@ -279,7 +279,7 @@ static Elf64_Word * __init 
opalcore_append_cpu_notes(Elf64_Word *buf)
fill_prstatus(&prstatus, thread_pir, ®s);
 
if (thread_pir != oc_conf->crashing_cpu) {
-   buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME,
+   buf = append_elf64_note(buf, NN_PRSTATUS,
NT_PRSTATUS, &prstatus,
sizeof(prstatus));
} else {
@@ -287,7 +287,7 @@ static Elf64_Word * __init 
opalcore_append_cpu_notes(Elf64_Word *buf)
 * Add crashing CPU as the first NT_PRSTATUS note for
 * GDB to process the core file appropriately.
 */
-   append_elf64_note(first_cpu_note, CRASH_CORE_NOTE_NAME,
+   append_elf64_note(first_cpu_note, NN_PRSTATUS,
  NT_PRSTATUS, &prstatus,
  sizeof(prstatus));
}

-- 
2.47.1

Re: [PATCH v13 5/5] rust: Use gendwarfksyms + extended modversions for CONFIG_MODVERSIONS

2025-01-14 Thread Sami Tolvanen

On Tue, Jan 14, 2025 at 10:22:15AM +0900, Masahiro Yamada wrote:
> On Tue, Jan 14, 2025 at 5:04 AM Sami Tolvanen  wrote:
> >
> > Hi Masahiro,
> >
> > On Fri, Jan 10, 2025 at 6:26 PM Masahiro Yamada  
> > wrote:
> > >
> > > On Sat, Jan 4, 2025 at 2:37 AM Matthew Maurer  wrote:
> > > >
> > > > From: Sami Tolvanen 
> > > >
> > > > Previously, two things stopped Rust from using MODVERSIONS:
> > > > 1. Rust symbols are occasionally too long to be represented in the
> > > >original versions table
> > > > 2. Rust types cannot be properly hashed by the existing genksyms
> > > >approach because:
> > > > * Looking up type definitions in Rust is more complex than C
> > > > * Type layout is potentially dependent on the compiler in Rust,
> > > >   not just the source type declaration.
> > > >
> > > > CONFIG_EXTENDED_MODVERSIONS addresses the first point, and
> > > > CONFIG_GENDWARFKSYMS the second. If Rust wants to use MODVERSIONS, allow
> > > > it to do so by selecting both features.
> > > >
> > > > Signed-off-by: Sami Tolvanen 
> > > > Co-developed-by: Matthew Maurer 
> > > > Signed-off-by: Matthew Maurer 
> > > > ---
> > > >  init/Kconfig  |  3 ++-
> > > >  rust/Makefile | 34 --
> > > >  2 files changed, 34 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/init/Kconfig b/init/Kconfig
> > > > index 
> > > > c1f9eb3d5f2e892e977ba1425599502dc830f552..b60acfd9431e0ac2bf401ecb6523b5104ad31150
> > > >  100644
> > > > --- a/init/Kconfig
> > > > +++ b/init/Kconfig
> > > > @@ -1959,7 +1959,8 @@ config RUST
> > > > bool "Rust support"
> > > > depends on HAVE_RUST
> > > > depends on RUST_IS_AVAILABLE
> > > > -   depends on !MODVERSIONS
> > > > +   select EXTENDED_MODVERSIONS if MODVERSIONS
> > > > +   depends on !MODVERSIONS || GENDWARFKSYMS
> > > > depends on !GCC_PLUGIN_RANDSTRUCT
> > > > depends on !RANDSTRUCT
> > > > depends on !DEBUG_INFO_BTF || PAHOLE_HAS_LANG_EXCLUDE
> > > > diff --git a/rust/Makefile b/rust/Makefile
> > > > index 
> > > > a40a3936126d603836e0ec9b42a1285916b60e45..80f970ad81f7989afe5ff2b5f633f50feb7f6006
> > > >  100644
> > > > --- a/rust/Makefile
> > > > +++ b/rust/Makefile
> > > > @@ -329,10 +329,11 @@ $(obj)/bindings/bindings_helpers_generated.rs: 
> > > > private bindgen_target_extra = ;
> > > >  $(obj)/bindings/bindings_helpers_generated.rs: 
> > > > $(src)/helpers/helpers.c FORCE
> > > > $(call if_changed_dep,bindgen)
> > > >
> > > > +rust_exports = $(NM) -p --defined-only $(1) | awk '$$2~/(T|R|D|B)/ && 
> > > > $$3!~/__cfi/ { printf $(2),$(3) }'
> > > > +
> > > >  quiet_cmd_exports = EXPORTS $@
> > > >cmd_exports = \
> > > > -   $(NM) -p --defined-only $< \
> > > > -   | awk '$$2~/(T|R|D|B)/ && $$3!~/__cfi/ {printf 
> > > > "EXPORT_SYMBOL_RUST_GPL(%s);\n",$$3}' > $@
> > > > +   $(call rust_exports,$<,"EXPORT_SYMBOL_RUST_GPL(%s);\n",$$3) > $@
> > >
> > > I noticed a nit:
> > >
> > > Both of the two callsites of rust_exports pass
> > > '$$3' to the last parameter instead of hardcoding it.
> > >
> > > Is it a flexibility for future extensions?
> > >
> > > I cannot think of any other use except for printing
> > > the third column, i.e. symbol name.
> >
> > Good catch, the last parameter isn't necessary anymore. It was used in
> > early versions of the series to also pass symbol addresses to
> > gendwarfksyms, but that's not needed since we read the symbol table
> > directly now.
> 
> If you submit a diff, I will squash it to 5/5.
> (You do not need to input commit description body)

Thanks, here's a diff that drops the last parameter.

Sami


diff --git a/rust/Makefile b/rust/Makefile
index 80f970ad81f7..ab300bfb46f6 100644
--- a/rust/Makefile
+++ b/rust/Makefile
@@ -329,11 +329,11 @@ $(obj)/bindings/bindings_helpers_generated.rs: private 
bindgen_target_extra = ;
 $(obj)/bindings/bindings_helpers_generated.rs: $(src)/helpers/helpers.c FORCE
$(call if_changed_dep,bindgen)
 
-rust_exports = $(NM) -p --defined-only $(1) | awk '$$2~/(T|R|D|B)/ && 
$$3!~/__cfi/ { printf $(2),$(3) }'
+rust_exports = $(NM) -p --defined-only $(1) | awk '$$2~/(T|R|D|B)/ && 
$$3!~/__cfi/ { printf $(2),$$3 }'
 
 quiet_cmd_exports = EXPORTS $@
   cmd_exports = \
-   $(call rust_exports,$<,"EXPORT_SYMBOL_RUST_GPL(%s);\n",$$3) > $@
+   $(call rust_exports,$<,"EXPORT_SYMBOL_RUST_GPL(%s);\n") > $@
 
 $(obj)/exports_core_generated.h: $(obj)/core.o FORCE
$(call if_changed,exports)
@@ -404,7 +404,7 @@ endif
 
 ifdef CONFIG_MODVERSIONS
 cmd_gendwarfksyms = $(if $(skip_gendwarfksyms),, \
-   $(call rust_exports,$@,"%s\n",$$3) | \
+   $(call rust_exports,$@,"%s\n") | \
scripts/gendwarfksyms/gendwarfksyms \
$(if $(KBUILD_GENDWARFKSYMS_STABLE), --stable) \
$(if $(KBUILD_SYMTYPES), --symtypes $(@:.o=.symtypes),) \

[PATCH v2] cpufreq: Use str_enable_disable-like helpers

2025-01-14 Thread Krzysztof Kozlowski

Replace ternary (condition ? "enable" : "disable") syntax with helpers
from string_choices.h because:
1. Simple function call with one argument is easier to read.  Ternary
   operator has three arguments and with wrapping might lead to quite
   long code.
2. Is slightly shorter thus also easier to read.
3. It brings uniformity in the text - same string.
4. Allows deduping by the linker, which results in a smaller binary
   file.

Signed-off-by: Krzysztof Kozlowski 

---

Changes in v2:
1. Fix enable->enabled
---
 drivers/cpufreq/cpufreq.c | 7 ---
 drivers/cpufreq/powernv-cpufreq.c | 3 ++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 418236fef172..1076e37a18ad 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -602,12 +603,12 @@ static ssize_t store_boost(struct kobject *kobj, struct 
kobj_attribute *attr,
 
if (cpufreq_boost_trigger_state(enable)) {
pr_err("%s: Cannot %s BOOST!\n",
-  __func__, enable ? "enable" : "disable");
+  __func__, str_enable_disable(enable));
return -EINVAL;
}
 
pr_debug("%s: cpufreq BOOST %s\n",
-__func__, enable ? "enabled" : "disabled");
+__func__, str_enabled_disabled(enable));
 
return count;
 }
@@ -2812,7 +2813,7 @@ int cpufreq_boost_trigger_state(int state)
write_unlock_irqrestore(&cpufreq_driver_lock, flags);
 
pr_err("%s: Cannot %s BOOST\n",
-  __func__, state ? "enable" : "disable");
+  __func__, str_enable_disable(state));
 
return ret;
 }
diff --git a/drivers/cpufreq/powernv-cpufreq.c 
b/drivers/cpufreq/powernv-cpufreq.c
index 8de759247771..ae79d909943b 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -281,7 +282,7 @@ static int init_powernv_pstates(void)
pr_info("cpufreq pstate min 0x%x nominal 0x%x max 0x%x\n", pstate_min,
pstate_nominal, pstate_max);
pr_info("Workload Optimized Frequency is %s in the platform\n",
-   (powernv_pstate_info.wof_enabled) ? "enabled" : "disabled");
+   str_enabled_disabled(powernv_pstate_info.wof_enabled));
 
pstate_ids = of_get_property(power_mgt, "ibm,pstate-ids", &len_ids);
if (!pstate_ids) {
-- 
2.43.0

RE: [PATCH 2/2] PCI: dwc: layerscape: Use syscon_regmap_lookup_by_phandle_args

2025-01-14 Thread Roy Zang

> From: Krzysztof Kozlowski 
 
> Use syscon_regmap_lookup_by_phandle_args() which is a wrapper over
> syscon_regmap_lookup_by_phandle() combined with getting the syscon
> argument.  Except simpler code this annotates within one line that given
> phandle has arguments, so grepping for code would be easier.
> 
> Signed-off-by: Krzysztof Kozlowski 
> ---
>  drivers/pci/controller/dwc/pci-layerscape.c | 10 --
>  1 file changed, 4 insertions(+), 6 deletions(-)


Acked-by: Roy Zang 

Roy


smime.p7s
Description: S/MIME cryptographic signature

Re: [PATCH 06/14] cpumask: re-introduce cpumask_next{,_and}_wrap()

2025-01-14 Thread Yury Norov

On Tue, Jan 07, 2025 at 02:28:31PM +0100, Alexander Gordeev wrote:
> On Sat, Dec 28, 2024 at 10:49:38AM -0800, Yury Norov wrote:
> 
> Hi Yury,
> 
> > cpumask_next_wrap_old() has two additional parameters, comparing to it's
> > analogue in linux/find.h find_next_bit_wrap(). The reason for that is
> > historical.
> > 
> > Before 4fe49b3b97c262 ("lib/bitmap: introduce for_each_set_bit_wrap()
> > macro"), cpumask_next_wrap() was used to implement for_each_cpu_wrap()
> > iterator. Now that the iterator is an alias to generic
> > for_each_set_bit_wrap(), the additional parameters aren't used and may
> > confuse readers.
> > 
> > All existing users call cpumask_next_wrap() in a way that makes it
> > possible to turn it to straight and simple alias to find_next_bit_wrap().
> > 
> > In a couple places kernel users opencode missing cpumask_next_and_wrap().
> > Add it as well.
> > 
> > Signed-off-by: Yury Norov 
> > ---
> >  include/linux/cpumask.h | 37 +
> >  1 file changed, 37 insertions(+)
> > 
> > diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
> > index b267a4f6a917..18c9908d50c4 100644
> > --- a/include/linux/cpumask.h
> > +++ b/include/linux/cpumask.h
> > @@ -284,6 +284,43 @@ unsigned int cpumask_next_and(int n, const struct 
> > cpumask *src1p,
> > small_cpumask_bits, n + 1);
> >  }
> >  
> > +/**
> > + * cpumask_next_and_wrap - get the next cpu in *src1p & *src2p, starting 
> > from
> > + *@n and wrapping around, if needed
> > + * @n: the cpu prior to the place to search (i.e. return will be > @n)
> > + * @src1p: the first cpumask pointer
> > + * @src2p: the second cpumask pointer
> > + *
> > + * Return: >= nr_cpu_ids if no further cpus set in both.
> > + */
> > +static __always_inline
> > +unsigned int cpumask_next_and_wrap(int n, const struct cpumask *src1p,
> > + const struct cpumask *src2p)
> > +{
> > +   /* -1 is a legal arg here. */
> > +   if (n != -1)
> > +   cpumask_check(n);
> > +   return find_next_and_bit_wrap(cpumask_bits(src1p), cpumask_bits(src2p),
> > +   small_cpumask_bits, n + 1);
> > +}
> > +
> > +/*
> > + * cpumask_next_wrap - get the next cpu in *src, starting from
> > + *@n and wrapping around, if needed
> 
> Does it mean the search wraps a cpumask and starts from the beginning
> if the bit is not found and returns >= nr_cpu_ids if @n crosses itself?
> 
> > + * @n: the cpu prior to the place to search
> > + * @src: cpumask pointer
> > + *
> > + * Return: >= nr_cpu_ids if no further cpus set in both.
> 
> It looks like Return is a cpumask_next_and_wrap() comment leftover.
> 
> > + */
> > +static __always_inline
> > +unsigned int cpumask_next_wrap(int n, const struct cpumask *src)
> > +{
> > +   /* -1 is a legal arg here. */
> > +   if (n != -1)
> > +   cpumask_check(n);
> > +   return find_next_bit_wrap(cpumask_bits(src), small_cpumask_bits, n + 1);
> > +}
> > +
> >  /**
> >   * for_each_cpu - iterate over every cpu in a mask
> >   * @cpu: the (optionally unsigned) integer iterator
> 
> Thanks!

Thanks, I'll update the comments.

Re: [PATCH 06/14] cpumask: re-introduce cpumask_next{,_and}_wrap()

2025-01-14 Thread Yury Norov

On Fri, Jan 03, 2025 at 11:44:32AM -0600, Bjorn Helgaas wrote:
> On Sat, Dec 28, 2024 at 10:49:38AM -0800, Yury Norov wrote:
> > cpumask_next_wrap_old() has two additional parameters, comparing to it's
> > analogue in linux/find.h find_next_bit_wrap(). The reason for that is
> > historical.
> 
> s/it's/its/
> 
> Personally I think cscope/tags/git grep make "find_next_bit_wrap()"
> enough even without mentioning "linux/find.h".
> 
> > + * cpumask_next_and_wrap - get the next cpu in *src1p & *src2p, starting 
> > from
> > + *@n and wrapping around, if needed
> > + * @n: the cpu prior to the place to search (i.e. return will be > @n)
> 
> Is the return really > @n if it wraps?

No, this is a copy-paste error. Will fix in v2.

Thanks,
Yury

Re: [PATCH v6 16/26] huge_memory: Add vmf_insert_folio_pmd()

2025-01-14 Thread Alistair Popple

On Tue, Jan 14, 2025 at 09:22:00AM -0800, Dan Williams wrote:
> David Hildenbrand wrote:
> > > +vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio 
> > > *folio, bool write)
> > > +{
> > > + struct vm_area_struct *vma = vmf->vma;
> > > + unsigned long addr = vmf->address & PMD_MASK;
> > > + struct mm_struct *mm = vma->vm_mm;
> > > + spinlock_t *ptl;
> > > + pgtable_t pgtable = NULL;
> > > +
> > > + if (addr < vma->vm_start || addr >= vma->vm_end)
> > > + return VM_FAULT_SIGBUS;
> > > +
> > > + if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
> > > + return VM_FAULT_SIGBUS;
> > > +
> > > + if (arch_needs_pgtable_deposit()) {
> > > + pgtable = pte_alloc_one(vma->vm_mm);
> > > + if (!pgtable)
> > > + return VM_FAULT_OOM;
> > > + }
> > 
> > This is interesting and nasty at the same time (only to make ppc64 boo3s 
> > with has tables happy). But it seems to be the right thing to do.
> > 
> > > +
> > > + ptl = pmd_lock(mm, vmf->pmd);
> > > + if (pmd_none(*vmf->pmd)) {
> > > + folio_get(folio);
> > > + folio_add_file_rmap_pmd(folio, &folio->page, vma);
> > > + add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
> > > + }
> > > + insert_pfn_pmd(vma, addr, vmf->pmd, pfn_to_pfn_t(folio_pfn(folio)),
> > > +vma->vm_page_prot, write, pgtable);
> > > + spin_unlock(ptl);
> > > + if (pgtable)
> > > + pte_free(mm, pgtable);
> > 
> > Ehm, are you unconditionally freeing the pgtable, even if consumed by 
> > insert_pfn_pmd() ?
> > 
> > Note that setting pgtable to NULL in insert_pfn_pmd() when consumed will 
> > not be visible here.
> > 
> > You'd have to pass a pointer to the ... pointer (&pgtable).
> > 
> > ... unless I am missing something, staring at the diff.
> 
> In fact I glazed over the fact that this has been commented on before
> and assumed it was fixed:
> 
> http://lore.kernel.org/66f61ce4da80_964f229...@dwillia2-xfh.jf.intel.com.notmuch
> 
> So, yes, insert_pfn_pmd needs to take &pgtable to report back if the
> allocation got consumed.
> 
> Good catch.

Yes, thanks Dave and Dan and apologies for missing that originally. Looking
at the thread I suspect I went down the rabbit hole of trying to implement
vmf_insert_folio() and when that wasn't possible forgot to come back and fix
this up. I have added a return code to insert_pfn_pmd() to indicate whether
or not the pgtable was consumed. I have also added a comment in the commit log
explaining why a vmf_insert_folio() isn't useful.

 - Alistair

Re: [PATCH v2] cpufreq: Use str_enable_disable-like helpers

2025-01-14 Thread Viresh Kumar

On 14-01-25, 20:06, Krzysztof Kozlowski wrote:
> Replace ternary (condition ? "enable" : "disable") syntax with helpers
> from string_choices.h because:
> 1. Simple function call with one argument is easier to read.  Ternary
>operator has three arguments and with wrapping might lead to quite
>long code.
> 2. Is slightly shorter thus also easier to read.
> 3. It brings uniformity in the text - same string.
> 4. Allows deduping by the linker, which results in a smaller binary
>file.
> 
> Signed-off-by: Krzysztof Kozlowski 

Acked-by: Viresh Kumar 

-- 
viresh

Re: [PATCH v6 23/26] mm: Remove pXX_devmap callers

2025-01-14 Thread Alistair Popple

On Tue, Jan 14, 2025 at 10:50:49AM -0800, Dan Williams wrote:
> Alistair Popple wrote:
> > The devmap PTE special bit was used to detect mappings of FS DAX
> > pages. This tracking was required to ensure the generic mm did not
> > manipulate the page reference counts as FS DAX implemented it's own
> > reference counting scheme.
> > 
> > Now that FS DAX pages have their references counted the same way as
> > normal pages this tracking is no longer needed and can be
> > removed.
> > 
> > Almost all existing uses of pmd_devmap() are paired with a check of
> > pmd_trans_huge(). As pmd_trans_huge() now returns true for FS DAX pages
> > dropping the check in these cases doesn't change anything.
> > 
> > However care needs to be taken because pmd_trans_huge() also checks that
> > a page is not an FS DAX page. This is dealt with either by checking
> > !vma_is_dax() or relying on the fact that the page pointer was obtained
> > from a page list. This is possible because zone device pages cannot
> > appear in any page list due to sharing page->lru with page->pgmap.
> 
> While the patch looks straightforward I think part of taking "care" in
> this case is to split it such that any of those careful conversions have
> their own bisect point in the history.
> 
> Perhaps this can move to follow-on series to not blow up the patch count
> of the base series? ...but first want to get your reaction to splitting
> for bisect purposes.

TBH I don't feel too strongly about it - I suppose it would make it easier to
bisect to the specific case we weren't careful enough about. However I think if
a bug is bisected to this particular patch it would be relatively easy based on
the context of the bug to narrow it down to a particular file or two.

I do however feel strongly about whether or not that should be done in a
follow-on series :-)

Rebasing such a large series has already become painful and error prone enough
so if we want to split this change up it will definitely need to be a separate
series done once the rest of this has been merged. So I could be pursaded to
roll this and the pfn_t removal (as that depends on devmap going away) together.

Let me know what you think.

Re: [PATCH v6 23/26] mm: Remove pXX_devmap callers

2025-01-14 Thread Dan Williams

Alistair Popple wrote:
> The devmap PTE special bit was used to detect mappings of FS DAX
> pages. This tracking was required to ensure the generic mm did not
> manipulate the page reference counts as FS DAX implemented it's own
> reference counting scheme.
> 
> Now that FS DAX pages have their references counted the same way as
> normal pages this tracking is no longer needed and can be
> removed.
> 
> Almost all existing uses of pmd_devmap() are paired with a check of
> pmd_trans_huge(). As pmd_trans_huge() now returns true for FS DAX pages
> dropping the check in these cases doesn't change anything.
> 
> However care needs to be taken because pmd_trans_huge() also checks that
> a page is not an FS DAX page. This is dealt with either by checking
> !vma_is_dax() or relying on the fact that the page pointer was obtained
> from a page list. This is possible because zone device pages cannot
> appear in any page list due to sharing page->lru with page->pgmap.

While the patch looks straightforward I think part of taking "care" in
this case is to split it such that any of those careful conversions have
their own bisect point in the history.

Perhaps this can move to follow-on series to not blow up the patch count
of the base series? ...but first want to get your reaction to splitting
for bisect purposes.

Re: [kvm-unit-tests PATCH v1 2/5] configure: Display the default processor for arm and arm64

2025-01-14 Thread Andrew Jones

On Tue, Jan 14, 2025 at 05:17:28PM +, Alexandru Elisei wrote:
...
> > > +# $arch will have changed when cross-compiling.
> > > +[ -z "$processor" ] && processor=$(get_default_processor $arch)
> > 
> > The fact that $arch and $processor are wrong until they've had a chance to
> 
> $processor is never wrong. $processor is unset until either the user sets it
> with --processor, or until this line. This patch introduces $default_processor
> only for the purpose of having an accurate help text, it doesn't change when 
> and
> how $processor is assigned.

I should have said "The fact that $arch and $default_processor are wrong..."

> 
> > be converted might be another reason for the $do_help idea. But it'll
> > always be fragile since another change that does some sort of conversion
> > could end up getting added after the '[ $do_help ] && usage' someday.
> 
> configure needs to distinguish between:
> 
> 1. The user not having specified --processor when doing ./configure.
> 2. The user having set --processor.
> 
> If 1, then kvm-unit-tests can use the default $processor value for $arch,
> which could have also been specified by the user.
> 
> If 2, then kvm-unit-tests should not touch $processor because that's what the
> user wants.
> 
> Do you see something wrong with that reasoning?

If we output $default_processor in usage() before it's had a chance to be
set correctly based on a given cross arch, then it won't display the
correct name.

> 
> Also, I don't understand why you say it's fragile, since configure doesn't

I wrote "it'll always be fragile" where 'it' refers to the most recent
object of my paragraph ("the $do_help idea"). But, TBH, I'm not sure
how important it is to get the help text accurate, so we can just not
care if we call usage() with the wrong strings sometimes.

Thanks,
drew

> touch $processor until this point (and unless the user sets it, of course).
> 
> Thanks,
> Alex
> 
> -- 
> kvm-riscv mailing list
> kvm-ri...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kvm-riscv

Re: [PATCH v9 3/8] PCI: Add defines for TLP Header/Prefix log sizes

2025-01-14 Thread Jonathan Cameron

On Tue, 14 Jan 2025 19:08:35 +0200
Ilpo Järvinen  wrote:

> Add defines for AER and DPC capabilities TLP Header Logging register
> sizes (PCIe r6.2, sec 7.8.4 / 7.9.14) and replace literals with them.
> 
> Suggested-by: Yazen Ghannam 
> Signed-off-by: Ilpo Järvinen 

Hi Ilpo,

Where it is simply the number of entries I think the changes make sense.

Where it is about whether the log reaches a defined register I'd be more
tempted to use the register addresses to figure it out + length for the last
one.  That avoids a rather magic +- 1 in various places.

Jonathan


> ---
>  drivers/pci/pcie/dpc.c | 10 ++
>  drivers/pci/pcie/tlp.c |  2 +-
>  drivers/pci/quirks.c   |  6 --
>  include/linux/aer.h|  9 -
>  4 files changed, 19 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
> index 2b6ef7efa3c1..0674d8c89bfa 100644
> --- a/drivers/pci/pcie/dpc.c
> +++ b/drivers/pci/pcie/dpc.c
> @@ -215,18 +215,18 @@ static void dpc_process_rp_pio_error(struct pci_dev 
> *pdev)
>   first_error == i ? " (First)" : "");
>   }
>  
> - if (pdev->dpc_rp_log_size < 4)
> + if (pdev->dpc_rp_log_size < PCIE_STD_NUM_TLP_HEADERLOG)

This I'm fine with.

>   goto clear_status;
>   pcie_read_tlp_log(pdev, cap + PCI_EXP_DPC_RP_PIO_HEADER_LOG, &tlp_log);
>   pci_err(pdev, "TLP Header: %#010x %#010x %#010x %#010x\n",
>   tlp_log.dw[0], tlp_log.dw[1], tlp_log.dw[2], tlp_log.dw[3]);
>  
> - if (pdev->dpc_rp_log_size < 5)
> + if (pdev->dpc_rp_log_size < PCIE_STD_NUM_TLP_HEADERLOG + 1)

As below. Can we build this from difference in register instead so something 
like

if (pdev->dpc_rpc_log_size <=
(PCIE_EXP_DPC_RP_PIO_IMPSPEC_LOG - PCIE_EXP_DPC_RP_PIO_HEADER_LOG) 
/ 4)

>   goto clear_status;
>   pci_read_config_dword(pdev, cap + PCI_EXP_DPC_RP_PIO_IMPSPEC_LOG, &log);
>   pci_err(pdev, "RP PIO ImpSpec Log %#010x\n", log);
>  
> - for (i = 0; i < pdev->dpc_rp_log_size - 5; i++) {
> + for (i = 0; i < pdev->dpc_rp_log_size - PCIE_STD_NUM_TLP_HEADERLOG - 1; 
> i++) {

This is a bit ugly.  Can we instead build it based on difference in registers 
and
avoid the mysterious +-1?
(PCI_EXP_DPC_RP_PIO_TLPPREFIX_LOG - PCI_EXP_DPC_RPIO_HEADER_LOG) / 4 ?

>   pci_read_config_dword(pdev,
>   cap + PCI_EXP_DPC_RP_PIO_TLPPREFIX_LOG + i * 4, 
> &prefix);
>   pci_err(pdev, "TLP Prefix Header: dw%d, %#010x\n", i, prefix);
> @@ -404,7 +404,9 @@ void pci_dpc_init(struct pci_dev *pdev)
>   if (!pdev->dpc_rp_log_size) {
>   pdev->dpc_rp_log_size =
>   FIELD_GET(PCI_EXP_DPC_RP_PIO_LOG_SIZE, cap);
> - if (pdev->dpc_rp_log_size < 4 || pdev->dpc_rp_log_size > 9) {
> + if (pdev->dpc_rp_log_size < PCIE_STD_NUM_TLP_HEADERLOG ||
> + pdev->dpc_rp_log_size > PCIE_STD_NUM_TLP_HEADERLOG + 1 +
> + PCIE_STD_MAX_TLP_PREFIXLOG) {
Could also construct this one from register offset and the max size of the log.

>   pci_err(pdev, "RP PIO log size %u is invalid\n",
>   pdev->dpc_rp_log_size);
>   pdev->dpc_rp_log_size = 0;
> diff --git a/drivers/pci/pcie/tlp.c b/drivers/pci/pcie/tlp.c
> index 3f053cc62290..4cc76bd1867a 100644
> --- a/drivers/pci/pcie/tlp.c
> +++ b/drivers/pci/pcie/tlp.c
> @@ -28,7 +28,7 @@ int pcie_read_tlp_log(struct pci_dev *dev, int where,
>  
>   memset(tlp_log, 0, sizeof(*tlp_log));
>  
> - for (i = 0; i < 4; i++) {
> + for (i = 0; i < PCIE_STD_NUM_TLP_HEADERLOG; i++) {
>   ret = pci_read_config_dword(dev, where + i * 4,
>   &tlp_log->dw[i]);
>   if (ret)
> diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
> index 76f4df75b08a..84487615e1d1 100644
> --- a/drivers/pci/quirks.c
> +++ b/drivers/pci/quirks.c
> @@ -12,6 +12,7 @@
>   * file, where their drivers can use them.
>   */
>  
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -6233,8 +6234,9 @@ static void dpc_log_size(struct pci_dev *dev)
>   return;
>  
>   if (FIELD_GET(PCI_EXP_DPC_RP_PIO_LOG_SIZE, val) == 0) {
> - pci_info(dev, "Overriding RP PIO Log Size to 4\n");
> - dev->dpc_rp_log_size = 4;
> + pci_info(dev, "Overriding RP PIO Log Size to %d\n",
> +  PCIE_STD_NUM_TLP_HEADERLOG);
> + dev->dpc_rp_log_size = PCIE_STD_NUM_TLP_HEADERLOG;
>   }
>  }
>  DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x461f, dpc_log_size);
> diff --git a/include/linux/aer.h b/include/linux/aer.h
> index 190a0a2061cd..4ef6515c3205 100644
> --- a/include/linux/aer.h
> +++ b/include/linux/aer.h
> @@ -16,10 +16,17 @@
>  #define AER_CORRECTABLE  2
>  #define DPC_FATAL3
>  
> +/*
> + * AER and DPC c

Re: [PATCH] fs: introduce getfsxattrat and setfsxattrat syscalls

2025-01-14 Thread Darrick J. Wong

On Thu, Jan 09, 2025 at 06:45:40PM +0100, Andrey Albershteyn wrote:
> From: Andrey Albershteyn 
> 
> Introduce getfsxattrat and setfsxattrat syscalls to manipulate inode
> extended attributes/flags. The syscalls take parent directory FD and
> path to the child together with struct fsxattr.
> 
> This is an alternative to FS_IOC_FSSETXATTR ioctl with a difference
> that file don't need to be open. By having this we can manipulated
> inode extended attributes not only on normal files but also on
> special ones. This is not possible with FS_IOC_FSSETXATTR ioctl as
> opening special files returns VFS special inode instead of
> underlying filesystem one.
> 
> This patch adds two new syscalls which allows userspace to set
> extended inode attributes on special files by using parent directory
> to open FS inode.
> 
> Also, as vfs_fileattr_set() is now will be called on special files
> too, let's forbid any other attributes except projid and nextents
> (symlink can have an extent).
> 
> CC: linux-...@vger.kernel.org
> Signed-off-by: Andrey Albershteyn 
> ---
> 
> Notes:
> Previous discussion:
> 
> https://lore.kernel.org/linux-xfs/20240520164624.665269-2-aalbe...@redhat.com/
> 
> XFS has project quotas which could be attached to a directory. All
> new inodes in these directories inherit project ID set on parent
> directory.
> 
> The project is created from userspace by opening and calling
> FS_IOC_FSSETXATTR on each inode. This is not possible for special
> files such as FIFO, SOCK, BLK etc. Therefore, some inodes are left
> with empty project ID. Those inodes then are not shown in the quota
> accounting but still exist in the directory. Moreover, in the case
> when special files are created in the directory with already
> existing project quota, these inode inherit extended attributes.
> This than leaves them with these attributes without the possibility
> to clear them out. This, in turn, prevents userspace from
> re-creating quota project on these existing files.
> 
>  arch/alpha/kernel/syscalls/syscall.tbl  |   2 +
>  arch/m68k/kernel/syscalls/syscall.tbl   |   2 +
>  arch/microblaze/kernel/syscalls/syscall.tbl |   2 +
>  arch/parisc/kernel/syscalls/syscall.tbl |   2 +
>  arch/powerpc/kernel/syscalls/syscall.tbl|   2 +
>  arch/s390/kernel/syscalls/syscall.tbl   |   2 +
>  arch/sh/kernel/syscalls/syscall.tbl |   2 +
>  arch/sparc/kernel/syscalls/syscall.tbl  |   2 +
>  arch/x86/entry/syscalls/syscall_32.tbl  |   2 +
>  arch/x86/entry/syscalls/syscall_64.tbl  |   2 +
>  arch/xtensa/kernel/syscalls/syscall.tbl |   2 +
>  fs/inode.c  | 105 
>  fs/ioctl.c  |  17 +++-
>  include/linux/fileattr.h|   1 +
>  include/linux/syscalls.h|   4 +
>  include/uapi/asm-generic/unistd.h   |   8 +-
>  16 files changed, 154 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/alpha/kernel/syscalls/syscall.tbl 
> b/arch/alpha/kernel/syscalls/syscall.tbl
> index c59d53d6d3f3..4b9e687494c1 100644
> --- a/arch/alpha/kernel/syscalls/syscall.tbl
> +++ b/arch/alpha/kernel/syscalls/syscall.tbl
> @@ -506,3 +506,5 @@
>  574  common  getxattrat  sys_getxattrat
>  575  common  listxattrat sys_listxattrat
>  576  common  removexattrat   sys_removexattrat
> +577  common  getfsxattratsys_getfsxattrat
> +578  common  setfsxattratsys_setfsxattrat
> diff --git a/arch/m68k/kernel/syscalls/syscall.tbl 
> b/arch/m68k/kernel/syscalls/syscall.tbl
> index f5ed71f1910d..159476387f39 100644
> --- a/arch/m68k/kernel/syscalls/syscall.tbl
> +++ b/arch/m68k/kernel/syscalls/syscall.tbl
> @@ -466,3 +466,5 @@
>  464  common  getxattrat  sys_getxattrat
>  465  common  listxattrat sys_listxattrat
>  466  common  removexattrat   sys_removexattrat
> +467  common  getfsxattratsys_getfsxattrat
> +468  common  setfsxattratsys_setfsxattrat
> diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl 
> b/arch/microblaze/kernel/syscalls/syscall.tbl
> index 680f568b77f2..a6d59ee740b5 100644
> --- a/arch/microblaze/kernel/syscalls/syscall.tbl
> +++ b/arch/microblaze/kernel/syscalls/syscall.tbl
> @@ -472,3 +472,5 @@
>  464  common  getxattrat  sys_getxattrat
>  465  common  listxattrat sys_listxattrat
>  466  common  removexattrat   sys_removexattrat
> +467  common  getfsxattratsys_getfsxattrat
> +468  common  setfsxattratsys_setfsxattrat
> diff --git a/arch/parisc/kernel/syscalls/syscall.tbl 
> b/arch/parisc/kernel/syscalls/syscall.tbl
> index d9fc94c86965..b3578fac43d6 100644
> --- a/arch/parisc/kernel/syscalls/syscall.tbl
> +++ b/arch/parisc/kernel/sysca

Re: [PATCH v6 24/26] mm: Remove devmap related functions and page table bits

2025-01-14 Thread Dan Williams

Alistair Popple wrote:
> Now that DAX and all other reference counts to ZONE_DEVICE pages are
> managed normally there is no need for the special devmap PTE/PMD/PUD
> page table bits. So drop all references to these, freeing up a
> software defined page table bit on architectures supporting it.
> 
> Signed-off-by: Alistair Popple 
> Acked-by: Will Deacon  # arm64

Hooray! Looks good to me modulo breaking up the previous patch.

Reviewed-by: Dan Williams

Re: [PATCH v6 25/26] Revert "riscv: mm: Add support for ZONE_DEVICE"

2025-01-14 Thread Dan Williams

Alistair Popple wrote:
> DEVMAP PTEs are no longer required to support ZONE_DEVICE so remove
> them.
> 
> Signed-off-by: Alistair Popple 
> Suggested-by: Chunyan Zhang 
> Reviewed-by: Björn Töpel 

This and the next are candidates to squash into the previous remove
patch, right? ...and I am not sure a standalone "Revert" commit is
appropriate when other archs get an omnibus "Remove" cleanup with a
longer explanation in the changelog.

Patch looks good though and you can preserve my Reviewed-by on the
squash.

71 matches

Mail list logo