[PATCH v8 0/2] ppc: Enable 2nd DAWR support on Power10
Extends the existing watchpoint facility from TCG DAWR0 emulation to DAWR1 on Power10: the first patch enables it for powernv, and the second for pseries with both TCG and KVM.

---
Changelog:

v7: https://lore.kernel.org/qemu-devel/170063834599.621665.9541440879278084501.st...@ltcd48-lp2.aus.stglab.ibm.com/
v7->v8:
- Fixed the missed ppc_store_dawr* calls.
- Removed the macros and split the patch into two: the first enables the
  facility for powernv, and the next does the same for the pseries guest.
- Removing the macros added barely 12 lines compared to the previous
  version.

v6: https://lore.kernel.org/qemu-devel/168871963321.58984.15628382614621248470.stgit@ltcd89-lp2/
v6->v7:
- Sorry about the delay in sending out this version. I have dropped the
  Reviewed-bys as suggested and converted the patch back to RFC.
- Added the TCG support. Basically, converted the existing DAWR0 support
  routines into macros for reuse by DAWR1. Let me know if the macro
  conversions should be moved to a separate independent patch.
- As DAWR1 works on TCG, the checks in cap_dawr1_apply() now report a
  warning only for P9 or P9 compat modes, for both KVM and TCG use cases.
- 'make test' passes the caps checks. Also, as suggested by Greg Kurz,
  'make test' shows no failures after making DAWR1 default 'on' and
  updating the default CPU to Power10.

v5: https://lore.kernel.org/all/20210412114433.129702-1-ravi.bango...@linux.ibm.com/
v5->v6:
- The other patches in the original series are already merged.
- Rebased to the top of the tree, so gen_spr_book3s_310_dbg() is renamed
  to register_book3s_310_dbg_sprs() and moved to cpu_init.c accordingly.
- No functional changes.

v4: https://lore.kernel.org/r/20210406053833.282907-1-ravi.bango...@linux.ibm.com
v4->v5:
- Made the error message more descriptive.

v3: https://lore.kernel.org/r/20210330095350.36309-1-ravi.bango...@linux.ibm.com
v3->v4:
- spapr_dt_pa_features(): The POWER10 processor is compatible with 3.0
  (PCR_COMPAT_3_00). No need for ppc_check_compat(3_10) for now, as
  ppc_check_compat(3_00) will also be true. ppc_check_compat(3_10) can
  be added while introducing pa_features_310 in the future.
- Use error_append_hint() for hints. Also add ERRP_GUARD().
- Add a kvmppc_set_cap_dawr1() stub function for CONFIG_KVM=n.

v2: https://lore.kernel.org/r/20210329041906.213991-1-ravi.bango...@linux.ibm.com
v2->v3:
- Don't introduce pa_features_310[]; instead, reuse pa_features_300[]
  for 3.1 guests, as there is no difference between their initial values
  at the moment.
- Call gen_spr_book3s_310_dbg() from init_proc_POWER10() instead of
  init_proc_POWER8(). Also, don't call gen_spr_book3s_207_dbg() from
  gen_spr_book3s_310_dbg(), as init_proc_POWER10() already calls it.

v1: https://lore.kernel.org/r/20200723104220.314671-1-ravi.bango...@linux.ibm.com
v1->v2:
- Introduce the machine capability cap-dawr1 to enable/disable the
  feature. By default, the 2nd DAWR is OFF for guests even when the host
  KVM supports it. The user has to enable it manually with
  -machine cap-dawr1=on.
- Split the header file changes into a separate patch.
(Sync headers from v5.12-rc3)

Shivaprasad G Bhat (2):
      ppc: Enable 2nd DAWR support on Power10 PowerNV machine
      ppc: spapr: Enable 2nd DAWR on Power10 pSeries machine

 hw/ppc/spapr.c           |  7 -
 hw/ppc/spapr_caps.c      | 36
 hw/ppc/spapr_hcall.c     | 25 ++--
 include/hw/ppc/spapr.h   |  6 +++-
 target/ppc/cpu.c         | 45 -
 target/ppc/cpu.h         |  8 --
 target/ppc/cpu_init.c    | 15 ++
 target/ppc/excp_helper.c | 61 ++--
 target/ppc/helper.h      |  2 ++
 target/ppc/kvm.c         | 12
 target/ppc/kvm_ppc.h     | 12
 target/ppc/machine.c     |  3 +-
 target/ppc/misc_helper.c | 10 +++
 target/ppc/spr_common.h  |  2 ++
 target/ppc/translate.c   | 12

 15 files changed, 202 insertions(+), 54 deletions(-)
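For reference, enabling the capability follows the usual sPAPR cap syntax described in the changelog above; a minimal example invocation (everything besides cap-dawr1 is illustrative):

 qemu-system-ppc64 -machine pseries,cap-dawr1=on -cpu POWER10 ...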
[PATCH v8 2/2] ppc: spapr: Enable 2nd DAWR on Power10 pSeries machine
As per PAPR, bit 0 of byte 64 in the pa-features property indicates the availability of the 2nd DAWR registers. That is, if this bit is set, the 2nd DAWR is present, otherwise not. Use the KVM_CAP_PPC_DAWR1 capability to find out whether KVM supports the 2nd DAWR. If it is supported, allow the user to set the pa-features bit in the guest DT using the cap-dawr1 machine capability.

Signed-off-by: Ravi Bangoria
Signed-off-by: Shivaprasad G Bhat
---
 hw/ppc/spapr.c         |  7 ++-
 hw/ppc/spapr_caps.c    | 36
 hw/ppc/spapr_hcall.c   | 25 -
 include/hw/ppc/spapr.h |  6 +-
 target/ppc/kvm.c       | 12
 target/ppc/kvm_ppc.h   | 12
 6 files changed, 87 insertions(+), 11 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index e8dabc8614..91a97d72e7 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -262,7 +262,7 @@ static void spapr_dt_pa_features(SpaprMachineState *spapr,
         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
         /* 54: DecFP, 56: DecI, 58: SHA */
         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
-        /* 60: NM atomic, 62: RNG */
+        /* 60: NM atomic, 62: RNG, 64: DAWR1 (ISA 3.1) */
         0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
     };
     uint8_t *pa_features = NULL;
@@ -303,6 +303,9 @@ static void spapr_dt_pa_features(SpaprMachineState *spapr,
          * in pa-features. So hide it from them. */
         pa_features[40 + 2] &= ~0x80; /* Radix MMU */
     }
+    if (spapr_get_cap(spapr, SPAPR_CAP_DAWR1)) {
+        pa_features[66] |= 0x80;
+    }
 
     _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
 }
@@ -2138,6 +2141,7 @@ static const VMStateDescription vmstate_spapr = {
         &vmstate_spapr_cap_fwnmi,
         &vmstate_spapr_fwnmi,
         &vmstate_spapr_cap_rpt_invalidate,
+        &vmstate_spapr_cap_dawr1,
         NULL
     }
 };
@@ -4717,6 +4721,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
     smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON;
     smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON;
     smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF;
+    smc->default_caps.caps[SPAPR_CAP_DAWR1] = SPAPR_CAP_OFF;
 
     /*
      * This cap specifies whether the AIL 3 mode for
diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
index e889244e52..677f17cea6 100644
--- a/hw/ppc/spapr_caps.c
+++ b/hw/ppc/spapr_caps.c
@@ -655,6 +655,32 @@ static void cap_ail_mode_3_apply(SpaprMachineState *spapr,
     }
 }
 
+static void cap_dawr1_apply(SpaprMachineState *spapr, uint8_t val,
+                            Error **errp)
+{
+    ERRP_GUARD();
+
+    if (!val) {
+        return; /* Disabled by default */
+    }
+
+    if (!ppc_type_check_compat(MACHINE(spapr)->cpu_type,
+                               CPU_POWERPC_LOGICAL_3_10, 0,
+                               spapr->max_compat_pvr)) {
+        warn_report("DAWR1 supported only on POWER10 and later CPUs");
+    }
+
+    if (kvm_enabled()) {
+        if (!kvmppc_has_cap_dawr1()) {
+            error_setg(errp, "DAWR1 not supported by KVM.");
+            error_append_hint(errp, "Try appending -machine cap-dawr1=off");
+        } else if (kvmppc_set_cap_dawr1(val) < 0) {
+            error_setg(errp, "Error enabling cap-dawr1 with KVM.");
+            error_append_hint(errp, "Try appending -machine cap-dawr1=off");
+        }
+    }
+}
+
 SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
     [SPAPR_CAP_HTM] = {
         .name = "htm",
@@ -781,6 +807,15 @@ SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
         .type = "bool",
         .apply = cap_ail_mode_3_apply,
     },
+    [SPAPR_CAP_DAWR1] = {
+        .name = "dawr1",
+        .description = "Allow 2nd Data Address Watchpoint Register (DAWR1)",
+        .index = SPAPR_CAP_DAWR1,
+        .get = spapr_cap_get_bool,
+        .set = spapr_cap_set_bool,
+        .type = "bool",
+        .apply = cap_dawr1_apply,
+    },
 };
 
 static SpaprCapabilities default_caps_with_cpu(SpaprMachineState *spapr,
@@ -923,6 +958,7 @@ SPAPR_CAP_MIG_STATE(large_decr, SPAPR_CAP_LARGE_DECREMENTER);
 SPAPR_CAP_MIG_STATE(ccf_assist, SPAPR_CAP_CCF_ASSIST);
 SPAPR_CAP_MIG_STATE(fwnmi, SPAPR_CAP_FWNMI);
 SPAPR_CAP_MIG_STATE(rpt_invalidate, SPAPR_CAP_RPT_INVALIDATE);
+SPAPR_CAP_MIG_STATE(dawr1, SPAPR_CAP_DAWR1);
 
 void spapr_caps_init(SpaprMachineState *spapr)
 {
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index fcefd1d1c7..34c1c77c95 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -814,11 +814,12 @@ static target_ulong h_set_mode_resource_set_ciabr(PowerPCCPU *cpu,
     return H_SUCCESS;
 }
 
-static target_ulong h_set_mode_resource_set_dawr0(PowerPCC
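A note on the indexing in the spapr.c hunk above: the pa-features attribute vector is preceded by a two-byte descriptor, so PAPR's "byte N" lives at array index N + 2 — hence the existing `pa_features[40 + 2]` for byte 40 and `pa_features[66]` for byte 64 here — and PAPR's "bit 0" is the most significant bit. A tiny hedged helper restating that convention (the function name is hypothetical, not from the patch):

 /* PAPR "bit 0 of byte N": IBM bit 0 is the MSB, and the two leading
  * descriptor bytes shift byte N to index N + 2 in the array. */
 static inline void pa_features_set_bit0(uint8_t *pa_features, int byte)
 {
     pa_features[byte + 2] |= 0x80;
 }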
[PATCH v8 1/2] ppc: Enable 2nd DAWR support on Power10 PowerNV machine
Extend the existing watchpoint facility from TCG DAWR0 emulation to DAWR1 on POWER10.

Signed-off-by: Shivaprasad G Bhat
---
 target/ppc/cpu.c         | 45 --
 target/ppc/cpu.h         |  8 +-
 target/ppc/cpu_init.c    | 15 +++
 target/ppc/excp_helper.c | 61 ++
 target/ppc/helper.h      |  2 ++
 target/ppc/machine.c     |  3 ++
 target/ppc/misc_helper.c | 10
 target/ppc/spr_common.h  |  2 ++
 target/ppc/translate.c   | 12 +
 9 files changed, 115 insertions(+), 43 deletions(-)

diff --git a/target/ppc/cpu.c b/target/ppc/cpu.c
index e3ad8e0c27..d5ac9bb888 100644
--- a/target/ppc/cpu.c
+++ b/target/ppc/cpu.c
@@ -130,11 +130,13 @@ void ppc_store_ciabr(CPUPPCState *env, target_ulong val)
     ppc_update_ciabr(env);
 }
 
-void ppc_update_daw0(CPUPPCState *env)
+void ppc_update_daw(CPUPPCState *env, int rid)
 {
     CPUState *cs = env_cpu(env);
-    target_ulong deaw = env->spr[SPR_DAWR0] & PPC_BITMASK(0, 60);
-    uint32_t dawrx = env->spr[SPR_DAWRX0];
+    int spr_dawr = !rid ? SPR_DAWR0 : SPR_DAWR1;
+    int spr_dawrx = !rid ? SPR_DAWRX0 : SPR_DAWRX1;
+    target_ulong deaw = env->spr[spr_dawr] & PPC_BITMASK(0, 60);
+    uint32_t dawrx = env->spr[spr_dawrx];
     int mrd = extract32(dawrx, PPC_BIT_NR(48), 54 - 48);
     bool dw = extract32(dawrx, PPC_BIT_NR(57), 1);
     bool dr = extract32(dawrx, PPC_BIT_NR(58), 1);
@@ -144,9 +146,9 @@ void ppc_update_daw0(CPUPPCState *env)
     vaddr len;
     int flags;
 
-    if (env->dawr0_watchpoint) {
-        cpu_watchpoint_remove_by_ref(cs, env->dawr0_watchpoint);
-        env->dawr0_watchpoint = NULL;
+    if (env->dawr_watchpoint[rid]) {
+        cpu_watchpoint_remove_by_ref(cs, env->dawr_watchpoint[rid]);
+        env->dawr_watchpoint[rid] = NULL;
     }
 
     if (!dr && !dw) {
@@ -166,28 +168,45 @@ void ppc_update_daw0(CPUPPCState *env)
         flags |= BP_MEM_WRITE;
     }
 
-    cpu_watchpoint_insert(cs, deaw, len, flags, &env->dawr0_watchpoint);
+    cpu_watchpoint_insert(cs, deaw, len, flags, &env->dawr_watchpoint[rid]);
 }
 
 void ppc_store_dawr0(CPUPPCState *env, target_ulong val)
 {
     env->spr[SPR_DAWR0] = val;
-    ppc_update_daw0(env);
+    ppc_update_daw(env, 0);
 }
 
-void ppc_store_dawrx0(CPUPPCState *env, uint32_t val)
+static void ppc_store_dawrx(CPUPPCState *env, uint32_t val, int rid)
 {
     int hrammc = extract32(val, PPC_BIT_NR(56), 1);
 
     if (hrammc) {
         /* This might be done with a second watchpoint at the xor of DEAW[0] */
-        qemu_log_mask(LOG_UNIMP, "%s: DAWRX0[HRAMMC] is unimplemented\n",
-                      __func__);
+        qemu_log_mask(LOG_UNIMP, "%s: DAWRX%d[HRAMMC] is unimplemented\n",
+                      __func__, rid);
     }
 
-    env->spr[SPR_DAWRX0] = val;
-    ppc_update_daw0(env);
+    env->spr[!rid ? SPR_DAWRX0 : SPR_DAWRX1] = val;
+    ppc_update_daw(env, rid);
+}
+
+void ppc_store_dawrx0(CPUPPCState *env, uint32_t val)
+{
+    ppc_store_dawrx(env, val, 0);
+}
+
+void ppc_store_dawr1(CPUPPCState *env, target_ulong val)
+{
+    env->spr[SPR_DAWR1] = val;
+    ppc_update_daw(env, 1);
+}
+
+void ppc_store_dawrx1(CPUPPCState *env, uint32_t val)
+{
+    ppc_store_dawrx(env, val, 1);
 }
+
 #endif
 #endif
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index f8101ffa29..18dcc438ea 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1236,7 +1236,7 @@ struct CPUArchState {
 #if defined(TARGET_PPC64)
     ppc_slb_t slb[MAX_SLB_ENTRIES]; /* PowerPC 64 SLB area */
     struct CPUBreakpoint *ciabr_breakpoint;
-    struct CPUWatchpoint *dawr0_watchpoint;
+    struct CPUWatchpoint *dawr_watchpoint[2];
 #endif
     target_ulong sr[32];   /* segment registers */
     uint32_t nb_BATs;      /* number of BATs */
@@ -1549,9 +1549,11 @@ void ppc_store_sdr1(CPUPPCState *env, target_ulong value);
 void ppc_store_lpcr(PowerPCCPU *cpu, target_ulong val);
 void ppc_update_ciabr(CPUPPCState *env);
 void ppc_store_ciabr(CPUPPCState *env, target_ulong value);
-void ppc_update_daw0(CPUPPCState *env);
+void ppc_update_daw(CPUPPCState *env, int rid);
 void ppc_store_dawr0(CPUPPCState *env, target_ulong value);
 void ppc_store_dawrx0(CPUPPCState *env, uint32_t value);
+void ppc_store_dawr1(CPUPPCState *env, target_ulong value);
+void ppc_store_dawrx1(CPUPPCState *env, uint32_t value);
 #endif /* !defined(CONFIG_USER_ONLY) */
 void ppc_store_msr(CPUPPCState *env, target_ulong value);
 
@@ -1737,9 +1739,11 @@ void ppc_compat_add_property(Object *obj, const char *name,
 #define SPR_PSPB              (0x09F)
 #define SPR_DPDES             (0x0B0)
 #define SPR_DAWR0             (0x0B4)
+#define SPR_DAWR1             (0x0B5)
 #define SPR_RPR               (0x0BA)
 #define SPR_CIABR             (0x0BB)
 #define SPR_DAWRX0            (0x0BC)
+#define SPR_DAWRX1            (0x0BD)
 #define SPR_HFSCR
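As the cpu.c hunk above shows, the DAWRX DR/DW bits translate directly into QEMU watchpoint flags. A standalone restatement of just that mapping (the BP_* values here are illustrative stand-ins, not QEMU's actual constants):

 #include <stdbool.h>

 enum { BP_MEM_READ = 1, BP_MEM_WRITE = 2 };   /* illustrative values only */

 /* DR/DW -> watchpoint flags as in ppc_update_daw(); a result of 0 means
  * the watchpoint is simply removed rather than (re)inserted. */
 static int dawr_wp_flags(bool dr, bool dw)
 {
     int flags = 0;

     if (dr) {
         flags |= BP_MEM_READ;
     }
     if (dw) {
         flags |= BP_MEM_WRITE;
     }
     return flags;
 }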
Re: [RFC PATCH v7] ppc: Enable 2nd DAWR support on p10
Thanks for the review, Nick!

On 1/23/24 17:36, Nicholas Piggin wrote:
On Wed Nov 22, 2023 at 5:32 PM AEST, Shivaprasad G Bhat wrote:

Extend the existing watchpoint facility from TCG DAWR0 emulation to DAWR1 on POWER10.

As per PAPR, bit 0 of byte 64 in the pa-features property indicates the availability of the 2nd DAWR registers. That is, if this bit is set, the 2nd DAWR is present, otherwise not. Use the KVM_CAP_PPC_DAWR1 capability to find out whether KVM supports the 2nd DAWR. If it is supported, allow the user to set the pa-features bit in the guest DT using the cap-dawr1 machine capability.

I don't really like the macros. I have nightmares from Linux going overboard with defining functions using a spaghetti of generator macros. Could you just make most functions accept either the SPR number or the register number (0, 1), or simply use if/else, to select between them?

Splitting the change in 2 would be good: first add the regs + TCG, then the spapr bits.

Sure.

[snip]

diff --git a/target/ppc/misc_helper.c b/target/ppc/misc_helper.c
index a05bdf78c9..022b984e00 100644
--- a/target/ppc/misc_helper.c
+++ b/target/ppc/misc_helper.c
@@ -204,16 +204,24 @@ void helper_store_ciabr(CPUPPCState *env, target_ulong value)
     ppc_store_ciabr(env, value);
 }
 
-void helper_store_dawr0(CPUPPCState *env, target_ulong value)
-{
-    ppc_store_dawr0(env, value);
+#define HELPER_STORE_DAWR(id)                                       \
+void helper_store_dawr##id(CPUPPCState *env, target_ulong value)   \
+{                                                                   \
+    env->spr[SPR_DAWR##id] = value;                                 \
 }
 
-void helper_store_dawrx0(CPUPPCState *env, target_ulong value)
-{
-    ppc_store_dawrx0(env, value);
+#define HELPER_STORE_DAWRX(id)                                      \
+void helper_store_dawrx##id(CPUPPCState *env, target_ulong value)  \
+{                                                                   \
+    env->spr[SPR_DAWRX##id] = value;                                \
 }

Did we lose the calls to ppc_store_dawr*? That will break direct register access (i.e., powernv) if so.

Yes. My test cases were focused more on cap-dawr1 with pSeries use cases and missed this. I have taken care of it in the next version.

+HELPER_STORE_DAWR(0)
+HELPER_STORE_DAWRX(0)
+
+HELPER_STORE_DAWR(1)
+HELPER_STORE_DAWRX(1)

I would say open-code all of these too instead of generating them. If we ever grew to >= 4 of them, maybe, but as is this saves 2 lines and makes 'helper_store_dawrx0' more difficult to grep for.

I have open-coded all of the functions; dropping the macros added barely 12 lines in total. The next version is posted at https://lore.kernel.org/qemu-devel/170679876639.188422.11634974895844092362.st...@ltc-boston1.aus.stglabs.ibm.com/T/#t

Thanks,
Shivaprasad
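For reference, the open-coded shape agreed on above — helpers that keep the ppc_store_* side effects instead of writing the SPR directly — would look roughly like this (a sketch of the agreed direction, not a quote from the posted v8):

 void helper_store_dawr1(CPUPPCState *env, target_ulong value)
 {
     ppc_store_dawr1(env, value);   /* updates SPR_DAWR1 and the watchpoint */
 }

 void helper_store_dawrx1(CPUPPCState *env, target_ulong value)
 {
     ppc_store_dawrx1(env, value);  /* updates SPR_DAWRX1 and the watchpoint */
 }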
[PATCH] vfio: container: Fix missing allocation of VFIOSpaprContainer
Commit 6ad359ec29 ("vfio/spapr: Move prereg_listener into spapr container") began to use the newly introduced VFIOSpaprContainer structure. After several refactors, today container_of(container, VFIOSpaprContainer, ABC) is used while VFIOSpaprContainer is actually never allocated. On PPC64 systems, this dereference leads to memory corruption, showing up as a glibc malloc assertion failure during guest start when using VFIO.

The patch adds the missing allocation, and also moves the structure to the VFIO common header file.

Fixes: 6ad359ec29 ("vfio/spapr: Move prereg_listener into spapr container")
Signed-off-by: Shivaprasad G Bhat
---
 hw/vfio/container.c           | 6 --
 hw/vfio/spapr.c               | 6 --
 include/hw/vfio/vfio-common.h | 6 ++
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 77bdec276e..ecaf5786d9 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -539,6 +539,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
 {
     VFIOContainer *container;
     VFIOContainerBase *bcontainer;
+    VFIOSpaprContainer *scontainer;
     int ret, fd;
     VFIOAddressSpace *space;
 
@@ -611,7 +612,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
         goto close_fd_exit;
     }
 
-    container = g_malloc0(sizeof(*container));
+    scontainer = g_malloc0(sizeof(*scontainer));
+    container = &scontainer->container;
     container->fd = fd;
     bcontainer = &container->bcontainer;
 
@@ -675,7 +677,7 @@ unregister_container_exit:
     vfio_cpr_unregister_container(bcontainer);
 
 free_container_exit:
-    g_free(container);
+    g_free(scontainer);
 
 close_fd_exit:
     close(fd);
diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c
index 0d949bb728..78d218b7e7 100644
--- a/hw/vfio/spapr.c
+++ b/hw/vfio/spapr.c
@@ -24,12 +24,6 @@
 #include "qapi/error.h"
 #include "trace.h"
 
-typedef struct VFIOSpaprContainer {
-    VFIOContainer container;
-    MemoryListener prereg_listener;
-    QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
-} VFIOSpaprContainer;
-
 static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
 {
     if (memory_region_is_iommu(section->mr)) {
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b9da6c08ef..010fa68ac6 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -82,6 +82,12 @@ typedef struct VFIOContainer {
     QLIST_HEAD(, VFIOGroup) group_list;
 } VFIOContainer;
 
+typedef struct VFIOSpaprContainer {
+    VFIOContainer container;
+    MemoryListener prereg_listener;
+    QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
+} VFIOSpaprContainer;
+
 typedef struct VFIOHostDMAWindow {
     hwaddr min_iova;
     hwaddr max_iova;
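For clarity on the failure mode: container_of() is pure pointer arithmetic, so casting a plain VFIOContainer allocation to the enclosing VFIOSpaprContainer makes the prereg_listener and hostwin_list fields alias memory past the end of the allocation. A minimal standalone illustration (simplified types, not the QEMU ones):

 #include <stddef.h>
 #include <stdlib.h>

 #define container_of(ptr, type, member) \
     ((type *)((char *)(ptr) - offsetof(type, member)))

 struct base    { int fd; };
 struct derived { struct base b; long extra[8]; };

 int main(void)
 {
     struct base *b = malloc(sizeof(*b));            /* bug: too small */
     struct derived *d = container_of(b, struct derived, b);

     d->extra[0] = 1;   /* heap overflow: writes past the allocation,
                         * which is how the glibc malloc assertion trips */
     free(b);
     return 0;
 }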
[Qemu-devel] [RFC v2 PATCH 0/3] ppc: spapr: virtual NVDIMM support
The patchset attempts to implement virtual NVDIMM support for pseries. PAPR semantics are such that each NVDIMM device comprises multiple SCM (Storage Class Memory) blocks. The hypervisor is expected to prepare the FDT for the NVDIMM device and send the guest a hotplug interrupt with the new type RTAS_LOG_V6_HP_TYPE_PMEM, which is already handled by the upstream kernel. In response to that interrupt, the guest requests the hypervisor to bind each of the SCM blocks of the NVDIMM device using hcalls. There can be SCM block unbind requests in case of driver errors or unplug (not supported now) use cases. The NVDIMM label reads/writes are done through hcalls.

There are also new forward-looking hcalls added (currently unused in the kernel) for querying information such as the binding and logical addresses of the SCM blocks. The current patchset leaves them unimplemented.

Since each virtual NVDIMM device is divided into multiple SCM blocks, the bind, unbind, and query hcalls on those blocks can come independently. This doesn't fit well into the QEMU device semantics, where map/unmap are done at (whole) device/object granularity. The patchset uses the existing NVDIMM class structures for the implementation, and the bind/unbind is left to happen at the object_add/del phase itself instead of on demand at hcall time.

The guest kernel makes bind/unbind requests for the virtual NVDIMM device at region-level granularity. Without interleaving, each virtual NVDIMM device is presented as a separate region, and there is no way to configure virtual NVDIMM interleaving for guests today. So a partial bind/unbind request can never arrive in an hcall for only a subset of the SCM blocks of a virtual NVDIMM, and it is therefore safe to bind/unbind everything during object_add/del.

The free device-memory region used for memory hotplug is composed of multiple LMBs of size 256 MiB, which are expected to be aligned to 256 MiB. As the SCM blocks are mapped to the same region, the SCM blocks also need to be aligned to this size for subsequent memory hotplug to work. The minimum SCM block size is set to this size for that reason, and can be made user-configurable in the future if required.

The first patch moves an existing static function to a common area for use by the subsequent patches. The second patch adds the FDT entries and basic device support; the third patch adds the hcall implementations.

The patches are also available at https://github.com/ShivaprasadGBhat/qemu.git - pseries-nvdimm branch - and can be used with the upstream kernel. ndctl can be used for configuring the nvdimms inside the guest.

This is how it can be used ..
Add nvdimm=on to the QEMU machine argument, e.g.: -machine pseries,nvdimm=on

For coldplug, the device is added on the QEMU command line as shown below:
 -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
 -device nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

For hotplug, the device is added from the monitor as below:
 object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
 device_add nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

---
v1: http://lists.nongnu.org/archive/html/qemu-devel/2019-02/msg01545.html
Changes from v1:
- Rebased to upstream; this required a dt_populate implementation for
  nvdimm hotplug support.
- Added a uuid option to the nvdimm device.
- Removed the memory region sizing-down code as suggested by Igor; now
  erroring out if the NVDIMM size excluding the label area is not
  aligned to 256MB, so patch 2 from the previous series is no longer
  needed.
- Removed un-implemented hcalls.
- Changed the hcalls to do different kinds of checks and return
  different values.
- Addressed comments for v1.

Shivaprasad G Bhat (3):
      mem: make nvdimm_device_list global
      spapr: Add NVDIMM device support
      spapr: Add Hcalls to support PAPR NVDIMM device

 default-configs/ppc64-softmmu.mak |   1
 hw/acpi/nvdimm.c                  |  27 -
 hw/mem/Kconfig                    |   2
 hw/mem/nvdimm.c                   |  70 +
 hw/ppc/spapr.c                    | 202 +++--
 hw/ppc/spapr_drc.c                |  18 +++
 hw/ppc/spapr_events.c             |   4 +
 hw/ppc/spapr_hcall.c              | 202 +
 include/hw/mem/nvdimm.h           |   8 +
 include/hw/ppc/spapr.h            |  19 +++
 include/hw/ppc/spapr_drc.h        |   9 ++
 11 files changed, 523 insertions(+), 39 deletions(-)
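Inside the guest, the ndctl flow would look roughly like this (hedged: region and namespace names depend on how the guest kernel enumerates the PAPR SCM device):

 ndctl list -R                                # show the regions backed by the vNVDIMMs
 ndctl create-namespace -r region0 -m fsdax   # carve a namespace out of region0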
[Qemu-devel] [RFC v2 PATCH 1/3] mem: make nvdimm_device_list global
nvdimm_device_list is required for parsing the list of devices in subsequent patches. Move it to a common area.

Signed-off-by: Shivaprasad G Bhat
Reviewed-by: Igor Mammedov
---
This looks to break the mips*-softmmu build. The mips targets depend on CONFIG_NVDIMM_ACPI, and adding CONFIG_NVDIMM looks wrong. Is there some CONFIG tweak I need to do here? Or should I move these functions to utilities as I have done here: https://github.com/ShivaprasadGBhat/qemu/commit/1b8eaea132a8b19c90b4fcc4d93da356029f4667?
---
 hw/acpi/nvdimm.c        | 27
 hw/mem/nvdimm.c         | 27 +++
 include/hw/mem/nvdimm.h |  2 ++
 3 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 9fdad6dc3f..94baba1b8f 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -33,33 +33,6 @@
 #include "hw/nvram/fw_cfg.h"
 #include "hw/mem/nvdimm.h"
 
-static int nvdimm_device_list(Object *obj, void *opaque)
-{
-    GSList **list = opaque;
-
-    if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
-        *list = g_slist_append(*list, DEVICE(obj));
-    }
-
-    object_child_foreach(obj, nvdimm_device_list, opaque);
-    return 0;
-}
-
-/*
- * inquire NVDIMM devices and link them into the list which is
- * returned to the caller.
- *
- * Note: it is the caller's responsibility to free the list to avoid
- * memory leak.
- */
-static GSList *nvdimm_get_device_list(void)
-{
-    GSList *list = NULL;
-
-    object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list);
-    return list;
-}
-
 #define NVDIMM_UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)             \
    { (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
      (b) & 0xff, ((b) >> 8) & 0xff, (c) & 0xff, ((c) >> 8) & 0xff,          \
diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index bf2adf5e16..f221ec7a9a 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -29,6 +29,33 @@
 #include "hw/mem/nvdimm.h"
 #include "hw/mem/memory-device.h"
 
+static int nvdimm_device_list(Object *obj, void *opaque)
+{
+    GSList **list = opaque;
+
+    if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
+        *list = g_slist_append(*list, DEVICE(obj));
+    }
+
+    object_child_foreach(obj, nvdimm_device_list, opaque);
+    return 0;
+}
+
+/*
+ * inquire NVDIMM devices and link them into the list which is
+ * returned to the caller.
+ *
+ * Note: it is the caller's responsibility to free the list to avoid
+ * memory leak.
+ */
+GSList *nvdimm_get_device_list(void)
+{
+    GSList *list = NULL;
+
+    object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list);
+    return list;
+}
+
 static void nvdimm_get_label_size(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
 {
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index 523a9b3d4a..bad4fc04b5 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -150,4 +150,6 @@ void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data,
                        uint32_t ram_slots);
 void nvdimm_plug(NVDIMMState *state);
 void nvdimm_acpi_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev);
+GSList *nvdimm_get_device_list(void);
+
 #endif
[Qemu-devel] [RFC v2 PATCH 2/3] spapr: Add NVDIMM device support
Add support for NVDIMM devices for sPAPR. Piggyback on the existing nvdimm device interface in QEMU to support virtual NVDIMM devices for Power (may have to re-look at this later). Create the required DT entries for the device (some entries have dummy values right now).

The patch creates the required DT node and sends a hotplug interrupt to the guest. The guest is expected to undertake the normal DR resource add path in response and start issuing PAPR SCM hcalls.

This is how it can be used ..
Add nvdimm=on to the QEMU machine argument, e.g.: -machine pseries,nvdimm=on
For coldplug, the device is added on the QEMU command line as shown below:
 -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
 -device nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0
For hotplug, the device is added from the monitor as below:
 object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
 device_add nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

Signed-off-by: Shivaprasad G Bhat
Signed-off-by: Bharata B Rao
               [Early implementation]
---
 default-configs/ppc64-softmmu.mak |   1
 hw/mem/Kconfig                    |   2
 hw/mem/nvdimm.c                   |  43
 hw/ppc/spapr.c                    | 202 +++--
 hw/ppc/spapr_drc.c                |  18 +++
 hw/ppc/spapr_events.c             |   4 +
 include/hw/mem/nvdimm.h           |   6 +
 include/hw/ppc/spapr.h            |  12 ++
 include/hw/ppc/spapr_drc.h        |   9 ++
 9 files changed, 286 insertions(+), 11 deletions(-)

diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
index cca52665d9..ae0841fa3a 100644
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -8,3 +8,4 @@ CONFIG_POWERNV=y
 
 # For pSeries
 CONFIG_PSERIES=y
+CONFIG_NVDIMM=y
diff --git a/hw/mem/Kconfig b/hw/mem/Kconfig
index 620fd4cb59..2ad052a536 100644
--- a/hw/mem/Kconfig
+++ b/hw/mem/Kconfig
@@ -8,4 +8,4 @@ config MEM_DEVICE
 config NVDIMM
     bool
     default y
-    depends on PC
+    depends on (PC || PSERIES)
diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index f221ec7a9a..deaeb5 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -93,11 +93,54 @@ out:
     error_propagate(errp, local_err);
 }
 
+static void nvdimm_get_uuid(Object *obj, Visitor *v, const char *name,
+                            void *opaque, Error **errp)
+{
+    NVDIMMDevice *nvdimm = NVDIMM(obj);
+    char *value = NULL;
+
+    value = qemu_uuid_unparse_strdup(&nvdimm->uuid);
+
+    visit_type_str(v, name, &value, errp);
+}
+
+
+static void nvdimm_set_uuid(Object *obj, Visitor *v, const char *name,
+                            void *opaque, Error **errp)
+{
+    NVDIMMDevice *nvdimm = NVDIMM(obj);
+    Error *local_err = NULL;
+    char *value;
+
+    visit_type_str(v, name, &value, &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    if (strcmp(value, "") == 0) {
+        error_setg(&local_err, "Property '%s.%s' requires a non-empty UUID",
+                   object_get_typename(obj), name);
+        goto out;
+    }
+
+    if (qemu_uuid_parse(value, &nvdimm->uuid) != 0) {
+        error_setg(&local_err, "Invalid UUID");
+        goto out;
+    }
+out:
+    error_propagate(errp, local_err);
+}
+
+
 static void nvdimm_init(Object *obj)
 {
     object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int",
                         nvdimm_get_label_size, nvdimm_set_label_size, NULL,
                         NULL, NULL);
+
+    object_property_add(obj, NVDIMM_UUID_PROP, "QemuUUID", nvdimm_get_uuid,
+                        nvdimm_set_uuid, NULL, NULL, NULL);
 }
 
 static void nvdimm_finalize(Object *obj)
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 2ef3ce4362..b6951577e7 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -74,6 +74,7 @@
 #include "qemu/cutils.h"
 #include "hw/ppc/spapr_cpu_core.h"
 #include "hw/mem/memory-device.h"
+#include "hw/mem/nvdimm.h"
 
 #include <libfdt.h>
 
@@ -699,6 +700,7 @@ static int spapr_populate_drmem_v2(SpaprMachineState *spapr, void *fdt,
     uint8_t *int_buf, *cur_index;
     int ret;
     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
+    uint64_t scm_block_size = SPAPR_MINIMUM_SCM_BLOCK_SIZE;
     uint64_t addr, cur_addr, size;
     uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
     uint64_t mem_end = machine->device_memory->base +
@@ -735,12 +737,20 @@ static int spapr_populate_drmem_v2(SpaprMachineState *spapr, void *fdt,
         nr_entries++;
     }
 
-    /* Entry for DIMM */
[Qemu-devel] [RFC v2 PATCH 3/3] spapr: Add Hcalls to support PAPR NVDIMM device
This patch implements a few of the hcalls necessary for nvdimm support.

PAPR semantics are such that each NVDIMM device comprises multiple SCM (Storage Class Memory) blocks. The guest requests the hypervisor to bind each of the SCM blocks of the NVDIMM device using hcalls. There can be SCM block unbind requests in case of driver errors or unplug (not supported now) use cases. The NVDIMM label reads/writes are done through hcalls.

Since each virtual NVDIMM device is divided into multiple SCM blocks, the bind, unbind, and query hcalls on those blocks can come independently. This doesn't fit well into the QEMU device semantics, where map/unmap are done at (whole) device/object granularity. The patch doesn't actually bind/unbind on hcalls but lets that happen at the object_add/del phase itself.

The guest kernel makes bind/unbind requests for the virtual NVDIMM device at region-level granularity. Without interleaving, each virtual NVDIMM device is presented as a separate region, and there is no way to configure virtual NVDIMM interleaving for guests today. So a partial bind/unbind request can never arrive in an hcall for only a subset of the SCM blocks of a virtual NVDIMM, and it is therefore safe to bind/unbind everything during object_add/del.

Signed-off-by: Shivaprasad G Bhat
---
 hw/ppc/spapr_hcall.c   | 202
 include/hw/ppc/spapr.h |   7 +-
 2 files changed, 208 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 6c16d2b120..b6e7d04dcf 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -3,11 +3,13 @@
 #include "sysemu/hw_accel.h"
 #include "sysemu/sysemu.h"
 #include "qemu/log.h"
+#include "qemu/range.h"
 #include "qemu/error-report.h"
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "helper_regs.h"
 #include "hw/ppc/spapr.h"
+#include "hw/ppc/spapr_drc.h"
 #include "hw/ppc/spapr_cpu_core.h"
 #include "mmu-hash64.h"
 #include "cpu-models.h"
@@ -16,6 +18,7 @@
 #include "hw/ppc/spapr_ovec.h"
 #include "mmu-book3s-v3.h"
 #include "hw/mem/memory-device.h"
+#include "hw/mem/nvdimm.h"
 
 static bool has_spr(PowerPCCPU *cpu, int spr)
 {
@@ -1795,6 +1798,199 @@ static target_ulong h_update_dt(PowerPCCPU *cpu, SpaprMachineState *spapr,
     return H_SUCCESS;
 }
 
+static target_ulong h_scm_read_metadata(PowerPCCPU *cpu,
+                                        SpaprMachineState *spapr,
+                                        target_ulong opcode,
+                                        target_ulong *args)
+{
+    uint32_t drc_index = args[0];
+    uint64_t offset = args[1];
+    uint64_t numBytesToRead = args[2];
+    SpaprDrc *drc = spapr_drc_by_index(drc_index);
+    NVDIMMDevice *nvdimm = NULL;
+    NVDIMMClass *ddc = NULL;
+
+    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+        return H_PARAMETER;
+    }
+
+    if (numBytesToRead != 1 && numBytesToRead != 2 &&
+        numBytesToRead != 4 && numBytesToRead != 8) {
+        return H_P3;
+    }
+
+    nvdimm = NVDIMM(drc->dev);
+    if ((offset + numBytesToRead < offset) ||
+        (nvdimm->label_size < numBytesToRead + offset)) {
+        return H_P2;
+    }
+
+    ddc = NVDIMM_GET_CLASS(nvdimm);
+    ddc->read_label_data(nvdimm, &args[0], numBytesToRead, offset);
+
+    return H_SUCCESS;
+}
+
+
+static target_ulong h_scm_write_metadata(PowerPCCPU *cpu,
+                                         SpaprMachineState *spapr,
+                                         target_ulong opcode,
+                                         target_ulong *args)
+{
+    uint32_t drc_index = args[0];
+    uint64_t offset = args[1];
+    uint64_t data = args[2];
+    int8_t numBytesToWrite = args[3];
+    SpaprDrc *drc = spapr_drc_by_index(drc_index);
+    NVDIMMDevice *nvdimm = NULL;
+    DeviceState *dev = NULL;
+    NVDIMMClass *ddc = NULL;
+
+    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+        return H_PARAMETER;
+    }
+
+    if (numBytesToWrite != 1 && numBytesToWrite != 2 &&
+        numBytesToWrite != 4 && numBytesToWrite != 8) {
+        return H_P4;
+    }
+
+    dev = drc->dev;
+    nvdimm = NVDIMM(dev);
+    if ((nvdimm->label_size < numBytesToWrite + offset) ||
+        (offset + numBytesToWrite < offset)) {
+        return H_P2;
+    }
+
+    ddc = NVDIMM_GET_CLASS(nvdimm);
+    ddc->write_label_data(nvdimm, &data, numBytesToWrite, offset);
+
+    return H_SUCCESS;
+}
+
+static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
+                                   target_ulong opcode,
+                                   target_ulong *args)
+{
+    uin
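One detail worth calling out from both metadata hcalls above: the `(offset + numBytesToRead < offset)` test is a guard against unsigned wrap-around, so a huge offset cannot slip past the label-size bound. A standalone restatement of the combined check (hypothetical helper name):

 #include <stdbool.h>
 #include <stdint.h>

 /* Reject (offset, len) windows that wrap around or overrun the label. */
 static bool label_access_ok(uint64_t label_size, uint64_t offset, uint64_t len)
 {
     return offset + len >= offset &&      /* no uint64 wrap-around  */
            offset + len <= label_size;    /* stays within the label */
 }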
[PATCH v3 0/3] ppc: spapr: virtual NVDIMM support
The patchset attempts to implement virtual NVDIMM support for pseries. PAPR semantics are such that each NVDIMM device comprises multiple SCM (Storage Class Memory) blocks. The hypervisor is expected to prepare the FDT for the NVDIMM device and send the guest a hotplug interrupt with the new type RTAS_LOG_V6_HP_TYPE_PMEM, which is already handled by the upstream kernel. In response to that interrupt, the guest requests the hypervisor to bind each of the SCM blocks of the NVDIMM device using hcalls. There can be SCM block unbind requests in case of driver errors or unplug (not supported now) use cases. The NVDIMM label reads/writes are done through hcalls.

Since each virtual NVDIMM device is divided into multiple SCM blocks, the bind, unbind, and query hcalls on those blocks can come independently. This doesn't fit well into the QEMU device semantics, where map/unmap are done at (whole) device/object granularity. The patchset uses the existing NVDIMM class structures for the implementation, and the bind/unbind is left to happen at the device_add/del phase itself instead of on demand at hcall time.

The guest kernel makes bind/unbind requests for the virtual NVDIMM device at region-level granularity. Without interleaving, each virtual NVDIMM device is presented as a separate region, and there is no way to configure virtual NVDIMM interleaving for guests today. So a partial bind/unbind request can never arrive in an hcall for only a subset of the SCM blocks of a virtual NVDIMM, and it is therefore safe to bind/unbind everything during object_add/del.

The free device-memory region used for memory hotplug is composed of multiple LMBs of size 256 MiB, which are expected to be aligned to 256 MiB. As the SCM blocks are mapped to the same region, the SCM blocks also need to be aligned to this size for subsequent memory hotplug to work. The minimum SCM block size is set to this size for that reason, and can be made user-configurable in the future if required.

The first patch moves an existing static function to a common utility area for use by the subsequent patches. The second patch adds the FDT entries and basic device support; the third patch adds the hcall implementations.

The patches are also available at https://github.com/ShivaprasadGBhat/qemu.git - pseries-nvdimm-v3 branch - and can be used with the upstream kernel. ndctl can be used for configuring the nvdimms inside the guest.

This is how it can be used ..
Ex: For coldplug, the device is added on the QEMU command line as shown below:
 -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
 -device nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0
For hotplug, the device is added from the monitor as below:
 object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
 device_add nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

---
v2: https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg02785.html
Changes from v2:
- Creating the DRC indices for the nvdimm devices in advance, as
  suggested, based on the user-specified max slots property.
- Removed the hard dependency on -machine nvdimm=on; enabled by default
  on the current latest pseries machine version.
- Renamed the functions to spapr_dt_X as suggested.
- Metadata is byteswapped before read/write to take care of endianness
  semantics during the hcall.

v1: http://lists.nongnu.org/archive/html/qemu-devel/2019-02/msg01545.html
Changes from v1:
- Rebased to upstream; this required a dt_populate implementation for
  nvdimm hotplug support.
- Added a uuid option to the nvdimm device.
- Removed the memory region sizing-down code as suggested by Igor; now
  erroring out if the NVDIMM size excluding the label area is not
  aligned to 256MB, so patch 2 from the previous series is no longer
  needed.
- Removed un-implemented hcalls.
- Changed the hcalls to do different kinds of checks and return
  different values.
- Addressed comments for v1.

Shivaprasad G Bhat (3):
      mem: move nvdimm_device_list to utilities
      spapr: Add NVDIMM device support
      spapr: Add Hcalls to support PAPR NVDIMM device

 default-configs/ppc64-softmmu.mak |   1
 hw/acpi/nvdimm.c                  |  28 ---
 hw/mem/Kconfig                    |   2
 hw/mem/nvdimm.c                   |  40 +++
 hw/ppc/spapr.c                    | 218 +--
 hw/ppc/spapr_drc.c                |  18 ++
 hw/ppc/spapr_events.c             |   4
 hw/ppc/spapr_hcall.c              | 300 +
 include/hw/mem/nvdimm.h           |   7 +
 include/hw/ppc/spapr.h            |  19 ++
 include/hw/ppc/spapr_drc.h        |   9 +
 include/qemu/nvdimm-utils.h       |   7 +
 util/Makefile.objs
[PATCH v3 1/3] mem: move nvdimm_device_list to utilities
nvdimm_device_list is required for parsing the list of devices in subsequent patches. Move it to the common utility area.

Signed-off-by: Shivaprasad G Bhat
---
 hw/acpi/nvdimm.c            | 28 +---
 include/qemu/nvdimm-utils.h |  7 +++
 util/Makefile.objs          |  1 +
 util/nvdimm-utils.c         | 29 +
 4 files changed, 38 insertions(+), 27 deletions(-)
 create mode 100644 include/qemu/nvdimm-utils.h
 create mode 100644 util/nvdimm-utils.c

diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c
index 9fdad6dc3f..5219dd0e2e 100644
--- a/hw/acpi/nvdimm.c
+++ b/hw/acpi/nvdimm.c
@@ -32,33 +32,7 @@
 #include "hw/acpi/bios-linker-loader.h"
 #include "hw/nvram/fw_cfg.h"
 #include "hw/mem/nvdimm.h"
-
-static int nvdimm_device_list(Object *obj, void *opaque)
-{
-    GSList **list = opaque;
-
-    if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
-        *list = g_slist_append(*list, DEVICE(obj));
-    }
-
-    object_child_foreach(obj, nvdimm_device_list, opaque);
-    return 0;
-}
-
-/*
- * inquire NVDIMM devices and link them into the list which is
- * returned to the caller.
- *
- * Note: it is the caller's responsibility to free the list to avoid
- * memory leak.
- */
-static GSList *nvdimm_get_device_list(void)
-{
-    GSList *list = NULL;
-
-    object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list);
-    return list;
-}
+#include "qemu/nvdimm-utils.h"
 
 #define NVDIMM_UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)             \
    { (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
diff --git a/include/qemu/nvdimm-utils.h b/include/qemu/nvdimm-utils.h
new file mode 100644
index 00..4b8b198ba7
--- /dev/null
+++ b/include/qemu/nvdimm-utils.h
@@ -0,0 +1,7 @@
+#ifndef NVDIMM_UTILS_H
+#define NVDIMM_UTILS_H
+
+#include "qemu/osdep.h"
+
+GSList *nvdimm_get_device_list(void);
+#endif
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 41bf59d127..a0f40d26e3 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -20,6 +20,7 @@ util-obj-y += envlist.o path.o module.o
 util-obj-y += host-utils.o
 util-obj-y += bitmap.o bitops.o hbitmap.o
 util-obj-y += fifo8.o
+util-obj-y += nvdimm-utils.o
 util-obj-y += cacheinfo.o
 util-obj-y += error.o qemu-error.o
 util-obj-y += qemu-print.o
diff --git a/util/nvdimm-utils.c b/util/nvdimm-utils.c
new file mode 100644
index 00..5cc768ca47
--- /dev/null
+++ b/util/nvdimm-utils.c
@@ -0,0 +1,29 @@
+#include "qemu/nvdimm-utils.h"
+#include "hw/mem/nvdimm.h"
+
+static int nvdimm_device_list(Object *obj, void *opaque)
+{
+    GSList **list = opaque;
+
+    if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
+        *list = g_slist_append(*list, DEVICE(obj));
+    }
+
+    object_child_foreach(obj, nvdimm_device_list, opaque);
+    return 0;
+}
+
+/*
+ * inquire NVDIMM devices and link them into the list which is
+ * returned to the caller.
+ *
+ * Note: it is the caller's responsibility to free the list to avoid
+ * memory leak.
+ */
+GSList *nvdimm_get_device_list(void)
+{
+    GSList *list = NULL;
+
+    object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list);
+    return list;
+}
[PATCH v3 2/3] spapr: Add NVDIMM device support
Add support for NVDIMM devices for sPAPR. Piggyback on the existing nvdimm device interface in QEMU to support virtual NVDIMM devices for Power. Create the required DT entries for the device (some entries have dummy values right now).

The patch creates the required DT node and sends a hotplug interrupt to the guest. The guest is expected to undertake the normal DR resource add path in response and start issuing PAPR SCM hcalls. Unlike x86, the device support is verified based on the machine version.

This is how it can be used ..
Ex: For coldplug, the device is added on the QEMU command line as shown below:
 -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
 -device nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0
For hotplug, the device is added from the monitor as below:
 object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896
 device_add nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0

Signed-off-by: Shivaprasad G Bhat
Signed-off-by: Bharata B Rao
               [Early implementation]
---
 default-configs/ppc64-softmmu.mak |   1
 hw/mem/Kconfig                    |   2
 hw/mem/nvdimm.c                   |  40 +++
 hw/ppc/spapr.c                    | 218 ++---
 hw/ppc/spapr_drc.c                |  18 +++
 hw/ppc/spapr_events.c             |   4 +
 include/hw/mem/nvdimm.h           |   7 +
 include/hw/ppc/spapr.h            |  11 ++
 include/hw/ppc/spapr_drc.h        |   9 ++
 9 files changed, 293 insertions(+), 17 deletions(-)

diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak
index cca52665d9..ae0841fa3a 100644
--- a/default-configs/ppc64-softmmu.mak
+++ b/default-configs/ppc64-softmmu.mak
@@ -8,3 +8,4 @@ CONFIG_POWERNV=y
 
 # For pSeries
 CONFIG_PSERIES=y
+CONFIG_NVDIMM=y
diff --git a/hw/mem/Kconfig b/hw/mem/Kconfig
index 620fd4cb59..2ad052a536 100644
--- a/hw/mem/Kconfig
+++ b/hw/mem/Kconfig
@@ -8,4 +8,4 @@ config MEM_DEVICE
 config NVDIMM
     bool
     default y
-    depends on PC
+    depends on (PC || PSERIES)
diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index 375f9a588a..e1238b5bed 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -69,11 +69,51 @@ out:
     error_propagate(errp, local_err);
 }
 
+static void nvdimm_get_uuid(Object *obj, Visitor *v, const char *name,
+                            void *opaque, Error **errp)
+{
+    NVDIMMDevice *nvdimm = NVDIMM(obj);
+    char *value = NULL;
+
+    value = qemu_uuid_unparse_strdup(&nvdimm->uuid);
+
+    visit_type_str(v, name, &value, errp);
+    g_free(value);
+}
+
+
+static void nvdimm_set_uuid(Object *obj, Visitor *v, const char *name,
+                            void *opaque, Error **errp)
+{
+    NVDIMMDevice *nvdimm = NVDIMM(obj);
+    Error *local_err = NULL;
+    char *value;
+
+    visit_type_str(v, name, &value, &local_err);
+    if (local_err) {
+        goto out;
+    }
+
+    if (qemu_uuid_parse(value, &nvdimm->uuid) != 0) {
+        error_setg(errp, "Property '%s.%s' has invalid value",
+                   object_get_typename(obj), name);
+        goto out;
+    }
+    g_free(value);
+
+out:
+    error_propagate(errp, local_err);
+}
+
+
 static void nvdimm_init(Object *obj)
 {
     object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int",
                         nvdimm_get_label_size, nvdimm_set_label_size, NULL,
                         NULL, NULL);
+
+    object_property_add(obj, NVDIMM_UUID_PROP, "QemuUUID", nvdimm_get_uuid,
+                        nvdimm_set_uuid, NULL, NULL, NULL);
 }
 
 static void nvdimm_finalize(Object *obj)
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 08a2a5a770..eb5c205078 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -80,6 +80,8 @@
 #include "hw/ppc/spapr_cpu_core.h"
 #include "hw/mem/memory-device.h"
 #include "hw/ppc/spapr_tpm_proxy.h"
+#include "hw/mem/nvdimm.h"
+#include "qemu/nvdimm-utils.h"
 
 #include <libfdt.h>
 
@@ -716,7 +718,8 @@ static int spapr_populate_drmem_v2(SpaprMachineState *spapr, void *fdt,
     uint8_t *int_buf, *cur_index;
     int ret;
     uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE;
-    uint64_t addr, cur_addr, size;
+    uint64_t addr, cur_addr, size, slot;
+    uint64_t scm_block_size = SPAPR_MINIMUM_SCM_BLOCK_SIZE;
     uint32_t nr_boot_lmbs = (machine->device_memory->base / lmb_size);
     uint64_t mem_end = machine->device_memory->base +
                        memory_region_size(&machine->device_memory->mr);
@@ -741,6 +744,7 @@ static int spapr_populate_drmem_v2(SpaprMachineState *spapr, void *fdt,
         addr = di->addr;
         size = di->size;
         node = di->node;
+        slot = di->slot;
 
         /* Entry f
[PATCH v3 3/3] spapr: Add Hcalls to support PAPR NVDIMM device
This patch implements a few of the hcalls necessary for nvdimm support.

PAPR semantics are such that each NVDIMM device comprises multiple SCM (Storage Class Memory) blocks. The guest requests the hypervisor to bind each of the SCM blocks of the NVDIMM device using hcalls. There can be SCM block unbind requests in case of driver errors or unplug (not supported now) use cases. The NVDIMM label reads/writes are done through hcalls.

Since each virtual NVDIMM device is divided into multiple SCM blocks, the bind, unbind, and query hcalls on those blocks can come independently. This doesn't fit well into the QEMU device semantics, where map/unmap are done at (whole) device/object granularity. The patch doesn't actually bind/unbind on hcalls but lets that happen at the device_add/del phase itself.

The guest kernel makes bind/unbind requests for the virtual NVDIMM device at region-level granularity. Without interleaving, each virtual NVDIMM device is presented as a separate region, and there is no way to configure virtual NVDIMM interleaving for guests today. So a partial bind/unbind request can never arrive in an hcall for only a subset of the SCM blocks of a virtual NVDIMM, and it is therefore safe to bind/unbind everything during device_add/del.

Signed-off-by: Shivaprasad G Bhat
---
 hw/ppc/spapr_hcall.c   | 300
 include/hw/ppc/spapr.h |   8 +
 2 files changed, 307 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 23e4bdb829..4e9ad96f7c 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -18,6 +18,10 @@
 #include "hw/ppc/spapr_ovec.h"
 #include "mmu-book3s-v3.h"
 #include "hw/mem/memory-device.h"
+#include "hw/ppc/spapr_drc.h"
+#include "hw/mem/nvdimm.h"
+#include "qemu/range.h"
+#include "qemu/nvdimm-utils.h"
 
 static bool has_spr(PowerPCCPU *cpu, int spr)
 {
@@ -1961,6 +1965,295 @@ static target_ulong h_update_dt(PowerPCCPU *cpu, SpaprMachineState *spapr,
     return H_SUCCESS;
 }
 
+static target_ulong h_scm_read_metadata(PowerPCCPU *cpu,
+                                        SpaprMachineState *spapr,
+                                        target_ulong opcode,
+                                        target_ulong *args)
+{
+    uint32_t drc_index = args[0];
+    uint64_t offset = args[1];
+    uint64_t numBytesToRead = args[2];
+    SpaprDrc *drc = spapr_drc_by_index(drc_index);
+    NVDIMMDevice *nvdimm;
+    NVDIMMClass *ddc;
+    __be64 data_be = 0;
+    uint64_t data = 0;
+
+    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+        return H_PARAMETER;
+    }
+
+    if (numBytesToRead != 1 && numBytesToRead != 2 &&
+        numBytesToRead != 4 && numBytesToRead != 8) {
+        return H_P3;
+    }
+
+    nvdimm = NVDIMM(drc->dev);
+    if ((offset + numBytesToRead < offset) ||
+        (nvdimm->label_size < numBytesToRead + offset)) {
+        return H_P2;
+    }
+
+    ddc = NVDIMM_GET_CLASS(nvdimm);
+    ddc->read_label_data(nvdimm, &data_be, numBytesToRead, offset);
+
+    switch (numBytesToRead) {
+    case 1:
+        data = data_be & 0xff;
+        break;
+    case 2:
+        data = be16_to_cpu(data_be & 0xffff);
+        break;
+    case 4:
+        data = be32_to_cpu(data_be & 0xffffffff);
+        break;
+    case 8:
+        data = be64_to_cpu(data_be);
+        break;
+    default:
+        break;
+    }
+
+    args[0] = data;
+
+    return H_SUCCESS;
+}
+
+static target_ulong h_scm_write_metadata(PowerPCCPU *cpu,
+                                         SpaprMachineState *spapr,
+                                         target_ulong opcode,
+                                         target_ulong *args)
+{
+    uint32_t drc_index = args[0];
+    uint64_t offset = args[1];
+    uint64_t data = args[2];
+    uint64_t numBytesToWrite = args[3];
+    SpaprDrc *drc = spapr_drc_by_index(drc_index);
+    NVDIMMDevice *nvdimm;
+    DeviceState *dev;
+    NVDIMMClass *ddc;
+    __be64 data_be = 0;
+
+    if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+        return H_PARAMETER;
+    }
+
+    if (numBytesToWrite != 1 && numBytesToWrite != 2 &&
+        numBytesToWrite != 4 && numBytesToWrite != 8) {
+        return H_P4;
+    }
+
+    dev = drc->dev;
+    nvdimm = NVDIMM(dev);
+
+    switch (numBytesToWrite) {
+    case 1:
+        if (data & 0xffffffffffffff00) {
+            return H_P2;
+        }
+        data_be = data & 0xff;
+        break;
+    case 2:
+        if (data & 0xffffffffffff0000) {
+            return H_P2;
+        }
+        data_be = cpu_to_be16(data & 0xffff);
+        break;
+    case 4:
+        if (data & 0xffffffff00000000) {
+            return H_P2;
+        }
Re: [PATCH v3 2/3] spapr: Add NVDIMM device support
On 12/06/2019 07:22 AM, David Gibson wrote:
On Wed, Nov 27, 2019 at 09:50:54AM +0530, Bharata B Rao wrote:
On Fri, Nov 22, 2019 at 10:42 AM David Gibson wrote:

Ok. A number of queries about this.

1) The PAPR spec for ibm,dynamic-memory-v2 says that the first word in each entry is the number of LMBs, but for NVDIMMs you use the not-necessarily-equal scm_block_size instead. Does the NVDIMM amendment for PAPR really specify different block sizes for these cases? (In which case that's a really stupid spec decision, but that wouldn't surprise me at this point.)

SCM block sizes can be different from LMB sizes, but here we enforce that the SCM device size (excluding metadata) is a multiple of the LMB size, so that we don't end up with a memory range that is not aligned to the LMB size.

Right, but it still doesn't make sense to use scm_block_size when you create the dynamic-memory-v2 property.

Right, I should use the LMB size here, as I will be creating holes here to disallow DIMMs from claiming those LMBs by marking them INVALID, as Bharata suggested before.

As far as the thing interpreting that goes, it *must* be LMB size, not SCM block size. If those are required to be the same at this point, you should use an assert().

The SCM block size should be a multiple of the LMB size, not necessarily equal. I'll add an assert for that, checking if equal. There is no benefit I see as of now in having a larger SCM block size, as the bind/unbind are already done before the bind hcall.

2) Similarly, the ibm,dynamic-memory-v2 description says that the memory block described by the entry has a whole batch of contiguous DRCs, starting at the DRC index given and continuing for #LMBs DRCs. For NVDIMMs it appears that you just have one DRC for the whole NVDIMM. Is that right?

One NVDIMM has one DRC. In our case, we need to mark the LMBs corresponding to that address range in ibm,dynamic-memory-v2 as reserved and invalid.

Ok, that fits very weirdly with the DRC allocation for the rest of pluggable memory, but I suppose that's PAPR for you. Having these in together is very inscrutable though, and relies on a heap of non-obvious constraints about the placement of DIMMs and NVDIMMs relative to each other. I really wonder if it would be better to have a completely different address range for the NVDIMMs.

The backend objects for both DIMM and NVDIMM are memory-backend-*, and they use addresses from the same space. Separating them would mean using/introducing a different backend object. I don't think we have a choice here.

3) You're not setting *any* extra flags on the entry. How is the guest supposed to know which are NVDIMM entries and which are regular DIMM entries? AFAICT in this version the NVDIMM slots are indistinguishable from the unassigned hotplug memory (which makes the difference in LMB and DRC numbering even more troubling).

For the NVDIMM case, this patch should populate the LMB set in ibm,dynamic-memory-v2 something like below:
    elem = spapr_get_drconf_cell(size / lmb_size, addr, 0, -1,
                                 SPAPR_LMB_FLAGS_RESERVED |
                                 SPAPR_LMB_FLAGS_DRC_INVALID);
This will ensure that the NVDIMM range is never considered a valid memory range for memory hotplug.

Hrm. Ok, so we already have code that does that for any gaps between DIMMs. I don't think there's actually anything that code will do differently than the code you have for NVDIMMs, so you could just skip over the NVDIMMs here and it should do the right thing.

The *interpretation* of those entries will become different: for space into which a regular DIMM is later inserted, we'll assume the DRC index given is a base and there are more DRCs following it, but for NVDIMMs we'll assume the same DRC throughout. This is nuts, but IIUC that's what PAPR says and we can't do much about it.

My current patch is buggy, as Bharata pointed out. The NVDIMM DRCs are not to be populated here; instead, the LMB DRCs are to be marked RESERVED and INVALID, so that no malicious attempts to online those LMBs at those NVDIMM address ranges can succeed.

4) AFAICT these are _present_ NVDIMMs, so why is SPAPR_LMB_FLAGS_ASSIGNED not set for them? (And why is the node forced to -1, regardless of di->node?)

         QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
         nr_entries++;
         cur_addr = addr + size;
@@ -1261,6 +1273,85 @@ static void spapr_dt_hypervisor(SpaprMachineState *spapr, void *fdt)
     }
 }
 
+static void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr)
+{
+    MachineState *machine = MACHINE(spapr);
+    int i;
+
+    for (i = 0; i < machine->ram_slots; i++) {
+        spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM, i);

What happens if you try to plug an NVDIMM into one of these slots, but a regular DIMM has already taken it?

NVDIMM hotplug won't get that occupied slot.

Ok.
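Putting the thread's conclusion together, the NVDIMM-covered range would be emitted like the existing inter-DIMM gaps — one reserved, DRC-invalid cell spanning the whole device — roughly as follows (a sketch assembled from the fragments quoted above, not the final patch):

 /* NVDIMM occupies [addr, addr + size); hide it from LMB hotplug */
 elem = spapr_get_drconf_cell(size / lmb_size, addr,
                              0 /* no DRC */, -1 /* no node */,
                              SPAPR_LMB_FLAGS_RESERVED |
                              SPAPR_LMB_FLAGS_DRC_INVALID);
 QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
 nr_entries++;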
Re: [PATCH v3 2/3] spapr: Add NVDIMM device support
On 12/11/2019 01:35 PM, Igor Mammedov wrote: On Wed, 11 Dec 2019 09:44:11 +0530 Shivaprasad G Bhat wrote: On 12/06/2019 07:22 AM, David Gibson wrote: On Wed, Nov 27, 2019 at 09:50:54AM +0530, Bharata B Rao wrote: On Fri, Nov 22, 2019 at 10:42 AM David Gibson wrote: Ok. A number of queries about this. 1) The PAPR spec for ibm,dynamic-memory-v2 says that the first word in each entry is the number of LMBs, but for NVDIMMs you use the not-necessarily-equal scm_block_size instead. Does the NVDIMM amendment for PAPR really specify to use different block sizes for these cases? (In which case that's a really stupid spec decision, but that wouldn't surprise me at this point). SCM block sizes can be different from LMB sizes, but here we enforce that SCM device size (excluding metadata) to multiple of LMB size so that we don't end up memory range that is not aligned to LMB size. Right, but it still doesn't make sense to use scm_block_size when you create the dynamic-memory-v2 property. Right, I should use LMB size here as I will be creating holes here to disallow DIMMs to claim those LMBs marking them INVALID as Bharata Suggested before. As far as the thing interpreting that goes, it *must* be LMB size, not SCM block size. If those are required to be the same at this point, you should use an assert(). SCM block size should be a multiple for LMB size, need not be equal. I'll add an assert for that, checking if equal. There is no benefit I see as of now having higher SCM block size as the bind/unbind are already done before the bind hcall. 2) Similarly, the ibm,dynamic-memory-v2 description says that the memory block described by the entry has a whole batch of contiguous DRCs starting at the DRC index given and continuing for #LMBs DRCs. For NVDIMMs it appears that you just have one DRC for the whole NVDIMM. Is that right? One NVDIMM has one DRC, In our case, we need to mark the LMBs corresponding to that address range in ibm,dynamic-memory-v2 as reserved and invalid. Ok, that fits very weirdly with the DRC allocation for the rest of pluggable memory, but I suppose that's PAPR for you. Having these in together is very inscrutable though, and relies on a heap of non-obvious constraints about placement of DIMMs and NVDIMMs relative to each other. I really wonder if it would be better to have a completely different address range for the NVDIMMs. The backend object for both DIMM and NVDIMM are memory-backend-* and they use the address from the same space. Separating it would mean using/introducing different backend object. I dont think we have a choice here. What address-space(s) are are talking about here exactly? From my point of view memory-backend-* provides RAM block at some HVA, which shouldn't not have anything to do with how NVDIMM partitions and maps it to GPA. Ah, you are right! I got confused with the HVA. Nonetheless, I don't see a need for having vNVDIMM in different guest physical address range as the existing code has support for marking memory ranges distinctly for DIMM/NVDIMM. On another note, the x86 too does it the same way. There is no separate range defined there. 3) You're not setting *any* extra flags on the entry. How is the guest supposed to know which are NVDIMM entries and which are regular DIMM entries? AFAICT in this version the NVDIMM slots are indistinguishable from the unassigned hotplug memory (which makes the difference in LMB and DRC numbering even more troubling). 
For NVDIMM case, this patch should populate the LMB set in ibm,dynamic-memory-v2 something like below: elem = spapr_get_drconf_cell(size / lmb_size, addr, 0, -1, SPAPR_LMB_FLAGS_RESERVED | SPAPR_LMB_FLAGS_DRC_INVALID); This will ensure that the NVDIMM range will never be considered as a valid memory range for memory hotplug. Hrm. Ok so we already have code that does that for any gaps between DIMMs. I don't think there's actually anything that that code will do differently than the code you have for NVDIMMs, so you could just skip over the NVDIMMs here and it should do the right thing. The *interpretation* of those entries will become different: for space into which a regular DIMM is later inserted, we'll assume the DRC index given is a base and there are more DRCs following it, but for NVDIMMs we'll assume the same DRC throughout. This is nuts, but IIUC that's what PAPR says and we can't do much about it. My current patch is buggy as Bharata pointed out. The NVDIMM DRCs are not to be populated here; instead, the LMB DRCs are to be marked RESERVED and INVALID so that malicious attempts to online those LMBs at the NVDIMM address ranges are rejected. 4) AFAICT these are _present_ NVDIMMs, so why is SPAPR_LMB_FLAGS_ASSIGNED not set for them? (and why is the node forced to -1, regardless of di->node). QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry);
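A compile-time form of the block-size constraint discussed above might look like the following sketch; the macro names (SPAPR_MINIMUM_SCM_BLOCK_SIZE, SPAPR_MEMORY_BLOCK_SIZE) are the ones a later revision in this archive uses, shown here only for illustration:

    /* Build-time check: the minimum SCM block size must be a whole
     * multiple of the LMB/memory block size discussed above. */
    QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE);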
Re: [PATCH v3 2/3] spapr: Add NVDIMM device support
Hi David, On 11/22/2019 10:00 AM, David Gibson wrote: On Mon, Oct 14, 2019 at 01:37:50PM -0500, Shivaprasad G Bhat wrote: --- index 62f1a42592..815167e42f 100644 --- a/hw/ppc/spapr_drc.c +++ b/hw/ppc/spapr_drc.c @@ -708,6 +708,17 @@ static void spapr_drc_phb_class_init(ObjectClass *k, void *data) drck->dt_populate = spapr_phb_dt_populate; } +static void spapr_drc_pmem_class_init(ObjectClass *k, void *data) +{ +SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_CLASS(k); + +drck->typeshift = SPAPR_DR_CONNECTOR_TYPE_SHIFT_PMEM; +drck->typename = "MEM"; This is the same as the typename for LMB DRCs. Doesn't that mean that ibm,drc-types will end up with a duplicate in it? Correct, this has to be "PMEM" instead of "MEM". Fixing it in next version. Thanks, Shivaprasad +drck->drc_name_prefix = "PMEM ";
Re: [PATCH v3 3/3] spapr: Add Hcalls to support PAPR NVDIMM device
Hi David, On 11/22/2019 10:41 AM, David Gibson wrote: On Mon, Oct 14, 2019 at 01:38:16PM -0500, Shivaprasad G Bhat wrote: device_add/del phase itself instead. The guest kernel makes bind/unbind requests for the virtual NVDIMM device at the region level granularity. Without interleaving, each virtual NVDIMM It's not clear to me what a "region" means in this context. That is PMEM terminology. "region" in this context is guest physical address range. Fixing all the rest of the things you pointed out. Thanks, Shivaprasad
[PATCH v4 1/4] mem: move nvdimm_device_list to utilities
nvdimm_device_list is required for parsing the list for devices in subsequent patches. Move it to common utility area. Signed-off-by: Shivaprasad G Bhat Reviewed-by: Igor Mammedov --- hw/acpi/nvdimm.c| 28 +--- include/qemu/nvdimm-utils.h |7 +++ util/Makefile.objs |1 + util/nvdimm-utils.c | 29 + 4 files changed, 38 insertions(+), 27 deletions(-) create mode 100644 include/qemu/nvdimm-utils.h create mode 100644 util/nvdimm-utils.c diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 9fdad6dc3f..5219dd0e2e 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -32,33 +32,7 @@ #include "hw/acpi/bios-linker-loader.h" #include "hw/nvram/fw_cfg.h" #include "hw/mem/nvdimm.h" - -static int nvdimm_device_list(Object *obj, void *opaque) -{ -GSList **list = opaque; - -if (object_dynamic_cast(obj, TYPE_NVDIMM)) { -*list = g_slist_append(*list, DEVICE(obj)); -} - -object_child_foreach(obj, nvdimm_device_list, opaque); -return 0; -} - -/* - * inquire NVDIMM devices and link them into the list which is - * returned to the caller. - * - * Note: it is the caller's responsibility to free the list to avoid - * memory leak. - */ -static GSList *nvdimm_get_device_list(void) -{ -GSList *list = NULL; - -object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list); -return list; -} +#include "qemu/nvdimm-utils.h" #define NVDIMM_UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ { (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ diff --git a/include/qemu/nvdimm-utils.h b/include/qemu/nvdimm-utils.h new file mode 100644 index 00..4b8b198ba7 --- /dev/null +++ b/include/qemu/nvdimm-utils.h @@ -0,0 +1,7 @@ +#ifndef NVDIMM_UTILS_H +#define NVDIMM_UTILS_H + +#include "qemu/osdep.h" + +GSList *nvdimm_get_device_list(void); +#endif diff --git a/util/Makefile.objs b/util/Makefile.objs index df124af1c5..2a096fe190 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -20,6 +20,7 @@ util-obj-y += envlist.o path.o module.o util-obj-y += host-utils.o util-obj-y += bitmap.o bitops.o hbitmap.o util-obj-y += fifo8.o +util-obj-y += nvdimm-utils.o util-obj-y += cacheinfo.o util-obj-y += error.o qemu-error.o util-obj-y += qemu-print.o diff --git a/util/nvdimm-utils.c b/util/nvdimm-utils.c new file mode 100644 index 00..5cc768ca47 --- /dev/null +++ b/util/nvdimm-utils.c @@ -0,0 +1,29 @@ +#include "qemu/nvdimm-utils.h" +#include "hw/mem/nvdimm.h" + +static int nvdimm_device_list(Object *obj, void *opaque) +{ +GSList **list = opaque; + +if (object_dynamic_cast(obj, TYPE_NVDIMM)) { +*list = g_slist_append(*list, DEVICE(obj)); +} + +object_child_foreach(obj, nvdimm_device_list, opaque); +return 0; +} + +/* + * inquire NVDIMM devices and link them into the list which is + * returned to the caller. + * + * Note: it is the caller's responsibility to free the list to avoid + * memory leak. + */ +GSList *nvdimm_get_device_list(void) +{ +GSList *list = NULL; + +object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list); +return list; +}
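As a sketch of how a caller might use the moved helper (the loop shape is the one the later sPAPR patches adopt; error handling omitted):

    /* Walk every NVDIMM on the machine; the caller owns the list. */
    GSList *nvdimms = nvdimm_get_device_list();
    GSList *iter;

    for (iter = nvdimms; iter; iter = iter->next) {
        NVDIMMDevice *nvdimm = NVDIMM(iter->data);
        /* ... emit per-device state, e.g. a device tree node ... */
    }
    g_slist_free(nvdimms);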
[PATCH v4 3/4] spapr: Add NVDIMM device support
Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm device interface in QEMU to support virtual NVDIMM devices for Power. Create the required DT entries for the device (some entries have dummy values right now). The patch creates the required DT node and sends a hotplug interrupt to the guest. Guest is expected to undertake the normal DR resource add path in response and start issuing PAPR SCM hcalls. The device support is verified based on the machine version unlike x86. This is how it can be used .. Ex : For coldplug, the device to be added in qemu command line as shown below -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896 -device nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0 For hotplug, the device to be added from monitor as below object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896 device_add nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0 Signed-off-by: Shivaprasad G Bhat Signed-off-by: Bharata B Rao [Early implementation] --- default-configs/ppc64-softmmu.mak |1 hw/mem/Kconfig|2 hw/ppc/spapr.c| 216 ++--- hw/ppc/spapr_drc.c| 18 +++ hw/ppc/spapr_events.c |4 + include/hw/ppc/spapr.h| 11 ++ include/hw/ppc/spapr_drc.h|9 ++ 7 files changed, 245 insertions(+), 16 deletions(-) diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak index cca52665d9..ae0841fa3a 100644 --- a/default-configs/ppc64-softmmu.mak +++ b/default-configs/ppc64-softmmu.mak @@ -8,3 +8,4 @@ CONFIG_POWERNV=y # For pSeries CONFIG_PSERIES=y +CONFIG_NVDIMM=y diff --git a/hw/mem/Kconfig b/hw/mem/Kconfig index 620fd4cb59..2ad052a536 100644 --- a/hw/mem/Kconfig +++ b/hw/mem/Kconfig @@ -8,4 +8,4 @@ config MEM_DEVICE config NVDIMM bool default y -depends on PC +depends on (PC || PSERIES) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 3ae7db1563..921d8d7c8e 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -80,6 +80,8 @@ #include "hw/ppc/spapr_cpu_core.h" #include "hw/mem/memory-device.h" #include "hw/ppc/spapr_tpm_proxy.h" +#include "hw/mem/nvdimm.h" +#include "qemu/nvdimm-utils.h" #include "monitor/monitor.h" @@ -685,12 +687,22 @@ static int spapr_populate_drmem_v2(SpaprMachineState *spapr, void *fdt, nr_entries++; } -/* Entry for DIMM */ -drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size); -g_assert(drc); -elem = spapr_get_drconf_cell(size / lmb_size, addr, - spapr_drc_index(drc), node, - SPAPR_LMB_FLAGS_ASSIGNED); +if (info->value->type == MEMORY_DEVICE_INFO_KIND_DIMM) { +/* Entry for DIMM */ +drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr / lmb_size); +g_assert(drc); +elem = spapr_get_drconf_cell(size / lmb_size, addr, + spapr_drc_index(drc), node, + SPAPR_LMB_FLAGS_ASSIGNED); +} else if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) { +/* + * NVDIMM sits here, let the DIMM LMBs be unusable here in the + * whole range + */ +elem = spapr_get_drconf_cell(size / lmb_size, addr, 0, -1, + SPAPR_LMB_FLAGS_RESERVED | + SPAPR_LMB_FLAGS_DRC_INVALID); +} QSIMPLEQ_INSERT_TAIL(&drconf_queue, elem, entry); nr_entries++; cur_addr = addr + size; @@ -1197,6 +1209,85 @@ static void spapr_dt_hypervisor(SpaprMachineState *spapr, void *fdt) } } +static int spapr_dt_nvdimm(void *fdt, int parent_offset, + NVDIMMDevice *nvdimm) +{ +int child_offset; +char buf[40]; +SpaprDrc *drc; +uint32_t drc_idx; +uint32_t node = object_property_get_uint(OBJECT(nvdimm), 
PC_DIMM_NODE_PROP, + &error_abort); +uint64_t slot = object_property_get_uint(OBJECT(nvdimm), PC_DIMM_SLOT_PROP, + &error_abort); +uint32_t associativity[] = { +cpu_to_be32(0x4), /* length */ +cpu_to_be32(0x0), cpu_to_be32(0x0), +cpu_to_be32(0x0), cpu_to_be32(node) +}; +uint64_t lsize = nvdimm->label_size; +uint64_t size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP, +NULL); + +drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, slot); +g_assert(drc
[PATCH v4 4/4] spapr: Add Hcalls to support PAPR NVDIMM device
This patch implements a few of the necessary hcalls for the nvdimm support. PAPR semantics are such that each NVDIMM device comprises multiple SCM (Storage Class Memory) blocks. The guest requests the hypervisor to bind each of the SCM blocks of the NVDIMM device using hcalls. There can be SCM block unbind requests in case of driver errors or unplug (not supported now) use cases. The NVDIMM label read/writes are done through hcalls. Since each virtual NVDIMM device is divided into multiple SCM blocks, the bind, unbind, and queries using hcalls on those blocks can come independently. This doesn't fit well into the qemu device semantics, where the map/unmap are done at the (whole) device/object level granularity. The patch doesn't actually bind/unbind on hcalls but lets it happen at the device_add/del phase itself instead. The guest kernel makes bind/unbind requests for the virtual NVDIMM device at the region level granularity. Without interleaving, each virtual NVDIMM device is presented as a separate guest physical address range. So, there is no way a partial bind/unbind request can come for the vNVDIMM in an hcall for a subset of SCM blocks of a virtual NVDIMM. Hence it is safe to do bind/unbind everything during the device_add/del. Signed-off-by: Shivaprasad G Bhat --- hw/ppc/Makefile.objs |2 hw/ppc/spapr_nvdimm.c | 337 include/hw/ppc/spapr.h |8 + 3 files changed, 345 insertions(+), 2 deletions(-) create mode 100644 hw/ppc/spapr_nvdimm.c diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs index 580bb4f0dd..0366020ef9 100644 --- a/hw/ppc/Makefile.objs +++ b/hw/ppc/Makefile.objs @@ -5,7 +5,7 @@ obj-$(CONFIG_PSERIES) += spapr.o spapr_caps.o spapr_vio.o spapr_events.o obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o spapr_irq.o -obj-$(CONFIG_PSERIES) += spapr_tpm_proxy.o +obj-$(CONFIG_PSERIES) += spapr_tpm_proxy.o spapr_nvdimm.o obj-$(CONFIG_SPAPR_RNG) += spapr_rng.o # IBM PowerNV obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o pnv_psi.o pnv_occ.o pnv_bmc.o diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c new file mode 100644 index 00..4a3f796597 --- /dev/null +++ b/hw/ppc/spapr_nvdimm.c @@ -0,0 +1,337 @@ +/* + * QEMU PAPR Storage Class Memory Interfaces + * + * Copyright (c) 2019, IBM Corporation. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE.
+ */ +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "hw/ppc/spapr.h" +#include "hw/ppc/spapr_drc.h" +#include "hw/mem/nvdimm.h" +#include "qemu/range.h" +#include "qemu/nvdimm-utils.h" + +static target_ulong h_scm_read_metadata(PowerPCCPU *cpu, +SpaprMachineState *spapr, +target_ulong opcode, +target_ulong *args) +{ +uint32_t drc_index = args[0]; +uint64_t offset = args[1]; +uint64_t numBytesToRead = args[2]; +SpaprDrc *drc = spapr_drc_by_index(drc_index); +NVDIMMDevice *nvdimm; +NVDIMMClass *ddc; +uint64_t data = 0; +uint8_t buf[8] = { 0 }; + +if (!drc || !drc->dev || +spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) { +return H_PARAMETER; +} + +if (numBytesToRead != 1 && numBytesToRead != 2 && +numBytesToRead != 4 && numBytesToRead != 8) { +return H_P3; +} + +nvdimm = NVDIMM(drc->dev); +if ((offset + numBytesToRead < offset) || +(nvdimm->label_size < numBytesToRead + offset)) { +return H_P2; +} + +ddc = NVDIMM_GET_CLASS(nvdimm); +ddc->read_label_data(nvdimm, buf, numBytesToRead, offset); + +switch
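The hunk above is cut off at the final switch; for reference, the v6 revision of this hcall (later in this archive) completes the same size dispatch, which with this version's variable naming reads:

    switch (numBytesToRead) {
    case 1:
        data = ldub_p(buf);
        break;
    case 2:
        data = lduw_be_p(buf);
        break;
    case 4:
        data = ldl_be_p(buf);
        break;
    case 8:
        data = ldq_be_p(buf);
        break;
    default:
        g_assert_not_reached();
    }

    args[0] = data;

    return H_SUCCESS;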
[PATCH v4 0/4] ppc: spapr: virtual NVDIMM support
The patchset attempts to implement the virtual NVDIMM for pseries. PAPR semantics are such that each NVDIMM device comprises multiple SCM (Storage Class Memory) blocks. The hypervisor is expected to prepare the FDT for the NVDIMM device and send the guest a hotplug interrupt with the new type RTAS_LOG_V6_HP_TYPE_PMEM, currently handled by the upstream kernel. In response to that interrupt, the guest requests the hypervisor to bind each of the SCM blocks of the NVDIMM device using hcalls. There can be SCM block unbind requests in case of driver errors or unplug (not supported now) use cases. The NVDIMM label read/writes are done through hcalls. Since each virtual NVDIMM device is divided into multiple SCM blocks, the bind, unbind, and queries using hcalls on those blocks can come independently. This doesn't fit well into the qemu device semantics, where the map/unmap are done at the (whole) device/object level granularity. The patchset uses the existing NVDIMM class structures for the implementation. The bind/unbind is left to happen at the device_add/del phase itself instead of at hcalls on-demand. The guest kernel makes bind/unbind requests for the virtual NVDIMM device at the region level granularity. Without interleaving, each virtual NVDIMM device is presented as a separate region. Hence it is safe to do bind/unbind everything during the object_add/del. The free device-memory region which is used for memory hotplug is composed of multiple LMBs of size 256 MiB and is expected to be aligned to 256 MiB. As the SCM blocks are mapped to the same region, the SCM blocks also need to be aligned to this size for the subsequent memory hotplug to work. The minimum SCM block size is set to this size for that reason and can be made user configurable in the future if required. The first patch moves the existing static function to a common area for use by the subsequent patches. The second patch adds a new uuid property to the nvdimm device. The third patch adds FDT entries and basic device support, and the fourth patch adds the hcalls implementation. The patches are also available at https://github.com/ShivaprasadGBhat/qemu.git - pseries-nvdimm-v4 branch and can be used with the upstream kernel. ndctl can be used for configuring the nvdimms inside the guest. This is how it can be used. E.g., for coldplug, the device is to be added on the qemu command line as shown below -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896 -device nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0 For hotplug, the device is to be added from the monitor as below object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896 device_add nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0 --- v3: https://lists.gnu.org/archive/html/qemu-devel/2019-10/msg03452.html Changes from v3: - Moved NVDIMM uuid property addition to new patch. - Moved the SCM hcalls to new file - Changed the metadata read/write hcalls to use st/ldX_be_p macros. - Fixed all comments on v3 v2: https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg02785.html Changes from v2: - Creating the drc indices for the nvdimm devices in advance as suggested based on the number of user specified max slots property. - Removed the hard dependency on -machine nvdimm=on, enabled by default on the current latest pseries machine version. - Renamed the functions to spapr_dt_X as suggested.
- Metadata is byteswapped before read/write to take care of endianness semantics during the hcall. v1 : http://lists.nongnu.org/archive/html/qemu-devel/2019-02/msg01545.html Changes from v1: - Rebased to upstream, this required a dt_populate implementation for nvdimm hotplug support - Added uuid option to nvdimm device - Removed the memory region sizing down code as suggested by Igor, now erroring out if NVDIMM size excluding the label area is not aligned to 256MB, so patch 2 from previous series no longer needed. - Removed un-implemented hcalls - Changed the hcalls to do different kinds of checks and return different values. - Addressed comments for v1 Shivaprasad G Bhat (4): mem: move nvdimm_device_list to utilities nvdimm: add uuid property to nvdimm spapr: Add NVDIMM device support spapr: Add Hcalls to support PAPR NVDIMM device default-configs/ppc64-softmmu.mak |1 hw/acpi/nvdimm.c | 28 --- hw/mem/Kconfig|2 hw/mem/nvdimm.c | 40 hw/ppc/Makefile.objs |2 hw/ppc/spapr.c| 216 ++-- hw/ppc/spapr_drc.c| 18 ++ hw/ppc/spapr_events.c |4 hw/ppc/spapr_nvdimm.c | 337
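As a quick sanity check on the example sizes used throughout this series (standalone arithmetic, not QEMU code): the backend size of 1073872896 bytes is exactly 1 GiB of SCM space plus the 128 KiB label area, so the size excluding labels stays aligned to the 256 MiB minimum SCM block size:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const uint64_t backend_size = 1073872896ULL;      /* size= above */
        const uint64_t label_size = 128 * 1024;           /* label-size=128k */
        const uint64_t scm_block = 256 * 1024 * 1024;     /* min SCM block */

        /* 1073872896 - 131072 = 1073741824 = 1 GiB = 4 * 256 MiB */
        assert((backend_size - label_size) % scm_block == 0);
        return 0;
    }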
[PATCH v4 2/4] nvdimm: add uuid property to nvdimm
For ppc64, PAPR requires the nvdimm device to have UUID property set in the device tree. Add an option to get it from the user. Signed-off-by: Shivaprasad G Bhat --- hw/mem/nvdimm.c | 40 include/hw/mem/nvdimm.h |7 +++ 2 files changed, 47 insertions(+) diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index 375f9a588a..e1238b5bed 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -69,11 +69,51 @@ out: error_propagate(errp, local_err); } +static void nvdimm_get_uuid(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ +NVDIMMDevice *nvdimm = NVDIMM(obj); +char *value = NULL; + +value = qemu_uuid_unparse_strdup(&nvdimm->uuid); + +visit_type_str(v, name, &value, errp); +g_free(value); +} + + +static void nvdimm_set_uuid(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ +NVDIMMDevice *nvdimm = NVDIMM(obj); +Error *local_err = NULL; +char *value; + +visit_type_str(v, name, &value, &local_err); +if (local_err) { +goto out; +} + +if (qemu_uuid_parse(value, &nvdimm->uuid) != 0) { +error_setg(errp, "Property '%s.%s' has invalid value", + object_get_typename(obj), name); +goto out; +} +g_free(value); + +out: +error_propagate(errp, local_err); +} + + static void nvdimm_init(Object *obj) { object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int", nvdimm_get_label_size, nvdimm_set_label_size, NULL, NULL, NULL); + +object_property_add(obj, NVDIMM_UUID_PROP, "QemuUUID", nvdimm_get_uuid, +nvdimm_set_uuid, NULL, NULL, NULL); } static void nvdimm_finalize(Object *obj) diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index 523a9b3d4a..4807ca615b 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -25,6 +25,7 @@ #include "hw/mem/pc-dimm.h" #include "hw/acpi/bios-linker-loader.h" +#include "qemu/uuid.h" #define NVDIMM_DEBUG 0 #define nvdimm_debug(fmt, ...)\ @@ -49,6 +50,7 @@ TYPE_NVDIMM) #define NVDIMM_LABEL_SIZE_PROP "label-size" +#define NVDIMM_UUID_PROP "uuid" #define NVDIMM_UNARMED_PROP"unarmed" struct NVDIMMDevice { @@ -83,6 +85,11 @@ struct NVDIMMDevice { * the guest write persistence. */ bool unarmed; + +/* + * The PPC64 - spapr requires each nvdimm device have a uuid. + */ +QemuUUID uuid; }; typedef struct NVDIMMDevice NVDIMMDevice;
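A minimal round-trip sketch of the UUID helpers the new property relies on (illustrative only, not part of the patch):

    QemuUUID uuid;
    char *str;

    if (qemu_uuid_parse("75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e", &uuid) == 0) {
        str = qemu_uuid_unparse_strdup(&uuid);   /* canonical string form */
        g_free(str);
    }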
Re: [PATCH v5 3/4] spapr: Add NVDIMM device support
On 02/04/2020 09:29 AM, David Gibson wrote: On Thu, Jan 30, 2020 at 05:48:15AM -0600, Shivaprasad G Bhat wrote: Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm device interface in QEMU to support virtual NVDIMM devices for Power. Create the required DT entries for the device (some entries have dummy values right now). The patch creates the required DT node and sends a hotplug interrupt to the guest. Guest is expected to undertake the normal DR resource add path in response and start issuing PAPR SCM hcalls. + " must be a multiple of %" PRIu64 "MB", + SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB); +return; +} + +uuidstr = object_property_get_str(OBJECT(dimm), NVDIMM_UUID_PROP, NULL); +qemu_uuid_parse(uuidstr, &uuid); Uh.. couldn't we just look at nvdimm->uuid, rather than getting the string property and parsing it again? Addressing all except this one as discussed. Posting the next version in a while. Thanks, Shivaprasad
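For reference, the two variants under discussion, side by side (a sketch only; the variable names are assumed from the patch context):

    /* What the patch does: go through the string property and re-parse. */
    uuidstr = object_property_get_str(OBJECT(dimm), NVDIMM_UUID_PROP, NULL);
    qemu_uuid_parse(uuidstr, &uuid);

    /* The suggestion: read the already-parsed field directly. */
    uuid = NVDIMM(dimm)->uuid;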
[PATCH v6 1/4] mem: move nvdimm_device_list to utilities
nvdimm_device_list is required for parsing the list for devices in subsequent patches. Move it to common utility area. Signed-off-by: Shivaprasad G Bhat Reviewed-by: Igor Mammedov Reviewed-by: David Gibson --- hw/acpi/nvdimm.c| 28 +--- include/qemu/nvdimm-utils.h |7 +++ util/Makefile.objs |1 + util/nvdimm-utils.c | 29 + 4 files changed, 38 insertions(+), 27 deletions(-) create mode 100644 include/qemu/nvdimm-utils.h create mode 100644 util/nvdimm-utils.c diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 9fdad6dc3f..5219dd0e2e 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -32,33 +32,7 @@ #include "hw/acpi/bios-linker-loader.h" #include "hw/nvram/fw_cfg.h" #include "hw/mem/nvdimm.h" - -static int nvdimm_device_list(Object *obj, void *opaque) -{ -GSList **list = opaque; - -if (object_dynamic_cast(obj, TYPE_NVDIMM)) { -*list = g_slist_append(*list, DEVICE(obj)); -} - -object_child_foreach(obj, nvdimm_device_list, opaque); -return 0; -} - -/* - * inquire NVDIMM devices and link them into the list which is - * returned to the caller. - * - * Note: it is the caller's responsibility to free the list to avoid - * memory leak. - */ -static GSList *nvdimm_get_device_list(void) -{ -GSList *list = NULL; - -object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list); -return list; -} +#include "qemu/nvdimm-utils.h" #define NVDIMM_UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ { (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ diff --git a/include/qemu/nvdimm-utils.h b/include/qemu/nvdimm-utils.h new file mode 100644 index 00..4b8b198ba7 --- /dev/null +++ b/include/qemu/nvdimm-utils.h @@ -0,0 +1,7 @@ +#ifndef NVDIMM_UTILS_H +#define NVDIMM_UTILS_H + +#include "qemu/osdep.h" + +GSList *nvdimm_get_device_list(void); +#endif diff --git a/util/Makefile.objs b/util/Makefile.objs index 11262aafaf..6b38b67cf1 100644 --- a/util/Makefile.objs +++ b/util/Makefile.objs @@ -20,6 +20,7 @@ util-obj-y += envlist.o path.o module.o util-obj-y += host-utils.o util-obj-y += bitmap.o bitops.o hbitmap.o util-obj-y += fifo8.o +util-obj-y += nvdimm-utils.o util-obj-y += cacheinfo.o util-obj-y += error.o qemu-error.o util-obj-y += qemu-print.o diff --git a/util/nvdimm-utils.c b/util/nvdimm-utils.c new file mode 100644 index 00..5cc768ca47 --- /dev/null +++ b/util/nvdimm-utils.c @@ -0,0 +1,29 @@ +#include "qemu/nvdimm-utils.h" +#include "hw/mem/nvdimm.h" + +static int nvdimm_device_list(Object *obj, void *opaque) +{ +GSList **list = opaque; + +if (object_dynamic_cast(obj, TYPE_NVDIMM)) { +*list = g_slist_append(*list, DEVICE(obj)); +} + +object_child_foreach(obj, nvdimm_device_list, opaque); +return 0; +} + +/* + * inquire NVDIMM devices and link them into the list which is + * returned to the caller. + * + * Note: it is the caller's responsibility to free the list to avoid + * memory leak. + */ +GSList *nvdimm_get_device_list(void) +{ +GSList *list = NULL; + +object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list); +return list; +}
[PATCH v6 0/4] ppc: spapr: virtual NVDIMM support
The patchset attempts to implement the virtual NVDIMM for pseries. PAPR semantics are such that each NVDIMM device comprises multiple SCM (Storage Class Memory) blocks. The hypervisor is expected to prepare the FDT for the NVDIMM device and send the guest a hotplug interrupt with the new type RTAS_LOG_V6_HP_TYPE_PMEM, currently handled by the upstream kernel. In response to that interrupt, the guest requests the hypervisor to bind each of the SCM blocks of the NVDIMM device using hcalls. There can be SCM block unbind requests in case of driver errors or unplug (not supported now) use cases. The NVDIMM label read/writes are done through hcalls. Since each virtual NVDIMM device is divided into multiple SCM blocks, the bind, unbind, and queries using hcalls on those blocks can come independently. This doesn't fit well into the qemu device semantics, where the map/unmap are done at the (whole) device/object level granularity. The patchset uses the existing NVDIMM class structures for the implementation. The bind/unbind is left to happen at the device_add/del phase itself instead of at hcalls on-demand. The guest kernel makes bind/unbind requests for the virtual NVDIMM device at the region level granularity. Without interleaving, each virtual NVDIMM device is presented as a separate region. Hence it is safe to do bind/unbind everything during the object_add/del. The free device-memory region which is used for memory hotplug is composed of multiple LMBs of size 256 MiB and is expected to be aligned to 256 MiB. As the SCM blocks are mapped to the same region, the SCM blocks also need to be aligned to this size for the subsequent memory hotplug to work. The minimum SCM block size is set to this size for that reason and can be made user configurable in the future if required. The first patch moves the existing static function to a common area for use by the subsequent patches. The second patch adds a new uuid property to the nvdimm device. The third patch adds FDT entries and basic device support, and the fourth patch adds the hcalls implementation. The patches are also available at https://github.com/ShivaprasadGBhat/qemu.git - pseries-nvdimm-v6 branch and can be used with the upstream kernel. ndctl can be used for configuring the nvdimms inside the guest. This is how it can be used. E.g., for coldplug, the device is to be added on the qemu command line as shown below -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896 -device nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0 For hotplug, the device is to be added from the monitor as below object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896 device_add nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0 --- v5: https://lists.nongnu.org/archive/html/qemu-devel/2020-01/msg07472.html Changes from v5: - Moved most of the nvdimm code from spapr.c to spapr_nvdimm.c - Addressed all style/logic comments. v4: https://lists.gnu.org/archive/html/qemu-devel/2019-12/msg03455.html Changes from v4: - The nvdimm occupied GPA area is marked as available for hotplug, the existing code takes care of whether the dimm device is actually present there or used by nvdimm. - fixed all comments for hcall implementation code on style/logic issues. v3: https://lists.gnu.org/archive/html/qemu-devel/2019-10/msg03452.html Changes from v3: - Moved NVDIMM uuid property addition to new patch.
- Moved the SCM hcalls to new file - Changed the metadata read/write hcalls to use st/ldX_be_p macros. - Fixed all comments on v3 v2: https://lists.gnu.org/archive/html/qemu-devel/2019-05/msg02785.html Changes from v2: - Creating the drc indices for the nvdimm devices in advance as suggested based on the number of user specified max slots property. - Removed the hard dependency on -machine nvdimm=on, enabled by default on the current latest pseries machine version. - Renamed the functions to spapr_dt_X as suggested. - Metadata is byteswapped before read/write to take care of endianness semantics during the hcall. v1 : http://lists.nongnu.org/archive/html/qemu-devel/2019-02/msg01545.html Changes from v1: - Rebased to upstream, this required a dt_populate implementation for nvdimm hotplug support - Added uuid option to nvdimm device - Removed the memory region sizing down code as suggested by Igor, now erroring out if NVDIMM size excluding the label area is not aligned to 256MB, so patch 2 from previous series no longer needed. - Removed un-implemented hcalls - Changed the hcalls to do different kinds of checks and return different values. - Addressed comments for v1 --- Shivaprasad G Bhat (4): mem: move nvdimm_device_list to utilities nvdimm: add uuid property to
[PATCH v6 3/4] spapr: Add NVDIMM device support
Add support for NVDIMM devices for sPAPR. Piggyback on existing nvdimm device interface in QEMU to support virtual NVDIMM devices for Power. Create the required DT entries for the device (some entries have dummy values right now). The patch creates the required DT node and sends a hotplug interrupt to the guest. Guest is expected to undertake the normal DR resource add path in response and start issuing PAPR SCM hcalls. The device support is verified based on the machine version unlike x86. This is how it can be used .. Ex : For coldplug, the device to be added in qemu command line as shown below -object memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896 -device nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0 For hotplug, the device to be added from monitor as below object_add memory-backend-file,id=memnvdimm0,prealloc=yes,mem-path=/tmp/nvdimm0,share=yes,size=1073872896 device_add nvdimm,label-size=128k,uuid=75a3cdd7-6a2f-4791-8d15-fe0a920e8e9e,memdev=memnvdimm0,id=nvdimm0,slot=0 Signed-off-by: Shivaprasad G Bhat Signed-off-by: Bharata B Rao [Early implementation] --- default-configs/ppc64-softmmu.mak |1 hw/mem/Kconfig|2 hw/ppc/Makefile.objs |2 hw/ppc/spapr.c| 69 +- hw/ppc/spapr_drc.c| 19 hw/ppc/spapr_events.c |4 + hw/ppc/spapr_nvdimm.c | 177 + include/hw/ppc/spapr_drc.h|9 ++ include/hw/ppc/spapr_nvdimm.h | 37 9 files changed, 309 insertions(+), 11 deletions(-) create mode 100644 hw/ppc/spapr_nvdimm.c create mode 100644 include/hw/ppc/spapr_nvdimm.h diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak index cca52665d9..ae0841fa3a 100644 --- a/default-configs/ppc64-softmmu.mak +++ b/default-configs/ppc64-softmmu.mak @@ -8,3 +8,4 @@ CONFIG_POWERNV=y # For pSeries CONFIG_PSERIES=y +CONFIG_NVDIMM=y diff --git a/hw/mem/Kconfig b/hw/mem/Kconfig index 620fd4cb59..2ad052a536 100644 --- a/hw/mem/Kconfig +++ b/hw/mem/Kconfig @@ -8,4 +8,4 @@ config MEM_DEVICE config NVDIMM bool default y -depends on PC +depends on (PC || PSERIES) diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs index a4bac57be6..c3d3cc56eb 100644 --- a/hw/ppc/Makefile.objs +++ b/hw/ppc/Makefile.objs @@ -7,7 +7,7 @@ obj-$(CONFIG_PSERIES) += spapr.o spapr_caps.o spapr_vio.o spapr_events.o obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o spapr_irq.o -obj-$(CONFIG_PSERIES) += spapr_tpm_proxy.o +obj-$(CONFIG_PSERIES) += spapr_tpm_proxy.o spapr_nvdimm.o obj-$(CONFIG_SPAPR_RNG) += spapr_rng.o obj-$(call land,$(CONFIG_PSERIES),$(CONFIG_LINUX)) += spapr_pci_vfio.o spapr_pci_nvlink2.o # IBM PowerNV diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index c9b2e0a5e0..d3cb8b4c7b 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -80,6 +80,7 @@ #include "hw/ppc/spapr_cpu_core.h" #include "hw/mem/memory-device.h" #include "hw/ppc/spapr_tpm_proxy.h" +#include "hw/ppc/spapr_nvdimm.h" #include "monitor/monitor.h" @@ -675,6 +676,14 @@ static int spapr_populate_drmem_v2(SpaprMachineState *spapr, void *fdt, size = di->size; node = di->node; +/* + * The NVDIMM area is hotpluggable after the NVDIMM is unplugged. The + * area is marked hotpluggable in the next iteration for the bigger + * chunk including the NVDIMM occupied area. 
+ */ +if (info->value->type == MEMORY_DEVICE_INFO_KIND_NVDIMM) +continue; + /* Entry for hot-pluggable area */ if (cur_addr < addr) { drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, cur_addr / lmb_size); @@ -1266,6 +1275,11 @@ void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, size_t space) } } +/* NVDIMM devices */ +if (mc->nvdimm_supported) { +spapr_dt_persistent_memory(fdt); +} + return fdt; } @@ -2629,6 +2643,7 @@ static void spapr_machine_init(MachineState *machine) { SpaprMachineState *spapr = SPAPR_MACHINE(machine); SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); +MachineClass *mc = MACHINE_GET_CLASS(machine); const char *kernel_filename = machine->kernel_filename; const char *initrd_filename = machine->initrd_filename; PCIHostState *phb; @@ -2861,6 +2876,10 @@ static void spapr_machine_init(MachineState *machine) "may run and log hardware error on the destination"); } +if (mc->nvdimm_supported) { +spapr_create_nvdimm_dr_connectors(spapr); +} + /* Set up RTAS event infrastructure
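spapr_create_nvdimm_dr_connectors(), called above, was shown in an earlier reply in this archive; for context, its shape is one PMEM DRC per user-configurable memory slot:

    static void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr)
    {
        MachineState *machine = MACHINE(spapr);
        int i;

        /* One PMEM DR connector per memory slot. */
        for (i = 0; i < machine->ram_slots; i++) {
            spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_PMEM, i);
        }
    }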
[PATCH v6 2/4] nvdimm: add uuid property to nvdimm
For ppc64, PAPR requires the nvdimm device to have UUID property set in the device tree. Add an option to get it from the user. Signed-off-by: Shivaprasad G Bhat Reviewed-by: David Gibson Reviewed-by: Igor Mammedov --- hw/mem/nvdimm.c | 40 include/hw/mem/nvdimm.h |7 +++ 2 files changed, 47 insertions(+) diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index 39f1426d1f..8e426d24bb 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -69,11 +69,51 @@ out: error_propagate(errp, local_err); } +static void nvdimm_get_uuid(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ +NVDIMMDevice *nvdimm = NVDIMM(obj); +char *value = NULL; + +value = qemu_uuid_unparse_strdup(&nvdimm->uuid); + +visit_type_str(v, name, &value, errp); +g_free(value); +} + + +static void nvdimm_set_uuid(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ +NVDIMMDevice *nvdimm = NVDIMM(obj); +Error *local_err = NULL; +char *value; + +visit_type_str(v, name, &value, &local_err); +if (local_err) { +goto out; +} + +if (qemu_uuid_parse(value, &nvdimm->uuid) != 0) { +error_setg(errp, "Property '%s.%s' has invalid value", + object_get_typename(obj), name); +goto out; +} +g_free(value); + +out: +error_propagate(errp, local_err); +} + + static void nvdimm_init(Object *obj) { object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int", nvdimm_get_label_size, nvdimm_set_label_size, NULL, NULL, NULL); + +object_property_add(obj, NVDIMM_UUID_PROP, "QemuUUID", nvdimm_get_uuid, +nvdimm_set_uuid, NULL, NULL, NULL); } static void nvdimm_finalize(Object *obj) diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index 523a9b3d4a..4807ca615b 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -25,6 +25,7 @@ #include "hw/mem/pc-dimm.h" #include "hw/acpi/bios-linker-loader.h" +#include "qemu/uuid.h" #define NVDIMM_DEBUG 0 #define nvdimm_debug(fmt, ...)\ @@ -49,6 +50,7 @@ TYPE_NVDIMM) #define NVDIMM_LABEL_SIZE_PROP "label-size" +#define NVDIMM_UUID_PROP "uuid" #define NVDIMM_UNARMED_PROP"unarmed" struct NVDIMMDevice { @@ -83,6 +85,11 @@ struct NVDIMMDevice { * the guest write persistence. */ bool unarmed; + +/* + * The PPC64 - spapr requires each nvdimm device have a uuid. + */ +QemuUUID uuid; }; typedef struct NVDIMMDevice NVDIMMDevice;
[PATCH v6 4/4] spapr: Add Hcalls to support PAPR NVDIMM device
This patch implements a few of the necessary hcalls for the nvdimm support. PAPR semantics are such that each NVDIMM device comprises multiple SCM (Storage Class Memory) blocks. The guest requests the hypervisor to bind each of the SCM blocks of the NVDIMM device using hcalls. There can be SCM block unbind requests in case of driver errors or unplug (not supported now) use cases. The NVDIMM label read/writes are done through hcalls. Since each virtual NVDIMM device is divided into multiple SCM blocks, the bind, unbind, and queries using hcalls on those blocks can come independently. This doesn't fit well into the qemu device semantics, where the map/unmap are done at the (whole) device/object level granularity. The patch doesn't actually bind/unbind on hcalls but lets it happen at the device_add/del phase itself instead. The guest kernel makes bind/unbind requests for the virtual NVDIMM device at the region level granularity. Without interleaving, each virtual NVDIMM device is presented as a separate guest physical address range. So, there is no way a partial bind/unbind request can come for the vNVDIMM in an hcall for a subset of SCM blocks of a virtual NVDIMM. Hence it is safe to do bind/unbind everything during the device_add/del. Signed-off-by: Shivaprasad G Bhat --- hw/ppc/spapr_nvdimm.c | 298 include/hw/ppc/spapr.h |8 + 2 files changed, 305 insertions(+), 1 deletion(-) diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index d03c8d3a5c..74eeb8bb74 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -28,6 +28,7 @@ #include "hw/mem/nvdimm.h" #include "qemu/nvdimm-utils.h" #include "hw/ppc/fdt.h" +#include "qemu/range.h" void spapr_nvdimm_validate_opts(NVDIMMDevice *nvdimm, uint64_t size, Error **errp) @@ -175,3 +176,300 @@ void spapr_dt_persistent_memory(void *fdt) return; } + +static target_ulong h_scm_read_metadata(PowerPCCPU *cpu, +SpaprMachineState *spapr, +target_ulong opcode, +target_ulong *args) +{ +uint32_t drc_index = args[0]; +uint64_t offset = args[1]; +uint64_t len = args[2]; +SpaprDrc *drc = spapr_drc_by_index(drc_index); +NVDIMMDevice *nvdimm; +NVDIMMClass *ddc; +uint64_t data = 0; +uint8_t buf[8] = { 0 }; + +if (!drc || !drc->dev || +spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) { +return H_PARAMETER; +} + +if (len != 1 && len != 2 && +len != 4 && len != 8) { +return H_P3; +} + +nvdimm = NVDIMM(drc->dev); +if ((offset + len < offset) || +(nvdimm->label_size < len + offset)) { +return H_P2; +} + +ddc = NVDIMM_GET_CLASS(nvdimm); +ddc->read_label_data(nvdimm, buf, len, offset); + +switch (len) { +case 1: +data = ldub_p(buf); +break; +case 2: +data = lduw_be_p(buf); +break; +case 4: +data = ldl_be_p(buf); +break; +case 8: +data = ldq_be_p(buf); +break; +default: +g_assert_not_reached(); +} + +args[0] = data; + +return H_SUCCESS; +} + +static target_ulong h_scm_write_metadata(PowerPCCPU *cpu, + SpaprMachineState *spapr, + target_ulong opcode, + target_ulong *args) +{ +uint32_t drc_index = args[0]; +uint64_t offset = args[1]; +uint64_t data = args[2]; +uint64_t len = args[3]; +SpaprDrc *drc = spapr_drc_by_index(drc_index); +NVDIMMDevice *nvdimm; +NVDIMMClass *ddc; +uint8_t buf[8] = { 0 }; + +if (!drc || !drc->dev || +spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) { +return H_PARAMETER; +} + +if (len != 1 && len != 2 && +len != 4 && len != 8) { +return H_P4; +} + +nvdimm = NVDIMM(drc->dev); +if ((offset + len < offset) || +(nvdimm->label_size < len + offset)) { +return H_P2; +} + +switch (len) { +case 1: +if (data & 0xffffffffffffff00) { +return H_P2; +}
+stb_p(buf, data); +break; +case 2: +if (data & 0xffffffffffff0000) { +return H_P2; +} +stw_be_p(buf, data); +break; +case 4: +if (data & 0xffffffff00000000) { +return H_P2; +} +stl_be_p(buf, data); +break; +case 8: +stq_be_p(buf, data); +break; +default: +g_assert_not_reached(); +} + +ddc = NVDIMM_GET_CLASS(nvdimm); +ddc->write_label_data(nvdimm, buf, len, offset); + +return H_SUCCESS; +} + +static targe
[PATCH 1/2] tcg: ppc64: Fix mask generation for vextractdm
In function do_extractm() the mask is calculated as dup_const(1 << (element_width - 1)). '1' being a signed int works fine for MO_8,16,32. For MO_64, on a PPC64 host this ends up becoming 0 on compilation. The vextractdm uses MO_64, and it ends up having the mask as 0. Explicitly use 1ULL instead of the signed int 1, as is done everywhere else. Signed-off-by: Shivaprasad G Bhat --- target/ppc/translate/vmx-impl.c.inc |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 112233b541..c8712dd7d8 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -2058,7 +2058,7 @@ static bool trans_VEXPANDQM(DisasContext *ctx, arg_VX_tb *a) static bool do_vextractm(DisasContext *ctx, arg_VX_tb *a, unsigned vece) { const uint64_t elem_width = 8 << vece, elem_count_half = 8 >> vece, - mask = dup_const(vece, 1 << (elem_width - 1)); + mask = dup_const(vece, 1ULL << (elem_width - 1)); uint64_t i, j; TCGv_i64 lo, hi, t0, t1;
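A standalone demonstration of the width problem (not QEMU code): replicating the per-element sign bit the way dup_const() does makes it clear why the literal must be 64-bit once the element width reaches 64:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Replicate the per-element sign bit across 64 bits, as
     * dup_const(vece, 1ULL << (elem_width - 1)) does after the fix. */
    static uint64_t sign_mask(unsigned vece)
    {
        uint64_t elem_width = 8 << vece;          /* 8, 16, 32, 64 */
        uint64_t bit = 1ULL << (elem_width - 1);  /* '1 <<' would be UB at 63 */
        uint64_t mask = 0;
        uint64_t i;

        for (i = 0; i < 64; i += elem_width) {
            mask |= bit << i;
        }
        return mask;
    }

    int main(void)
    {
        unsigned vece;

        /* Prints 0x8080808080808080, 0x8000800080008000,
         * 0x8000000080000000, 0x8000000000000000. */
        for (vece = 0; vece < 4; vece++) {
            printf("vece=%u mask=0x%016" PRIx64 "\n", vece, sign_mask(vece));
        }
        return 0;
    }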
[PATCH 0/2] tcg: ppc64: Fix mask generation for vextractdm
While debugging gitlab issue[1] 1536, I happened to try the vextract[X]m instructions on the real hardware. The test used in [1] is failing for vextractdm. On debugging, it is seen that in function do_extractm() the mask is calculated as dup_const(1 << (element_width - 1)). '1' being a signed int works fine for MO_8,16,32. For MO_64, on a PPC64 host this ends up becoming 0 on compilation. The vextractdm uses MO_64, and it ends up having the mask as 0. The first patch here fixes that by explicitly using 1ULL instead of the signed int 1, as is done everywhere else. The second patch introduces the test case from [1] into qemu tcg/ppc64 along with fixes/tweaks to make it work for both big and little-endian targets. Let me know if both patches should be squashed into a single patch. Checkpatch flagged the use of __BYTE_ORDER__ in the test file (second patch); however, I see it being used in multiarch/sha1.c, and this being an arch-specific test, I think it is appropriate to use it here. Let me know if otherwise. References: [1] : https://gitlab.com/qemu-project/qemu/-/issues/1536 --- Shivaprasad G Bhat (2): tcg: ppc64: Fix mask generation for vextractdm tests: tcg: ppc64: Add tests for Vector Extract Mask Instructions target/ppc/translate/vmx-impl.c.inc | 2 +- tests/tcg/ppc64/Makefile.target | 6 +++- tests/tcg/ppc64/vector.c| 50 + 3 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 tests/tcg/ppc64/vector.c -- Signature
[PATCH 2/2] tests: tcg: ppc64: Add tests for Vector Extract Mask Instructions
Add test for vextractbm, vextractwm, vextractdm and vextractqm instructions. Test works for both qemu-ppc64 and qemu-ppc64le. Based on the test case written by John Platts posted at [1] References: [1]: https://gitlab.com/qemu-project/qemu/-/issues/1536 Signed-off-by: John Platts Signed-off-by: Shivaprasad G Bhat --- tests/tcg/ppc64/Makefile.target |6 - tests/tcg/ppc64/vector.c| 50 +++ 2 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 tests/tcg/ppc64/vector.c diff --git a/tests/tcg/ppc64/Makefile.target b/tests/tcg/ppc64/Makefile.target index f081f1c683..4fd543ce28 100644 --- a/tests/tcg/ppc64/Makefile.target +++ b/tests/tcg/ppc64/Makefile.target @@ -20,7 +20,7 @@ PPC64_TESTS += mtfsf PPC64_TESTS += mffsce ifneq ($(CROSS_CC_HAS_POWER10),) -PPC64_TESTS += byte_reverse sha512-vector +PPC64_TESTS += byte_reverse sha512-vector vector endif byte_reverse: CFLAGS += -mcpu=power10 run-byte_reverse: QEMU_OPTS+=-cpu POWER10 @@ -33,6 +33,10 @@ sha512-vector: sha512.c run-sha512-vector: QEMU_OPTS+=-cpu POWER10 run-plugin-sha512-vector-with-%: QEMU_OPTS+=-cpu POWER10 +vector: CFLAGS += -mcpu=power10 +run-vector: QEMU_OPTS += -cpu POWER10 +run-plugin-vector-with-%: QEMU_OPTS += -cpu POWER10 + PPC64_TESTS += signal_save_restore_xer PPC64_TESTS += xxspltw diff --git a/tests/tcg/ppc64/vector.c b/tests/tcg/ppc64/vector.c new file mode 100644 index 00..3cb2b88c87 --- /dev/null +++ b/tests/tcg/ppc64/vector.c @@ -0,0 +1,50 @@ +#include <altivec.h> +#include <assert.h> + +int main(void) +{ +unsigned int result_wi; +vector unsigned char vbc_bi_src = { 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF, +0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, +0, 0xFF, 0xFF}; +vector unsigned short vbc_hi_src = { 0xFFFF, 0, 0, 0xFFFF, + 0, 0, 0xFFFF, 0xFFFF}; +vector unsigned int vbc_wi_src = {0, 0, 0xFFFFFFFF, 0xFFFFFFFF}; +vector unsigned long long vbc_di_src = {0xFFFFFFFFFFFFFFFF, 0}; +vector __uint128_t vbc_qi_src; + +asm("vextractbm %0, %1" : "=r" (result_wi) : "v" (vbc_bi_src)); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +assert(result_wi == 0b1101111111000011); +#else +assert(result_wi == 0b1100001111111011); +#endif + +asm("vextracthm %0, %1" : "=r" (result_wi) : "v" (vbc_hi_src)); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +assert(result_wi == 0b10010011); +#else +assert(result_wi == 0b11001001); +#endif + +asm("vextractwm %0, %1" : "=r" (result_wi) : "v" (vbc_wi_src)); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +assert(result_wi == 0b0011); +#else +assert(result_wi == 0b1100); +#endif + +asm("vextractdm %0, %1" : "=r" (result_wi) : "v" (vbc_di_src)); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +assert(result_wi == 0b10); +#else +assert(result_wi == 0b01); +#endif + +vbc_qi_src[0] = 0x1; +vbc_qi_src[0] = vbc_qi_src[0] << 127; +asm("vextractqm %0, %1" : "=r" (result_wi) : "v" (vbc_qi_src)); +assert(result_wi == 0b1); + +return 0; +}
[PATCH] softfloat: Fix the incorrect computation in float32_exp2()
The float32_exp2() is computing the wrong exponent of 2. For example, with the following set of values {0.1, 2.0, 2.0, -1.0}, the expected output would be {1.071773, 4.000000, 4.000000, 0.500000}. Instead, the function is computing {1.119102, 3.382044, 3.382044, -0.191022}. Looking at the code, float32_exp2() attempts to compute e^x = 1 + x/1! + x^2/2! + x^3/3! + x^4/4! + x^5/5! + ... + x^n/n! + ... but because of the typo/bug it ends up doing e^x = 1 + x/1! + x/2! + x/3! + x/4! + x/5! + ... + x/n! + ... This is because instead of xnp, which holds the numerator x^n, parts_muladd is using xp, which is just 'x'. The commit 572c4d862ff2 refactored this function, and it seems to have mistakenly used xp instead of xnp. This patch fixes the typo. Fixes: 572c4d862ff2 "softfloat: Convert float32_exp2 to FloatParts" Partially-Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1623 Reported-By: Luca Barbato (https://gitlab.com/lu-zero) Signed-off-by: Shivaprasad G Bhat Signed-off-by: Vaibhav Jain --- fpu/softfloat.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fpu/softfloat.c b/fpu/softfloat.c index c7454c3eb1a..108f9cb224a 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -5135,7 +5135,7 @@ float32 float32_exp2(float32 a, float_status *status) float64_unpack_canonical(&rp, float64_one, status); for (i = 0 ; i < 15 ; i++) { float64_unpack_canonical(&tp, float32_exp2_coefficients[i], status); -rp = *parts_muladd(&tp, &xp, &rp, 0, status); +rp = *parts_muladd(&tp, &xnp, &rp, 0, status); xnp = *parts_mul(&xnp, &xp, status); }
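The effect is easy to reproduce outside softfloat; a double-precision sketch of the two recurrences (standalone, for illustration only) gives almost exactly the values quoted above for input 2.0:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* Compute 2^2 as e^(2 ln 2) with 15 series terms, mirroring
         * the float32_exp2() loop structure. */
        double x = 2.0 * M_LN2;
        double r_good = 1.0, r_bad = 1.0, xn = x, fact = 1.0;
        int n;

        for (n = 1; n <= 15; n++) {
            fact *= n;
            r_good += xn / fact;   /* x^n / n!  -- intended            */
            r_bad  += x / fact;    /* x   / n!  -- what the bug yields */
            xn *= x;
        }
        /* prints good=4.000000 bad=3.382044 */
        printf("good=%f bad=%f\n", r_good, r_bad);
        return 0;
    }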
Re: [PATCH 2/2] tests: tcg: ppc64: Add tests for Vector Extract Mask Instructions
On 5/2/23 12:35, Cédric Le Goater wrote: On 4/13/23 21:01, Shivaprasad G Bhat wrote: Add test for vextractbm, vextractwm, vextractdm and vextractqm instructions. Test works for both qemu-ppc64 and qemu-ppc64le. Based on the test case written by John Platts posted at [1] References: [1]: https://gitlab.com/qemu-project/qemu/-/issues/1536 Gitlab issues should be referenced as : Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1536 However, this patch adds a test, not a fix. So it is the previous patch which should be annotated as resolving the issue. Also, I think the code should be using HOST_BIG_ENDIAN instead of __ORDER_BIG_ENDIAN__ Thanks for the comments Cédric. Fixing these in v2. Thanks, Shivaprasad
Re: [PATCH] softfloat: Fix the incorrect computation in float32_exp2()
Hi Richard, On 5/3/23 01:11, Richard Henderson wrote: On 5/2/23 16:25, Shivaprasad G Bhat wrote: The float32_exp2() is computing the wrong exponent of 2. For example, with the following set of values {0.1, 2.0, 2.0, -1.0}, the expected output would be {1.071773, 4.000000, 4.000000, 0.500000}. Instead, the function is computing {1.119102, 3.382044, 3.382044, -0.191022} This is because instead of xnp, which holds the numerator, parts_muladd is using xp, which is just 'x'. The commit 572c4d862ff2 refactored this function, and it seems to have mistakenly used xp instead of xnp. This patch fixes the typo. Fixes: 572c4d862ff2 "softfloat: Convert float32_exp2 to FloatParts" Partially-Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1623 Reported-By: Luca Barbato (https://gitlab.com/lu-zero) Signed-off-by: Shivaprasad G Bhat Signed-off-by: Vaibhav Jain --- fpu/softfloat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Whoops. Good catch. If you are fine with the patch, could you fix the mail id for Vaibhav Jain while pulling? If you have other comments, I will fix them in the next version otherwise. Thanks, Shivaprasad r~
[PATCH v2 1/2] tcg: ppc64: Fix mask generation for vextractdm
In function do_extractm() the mask is calculated as dup_const(1 << (element_width - 1)). '1' being a signed int works fine for MO_8,16,32. For MO_64, on a PPC64 host this ends up becoming 0 on compilation. The vextractdm uses MO_64, and it ends up having the mask as 0. Explicitly use 1ULL instead of the signed int 1, as is done everywhere else. Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1536 Signed-off-by: Shivaprasad G Bhat Reviewed-by: Alex Bennée Reviewed-by: Lucas Mateus Castro Reviewed-by: Richard Henderson --- target/ppc/translate/vmx-impl.c.inc |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc index 112233b541..c8712dd7d8 100644 --- a/target/ppc/translate/vmx-impl.c.inc +++ b/target/ppc/translate/vmx-impl.c.inc @@ -2058,7 +2058,7 @@ static bool trans_VEXPANDQM(DisasContext *ctx, arg_VX_tb *a) static bool do_vextractm(DisasContext *ctx, arg_VX_tb *a, unsigned vece) { const uint64_t elem_width = 8 << vece, elem_count_half = 8 >> vece, - mask = dup_const(vece, 1 << (elem_width - 1)); + mask = dup_const(vece, 1ULL << (elem_width - 1)); uint64_t i, j; TCGv_i64 lo, hi, t0, t1;
[PATCH v2 0/2] tcg: ppc64: Fix mask generation for vextractdm
While debugging gitlab issue[1] 1536, I happened to try the vextract[X]m instructions on the real hardware. The test used in [1] is failing for vextractdm. On debugging, it is seen that in function do_extractm() the mask is calculated as dup_const(1 << (element_width - 1)). '1' being a signed int works fine for MO_8,16,32. For MO_64, on a PPC64 host this ends up becoming 0 on compilation. The vextractdm uses MO_64, and it ends up having the mask as 0. The first patch here fixes that by explicitly using 1ULL instead of the signed int 1, as is done everywhere else. The second patch introduces the test case from [1] into qemu tcg/ppc64 along with fixes/tweaks to make it work for both big and little-endian targets. References: [1] : https://gitlab.com/qemu-project/qemu/-/issues/1536 --- Changelog: Since v1 : https://lists.gnu.org/archive/html/qemu-devel/2023-04/msg01958.html - Added "Resolves: " to first patch description - Rebased to top of the tree. I see that with d044b7c33a5, Alex has limited the scope of plugin tests to just the MULTIARCH_TESTS, so removed the plugin tests for the test case added in the second patch. - Changed the test case to use the HOST_BIG_ENDIAN from compiler.h Shivaprasad G Bhat (2): tcg: ppc64: Fix mask generation for vextractdm tests: tcg: ppc64: Add tests for Vector Extract Mask Instructions target/ppc/translate/vmx-impl.c.inc | 2 +- tests/tcg/ppc64/Makefile.target | 6 +++- tests/tcg/ppc64/vector.c| 51 + 3 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 tests/tcg/ppc64/vector.c -- Signature
[PATCH v2 2/2] tests: tcg: ppc64: Add tests for Vector Extract Mask Instructions
Add test for vextractbm, vextractwm, vextractdm and vextractqm instructions. Test works for both qemu-ppc64 and qemu-ppc64le. Based on the test case written by John Platts posted at [1] References: [1] - https://gitlab.com/qemu-project/qemu/-/issues/1536 Signed-off-by: John Platts Signed-off-by: Shivaprasad G Bhat Reviewed-by: Lucas Mateus Castro --- tests/tcg/ppc64/Makefile.target |5 +++- tests/tcg/ppc64/vector.c| 51 +++ 2 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 tests/tcg/ppc64/vector.c diff --git a/tests/tcg/ppc64/Makefile.target b/tests/tcg/ppc64/Makefile.target index 6d47d3cae6..b084963b9a 100644 --- a/tests/tcg/ppc64/Makefile.target +++ b/tests/tcg/ppc64/Makefile.target @@ -20,7 +20,7 @@ PPC64_TESTS += mtfsf PPC64_TESTS += mffsce ifneq ($(CROSS_CC_HAS_POWER10),) -PPC64_TESTS += byte_reverse sha512-vector +PPC64_TESTS += byte_reverse sha512-vector vector endif byte_reverse: CFLAGS += -mcpu=power10 run-byte_reverse: QEMU_OPTS+=-cpu POWER10 @@ -31,6 +31,9 @@ sha512-vector: sha512.c run-sha512-vector: QEMU_OPTS+=-cpu POWER10 +vector: CFLAGS += -mcpu=power10 -I$(SRC_PATH)/include +run-vector: QEMU_OPTS += -cpu POWER10 + PPC64_TESTS += signal_save_restore_xer PPC64_TESTS += xxspltw diff --git a/tests/tcg/ppc64/vector.c b/tests/tcg/ppc64/vector.c new file mode 100644 index 00..cbf4ae9332 --- /dev/null +++ b/tests/tcg/ppc64/vector.c @@ -0,0 +1,51 @@ +#include <altivec.h> +#include <assert.h> +#include "qemu/compiler.h" + +int main(void) +{ +unsigned int result_wi; +vector unsigned char vbc_bi_src = { 0xFF, 0xFF, 0, 0xFF, 0xFF, 0xFF, +0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, +0, 0xFF, 0xFF}; +vector unsigned short vbc_hi_src = { 0xFFFF, 0, 0, 0xFFFF, + 0, 0, 0xFFFF, 0xFFFF}; +vector unsigned int vbc_wi_src = {0, 0, 0xFFFFFFFF, 0xFFFFFFFF}; +vector unsigned long long vbc_di_src = {0xFFFFFFFFFFFFFFFF, 0}; +vector __uint128_t vbc_qi_src; + +asm("vextractbm %0, %1" : "=r" (result_wi) : "v" (vbc_bi_src)); +#if HOST_BIG_ENDIAN +assert(result_wi == 0b1101111111000011); +#else +assert(result_wi == 0b1100001111111011); +#endif + +asm("vextracthm %0, %1" : "=r" (result_wi) : "v" (vbc_hi_src)); +#if HOST_BIG_ENDIAN +assert(result_wi == 0b10010011); +#else +assert(result_wi == 0b11001001); +#endif + +asm("vextractwm %0, %1" : "=r" (result_wi) : "v" (vbc_wi_src)); +#if HOST_BIG_ENDIAN +assert(result_wi == 0b0011); +#else +assert(result_wi == 0b1100); +#endif + +asm("vextractdm %0, %1" : "=r" (result_wi) : "v" (vbc_di_src)); +#if HOST_BIG_ENDIAN +assert(result_wi == 0b10); +#else +assert(result_wi == 0b01); +#endif + +vbc_qi_src[0] = 0x1; +vbc_qi_src[0] = vbc_qi_src[0] << 127; +asm("vextractqm %0, %1" : "=r" (result_wi) : "v" (vbc_qi_src)); +assert(result_wi == 0b1); + +return 0; +}
[PATCH v6 2/3] spapr: nvdimm: Implement H_SCM_FLUSH hcall
The patch adds support for the SCM flush hcall for the nvdimm devices, to be made available for exploitation by the guest through the next patch. The hcall is applicable only for the new SPAPR-specific device class, which is also introduced in this patch. The hcall semantics are such that the flush returns H_LONG_BUSY_ORDER_10_MSEC, along with a continue_token, when the operation is expected to take longer. The hcall is then to be called again with the continue_token to get the status. So, all fresh requests are put into a 'pending' list and a flush worker is submitted to the thread pool. The thread pool completion callbacks move the requests to a 'completed' list, which are cleaned up after collecting the return status for the guest in a subsequent hcall from the guest. The semantics make it necessary to preserve the continue_tokens and their return status across migrations. So, the completed flush states are forwarded to the destination and the pending ones are restarted at the destination in post_load. The necessary nvdimm-flush-specific vmstate structures are also introduced in this patch; they are to be saved in the new SPAPR-specific nvdimm device to be introduced in the following patch. Signed-off-by: Shivaprasad G Bhat --- hw/ppc/spapr.c|2 hw/ppc/spapr_nvdimm.c | 263 + include/hw/ppc/spapr.h|4 - include/hw/ppc/spapr_nvdimm.h |1 4 files changed, 269 insertions(+), 1 deletion(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 3d6ec309dd..9263985663 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1634,6 +1634,8 @@ static void spapr_machine_reset(MachineState *machine) spapr->ov5_cas = spapr_ovec_clone(spapr->ov5); } +spapr_nvdimm_finish_flushes(); + /* DRC reset may cause a device to be unplugged. This will cause troubles * if this device is used by another device (eg, a running vhost backend * will crash QEMU if the DIMM holding the vring goes away). To avoid such diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index 91de1052f2..ed6fda2c23 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -22,6 +22,7 @@ * THE SOFTWARE. */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "qapi/error.h" #include "hw/ppc/spapr_drc.h" #include "hw/ppc/spapr_nvdimm.h" @@ -30,6 +31,9 @@ #include "hw/ppc/fdt.h" #include "qemu/range.h" #include "hw/ppc/spapr_numa.h" +#include "block/thread-pool.h" +#include "migration/vmstate.h" +#include "qemu/pmem.h" /* DIMM health bitmap bitmap indicators.
Taken from kernel's papr_scm.c */ /* SCM device is unable to persist memory contents */ @@ -47,6 +51,14 @@ /* Have an explicit check for alignment */ QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE); +#define TYPE_SPAPR_NVDIMM "spapr-nvdimm" +OBJECT_DECLARE_TYPE(SpaprNVDIMMDevice, SPAPRNVDIMMClass, SPAPR_NVDIMM) + +struct SPAPRNVDIMMClass { +/* private */ +NVDIMMClass parent_class; +}; + bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, uint64_t size, Error **errp) { @@ -375,6 +387,256 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr, return H_SUCCESS; } +typedef struct SpaprNVDIMMDeviceFlushState { +uint64_t continue_token; +int64_t hcall_ret; +int backend_fd; +uint32_t drcidx; + +QLIST_ENTRY(SpaprNVDIMMDeviceFlushState) node; +} SpaprNVDIMMDeviceFlushState; + +typedef struct SpaprNVDIMMDevice SpaprNVDIMMDevice; +struct SpaprNVDIMMDevice { +NVDIMMDevice parent_obj; + +uint64_t nvdimm_flush_token; +QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) pending_nvdimm_flush_states; +QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) completed_nvdimm_flush_states; +}; + +static int flush_worker_cb(void *opaque) +{ +SpaprNVDIMMDeviceFlushState *state = opaque; +SpaprDrc *drc = spapr_drc_by_index(state->drcidx); +PCDIMMDevice *dimm = PC_DIMM(drc->dev); +HostMemoryBackend *backend = MEMORY_BACKEND(dimm->hostmem); + +if (object_property_get_bool(OBJECT(backend), "pmem", NULL)) { +MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem); +void *ptr = memory_region_get_ram_ptr(mr); +size_t size = object_property_get_uint(OBJECT(dimm), PC_DIMM_SIZE_PROP, + NULL); + +/* flush pmem backend */ +pmem_persist(ptr, size); +} else { +/* flush raw backing image */ +if (qemu_fdatasync(state->backend_fd) < 0) { +error_report("papr_scm: Could not sync nvdimm to backend file: %s", + strerror(errno)); +return H_HARDWARE; +} +} + +return
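To make the hcall contract concrete, a guest-side caller is expected to keep
re-presenting the continue_token while the hypervisor reports busy. A
hypothetical sketch modeled loosely on Linux's papr_scm driver (the helper
name papr_scm_flush() and its exact arguments are assumptions, not part of
this patch):

    #include <linux/types.h>
    #include <linux/delay.h>
    #include <linux/errno.h>
    #include <asm/hvcall.h>
    #include <asm/plpar_wrappers.h>

    /* Hypothetical guest-side (kernel) caller of H_SCM_FLUSH */
    static int papr_scm_flush(u32 drc_index)
    {
        unsigned long ret_buf[PLPAR_HCALL_BUFSIZE];
        u64 token = 0;
        long rc;

        do {
            rc = plpar_hcall(H_SCM_FLUSH, ret_buf, drc_index, token);
            token = ret_buf[0];     /* continue_token for the retry */
            if (rc == H_LONG_BUSY_ORDER_10_MSEC) {
                msleep(10);         /* hinted back-off before polling again */
            }
        } while (rc == H_LONG_BUSY_ORDER_10_MSEC);

        return rc == H_SUCCESS ? 0 : -EIO;
    }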
Re: [PATCH REBASED v5 1/2] spapr: nvdimm: Implement H_SCM_FLUSH hcall
Hi David,

Thanks for the comments. Sorry about the delay. Replies inline.

On 9/21/21 11:53, David Gibson wrote:
> On Wed, Jul 07, 2021 at 09:57:21PM -0500, Shivaprasad G Bhat wrote:
>> The patch adds support for the SCM flush hcall for the nvdimm devices.
>> To be available for exploitation by guest through the next patch. The
>> hcall expects the semantics such that the flush to return with one of
>> H_LONG_BUSY when the operation is expected to take longer time along
>> with a continue_token. The hcall to be called again providing the
>> continue_token to get the status. So, all fresh requests are put into
>> a 'pending' list and flush worker is submitted to the thread pool. The
>> thread pool completion callbacks move the requests to 'completed' list,
>> which are cleaned up after reporting to guest in subsequent hcalls [...]
>>
>> @@ -30,6 +31,7 @@
>>  #include "hw/ppc/fdt.h"
>>  #include "qemu/range.h"
>>  #include "hw/ppc/spapr_numa.h"
>> +#include "block/thread-pool.h"
>>
>>  /* DIMM health bitmap bitmap indicators. Taken from kernel's papr_scm.c */
>>  /* SCM device is unable to persist memory contents */
>> @@ -375,6 +377,243 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
>>      return H_SUCCESS;
>>  }
>>
>> +static uint64_t flush_token;
>
> Better to put this in the machine state structure than a global.

Moved it to the device state itself as suggested, the states list is per
device now.

>> +static int flush_worker_cb(void *opaque)
>> +{
>> +    int ret = H_SUCCESS;
>> +    SpaprNVDIMMDeviceFlushState *state = opaque;
>> +
>> +    /* flush raw backing image */
[...]
>> +                 !QLIST_EMPTY(&spapr->completed_flush_states));
>> +}
>> +
>> +static int spapr_nvdimm_post_load(void *opaque, int version_id)
>> +{
>> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
>> +    SpaprNVDIMMDeviceFlushState *state, *next;
>> +    PCDIMMDevice *dimm;
>> +    HostMemoryBackend *backend = NULL;
>> +    ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
>> +    SpaprDrc *drc;
>> +
>> +    QLIST_FOREACH_SAFE(state, &spapr->completed_flush_states, node, next) {
>
> I don't think you need FOREACH_SAFE here. You're not removing entries
> from the loop body. If you're trying to protect against concurrent
> removals, I don't think FOREACH_SAFE is sufficient, you'll need an
> actual lock (but I think it's already protected by the BQL).

Changing here, below and also at spapr_nvdimm_get_flush_status() while
traversing the pending list. Verified all these invocations are called
with the BQL held.

>> +        if (flush_token < state->continue_token) {
>> +            flush_token = state->continue_token;
>> +        }
>> +    }
>> +
>> +    QLIST_FOREACH_SAFE(state, &spapr->pending_flush_states, node, next) {
>
> Same comments here.

>> +        if (flush_token < state->continue_token) {
>> +            flush_token = state->continue_token;
>> +        }
>> +
>> +        drc = spapr_drc_by_index(state->drcidx);
>> +        dimm = PC_DIMM(drc->dev);
>> +        backend = MEMORY_BACKEND(dimm->hostmem);
>> +        state->backend_fd = memory_region_get_fd(&backend->mr);
>> +
>> +        thread_pool_submit_aio(pool, flush_worker_cb, state,
>> +                               spapr_nvdimm_flush_completion_cb, state);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +const VMStateDescription vmstate_spapr_nvdimm_states = {
>> +    .name = "spapr_nvdimm_states",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .needed = spapr_nvdimm_states_needed,
>> +    .post_load = spapr_nvdimm_post_load,
>> +    .fields = (VMStateField[]) {
>> +        VMSTATE_QLIST_V(completed_flush_states, SpaprMachineState, 1,
>> +                        vmstate_spapr_nvdimm_flush_state,
>> +                        SpaprNVDIMMDeviceFlushState, node),
>> +        VMSTATE_QLIST_V(pending_flush_states, SpaprMachineState, 1,
>> +                        vmstate_spapr_nvdimm_flush_state,
>> +                        SpaprNVDIMMDeviceFlushState, node),
>> +        VMSTATE_END_OF_LIST()
>> +    },
>> +};
>> +
>> +/*
>> + * Assign a token and reserve it for the new flush state.
>> + */
>> +static SpaprNVDIMMDeviceFlushState *spapr_nvdimm_init_new_flush_state(
>> +                                                SpaprMachineState *spapr)
>> +{
>> +    SpaprNVDIMMDeviceFlushState *state;
>> +
>> +    state = g_malloc0(sizeof(*state));
>> +
>> +    flush_token++;
>> +    /* Token zero is presumed as no job pending. Handle the overflow to zero */
>> +    if (flush_token == 0) {
>> +        flush_token++;
>
> Hmm... strictly speaking, this isn't safe. It's basically never going
> to happen in practice, but in theory there's nothing preventing
> continue_token 1 still being outstanding when the flush_token counter
> overflows. Come to think of it, since it's a uint64_t, I think an
> actual overflow is also never going to happen in practice. Maybe we
> should just assert() on overflow, and fix it in the unlikely event
> that we ever discover a case where it could happen.

Have added the assert on overflow.

>> +    }
>> +    state->continue_token = flush_token;
>> +
>> +    QLIST_INSERT_HEAD(&spapr->pending_flush_states, state, node);
>> +
>> +    return state;
>> +}
>> +
>> +/*
[...]

Thanks!
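Folding in the changes agreed above, the token allocation helper reduces to
something like the following sketch (per-device state as in the v6 patch,
not a verbatim quote of the final code):

    static SpaprNVDIMMDeviceFlushState *spapr_nvdimm_init_new_flush_state(
                                            SpaprNVDIMMDevice *s_nvdimm)
    {
        SpaprNVDIMMDeviceFlushState *state = g_malloc0(sizeof(*state));

        s_nvdimm->nvdimm_flush_token++;
        /* Token zero means no flush job is pending; assert on overflow */
        g_assert(s_nvdimm->nvdimm_flush_token != 0);

        state->continue_token = s_nvdimm->nvdimm_flush_token;
        QLIST_INSERT_HEAD(&s_nvdimm->pending_nvdimm_flush_states, state, node);

        return state;
    }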
Re: [PATCH REBASED v5 2/2] spapr: nvdimm: Introduce spapr-nvdimm device
On 9/21/21 12:02, David Gibson wrote:
> On Wed, Jul 07, 2021 at 09:57:31PM -0500, Shivaprasad G Bhat wrote:
>> If the device backend is not persistent memory for the nvdimm, there
>> is need for explicit IO flushes on the backend to ensure persistence.
>> On SPAPR, the issue is addressed by adding a new hcall to request for
>> an explicit flush from the guest when the backend is not pmem. So, the
>> approach here is to convey when the hcall flush is required in a
>> device tree property. The guest once it knows the device backend is
>> not pmem, makes the hcall whenever flush is required.
>>
>> To set the device tree property, the patch introduces a new papr
>> specific device type inheriting the nvdimm device. When the backend
>> doesn't have pmem="yes", the device tree property
>> "ibm,hcall-flush-required" is set, and the guest makes hcall
>> H_SCM_FLUSH requesting for an explicit flush.
>>
>> Signed-off-by: Shivaprasad G Bhat
>>
>> @@ -91,6 +93,14 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
>>          return false;
>>      }
>>
>> +    if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM) &&
>> +        (memory_region_get_fd(mr) < 0)) {
>> +        error_setg(errp, "spapr-nvdimm device requires the "
>> +                   "memdev %s to be of memory-backend-file type",
>> +                   object_get_canonical_path_component(OBJECT(dimm->hostmem)));
>
> It's not obvious to me why the spapr nvdimm device has an additional
> restriction here over the regular nvdimm device.

For memory-backend-ram the fd is set to -1, and the fdatasync would fail
later. This restriction is for preventing that hcall failure. Maybe it is
intentionally allowed with nvdimms for testing purposes. Let me know if
you want me to allow it with a dummy success return for the hcall.

>> +        return false;
>> +    }
>> +
>>      return true;
>>  }
>>
>> @@ -162,6 +172,21 @@ static int spapr_dt_nvdimm(SpaprMachineState *spapr, void *fdt,
>>                               "operating-system")));
>>      _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0));
>>
>> +    if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) {
>> +        bool is_pmem = false;
>> +#ifdef CONFIG_LIBPMEM
>> +        PCDIMMDevice *dimm = PC_DIMM(nvdimm);
>> +        HostMemoryBackend *hostmem = dimm->hostmem;
>> +
>> +        is_pmem = object_property_get_bool(OBJECT(hostmem), "pmem",
>> +                                           &error_abort);
>
> Presenting to the guest a property of the backend worries me slightly.
> How the backends are synchronized between the source and destination
> is out of scope for qemu: is there any possibility that we could
> migrate from a host where the backend is pmem to one where it is not
> (or the reverse). I think at the least we want a property on the
> spapr-nvdimm object which will override what's presented to the guest
> (which, yes, might mean lying to the guest). I think that could be
> important for testing, if nothing else.

Mixed configurations can be attempted on a nested setup itself. On a side
note, attempts to use pmem=on on a non-pmem backend are being deprecated
as that is unsafe pretension, effective commit cdcf766d0b0.

I see your point. Adding "pmem-override" (suggest if you have a better
name) to spapr-nvdimm can be helpful, so I am adding it to the
spapr-nvdimm device. With pmem-override "on", the device tree property is
added, allowing hcall-flush even when pmem=on for the backend. This works
for migration compatibility in such a setup.

>> +#endif
>> +        if (!is_pmem) {
>> +            _FDT(fdt_setprop(fdt, child_offset, "ibm,hcall-flush-required",
>> +                             NULL, 0));
>> +        }
>> +    }
>> +
>>      return child_offset;
>>  }
>>
>> @@ -585,7 +610,16 @@ static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr,
>>      }
>>
>>      dimm = PC_DIMM(drc->dev);
>> +    if (!object_dynamic_cast(OBJECT(dimm), TYPE_SPAPR_NVDIMM)) {
>> +        return H_PARAMETER;
>> +    }
>
> Hmm. If you're going to make flushes specific to spapr nvdimms, you
> could put the queue of pending flushes into the spapr-nvdimm object,
> rather than having a global list in the machine.

Yes. I have changed the patches to move all the flush specific data
structures into the spapr-nvdimm object.

>> +
>>      backend = MEMORY_BACKEND(dimm->hostmem);
>> +#ifdef CONFIG_LIBPMEM
>> +    if (object_property_get_bool(OBJECT(backend), "pmem", &error_abort)) {
>> +        return H_UNSUPPORTED;
>
> Could you make this not be UNSUPPORTED, but instead fake the flush for
> the pmem device? Either as a no-op, or simulating the guest invoking
> the right cpu cache flushes? That seems like it would be more useful:
> that way users who don't care too much about performance could just
> always do a flush hcall and not have to have another path for the
> "real" pmem case.

It would actually be wrong use for kernel to at[...]
[PATCH v6 0/3] spapr: nvdimm: Introduce spapr-nvdimm device
or fixes. v1 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg06330.html Changes from v1 - Fixed a missed-out unlock - using QLIST_FOREACH instead of QLIST_FOREACH_SAFE while generating token Shivaprasad G Bhat (3): nvdimm: Add realize, unrealize callbacks to NVDIMMDevice class spapr: nvdimm: Implement H_SCM_FLUSH hcall spapr: nvdimm: Introduce spapr-nvdimm device hw/mem/nvdimm.c | 16 ++ hw/mem/pc-dimm.c | 5 + hw/ppc/spapr.c| 2 + hw/ppc/spapr_nvdimm.c | 394 ++ include/hw/mem/nvdimm.h | 2 + include/hw/mem/pc-dimm.h | 1 + include/hw/ppc/spapr.h| 4 +- include/hw/ppc/spapr_nvdimm.h | 1 + 8 files changed, 424 insertions(+), 1 deletion(-) -- Signature
[PATCH v6 1/3] nvdimm: Add realize, unrealize callbacks to NVDIMMDevice class
A new subclass inheriting NVDIMMDevice is going to be introduced in subsequent patches. The new subclass uses the realize and unrealize callbacks. Add them on NVDIMMClass to appropriately call them as part of plug-unplug. Signed-off-by: Shivaprasad G Bhat --- hw/mem/nvdimm.c | 16 hw/mem/pc-dimm.c |5 + include/hw/mem/nvdimm.h |2 ++ include/hw/mem/pc-dimm.h |1 + 4 files changed, 24 insertions(+) diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index 7397b67156..59959d5563 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -181,10 +181,25 @@ static MemoryRegion *nvdimm_md_get_memory_region(MemoryDeviceState *md, static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp) { NVDIMMDevice *nvdimm = NVDIMM(dimm); +NVDIMMClass *ndc = NVDIMM_GET_CLASS(nvdimm); if (!nvdimm->nvdimm_mr) { nvdimm_prepare_memory_region(nvdimm, errp); } + +if (ndc->realize) { +ndc->realize(nvdimm, errp); +} +} + +static void nvdimm_unrealize(PCDIMMDevice *dimm) +{ +NVDIMMDevice *nvdimm = NVDIMM(dimm); +NVDIMMClass *ndc = NVDIMM_GET_CLASS(nvdimm); + +if (ndc->unrealize) { +ndc->unrealize(nvdimm); +} } /* @@ -240,6 +255,7 @@ static void nvdimm_class_init(ObjectClass *oc, void *data) DeviceClass *dc = DEVICE_CLASS(oc); ddc->realize = nvdimm_realize; +ddc->unrealize = nvdimm_unrealize; mdc->get_memory_region = nvdimm_md_get_memory_region; device_class_set_props(dc, nvdimm_properties); diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c index 48b913aba6..03bd0dd60e 100644 --- a/hw/mem/pc-dimm.c +++ b/hw/mem/pc-dimm.c @@ -216,6 +216,11 @@ static void pc_dimm_realize(DeviceState *dev, Error **errp) static void pc_dimm_unrealize(DeviceState *dev) { PCDIMMDevice *dimm = PC_DIMM(dev); +PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm); + +if (ddc->unrealize) { +ddc->unrealize(dimm); +} host_memory_backend_set_mapped(dimm->hostmem, false); } diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index bcf62f825c..cf8f59be44 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -103,6 +103,8 @@ struct NVDIMMClass { /* write @size bytes from @buf to NVDIMM label data at @offset. */ void (*write_label_data)(NVDIMMDevice *nvdimm, const void *buf, uint64_t size, uint64_t offset); +void (*realize)(NVDIMMDevice *nvdimm, Error **errp); +void (*unrealize)(NVDIMMDevice *nvdimm); }; #define NVDIMM_DSM_MEM_FILE "etc/acpi/nvdimm-mem" diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h index 1473e6db62..322bebe555 100644 --- a/include/hw/mem/pc-dimm.h +++ b/include/hw/mem/pc-dimm.h @@ -63,6 +63,7 @@ struct PCDIMMDeviceClass { /* public */ void (*realize)(PCDIMMDevice *dimm, Error **errp); +void (*unrealize)(PCDIMMDevice *dimm); }; void pc_dimm_pre_plug(PCDIMMDevice *dimm, MachineState *machine,
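To illustrate how a subclass is expected to consume these hooks, a derived
NVDIMM type populates the new callbacks from its class_init; a minimal
hypothetical sketch (the my_nvdimm_* names are made up, but the
spapr-nvdimm device added later in this series follows the same pattern):

    static void my_nvdimm_realize(NVDIMMDevice *nvdimm, Error **errp)
    {
        /* subclass-specific setup, invoked from nvdimm_realize() */
    }

    static void my_nvdimm_unrealize(NVDIMMDevice *nvdimm)
    {
        /* subclass-specific teardown, invoked from nvdimm_unrealize() */
    }

    static void my_nvdimm_class_init(ObjectClass *oc, void *data)
    {
        NVDIMMClass *nvc = NVDIMM_CLASS(oc);

        nvc->realize = my_nvdimm_realize;
        nvc->unrealize = my_nvdimm_unrealize;
    }

    static const TypeInfo my_nvdimm_info = {
        .name       = "my-nvdimm",          /* hypothetical type name */
        .parent     = TYPE_NVDIMM,
        .class_init = my_nvdimm_class_init,
    };

    static void my_nvdimm_register_types(void)
    {
        type_register_static(&my_nvdimm_info);
    }

    type_init(my_nvdimm_register_types)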
[PATCH v6 3/3] spapr: nvdimm: Introduce spapr-nvdimm device
If the device backend is not persistent memory, the nvdimm needs explicit
IO flushes on the backend to ensure persistence. On SPAPR, the issue is
addressed by adding a new hcall with which the guest requests an explicit
flush when the backend is not pmem. So, the approach here is to convey,
via a device tree property, when the hcall flush is required. Once the
guest knows the device backend is not pmem, it makes the hcall whenever a
flush is required.

To set the device tree property, a new PAPR-specific device type
inheriting the nvdimm device is implemented. When the backend doesn't have
pmem=on, the device tree property "ibm,hcall-flush-required" is set, and
the guest makes the H_SCM_FLUSH hcall to request an explicit flush. The
new device has a boolean property pmem-override which, when "on",
advertises the device tree property even when the backend has pmem=on. The
flush function invokes fdatasync() or pmem_persist() based on the type of
the backend.

The vmstate structures are made part of the spapr-nvdimm device object.
The patch attempts to keep migration compatibility between source and
destination while rejecting incompatible ones with failures.

Signed-off-by: Shivaprasad G Bhat
---
 hw/ppc/spapr_nvdimm.c |  131 +
 1 file changed, 131 insertions(+)

diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index ed6fda2c23..8aa6214d6b 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -34,6 +34,7 @@
 #include "block/thread-pool.h"
 #include "migration/vmstate.h"
 #include "qemu/pmem.h"
+#include "hw/qdev-properties.h"
 
 /* DIMM health bitmap bitmap indicators. Taken from kernel's papr_scm.c */
 /* SCM device is unable to persist memory contents */
@@ -57,6 +58,10 @@ OBJECT_DECLARE_TYPE(SpaprNVDIMMDevice, SPAPRNVDIMMClass, SPAPR_NVDIMM)
 struct SPAPRNVDIMMClass {
     /* private */
     NVDIMMClass parent_class;
+
+    /* public */
+    void (*realize)(NVDIMMDevice *dimm, Error **errp);
+    void (*unrealize)(NVDIMMDevice *dimm, Error **errp);
 };
 
 bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
@@ -64,6 +69,8 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
 {
     const MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
     const MachineState *ms = MACHINE(hotplug_dev);
+    PCDIMMDevice *dimm = PC_DIMM(nvdimm);
+    MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
     g_autofree char *uuidstr = NULL;
     QemuUUID uuid;
     int ret;
@@ -101,6 +108,14 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
         return false;
     }
 
+    if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM) &&
+        (memory_region_get_fd(mr) < 0)) {
+        error_setg(errp, "spapr-nvdimm device requires the "
+                   "memdev %s to be of memory-backend-file type",
+                   object_get_canonical_path_component(OBJECT(dimm->hostmem)));
+        return false;
+    }
+
     return true;
 }
 
@@ -172,6 +187,20 @@ static int spapr_dt_nvdimm(SpaprMachineState *spapr, void *fdt,
                              "operating-system")));
     _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0));
 
+    if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) {
+        bool is_pmem = false, pmem_override = false;
+        PCDIMMDevice *dimm = PC_DIMM(nvdimm);
+        HostMemoryBackend *hostmem = dimm->hostmem;
+
+        is_pmem = object_property_get_bool(OBJECT(hostmem), "pmem", NULL);
+        pmem_override = object_property_get_bool(OBJECT(nvdimm),
+                                                 "pmem-override", NULL);
+        if (!is_pmem || pmem_override) {
+            _FDT(fdt_setprop(fdt, child_offset, "ibm,hcall-flush-required",
+                             NULL, 0));
+        }
+    }
+
     return child_offset;
 }
 
@@ ... @@ typedef struct SpaprNVDIMMDeviceFlushState {
 
 typedef struct SpaprNVDIMMDevice SpaprNVDIMMDevice;
 struct SpaprNVDIMMDevice {
+    /* private */
     NVDIMMDevice parent_obj;
 
+    bool hcall_flush_required;
     uint64_t nvdimm_flush_token;
     QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) pending_nvdimm_flush_states;
     QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) completed_nvdimm_flush_states;
+
+    /* public */
+
+    /*
+     * The 'on' value for this property forces QEMU to enable the hcall
+     * flush for the nvdimm device even if the backend is a pmem
+     */
+    bool pmem_override;
 };
 
 static int flush_worker_cb(void *opaque)
@@ -449,6 +488,23 @@ static int spapr_nvdimm_flush_post_load(void *opaque, int version_id)
     SpaprNVDIMMDeviceFlushState *state;
     HostMemoryBackend *backend = MEMORY_BACKEND(PC_DIMM(s_nvdimm)->hostmem);
[PATCH v7 0/3] spapr: nvdimm: Introduce spapr-nvdimm device
ling code to spapr_nvdimm.c along with some simplifications - Added vmstate to preserve the hcall status during save-restore along with pre_save handler code to complete all ongoning flushes. - Added hw_compat magic for sync-dax 'on' on previous machines. - Miscellanious minor fixes. v1 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg06330.html Changes from v1 - Fixed a missed-out unlock - using QLIST_FOREACH instead of QLIST_FOREACH_SAFE while generating token Shivaprasad G Bhat (3): nvdimm: Add realize, unrealize callbacks to NVDIMMDevice class spapr: nvdimm: Implement H_SCM_FLUSH hcall spapr: nvdimm: Introduce spapr-nvdimm device hw/mem/nvdimm.c | 16 ++ hw/mem/pc-dimm.c | 5 + hw/ppc/spapr.c| 2 + hw/ppc/spapr_nvdimm.c | 394 ++ include/hw/mem/nvdimm.h | 2 + include/hw/mem/pc-dimm.h | 1 + include/hw/ppc/spapr.h| 4 +- include/hw/ppc/spapr_nvdimm.h | 1 + 8 files changed, 424 insertions(+), 1 deletion(-) -- Signature
[PATCH v7 1/3] nvdimm: Add realize, unrealize callbacks to NVDIMMDevice class
A new subclass inheriting NVDIMMDevice is going to be introduced in subsequent patches. The new subclass uses the realize and unrealize callbacks. Add them on NVDIMMClass to appropriately call them as part of plug-unplug. Signed-off-by: Shivaprasad G Bhat Acked-by: Daniel Henrique Barboza --- hw/mem/nvdimm.c | 16 hw/mem/pc-dimm.c |5 + include/hw/mem/nvdimm.h |2 ++ include/hw/mem/pc-dimm.h |1 + 4 files changed, 24 insertions(+) diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index 7397b67156..59959d5563 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -181,10 +181,25 @@ static MemoryRegion *nvdimm_md_get_memory_region(MemoryDeviceState *md, static void nvdimm_realize(PCDIMMDevice *dimm, Error **errp) { NVDIMMDevice *nvdimm = NVDIMM(dimm); +NVDIMMClass *ndc = NVDIMM_GET_CLASS(nvdimm); if (!nvdimm->nvdimm_mr) { nvdimm_prepare_memory_region(nvdimm, errp); } + +if (ndc->realize) { +ndc->realize(nvdimm, errp); +} +} + +static void nvdimm_unrealize(PCDIMMDevice *dimm) +{ +NVDIMMDevice *nvdimm = NVDIMM(dimm); +NVDIMMClass *ndc = NVDIMM_GET_CLASS(nvdimm); + +if (ndc->unrealize) { +ndc->unrealize(nvdimm); +} } /* @@ -240,6 +255,7 @@ static void nvdimm_class_init(ObjectClass *oc, void *data) DeviceClass *dc = DEVICE_CLASS(oc); ddc->realize = nvdimm_realize; +ddc->unrealize = nvdimm_unrealize; mdc->get_memory_region = nvdimm_md_get_memory_region; device_class_set_props(dc, nvdimm_properties); diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c index 48b913aba6..03bd0dd60e 100644 --- a/hw/mem/pc-dimm.c +++ b/hw/mem/pc-dimm.c @@ -216,6 +216,11 @@ static void pc_dimm_realize(DeviceState *dev, Error **errp) static void pc_dimm_unrealize(DeviceState *dev) { PCDIMMDevice *dimm = PC_DIMM(dev); +PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm); + +if (ddc->unrealize) { +ddc->unrealize(dimm); +} host_memory_backend_set_mapped(dimm->hostmem, false); } diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index bcf62f825c..cf8f59be44 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -103,6 +103,8 @@ struct NVDIMMClass { /* write @size bytes from @buf to NVDIMM label data at @offset. */ void (*write_label_data)(NVDIMMDevice *nvdimm, const void *buf, uint64_t size, uint64_t offset); +void (*realize)(NVDIMMDevice *nvdimm, Error **errp); +void (*unrealize)(NVDIMMDevice *nvdimm); }; #define NVDIMM_DSM_MEM_FILE "etc/acpi/nvdimm-mem" diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h index 1473e6db62..322bebe555 100644 --- a/include/hw/mem/pc-dimm.h +++ b/include/hw/mem/pc-dimm.h @@ -63,6 +63,7 @@ struct PCDIMMDeviceClass { /* public */ void (*realize)(PCDIMMDevice *dimm, Error **errp); +void (*unrealize)(PCDIMMDevice *dimm); }; void pc_dimm_pre_plug(PCDIMMDevice *dimm, MachineState *machine,
[PATCH v7 2/3] spapr: nvdimm: Implement H_SCM_FLUSH hcall
The patch adds support for the H_SCM_FLUSH hcall for nvdimm devices, to be
exploited by the guest through the next patch. The hcall is applicable only
to the new SPAPR-specific device class, which is also introduced in this
patch.

The hcall semantics are such that the flush returns H_LONG_BUSY_ORDER_10_MSEC
along with a continue_token when the operation is expected to take longer.
The hcall is then to be called again with the continue_token to get the
status. So, all fresh requests are put into a 'pending' list and a flush
worker is submitted to the thread pool. The thread pool completion callbacks
move the requests to a 'completed' list, which is cleaned up after the return
status has been collected for the guest in a subsequent hcall.

These semantics make it necessary to preserve the continue_tokens and their
return status across migration. So, the completed flush states are forwarded
to the destination, and the pending ones are restarted at the destination in
post_load. The necessary nvdimm flush specific vmstate structures are also
introduced in this patch; they are saved in the new SPAPR-specific nvdimm
device to be introduced in the following patch.

Signed-off-by: Shivaprasad G Bhat
---
 hw/ppc/spapr.c                |    2 
 hw/ppc/spapr_nvdimm.c         |  260 +
 include/hw/ppc/spapr.h        |    4 -
 include/hw/ppc/spapr_nvdimm.h |    1 
 4 files changed, 266 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 3d6ec309dd..9263985663 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1634,6 +1634,8 @@ static void spapr_machine_reset(MachineState *machine)
         spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
     }
 
+    spapr_nvdimm_finish_flushes();
+
     /* DRC reset may cause a device to be unplugged. This will cause troubles
      * if this device is used by another device (eg, a running vhost backend
      * will crash QEMU if the DIMM holding the vring goes away). To avoid such
diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 91de1052f2..ac44e00153 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -22,6 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "qapi/error.h"
 #include "hw/ppc/spapr_drc.h"
 #include "hw/ppc/spapr_nvdimm.h"
@@ -30,6 +31,9 @@
 #include "hw/ppc/fdt.h"
 #include "qemu/range.h"
 #include "hw/ppc/spapr_numa.h"
+#include "block/thread-pool.h"
+#include "migration/vmstate.h"
+#include "qemu/pmem.h"
 
 /* DIMM health bitmap bitmap indicators. Taken from kernel's papr_scm.c */
 /* SCM device is unable to persist memory contents */
@@ -47,6 +51,14 @@
 /* Have an explicit check for alignment */
 QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE);
 
+#define TYPE_SPAPR_NVDIMM "spapr-nvdimm"
+OBJECT_DECLARE_TYPE(SpaprNVDIMMDevice, SPAPRNVDIMMClass, SPAPR_NVDIMM)
+
+struct SPAPRNVDIMMClass {
+    /* private */
+    NVDIMMClass parent_class;
+};
+
 bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
                            uint64_t size, Error **errp)
 {
@@ -375,6 +387,253 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
     return H_SUCCESS;
 }
 
+typedef struct SpaprNVDIMMDeviceFlushState {
+    uint64_t continue_token;
+    int64_t hcall_ret;
+    uint32_t drcidx;
+
+    QLIST_ENTRY(SpaprNVDIMMDeviceFlushState) node;
+} SpaprNVDIMMDeviceFlushState;
+
+typedef struct SpaprNVDIMMDevice SpaprNVDIMMDevice;
+struct SpaprNVDIMMDevice {
+    NVDIMMDevice parent_obj;
+
+    uint64_t nvdimm_flush_token;
+    QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) pending_nvdimm_flush_states;
+    QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) completed_nvdimm_flush_states;
+};
+
+static int flush_worker_cb(void *opaque)
+{
+    SpaprNVDIMMDeviceFlushState *state = opaque;
+    SpaprDrc *drc = spapr_drc_by_index(state->drcidx);
+    PCDIMMDevice *dimm = PC_DIMM(drc->dev);
+    HostMemoryBackend *backend = MEMORY_BACKEND(dimm->hostmem);
+    int backend_fd = memory_region_get_fd(&backend->mr);
+
+    if (object_property_get_bool(OBJECT(backend), "pmem", NULL)) {
+        MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
+        void *ptr = memory_region_get_ram_ptr(mr);
+        size_t size = object_property_get_uint(OBJECT(dimm), PC_DIMM_SIZE_PROP,
+                                               NULL);
+
+        /* flush pmem backend */
+        pmem_persist(ptr, size);
+    } else {
+        /* flush raw backing image */
+        if (qemu_fdatasync(backend_fd) < 0) {
+            error_report("papr_scm: Could not sync nvdimm to backend file: %s",
+                         strerror(errno));
+            return H_HARDWARE;
+        }
+    }
+
+    return H_SUCCESS;
+}
[PATCH v7 3/3] spapr: nvdimm: Introduce spapr-nvdimm device
If the device backend is not persistent memory, the nvdimm needs explicit
IO flushes on the backend to ensure persistence. On SPAPR, the issue is
addressed by adding a new hcall with which the guest requests an explicit
flush when the backend is not pmem. So, the approach here is to convey,
via a device tree property, when the hcall flush is required. Once the
guest knows the device backend is not pmem, it makes the hcall whenever a
flush is required.

To set the device tree property, a new PAPR-specific device type
inheriting the nvdimm device is implemented. When the backend doesn't have
pmem=on, the device tree property "ibm,hcall-flush-required" is set, and
the guest makes the H_SCM_FLUSH hcall to request an explicit flush. The
new device has a boolean property pmem-override which, when "on",
advertises the device tree property even when the backend has pmem=on. The
flush function invokes fdatasync() or pmem_persist() based on the type of
the backend.

The vmstate structures are made part of the spapr-nvdimm device object.
The patch attempts to keep migration compatibility between source and
destination while rejecting incompatible ones with failures.

Signed-off-by: Shivaprasad G Bhat
Reviewed-by: Daniel Henrique Barboza
---
 hw/ppc/spapr_nvdimm.c |  132 +
 1 file changed, 132 insertions(+)

diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index ac44e00153..c4c97da5de 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -34,6 +34,7 @@
 #include "block/thread-pool.h"
 #include "migration/vmstate.h"
 #include "qemu/pmem.h"
+#include "hw/qdev-properties.h"
 
 /* DIMM health bitmap bitmap indicators. Taken from kernel's papr_scm.c */
 /* SCM device is unable to persist memory contents */
@@ -57,6 +58,10 @@ OBJECT_DECLARE_TYPE(SpaprNVDIMMDevice, SPAPRNVDIMMClass, SPAPR_NVDIMM)
 struct SPAPRNVDIMMClass {
     /* private */
     NVDIMMClass parent_class;
+
+    /* public */
+    void (*realize)(NVDIMMDevice *dimm, Error **errp);
+    void (*unrealize)(NVDIMMDevice *dimm, Error **errp);
 };
 
 bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
@@ -64,6 +69,8 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
 {
     const MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
     const MachineState *ms = MACHINE(hotplug_dev);
+    PCDIMMDevice *dimm = PC_DIMM(nvdimm);
+    MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
     g_autofree char *uuidstr = NULL;
     QemuUUID uuid;
     int ret;
@@ -101,6 +108,14 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
         return false;
     }
 
+    if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM) &&
+        (memory_region_get_fd(mr) < 0)) {
+        error_setg(errp, "spapr-nvdimm device requires the "
+                   "memdev %s to be of memory-backend-file type",
+                   object_get_canonical_path_component(OBJECT(dimm->hostmem)));
+        return false;
+    }
+
     return true;
 }
 
@@ -172,6 +187,20 @@ static int spapr_dt_nvdimm(SpaprMachineState *spapr, void *fdt,
                              "operating-system")));
     _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0));
 
+    if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) {
+        bool is_pmem = false, pmem_override = false;
+        PCDIMMDevice *dimm = PC_DIMM(nvdimm);
+        HostMemoryBackend *hostmem = dimm->hostmem;
+
+        is_pmem = object_property_get_bool(OBJECT(hostmem), "pmem", NULL);
+        pmem_override = object_property_get_bool(OBJECT(nvdimm),
+                                                 "pmem-override", NULL);
+        if (!is_pmem || pmem_override) {
+            _FDT(fdt_setprop(fdt, child_offset, "ibm,hcall-flush-required",
+                             NULL, 0));
+        }
+    }
+
     return child_offset;
 }
 
@@ -397,11 +426,21 @@ typedef struct SpaprNVDIMMDeviceFlushState {
 
 typedef struct SpaprNVDIMMDevice SpaprNVDIMMDevice;
 struct SpaprNVDIMMDevice {
+    /* private */
     NVDIMMDevice parent_obj;
 
+    bool hcall_flush_required;
     uint64_t nvdimm_flush_token;
     QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) pending_nvdimm_flush_states;
     QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) completed_nvdimm_flush_states;
+
+    /* public */
+
+    /*
+     * The 'on' value for this property forces QEMU to enable the hcall
+     * flush for the nvdimm device even if the backend is a pmem
+     */
+    bool pmem_override;
 };
 
 static int flush_worker_cb(void *opaque)
@@ -448,6 +487,24 @@ static int spapr_nvdimm_flush_post_load(void *opaque, int version_id)
     SpaprNVDIMMDevice *s_nvdimm = (SpaprNVDIMMDevice *)opaque;
     SpaprNVDIMMDeviceFlushState *state;
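For reference, a hypothetical invocation exercising the new device and the
pmem-override property might look like the following (paths, sizes and IDs
are placeholders, and label-size is the usual PAPR nvdimm requirement):

    qemu-system-ppc64 -machine pseries,nvdimm=on \
        -m 4G,slots=2,maxmem=8G \
        -object memory-backend-file,id=mem0,mem-path=/path/to/backing.img,size=2G \
        -device spapr-nvdimm,id=nv0,memdev=mem0,label-size=128K,pmem-override=on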
Re: [PATCH] vfio: container: Fix missing allocation of VFIOSpaprContainer
On 6/21/24 2:19 PM, Cédric Le Goater wrote: Could you please describe the host/guest OS, hypervisor, processor and adapter ? Here is the environment info, pSeries: Host : Power10 PowerVM Lpar Kernel: Upstream 6.10.0-rc4 + VFIO fixes posted at 171810893836.1721.2640631616827396553.st...@linux.ibm.com Hypervisor : KVM on PowerVM & also tried without KVM using TCG Guest : 6.8.5-301.fc40.ppc64le Fedora 40 distro kernel Adapter: Non-Volatile memory controller: Samsung Electronics Co Ltd NVMe SSD Controller PM173X PowerNV: Host: Power9 Baremetal Kernel: kernel-core-6.9.4-200 - Fedora 40 distro kernel Hypervisor: KVM Guest : 6.8.5-301.fc40.ppc64le - Fedora 40 distro kernel Adapter: Non-Volatile memory controller: Samsung Electronics Co Ltd NVMe SSD Controller PM173X Thanks, Shivaprasad Thanks, C.
Re: [PATCH] vfio: container: Fix missing allocation of VFIOSpaprContainer
On 6/21/24 8:40 PM, Cédric Le Goater wrote: On 6/21/24 4:47 PM, Shivaprasad G Bhat wrote: On 6/21/24 2:19 PM, Cédric Le Goater wrote: Could you please describe the host/guest OS, hypervisor, processor and adapter ? Here is the environment info, pSeries: Host : Power10 PowerVM Lpar Kernel: Upstream 6.10.0-rc4 + VFIO fixes posted at 171810893836.1721.2640631616827396553.st...@linux.ibm.com Great. You should report there too and probably send a PR to Alex to contribute your changes to the vfio tests. Could you clarify which tree you are referring to ? I see his tree https://github.com/awilliam/tests is bit old and updated recently, however I have been using those tests for my unit testing. Hypervisor : KVM on PowerVM & OK. So, this is using the newer nested v2 implementation. Yes. However, this was working for userspace before too with limitations like DMA windows were being borrowed, and no customization of window size etc. With the legacy XICS IRQ controller or XIVE ? in-kernel device or emulated ? Emulated XIVE. also tried without KVM using TCG Ah nice. Good to know that real HW passthrough works in TCG also. Guest : 6.8.5-301.fc40.ppc64le Fedora 40 distro kernel Adapter: Non-Volatile memory controller: Samsung Electronics Co Ltd NVMe SSD Controller PM173X PowerNV: Host: Power9 Baremetal Kernel: kernel-core-6.9.4-200 - Fedora 40 distro kernel Is there a requirement on the kernel version ? Would an older debian 6.1 work for instance ? This went through cycles of breakage and fixes. It worked on 5.18(not sure about older ones before that), and broke afterwards. Recently fixed and working from 6.4, broken on 6.7. Fixed and working in 6.8 onwards now. Hypervisor: KVM Guest : 6.8.5-301.fc40.ppc64le - Fedora 40 distro kernel Adapter: Non-Volatile memory controller: Samsung Electronics Co Ltd NVMe SSD Controller PM173X Nice. XIVE I suppose. Yes. What about TCG ? Yes, TCG too works, missed to mention. Thanks, Shivaprasad Thanks a lot, C.
Re: [PATCH] vfio: container: Fix missing allocation of VFIOSpaprContainer
On 6/28/24 4:07 PM, Cédric Le Goater wrote: ... Could you clarify which tree you are referring to ? I see his tree https://github.com/awilliam/tests is bit old and updated recently, however I have been using those tests for my unit testing. Yes, this tree. Thanks! ... This went through cycles of breakage and fixes. It worked on 5.18(not sure about older ones before that), and broke afterwards. Recently fixed and working from 6.4, broken on 6.7. Fixed and working in 6.8 onwards now. Good. It should be fixed in the next debian. Yes, TCG too works, missed to mention. and a TCG guest under an intel host ? This used to work. Yes. pSeries TCG guest on intel host works too. Regards, Shivaprasad
[PATCH 1/2] target/ppc/cpu_init: Synchronize DEXCR with KVM for migration
The patch enables DEXCR migration by hooking with the "KVM one reg" ID KVM_REG_PPC_DEXCR. Signed-off-by: Shivaprasad G Bhat --- linux-headers/asm-powerpc/kvm.h |1 + target/ppc/cpu_init.c |4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/linux-headers/asm-powerpc/kvm.h b/linux-headers/asm-powerpc/kvm.h index 1691297a76..fcb947f656 100644 --- a/linux-headers/asm-powerpc/kvm.h +++ b/linux-headers/asm-powerpc/kvm.h @@ -645,6 +645,7 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_SIER3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc3) #define KVM_REG_PPC_DAWR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc4) #define KVM_REG_PPC_DAWRX1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc5) +#define KVM_REG_PPC_DEXCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc6) /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c index c11a69fd90..b1422c2eab 100644 --- a/target/ppc/cpu_init.c +++ b/target/ppc/cpu_init.c @@ -5818,9 +5818,9 @@ static void register_power10_hash_sprs(CPUPPCState *env) static void register_power10_dexcr_sprs(CPUPPCState *env) { -spr_register(env, SPR_DEXCR, "DEXCR", +spr_register_kvm(env, SPR_DEXCR, "DEXCR", SPR_NOACCESS, SPR_NOACCESS, -&spr_read_generic, &spr_write_generic, +&spr_read_generic, &spr_write_generic, KVM_REG_PPC_DEXCR, 0); spr_register(env, SPR_UDEXCR, "UDEXCR",
[PATCH 2/2] target/ppc/cpu_init: Synchronize HASHKEYR with KVM for migration
The patch enables HASHKEYR migration by hooking with the "KVM one reg" ID KVM_REG_PPC_HASHKEYR. Signed-off-by: Shivaprasad G Bhat --- linux-headers/asm-powerpc/kvm.h |1 + target/ppc/cpu_init.c |4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/linux-headers/asm-powerpc/kvm.h b/linux-headers/asm-powerpc/kvm.h index fcb947f656..23a0af739c 100644 --- a/linux-headers/asm-powerpc/kvm.h +++ b/linux-headers/asm-powerpc/kvm.h @@ -646,6 +646,7 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_DAWR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc4) #define KVM_REG_PPC_DAWRX1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc5) #define KVM_REG_PPC_DEXCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc6) +#define KVM_REG_PPC_HASHKEYR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc7) /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c index b1422c2eab..cee0a609eb 100644 --- a/target/ppc/cpu_init.c +++ b/target/ppc/cpu_init.c @@ -5805,10 +5805,10 @@ static void register_power10_hash_sprs(CPUPPCState *env) ((uint64_t)g_rand_int(rand) << 32) | (uint64_t)g_rand_int(rand); g_rand_free(rand); #endif -spr_register(env, SPR_HASHKEYR, "HASHKEYR", +spr_register_kvm(env, SPR_HASHKEYR, "HASHKEYR", SPR_NOACCESS, SPR_NOACCESS, &spr_read_generic, &spr_write_generic, -hashkeyr_initial_value); +KVM_REG_PPC_HASHKEYR, hashkeyr_initial_value); spr_register_hv(env, SPR_HASHPKEYR, "HASHPKEYR", SPR_NOACCESS, SPR_NOACCESS, SPR_NOACCESS, SPR_NOACCESS,
[PATCH 0/2] ppc: spapr: Nested kvm guest migration fixes
The series fixes the issues exposed by the kvm-unit-tests[1] sprs-migration
test. The SPRs DEXCR and HASHKEYR are not registered with one-reg IDs,
without which QEMU does not set them to their 'previous' values at the
destination during guest migration. The two patches in the series take care
of this.

Also, the PPC KVM header changes are selectively picked for the required
definitions posted here at [2].

References:
[1]: https://github.com/kvm-unit-tests/kvm-unit-tests
[2]: https://lore.kernel.org/kvm/171741323521.6631.11242552089199677395.st...@linux.ibm.com

---

Shivaprasad G Bhat (2):
      target/ppc/cpu_init: Synchronize DEXCR with KVM for migration
      target/ppc/cpu_init: Synchronize HASHKEYR with KVM for migration

 linux-headers/asm-powerpc/kvm.h | 2 ++
 target/ppc/cpu_init.c           | 8 
 2 files changed, 6 insertions(+), 4 deletions(-)

--
Signature
[PATCH v2 0/4] ppc: spapr: Nested kvm guest migration fixes
The series fixes the issues exposed by the kvm-unit-tests[1] sprs-migration
test. The SPRs DEXCR, HASHKEYR and HASHPKEYR are not registered with one-reg
IDs, without which QEMU does not set them to their 'previous' values before
vcpu run or at the destination during migration. The first patch updates the
Linux header with the IDs[2] for current use until a complete update post
kernel release. The remaining three patches in the series take care of
registering them with KVM.

References:
[1]: https://github.com/kvm-unit-tests/kvm-unit-tests
[2]: https://lore.kernel.org/kvm/171759276071.1480.9356137231993600304.st...@linux.ibm.com

---

Changelog:
v1: https://lore.kernel.org/qemu-devel/171741555734.11675.17428208097186191736.stgit@c0c876608f2d/
 - Moved the linux header changes to a separate patch adding definitions
   for all the required one-reg ids together.
 - Added one-reg ID for HASHPKEYR as suggested

Shivaprasad G Bhat (4):
      linux-header: PPC: KVM: Update one-reg ids for DEXCR, HASHKEYR and HASHPKEYR
      target/ppc/cpu_init: Synchronize DEXCR with KVM for migration
      target/ppc/cpu_init: Synchronize HASHKEYR with KVM for migration
      target/ppc/cpu_init: Synchronize HASHPKEYR with KVM for migration

 linux-headers/asm-powerpc/kvm.h |  3 +++
 target/ppc/cpu_init.c           | 12 ++--
 2 files changed, 9 insertions(+), 6 deletions(-)

--
Signature
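For background, registering an SPR through spr_register_kvm() records a
one-reg ID in the SPR's callback table, and the generic sync code then moves
the value through the KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls. A simplified
sketch of that machinery (illustrative only; the real code lives in
target/ppc/kvm.c and also handles 32-bit register sizes):

    /* Sketch: pull one SPR value from KVM into env, if it has a one-reg ID */
    static int kvm_sync_one_spr(CPUState *cs, CPUPPCState *env, int spr)
    {
        uint64_t id = env->spr_cb[spr].one_reg_id;
        uint64_t val;
        int ret;

        if (!id) {
            /* no one-reg ID registered: silently skipped, which is
             * exactly the gap this series closes for DEXCR and friends */
            return 0;
        }

        ret = kvm_get_one_reg(cs, id, &val);
        if (ret == 0) {
            env->spr[spr] = val;
        }
        return ret;
    }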
[PATCH v2 1/4] linux-header: PPC: KVM: Update one-reg ids for DEXCR, HASHKEYR and HASHPKEYR
This is a placeholder change for these SPRs until the full linux header update. Signed-off-by: Shivaprasad G Bhat --- linux-headers/asm-powerpc/kvm.h |3 +++ 1 file changed, 3 insertions(+) diff --git a/linux-headers/asm-powerpc/kvm.h b/linux-headers/asm-powerpc/kvm.h index 1691297a76..eaeda00178 100644 --- a/linux-headers/asm-powerpc/kvm.h +++ b/linux-headers/asm-powerpc/kvm.h @@ -645,6 +645,9 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_SIER3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc3) #define KVM_REG_PPC_DAWR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc4) #define KVM_REG_PPC_DAWRX1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc5) +#define KVM_REG_PPC_DEXCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc6) +#define KVM_REG_PPC_HASHKEYR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc7) +#define KVM_REG_PPC_HASHPKEYR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc8) /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs
[PATCH v2 3/4] target/ppc/cpu_init: Synchronize HASHKEYR with KVM for migration
The patch enables HASHKEYR migration by hooking with the "KVM one reg" ID KVM_REG_PPC_HASHKEYR. Signed-off-by: Shivaprasad G Bhat --- target/ppc/cpu_init.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c index b1422c2eab..cee0a609eb 100644 --- a/target/ppc/cpu_init.c +++ b/target/ppc/cpu_init.c @@ -5805,10 +5805,10 @@ static void register_power10_hash_sprs(CPUPPCState *env) ((uint64_t)g_rand_int(rand) << 32) | (uint64_t)g_rand_int(rand); g_rand_free(rand); #endif -spr_register(env, SPR_HASHKEYR, "HASHKEYR", +spr_register_kvm(env, SPR_HASHKEYR, "HASHKEYR", SPR_NOACCESS, SPR_NOACCESS, &spr_read_generic, &spr_write_generic, -hashkeyr_initial_value); +KVM_REG_PPC_HASHKEYR, hashkeyr_initial_value); spr_register_hv(env, SPR_HASHPKEYR, "HASHPKEYR", SPR_NOACCESS, SPR_NOACCESS, SPR_NOACCESS, SPR_NOACCESS,
[PATCH v2 2/4] target/ppc/cpu_init: Synchronize DEXCR with KVM for migration
The patch enables DEXCR migration by hooking with the "KVM one reg" ID KVM_REG_PPC_DEXCR. Signed-off-by: Shivaprasad G Bhat --- target/ppc/cpu_init.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c index c11a69fd90..b1422c2eab 100644 --- a/target/ppc/cpu_init.c +++ b/target/ppc/cpu_init.c @@ -5818,9 +5818,9 @@ static void register_power10_hash_sprs(CPUPPCState *env) static void register_power10_dexcr_sprs(CPUPPCState *env) { -spr_register(env, SPR_DEXCR, "DEXCR", +spr_register_kvm(env, SPR_DEXCR, "DEXCR", SPR_NOACCESS, SPR_NOACCESS, -&spr_read_generic, &spr_write_generic, +&spr_read_generic, &spr_write_generic, KVM_REG_PPC_DEXCR, 0); spr_register(env, SPR_UDEXCR, "UDEXCR",
[PATCH v2 4/4] target/ppc/cpu_init: Synchronize HASHPKEYR with KVM for migration
The patch enables HASHPKEYR migration by hooking with the "KVM one reg" ID KVM_REG_PPC_HASHPKEYR. Signed-off-by: Shivaprasad G Bhat --- target/ppc/cpu_init.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c index cee0a609eb..e6ebc0cef0 100644 --- a/target/ppc/cpu_init.c +++ b/target/ppc/cpu_init.c @@ -5809,11 +5809,11 @@ static void register_power10_hash_sprs(CPUPPCState *env) SPR_NOACCESS, SPR_NOACCESS, &spr_read_generic, &spr_write_generic, KVM_REG_PPC_HASHKEYR, hashkeyr_initial_value); -spr_register_hv(env, SPR_HASHPKEYR, "HASHPKEYR", +spr_register_kvm_hv(env, SPR_HASHPKEYR, "HASHPKEYR", SPR_NOACCESS, SPR_NOACCESS, SPR_NOACCESS, SPR_NOACCESS, &spr_read_generic, &spr_write_generic, -hashpkeyr_initial_value); +KVM_REG_PPC_HASHPKEYR, hashpkeyr_initial_value); } static void register_power10_dexcr_sprs(CPUPPCState *env)
Re: [PATCH] vfio: container: Fix missing allocation of VFIOSpaprContainer
On 5/13/24 17:53, Cédric Le Goater wrote: Hello Shivaprasad, On 5/9/24 21:14, Shivaprasad G Bhat wrote: The commit 6ad359ec29 "(vfio/spapr: Move prereg_listener into spapr container)" began to use the newly introduced VFIOSpaprContainer structure. After several refactors, today the container_of(container, VFIOSpaprContainer, ABC) is used when VFIOSpaprContainer is actually not allocated. On PPC64 systems, this dereference is leading to corruption showing up as glibc malloc assertion during guest start when using vfio. Patch adds the missing allocation while also making the structure movement to vfio common header file. Fixes: 6ad359ec29 "(vfio/spapr: Move prereg_listener into spapr container)" Signed-off-by: Shivaprasad G Bhat --- hw/vfio/container.c | 6 -- hw/vfio/spapr.c | 6 -- include/hw/vfio/vfio-common.h | 6 ++ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 77bdec276e..ecaf5786d9 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -539,6 +539,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, { VFIOContainer *container; VFIOContainerBase *bcontainer; + VFIOSpaprContainer *scontainer; We should do our best to avoid any direct use of ppc related attributes in the common VFIO code. This comment also applies to VFIO_SPAPR_TCE* which are still there because the clean up is not finished. So, this proposal will have to be reworked. Sure. The first step is to finish the QOMification of VFIOContainer, so that the VFIOContainer instance is created in vfio_connect_container() with : container = qdev_new(iommu_type_name); This requires the VFIOContainer to be a DeviceState object. The existing base class TYPE_VFIO_IOMMU is an InterfaceClass. I attempted VFIOContainer object declaration with TYPE_VFIO_IOMMU, like OBJECT_DECLARE_SIMPLE_TYPE(VFIOContainer, VFIO_IOMMU_LEGACY) This means reworking this part (and vfio_set_iommu()) : ... container = g_malloc0(sizeof(*container)); container->fd = fd; bcontainer = &container->bcontainer; if (!vfio_set_iommu(container, group->fd, space, errp)) { goto free_container_exit; } ... VFIOSpaprContainer can then implement its own .init_instance() handler to allocate/initialize attributes required by the pseries machines. With my above changes, I see the instance_init() is not supported for the InterfaceClass with the checks from below commit 422ca1432f7b44f2a9f3ad94a65d36927da021fa Author: Marc-André Lureau Date: Wed Sep 12 16:53:03 2018 +0400 qom/object: add some interface asserts Did you suggest me something else? Thank you, Shivaprasad While doing this, please try to reduce the use of ->iommu_type which is a design shortcut. I would like to completely remove it at some point. Thanks, C. 
int ret, fd; VFIOAddressSpace *space; @@ -611,7 +612,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, goto close_fd_exit; } - container = g_malloc0(sizeof(*container)); + scontainer = g_malloc0(sizeof(*scontainer)); + container = &scontainer->container; container->fd = fd; bcontainer = &container->bcontainer; @@ -675,7 +677,7 @@ unregister_container_exit: vfio_cpr_unregister_container(bcontainer); free_container_exit: - g_free(container); + g_free(scontainer); close_fd_exit: close(fd); diff --git a/hw/vfio/spapr.c b/hw/vfio/spapr.c index 0d949bb728..78d218b7e7 100644 --- a/hw/vfio/spapr.c +++ b/hw/vfio/spapr.c @@ -24,12 +24,6 @@ #include "qapi/error.h" #include "trace.h" -typedef struct VFIOSpaprContainer { - VFIOContainer container; - MemoryListener prereg_listener; - QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; -} VFIOSpaprContainer; - static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section) { if (memory_region_is_iommu(section->mr)) { diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index b9da6c08ef..010fa68ac6 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -82,6 +82,12 @@ typedef struct VFIOContainer { QLIST_HEAD(, VFIOGroup) group_list; } VFIOContainer; +typedef struct VFIOSpaprContainer { + VFIOContainer container; + MemoryListener prereg_listener; + QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; +} VFIOSpaprContainer; + typedef struct VFIOHostDMAWindow { hwaddr min_iova; hwaddr max_iova;
Re: [PATCH] vfio: container: Fix missing allocation of VFIOSpaprContainer
Hi Cédric, On 6/20/24 6:37 PM, Cédric Le Goater wrote: Shivaprasad, On 5/9/24 9:14 PM, Shivaprasad G Bhat wrote: The commit 6ad359ec29 "(vfio/spapr: Move prereg_listener into spapr container)" began to use the newly introduced VFIOSpaprContainer structure. After several refactors, today the container_of(container, VFIOSpaprContainer, ABC) is used when VFIOSpaprContainer is actually not allocated. On PPC64 systems, this dereference is leading to corruption showing up as glibc malloc assertion during guest start when using vfio. Patch adds the missing allocation while also making the structure movement to vfio common header file. Fixes: 6ad359ec29 "(vfio/spapr: Move prereg_listener into spapr container)" Signed-off-by: Shivaprasad G Bhat Could you please give vfio-9.1 a try ? Thanks, Yes. This is working fine for ppc64. Thank you! Regards, Shivaprasad C. https://github.com/legoater/qemu/commits/vfio-9.1
Re: [PATCH v5 3/3] ppc: Enable 2nd DAWR support on p10
Hi David, All, I am revisiting/reviving this patch. On 5/5/21 11:20, David Gibson wrote: On Wed, Apr 21, 2021 at 11:50:40AM +0530, Ravi Bangoria wrote: Hi David, On 4/19/21 10:23 AM, David Gibson wrote: On Mon, Apr 12, 2021 at 05:14:33PM +0530, Ravi Bangoria wrote: Since we have released versions with POWER10 support, but no DAWR1, in theory we need a capability so new qemu with old machine types don't gain guest visible features that the same machine types on older qemus had. Except.. there's a loophole we might use to sidestep that. The current POWER10 CPU modelled in qemu is a DD1 - which I strongly suspect will never appear outside of IBM. I'm pretty sure we want to replace that with a DD2. While the modelled CPU is DD1, I think it's pretty reasonable to say our POWER10 support hasn't yet stabilized, and it would therefore be ok to simply add DAWR1 on POWER10 unconditionally, as long as we do it before we switch over to DD2. As POWER10 DD2 switch over has already happened, the need for new/separate capability for dawr1 still holds. So, I am keeping it as is. Posting the next version after rebase. Thanks, Shivaprasad I'm wondering if we're actually just better off setting the pa feature just based on the guest CPU model. TCG will be broken if you try to use it, but then, it already is. AFAIK there's no inherent reason we couldn't implement DAWR support in TCG, it's just never been worth the trouble. Correct. Probably there is no practical usecase for DAWR in TCG mode. Thanks, Ravi
[PATCH v6] ppc: Enable 2nd DAWR support on p10
From: Ravi Bangoria As per the PAPR, bit 0 of byte 64 in pa-features property indicates availability of 2nd DAWR registers. i.e. If this bit is set, 2nd DAWR is present, otherwise not. Use KVM_CAP_PPC_DAWR1 capability to find whether kvm supports 2nd DAWR or not. If it's supported, allow user to set the pa-feature bit in guest DT using cap-dawr1 machine capability. Though, watchpoint on powerpc TCG guest is not supported and thus 2nd DAWR is not enabled for TCG mode. Signed-off-by: Ravi Bangoria Reviewed-by: Greg Kurz Reviewed-by: Cédric Le Goater Signed-off-by: Shivaprasad G Bhat --- Changelog: v5: https://lore.kernel.org/all/20210412114433.129702-1-ravi.bango...@linux.ibm.com/ v5->v6: - The other patches in the original series already merged. - Rebased to the top of the tree. So, the gen_spr_book3s_310_dbg() is renamed to register_book3s_310_dbg_sprs() and moved to cpu_init.c accordingly. - No functional changes. v4: https://lore.kernel.org/r/20210406053833.282907-1-ravi.bango...@linux.ibm.com v3->v4: - Make error message more proper. v3: https://lore.kernel.org/r/20210330095350.36309-1-ravi.bango...@linux.ibm.com v3->v4: - spapr_dt_pa_features(): POWER10 processor is compatible with 3.0 (PCR_COMPAT_3_00). No need to ppc_check_compat(3_10) for now as ppc_check_compati(3_00) will also be true. ppc_check_compat(3_10) can be added while introducing pa_features_310 in future. - Use error_append_hint() for hints. Also add ERRP_GUARD(). - Add kvmppc_set_cap_dawr1() stub function for CONFIG_KVM=n. v2: https://lore.kernel.org/r/20210329041906.213991-1-ravi.bango...@linux.ibm.com v2->v3: - Don't introduce pa_features_310[], instead, reuse pa_features_300[] for 3.1 guests, as there is no difference between initial values of them atm. - Call gen_spr_book3s_310_dbg() from init_proc_POWER10() instead of init_proc_POWER8(). Also, Don't call gen_spr_book3s_207_dbg() from gen_spr_book3s_310_dbg() as init_proc_POWER10() already calls it. v1: https://lore.kernel.org/r/20200723104220.314671-1-ravi.bango...@linux.ibm.com v1->v2: - Introduce machine capability cap-dawr1 to enable/disable the feature. By default, 2nd DAWR is OFF for guests even when host kvm supports it. User has to manually enable it with -machine cap-dawr1=on if he wishes to use it. - Split the header file changes into separate patch. (Sync headers from v5.12-rc3) [1] https://git.kernel.org/torvalds/c/bd1de1a0e6eff hw/ppc/spapr.c |7 ++- hw/ppc/spapr_caps.c| 32 include/hw/ppc/spapr.h |6 +- target/ppc/cpu.h |2 ++ target/ppc/cpu_init.c | 15 +++ target/ppc/kvm.c | 12 target/ppc/kvm_ppc.h | 12 7 files changed, 84 insertions(+), 2 deletions(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 54dbfd7fe9..1e54e0c719 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -241,7 +241,7 @@ static void spapr_dt_pa_features(SpaprMachineState *spapr, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */ /* 54: DecFP, 56: DecI, 58: SHA */ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */ -/* 60: NM atomic, 62: RNG */ +/* 60: NM atomic, 62: RNG, 64: DAWR1 (ISA 3.1) */ 0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */ }; uint8_t *pa_features = NULL; @@ -282,6 +282,9 @@ static void spapr_dt_pa_features(SpaprMachineState *spapr, * in pa-features. So hide it from them. 
*/ pa_features[40 + 2] &= ~0x80; /* Radix MMU */ } +if (spapr_get_cap(spapr, SPAPR_CAP_DAWR1)) { +pa_features[66] |= 0x80; +} _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size))); } @@ -2084,6 +2087,7 @@ static const VMStateDescription vmstate_spapr = { &vmstate_spapr_cap_fwnmi, &vmstate_spapr_fwnmi, &vmstate_spapr_cap_rpt_invalidate, +&vmstate_spapr_cap_dawr1, NULL } }; @@ -4683,6 +4687,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON; smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON; smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF; +smc->default_caps.caps[SPAPR_CAP_DAWR1] = SPAPR_CAP_OFF; /* * This cap specifies whether the AIL 3 mode for diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c index 5a0755d34f..2f2cf4a250 100644 --- a/hw/ppc/spapr_caps.c +++ b/hw/ppc/spapr_caps.c @@ -655,6 +655,28 @@ static void cap_ail_mode_3_apply(SpaprMachineState *spapr, } } +static void cap_dawr1_apply(SpaprMachineState *spapr, uint8_t val, + Error **errp) +{ +ERRP_GUARD(); +if (!val) { +return; /* Disable by default */ +} + +if (tcg_enabled()) { +error_se
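Anticipating the review that follows, the capability check can be relaxed to
warn instead of erroring out once TCG gains DAWR1 support; a sketch of that
direction (kvmppc_has_cap_dawr1() and the exact compat check are assumptions
here, not the final code):

    static void cap_dawr1_apply(SpaprMachineState *spapr, uint8_t val,
                                Error **errp)
    {
        if (!val) {
            return; /* Disabled by default */
        }

        if (!ppc_type_check_compat(MACHINE(spapr)->cpu_type,
                                   CPU_POWERPC_LOGICAL_3_10, 0,
                                   spapr->max_compat_pvr)) {
            /* DAWR1 is a POWER10 (ISA v3.1) facility */
            warn_report("DAWR1 supported only on POWER10 and later CPUs");
        } else if (kvm_enabled() && !kvmppc_has_cap_dawr1()) {
            warn_report("KVM does not support DAWR1");
        }
    }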
Re: [PATCH v6] ppc: Enable 2nd DAWR support on p10
On 7/7/23 17:52, Daniel Henrique Barboza wrote: On 7/7/23 08:59, Greg Kurz wrote: Hi Daniel and Shiva ! On Fri, 7 Jul 2023 08:09:47 -0300 Daniel Henrique Barboza wrote: This one was a buzzer shot. Indeed ! :-) I would have appreciated some more time to re-assess my R-b tag on this 2 year old bug though ;-) My bad! I never thought it was that old. Never occured to me to check when the previous version was sent. Folks, please bear in mind that a Reviewed-by is given on the context when the patch was sent. A handful of months? Keep the R-bs. 6 months, from one release to the other? Things starts to get a little murky. 2 years? hahaha c'mon Apologies, since v5 didn't need any rework I retained the Reviewed-bys. I agree, I should have been explicit in changelog about how old it is. At the very least you need to point out that the acks are old. My concerns were that the DAWR1 spapr cap was still not enabled by default but I guess it is because POWER9 is still the default cpu type. Related, the apply function should probably spit a warning with TCG instead of failing, like already done for some other TCG limitations (e.g. cap_safe_bounds_check_apply()). This will be needed for `make test` to succeed when DAWR1 is eventually enabled by default. Not needed right now. Thanks Greg, I will convert the errors to warnings for DAWR1 caps checks in the next version. However, I dont see any new "make test" failures with the patch. Here are the logs "make test", With patch - https://gist.github.com/shivaprasadbhat/859f7f4a0c105ac1232b7ab5d8e161e8#file-gistfile1-txt Without patch - https://gist.github.com/shivaprasadbhat/25e5db9254cbe3292017f16adf41ecc1#file-gistfile1-txt My R-b still stands then ! :-) This patch got lucky then. If you/Cedric remove your acks I would simply drop the patch and re-send the PR with the greatest of ease, no remorse whatsoever. Thanks, Daniel Cheers, -- Greg Queued in gitlab.com/danielhb/qemu/tree/ppc-next. Thanks, Daniel On 7/7/23 05:47, Shivaprasad G Bhat wrote: From: Ravi Bangoria As per the PAPR, bit 0 of byte 64 in pa-features property indicates availability of 2nd DAWR registers. i.e. If this bit is set, 2nd DAWR is present, otherwise not. Use KVM_CAP_PPC_DAWR1 capability to find whether kvm supports 2nd DAWR or not. If it's supported, allow user to set the pa-feature bit in guest DT using cap-dawr1 machine capability. Though, watchpoint on powerpc TCG guest is not supported and thus 2nd DAWR is not enabled for TCG mode. Signed-off-by: Ravi Bangoria Reviewed-by: Greg Kurz Reviewed-by: Cédric Le Goater Signed-off-by: Shivaprasad G Bhat --- Changelog: v5: https://lore.kernel.org/all/20210412114433.129702-1-ravi.bango...@linux.ibm.com/ v5->v6: - The other patches in the original series already merged. - Rebased to the top of the tree. So, the gen_spr_book3s_310_dbg() is renamed to register_book3s_310_dbg_sprs() and moved to cpu_init.c accordingly. - No functional changes. v4: https://lore.kernel.org/r/20210406053833.282907-1-ravi.bango...@linux.ibm.com v3->v4: - Make error message more proper. v3: https://lore.kernel.org/r/20210330095350.36309-1-ravi.bango...@linux.ibm.com v3->v4: - spapr_dt_pa_features(): POWER10 processor is compatible with 3.0 (PCR_COMPAT_3_00). No need to ppc_check_compat(3_10) for now as ppc_check_compati(3_00) will also be true. ppc_check_compat(3_10) can be added while introducing pa_features_310 in future. - Use error_append_hint() for hints. Also add ERRP_GUARD(). - Add kvmppc_set_cap_dawr1() stub function for CONFIG_KVM=n. 
v2: https://lore.kernel.org/r/20210329041906.213991-1-ravi.bango...@linux.ibm.com
v2->v3:
- Don't introduce pa_features_310[]; instead, reuse pa_features_300[] for 3.1 guests, as there is no difference between their initial values at the moment.
- Call gen_spr_book3s_310_dbg() from init_proc_POWER10() instead of init_proc_POWER8(). Also, don't call gen_spr_book3s_207_dbg() from gen_spr_book3s_310_dbg(), as init_proc_POWER10() already calls it.
v1: https://lore.kernel.org/r/20200723104220.314671-1-ravi.bango...@linux.ibm.com
v1->v2:
- Introduce the machine capability cap-dawr1 to enable/disable the feature. By default, the 2nd DAWR is OFF for guests even when the host KVM supports it. The user has to enable it manually with -machine cap-dawr1=on.
- Split the header file changes into a separate patch. (Sync headers from v5.12-rc3)

[1] https://git.kernel.org/torvalds/c/bd1de1a0e6eff

 hw/ppc/spapr.c         |  7 ++-
 hw/ppc/spapr_caps.c    | 32 
 include/hw/ppc/spapr.h |  6 +-
 target/ppc/cpu.h       |  2 ++
 target/ppc/cpu_init.c  | 15 +++
 target/ppc/kvm.c       | 12 
 target/ppc/kvm_p
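[Editor's illustration] To make Greg's suggestion in the thread above concrete (warn under TCG instead of failing, the way cap_safe_bounds_check_apply() handles other TCG limitations), here is a minimal sketch of what such an apply hook could look like. It is illustrative only, not the merged code: kvmppc_has_cap_dawr1() is an assumed helper name, by analogy with the other spapr cap helpers, and kvmppc_set_cap_dawr1() is the stub mentioned in the changelog.

    /* Sketch only; kvmppc_has_cap_dawr1() is a hypothetical helper. */
    static void cap_dawr1_apply(SpaprMachineState *spapr, uint8_t val,
                                Error **errp)
    {
        ERRP_GUARD();

        if (!val) {
            return; /* Disabled by default, nothing to do */
        }

        if (tcg_enabled()) {
            /* Warn rather than error out, mirroring
             * cap_safe_bounds_check_apply() for TCG limitations. */
            warn_report("DAWR1 support under TCG is limited");
        } else if (!kvmppc_has_cap_dawr1()) {
            error_setg(errp, "DAWR1 requested but not available in KVM");
            error_append_hint(errp, "Try appending -machine cap-dawr1=off\n");
        } else if (kvmppc_set_cap_dawr1(val) < 0) {
            error_setg(errp, "Error enabling cap-dawr1");
        }
    }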
Re: [PATCH v6] ppc: Enable 2nd DAWR support on p10
On 7/7/23 19:54, Cédric Le Goater wrote:
On 7/7/23 13:59, Greg Kurz wrote:
Hi Daniel and Shiva !
On Fri, 7 Jul 2023 08:09:47 -0300 Daniel Henrique Barboza wrote:
This one was a buzzer shot.

Indeed ! :-) I would have appreciated some more time to re-assess my R-b tag on this 2 year old bug though ;-)

We should drop that patch IMO and ask for a resend with more tests, but that's a lot of work to build a PR :/

Hi Cedric, I will take care of Greg's comment on avoiding failures in TCG mode for cap-dawr1=on. I have already shared the "make test" results. Do you want me to try any other tests?

Daniel, apologies again for forcing you to rebuild the PR.

Thanks, Shivaprasad
[RFC PATCH v7] ppc: Enable 2nd DAWR support on p10
Extend the existing watchpoint facility from TCG DAWR0 emulation to DAWR1 on POWER10.

As per the PAPR, bit 0 of byte 64 in the pa-features property indicates the availability of the 2nd DAWR registers, i.e. if this bit is set, the 2nd DAWR is present, otherwise not. Use the KVM_CAP_PPC_DAWR1 capability to find out whether KVM supports the 2nd DAWR. If it is supported, allow the user to set the pa-features bit in the guest DT using the cap-dawr1 machine capability.

Signed-off-by: Ravi Bangoria
Signed-off-by: Shivaprasad G Bhat
---
Changelog:
v6: https://lore.kernel.org/qemu-devel/168871963321.58984.15628382614621248470.stgit@ltcd89-lp2/
v6->v7:
- Sorry about the delay in sending out this version. I have dropped the Reviewed-bys as suggested and converted the patch back to RFC.
- Added the TCG support. Basically, converted the existing DAWR0 support routines into macros for reuse by DAWR1. Let me know if the macro conversions should be moved to a separate independent patch.
- As DAWR1 works on TCG, the checks in cap_dawr1_apply() now report a warning only for P9 or P9 compat modes, for both the KVM and TCG use cases.
- 'make test' passes for the caps checks. Also, as suggested by Greg Kurz, 'make test' after making DAWR1 default 'on' and updating the default CPU to Power10 shows no failures.
v5: https://lore.kernel.org/all/20210412114433.129702-1-ravi.bango...@linux.ibm.com/
v5->v6:
- The other patches in the original series are already merged.
- Rebased to the top of the tree, so gen_spr_book3s_310_dbg() is renamed to register_book3s_310_dbg_sprs() and moved to cpu_init.c accordingly.
- No functional changes.
v4: https://lore.kernel.org/r/20210406053833.282907-1-ravi.bango...@linux.ibm.com
v3->v4:
- Made the error message more precise.
v3: https://lore.kernel.org/r/20210330095350.36309-1-ravi.bango...@linux.ibm.com
v3->v4:
- spapr_dt_pa_features(): the POWER10 processor is compatible with 3.0 (PCR_COMPAT_3_00). There is no need for ppc_check_compat(3_10) for now, as ppc_check_compat(3_00) will also be true. ppc_check_compat(3_10) can be added when pa_features_310 is introduced in the future.
- Use error_append_hint() for hints. Also add ERRP_GUARD().
- Add a kvmppc_set_cap_dawr1() stub function for CONFIG_KVM=n.
v2: https://lore.kernel.org/r/20210329041906.213991-1-ravi.bango...@linux.ibm.com
v2->v3:
- Don't introduce pa_features_310[]; instead, reuse pa_features_300[] for 3.1 guests, as there is no difference between their initial values at the moment.
- Call gen_spr_book3s_310_dbg() from init_proc_POWER10() instead of init_proc_POWER8(). Also, don't call gen_spr_book3s_207_dbg() from gen_spr_book3s_310_dbg(), as init_proc_POWER10() already calls it.
v1: https://lore.kernel.org/r/20200723104220.314671-1-ravi.bango...@linux.ibm.com
v1->v2:
- Introduce the machine capability cap-dawr1 to enable/disable the feature. By default, the 2nd DAWR is OFF for guests even when the host KVM supports it. The user has to enable it manually with -machine cap-dawr1=on.
- Split the header file changes into a separate patch.
(Sync headers from v5.12-rc3)

 hw/ppc/spapr.c           |   7 ++-
 hw/ppc/spapr_caps.c      |  35 ++
 hw/ppc/spapr_hcall.c     |  50 
 include/hw/ppc/spapr.h   |   6 ++
 target/ppc/cpu.c         | 114 +-
 target/ppc/cpu.h         |   6 ++
 target/ppc/cpu_init.c    |  15 ++
 target/ppc/excp_helper.c |  61 ++---
 target/ppc/helper.h      |   2 +
 target/ppc/kvm.c         |  12 +
 target/ppc/kvm_ppc.h     |  12 +
 target/ppc/machine.c     |   1 
 target/ppc/misc_helper.c |  20 ++--
 target/ppc/spr_common.h  |   2 +
 target/ppc/translate.c   |  25 +++---
 15 files changed, 253 insertions(+), 115 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index df09aa9d6a..c1cb47464b 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -262,7 +262,7 @@ static void spapr_dt_pa_features(SpaprMachineState *spapr,
         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 48 - 53 */
         /* 54: DecFP, 56: DecI, 58: SHA */
         0x80, 0x00, 0x80, 0x00, 0x80, 0x00, /* 54 - 59 */
-        /* 60: NM atomic, 62: RNG */
+        /* 60: NM atomic, 62: RNG, 64: DAWR1 (ISA 3.1) */
         0x80, 0x00, 0x80, 0x00, 0x00, 0x00, /* 60 - 65 */
     };
     uint8_t *pa_features = NULL;
@@ -303,6 +303,9 @@ static void spapr_dt_pa_features(SpaprMachineState *spapr,
          * in pa-features. So hide it from them. */
         pa_features[40 + 2] &= ~0x80; /* Radix MMU */
     }
+    if (spapr_get_cap(spapr, SPAPR_CAP_DAWR1)) {
+        pa_features[66] |= 0x80;
+    }

     _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
 }
@@ -2138,6 +2141,7 @@ static const VMStateDescription vmstate_spapr = {
         &vmstate_spapr_cap
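[Editor's note] A note for readers of the hunk above: the pa_features blob carries a two-byte descriptor header ahead of the property bytes, so PAPR property byte N lives at array index N + 2. That is why the Radix bit is cleared via pa_features[40 + 2] and DAWR1 (property byte 64, bit 0 - the most significant bit in PAPR's big-endian bit numbering, hence mask 0x80) is set via pa_features[66]. A minimal sketch of that mapping, with a hypothetical helper name:

    #include <stdint.h>

    /* Hypothetical helper, for illustration only: set "bit 0" (the MSB)
     * of property byte prop_byte in a pa-features blob that starts with
     * a 2-byte descriptor header. */
    static void pa_features_set_msb(uint8_t *pa_features, unsigned prop_byte)
    {
        pa_features[prop_byte + 2] |= 0x80; /* +2 skips the header bytes */
    }

    /* pa_features_set_msb(pa_features, 64) is then equivalent to the
     * pa_features[66] |= 0x80 line in the hunk above. */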
Re: [Qemu-devel] [PATCH] spapr: make default PHB optional
On 07/12/2017 04:25 PM, Andrea Bolognani wrote: [libvir-list added to the loop]
On Tue, 2017-07-04 at 10:47 +0200, Greg Kurz wrote:
On Tue, 4 Jul 2017 17:29:01 +1000 David Gibson wrote:
On Mon, Jul 03, 2017 at 06:48:25PM +0200, Greg Kurz wrote:

The sPAPR machine always creates a default PHB during initialization, even if -nodefaults was passed on the command line. This forces the user to rely on -global if she wants to set properties of the default PHB, such as numa_node. This patch introduces a new machine create-default-phb property to control whether the default PHB must be created or not. It defaults to on in order to preserve old setups (which is also the motivation for not altering the current behavior of -nodefaults). If create-default-phb is set to off, the default PHB isn't created, nor is any other device usually created with it. It is then mandatory to provide a PHB on the command line to be able to use PCI devices (otherwise QEMU won't start). For example, the following creates a PHB with the same mappings as the default PHB and also sets the NUMA affinity:

-machine type=pseries,create-default-phb=off \
-numa node,nodeid=0 -device spapr-pci-host-bridge,index=0,numa_node=0

So, I agree that the distinction between default devices that are disabled with -nodefaults and default devices that aren't is a big mess in qemu configuration. But on the other hand this only addresses one tiny aspect of that, and in the meantime means we will silently ignore some other configuration options in some conditions. So, what's the immediate benefit / use case for this?

Setting numa_node for emulated devices is the benefit for now. On x86, I figured there is no way to set the numa_node for the root controller, and the emulated devices sitting there all have numa_node set to -1. Only the devices on the pxb can have a sensible value specified. Does it mean the emulated devices/drivers don't care about the numa_node they are on? Would it be fine on PPC to disallow setting the NUMA node for the default PHB, because that is where all the emulated devices sit?

With the current code base, the only way to set properties of the default PHB is to pass -global spapr-pci-host-bridge.prop=value for each property. The immediate benefit of this patch is to unify the way libvirt passes the PHB description to the command line, i.e. do:

-machine type=pseries,create-default-phb=off \
-device spapr-pci-host-bridge,prop1=a,prop2=b,prop3=c \
-device spapr-pci-host-bridge,prop1=d,prop2=e,prop3=f

instead of:

-machine type=pseries \
-global spapr-pci-host-bridge.prop1=a \
-global spapr-pci-host-bridge.prop2=b \
-global spapr-pci-host-bridge.prop3=c \
-device spapr-pci-host-bridge,prop1=d,prop2=e,prop3=f

So, I'm thinking about this mostly in terms of NUMA nodes because that's the use case I'm aware of. The problem with using -global is not that it requires a different syntax to set properties for the default PHB, but rather that such properties are then inherited by all other PHBs unless explicitly overridden. Not creating the default PHB at all would solve the issue. On the other hand, libvirt would then need to either 1) only allow setting NUMA nodes for PHBs if QEMU supports the new option, leaving QEMU < 2.10 users behind; or 2) implement handling for both the new and the old behavior. I'm not sure we could get away with 1), and going for 2) means more work both for QEMU and libvirt developers for very little actual gain, so I'd be inclined to scrap this and just build the libvirt glue on top of the existing interface.
That is, of course, unless:

1) having a random selection of PHBs not assigned to any NUMA node is a sensible use case. This is something we just can't do reliably with the current interface: we can decide to set the NUMA node only for, say, PHBs 1 and 3, leaving 0 and 2 alone, but once we set it for the default PHB we *have* to set it for all remaining ones as well. libvirt will by default assign emulated devices to the default PHB, so I would rather expect users to leave that one alone and set a NUMA node for all other PHBs; or

2) there are other properties outside of numa_node we might want to deal with; or

3) it turns out it's okay to require a recent QEMU :)

-- Andrea Bolognani / Red Hat / Virtualization
[Qemu-devel] [PATCH] linux-user: elf: mmap all the target-pages of hostpage for data segment
If the host page size is greater than the TARGET_PAGESIZE, the target pages of size TARGET_PAGESIZE are marked valid only up to the length requested during the elfload. The glibc attempts to consume the unused space in the last page of the data segment (__libc_memalign() in elf/dl-minimal.c). The GLRO(dl_pagesize) is actually the host page size, as set in the auxiliary vectors. So, there is no explicit mmap request for the remaining target pages on the last host page. The glibc assumes that space is available, and subsequent attempts to use those addresses lead to a crash, as target_mmap has not marked those target pages valid.

The issue is seen when trying to chroot to a 16.04-x86_64 Ubuntu on a PPC64 host, where the fork fails to access the thread_id as it is allocated on a page not marked valid. The recent glibc doesn't have checks for the thread-id in fork, but the issue can manifest somewhere else nonetheless.

The fix here is to map all the target pages of the host page during the ELF load for the data segment, to allow glibc its proper consumption.

Signed-off-by: Shivaprasad G Bhat
---
 linux-user/elfload.c | 24 +---
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 8638612aec..1d86034c8d 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -1438,9 +1438,17 @@ struct exec

 /* Necessary parameters */
 #define TARGET_ELF_EXEC_PAGESIZE TARGET_PAGE_SIZE
-#define TARGET_ELF_PAGESTART(_v) ((_v) & \
-                                  ~(abi_ulong)(TARGET_ELF_EXEC_PAGESIZE-1))
-#define TARGET_ELF_PAGEOFFSET(_v) ((_v) & (TARGET_ELF_EXEC_PAGESIZE-1))
+#define TARGET_ELF_PAGESTART(_v, _s) \
+    ((TARGET_ELF_EXEC_PAGESIZE > _s) ? \
+     (_v) & ~(abi_ulong)(TARGET_ELF_EXEC_PAGESIZE - 1) : \
+     (_v) & ~(abi_ulong)(_s - 1));
+#define TARGET_ELF_PAGEOFFSET(_v, _s) \
+    ((TARGET_ELF_EXEC_PAGESIZE > _s) ? \
+     (_v) & (TARGET_ELF_EXEC_PAGESIZE - 1) : \
+     (_v) & (_s - 1));
+#define TARGET_ELF_PAGELENGTH(_v, _s) \
+    ((TARGET_ELF_EXEC_PAGESIZE > _s) ? \
+     TARGET_PAGE_ALIGN(_v) : HOST_PAGE_ALIGN(_v));

 #define DLINFO_ITEMS 15

@@ -2279,7 +2287,7 @@ static void load_elf_image(const char *image_name, int image_fd,
     for (i = 0; i < ehdr->e_phnum; i++) {
         struct elf_phdr *eppnt = phdr + i;
         if (eppnt->p_type == PT_LOAD) {
-            abi_ulong vaddr, vaddr_po, vaddr_ps, vaddr_ef, vaddr_em;
+            abi_ulong vaddr, vaddr_po, vaddr_ps, vaddr_ef, vaddr_em, vaddr_len;
             int elf_prot = 0;
             if (eppnt->p_flags & PF_R) elf_prot = PROT_READ;
@@ -2287,10 +2295,12 @@ static void load_elf_image(const char *image_name, int image_fd,
             if (eppnt->p_flags & PF_X) elf_prot |= PROT_EXEC;
             vaddr = load_bias + eppnt->p_vaddr;
-            vaddr_po = TARGET_ELF_PAGEOFFSET(vaddr);
-            vaddr_ps = TARGET_ELF_PAGESTART(vaddr);
+            vaddr_po = TARGET_ELF_PAGEOFFSET(vaddr, qemu_host_page_size);
+            vaddr_ps = TARGET_ELF_PAGESTART(vaddr, qemu_host_page_size);
+            vaddr_len = TARGET_ELF_PAGELENGTH(eppnt->p_filesz + vaddr_po,
+                                              qemu_host_page_size);

-            error = target_mmap(vaddr_ps, eppnt->p_filesz + vaddr_po,
+            error = target_mmap(vaddr_ps, vaddr_len,
                                 elf_prot, MAP_PRIVATE | MAP_FIXED,
                                 image_fd, eppnt->p_offset - vaddr_po);
             if (error == -1) {
Re: [Qemu-devel] [PATCH] linux-user: elf: mmap all the target-pages of hostpage for data segment
On 08/27/2018 06:55 PM, Laurent Vivier wrote:
Le 27/08/2018 à 14:37, Shivaprasad G Bhat a écrit :
[... the v1 patch above quoted in full ...]

I think it's only possible if the PT_LOAD p_align value is greater than or equal to qemu_host_page_size. See 33143c446e ("linux-user: fix ELF load alignment error"). You could check this with qemu-s390x or qemu-arm on a ppc64 host.

Ah, right! I should have added the extra conditional checking p_align against qemu_host_page_mask along with the existing ones. Posted the v2 accordingly.

Thanks and Regards, Shivaprasad

Thanks, Laurent
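[Editor's illustration] A minimal sketch of the alignment test Laurent is referring to, consistent with the condition commit 33143c446e introduced and with the macros the v2/v3 patches below use; the helper name is made up for illustration:

    /* Sketch, assuming QEMU's qemu_host_page_mask convention (the mask of
     * the host page, e.g. ~0xffff for 64K pages). True when p_align is a
     * multiple of the host page size, i.e. no bits below the host page
     * boundary are set; only then is it safe to widen the mapping to
     * whole host pages. */
    static bool p_align_allows_host_pages(abi_ulong p_align)
    {
        return (p_align & ~qemu_host_page_mask) == 0;
    }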
[Qemu-devel] [PATCH v2] linux-user: elf: mmap all the target-pages of hostpage for data segment
If the host page size is greater than the TARGET_PAGESIZE, the target pages of size TARGET_PAGESIZE are marked valid only up to the length requested during the elfload. The glibc attempts to consume the unused space in the last page of the data segment (__libc_memalign() in elf/dl-minimal.c). If the PT_LOAD p_align is greater than or equal to the host page size, the GLRO(dl_pagesize) is actually the host page size, as set in the auxiliary vectors. So, there is no explicit mmap request for the remaining target pages on the last host page. The glibc assumes that space is available, and subsequent attempts to use those addresses lead to a crash, as target_mmap has not marked those target pages valid.

The issue is seen when trying to chroot to a 16.04-x86_64 Ubuntu on a PPC64 host, where the fork fails to access the thread_id as it is allocated on a page not marked valid. The recent glibc doesn't have checks for the thread-id in fork, but the issue can manifest somewhere else nonetheless.

The fix here is to map all the target pages of the host page during the elfload, if p_align is greater than or equal to the host page size, for the data segment, to allow glibc its proper consumption.

Signed-off-by: Shivaprasad G Bhat
---
v1: https://lists.gnu.org/archive/html/qemu-devel/2018-08/msg05730.html
Changes from v1:
- Made the conditionals consistent with commit "33143c446e" and changed the commit message accordingly.

 linux-user/elfload.c | 37 +
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 8638612aec..cced43f45c 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -1438,9 +1438,23 @@ struct exec

 /* Necessary parameters */
 #define TARGET_ELF_EXEC_PAGESIZE TARGET_PAGE_SIZE
-#define TARGET_ELF_PAGESTART(_v) ((_v) & \
-                                  ~(abi_ulong)(TARGET_ELF_EXEC_PAGESIZE-1))
-#define TARGET_ELF_PAGEOFFSET(_v) ((_v) & (TARGET_ELF_EXEC_PAGESIZE-1))
+#define TARGET_ELF_PAGESTART(_v, _a, _s, _m) \
+    (((_a & ~_m) != 0) ? \
+     (_v) & ~(abi_ulong)(TARGET_ELF_EXEC_PAGESIZE - 1) : \
+     ((TARGET_ELF_EXEC_PAGESIZE > _s) ? \
+      (_v) & ~(abi_ulong)(TARGET_ELF_EXEC_PAGESIZE - 1) : \
+      (_v) & ~(abi_ulong)(_s - 1)));
+#define TARGET_ELF_PAGEOFFSET(_v, _a, _s, _m) \
+    (((_a & ~_m) != 0) ? \
+     (_v) & (TARGET_ELF_EXEC_PAGESIZE - 1) : \
+     ((TARGET_ELF_EXEC_PAGESIZE > _s) ? \
+      (_v) & (TARGET_ELF_EXEC_PAGESIZE - 1) : \
+      (_v) & (_s - 1)));
+#define TARGET_ELF_PAGELENGTH(_v, _a, _s, _m) \
+    (((_a & ~_m) != 0) ? \
+     TARGET_PAGE_ALIGN(_v) : \
+     ((TARGET_ELF_EXEC_PAGESIZE > _s) ? \
+      TARGET_PAGE_ALIGN(_v) : HOST_PAGE_ALIGN(_v)));

 #define DLINFO_ITEMS 15

@@ -2279,7 +2293,7 @@ static void load_elf_image(const char *image_name, int image_fd,
     for (i = 0; i < ehdr->e_phnum; i++) {
         struct elf_phdr *eppnt = phdr + i;
         if (eppnt->p_type == PT_LOAD) {
-            abi_ulong vaddr, vaddr_po, vaddr_ps, vaddr_ef, vaddr_em;
+            abi_ulong vaddr, vaddr_po, vaddr_ps, vaddr_ef, vaddr_em, vaddr_len;
             int elf_prot = 0;
             if (eppnt->p_flags & PF_R) elf_prot = PROT_READ;
@@ -2287,10 +2301,17 @@ static void load_elf_image(const char *image_name, int image_fd,
             if (eppnt->p_flags & PF_X) elf_prot |= PROT_EXEC;
             vaddr = load_bias + eppnt->p_vaddr;
-            vaddr_po = TARGET_ELF_PAGEOFFSET(vaddr);
-            vaddr_ps = TARGET_ELF_PAGESTART(vaddr);
-
-            error = target_mmap(vaddr_ps, eppnt->p_filesz + vaddr_po,
+            vaddr_po = TARGET_ELF_PAGEOFFSET(vaddr, info->alignment,
+                                             qemu_host_page_size,
+                                             qemu_host_page_mask);
+            vaddr_ps = TARGET_ELF_PAGESTART(vaddr, info->alignment,
+                                            qemu_host_page_size,
+                                            qemu_host_page_mask);
+            vaddr_len = TARGET_ELF_PAGELENGTH(eppnt->p_filesz + vaddr_po,
+                                              info->alignment,
+                                              qemu_host_page_size,
+                                              qemu_host_page_mask);
+            error = target_mmap(vaddr_ps, vaddr_len,
                                 elf_prot, MAP_PRIVATE | MAP_FIXED,
                                 image_fd, eppnt->p_offset - vaddr_po);
             if (error == -1) {
[Qemu-devel] [PATCH v3] linux-user: elf: mmap all the target-pages of hostpage for data segment
If the host page size is greater than the TARGET_PAGESIZE, the target pages of size TARGET_PAGESIZE are marked valid only up to the length requested during the elfload. The glibc attempts to consume the unused space in the last page of the data segment (__libc_memalign() in elf/dl-minimal.c). If the PT_LOAD p_align is greater than or equal to the host page size, the GLRO(dl_pagesize) is actually the host page size, as set in the auxiliary vectors. So, there is no explicit mmap request for the remaining target pages on the last host page. The glibc assumes that space is available, and subsequent attempts to use those addresses lead to a crash, as target_mmap has not marked those target pages valid.

The issue is seen when trying to chroot to a 16.04-x86_64 Ubuntu on a PPC64 host, where the fork fails to access the thread_id as it is allocated on a page not marked valid. The recent glibc doesn't have checks for the thread-id in fork, but the issue can manifest somewhere else nonetheless.

The fix here is to map all the target pages of the host page during the elfload, if p_align is greater than or equal to the host page size, for the data segment, to allow glibc its proper consumption.

Signed-off-by: Shivaprasad G Bhat
---
v2: https://lists.gnu.org/archive/html/qemu-devel/2018-08/msg05943.html
Changes from v2:
- Simplified the macro as suggested.
- Fixed some grammatical errors in the commit message.
v1: https://lists.gnu.org/archive/html/qemu-devel/2018-08/msg05730.html
Changes from v1:
- Made the conditionals consistent with commit "33143c446e" and changed the commit message accordingly.

 linux-user/elfload.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 8638612aec..6ead0d11c6 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -1437,7 +1437,10 @@ struct exec
 #define QMAGIC 0314

 /* Necessary parameters */
-#define TARGET_ELF_EXEC_PAGESIZE TARGET_PAGE_SIZE
+#define TARGET_ELF_EXEC_PAGESIZE \
+    (((eppnt->p_align & ~qemu_host_page_mask) != 0) ? \
+     TARGET_PAGE_SIZE : MAX(qemu_host_page_size, TARGET_PAGE_SIZE))
+#define TARGET_ELF_PAGELENGTH(_v) ROUND_UP((_v), TARGET_ELF_EXEC_PAGESIZE)
 #define TARGET_ELF_PAGESTART(_v) ((_v) & \
                                   ~(abi_ulong)(TARGET_ELF_EXEC_PAGESIZE-1))
 #define TARGET_ELF_PAGEOFFSET(_v) ((_v) & (TARGET_ELF_EXEC_PAGESIZE-1))
@@ -2279,7 +2282,7 @@ static void load_elf_image(const char *image_name, int image_fd,
     for (i = 0; i < ehdr->e_phnum; i++) {
         struct elf_phdr *eppnt = phdr + i;
         if (eppnt->p_type == PT_LOAD) {
-            abi_ulong vaddr, vaddr_po, vaddr_ps, vaddr_ef, vaddr_em;
+            abi_ulong vaddr, vaddr_po, vaddr_ps, vaddr_ef, vaddr_em, vaddr_len;
             int elf_prot = 0;
             if (eppnt->p_flags & PF_R) elf_prot = PROT_READ;
@@ -2289,8 +2292,9 @@ static void load_elf_image(const char *image_name, int image_fd,
             vaddr = load_bias + eppnt->p_vaddr;
             vaddr_po = TARGET_ELF_PAGEOFFSET(vaddr);
             vaddr_ps = TARGET_ELF_PAGESTART(vaddr);
+            vaddr_len = TARGET_ELF_PAGELENGTH(eppnt->p_filesz + vaddr_po);

-            error = target_mmap(vaddr_ps, eppnt->p_filesz + vaddr_po,
+            error = target_mmap(vaddr_ps, vaddr_len,
                                 elf_prot, MAP_PRIVATE | MAP_FIXED,
                                 image_fd, eppnt->p_offset - vaddr_po);
             if (error == -1) {
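[Editor's illustration] A worked example of the v3 arithmetic, with made-up numbers, assuming a 64K-page PPC64 host, a 4K-page target, and a PT_LOAD segment whose p_align is a multiple of 64K (so TARGET_ELF_EXEC_PAGESIZE resolves to the 64K host page size; ROUND_UP is QEMU's round-to-multiple macro):

    /* Illustrative values only:
     *   host page 0x10000 (64K), target page 0x1000 (4K)
     *   vaddr = 0x10012345, p_filesz = 0x2000
     */
    vaddr_po  = 0x10012345 & (0x10000 - 1);             /* 0x2345     */
    vaddr_ps  = 0x10012345 & ~(abi_ulong)(0x10000 - 1); /* 0x10010000 */
    vaddr_len = ROUND_UP(0x2000 + 0x2345, 0x10000);     /* 0x10000    */

    /* target_mmap(0x10010000, 0x10000, ...) now validates all sixteen
     * 4K target pages of the host page, so glibc's reuse of the slack
     * after the data segment no longer faults. */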
[Qemu-devel] [PATCH] linux-user: ppc64: use the correct values for F_*LK64s
Qemu includes the glibc headers for the host defines, while the target headers are part of the qemu sources themselves. The glibc has F_GETLK64, F_SETLK64 and F_SETLKW64 defined to 12, 13 and 14 for all archs (generic) in sysdeps/unix/sysv/linux/bits/fcntl-linux.h. The linux kernel generic definition for F_*LK is 5, 6 & 7, and for F_*LK64* it is 12, 13 and 14, as seen in include/uapi/asm-generic/fcntl.h. On a 64-bit machine, the kernel by default treats all F_*LK as 64-bit calls and doesn't support the use of F_*LK64*, as can be seen in include/linux/fcntl.h in the linux source.

On an x86_64 host, the values for F_*LK64* are set to 5, 6 and 7 explicitly by the glibc in /usr/include/x86_64-linux-gnu/bits/fcntl.h. A PPC64 host, on the other hand, doesn't have such a definition in /usr/include/powerpc64le-linux-gnu/bits/fcntl.h. So, the sources on a PPC64 host see the default values of F_*LK64* as 12, 13 & 14 (fcntl-linux.h). Since the 64-bit kernel doesn't support 12, 13 & 14, the glibc fcntl syscall implementation (__libc_fcntl*(), __fcntl64_nocancel) converts the F_*LK64* values back to the F_*LK* values on PPC64, as seen in sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h with the FCNTL_ADJUST_CMD() macro. On an x86_64 host the values for F_*LK64* are already 5, 6 and 7, so no adjustment is needed.

Since qemu doesn't use the glibc fcntl but issues safe_syscall* on its own, the PPC64 qemu calls the syscall with 12, 13 and 14 (without adjustment), and they all fail. The fcntl calls to F_GETLK/F_SETLK|W therefore fail for all applications run under PPC64 host user emulation.

One fix could be to investigate why on PPC64 the glibc still keeps F_*LK64* different from F_*LK, adjusting them to 5, 6 and 7 before the syscall for PPC only; see if /usr/include/powerpc64le-linux-gnu/bits/fcntl.h can be made to carry the values 5, 6 & 7 just like x86_64, and remove the adjustment code in glibc. That way, the qemu sources would see the kernel-supported values in the glibc headers.

OR

On a PPC64 host, the qemu sources see both F_*LK* & F_*LK64* as the same, set to 12, 13 and 14, because __USE_FILE_OFFSET64 is defined in the qemu sources (also refer to sysdeps/unix/sysv/linux/bits/fcntl-linux.h). Since F_*LK and F_*LK64 are the same, a value adjustment like the one glibc does is difficult in the qemu sources. So, overwrite the glibc defaults with the actually supported values in qemu. The current patch does this.

Signed-off-by: Shivaprasad G Bhat
---
 linux-user/syscall.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 7b9ac3b408..1693e69ce0 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -250,6 +250,20 @@ static type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5, \
 #define TARGET_NR__llseek TARGET_NR_llseek
 #endif

+/* glibc headers have these defined to 12, 13 and 14, which are not
+ * supported by the kernel. The glibc fcntl call actually adjusts them back
+ * to 5, 6 and 7 before making the syscall(). Since we make the syscall
+ * directly, overwrite/adjust to what is supported by the kernel.
+ */
+#if defined(__linux__) && defined(__powerpc64__)
+#undef F_GETLK64
+#define F_GETLK64  5 /* Get record locking info. */
+#undef F_SETLK64
+#define F_SETLK64  6 /* Set record locking info (non-blocking). */
+#undef F_SETLKW64
+#define F_SETLKW64 7 /* Set record locking info (blocking). */
+#endif
+
 #ifdef __NR_gettid
 _syscall0(int, gettid)
 #else
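[Editor's illustration] A quick way to see the mismatch the commit message describes is a small standalone program (illustrative, not part of the patch) that prints the constants. Compiled with -D_FILE_OFFSET_BITS=64 (the same __USE_FILE_OFFSET64 configuration qemu builds with), it is expected to print 12/13/14 on a ppc64le host and 5/6/7 on x86_64, matching the analysis above:

    #include <stdio.h>
    #include <fcntl.h>

    int main(void)
    {
        /* With _FILE_OFFSET_BITS=64, F_GETLK aliases F_GETLK64 etc.,
         * so this shows the values qemu's sources actually see. */
        printf("F_GETLK=%d F_SETLK=%d F_SETLKW=%d\n",
               F_GETLK, F_SETLK, F_SETLKW);
        return 0;
    }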
Re: [Qemu-devel] [PATCH] linux-user: ppc64: use the correct values for F_*LK64s
On 07/12/2018 02:21 AM, Laurent Vivier wrote:
Le 11/07/2018 à 15:04, Laurent Vivier a écrit :
Le 11/07/2018 à 12:55, Shivaprasad G Bhat a écrit :
[... the v1 patch above quoted in full ...]

These macros are used in target_to_host_fcntl_cmd(), and this function is used with safe_fcntl() and fcntl(). So I think it would be cleaner to do the change after target_to_host_fcntl_cmd() in do_fcntl(), as is done in glibc, instead of redefining system values. Something like:

--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -6782,6 +6782,12 @@ static abi_long do_fcntl(int fd, int cmd, abi_ulong arg)
     if (host_cmd == -TARGET_EINVAL)
         return host_cmd;

+#if defined(__linux__) && defined(__powerpc64__)
+    if (host_cmd >= F_GETLK64 && host_cmd <= F_SETLKW64) {
+        host_cmd -= F_GETLK64 - F_GETLK;

But as you said, __USE_FILE_OFFSET64 is defined in qemu, and F_GETLK is equal to F_GETLK64, so we should use something like:

    ...
    host_cmd -= F_GETLK64 - 5;
    ...

Hi Laurent, thanks for the comments. I agree with them; sending the v2 accordingly.

Thanks, Shivaprasad

Thanks, Laurent
[Qemu-devel] [PATCH v2] linux-user: ppc64: use the correct values for F_*LK64s
Qemu includes the glibc headers for the host defines, while the target headers are part of the qemu sources themselves. The glibc has F_GETLK64, F_SETLK64 and F_SETLKW64 defined to 12, 13 and 14 for all archs in sysdeps/unix/sysv/linux/bits/fcntl-linux.h. The linux kernel generic definition for F_*LK is 5, 6 & 7, and for F_*LK64* it is 12, 13 and 14, as seen in include/uapi/asm-generic/fcntl.h. On a 64-bit machine, the kernel by default treats all F_*LK as 64-bit calls and doesn't support the use of F_*LK64*, as can be seen in include/linux/fcntl.h in the linux source.

On an x86_64 host, the values for F_*LK64* are set to 5, 6 and 7 explicitly by the glibc in /usr/include/x86_64-linux-gnu/bits/fcntl.h. A PPC64 host, on the other hand, doesn't have such a definition in /usr/include/powerpc64le-linux-gnu/bits/fcntl.h. So, the sources on a PPC64 host see the default values of F_*LK64* as 12, 13 & 14 (fcntl-linux.h). Since the 64-bit kernel doesn't support 12, 13 & 14, the glibc fcntl syscall implementation (__libc_fcntl*(), __fcntl64_nocancel) converts the F_*LK64* values back to the F_*LK* values on PPC64, as seen in sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h with the FCNTL_ADJUST_CMD() macro. On an x86_64 host the values for F_*LK64* are already 5, 6 and 7, so no adjustment is needed.

Since qemu doesn't use the glibc fcntl but issues safe_syscall* on its own, the PPC64 qemu calls the syscall with 12, 13 and 14 (without adjustment), and they all fail. The fcntl calls to F_GETLK/F_SETLK|W therefore fail for all applications run under PPC64 host user emulation.

One fix could be to investigate why on PPC64 the glibc still keeps F_*LK64* different from F_*LK, adjusting them to 5, 6 and 7 before the syscall for PPC only; see if /usr/include/powerpc64le-linux-gnu/bits/fcntl.h can be made to carry the values 5, 6 & 7 just like x86_64, and remove the adjustment code in glibc. That way, the qemu sources would see the kernel-supported values in the glibc headers.

OR

On a PPC64 host, the qemu sources see both F_*LK & F_*LK64* as the same, set to 12, 13 and 14, because __USE_FILE_OFFSET64 is defined in the qemu sources (also refer to sysdeps/unix/sysv/linux/bits/fcntl-linux.h). Do the value adjustment just as the glibc source does, using the F_GETLK value of 5. That way, we make the syscalls with the actually supported values in qemu. The patch takes this approach.

Signed-off-by: Shivaprasad G Bhat
---
Changes from v1:
- Replaced the #define-based overwrite of F_*LK64* to 5, 6 and 7 with adjustment code similar to glibc's, as suggested.
- Dropped the __linux__ check for the adjustment code, as suggested.
- Moved the adjustment code inside target_to_host_fcntl_cmd to address all possible/future cases.

 linux-user/syscall.c | 73 --
 1 file changed, 52 insertions(+), 21 deletions(-)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 643b8833de..38c2f1e90f 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -6475,63 +6475,94 @@ static int do_fork(CPUArchState *env, unsigned int flags, abi_ulong newsp,

 /* warning : doesn't handle linux specific flags... */
 static int target_to_host_fcntl_cmd(int cmd)
 {
+    int ret = -TARGET_EINVAL;
     switch(cmd) {
     case TARGET_F_DUPFD:
     case TARGET_F_GETFD:
     case TARGET_F_SETFD:
     case TARGET_F_GETFL:
     case TARGET_F_SETFL:
-        return cmd;
+        ret = cmd;
+        break;
     case TARGET_F_GETLK:
-        return F_GETLK64;
+        ret = F_GETLK64;
+        break;
     case TARGET_F_SETLK:
-        return F_SETLK64;
+        ret = F_SETLK64;
+        break;
     case TARGET_F_SETLKW:
-        return F_SETLKW64;
+        ret = F_SETLKW64;
+        break;
     case TARGET_F_GETOWN:
-        return F_GETOWN;
+        ret = F_GETOWN;
+        break;
     case TARGET_F_SETOWN:
-        return F_SETOWN;
+        ret = F_SETOWN;
+        break;
     case TARGET_F_GETSIG:
-        return F_GETSIG;
+        ret = F_GETSIG;
+        break;
     case TARGET_F_SETSIG:
-        return F_SETSIG;
+        ret = F_SETSIG;
+        break;
 #if TARGET_ABI_BITS == 32
     case TARGET_F_GETLK64:
-        return F_GETLK64;
+        ret = F_GETLK64;
+        break;
     case TARGET_F_SETLK64:
-        return F_SETLK64;
+        ret = F_SETLK64;
+        break;
     case TARGET_F_SETLKW64:
-        return F_SETLKW64;
+        ret = F_SETLKW64;
+        break;
 #endif
     case TARGET_F_SETLEASE:
-        return F_SETLEASE;
+        ret = F_SETLEASE;
+        break;
     case TARGET_F_GETLEASE:
-        return F_GETLEASE;
+        ret = F_GETLEASE;
+        break;
 #ifdef F_DUPFD_CLOEXEC
     case TARGET_F_DUPFD_CLOEXEC:
-        return F_DUPFD_CLOEXEC;
+
Re: [Qemu-devel] [PATCH] linux-user: ppc64: use the correct values for F_*LK64s
On 07/12/2018 12:36 PM, Laurent Vivier wrote:
Le 12/07/2018 à 09:00, Shivaprasad G Bhat a écrit :
On 07/12/2018 02:21 AM, Laurent Vivier wrote:
[... the earlier discussion and the v1 patch quoted in full ...]

Hi Shivaprasad,

Hi Laurent, thanks for the comments. I agree with them; sending the v2 accordingly.

Thank you. I did some tests (qemu-hppa on a ppc64 host with dpkg), and we need the conversion with TARGET_NR_fcntl64 too, because it also calls safe_fcntl() for TARGET_F_SETLK64 and TARGET_F_SETLKW64.

I moved the adjustment code inside target_to_host_fcntl_cmd() to address all cases fo
[Qemu-devel] [PATCH v3] linux-user: ppc64: use the correct values for F_*LK64s
Qemu includes the glibc headers for the host defines, while the target headers are part of the qemu sources themselves. The glibc has F_GETLK64, F_SETLK64 and F_SETLKW64 defined to 12, 13 and 14 for all archs in sysdeps/unix/sysv/linux/bits/fcntl-linux.h. The linux kernel generic definition for F_*LK is 5, 6 & 7, and for F_*LK64* it is 12, 13 and 14, as seen in include/uapi/asm-generic/fcntl.h. On a 64-bit machine, the kernel by default treats all F_*LK as 64-bit calls and doesn't support the use of F_*LK64*, as can be seen in include/linux/fcntl.h in the linux source.

On an x86_64 host, the values for F_*LK64* are set to 5, 6 and 7 explicitly by the glibc in /usr/include/x86_64-linux-gnu/bits/fcntl.h. A PPC64 host, on the other hand, doesn't have such a definition in /usr/include/powerpc64le-linux-gnu/bits/fcntl.h. So, the sources on a PPC64 host see the default values of F_*LK64* as 12, 13 & 14 (fcntl-linux.h). Since the 64-bit kernel doesn't support 12, 13 & 14, the glibc fcntl syscall implementation (__libc_fcntl*(), __fcntl64_nocancel) converts the F_*LK64* values back to the F_*LK* values on PPC64, as seen in sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h with the FCNTL_ADJUST_CMD() macro. On an x86_64 host the values for F_*LK64* are already 5, 6 and 7, so no adjustment is needed.

Since qemu doesn't use the glibc fcntl but issues safe_syscall* on its own, the PPC64 qemu calls the syscall with 12, 13 and 14 (without adjustment), and they all fail. The fcntl calls to F_GETLK/F_SETLK|W therefore fail for all applications run under PPC64 host user emulation.

One fix could be to investigate why on PPC64 the glibc still keeps F_*LK64* different from F_*LK, adjusting them to 5, 6 and 7 before the syscall for PPC only; see if /usr/include/powerpc64le-linux-gnu/bits/fcntl.h can be made to carry the values 5, 6 & 7 just like x86_64, and remove the adjustment code in glibc. That way, the qemu sources would see the kernel-supported values in the glibc headers.

OR

On a PPC64 host, the qemu sources see both F_*LK & F_*LK64* as the same, set to 12, 13 and 14, because __USE_FILE_OFFSET64 is defined in the qemu sources (also refer to sysdeps/unix/sysv/linux/bits/fcntl-linux.h). Do the value adjustment just as the glibc source does, using the F_GETLK value of 5. That way, we make the syscalls with the actually supported values in qemu. The patch takes this approach.

Signed-off-by: Shivaprasad G Bhat
---
v2 - https://lists.gnu.org/archive/html/qemu-devel/2018-07/msg02920.html
Changes from v2:
- Fixed the braces, and the indentation for comments.
v1 - https://lists.gnu.org/archive/html/qemu-devel/2018-07/msg02567.html
Changes from v1:
- Replaced the #define-based overwrite of F_*LK64* to 5, 6 and 7 with adjustment code similar to glibc's, as suggested.
- Dropped the __linux__ check for the adjustment code, as suggested.
- Moved the adjustment code inside target_to_host_fcntl_cmd to address all possible/future cases.

 linux-user/syscall.c | 74 --
 1 file changed, 53 insertions(+), 21 deletions(-)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 643b8833de..7fb595269f 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -6475,63 +6475,95 @@ static int do_fork(CPUArchState *env, unsigned int flags, abi_ulong newsp,

 /* warning : doesn't handle linux specific flags... */
 static int target_to_host_fcntl_cmd(int cmd)
 {
+    int ret = -TARGET_EINVAL;
     switch(cmd) {
     case TARGET_F_DUPFD:
     case TARGET_F_GETFD:
     case TARGET_F_SETFD:
     case TARGET_F_GETFL:
     case TARGET_F_SETFL:
-        return cmd;
+        ret = cmd;
+        break;
     case TARGET_F_GETLK:
-        return F_GETLK64;
+        ret = F_GETLK64;
+        break;
     case TARGET_F_SETLK:
-        return F_SETLK64;
+        ret = F_SETLK64;
+        break;
     case TARGET_F_SETLKW:
-        return F_SETLKW64;
+        ret = F_SETLKW64;
+        break;
     case TARGET_F_GETOWN:
-        return F_GETOWN;
+        ret = F_GETOWN;
+        break;
     case TARGET_F_SETOWN:
-        return F_SETOWN;
+        ret = F_SETOWN;
+        break;
     case TARGET_F_GETSIG:
-        return F_GETSIG;
+        ret = F_GETSIG;
+        break;
     case TARGET_F_SETSIG:
-        return F_SETSIG;
+        ret = F_SETSIG;
+        break;
 #if TARGET_ABI_BITS == 32
     case TARGET_F_GETLK64:
-        return F_GETLK64;
+        ret = F_GETLK64;
+        break;
     case TARGET_F_SETLK64:
-        return F_SETLK64;
+        ret = F_SETLK64;
+        break;
     case TARGET_F_SETLKW64:
-        return F_SETLKW64;
+        ret = F_SETLKW64;
+        break;
 #endif
     case TARGET_F_SETLEASE:
-        return F_SETLEASE;
+        ret = F_SETLEASE;
+        break;
     case TARGET_F_GETLEASE:
-        return F_GETLEASE;
+        ret = F_GETLEASE;
+        break;
[Qemu-devel] [PATCH v4] linux-user: ppc64: use the correct values for F_*LK64s
Qemu includes the glibc headers for the host defines, while the target headers are part of the qemu sources themselves. The glibc has F_GETLK64, F_SETLK64 and F_SETLKW64 defined to 12, 13 and 14 for all archs in sysdeps/unix/sysv/linux/bits/fcntl-linux.h. The linux kernel generic definition for F_*LK is 5, 6 & 7, and for F_*LK64* it is 12, 13 and 14, as seen in include/uapi/asm-generic/fcntl.h. On a 64-bit machine, the kernel by default treats all F_*LK as 64-bit calls and doesn't support the use of F_*LK64*, as can be seen in include/linux/fcntl.h in the linux source.

On an x86_64 host, the values for F_*LK64* are set to 5, 6 and 7 explicitly by the glibc in /usr/include/x86_64-linux-gnu/bits/fcntl.h. A PPC64 host, on the other hand, doesn't have such a definition in /usr/include/powerpc64le-linux-gnu/bits/fcntl.h. So, the sources on a PPC64 host see the default values of F_*LK64* as 12, 13 & 14 (fcntl-linux.h). Since the 64-bit kernel doesn't support 12, 13 & 14, the glibc fcntl syscall implementation (__libc_fcntl*(), __fcntl64_nocancel) converts the F_*LK64* values back to the F_*LK* values on PPC64, as seen in sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h with the FCNTL_ADJUST_CMD() macro. On an x86_64 host the values for F_*LK64* are already 5, 6 and 7, so no adjustment is needed.

Since qemu doesn't use the glibc fcntl but issues safe_syscall* on its own, the PPC64 qemu calls the syscall with 12, 13 and 14 (without adjustment), and they all fail. The fcntl calls to F_GETLK/F_SETLK|W therefore fail for all applications run under PPC64 host user emulation.

One fix could be to investigate why on PPC64 the glibc still keeps F_*LK64* different from F_*LK, adjusting them to 5, 6 and 7 before the syscall for PPC only; see if /usr/include/powerpc64le-linux-gnu/bits/fcntl.h can be made to carry the values 5, 6 & 7 just like x86_64, and remove the adjustment code in glibc. That way, the qemu sources would see the kernel-supported values in the glibc headers.

OR

On a PPC64 host, the qemu sources see both F_*LK & F_*LK64* as the same, set to 12, 13 and 14, because __USE_FILE_OFFSET64 is defined in the qemu sources (also refer to sysdeps/unix/sysv/linux/bits/fcntl-linux.h). Do the value adjustment just as the glibc source does, using the F_GETLK value of 5. That way, we make the syscalls with the actually supported values in qemu. The patch takes this approach.

Signed-off-by: Shivaprasad G Bhat
---
v3 - https://lists.gnu.org/archive/html/qemu-devel/2018-07/msg02923.html
Changes from v3:
- Fixed the tabs for the case statements.
- Addressed the comments on v3 wrt the variable initialisation and the break from the default case.
v2 - https://lists.gnu.org/archive/html/qemu-devel/2018-07/msg02920.html
Changes from v2:
- Fixed the braces, and the indentation for comments.
v1 - https://lists.gnu.org/archive/html/qemu-devel/2018-07/msg02567.html
Changes from v1:
- Replaced the #define-based overwrite of F_*LK64* to 5, 6 and 7 with adjustment code similar to glibc's, as suggested.
- Dropped the __linux__ check for the adjustment code, as suggested.
- Moved the adjustment code inside target_to_host_fcntl_cmd to address all possible/future cases.

 linux-user/syscall.c | 126 --
 1 file changed, 80 insertions(+), 46 deletions(-)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 643b8833de..b5274f657a 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -6475,63 +6475,97 @@ static int do_fork(CPUArchState *env, unsigned int flags, abi_ulong newsp,

 /* warning : doesn't handle linux specific flags... */
 static int target_to_host_fcntl_cmd(int cmd)
 {
+    int ret;
+
     switch(cmd) {
-	case TARGET_F_DUPFD:
-	case TARGET_F_GETFD:
-	case TARGET_F_SETFD:
-	case TARGET_F_GETFL:
-	case TARGET_F_SETFL:
-        return cmd;
-    case TARGET_F_GETLK:
-        return F_GETLK64;
-    case TARGET_F_SETLK:
-        return F_SETLK64;
-    case TARGET_F_SETLKW:
-        return F_SETLKW64;
-	case TARGET_F_GETOWN:
-	    return F_GETOWN;
-	case TARGET_F_SETOWN:
-	    return F_SETOWN;
-	case TARGET_F_GETSIG:
-	    return F_GETSIG;
-	case TARGET_F_SETSIG:
-	    return F_SETSIG;
+    case TARGET_F_DUPFD:
+    case TARGET_F_GETFD:
+    case TARGET_F_SETFD:
+    case TARGET_F_GETFL:
+    case TARGET_F_SETFL:
+        ret = cmd;
+        break;
+    case TARGET_F_GETLK:
+        ret = F_GETLK64;
+        break;
+    case TARGET_F_SETLK:
+        ret = F_SETLK64;
+        break;
+    case TARGET_F_SETLKW:
+        ret = F_SETLKW64;
+        break;
+    case TARGET_F_GETOWN:
+        ret = F_GETOWN;
+        break;
+    case TARGET_F_SETOWN:
+        ret = F_SETOWN;
+        break;
+    case TARGET_F_GETSIG:
+        ret = F_GETSIG;
+        break;
+    case TARGET_F_SETSIG:
+        ret = F_SETSIG;
+        break;
 #if TARGE
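[Editor's illustration] The v4 hunk above is truncated before the part that matters most. Based on Laurent's suggestion quoted earlier ("host_cmd -= F_GETLK64 - 5"), the adjustment at the end of target_to_host_fcntl_cmd() plausibly looks like the sketch below; this is an assumption, not the merged code:

    /* Sketch: on hosts where glibc's F_*LK64 values (12..14) differ from
     * what the 64-bit kernel accepts (5..7), shift them back, using the
     * kernel's F_GETLK value of 5 as the base. */
    #if defined(__powerpc64__)
        if (ret >= F_GETLK64 && ret <= F_SETLKW64) {
            ret -= F_GETLK64 - 5;
        }
    #endif

        return ret;

Doing the adjustment here, rather than in do_fcntl(), covers every caller of target_to_host_fcntl_cmd(), including the TARGET_NR_fcntl64 path Laurent found in testing.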
[Qemu-devel] [PATCH] linux-user: ppc64: don't use volatile register during safe_syscall
r11 is a volatile register on PPC as per the calling conventions. The safe_syscall code uses it to check whether signal_pending is set during the safe_syscall. When a syscall is interrupted on return from signal handling, r11 might be corrupted before we retry the syscall, leading to a crash. The registers r0-r13 are not to be used here, as they have volatile/designated/reserved usages. Change the code to use r14, which is non-volatile and is appropriate for local use in safe_syscall.

Signed-off-by: Shivaprasad G Bhat
---
Steps to reproduce: On a PPC host, issue `qemu-ppc64le /usr/bin/cc -E -` and attempt Ctrl-C; the issue is reproduced.

Reference: https://refspecs.linuxfoundation.org/ELF/ppc64/PPC-elf64abi-1.9.html#REG

 linux-user/host/ppc64/safe-syscall.inc.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/linux-user/host/ppc64/safe-syscall.inc.S b/linux-user/host/ppc64/safe-syscall.inc.S
index d30050a67c..b0cbbe6a69 100644
--- a/linux-user/host/ppc64/safe-syscall.inc.S
+++ b/linux-user/host/ppc64/safe-syscall.inc.S
@@ -49,7 +49,7 @@ safe_syscall_base:
 	 * and returns the result in r3
 	 * Shuffle everything around appropriately.
 	 */
-	mr	11, 3	/* signal_pending */
+	mr	14, 3	/* signal_pending */
 	mr	0, 4	/* syscall number */
 	mr	3, 5	/* syscall arguments */
 	mr	4, 6
@@ -67,7 +67,7 @@ safe_syscall_base:
 	 */
 safe_syscall_start:
 	/* if signal_pending is non-zero, don't do the call */
-	lwz	12, 0(11)
+	lwz	12, 0(14)
 	cmpwi	0, 12, 0
 	bne-	0f
 	sc
[Qemu-devel] [PATCH v2] linux-user: ppc64: don't use volatile register during safe_syscall
r11 is a volatile register on PPC as per the calling conventions. The safe_syscall code uses it to check whether signal_pending is set during the safe_syscall. When a syscall is interrupted on return from signal handling, r11 might be corrupted before we retry the syscall, leading to a crash. The registers r0-r13 are not to be used here, as they have volatile/designated/reserved usages. Change the code to use r14, which is non-volatile and is appropriate for local use in safe_syscall.

Steps to reproduce: On a PPC host, issue `qemu-x86_64 /usr/bin/cc -E -` and attempt Ctrl-C; the issue is reproduced.

Reference: https://refspecs.linuxfoundation.org/ELF/ppc64/PPC-elf64abi-1.9.html#REG

Signed-off-by: Shivaprasad G Bhat
Tested-by: Laurent Vivier
Reviewed-by: Laurent Vivier
---
v1: https://lists.gnu.org/archive/html/qemu-devel/2018-07/msg05089.html
Changes from v1: Fixed the commit message as suggested.

 linux-user/host/ppc64/safe-syscall.inc.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/linux-user/host/ppc64/safe-syscall.inc.S b/linux-user/host/ppc64/safe-syscall.inc.S
index d30050a67c..b0cbbe6a69 100644
--- a/linux-user/host/ppc64/safe-syscall.inc.S
+++ b/linux-user/host/ppc64/safe-syscall.inc.S
@@ -49,7 +49,7 @@ safe_syscall_base:
 	 * and returns the result in r3
 	 * Shuffle everything around appropriately.
 	 */
-	mr	11, 3	/* signal_pending */
+	mr	14, 3	/* signal_pending */
 	mr	0, 4	/* syscall number */
 	mr	3, 5	/* syscall arguments */
 	mr	4, 6
@@ -67,7 +67,7 @@ safe_syscall_base:
 	 */
safe_syscall_start:
 	/* if signal_pending is non-zero, don't do the call */
-	lwz	12, 0(11)
+	lwz	12, 0(14)
 	cmpwi	0, 12, 0
 	bne-	0f
 	sc
Re: [Qemu-devel] [PATCH] linux-user: ppc64: don't use volatile register during safe_syscall
On 07/26/2018 10:56 PM, Richard Henderson wrote:
On 07/25/2018 11:48 PM, Shivaprasad G Bhat wrote:
Reference: https://refspecs.linuxfoundation.org/ELF/ppc64/PPC-elf64abi-1.9.html#REG

This document is for _CALL_ELF < 2. For ppc64le, the document is at https://openpowerfoundation.org/wp-content/uploads/2016/03/ABI64BitOpenPOWERv1.1_16July2015_pub4.pdf

In both cases, it appears that we can (ab)use SP+16 to save the value of r14 across the syscall. This slot would normally be used for saving our own return address (LR), but we have no need to save that value because it *is* preserved across the syscall.

I will send an updated patch as suggested.

Thanks, Shivaprasad

r~
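[Editor's illustration] For readers unfamiliar with the ABI detail Richard is pointing at: both linked documents reserve a doubleword at SP+16 for the caller's LR save. A sketch of the ELFv2 stack frame header as a C struct, purely illustrative (the struct name is made up; offsets are per the OpenPOWER ABI linked above):

    #include <stdint.h>

    struct ppc64_elfv2_frame_header {
        uint64_t back_chain; /* SP +  0: pointer to caller's frame      */
        uint64_t cr_save;    /* SP +  8: CR save word (plus padding)    */
        uint64_t lr_save;    /* SP + 16: LR save doubleword - free to
                                hold r14 here, since LR survives the sc
                                instruction and safe_syscall calls
                                nothing else                            */
        uint64_t toc_save;   /* SP + 24: TOC pointer save               */
    };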
[Qemu-devel] [PATCH v3] linux-user: ppc64: don't use volatile register during safe_syscall
r11 is a volatile register on PPC as per the calling conventions. The safe_syscall code uses it to check whether signal_pending is set during the safe_syscall. When a syscall is interrupted on return from signal handling, r11 might be corrupted before we retry the syscall, leading to a crash. The registers r0-r13 are not to be used here, as they have volatile/designated/reserved usages. Change the code to use r14, which is non-volatile. Use SP+16, which is a slot for the LR, for the save/restore of the previous value of r14. SP+16 can be used, as the LR is preserved across the syscall.

Steps to reproduce: On a PPC host, issue `qemu-x86_64 /usr/bin/cc -E -` and attempt Ctrl-C; the issue is reproduced.

Reference:
https://refspecs.linuxfoundation.org/ELF/ppc64/PPC-elf64abi-1.9.html#REG
https://openpowerfoundation.org/wp-content/uploads/2016/03/ABI64BitOpenPOWERv1.1_16July2015_pub4.pdf

Signed-off-by: Shivaprasad G Bhat
---
v2: https://lists.gnu.org/archive/html/qemu-devel/2018-07/msg05102.html
Changes from v2: Added code to store and restore the r14 register.
v1: https://lists.gnu.org/archive/html/qemu-devel/2018-07/msg05089.html
Changes from v1: Fixed the commit message as suggested.

 linux-user/host/ppc64/safe-syscall.inc.S | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/linux-user/host/ppc64/safe-syscall.inc.S b/linux-user/host/ppc64/safe-syscall.inc.S
index d30050a67c..ca85da13bd 100644
--- a/linux-user/host/ppc64/safe-syscall.inc.S
+++ b/linux-user/host/ppc64/safe-syscall.inc.S
@@ -49,7 +49,8 @@ safe_syscall_base:
 	 * and returns the result in r3
 	 * Shuffle everything around appropriately.
 	 */
-	mr	11, 3	/* signal_pending */
+	std	14, 16(1)	/* Preserve r14 in SP+16 */
+	mr	14, 3	/* signal_pending */
 	mr	0, 4	/* syscall number */
 	mr	3, 5	/* syscall arguments */
 	mr	4, 6
@@ -67,11 +68,12 @@ safe_syscall_base:
 	 */
safe_syscall_start:
 	/* if signal_pending is non-zero, don't do the call */
-	lwz	12, 0(11)
+	lwz	12, 0(14)
 	cmpwi	0, 12, 0
 	bne-	0f
 	sc
safe_syscall_end:
+	ld	14, 16(1)	/* restore r14 to its original value */
 	/* code path when we did execute the syscall */
 	bnslr+

@@ -81,6 +83,7 @@ safe_syscall_end:
 	/* code path when we didn't execute the syscall */
 0:	addi	3, 0, -TARGET_ERESTARTSYS
+	ld	14, 16(1)	/* restore r14 to its original value */
 	blr
 	.cfi_endproc
[Qemu-devel] [PATCH v4] linux-user: ppc64: don't use volatile register during safe_syscall
r11 is a volatile register on PPC as per the calling conventions. The safe_syscall code uses it to check whether signal_pending is set during the safe_syscall. When a syscall is interrupted on return from signal handling, r11 might be corrupted before we retry the syscall, leading to a crash. The registers r0-r13 are not to be used here, as they have volatile/designated/reserved usages. Change the code to use r14, which is non-volatile. Use SP+16, which is a slot for the LR, for the save/restore of the previous value of r14. SP+16 can be used, as the LR is preserved across the syscall.

Steps to reproduce: On a PPC host, issue `qemu-x86_64 /usr/bin/cc -E -` and attempt Ctrl-C; the issue is reproduced.

Reference:
https://refspecs.linuxfoundation.org/ELF/ppc64/PPC-elf64abi-1.9.html#REG
https://openpowerfoundation.org/wp-content/uploads/2016/03/ABI64BitOpenPOWERv1.1_16July2015_pub4.pdf

Signed-off-by: Shivaprasad G Bhat
Tested-by: Richard Henderson
Tested-by: Laurent Vivier
Reviewed-by: Richard Henderson
Reviewed-by: Laurent Vivier
---
v3: https://lists.gnu.org/archive/html/qemu-devel/2018-07/msg05559.html
Changes from v3: Added the cfi_offset directive as suggested, and a minor comment/code line swap.
v2: https://lists.gnu.org/archive/html/qemu-devel/2018-07/msg05102.html
Changes from v2: Added code to store and restore the r14 register.
v1: https://lists.gnu.org/archive/html/qemu-devel/2018-07/msg05089.html
Changes from v1: Fixed the commit message as suggested.

 linux-user/host/ppc64/safe-syscall.inc.S | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/linux-user/host/ppc64/safe-syscall.inc.S b/linux-user/host/ppc64/safe-syscall.inc.S
index d30050a67c..8ed73a5b86 100644
--- a/linux-user/host/ppc64/safe-syscall.inc.S
+++ b/linux-user/host/ppc64/safe-syscall.inc.S
@@ -49,7 +49,9 @@ safe_syscall_base:
 	 * and returns the result in r3
 	 * Shuffle everything around appropriately.
 	 */
-	mr	11, 3	/* signal_pending */
+	std	14, 16(1)	/* Preserve r14 in SP+16 */
+	.cfi_offset 14, 16
+	mr	14, 3	/* signal_pending */
 	mr	0, 4	/* syscall number */
 	mr	3, 5	/* syscall arguments */
 	mr	4, 6
@@ -67,12 +69,13 @@ safe_syscall_base:
 	 */
safe_syscall_start:
 	/* if signal_pending is non-zero, don't do the call */
-	lwz	12, 0(11)
+	lwz	12, 0(14)
 	cmpwi	0, 12, 0
 	bne-	0f
 	sc
safe_syscall_end:
 	/* code path when we did execute the syscall */
+	ld	14, 16(1)	/* restore r14 to its original value */
 	bnslr+

 	/* syscall failed; return negative errno */
@@ -81,6 +84,7 @@ safe_syscall_end:
 	/* code path when we didn't execute the syscall */
 0:	addi	3, 0, -TARGET_ERESTARTSYS
+	ld	14, 16(1)	/* restore r14 to its original value */
 	blr
 	.cfi_endproc
Re: [RFC Qemu PATCH v2 1/2] spapr: drc: Add support for async hcalls at the drc level
Hi David, Sorry about the delay. On 2/8/21 11:51 AM, David Gibson wrote: On Tue, Jan 19, 2021 at 12:40:31PM +0530, Shivaprasad G Bhat wrote: Thanks for the comments! On 12/28/20 2:08 PM, David Gibson wrote: On Mon, Dec 21, 2020 at 01:08:53PM +0100, Greg Kurz wrote: ... The overall idea looks good but I think you should consider using a thread pool to implement it. See below. I am not convinced, however. Specifically, attaching this to the DRC doesn't make sense to me. We're adding exactly one DRC related async hcall, and I can't really see much call for another one. We could have other async hcalls - indeed we already have one for HPT resizing - but attaching this to DRCs doesn't help for those. The semantics of the hcall made me wonder whether it would be reusable in future if implemented at the DRC level. It would only be re-usable for operations that are actually connected to DRCs. It doesn't seem to me particularly likely that we'll ever have more asynchronous hcalls that are also associated with DRCs. Okay. The other option is to move the async-hcall-state/list into the NVDIMMState structure in include/hw/mem/nvdimm.h and handle it with machine->nvdimms_state at a global level. I'm ok with either of two options: A) Implement this ad-hoc for this specific case, making whatever simplifications you can based on this specific case. I am simplifying it to the nvdimm use-case alone and limiting the scope. B) Implement a general mechanism for async hcalls that is *not* tied to DRCs. Then use that for the existing H_RESIZE_HPT_PREPARE call as well as this new one. Hope you are okay with using the pool based approach that Greg suggested. Honestly a thread pool seems like it might be overkill for this application. I think it's appropriate here, as that is what virtio-pmem does for flush requests too. The aio infrastructure simplifies a lot of the thread handling. Please suggest if you think there are better ways. I am sending the next version addressing all the comments from you and Greg. Thanks, Shivaprasad
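For concreteness, the pattern under discussion looks roughly like the following. The worker and completion names are illustrative; aio_get_thread_pool(), thread_pool_submit_aio() and qemu_fdatasync() are the existing QEMU utilities the series itself uses:

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "block/thread-pool.h"

/* Runs in a pool thread; the return value becomes 'ret' in the callback. */
static int flush_worker(void *opaque)
{
    int fd = *(int *)opaque;
    return qemu_fdatasync(fd) < 0 ? -errno : 0;
}

/* Runs back in the main loop, so guest-visible state can be updated
 * without extra locking. */
static void flush_done(void *opaque, int ret)
{
    /* record 'ret' for the guest to collect on a later hcall */
}

static void submit_flush(int *fd)
{
    ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
    thread_pool_submit_aio(pool, flush_worker, fd, flush_done, fd);
}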
[PATCH v3 0/3] spapr: nvdimm: Enable sync-dax property for nvdimm
The nvdimm devices are expected to ensure write persistence during power failure kind of scenarios. The libpmem has architecture specific instructions like dcbf on POWER to flush the cache data to the backend nvdimm device during normal writes. QEMU - virtual nvdimm devices are memory mapped. The dcbf in the guest doesn't translate to an actual flush to the backend file on the host in case of file backed v-nvdimms. This is addressed by virtio-pmem in case of x86_64 by making explicit flushes that translate to fdatasync at QEMU. On PAPR, the issue is addressed by adding a new hcall to request an explicit flush from the guest ndctl driver when the backend nvdimm cannot ensure write persistence with dcbf alone. So, the approach here is to convey when the hcall flush is required in a device tree property. The guest makes the hcall when the property is found, instead of relying on dcbf. Hcall number and semantics are finalized, so dropping the RFC prefix. A new device property sync-dax is added to the nvdimm device. When sync-dax is off (the default), the device property "hcall-flush-required" is set, and the guest makes the hcall H_SCM_FLUSH requesting an explicit flush. By default, sync-dax is "off" on all new pseries machines; prior to 5.2 it is "on". The demonstration below shows the map_sync behavior with sync-dax on & off; a minimal MAP_SYNC probe along these lines is sketched after this cover letter. (https://github.com/avocado-framework-tests/avocado-misc-tests/blob/master/memory/ndctl.py.data/map_sync.c) The pmem0 device is from an nvdimm with sync-dax=on, and pmem1 is from an nvdimm with sync-dax=off, mounted as /dev/pmem0 on /mnt1 type xfs (rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota) /dev/pmem1 on /mnt2 type xfs (rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota) [root@atest-guest ~]# ./mapsync /mnt1/newfile > when sync-dax=on [root@atest-guest ~]# ./mapsync /mnt2/newfile > when sync-dax=off Failed to mmap with Operation not supported The first patch does the header file cleanup necessary for the subsequent ones. The second patch implements the hcall and adds the necessary vmstate properties to the spapr machine structure for carrying the hcall status during save-restore. The hcall being asynchronous in nature, the patch uses aio utilities to offload the flush. The third patch adds the 'sync-dax' device property and enables the device tree property for the guest to utilise the hcall. --- v2 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg07031.html Changes from v2: - Using the thread pool based approach as suggested by Greg - Moved the async hcall handling code to spapr_nvdimm.c along with some simplifications - Added vmstate to preserve the hcall status during save-restore along with pre_save handler code to complete all ongoing flushes. - Added hw_compat magic for sync-dax 'on' on previous machines. - Miscellaneous minor fixes.
v1 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg06330.html Changes from v1: - Fixed a missed-out unlock - Using QLIST_FOREACH instead of QLIST_FOREACH_SAFE while generating the token Shivaprasad G Bhat (3): spapr: nvdimm: Forward declare and move the definitions spapr: nvdimm: Implement scm flush hcall spapr: nvdimm: Enable sync-dax device property for nvdimm hw/core/machine.c | 1 hw/mem/nvdimm.c | 1 hw/ppc/spapr.c | 6 + hw/ppc/spapr_nvdimm.c | 269 + include/hw/mem/nvdimm.h | 10 ++ include/hw/ppc/spapr.h | 12 ++ include/hw/ppc/spapr_nvdimm.h | 34 +++-- 7 files changed, 317 insertions(+), 16 deletions(-) -- Signature
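The map_sync probe referenced above reduces to a single mmap() call; a minimal standalone version (assuming a file path on a DAX-capable mount as argv[1]) could look like this. The fallback #defines cover older headers that lack the flags:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MAP_SYNC
#define MAP_SYNC 0x80000
#endif
#ifndef MAP_SHARED_VALIDATE
#define MAP_SHARED_VALIDATE 0x03
#endif

int main(int argc, char **argv)
{
    int fd = open(argv[1], O_RDWR | O_CREAT, 0644);
    if (fd < 0 || ftruncate(fd, 4096) < 0) {
        perror("open/ftruncate");
        return 1;
    }
    /* MAP_SYNC is only honoured when CPU cache flushes alone make
     * stores durable; otherwise the kernel fails with EOPNOTSUPP. */
    void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
    if (p == MAP_FAILED) {
        perror("Failed to mmap");
        return 1;
    }
    printf("MAP_SYNC supported\n");
    munmap(p, 4096);
    close(fd);
    return 0;
}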
[PATCH v3 1/3] spapr: nvdimm: Forward declare and move the definitions
The subsequent patches add definitions which tend to push the compilation into a cyclic header dependency. So, prepare with forward declarations, move the definitions and clean up. Signed-off-by: Shivaprasad G Bhat --- hw/ppc/spapr_nvdimm.c | 12 include/hw/ppc/spapr_nvdimm.h | 21 ++--- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index b46c36917c..8cf3fb2ffb 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -31,6 +31,18 @@ #include "qemu/range.h" #include "hw/ppc/spapr_numa.h" +/* + * The nvdimm size should be aligned to SCM block size. + * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE + * in order to have SCM regions not to overlap with dimm memory regions. + * The SCM devices can have variable block sizes. For now, fixing the + * block size to the minimum value. + */ +#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE + +/* Have an explicit check for alignment */ +QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE); + bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, uint64_t size, Error **errp) { diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h index 73be250e2a..abcacda5d7 100644 --- a/include/hw/ppc/spapr_nvdimm.h +++ b/include/hw/ppc/spapr_nvdimm.h @@ -11,23 +11,14 @@ #define HW_SPAPR_NVDIMM_H #include "hw/mem/nvdimm.h" -#include "hw/ppc/spapr.h" -/* - * The nvdimm size should be aligned to SCM block size. - * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE - * inorder to have SCM regions not to overlap with dimm memory regions. - * The SCM devices can have variable block sizes. For now, fixing the - * block size to the minimum value. - */ -#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE - -/* Have an explicit check for alignment */ -QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE); +struct SpaprDrc; +struct SpaprMachineState; -int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, - void *fdt, int *fdt_start_offset, Error **errp); -void spapr_dt_persistent_memory(SpaprMachineState *spapr, void *fdt); +int spapr_pmem_dt_populate(struct SpaprDrc *drc, + struct SpaprMachineState *spapr, void *fdt, + int *fdt_start_offset, Error **errp); +void spapr_dt_persistent_memory(struct SpaprMachineState *spapr, void *fdt); bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, uint64_t size, Error **errp); void spapr_add_nvdimm(DeviceState *dev, uint64_t slot);
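The technique is the usual C one for breaking an include cycle: a prototype only needs the type's name, not its layout. A toy illustration (a.h/b.h and the refcount member are hypothetical):

/* b.h: would otherwise need a.h just to name struct A in a prototype,
 * while a.h (transitively) wants b.h -- a cycle. */
struct A;                      /* forward declaration: incomplete type */
void b_frob(struct A *a);      /* fine: only a pointer crosses the API */

/* b.c: include the full definition only where members are accessed. */
#include "a.h"
#include "b.h"

void b_frob(struct A *a)
{
    a->refcount++;             /* needs the complete type from a.h */
}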
[PATCH v3 3/3] spapr: nvdimm: Enable sync-dax device property for nvdimm
The patch adds the 'sync-dax' property to the nvdimm device. When sync-dax is 'off', the device tree property "hcall-flush-required" is added to the nvdimm node, which makes the guest issue H_SCM_FLUSH hcalls to request flushes explicitly. This is the default behaviour when the sync-dax property is not set for the nvdimm device. sync-dax="on" means the guest need not make flush requests to QEMU. On previous machine versions, sync-dax is set to "on" by default using the hw_compat magic. Signed-off-by: Shivaprasad G Bhat --- hw/core/machine.c | 1 + hw/mem/nvdimm.c | 1 + hw/ppc/spapr_nvdimm.c | 17 + include/hw/mem/nvdimm.h | 10 ++ include/hw/ppc/spapr.h | 1 + 5 files changed, 30 insertions(+) diff --git a/hw/core/machine.c b/hw/core/machine.c index 257a664ea2..f843643574 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -41,6 +41,7 @@ GlobalProperty hw_compat_5_2[] = { { "PIIX4_PM", "smm-compat", "on"}, { "virtio-blk-device", "report-discard-granularity", "off" }, { "virtio-net-pci", "vectors", "3"}, +{ "nvdimm", "sync-dax", "on" }, }; const size_t hw_compat_5_2_len = G_N_ELEMENTS(hw_compat_5_2); diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index 7397b67156..8f0e29b191 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -229,6 +229,7 @@ static void nvdimm_write_label_data(NVDIMMDevice *nvdimm, const void *buf, static Property nvdimm_properties[] = { DEFINE_PROP_BOOL(NVDIMM_UNARMED_PROP, NVDIMMDevice, unarmed, false), +DEFINE_PROP_BOOL(NVDIMM_SYNC_DAX_PROP, NVDIMMDevice, sync_dax, false), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index 883317c1ed..dd1c90251b 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -125,6 +125,9 @@ static int spapr_dt_nvdimm(SpaprMachineState *spapr, void *fdt, uint64_t lsize = nvdimm->label_size; uint64_t size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP, NULL); +bool sync_dax = object_property_get_bool(OBJECT(nvdimm), + NVDIMM_SYNC_DAX_PROP, + &error_abort); drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, slot); g_assert(drc); @@ -159,6 +162,11 @@ static int spapr_dt_nvdimm(SpaprMachineState *spapr, void *fdt, "operating-system"))); _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0)); +if (!sync_dax) { +_FDT(fdt_setprop(fdt, child_offset, "ibm,hcall-flush-required", + NULL, 0)); +} + return child_offset; } @@ -567,10 +575,12 @@ static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr, target_ulong opcode, target_ulong *args) { int ret; +bool sync_dax; uint32_t drc_index = args[0]; uint64_t continue_token = args[1]; SpaprDrc *drc = spapr_drc_by_index(drc_index); PCDIMMDevice *dimm; +NVDIMMDevice *nvdimm; HostMemoryBackend *backend = NULL; SpaprNVDIMMDeviceFlushState *state; ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context()); @@ -580,6 +590,13 @@ static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr, return H_PARAMETER; } +nvdimm = NVDIMM(drc->dev); +sync_dax = object_property_get_bool(OBJECT(nvdimm), NVDIMM_SYNC_DAX_PROP, +&error_abort); +if (sync_dax) { +return H_UNSUPPORTED; +} + if (continue_token != 0) { ret = spapr_nvdimm_get_flush_status(continue_token); if (H_IS_LONG_BUSY(ret)) { diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index bcf62f825c..f82979cf2f 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -51,6 +51,7 @@ OBJECT_DECLARE_TYPE(NVDIMMDevice, NVDIMMClass, NVDIMM) #define NVDIMM_LABEL_SIZE_PROP "label-size" #define NVDIMM_UUID_PROP "uuid"
#define NVDIMM_UNARMED_PROP "unarmed" +#define NVDIMM_SYNC_DAX_PROP "sync-dax" struct NVDIMMDevice { /* private */ @@ -85,6 +86,15 @@ struct NVDIMMDevice { */ bool unarmed; +/* + * On PPC64, + * the 'off' value results in the hcall-flush-required property set + * in the device tree for pseries machines. When 'off', the guest + * initiates explicit flush requests to the backend device ensuring + * write persistence. + */ +bool sync_dax; + /* * The PPC64 - spapr requires each nvdimm device have a uuid. */ diff --git a/include
[PATCH v3 2/3] spapr: nvdimm: Implement H_SCM_FLUSH hcall
The patch adds support for the SCM flush hcall for the nvdimm devices. It is made available for exploitation by the guest through the next patch. The hcall semantics are such that the flush returns H_BUSY, along with a continue_token, when the operation is expected to take longer. The hcall is to be called again with the continue_token to get the status. So, all fresh requests are put into a 'pending' list and a flush worker is submitted to the thread pool. The thread pool completion callbacks move the requests to a 'completed' list, which is cleaned up after reporting to the guest in subsequent hcalls to get the status. The semantics make it necessary to preserve the continue_tokens and their return status even across migrations. So, the pre_save handler for the device waits for the flush worker to complete and collects all the hcall states from the 'completed' list. The necessary nvdimm flush specific vmstate structures are added to the spapr machine vmstate. Signed-off-by: Shivaprasad G Bhat --- hw/ppc/spapr.c | 6 + hw/ppc/spapr_nvdimm.c | 240 + include/hw/ppc/spapr.h | 11 ++ include/hw/ppc/spapr_nvdimm.h | 12 ++ 4 files changed, 268 insertions(+), 1 deletion(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index d56418ca29..fdb0c73a2c 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1607,6 +1607,8 @@ static void spapr_machine_reset(MachineState *machine) spapr->ov5_cas = spapr_ovec_clone(spapr->ov5); } +spapr_nvdimm_finish_flushes(); + /* DRC reset may cause a device to be unplugged. This will cause troubles * if this device is used by another device (eg, a running vhost backend * will crash QEMU if the DIMM holding the vring goes away). To avoid such @@ -2003,6 +2005,7 @@ static const VMStateDescription vmstate_spapr = { &vmstate_spapr_cap_ccf_assist, &vmstate_spapr_cap_fwnmi, &vmstate_spapr_fwnmi, +&vmstate_spapr_nvdimm_flush_states, NULL } }; @@ -2997,6 +3000,9 @@ static void spapr_machine_init(MachineState *machine) } qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond); +qemu_mutex_init(&spapr->spapr_nvdimm_flush_states_lock); +QLIST_INIT(&spapr->pending_flush_states); +QLIST_INIT(&spapr->completed_flush_states); } #define DEFAULT_KVM_TYPE "auto" diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index 8cf3fb2ffb..883317c1ed 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -22,14 +22,17 @@ * THE SOFTWARE. */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "qapi/error.h" #include "hw/ppc/spapr_drc.h" #include "hw/ppc/spapr_nvdimm.h" #include "hw/mem/nvdimm.h" +#include "qemu/guest-random.h" #include "qemu/nvdimm-utils.h" #include "hw/ppc/fdt.h" #include "qemu/range.h" #include "hw/ppc/spapr_numa.h" +#include "block/thread-pool.h" /* * The nvdimm size should be aligned to SCM block size.
@@ -371,6 +374,242 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr, return H_SUCCESS; } +static const VMStateDescription vmstate_spapr_nvdimm_entry = { + .name = "spapr_nvdimm_states", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT64(continue_token, SpaprNVDIMMDeviceFlushState), + VMSTATE_INT64(hcall_ret, SpaprNVDIMMDeviceFlushState), + VMSTATE_END_OF_LIST() + }, +}; + +static bool spapr_nvdimm_states_needed(void *opaque) +{ + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); + + return (!QLIST_EMPTY(&spapr->pending_flush_states) || + !QLIST_EMPTY(&spapr->completed_flush_states)); +} + +static int spapr_nvdimm_pre_save(void *opaque) +{ +SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); + +while (!QLIST_EMPTY(&spapr->pending_flush_states)) { +aio_poll(qemu_get_aio_context(), true); +} + +return 0; +} + +const VMStateDescription vmstate_spapr_nvdimm_flush_states = { +.name = "spapr_nvdimm_hcall_states", +.version_id = 1, +.minimum_version_id = 1, +.needed = spapr_nvdimm_states_needed, +.pre_save = spapr_nvdimm_pre_save, +.fields = (VMStateField[]) { +VMSTATE_QLIST_V(completed_flush_states, SpaprMachineState, 1, +vmstate_spapr_nvdimm_entry, +SpaprNVDIMMDeviceFlushState, node), +VMSTATE_END_OF_LIST() +}, +}; + +/* + * Acquire a unique token and reserve it for the new flush state. + */ +static SpaprNVDIMMDeviceFlushState *spapr_nvdimm_init_new_flush_state(void) +{ +Error *err = NULL; +uint64_t token; +SpaprMachineSta
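On the guest side, the continue_token protocol implies a poll loop of roughly the following shape. This is a self-contained sketch with a stubbed hypervisor standing in for the real hcall; H_LONG_BUSY_ORDER_10_MSEC's value here is only illustrative of the PAPR long-busy return range:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define H_SUCCESS                  0
#define H_LONG_BUSY_ORDER_10_MSEC  9901   /* illustrative long-busy code */

/* Stub hypervisor: pretend the flush needs three polls to complete. */
static long hcall_scm_flush(uint32_t drc_index, uint64_t token, uint64_t *out)
{
    static int polls;
    (void)drc_index; (void)token;
    *out = 0x1234;                        /* continue_token handed back */
    return (++polls < 3) ? H_LONG_BUSY_ORDER_10_MSEC : H_SUCCESS;
}

static int scm_flush(uint32_t drc_index)
{
    uint64_t token = 0, out;
    long rc;

    do {
        rc = hcall_scm_flush(drc_index, token, &out);
        token = out;                      /* retry with the continue_token */
    } while (rc == H_LONG_BUSY_ORDER_10_MSEC);

    return rc == H_SUCCESS ? 0 : -EIO;
}

int main(void)
{
    printf("flush rc = %d\n", scm_flush(42));
    return 0;
}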
Re: [PATCH v3 2/3] spapr: nvdimm: Implement H_SCM_FLUSH hcall
On 3/25/21 7:21 AM, David Gibson wrote: On Wed, Mar 24, 2021 at 09:34:06AM +0530, Aneesh Kumar K.V wrote: On 3/24/21 8:37 AM, David Gibson wrote: On Tue, Mar 23, 2021 at 09:47:38AM -0400, Shivaprasad G Bhat wrote: The patch adds support for the SCM flush hcall for the nvdimm devices. ... collects all the hcall states from 'completed' list. The necessary nvdimm flush specific vmstate structures are added to the spapr machine vmstate. Signed-off-by: Shivaprasad G Bhat An overall question: surely the same issue must arise on x86 with file-backed NVDIMMs. How do they handle this case? On x86 we have different ways an nvdimm can be discovered: ACPI NFIT, the e820 map and virtio_pmem. Among these, virtio_pmem always operated with synchronous dax disabled, and neither ACPI nor e820 has the ability to differentiate support for synchronous dax. Ok. And for the virtio-pmem case, how are the extra flushes actually done on x86? The virtio-pmem device has a virtqueue with virtio_pmem_flush() as the handler, which gets called for all flush requests from the guest. virtio_pmem_flush() offloads the flush to a thread pool, with a worker doing fsync() and the completion callback notifying the guest with the response. With that I would expect users to use virtio_pmem when using file backed NVDIMMs So... should we prevent advertising an NVDIMM through ACPI or e820 if it doesn't have sync-dax enabled? Is it possible to have different defaults for sync-dax based on architecture? The behaviour on x86 is sync-dax=on for nvdimms. So, it would be correct to have the default as 'on' for x86. For pseries - 'off' for new machines. Looking at the code, I didn't find a good way to achieve this. Can you suggest what can be done?
Re: [PATCH v3 2/3] spapr: nvdimm: Implement H_SCM_FLUSH hcall
On 3/24/21 8:37 AM, David Gibson wrote: On Tue, Mar 23, 2021 at 09:47:38AM -0400, Shivaprasad G Bhat wrote: machine vmstate. Signed-off-by: Shivaprasad G Bhat An overall question: surely the same issue must arise on x86 with file-backed NVDIMMs. How do they handle this case? Discussed in the other threads. }; @@ -2997,6 +3000,9 @@ static void spapr_machine_init(MachineState *machine) } qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond); +qemu_mutex_init(&spapr->spapr_nvdimm_flush_states_lock); Do you actually need an extra mutex, or can you rely on the BQL? I verified the BQL is held at all places where it matters in the context of this patch. It is safe to get rid of this extra mutex. ... +{ + SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); + + return (!QLIST_EMPTY(&spapr->pending_flush_states) || + !QLIST_EMPTY(&spapr->completed_flush_states)); +} + +static int spapr_nvdimm_pre_save(void *opaque) +{ +SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); + +while (!QLIST_EMPTY(&spapr->pending_flush_states)) { +aio_poll(qemu_get_aio_context(), true); Hmm... how long could waiting for all the pending flushes to complete take? This could add substantially to the guest's migration downtime, couldn't it? The time taken depends on the number of dirtied pages and the disk I/O write speed. The number of dirty pages on the host is configurable with the tunables vm.dirty_background_ratio (10% default on Fedora 32, Ubuntu 20.04) and vm.dirty_ratio (20%) of host memory, and/or vm.dirty_expire_centisecs (30 seconds). So, the host itself would be flushing the mmaped file on its own from time to time. For guests using the nvdimms with a filesystem, the flushes would come frequently and the number of dirty pages might be small. The pmem applications can use the nvdimms without a filesystem, and for such guests the chance that a flush request comes from pmem applications at the time of migration is small, or random. But the host would have flushed the pagecache on its own when vm.dirty_background_ratio is crossed or vm.dirty_expire_centisecs has expired. So, the worst case stands at the disk I/O latency for writing the pages dirtied in the last vm.dirty_expire_centisecs on the host, OR the latency for writing at most vm.dirty_background_ratio (10%) of host RAM. If you want me to calibrate any particular size or scenario and get the numbers, please let me know. ... + +/* + * Acquire a unique token and reserve it for the new flush state. + */ +static SpaprNVDIMMDeviceFlushState *spapr_nvdimm_init_new_flush_state(void) +{ +Error *err = NULL; +uint64_t token; +SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); +SpaprNVDIMMDeviceFlushState *tmp, *next, *state; + +state = g_malloc0(sizeof(*state)); + +qemu_mutex_lock(&spapr->spapr_nvdimm_flush_states_lock); +retry: +if (qemu_guest_getrandom(&token, sizeof(token), &err) < 0) { Using getrandom seems like overkill, why not just use a counter? I didn't want a malicious guest to abuse the interface by guessing a valid counter-based token and causing the real driver's request to fail subsequently. Also, carrying the global counter to the destination across guest migrations is another thing to ponder. Let me know if you want me to reconsider using a counter. ...
mm_flush_states_lock); + +return state; +} + +/* + * spapr_nvdimm_finish_flushes + * Waits for all pending flush requests to complete + * their execution and free the states + */ +void spapr_nvdimm_finish_flushes(void) +{ +SpaprNVDIMMDeviceFlushState *state, *next; +SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); The caller has natural access to the machine, so pass it in rather than using the global. Okay. ... + +/* + * spapr_nvdimm_get_hcall_status + * Fetches the status of the hcall worker and returns H_BUSY + * if the worker is still running. + */ +static int spapr_nvdimm_get_flush_status(uint64_t token) +{ +int ret = H_LONG_BUSY_ORDER_10_MSEC; +SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); The callers have natural access to spapr, so pass it in rather than using the global. Okay. ... + +/* + * H_SCM_FLUSH + * Input: drc_index, continue-token + * Out: continue-token + * Return Value: H_SUCCESS, H_Parameter, H_P2, H_BUSY + * + * Given a DRC Index Flush the data to backend NVDIMM device. + * The hcall returns H_BUSY when the flush takes longer time and the hcall It returns one of the H_LONG_BUSY values, not actual H_BUSY, doesn't it? Yes. I thought it's okay to call it just H_BUSY in a generic way. Will fix it. + * needs to be issued multiple times in order to be completely serviced. +} + +return ret; +} + +dimm = PC_DIMM(drc->dev); +backend = MEMORY_BACKEND(dimm->hostmem); +
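For the record, the later revisions of the series did move to a counter (a static uint64_t flush_token appears in the v4/v5 patches further down). A sketch of that scheme, assuming token value 0 stays reserved to mean "new request" as in the hcall above:

#include <stdint.h>

static uint64_t flush_token;

/* Hand out a non-zero, monotonically increasing continue_token. */
static uint64_t next_flush_token(void)
{
    do {
        flush_token++;
    } while (flush_token == 0);   /* skip 0: it denotes a fresh request */
    return flush_token;
}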
Re: [PATCH] ppc/spapr: Add support for H_SCM_HEALTH
Hi Vaibhav, Some comments inline. On 3/29/21 9:52 PM, Vaibhav Jain wrote: Add support for H_SCM_HEALTH hcall described at [1] for spapr nvdimms. This enables the guest to detect the 'unarmed' status of a specific spapr nvdimm identified by its DRC and, if it's unarmed, mark the region backed by the nvdimm as read-only. The patch adds h_scm_health() to handle the H_SCM_HEALTH hcall, which returns two 64-bit bitmaps (health bitmap, health bitmap mask) derived from the 'struct nvdimm->unarmed' member. Linux kernel side changes to enable handling of 'unarmed' nvdimms for ppc64 are proposed at [2]. References: [1] "Hypercall Op-codes (hcalls)" https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/powerpc/papr_hcalls.rst [2] "powerpc/papr_scm: Mark nvdimm as unarmed if needed during probe" https://lore.kernel.org/linux-nvdimm/20210329113103.476760-1-vaib...@linux.ibm.com/ Signed-off-by: Vaibhav Jain --- hw/ppc/spapr_nvdimm.c | 30 ++ include/hw/ppc/spapr.h | 4 ++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index b46c36917c..e38740036d 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -31,6 +31,13 @@ #include "qemu/range.h" #include "hw/ppc/spapr_numa.h" +/* DIMM health bitmap indicators */ +/* SCM device is unable to persist memory contents */ +#define PAPR_PMEM_UNARMED (1ULL << (63 - 0)) + +/* Bits status indicators for health bitmap indicating unarmed dimm */ +#define PAPR_PMEM_UNARMED_MASK (PAPR_PMEM_UNARMED) + bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, uint64_t size, Error **errp) { @@ -467,6 +474,28 @@ static target_ulong h_scm_unbind_all(PowerPCCPU *cpu, SpaprMachineState *spapr, return H_SUCCESS; } +static target_ulong h_scm_health(PowerPCCPU *cpu, SpaprMachineState *spapr, + target_ulong opcode, target_ulong *args) +{ +uint32_t drc_index = args[0]; +SpaprDrc *drc = spapr_drc_by_index(drc_index); +NVDIMMDevice *nvdimm; + +if (drc && spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) { +return H_PARAMETER; +} + Please check if drc->dev is not NULL too. DRCs are created in advance and drc->dev may not be assigned if the device is not plugged yet. +nvdimm = NVDIMM(drc->dev); + +/* Check if the nvdimm is unarmed and send its status via health bitmaps */ +args[0] = nvdimm->unarmed ? PAPR_PMEM_UNARMED_MASK : 0; Please use object_property_get_bool to fetch the unarmed value. + +/* health bitmap mask same as the health bitmap */ +args[1] = args[0]; + +return H_SUCCESS; +} + static void spapr_scm_register_types(void) { ... Thanks, Shivaprasad
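Folding both review comments in, the hcall body would look roughly like this -- a sketch of where the review points, not the final merged code:

static target_ulong h_scm_health(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                 target_ulong opcode, target_ulong *args)
{
    NVDIMMDevice *nvdimm;
    uint32_t drc_index = args[0];
    SpaprDrc *drc = spapr_drc_by_index(drc_index);

    /* Reject a missing/wrong-type DRC, and one with no device plugged yet. */
    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    nvdimm = NVDIMM(drc->dev);

    /* Report unarmed state via the health bitmap; use the property
     * accessor rather than poking at the struct member directly. */
    args[0] = object_property_get_bool(OBJECT(nvdimm), NVDIMM_UNARMED_PROP,
                                       &error_abort)
              ? PAPR_PMEM_UNARMED_MASK : 0;
    args[1] = args[0];   /* health bitmap mask mirrors the bitmap */

    return H_SUCCESS;
}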
[PATCH] spapr: nvdimm: Fix the persistent-memory root node name in device tree
The FDT code is adding the pmem root node by the name "persistent-memory", which should have been "ibm,persistent-memory". Linux fetches the device tree nodes by type, and it has been working correctly as the type is correct. If someone searches by the intended name it would fail, so fix that. Reported-by: Aneesh Kumar K.V Signed-off-by: Shivaprasad G Bhat --- hw/ppc/spapr_nvdimm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index 252204e25f..d7a4a0a051 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -163,11 +163,11 @@ int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, void spapr_dt_persistent_memory(SpaprMachineState *spapr, void *fdt) { -int offset = fdt_subnode_offset(fdt, 0, "persistent-memory"); +int offset = fdt_subnode_offset(fdt, 0, "ibm,persistent-memory"); GSList *iter, *nvdimms = nvdimm_get_device_list(); if (offset < 0) { -offset = fdt_add_subnode(fdt, 0, "persistent-memory"); +offset = fdt_add_subnode(fdt, 0, "ibm,persistent-memory"); _FDT(offset); _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x1))); _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));
[PATCH v5 3/3] spapr: nvdimm: Introduce spapr-nvdimm device
If the device backend is not persistent memory for the nvdimm, there is a need for explicit IO flushes on the backend to ensure persistence. On SPAPR, the issue is addressed by adding a new hcall to request an explicit flush from the guest when the backend is not pmem. So, the approach here is to convey when the hcall flush is required in a device tree property. The guest, once it knows the device backend is not pmem, makes the hcall whenever a flush is required. To set the device tree property, the patch introduces a new PAPR-specific device type inheriting the nvdimm device. When the backend doesn't have pmem="yes", the device tree property "ibm,hcall-flush-required" is set, and the guest makes the hcall H_SCM_FLUSH requesting an explicit flush. Signed-off-by: Shivaprasad G Bhat --- hw/ppc/spapr_nvdimm.c | 46 + include/hw/ppc/spapr_nvdimm.h | 4 2 files changed, 50 insertions(+) diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index d460a098c0..9a04df4c47 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -54,6 +54,8 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, { const MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev); const MachineState *ms = MACHINE(hotplug_dev); +PCDIMMDevice *dimm = PC_DIMM(nvdimm); +MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem); g_autofree char *uuidstr = NULL; QemuUUID uuid; int ret; @@ -91,6 +93,14 @@ bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, return false; } +if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM) && +(memory_region_get_fd(mr) < 0)) { +error_setg(errp, "spapr-nvdimm device requires the " + "memdev %s to be of memory-backend-file type", + object_get_canonical_path_component(OBJECT(dimm->hostmem))); +return false; +} + return true; } @@ -162,6 +172,21 @@ static int spapr_dt_nvdimm(SpaprMachineState *spapr, void *fdt, "operating-system"))); _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0)); +if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) { +bool is_pmem = false; +#ifdef CONFIG_LIBPMEM +PCDIMMDevice *dimm = PC_DIMM(nvdimm); +HostMemoryBackend *hostmem = dimm->hostmem; + +is_pmem = object_property_get_bool(OBJECT(hostmem), "pmem", + &error_abort); +#endif +if (!is_pmem) { +_FDT(fdt_setprop(fdt, child_offset, "ibm,hcall-flush-required", + NULL, 0)); +} +} + return child_offset; } @@ -585,7 +610,16 @@ static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr, } dimm = PC_DIMM(drc->dev); +if (!object_dynamic_cast(OBJECT(dimm), TYPE_SPAPR_NVDIMM)) { +return H_PARAMETER; +} + backend = MEMORY_BACKEND(dimm->hostmem); +#ifdef CONFIG_LIBPMEM +if (object_property_get_bool(OBJECT(backend), "pmem", &error_abort)) { +return H_UNSUPPORTED; +} +#endif fd = memory_region_get_fd(&backend->mr); if (fd < 0) { @@ -766,3 +800,15 @@ static void spapr_scm_register_types(void) } type_init(spapr_scm_register_types) + +static TypeInfo spapr_nvdimm_info = { +.name = TYPE_SPAPR_NVDIMM, +.parent = TYPE_NVDIMM, +}; + +static void spapr_nvdimm_register_types(void) +{ +type_register_static(&spapr_nvdimm_info); +} + +type_init(spapr_nvdimm_register_types) diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h index 24d8e37b33..fb4e56418e 100644 --- a/include/hw/ppc/spapr_nvdimm.h +++ b/include/hw/ppc/spapr_nvdimm.h @@ -13,6 +13,10 @@ #include "hw/mem/nvdimm.h" #include "migration/vmstate.h" +#define TYPE_SPAPR_NVDIMM "spapr-nvdimm" +OBJECT_DECLARE_SIMPLE_TYPE(SpaprNVDIMMDevice, SPAPR_NVDIMM)
+typedef struct SpaprNVDIMMDevice SpaprNVDIMMDevice; typedef struct SpaprDrc SpaprDrc; typedef struct SpaprMachineState SpaprMachineState;
[PATCH v5 0/3] spapr: nvdimm: Introduce spapr-nvdimm device
If the device backend is not persistent memory for the nvdimm, there is a need for explicit IO flushes to ensure persistence. On SPAPR, the issue is addressed by adding a new hcall to request an explicit flush from the guest when the backend is not pmem. So, the approach here is to convey when the hcall flush is required in a device tree property. The guest, once it knows the device needs explicit flushes, makes the hcall as and when required. It was suggested to create a new device type to address the explicit flush for such backends on PPC instead of extending the generic nvdimm device with a new property. So, the patch introduces the spapr-nvdimm device. The new device inherits the nvdimm device, with the new behaviour that if the backend has pmem=no, the device tree property is set. The demonstration below shows the map_sync behavior for non-pmem backends. (https://github.com/avocado-framework-tests/avocado-misc-tests/blob/master/memory/ndctl.py.data/map_sync.c) The pmem0 device is from a spapr-nvdimm with backend pmem=yes, and pmem1 is from a spapr-nvdimm with pmem=no, mounted as /dev/pmem0 on /mnt1 type xfs (rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota) /dev/pmem1 on /mnt2 type xfs (rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota) [root@atest-guest ~]# ./mapsync /mnt1/newfile > when pmem=yes [root@atest-guest ~]# ./mapsync /mnt2/newfile > when pmem=no Failed to mmap with Operation not supported The first patch does the header file cleanup necessary for the subsequent ones. The second patch implements the hcall and adds the necessary vmstate properties to the spapr machine structure for carrying the hcall status during save-restore. The hcall being asynchronous in nature, the patch uses aio utilities to offload the flush. The third patch introduces the spapr-nvdimm device and adds the device tree property for the guest when spapr-nvdimm is used with pmem="no" on the backend. The kernel changes to exploit this hcall are at https://github.com/linuxppc/linux/commit/75b7c05ebf9026.patch --- v4 - https://lists.gnu.org/archive/html/qemu-devel/2021-04/msg05982.html Changes from v4: - Introduce spapr-nvdimm device with nvdimm device as the parent. - The new spapr-nvdimm has no new properties. As this is a new device and there are no migration related dependencies to be taken care of, the device behavior is made to set the device tree property and enable the hcall when the device type spapr-nvdimm is used with pmem="no" - Fixed commit messages - Added checks to ensure the backend is actually a file and not memory - Addressed things pointed out by Eric v3 - https://lists.gnu.org/archive/html/qemu-devel/2021-03/msg07916.html Changes from v3: - Fixed the forward declaration coding guideline violations in 1st patch. - Removed the code waiting for the flushes to complete during migration, instead restart the flush worker on destination qemu in post load. - Got rid of the randomization of the flush tokens, using a simple counter. - Got rid of the redundant flush state lock, relying on the BQL now. - Handling the memory-backend-ram usage - Changed the sync-dax semantics from on/off to 'unsafe', 'writeback' and 'direct'. Added prevention code using 'writeback' on arm and x86_64. - Fixed all the miscellaneous comments.
v2 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg07031.html Changes from v2: - Using the thread pool based approach as suggested - Moved the async hcall handling code to spapr_nvdimm.c along with some simplifications - Added vmstate to preserve the hcall status during save-restore along with pre_save handler code to complete all ongoing flushes. - Added hw_compat magic for sync-dax 'on' on previous machines. - Miscellaneous minor fixes. v1 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg06330.html Changes from v1: - Fixed a missed-out unlock - Using QLIST_FOREACH instead of QLIST_FOREACH_SAFE while generating the token Shivaprasad G Bhat (3): spapr: nvdimm: Forward declare and move the definitions spapr: nvdimm: Implement H_SCM_FLUSH hcall spapr: nvdimm: Introduce spapr-nvdimm device hw/ppc/spapr.c | 6 + hw/ppc/spapr_nvdimm.c | 298 + include/hw/ppc/spapr.h | 11 +- include/hw/ppc/spapr_nvdimm.h | 29 ++-- 4 files changed, 332 insertions(+), 12 deletions(-) -- Signature
[PATCH v5 1/3] spapr: nvdimm: Forward declare and move the definitions
The subsequent patches add definitions which tend to push the compilation into a cyclic header dependency. So, prepare with forward declarations, move the definitions and clean up. Signed-off-by: Shivaprasad G Bhat --- hw/ppc/spapr_nvdimm.c | 12 include/hw/ppc/spapr_nvdimm.h | 14 ++ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index 252204e25f..3f57a8b6fa 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -35,6 +35,18 @@ /* SCM device is unable to persist memory contents */ #define PAPR_PMEM_UNARMED PPC_BIT(0) +/* + * The nvdimm size should be aligned to SCM block size. + * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE + * in order to have SCM regions not to overlap with dimm memory regions. + * The SCM devices can have variable block sizes. For now, fixing the + * block size to the minimum value. + */ +#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE + +/* Have an explicit check for alignment */ +QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE); + bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, uint64_t size, Error **errp) { diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h index 73be250e2a..764f999f54 100644 --- a/include/hw/ppc/spapr_nvdimm.h +++ b/include/hw/ppc/spapr_nvdimm.h @@ -11,19 +11,9 @@ #define HW_SPAPR_NVDIMM_H #include "hw/mem/nvdimm.h" -#include "hw/ppc/spapr.h" -/* - * The nvdimm size should be aligned to SCM block size. - * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE - * inorder to have SCM regions not to overlap with dimm memory regions. - * The SCM devices can have variable block sizes. For now, fixing the - * block size to the minimum value. - */ -#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE - -/* Have an explicit check for alignment */ -QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE); +typedef struct SpaprDrc SpaprDrc; +typedef struct SpaprMachineState SpaprMachineState; int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, void *fdt, int *fdt_start_offset, Error **errp);
[PATCH v5 2/3] spapr: nvdimm: Implement H_SCM_FLUSH hcall
The patch adds support for the SCM flush hcall for the nvdimm devices. It is made available for exploitation by the guest through the next patch. The hcall semantics are such that the flush returns one of the H_LONG_BUSY values, along with a continue_token, when the operation is expected to take longer. The hcall is to be called again with the continue_token to get the status. So, all fresh requests are put into a 'pending' list and a flush worker is submitted to the thread pool. The thread pool completion callbacks move the requests to a 'completed' list, which is cleaned up after reporting to the guest in subsequent hcalls to get the status. The semantics make it necessary to preserve the continue_tokens and their return status across migrations. So, the completed flush states are forwarded to the destination and the pending ones are restarted at the destination in post_load. The necessary nvdimm flush specific vmstate structures are added to the spapr machine vmstate. Signed-off-by: Shivaprasad G Bhat --- hw/ppc/spapr.c | 6 + hw/ppc/spapr_nvdimm.c | 240 + include/hw/ppc/spapr.h | 11 ++ include/hw/ppc/spapr_nvdimm.h | 13 ++ 4 files changed, 269 insertions(+), 1 deletion(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index c23bcc4490..7a29ea2b05 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1622,6 +1622,8 @@ static void spapr_machine_reset(MachineState *machine) spapr->ov5_cas = spapr_ovec_clone(spapr->ov5); } +spapr_nvdimm_finish_flushes(spapr); + /* DRC reset may cause a device to be unplugged. This will cause troubles * if this device is used by another device (eg, a running vhost backend * will crash QEMU if the DIMM holding the vring goes away). To avoid such @@ -2018,6 +2020,7 @@ static const VMStateDescription vmstate_spapr = { &vmstate_spapr_cap_ccf_assist, &vmstate_spapr_cap_fwnmi, &vmstate_spapr_fwnmi, +&vmstate_spapr_nvdimm_states, NULL } }; @@ -3012,6 +3015,9 @@ static void spapr_machine_init(MachineState *machine) } qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond); + +QLIST_INIT(&spapr->pending_flush_states); +QLIST_INIT(&spapr->completed_flush_states); } #define DEFAULT_KVM_TYPE "auto" diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index 3f57a8b6fa..d460a098c0 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -22,6 +22,7 @@ * THE SOFTWARE. */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "qapi/error.h" #include "hw/ppc/spapr_drc.h" #include "hw/ppc/spapr_nvdimm.h" @@ -30,6 +31,7 @@ #include "hw/ppc/fdt.h" #include "qemu/range.h" #include "hw/ppc/spapr_numa.h" +#include "block/thread-pool.h" /* DIMM health bitmap indicators.
Taken from kernel's papr_scm.c */ /* SCM device is unable to persist memory contents */ @@ -375,6 +377,243 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr, return H_SUCCESS; } +static uint64_t flush_token; + +static int flush_worker_cb(void *opaque) +{ +int ret = H_SUCCESS; +SpaprNVDIMMDeviceFlushState *state = opaque; + +/* flush raw backing image */ +if (qemu_fdatasync(state->backend_fd) < 0) { +error_report("papr_scm: Could not sync nvdimm to backend file: %s", + strerror(errno)); +ret = H_HARDWARE; +} + +return ret; +} + +static void spapr_nvdimm_flush_completion_cb(void *opaque, int hcall_ret) +{ +SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); +SpaprNVDIMMDeviceFlushState *state = opaque; + +state->hcall_ret = hcall_ret; +QLIST_REMOVE(state, node); +QLIST_INSERT_HEAD(&spapr->completed_flush_states, state, node); +} + +static const VMStateDescription vmstate_spapr_nvdimm_flush_state = { + .name = "spapr_nvdimm_flush_state", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT64(continue_token, SpaprNVDIMMDeviceFlushState), + VMSTATE_INT64(hcall_ret, SpaprNVDIMMDeviceFlushState), + VMSTATE_UINT32(drcidx, SpaprNVDIMMDeviceFlushState), + VMSTATE_END_OF_LIST() + }, +}; + +static bool spapr_nvdimm_states_needed(void *opaque) +{ + SpaprMachineState *spapr = (SpaprMachineState *)opaque; + + return (!QLIST_EMPTY(&spapr->pending_flush_states) || + !QLIST_EMPTY(&spapr->completed_flush_states)); +} + +static int spapr_nvdimm_post_load(void *opaque, int version_id) +{ +SpaprMachineState *spapr = (SpaprMachineState *)opaque; +SpaprNVDIMMDeviceFlushState *state, *next; +PCDIMMDevice *dimm; +HostMemoryBackend *backend = NULL
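The hunk above is truncated mid-function. Going by the commit message, the remainder of spapr_nvdimm_post_load() resubmits every migrated pending flush on the destination. A sketch of that remainder, reconstructed from the description (not the literal patch text), using the variables already declared in the visible part of the hunk:

/* Sketch: restart each pending flush that arrived via migration. */
QLIST_FOREACH_SAFE(state, &spapr->pending_flush_states, node, next) {
    drc = spapr_drc_by_index(state->drcidx);
    dimm = PC_DIMM(drc->dev);
    backend = MEMORY_BACKEND(dimm->hostmem);
    state->backend_fd = memory_region_get_fd(&backend->mr);

    thread_pool_submit_aio(pool, flush_worker_cb, state,
                           spapr_nvdimm_flush_completion_cb, state);
}

return 0;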
[PATCH v4 0/3] nvdimm: Enable sync-dax property for nvdimm
The nvdimm devices are expected to ensure write persistence during power failure kind of scenarios. The libpmem has architecture specific instructions like dcbf on POWER to flush the cache data to the backend nvdimm device during normal writes, followed by explicit flushes if the backend devices are not synchronous DAX capable. QEMU - virtual nvdimm devices are memory mapped. The dcbf in the guest and the subsequent flush don't translate to an actual flush to the backend file on the host in case of file backed v-nvdimms. This is addressed by virtio-pmem in case of x86_64 by making explicit flushes that translate to fsync at QEMU. On SPAPR, the issue is addressed by adding a new hcall to request an explicit flush from the guest ndctl driver when the backend nvdimm cannot ensure write persistence with dcbf alone. So, the approach here is to convey when the hcall flush is required in a device tree property. The guest makes the hcall when the property is found, instead of relying on dcbf. A new device property sync-dax is added to the nvdimm device. When sync-dax is 'writeback' (the default for PPC), the device property "hcall-flush-required" is set, and the guest makes the hcall H_SCM_FLUSH requesting an explicit flush. sync-dax is "unsafe" on all other platforms (x86, ARM) and on old pseries machines prior to 5.2 on PPC. sync-dax="writeback" on ARM and x86_64 is prevented now as the flush semantics are unimplemented there. When the backend file is actually synchronous DAX capable and no explicit flushes are required, the sync-dax mode 'direct' is to be used. The demonstration below shows the map_sync behavior with sync-dax writeback & direct. (https://github.com/avocado-framework-tests/avocado-misc-tests/blob/master/memory/ndctl.py.data/map_sync.c) The pmem0 device is from an nvdimm with sync-dax=direct, and pmem1 is from an nvdimm with sync-dax=writeback, mounted as /dev/pmem0 on /mnt1 type xfs (rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota) /dev/pmem1 on /mnt2 type xfs (rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota) [root@atest-guest ~]# ./mapsync /mnt1/newfile > when sync-dax=unsafe/direct [root@atest-guest ~]# ./mapsync /mnt2/newfile > when sync-dax=writeback Failed to mmap with Operation not supported The first patch does the header file cleanup necessary for the subsequent ones. The second patch implements the hcall and adds the necessary vmstate properties to the spapr machine structure for carrying the hcall status during save-restore. The hcall being asynchronous in nature, the patch uses aio utilities to offload the flush. The third patch adds the 'sync-dax' device property and enables the device tree property for the guest to utilise the hcall. The kernel changes to exploit this hcall are at https://github.com/linuxppc/linux/commit/75b7c05ebf9026.patch --- v3 - https://lists.gnu.org/archive/html/qemu-devel/2021-03/msg07916.html Changes from v3: - Fixed the forward declaration coding guideline violations in 1st patch. - Removed the code waiting for the flushes to complete during migration, instead restart the flush worker on destination qemu in post load. - Got rid of the randomization of the flush tokens, using a simple counter. - Got rid of the redundant flush state lock, relying on the BQL now. - Handling the memory-backend-ram usage - Changed the sync-dax semantics from on/off to 'unsafe', 'writeback' and 'direct'. Added prevention code using 'writeback' on arm and x86_64. - Fixed all the miscellaneous comments.
v2 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg07031.html Changes from v2: - Using the thread pool based approach as suggested - Moved the async hcall handling code to spapr_nvdimm.c along with some simplifications - Added vmstate to preserve the hcall status during save-restore along with pre_save handler code to complete all ongoing flushes. - Added hw_compat magic for sync-dax 'on' on previous machines. - Miscellaneous minor fixes. v1 - https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg06330.html Changes from v1: - Fixed a missed-out unlock - Using QLIST_FOREACH instead of QLIST_FOREACH_SAFE while generating the token Shivaprasad G Bhat (3): spapr: nvdimm: Forward declare and move the definitions spapr: nvdimm: Implement H_SCM_FLUSH hcall nvdimm: Enable sync-dax device property for nvdimm hw/arm/virt.c | 28 hw/i386/pc.c | 28 hw/mem/nvdimm.c | 52 +++ hw/ppc/spapr.c | 16 ++ hw/ppc/spapr_nvdimm.c | 285 + include/hw/mem/nvdimm.h | 11 ++ include/hw/ppc/spapr.h | 11 +- include/hw/ppc/spapr_nvdimm.h | 27 ++-- qapi/common.json | 20 +++ 9 files changed, 455 insertions(+), 23 deletions(-) -- Signature
[PATCH v4 1/3] spapr: nvdimm: Forward declare and move the definitions
The subsequent patches add definitions which tend to push the compilation into a cyclic header dependency. So, prepare with forward declarations, move the definitions and clean up. Signed-off-by: Shivaprasad G Bhat --- hw/ppc/spapr_nvdimm.c | 12 include/hw/ppc/spapr_nvdimm.h | 14 ++ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index b46c36917c..8cf3fb2ffb 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -31,6 +31,18 @@ #include "qemu/range.h" #include "hw/ppc/spapr_numa.h" +/* + * The nvdimm size should be aligned to SCM block size. + * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE + * in order to have SCM regions not to overlap with dimm memory regions. + * The SCM devices can have variable block sizes. For now, fixing the + * block size to the minimum value. + */ +#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE + +/* Have an explicit check for alignment */ +QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE); + bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, uint64_t size, Error **errp) { diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h index 73be250e2a..764f999f54 100644 --- a/include/hw/ppc/spapr_nvdimm.h +++ b/include/hw/ppc/spapr_nvdimm.h @@ -11,19 +11,9 @@ #define HW_SPAPR_NVDIMM_H #include "hw/mem/nvdimm.h" -#include "hw/ppc/spapr.h" -/* - * The nvdimm size should be aligned to SCM block size. - * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE - * inorder to have SCM regions not to overlap with dimm memory regions. - * The SCM devices can have variable block sizes. For now, fixing the - * block size to the minimum value. - */ -#define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE - -/* Have an explicit check for alignment */ -QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE); +typedef struct SpaprDrc SpaprDrc; +typedef struct SpaprMachineState SpaprMachineState; int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, void *fdt, int *fdt_start_offset, Error **errp);
[PATCH v4 2/3] spapr: nvdimm: Implement H_SCM_FLUSH hcall
The patch adds support for the SCM flush hcall for the nvdimm devices. It is made available for exploitation by the guest through the next patch. The hcall semantics are such that the flush returns H_BUSY, along with a continue_token, when the operation is expected to take longer. The hcall is to be called again with the continue_token to get the status. So, all fresh requests are put into a 'pending' list and a flush worker is submitted to the thread pool. The thread pool completion callbacks move the requests to a 'completed' list, which is cleaned up after reporting to the guest in subsequent hcalls to get the status. The semantics make it necessary to preserve the continue_tokens and their return status across migrations. So, the completed flush states are forwarded to the destination and the pending ones are restarted at the destination in post_load. The necessary nvdimm flush specific vmstate structures are added to the spapr machine vmstate. Signed-off-by: Shivaprasad G Bhat --- hw/ppc/spapr.c | 6 + hw/ppc/spapr_nvdimm.c | 234 + include/hw/ppc/spapr.h | 10 ++ include/hw/ppc/spapr_nvdimm.h | 13 ++ 4 files changed, 262 insertions(+), 1 deletion(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index e4be00b732..80957f9188 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1607,6 +1607,8 @@ static void spapr_machine_reset(MachineState *machine) spapr->ov5_cas = spapr_ovec_clone(spapr->ov5); } +spapr_nvdimm_finish_flushes(spapr); + /* DRC reset may cause a device to be unplugged. This will cause troubles * if this device is used by another device (eg, a running vhost backend * will crash QEMU if the DIMM holding the vring goes away). To avoid such @@ -2003,6 +2005,7 @@ static const VMStateDescription vmstate_spapr = { &vmstate_spapr_cap_ccf_assist, &vmstate_spapr_cap_fwnmi, &vmstate_spapr_fwnmi, +&vmstate_spapr_nvdimm_states, NULL } }; @@ -2997,6 +3000,9 @@ static void spapr_machine_init(MachineState *machine) } qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond); + +QLIST_INIT(&spapr->pending_flush_states); +QLIST_INIT(&spapr->completed_flush_states); } #define DEFAULT_KVM_TYPE "auto" diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index 8cf3fb2ffb..77eb7e1293 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -22,6 +22,7 @@ * THE SOFTWARE. */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "qapi/error.h" #include "hw/ppc/spapr_drc.h" #include "hw/ppc/spapr_nvdimm.h" @@ -30,6 +31,7 @@ #include "hw/ppc/fdt.h" #include "qemu/range.h" #include "hw/ppc/spapr_numa.h" +#include "block/thread-pool.h" /* * The nvdimm size should be aligned to SCM block size.
@@ -371,6 +373,237 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr, return H_SUCCESS; } +static uint64_t flush_token; + +static int flush_worker_cb(void *opaque) +{ +int ret = H_SUCCESS; +SpaprNVDIMMDeviceFlushState *state = opaque; + +/* flush raw backing image */ +if (qemu_fdatasync(state->backend_fd) < 0) { +error_report("papr_scm: Could not sync nvdimm to backend file: %s", + strerror(errno)); +ret = H_HARDWARE; +} + +return ret; +} + +static void spapr_nvdimm_flush_completion_cb(void *opaque, int hcall_ret) +{ +SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); +SpaprNVDIMMDeviceFlushState *state = opaque; + +state->hcall_ret = hcall_ret; +QLIST_REMOVE(state, node); +QLIST_INSERT_HEAD(&spapr->completed_flush_states, state, node); +} + +static const VMStateDescription vmstate_spapr_nvdimm_flush_state = { + .name = "spapr_nvdimm_flush_state", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT64(continue_token, SpaprNVDIMMDeviceFlushState), + VMSTATE_INT64(hcall_ret, SpaprNVDIMMDeviceFlushState), + VMSTATE_UINT32(drcidx, SpaprNVDIMMDeviceFlushState), + VMSTATE_END_OF_LIST() + }, +}; + +static bool spapr_nvdimm_states_needed(void *opaque) +{ + SpaprMachineState *spapr = (SpaprMachineState *)opaque; + + return (!QLIST_EMPTY(&spapr->pending_flush_states) || + !QLIST_EMPTY(&spapr->completed_flush_states)); +} + +static int spapr_nvdimm_post_load(void *opaque, int version_id) +{ +SpaprMachineState *spapr = (SpaprMachineState *)opaque; +SpaprNVDIMMDeviceFlushState *state, *next; +PCDIMMDevice *dimm; +HostMemoryBackend *backend = NULL; +ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context()); +SpaprDrc *drc; + +QLIS
[PATCH v4 3/3] nvdimm: Enable sync-dax device property for nvdimm
The patch adds the 'sync-dax' property to the nvdimm device. When sync-dax is 'direct', it indicates that the backend is synchronous DAX capable and no explicit flush requests are required. When the mode is set to 'writeback', it indicates that the backend is not synchronous DAX capable and explicit flushes to the hypervisor are required. On PPC, where the flush requests from the guest can be honoured by QEMU, the 'writeback' mode is supported and set as the default. The device tree property "hcall-flush-required" is added to the nvdimm node, which makes the guest issue H_SCM_FLUSH hcalls to request flushes explicitly. This is the default behaviour when the sync-dax property is not set for the nvdimm device. For old pSeries machines, the default is 'unsafe'. For non-PPC platforms, the mode is set to 'unsafe' as the default. Signed-off-by: Shivaprasad G Bhat --- hw/arm/virt.c | 28 +++-- hw/i386/pc.c | 28 +++-- hw/mem/nvdimm.c | 52 +++ hw/ppc/spapr.c | 10 + hw/ppc/spapr_nvdimm.c | 39 +++ include/hw/mem/nvdimm.h | 11 ++ include/hw/ppc/spapr.h | 1 + qapi/common.json | 20 ++ 8 files changed, 179 insertions(+), 10 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 9f01d9041b..f32e3e4010 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -2358,6 +2358,27 @@ static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) return ms->possible_cpus; } +static bool virt_nvdimm_validate(const MachineState *ms, NVDIMMDevice *nvdimm, + Error **errp) +{ +NvdimmSyncModes sync; + +if (!ms->nvdimms_state->is_enabled) { +error_setg(errp, "nvdimm is not enabled: add 'nvdimm=on' to '-M'"); +return false; +} + +sync = object_property_get_enum(OBJECT(nvdimm), NVDIMM_SYNC_DAX_PROP, +"NvdimmSyncModes", &error_abort); +if (sync == NVDIMM_SYNC_MODES_WRITEBACK) { +error_setg(errp, "NVDIMM device " NVDIMM_SYNC_DAX_PROP + "=%s mode unsupported", NvdimmSyncModes_str(sync)); +return false; +} + +return true; +} + static void virt_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { @@ -2376,9 +2397,10 @@ static void virt_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, return; } -if (is_nvdimm && !ms->nvdimms_state->is_enabled) { -error_setg(errp, "nvdimm is not enabled: add 'nvdimm=on' to '-M'"); -return; +if (is_nvdimm) { +if (!virt_nvdimm_validate(ms, NVDIMM(dev), errp)) { +return; +} } pc_dimm_pre_plug(PC_DIMM(dev), MACHINE(hotplug_dev), NULL, errp); diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 8a84b25a03..2d5151462c 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1211,6 +1211,27 @@ void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs) g_free(i8259); } +static bool pc_nvdimm_validate(const MachineState *ms, NVDIMMDevice *nvdimm, + Error **errp) +{ +NvdimmSyncModes sync; + +if (!ms->nvdimms_state->is_enabled) { +error_setg(errp, "nvdimm is not enabled: add 'nvdimm=on' to '-M'"); +return false; +} + +sync = object_property_get_enum(OBJECT(nvdimm), NVDIMM_SYNC_DAX_PROP, +"NvdimmSyncModes", &error_abort); +if (sync == NVDIMM_SYNC_MODES_WRITEBACK) { +error_setg(errp, "NVDIMM device " NVDIMM_SYNC_DAX_PROP + "=%s mode unsupported", NvdimmSyncModes_str(sync)); +return false; +} + +return true; +} + static void pc_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { @@ -1233,9 +1254,10 @@ static void pc_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, return; } -if (is_nvdimm && !ms->nvdimms_state->is_enabled) { -error_setg(errp, "nvdimm is not enabled: missing 'nvdimm' in '-M'"); -return; +if (is_nvdimm) { +if (!pc_nvdimm_validate(ms,
NVDIMM(dev), errp)) { +return; +} } hotplug_handler_pre_plug(x86ms->acpi_dev, dev, &local_err); diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index 7397b67156..56b4527362 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -96,6 +96,19 @@ static void nvdimm_set_uuid(Object *obj, Visitor *v, const char *name, g_free(value); } +static int
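The archived hw/mem/nvdimm.c hunk is cut off above at "+static int". For orientation only, the following is the usual shape of a QAPI-enum QOM property in QEMU, not the literal missing code; the field name 'sync_dax' and the placement in nvdimm_init() are assumptions:

/*
 * Hypothetical sketch: getter/setter pair plus registration for an
 * enum property backed by the QAPI type "NvdimmSyncModes" (declared
 * in qapi/common.json by this series).
 */
static int nvdimm_get_sync_dax(Object *obj, Error **errp)
{
    NVDIMMDevice *nvdimm = NVDIMM(obj);

    return nvdimm->sync_dax;    /* assumed NvdimmSyncModes field */
}

static void nvdimm_set_sync_dax(Object *obj, int value, Error **errp)
{
    NVDIMMDevice *nvdimm = NVDIMM(obj);

    nvdimm->sync_dax = value;
}

static void nvdimm_init(Object *obj)
{
    /* NvdimmSyncModes_lookup is generated by QAPI from common.json */
    object_property_add_enum(obj, NVDIMM_SYNC_DAX_PROP, "NvdimmSyncModes",
                             &NvdimmSyncModes_lookup,
                             nvdimm_get_sync_dax, nvdimm_set_sync_dax);
}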
Re: [PATCH v4 0/3] nvdimm: Enable sync-dax property for nvdimm
On 5/1/21 12:44 AM, Dan Williams wrote:
> Some corrections to terminology confusion below...
>
> On Wed, Apr 28, 2021 at 8:49 PM Shivaprasad G Bhat wrote:
>> The nvdimm devices are expected to ensure write persistence during
>> power failure kind of scenarios.
>
> No, QEMU is not expected to make that guarantee. QEMU is free to lie to
> the guest about the persistence guarantees of the guest PMEM ranges.
> It's more accurate to say that QEMU nvdimm devices can emulate
> persistent memory and optionally pass through host power-fail
> persistence guarantees to the guest. The power-fail persistence domain
> can be one of "cpu_cache", or "memory_controller" if the persistent
> memory region is "synchronous". If the persistent range is not
> synchronous, it really isn't "persistent memory"; it's memory mapped
> storage that needs I/O commands to flush.

Since this is a virtual nvdimm (v-nvdimm) backed by a file, the data is
completely in the host pagecache, and we need a way to ensure that the
host pagecache is flushed to the backend. This is analogous to the WPQ
flush being offloaded to the hypervisor.

Ref: https://github.com/dgibson/qemu/blob/main/docs/nvdimm.txt

>> The libpmem has architecture specific instructions like dcbf on POWER
>
> Which "libpmem" is this? PMDK is a reference library, not a PMEM
> interface... maybe I'm missing what libpmem has to do with QEMU?

I was referring to the semantics of flushing pmem cache lines as in
PMDK/libpmem.

>> to flush the cache data to backend nvdimm device during normal writes
>> followed by explicit flushes if the backend devices are not synchronous
>> DAX capable. Qemu - virtual nvdimm devices are memory mapped. The dcbf
>> in the guest and the subsequent flush doesn't traslate to actual flush
>> to the backend
>
> s/traslate/translate/
>
>> file on the host in case of file backed v-nvdimms. This is addressed by
>> virtio-pmem in case of x86_64 by making explicit flushes translating to
>> fsync at qemu.
>
> Note that virtio-pmem was a proposal for a specific optimization of
> allowing guests to share page cache. The virtio-pmem approach is not to
> be confused with actual persistent memory.
>
>> On SPAPR, the issue is addressed by adding a new hcall to request for
>> an explicit flush from the guest ndctl driver when the backend
>
> What is an "ndctl" driver? ndctl is userspace tooling, do you mean the
> guest pmem driver?

Oops, wrong terminology. I was referring to the guest libnvdimm and
papr_scm kernel modules.

>> nvdimm cannot ensure write persistence with dcbf alone. So, the
>> approach here is to convey when the hcall flush is required in a device
>> tree property. The guest makes the hcall when the property is found,
>> instead of relying on dcbf. A new device property sync-dax is added to
>> the nvdimm device. When the sync-dax is 'writeback' (default for PPC),
>> device tree property "hcall-flush-required" is set, and the guest makes
>> the H_SCM_FLUSH hcall requesting an explicit flush.
>
> I'm not sure "sync-dax" is a suitable name for the property of the
> guest persistent memory.

The sync-dax property translates to the ND_REGION_ASYNC flag being set or
unset for the pmem region, and to whether the nvdimm_flush callback is
provided in papr_scm or not. As everything boils down to the synchronous
nature of the device, I chose sync-dax for the name.

> There is no requirement that the memory-backend file for a guest be a
> dax-capable file. It's also implementation specific what hypercall needs
> to be invoked for a given occurrence of "sync-dax". What does that map
> to on non-PPC platforms for example?

The backend file can be dax-capable, to be hinted using "sync-dax=direct".
When the backend is not dax-capable, "sync-dax=writeback" is to be used,
so that the guest makes the hcall. On all non-PPC archs,
"sync-dax=writeback" makes QEMU error out stating the lack of support.

> It seems to me that an "nvdimm" device presents the synchronous usage
> model and a whole other device type implements an async-hypercall setup
> that the guest happens to service with its nvdimm stack, but it's not an
> "nvdimm" anymore at that point.

In case the file backing the v-nvdimm is not dax-capable, we need flush
semantics on the guest to be mapped to a pagecache flush on the host side.
sync-dax is "unsafe" on all other platforms (x86, ARM) and on old pSeries
machines prior to 5.2 on PPC. "sync-dax=writeback" on ARM and x86_64 is
prevented now as the flush semantics are unimplemented there.

> "sync-dax" has no meaning on its own, I think this needs an explicit
> mechanism to convey both the "not-sync" property *and* the callback
> method, it shouldn't be inferred by arch type.

Yes. On all platforms "sync-dax=unsafe" means: with host power failure the
host pagecache is lost, and subsequently data written by the guest will
also be lost.
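For concreteness, the PPC side of this could look roughly like the sketch below in the nvdimm device-tree code (hw/ppc/spapr_nvdimm.c). This is not the merged code: the property name is the one quoted in the thread, and the node offset variable 'child_offset' is a placeholder:

/*
 * Sketch: advertise to the guest that explicit H_SCM_FLUSH hcalls are
 * required for this nvdimm node when the backend is not synchronous-DAX
 * capable, i.e. sync-dax=writeback.  An empty property (NULL value,
 * length 0) acts as a boolean flag in the device tree.
 */
if (object_property_get_enum(OBJECT(nvdimm), NVDIMM_SYNC_DAX_PROP,
                             "NvdimmSyncModes", &error_abort)
        == NVDIMM_SYNC_MODES_WRITEBACK) {
    _FDT(fdt_setprop(fdt, child_offset, "hcall-flush-required", NULL, 0));
}

The guest papr_scm driver, on seeing this flag, would mark the region with ND_REGION_ASYNC and route flushes through the hcall instead of relying on dcbf.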
[RFC PATCH 0/2] spapr: scm: Asynchronous flush hcall support
The nvdimm devices are expected to ensure write persistence during power failure kind of scenarios.

The libpmem has architecture specific instructions like dcbf on POWER to flush the cache data to the backend nvdimm device during normal writes.

Qemu - virtual nvdimm devices are memory mapped. The dcbf in the guest doesn't translate to an actual flush to the backend file on the host in case of file backed v-nvdimms. This is addressed by virtio-pmem in case of x86_64 by making asynchronous flushes.

On PAPR, the issue is addressed by adding a new hcall through which the guest nvdimm driver requests an explicit asynchronous flush when the backend nvdimm cannot ensure write persistence with dcbf alone. So, the approach here is to convey when the asynchronous flush is required in a device tree property. The guest makes the hcall when the property is found, instead of relying on dcbf.

The first patch adds the necessary asynchronous hcall support infrastructure code at the DRC level. The second patch implements the hcall using that infrastructure. The hcall semantics are in review and not final.

A new device property sync-dax is added to the nvdimm device. When sync-dax is off (the default), the asynchronous hcalls are used.

With respect to saving from new QEMU and restoring on old QEMU, having sync-dax off by default (when not specified) causes IO errors in guests, as the async hcall is not supported on old QEMU. Since the new hcall implementation is supported only on the new pseries machine version, the current machine version checks may be sufficient to prevent such migration. Please suggest what can be done.

The demonstration below shows the map_sync behavior with sync-dax on & off.
(https://github.com/avocado-framework-tests/avocado-misc-tests/blob/master/memory/ndctl.py.data/map_sync.c)

The pmem0 is from the nvdimm with sync-dax=on, and pmem1 is from the nvdimm with sync-dax=off, mounted as:

/dev/pmem0 on /mnt1 type xfs (rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota)
/dev/pmem1 on /mnt2 type xfs (rw,relatime,attr2,dax=always,inode64,logbufs=8,logbsize=32k,noquota)

[root@atest-guest ~]# ./mapsync /mnt1/newfile    ----> when sync-dax=on
[root@atest-guest ~]# ./mapsync /mnt2/newfile    ----> when sync-dax=off
Failed to mmap with Operation not supported

---

Shivaprasad G Bhat (2):
      spapr: drc: Add support for async hcalls at the drc level
      spapr: nvdimm: Implement async flush hcalls

 hw/mem/nvdimm.c            |   1 
 hw/ppc/spapr_drc.c         | 146 
 hw/ppc/spapr_nvdimm.c      |  79 
 include/hw/mem/nvdimm.h    |  10 +++
 include/hw/ppc/spapr.h     |   3 +
 include/hw/ppc/spapr_drc.h |  25 
 6 files changed, 263 insertions(+), 1 deletion(-)

--
Signature
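For reference, a minimal MAP_SYNC probe in the spirit of the map_sync.c linked above (a sketch, not that exact program): mmap() with MAP_SHARED_VALIDATE | MAP_SYNC succeeds only when the mapping can guarantee synchronous DAX persistence, and otherwise fails with EOPNOTSUPP ("Operation not supported"), as seen for /mnt2 in the demonstration.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

/* Fallback definitions for older libc headers. */
#ifndef MAP_SYNC
#define MAP_SYNC            0x80000
#define MAP_SHARED_VALIDATE 0x03
#endif

int main(int argc, char **argv)
{
    if (argc < 2) {
        fprintf(stderr, "usage: %s <file-on-pmem-mount>\n", argv[0]);
        return 1;
    }

    int fd = open(argv[1], O_RDWR | O_CREAT, 0644);
    if (fd < 0 || ftruncate(fd, 4096) < 0) {
        perror("open/ftruncate");
        return 1;
    }

    /* MAP_SYNC is only valid together with MAP_SHARED_VALIDATE. */
    void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
    if (p == MAP_FAILED) {
        perror("Failed to mmap");   /* EOPNOTSUPP on async (non-sync-DAX) regions */
        close(fd);
        return 1;
    }

    munmap(p, 4096);
    close(fd);
    return 0;
}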