Re: [PATCH v14 3/7] migration/dirtyrate: Refactor dirty page rate calculation

2022-02-14 Thread Peter Xu
Mostly good, one trivial nit below:

On Fri, Feb 11, 2022 at 12:17:37AM +0800, huang...@chinatelecom.cn wrote:
> +int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
> + int64_t init_time_ms,
> + VcpuStat *stat,
> + unsigned int flag,
> + bool one_shot)
> +{
> +DirtyPageRecord *records;
> +int64_t duration;
> +int64_t dirtyrate;
> +int i = 0;
> +unsigned int gen_id;
> +
> +retry:
> +cpu_list_lock();
> +gen_id = cpu_list_generation_id_get();
> +records = vcpu_dirty_stat_alloc(stat);
> +vcpu_dirty_stat_collect(stat, records, true);
> +cpu_list_unlock();
> +
> +duration = dirty_stat_wait(calc_time_ms, init_time_ms);

Is it a must to pass in init_time_ms rather than always sleep in
dirty_stat_wait()?  Could we simply drop it?

-- 
Peter Xu




Re: [PATCH v14 2/7] cpus: Introduce cpu_list_generation_id

2022-02-14 Thread Peter Xu
On Fri, Feb 11, 2022 at 12:17:36AM +0800, huang...@chinatelecom.cn wrote:
> From: Hyman Huang(黄勇) 
> 
> Introduce cpu_list_generation_id to track cpu list generation so
> that cpu hotplug/unplug can be detected during measurement of
> dirty page rate.

Could you add a short paragraph showing how the gen_id should be used?
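
A rough sketch of the intended usage pattern, pieced together from the
hunk quoted in patch 3 of this series (the re-check after the wait is an
assumption added for illustration, not quoted code):

    retry:
        cpu_list_lock();
        gen_id = cpu_list_generation_id_get();
        records = vcpu_dirty_stat_alloc(stat);
        vcpu_dirty_stat_collect(stat, records, true);
        cpu_list_unlock();

        duration = dirty_stat_wait(calc_time_ms, init_time_ms);

        /* a vCPU hotplug/unplug while waiting bumps the generation id */
        cpu_list_lock();
        if (gen_id != cpu_list_generation_id_get()) {
            g_free(records);
            cpu_list_unlock();
            goto retry;
        }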

> 
> Signed-off-by: Hyman Huang(黄勇) 

Reviewed-by: Peter Xu 

Thanks,

-- 
Peter Xu




[PULL 6/6] hw/nvme: add support for zoned random write area

2022-02-14 Thread Klaus Jensen
From: Klaus Jensen 

Add support for TP 4076 ("Zoned Random Write Area"), v2021.08.23
("Ratified").

This adds three new namespace parameters: "zoned.numzrwa" (number of
zrwa resources, i.e. number of zones that can have a zrwa),
"zoned.zrwas" (zrwa size in LBAs), "zoned.zrwafg" (granularity in LBAs
for flushes).
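
For illustration only (device/drive ids and values below are invented, and
whether "zoned.zrwas"/"zoned.zrwafg" take byte sizes or LBA counts should be
checked against ns.c in this patch), a ZRWA-capable namespace could be
configured along these lines:

    -drive file=zns.img,id=nvm-1,format=raw,if=none \
    -device nvme,id=nvme0,serial=deadbeef \
    -device nvme-ns,bus=nvme0,drive=nvm-1,zoned=true,\
            zoned.numzrwa=16,zoned.zrwas=64,zoned.zrwafg=16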

Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 171 ++-
 hw/nvme/ns.c |  58 +++
 hw/nvme/nvme.h   |  10 +++
 hw/nvme/trace-events |   1 +
 include/block/nvme.h |  17 -
 5 files changed, 237 insertions(+), 20 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 7cb4974c5e83..98aac98bef5f 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -299,26 +299,37 @@ static void nvme_assign_zone_state(NvmeNamespace *ns, 
NvmeZone *zone,
 }
 }
 
-/*
- * Check if we can open a zone without exceeding open/active limits.
- * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
- */
-static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
+static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act,
+ uint32_t opn, uint32_t zrwa)
 {
 if (ns->params.max_active_zones != 0 &&
 ns->nr_active_zones + act > ns->params.max_active_zones) {
 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
 return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
 }
+
 if (ns->params.max_open_zones != 0 &&
 ns->nr_open_zones + opn > ns->params.max_open_zones) {
 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
 return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
 }
 
+if (zrwa > ns->zns.numzrwa) {
+return NVME_NOZRWA | NVME_DNR;
+}
+
 return NVME_SUCCESS;
 }
 
+/*
+ * Check if we can open a zone without exceeding open/active limits.
+ * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
+ */
+static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
+{
+return nvme_zns_check_resources(ns, act, opn, 0);
+}
+
 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 {
 hwaddr hi, lo;
@@ -1628,9 +1639,19 @@ static uint16_t nvme_check_zone_write(NvmeNamespace *ns, 
NvmeZone *zone,
 return status;
 }
 
-if (unlikely(slba != zone->w_ptr)) {
-trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
-return NVME_ZONE_INVALID_WRITE;
+if (zone->d.za & NVME_ZA_ZRWA_VALID) {
+uint64_t ezrwa = zone->w_ptr + 2 * ns->zns.zrwas;
+
+if (slba < zone->w_ptr || slba + nlb > ezrwa) {
+trace_pci_nvme_err_zone_invalid_write(slba, zone->w_ptr);
+return NVME_ZONE_INVALID_WRITE;
+}
+} else {
+if (unlikely(slba != zone->w_ptr)) {
+trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
+   zone->w_ptr);
+return NVME_ZONE_INVALID_WRITE;
+}
 }
 
 if (unlikely((slba + nlb) > zcap)) {
@@ -1710,6 +1731,14 @@ static uint16_t nvme_zrm_finish(NvmeNamespace *ns, 
NvmeZone *zone)
 /* fallthrough */
 case NVME_ZONE_STATE_CLOSED:
 nvme_aor_dec_active(ns);
+
+if (zone->d.za & NVME_ZA_ZRWA_VALID) {
+zone->d.za &= ~NVME_ZA_ZRWA_VALID;
+if (ns->params.numzrwa) {
+ns->zns.numzrwa++;
+}
+}
+
 /* fallthrough */
 case NVME_ZONE_STATE_EMPTY:
 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
@@ -1745,6 +1774,13 @@ static uint16_t nvme_zrm_reset(NvmeNamespace *ns, 
NvmeZone *zone)
 /* fallthrough */
 case NVME_ZONE_STATE_CLOSED:
 nvme_aor_dec_active(ns);
+
+if (zone->d.za & NVME_ZA_ZRWA_VALID) {
+if (ns->params.numzrwa) {
+ns->zns.numzrwa++;
+}
+}
+
 /* fallthrough */
 case NVME_ZONE_STATE_FULL:
 zone->w_ptr = zone->d.zslba;
@@ -1778,6 +1814,7 @@ static void nvme_zrm_auto_transition_zone(NvmeNamespace 
*ns)
 
 enum {
 NVME_ZRM_AUTO = 1 << 0,
+NVME_ZRM_ZRWA = 1 << 1,
 };
 
 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
@@ -1796,7 +1833,8 @@ static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, 
NvmeNamespace *ns,
 if (n->params.auto_transition_zones) {
 nvme_zrm_auto_transition_zone(ns);
 }
-status = nvme_aor_check(ns, act, 1);
+status = nvme_zns_check_resources(ns, act, 1,
+  (flags & NVME_ZRM_ZRWA) ? 1 : 0);
 if (status) {
 return status;
 }
@@ -1824,6 +1862,12 @@ static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, 
NvmeNamespace *ns,
 /* fallthrough */
 
 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+if (flags & NVME_ZRM_ZRWA) {
+ns->zns.numzrwa--;
+
+zone->d.za |= NVME_ZA_ZRWA_VALID;

Re: [PATCH v14 1/7] accel/kvm/kvm-all: Refactor per-vcpu dirty ring reaping

2022-02-14 Thread Peter Xu
On Fri, Feb 11, 2022 at 12:17:35AM +0800, huang...@chinatelecom.cn wrote:
> From: Hyman Huang(黄勇) 
> 
> Add a non-required argument 'CPUState' to kvm_dirty_ring_reap so
> that it can cover single vcpu dirty-ring-reaping scenario.
> 
> Signed-off-by: Hyman Huang(黄勇) 

Reviewed-by: Peter Xu 

-- 
Peter Xu




[PULL 2/6] hw/nvme/ctrl: Have nvme_addr_write() take const buffer

2022-02-14 Thread Klaus Jensen
From: Philippe Mathieu-Daudé 

The 'buf' argument is not modified, so better pass it as const type.

Signed-off-by: Philippe Mathieu-Daudé 
Reviewed-by: Klaus Jensen 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 37681a975986..12e1fcda7c85 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -395,7 +395,7 @@ static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void 
*buf, int size)
 return pci_dma_read(&n->parent_obj, addr, buf, size);
 }
 
-static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size)
+static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
 {
 hwaddr hi = addr + size - 1;
 if (hi < addr) {
-- 
2.35.1




Re: [PATCH v14 5/7] accel/kvm/kvm-all: Introduce kvm_dirty_ring_size function

2022-02-14 Thread Peter Xu
On Fri, Feb 11, 2022 at 12:17:39AM +0800, huang...@chinatelecom.cn wrote:
> From: Hyman Huang(黄勇) 
> 
> Introduce kvm_dirty_ring_size util function to help calculate
> dirty ring full time.
> 
> Signed-off-by: Hyman Huang(黄勇) 

Acked-by: Peter Xu 

-- 
Peter Xu




Re: [PATCH v14 4/7] softmmu/dirtylimit: Implement vCPU dirtyrate calculation periodically

2022-02-14 Thread Peter Xu
On Fri, Feb 11, 2022 at 12:17:38AM +0800, huang...@chinatelecom.cn wrote:
> From: Hyman Huang(黄勇) 
> 
> Introduce the third method of dirty tracking, GLOBAL_DIRTY_LIMIT, used
> to calculate the dirty page rate periodically for the dirty page rate
> limit.
> 
> Add dirtylimit.c to implement the periodic dirty page rate calculation,
> which will be used for the dirty page rate limit.
> 
> Add dirtylimit.h to export util functions for dirty page rate
> limit implementation.
> 
> Signed-off-by: Hyman Huang(黄勇) 

Reviewed-by: Peter Xu 

-- 
Peter Xu




[PULL 1/6] hw/nvme: fix CVE-2021-3929

2022-02-14 Thread Klaus Jensen
From: Klaus Jensen 

This fixes CVE-2021-3929 "locally" by denying DMA to the iomem of the
device itself. This still allows DMA to MMIO regions of other devices
(e.g. doing P2P DMA to the controller memory buffer of another NVMe
device).

Fixes: CVE-2021-3929
Reported-by: Qiuhao Li 
Reviewed-by: Keith Busch 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 1f62116af985..37681a975986 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -357,6 +357,24 @@ static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr 
addr)
 return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
 }
 
+static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr)
+{
+hwaddr hi, lo;
+
+/*
+ * The purpose of this check is to guard against invalid "local" access to
+ * the iomem (i.e. controller registers). Thus, we check against the range
+ * covered by the 'bar0' MemoryRegion since that is currently composed of
+ * two subregions (the NVMe "MBAR" and the MSI-X table/pba). Note, however,
+ * that if the device model is ever changed to allow the CMB to be located
+ * in BAR0 as well, then this must be changed.
+ */
+lo = n->bar0.addr;
+hi = lo + int128_get64(n->bar0.size);
+
+return addr >= lo && addr < hi;
+}
+
 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
 hwaddr hi = addr + size - 1;
@@ -614,6 +632,10 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, 
hwaddr addr, size_t len)
 
 trace_pci_nvme_map_addr(addr, len);
 
+if (nvme_addr_is_iomem(n, addr)) {
+return NVME_DATA_TRAS_ERROR;
+}
+
 if (nvme_addr_is_cmb(n, addr)) {
 cmb = true;
 } else if (nvme_addr_is_pmr(n, addr)) {
-- 
2.35.1




Re: [PATCH] qapi, i386/sev: Add debug-launch-digest to launch-measure response

2022-02-14 Thread Dov Murik



On 31/01/2022 16:26, Daniel P. Berrangé wrote:
[...]
> 
> IOW, I think there's only two scenarios that make sense
> 
> 1. The combined launch digest over firmware, kernel hashes
>and VMSA state.
> 
> 2. Individual hashes for each of firmware, kernel hashes table and
>VMSA state
> 


Just one more data point relevant to this discussion: in SNP the guest
asks the PSP for a signed attestation report (MSG_REPORT_REQ).  The
returned report (ATTESTATION_REPORT structure; see section 7.3 of [1])
includes a MEASUREMENT field which is the measurement calculated at
launch (it's a SHA384-based chain of hashes and not a hash of the entire
content as in SEV-ES; and GPAs are also included. Details in section
8.17).  The entire report is signed with the signature appearing in a
separate SIGNATURE field.



Mimicking that in QEMU for SEV-ES would be in my opinion closer to
option (1) above.

Again, the proposed patch here doesn't yet include the VMSAs in the
GCTX.LD and therefore is lacking.  Dave mentioned adding an ioctl in KVM; I
think that Daniel once suggested adding a virtual file like
/sys/kernel/debug/kvm/617063-12/vcpu0/launch_vmsa with the 4KB VMSA content.


Note that AFAIK measured direct boot with -kernel is not yet supported
in SNP but we plan to add it (with a similar hashes table) after the SNP
patches are accepted in OVMF.


[1] https://www.amd.com/system/files/TechDocs/56860.pdf


-Dov



Re: [PATCH v14 7/7] softmmu/dirtylimit: Implement dirty page rate limit

2022-02-14 Thread Peter Xu
On Fri, Feb 11, 2022 at 12:17:41AM +0800, huang...@chinatelecom.cn wrote:
> +static struct DirtyLimitInfoList *dirtylimit_query_all(void)
> +{
> +int i, index;
> +DirtyLimitInfo *info = NULL;
> +DirtyLimitInfoList *head = NULL, **tail = &head;
> +
> +dirtylimit_state_lock();
> +
> +if (!dirtylimit_in_service()) {

Need to unlock?

> +return NULL;
> +}
> +
> +for (i = 0; i < dirtylimit_state->max_cpus; i++) {
> +index = dirtylimit_state->states[i].cpu_index;
> +if (dirtylimit_vcpu_get_state(index)->enabled) {
> +info = dirtylimit_query_vcpu(index);
> +QAPI_LIST_APPEND(tail, info);
> +}
> +}
> +
> +dirtylimit_state_unlock();
> +
> +return head;
> +}
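
A minimal sketch of the early-return path being pointed at, with the
missing unlock added (illustrative only, not the author's actual fix):

    dirtylimit_state_lock();

    if (!dirtylimit_in_service()) {
        dirtylimit_state_unlock();   /* unlock before bailing out */
        return NULL;
    }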

-- 
Peter Xu




[PULL 3/6] hw/nvme/ctrl: Pass buffers as 'void *' types

2022-02-14 Thread Klaus Jensen
From: Philippe Mathieu-Daudé 

These buffers can be anything, not an array of chars,
so use the 'void *' type for them.

Signed-off-by: Philippe Mathieu-Daudé 
Reviewed-by: Klaus Jensen 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 10 +-
 hw/nvme/nvme.h |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 12e1fcda7c85..4344405e5939 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1162,7 +1162,7 @@ static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg 
*sg, uint8_t *ptr,
 return NVME_SUCCESS;
 }
 
-static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len,
+static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, void *ptr, uint32_t len,
 NvmeTxDirection dir)
 {
 assert(sg->flags & NVME_SG_ALLOC);
@@ -1199,7 +1199,7 @@ static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t 
*ptr, uint32_t len,
 return NVME_SUCCESS;
 }
 
-static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
+static inline uint16_t nvme_c2h(NvmeCtrl *n, void *ptr, uint32_t len,
 NvmeRequest *req)
 {
 uint16_t status;
@@ -1212,7 +1212,7 @@ static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t 
*ptr, uint32_t len,
 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
 }
 
-static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
+static inline uint16_t nvme_h2c(NvmeCtrl *n, void *ptr, uint32_t len,
 NvmeRequest *req)
 {
 uint16_t status;
@@ -1225,7 +1225,7 @@ static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t 
*ptr, uint32_t len,
 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
 }
 
-uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
+uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
   NvmeTxDirection dir, NvmeRequest *req)
 {
 NvmeNamespace *ns = req->ns;
@@ -1241,7 +1241,7 @@ uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, 
uint32_t len,
 return nvme_tx(n, &req->sg, ptr, len, dir);
 }
 
-uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
+uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
NvmeTxDirection dir, NvmeRequest *req)
 {
 NvmeNamespace *ns = req->ns;
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 83ffabade4cf..38f3ebf7f6c0 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -495,9 +495,9 @@ static inline uint16_t nvme_cid(NvmeRequest *req)
 }
 
 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns);
-uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
+uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
   NvmeTxDirection dir, NvmeRequest *req);
-uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
+uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
NvmeTxDirection dir, NvmeRequest *req);
 void nvme_rw_complete_cb(void *opaque, int ret);
 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
-- 
2.35.1




Re: [PATCH v14 6/7] softmmu/dirtylimit: Implement virtual CPU throttle

2022-02-14 Thread Peter Xu
On Fri, Feb 11, 2022 at 12:17:40AM +0800, huang...@chinatelecom.cn wrote:
> @@ -2964,8 +2971,13 @@ int kvm_cpu_exec(CPUState *cpu)
>   */
>  trace_kvm_dirty_ring_full(cpu->cpu_index);
>  qemu_mutex_lock_iothread();
> -kvm_dirty_ring_reap(kvm_state, NULL);
> +if (dirtylimit_in_service()) {
> +kvm_dirty_ring_reap(kvm_state, cpu);
> +} else {
> +kvm_dirty_ring_reap(kvm_state, NULL);
> +}

Could you add some comment here on why the cpu pointer is conditionally passed
into the reaping routine?  Even if we know it now, it's not immediately obvious
to all the readers.

[...]

> +struct {
> +VcpuDirtyLimitState *states;
> +/* Max cpus number configured by user */
> +int max_cpus;
> +/* Number of vcpu under dirtylimit */
> +int limited_nvcpu;
> +/* Function to implement throttle set up */
> +DirtyLimitFunc setup;

"setup" normally is used only at startup of something, but not per interval.
Perhaps "process" or "adjust"?  Same across other "setup" namings across the
patch.

Again, I'd rather call the function directly..

[...]

> +static void dirtylimit_adjust_throttle(CPUState *cpu)
> +{
> +uint64_t quota = 0;
> +uint64_t current = 0;
> +int cpu_index = cpu->cpu_index;
> +
> +quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
> +current = vcpu_dirty_rate_get(cpu_index);
> +
> +if (current == 0) {
> +cpu->throttle_us_per_full = 0;
> +goto end;

Can be dropped?

> +} else if (dirtylimit_done(quota, current)) {
> +goto end;

Same here.  Dropping it wholely and:

   } else if (!dirtylimit_done(quota, current)) {
   dirtylimit_set_throttle(cpu, quota, current);
   }

Would work?

> +} else {
> +dirtylimit_set_throttle(cpu, quota, current);
> +}
> +end:

Can be dropped?

> +trace_dirtylimit_adjust_throttle(cpu_index,
> + quota, current,
> + cpu->throttle_us_per_full);
> +return;
> +}
> +
> +void dirtylimit_setup(void)
> +{
> +CPUState *cpu;
> +
> +if (!qatomic_read(&dirtylimit_quit)) {
> +dirtylimit_state_lock();
> +
> +if (!dirtylimit_in_service()) {
> +dirtylimit_state_unlock();

Need to return?

> +}
> +
> +CPU_FOREACH(cpu) {
> +if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
> +continue;
> +}
> +dirtylimit_adjust_throttle(cpu);
> +}
> +dirtylimit_state_unlock();
> +}
> +}

[...]

> +void dirtylimit_set_vcpu(int cpu_index,
> + uint64_t quota,
> + bool enable)
> +{
> +dirtylimit_vcpu_set_quota(cpu_index, quota, enable);
> +trace_dirtylimit_set_vcpu(cpu_index, quota);
> +}

This helper is not "help"ful..  How about wrapping the trace into
dirtylimit_vcpu_set_quota, then drop it?
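
For illustration, folding the trace point into the setter could look
roughly like this (function and trace names taken from the quoted patch,
body otherwise assumed):

    void dirtylimit_vcpu_set_quota(int cpu_index, uint64_t quota, bool enable)
    {
        /* ... existing quota/enable bookkeeping ... */
        trace_dirtylimit_set_vcpu(cpu_index, quota);
    }

Callers would then use dirtylimit_vcpu_set_quota() directly and the
dirtylimit_set_vcpu() wrapper goes away.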

Thanks,

-- 
Peter Xu




[PULL 0/6] hw/nvme updates

2022-02-14 Thread Klaus Jensen
From: Klaus Jensen 

Hi Peter,

The following changes since commit 48033ad678ae2def43bf0d543a2c4c3d2a93feaf:

  Merge remote-tracking branch 'remotes/vsementsov/tags/pull-nbd-2022-02-09-v2' 
into staging (2022-02-12 22:04:07 +0000)

are available in the Git repository at:

  git://git.infradead.org/qemu-nvme.git tags/nvme-next-pull-request

for you to fetch changes up to e321b4cdc2dd0b5e806ecf759138be7f83774142:

  hw/nvme: add support for zoned random write area (2022-02-14 08:58:29 +0100)


hw/nvme updates

  - fix CVE-2021-3929
  - add zone random write area support
  - misc cleanups from Philippe



Klaus Jensen (4):
  hw/nvme: fix CVE-2021-3929
  hw/nvme: add struct for zone management send
  hw/nvme: add ozcs enum
  hw/nvme: add support for zoned random write area

Philippe Mathieu-Daudé (2):
  hw/nvme/ctrl: Have nvme_addr_write() take const buffer
  hw/nvme/ctrl: Pass buffers as 'void *' types

 hw/nvme/ctrl.c   | 215 ---
 hw/nvme/ns.c |  61 +++-
 hw/nvme/nvme.h   |  14 ++-
 hw/nvme/trace-events |   1 +
 include/block/nvme.h |  40 +++-
 5 files changed, 296 insertions(+), 35 deletions(-)

-- 
2.35.1




[PULL 4/6] hw/nvme: add struct for zone management send

2022-02-14 Thread Klaus Jensen
From: Klaus Jensen 

Add struct for Zone Management Send in preparation for more zone send
flags.

Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 10 --
 include/block/nvme.h | 19 +++
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 4344405e5939..7cb4974c5e83 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -3616,26 +3616,24 @@ done:
 
 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
 {
-NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
+NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
 NvmeNamespace *ns = req->ns;
 NvmeZone *zone;
 NvmeZoneResetAIOCB *iocb;
 uint8_t *zd_ext;
-uint32_t dw13 = le32_to_cpu(cmd->cdw13);
 uint64_t slba = 0;
 uint32_t zone_idx = 0;
 uint16_t status;
-uint8_t action;
+uint8_t action = cmd->zsa;
 bool all;
 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
 
-action = dw13 & 0xff;
-all = !!(dw13 & 0x100);
+all = cmd->zsflags & NVME_ZSFLAG_SELECT_ALL;
 
 req->status = NVME_SUCCESS;
 
 if (!all) {
-status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
+status = nvme_get_mgmt_zone_slba_idx(ns, &req->cmd, &slba, &zone_idx);
 if (status) {
 return status;
 }
diff --git a/include/block/nvme.h b/include/block/nvme.h
index e3bd47bf76ab..709d491c70d8 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1433,6 +1433,21 @@ enum NvmeZoneType {
 NVME_ZONE_TYPE_SEQ_WRITE = 0x02,
 };
 
+typedef struct QEMU_PACKED NvmeZoneSendCmd {
+uint8_t opcode;
+uint8_t flags;
+uint16_tcid;
+uint32_tnsid;
+uint32_trsvd8[4];
+NvmeCmdDptr dptr;
+uint64_tslba;
+uint32_trsvd48;
+uint8_t zsa;
+uint8_t zsflags;
+uint8_t rsvd54[2];
+uint32_trsvd56[2];
+} NvmeZoneSendCmd;
+
 enum NvmeZoneSendAction {
 NVME_ZONE_ACTION_RSD = 0x00,
 NVME_ZONE_ACTION_CLOSE   = 0x01,
@@ -1443,6 +1458,10 @@ enum NvmeZoneSendAction {
 NVME_ZONE_ACTION_SET_ZD_EXT  = 0x10,
 };
 
+enum {
+NVME_ZSFLAG_SELECT_ALL = 1 << 0,
+};
+
 typedef struct QEMU_PACKED NvmeZoneDescr {
 uint8_t zt;
 uint8_t zs;
-- 
2.35.1




Re: [PATCH v14 3/7] migration/dirtyrate: Refactor dirty page rate calculation

2022-02-14 Thread Hyman Huang




On 2022/2/14 15:57, Peter Xu wrote:

Mostly good, one trivial nit below:

On Fri, Feb 11, 2022 at 12:17:37AM +0800, huang...@chinatelecom.cn wrote:

+int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
+ int64_t init_time_ms,
+ VcpuStat *stat,
+ unsigned int flag,
+ bool one_shot)
+{
+DirtyPageRecord *records;
+int64_t duration;
+int64_t dirtyrate;
+int i = 0;
+unsigned int gen_id;
+
+retry:
+cpu_list_lock();
+gen_id = cpu_list_generation_id_get();
+records = vcpu_dirty_stat_alloc(stat);
+vcpu_dirty_stat_collect(stat, records, true);
+cpu_list_unlock();
+
+duration = dirty_stat_wait(calc_time_ms, init_time_ms);


Is it a must to pass in init_time_ms rather than always sleep in
dirty_stat_wait()?  Could we simply drop it?

Indeed, the parameter 'init_time_ms' seems kind of weird :(. We
introduce 'init_time_ms' just because calculate_dirtyrate_dirty_ring
will call the function, see the following block:



 static void calculate_dirtyrate_dirty_ring(struct DirtyRateConfig config)
 {
-CPUState *cpu;
-int64_t msec = 0;
 int64_t start_time;
+int64_t duration;
 uint64_t dirtyrate = 0;
 uint64_t dirtyrate_sum = 0;
-DirtyPageRecord *dirty_pages;
-int nvcpu = 0;
 int i = 0;

-CPU_FOREACH(cpu) {
-nvcpu++;
-}
-
-dirty_pages = malloc(sizeof(*dirty_pages) * nvcpu);
-
-DirtyStat.dirty_ring.nvcpu = nvcpu;
-DirtyStat.dirty_ring.rates = malloc(sizeof(DirtyRateVcpu) * nvcpu);
-
-dirtyrate_global_dirty_log_start();
-
-CPU_FOREACH(cpu) {
-record_dirtypages(dirty_pages, cpu, true);
-}
+/* start log sync */
+global_dirty_log_change(GLOBAL_DIRTY_DIRTY_RATE, true);

 start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 DirtyStat.start_time = start_time / 1000;

The reason we introduce 'init_time_ms' is to store and display the
start_time info.

Dropping this parameter is fine from my point of view if we ignore the
duration error resulting from the delay between the caller and callee of
'vcpu_calculate_dirtyrate'.

-msec = config.sample_period_seconds * 1000;
-msec = set_sample_page_period(msec, start_time);
-DirtyStat.calc_time = msec / 1000;
+/* calculate vcpu dirtyrate */
+duration = vcpu_calculate_dirtyrate(config.sample_period_seconds * 
1000,

+start_time,
+&DirtyStat.dirty_ring,
+GLOBAL_DIRTY_DIRTY_RATE,
+true);

-dirtyrate_global_dirty_log_stop();
-
-CPU_FOREACH(cpu) {
-record_dirtypages(dirty_pages, cpu, false);
-}
+DirtyStat.calc_time = duration / 1000;




--
Best regard

Hyman Huang(黄勇)



Re: [PATCH v4 4/4] hw/i386/sgx: Attach SGX-EPC objects to machine

2022-02-14 Thread Igor Mammedov
On Mon, 14 Feb 2022 14:58:57 +0800
Yang Zhong  wrote:

> On Mon, Feb 07, 2022 at 09:37:52AM +0100, Igor Mammedov wrote:
> > On Sat,  5 Feb 2022 13:45:26 +0100
> > Philippe Mathieu-Daudé  wrote:
> >   
> > > Previously SGX-EPC objects were exposed in the QOM tree at a path
> > > 
> > >   /machine/unattached/device[nn]
> > > 
> > > where the 'nn' varies depending on what devices were already created.
> > > 
> > > With this change the SGX-EPC objects are now at
> > > 
> > >   /machine/sgx-epc[nn]
> > > 
> > > where the 'nn' of the first SGX-EPC object is always zero.  
> > 
> > yet again, why it's necessary?  
> 
> 
>   Igor, Sorry for delay feedback because of Chinese New Year holiday.
> 
>   This series patches are to fix below issues I reported before,
>   https://lists.nongnu.org/archive/html/qemu-devel/2021-11/msg05670.html
> 
>   Since the /machine/unattached/device[0] is used by vcpu and Libvirt
>   use this interface to get unavailable-features list. But in the SGX
>   VM, the device[0] will be occupied by virtual sgx epc device, Libvirt
>   can't get unavailable-features from this device[0].
> 
>   Although patch 2 in this series already fixed "unavailable-features" issue,

I've seen patches on libvirt fixing "unavailable-features" in another way
without dependence on  /machine/unattached/device[0].
see:
 https://www.mail-archive.com/libvir-list@redhat.com/msg226244.html

>   this patch can move sgx virtual device from /machine/unattached/device[nn]
>   to /machine/sgx-epc[nn], which seems more clear. Thanks!

with those patches device[0] becomes a non-issue, and this patch also
becomes unnecessary.
I don't mind putting sgx-epc under machine, but that shall be justified
somehow. A drawback I noticed in this case is an extra manual
plumbing/wiring without apparent need for it.

PS:
general note on submitting patches.
Commit message shall
 1 describe problem (+error message/way to reproduce the issue)
 2 what patch does
 3 and why patch fixes the issue in a certain way

The commit message in this patch only does #2, without any clue as to
what the problem was or why it tries to fix it this way.

> 
>   Yang
>   
> 
> >   
> > > 
> > > Reported-by: Yang Zhong 
> > > Suggested-by: Paolo Bonzini 
> > > Reviewed-by: Daniel P. Berrangé 
> > > Signed-off-by: Philippe Mathieu-Daudé 
> > > ---
> > >  hw/i386/sgx.c | 2 ++
> > >  1 file changed, 2 insertions(+)
> > > 
> > > diff --git a/hw/i386/sgx.c b/hw/i386/sgx.c
> > > index a2b318dd938..3ab2217ca43 100644
> > > --- a/hw/i386/sgx.c
> > > +++ b/hw/i386/sgx.c
> > > @@ -304,6 +304,8 @@ void pc_machine_init_sgx_epc(PCMachineState *pcms)
> > >  for (list = x86ms->sgx_epc_list; list; list = list->next) {
> > >  obj = object_new("sgx-epc");
> > >  
> > > +object_property_add_child(OBJECT(pcms), "sgx-epc[*]", 
> > > OBJECT(obj));
> > > +
> > >  /* set the memdev link with memory backend */
> > >  object_property_parse(obj, SGX_EPC_MEMDEV_PROP, 
> > > list->value->memdev,
> > >&error_fatal);  
> 




[PULL 5/6] hw/nvme: add ozcs enum

2022-02-14 Thread Klaus Jensen
From: Klaus Jensen 

Add enumeration for OZCS values.

Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ns.c | 3 ++-
 include/block/nvme.h | 4 
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index 8b5f98c76180..356b6c1c2f14 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -266,7 +266,8 @@ static void nvme_ns_init_zoned(NvmeNamespace *ns)
 id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
 id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
 id_ns_z->zoc = 0;
-id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
+id_ns_z->ozcs = ns->params.cross_zone_read ?
+NVME_ID_NS_ZONED_OZCS_RAZB : 0x00;
 
 for (i = 0; i <= ns->id_ns.nlbaf; i++) {
 id_ns_z->lbafe[i].zsze = cpu_to_le64(ns->zone_size);
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 709d491c70d8..e10ea6f0eb88 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1351,6 +1351,10 @@ typedef struct QEMU_PACKED NvmeIdNsZoned {
 uint8_t vs[256];
 } NvmeIdNsZoned;
 
+enum NvmeIdNsZonedOzcs {
+NVME_ID_NS_ZONED_OZCS_RAZB= 1 << 0,
+};
+
 /*Deallocate Logical Block Features*/
 #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)   ((dlfeat) & 0x10)
 #define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat)((dlfeat) & 0x08)
-- 
2.35.1




Re: [PATCH v14 6/7] softmmu/dirtylimit: Implement virtual CPU throttle

2022-02-14 Thread Hyman Huang




On 2022/2/14 16:20, Peter Xu wrote:

On Fri, Feb 11, 2022 at 12:17:40AM +0800, huang...@chinatelecom.cn wrote:

@@ -2964,8 +2971,13 @@ int kvm_cpu_exec(CPUState *cpu)
   */
  trace_kvm_dirty_ring_full(cpu->cpu_index);
  qemu_mutex_lock_iothread();
-kvm_dirty_ring_reap(kvm_state, NULL);
+if (dirtylimit_in_service()) {
+kvm_dirty_ring_reap(kvm_state, cpu);
+} else {
+kvm_dirty_ring_reap(kvm_state, NULL);
+}


Could you add some comment here on why the cpu pointer is conditionally passed
into the reaping routine?  Even if we know it now, it's not immediately obvious
to all the readers.

Sure.


[...]


+struct {
+VcpuDirtyLimitState *states;
+/* Max cpus number configured by user */
+int max_cpus;
+/* Number of vcpu under dirtylimit */
+int limited_nvcpu;
+/* Function to implement throttle set up */
+DirtyLimitFunc setup;


"setup" normally is used only at startup of something, but not per interval.
Perhaps "process" or "adjust"?  Same across other "setup" namings across the
patch.

Ok, 'adjust' is fine.


Again, I'd rather call the function directly..

Um, maybe using the function pointer is more extensible.

[...]
static void *vcpu_dirty_rate_stat_thread(void *opaque)
{
rcu_register_thread();

/* start log sync */
global_dirty_log_change(GLOBAL_DIRTY_LIMIT, true);

while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
vcpu_dirty_rate_stat_collect();
if (dirtylimit_in_service() &&
dirtylimit_state->setup) {
dirtylimit_state->setup();
}
}

/* stop log sync */
global_dirty_log_change(GLOBAL_DIRTY_LIMIT, false);

rcu_unregister_thread();
return NULL;
}
[...]

The function pointer keeps the 'dirtyrate-stat' logic and the 'dirtylimit'
logic somewhat decoupled.


But I'm ok if you insist because it's just about how to call the
'dirtylimit' and doesn't affect the whole logic.




[...]


+static void dirtylimit_adjust_throttle(CPUState *cpu)
+{
+uint64_t quota = 0;
+uint64_t current = 0;
+int cpu_index = cpu->cpu_index;
+
+quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
+current = vcpu_dirty_rate_get(cpu_index);
+
+if (current == 0) {
+cpu->throttle_us_per_full = 0;
+goto end;


Can be dropped?


+} else if (dirtylimit_done(quota, current)) {
+goto end;


Same here.  Dropping it wholely and:

} else if (!dirtylimit_done(quota, current)) {
dirtylimit_set_throttle(cpu, quota, current);
}

Would work?


+} else {
+dirtylimit_set_throttle(cpu, quota, current);
+}
+end:


Can be dropped?


+trace_dirtylimit_adjust_throttle(cpu_index,
+ quota, current,
+ cpu->throttle_us_per_full);
+return;
+}
+
+void dirtylimit_setup(void)
+{
+CPUState *cpu;
+
+if (!qatomic_read(&dirtylimit_quit)) {
+dirtylimit_state_lock();
+
+if (!dirtylimit_in_service()) {
+dirtylimit_state_unlock();


Need to return?


+}
+
+CPU_FOREACH(cpu) {
+if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
+continue;
+}
+dirtylimit_adjust_throttle(cpu);
+}
+dirtylimit_state_unlock();
+}
+}


[...]


+void dirtylimit_set_vcpu(int cpu_index,
+ uint64_t quota,
+ bool enable)
+{
+dirtylimit_vcpu_set_quota(cpu_index, quota, enable);
+trace_dirtylimit_set_vcpu(cpu_index, quota);
+}


This helper is not "help"ful..  How about wrapping the trace into
dirtylimit_vcpu_set_quota, then drop it?

Thanks,



--
Best regard

Hyman Huang(黄勇)



Re: qemu iotest 161 and make check

2022-02-14 Thread Christian Borntraeger




Am 10.02.22 um 15:47 schrieb Vladimir Sementsov-Ogievskiy:

10.02.2022 10:57, Christian Borntraeger wrote:

Hello,

I do see spurious failures of 161 in our CI, but only when I use
make check with parallelism (-j).
I have not yet figured out which other testcase could interfere

@@ -34,6 +34,8 @@
  *** Commit and then change an option on the backing file

  Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=1048576
+qemu-img: TEST_DIR/t.IMGFMT.base: Failed to get "write" lock
+Is another process using the image [TEST_DIR/t.IMGFMT.base]?
  Formatting 'TEST_DIR/t.IMGFMT.int', fmt=IMGFMT size=1048576 
backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=IMGFMT
  Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 
backing_file=TEST_DIR/t.IMGFMT.int backing_fmt=IMGFMT
  { 'execute': 'qmp_capabilities' }


any ideas?



Hmm, interesting.. Is it always 161 and always exactly this diff?


Seems to be always this diff.



[PATCH] ui/cocoa: Add Services menu

2022-02-14 Thread Akihiko Odaki
Services menu functionality of Cocoa is described at:
https://developer.apple.com/design/human-interface-guidelines/macos/extensions/services/

Signed-off-by: Akihiko Odaki 
---
 ui/cocoa.m | 4 
 1 file changed, 4 insertions(+)

diff --git a/ui/cocoa.m b/ui/cocoa.m
index ac18e14ce01..e0882fd48e2 100644
--- a/ui/cocoa.m
+++ b/ui/cocoa.m
@@ -1597,11 +1597,15 @@ static void create_initial_menus(void)
 NSMenuItem  *menuItem;
 
 [NSApp setMainMenu:[[NSMenu alloc] init]];
+[NSApp setServicesMenu:[[NSMenu alloc] initWithTitle:@"Services"]];
 
 // Application menu
 menu = [[NSMenu alloc] initWithTitle:@""];
 [menu addItemWithTitle:@"About QEMU" action:@selector(do_about_menu_item:) 
keyEquivalent:@""]; // About QEMU
 [menu addItem:[NSMenuItem separatorItem]]; //Separator
+menuItem = [menu addItemWithTitle:@"Services" action:nil 
keyEquivalent:@""];
+[menuItem setSubmenu:[NSApp servicesMenu]];
+[menu addItem:[NSMenuItem separatorItem]];
 [menu addItemWithTitle:@"Hide QEMU" action:@selector(hide:) 
keyEquivalent:@"h"]; //Hide QEMU
 menuItem = (NSMenuItem *)[menu addItemWithTitle:@"Hide Others" 
action:@selector(hideOtherApplications:) keyEquivalent:@"h"]; // Hide Others
 [menuItem 
setKeyEquivalentModifierMask:(NSEventModifierFlagOption|NSEventModifierFlagCommand)];
-- 
2.32.0 (Apple Git-132)




Re: [PATCH v14 6/7] softmmu/dirtylimit: Implement virtual CPU throttle

2022-02-14 Thread Hyman Huang




On 2022/2/14 16:20, Peter Xu wrote:

On Fri, Feb 11, 2022 at 12:17:40AM +0800, huang...@chinatelecom.cn wrote:

@@ -2964,8 +2971,13 @@ int kvm_cpu_exec(CPUState *cpu)
   */
  trace_kvm_dirty_ring_full(cpu->cpu_index);
  qemu_mutex_lock_iothread();
-kvm_dirty_ring_reap(kvm_state, NULL);
+if (dirtylimit_in_service()) {
+kvm_dirty_ring_reap(kvm_state, cpu);
+} else {
+kvm_dirty_ring_reap(kvm_state, NULL);
+}


Could you add some comment here on why the cpu pointer is conditionally passed
into the reaping routine?  Even if we know it now, it's not immediately obvious
to all the readers.

[...]


+struct {
+VcpuDirtyLimitState *states;
+/* Max cpus number configured by user */
+int max_cpus;
+/* Number of vcpu under dirtylimit */
+int limited_nvcpu;
+/* Function to implement throttle set up */
+DirtyLimitFunc setup;


"setup" normally is used only at startup of something, but not per interval.
Perhaps "process" or "adjust"?  Same across other "setup" namings across the
patch.

Again, I'd rather call the function directly..

[...]


+static void dirtylimit_adjust_throttle(CPUState *cpu)
+{
+uint64_t quota = 0;
+uint64_t current = 0;
+int cpu_index = cpu->cpu_index;
+
+quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
+current = vcpu_dirty_rate_get(cpu_index);
+
+if (current == 0) {
+cpu->throttle_us_per_full = 0;
+goto end;


Can be dropped?

Ok, I'll move this block into dirtylimit_set_throttle.



+} else if (dirtylimit_done(quota, current)) {
+goto end;


Same here.  Dropping it wholely and:

} else if (!dirtylimit_done(quota, current)) {
dirtylimit_set_throttle(cpu, quota, current);
}

Would work?

Yes.



+} else {
+dirtylimit_set_throttle(cpu, quota, current);
+}
+end:


Can be dropped?

Ok.


+trace_dirtylimit_adjust_throttle(cpu_index,
+ quota, current,
+ cpu->throttle_us_per_full);
+return;
+}
+
+void dirtylimit_setup(void)
+{
+CPUState *cpu;
+
+if (!qatomic_read(&dirtylimit_quit)) {
+dirtylimit_state_lock();
+
+if (!dirtylimit_in_service()) {
+dirtylimit_state_unlock();


Need to return?

My fault. :(



+}
+
+CPU_FOREACH(cpu) {
+if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
+continue;
+}
+dirtylimit_adjust_throttle(cpu);
+}
+dirtylimit_state_unlock();
+}
+}


[...]


+void dirtylimit_set_vcpu(int cpu_index,
+ uint64_t quota,
+ bool enable)
+{
+dirtylimit_vcpu_set_quota(cpu_index, quota, enable);
+trace_dirtylimit_set_vcpu(cpu_index, quota);
+}


This helper is not "help"ful..  How about wrapping the trace into
dirtylimit_vcpu_set_quota, then drop it?


Ok.

Thanks,



--
Best regard

Hyman Huang(黄勇)



Re: [PATCH v14 7/7] softmmu/dirtylimit: Implement dirty page rate limit

2022-02-14 Thread Hyman Huang




On 2022/2/14 16:25, Peter Xu wrote:

On Fri, Feb 11, 2022 at 12:17:41AM +0800, huang...@chinatelecom.cn wrote:

+static struct DirtyLimitInfoList *dirtylimit_query_all(void)
+{
+int i, index;
+DirtyLimitInfo *info = NULL;
+DirtyLimitInfoList *head = NULL, **tail = &head;
+
+dirtylimit_state_lock();
+
+if (!dirtylimit_in_service()) {


Need to unlock?

Yes, I'll fix it in the next version.



+return NULL;
+}
+
+for (i = 0; i < dirtylimit_state->max_cpus; i++) {
+index = dirtylimit_state->states[i].cpu_index;
+if (dirtylimit_vcpu_get_state(index)->enabled) {
+info = dirtylimit_query_vcpu(index);
+QAPI_LIST_APPEND(tail, info);
+}
+}
+
+dirtylimit_state_unlock();
+
+return head;
+}




--
Best regard

Hyman Huang(黄勇)



Re: [PATCH v14 3/7] migration/dirtyrate: Refactor dirty page rate calculation

2022-02-14 Thread Peter Xu
On Mon, Feb 14, 2022 at 04:40:31PM +0800, Hyman Huang wrote:
> Dropping this parameter is fine from my point of view if we ignore the
> duration error resulting from the delay between the caller and callee of
> 'vcpu_calculate_dirtyrate'.

Agreed, then let's drop it.

-- 
Peter Xu




Re: [PULL 0/5] 9p queue 2022-02-10

2022-02-14 Thread Christian Schoenebeck
On Sonntag, 13. Februar 2022 21:33:10 CET Peter Maydell wrote:
> On Thu, 10 Feb 2022 at 11:33, Christian Schoenebeck
> 
>  wrote:
> > The following changes since commit 0a301624c2f4ced3331ffd5bce85b4274fe132af:
> >   Merge remote-tracking branch
> >   'remotes/pmaydell/tags/pull-target-arm-20220208' into staging
> >   (2022-02-08 11:40:08 +0000)
> > are available in the Git repository at:
> >   https://github.com/cschoenebeck/qemu.git tags/pull-9p-20220210
> > 
> > for you to fetch changes up to de19c79dad6a2cad54ae04ce754d47c07bf9bc93:
> >   9pfs: Fix segfault in do_readdir_many caused by struct dirent overread
> >   (2022-02-10 11:56:01 +0100)> 
> > 
> > 9pfs: fixes and cleanup
> > 
> > * Fifth patch fixes a 9pfs server crash that happened on some systems due
> > 
> >   to incorrect (system dependant) handling of struct dirent size.
> > 
> > * Tests: Second patch fixes a test error that happened on some systems due
> > 
> >   mkdir() being called twice for creating the test directory for the 9p
> >   'local' tests.
> > 
> > * Tests: Third patch fixes a memory leak.
> > 
> > * Tests: The remaining two patches are code cleanup.
> > 
> > 
> 
> Hi; this fails CI for the build-oss-fuzz job, which finds
> a heap-buffer-overflow:
> https://gitlab.com/qemu-project/qemu/-/jobs/2087610013

So this is about the 'dirent' patch:
https://github.com/cschoenebeck/qemu/commit/de19c79dad6a2cad54ae04ce754d47c07bf9bc93

In conjunction with the 9p fuzzing tests:
https://wiki.qemu.org/Documentation/9p#Fuzzing

I first thought it might be a false positive due to the unorthodox handling of
dirent duplication by that patch, but from the ASan output below I am not
really sure about that.

Is there a way to get the content of local variables?

Would it be possible that the following issue (g_memdup vs. g_memdup2) might
apply here?
https://discourse.gnome.org/t/port-your-module-from-g-memdup-to-g-memdup2-now/5538
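
For context, the difference between the two GLib calls is only the type of
the size argument (prototypes from GLib, not specific to this report):

    gpointer g_memdup  (gconstpointer mem, guint byte_size);  /* 32-bit size */
    gpointer g_memdup2 (gconstpointer mem, gsize byte_size);  /* 64-bit size */

so it would only matter if a single dirent copy could exceed 4 GiB.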

Best regards,
Christian Schoenebeck

> 
> 8/152 qemu:qtest+qtest-i386 / qtest-i386/qos-test ERROR 66.74s killed
> by signal 6 SIGABRT
> 
> >>> QTEST_QEMU_BINARY=./qemu-system-i386
> >>> QTEST_QEMU_STORAGE_DAEMON_BINARY=./storage-daemon/qemu-storage-daemon
> >>> MALLOC_PERTURB_=120
> >>> G_TEST_DBUS_DAEMON=/builds/qemu-project/qemu/tests/dbus-vmstate-daemon.
> >>> sh QTEST_QEMU_IMG=./qemu-img
> >>> /builds/qemu-project/qemu/build-oss-fuzz/tests/qtest/qos-test --tap -k
> ― ✀
> ― Listing only the last 100 lines from
> a long log.
> For details see https://github.com/google/sanitizers/issues/189
> ==7270==WARNING: ASan doesn't fully support makecontext/swapcontext
> functions and may produce false positives in some cases!
> ==7270==WARNING: ASan is ignoring requested __asan_handle_no_return:
> stack type: default top: 0x7ffc79fb; bottom 0x7ff908ffd000; size:
> 0x000370fb3000 (14780411904)
> False positive error reports may follow
> For details see https://github.com/google/sanitizers/issues/189
> ==7276==WARNING: ASan doesn't fully support makecontext/swapcontext
> functions and may produce false positives in some cases!
> ==7276==WARNING: ASan is ignoring requested __asan_handle_no_return:
> stack type: default top: 0x7fff7e4a8000; bottom 0x7fd6363fd000; size:
> 0x0029480ab000 (177302319104)
> False positive error reports may follow
> For details see https://github.com/google/sanitizers/issues/189
> ==7282==WARNING: ASan doesn't fully support makecontext/swapcontext
> functions and may produce false positives in some cases!
> ==7282==WARNING: ASan is ignoring requested __asan_handle_no_return:
> stack type: default top: 0x7ffee6e7f000; bottom 0x7f32fb5fd000; size:
> 0x00cbeb882000 (875829927936)
> False positive error reports may follow
> For details see https://github.com/google/sanitizers/issues/189
> ==7288==WARNING: ASan doesn't fully support makecontext/swapcontext
> functions and may produce false positives in some cases!
> ==7288==WARNING: ASan is ignoring requested __asan_handle_no_return:
> stack type: default top: 0x7ffc6118e000; bottom 0x7f6391cfd000; size:
> 0x0098cf491000 (656312700928)
> False positive error reports may follow
> For details see https://github.com/google/sanitizers/issues/189
> ==7294==WARNING: ASan doesn't fully support makecontext/swapcontext
> functions and may produce false positives in some cases!
> ==7294==WARNING: ASan is ignoring requested __asan_handle_no_return:
> stack type: default top: 0x7ffef665d000; bottom 0x7f69dc8fd000; size:
> 0x009519d6 (640383582208)
> False positive error reports may follow
> For details see https://github.com/google/sanitizers/issues/189
> ==7300==WARNING: ASan doesn't fully support makecontext/swapcontext
> functions and may produce false positives in some cases!
> ==7300==WARNING: ASan is ignoring requested __asan_handle_no_return:
> stack type: def

Re: [PATCH] MAINTAINERS: Add Akihiko Odaki to macOS-relateds

2022-02-14 Thread Christian Schoenebeck
On Sonntag, 13. Februar 2022 03:12:15 CET Akihiko Odaki wrote:
> Signed-off-by: Akihiko Odaki 
> ---
>  MAINTAINERS | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 2fd74c46426..5aefb5b431a 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2333,6 +2333,7 @@ F: audio/alsaaudio.c
>  Core Audio framework backend
>  M: Gerd Hoffmann 
>  R: Christian Schoenebeck 
> +R: Akihiko Odaki 
>  S: Odd Fixes
>  F: audio/coreaudio.c
> 
> @@ -2585,6 +2586,7 @@ F: util/drm.c
> 
>  Cocoa graphics
>  M: Peter Maydell 
> +R: Akihiko Odaki 
>  S: Odd Fixes
>  F: ui/cocoa.m

Reviewed-by: Christian Schoenebeck 






Re: [PATCH v14 6/7] softmmu/dirtylimit: Implement virtual CPU throttle

2022-02-14 Thread Peter Xu
On Mon, Feb 14, 2022 at 05:05:38PM +0800, Hyman Huang wrote:
> But I'm ok if you insist because it's just about how to call the
> 'dirtylimit' and doesn't affect the whole logic.

If you don't have a plan to add e.g. another adjust() method, then it's pointless.
The hook can be easily added on top when necessary.

But I don't insist - your call. :)  Not really a big deal.

-- 
Peter Xu




Re: [PULL 0/5] 9p queue 2022-02-10

2022-02-14 Thread Peter Maydell
On Mon, 14 Feb 2022 at 09:47, Christian Schoenebeck
 wrote:
> So this is about the 'dirent' patch:
> https://github.com/cschoenebeck/qemu/commit/de19c79dad6a2cad54ae04ce754d47c07bf9bc93
>
> In conjunction with the 9p fuzzing tests:
> https://wiki.qemu.org/Documentation/9p#Fuzzing
>
> I first thought it might be a false positive due to the unorthodox handling of
> dirent duplication by that patch, but from the ASan output below I am not
> really sure about that.
>
> Is there a way to get the content of local variables?

Yes. You can build locally with the clang sanitizers enabled and then
run under gdb and with the appropriate environment variables to tell the
sanitizer to abort() on failures.

> Would it be possible that the following issue (g_memdup vs. g_memdup2) might
> apply here?
> https://discourse.gnome.org/t/port-your-module-from-g-memdup-to-g-memdup2-now/5538

It seems unlikely that the problem is that you're allocating more than
4 gigabytes and thus hitting a 64-to-32 truncation.

thanks
-- PMM



Re: [RFC] thread-pool: Add option to fix the pool size

2022-02-14 Thread Stefan Hajnoczi
On Fri, Feb 11, 2022 at 12:32:57PM +0100, Kevin Wolf wrote:
> Am 03.02.2022 um 15:19 hat Stefan Hajnoczi geschrieben:
> > On Thu, Feb 03, 2022 at 10:56:49AM +, Daniel P. Berrangé wrote:
> > > On Thu, Feb 03, 2022 at 10:53:07AM +, Stefan Hajnoczi wrote:
> > > > On Wed, Feb 02, 2022 at 06:52:34PM +0100, Nicolas Saenz Julienne wrote:
> > 1. A global default value that all new AioContext take. The QEMU main
> >loop's qemu_aio_context will use this and all IOThread AioContext
> >will use it (unless they have been overridden).
> > 
> >I would define it on --machine because that's the "global" object for
> >a guest, but that's not very satisfying.
> 
> Semantically, -machine is about the virtual hardware where as iothreads
> are about the backend, so I agree it's not a good fit.
> 
> For the main thread, you may want to configure all the same options that
> you can configure for an iothread. So to me that sounds like we would
> want to allow using an iothread object for the main thread, too.
> 
> That would still require us to tell QEMU which iothread object should be
> used for the main thread, though.

Making the main loop thread an IOThread is an interesting direction but
not an easy change to make.

The main loop thread has a custom event loop that is not interchangeable
with the IOThread event loop:
- The main loop has a poll notifier interface for libslirp fd monitoring
  integration.
- The main loop is a GLib event loop but manually polls to get
  nanosecond resolution timers.
- The main loop has icount integration.
- The main loop has the special iohandler AioContext

The IOThread event loop runs an optimized AioContext event loop instead.
It falls back to regular g_main_loop_run() if there is a GSource user.

It would definitely be nice to unify the main loop with IOThread and
then use --object iothread,... to configure main loop parameters.
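
Purely hypothetical syntax, since the RFC has not settled on option names
("thread-pool-min"/"thread-pool-max" are invented here for illustration),
the end state being discussed would let users write something like:

    -object iothread,id=iothread0,thread-pool-min=8,thread-pool-max=8

with the same kind of object (or an equivalent one) also describing the
main loop.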

I'm not sure if requiring that of Nicolas is fair though. The event
loops in QEMU are complex and changes are likely to introduce subtle
bugs or performance regressions.

Stefan


signature.asc
Description: PGP signature


Re: [PATCH 1/6] block/io.c: fix bdrv_child_cb_drained_begin invocations from a coroutine

2022-02-14 Thread Emanuele Giuseppe Esposito



On 11/02/2022 12:54, Kevin Wolf wrote:
> Am 08.02.2022 um 16:36 hat Emanuele Giuseppe Esposito geschrieben:
>> Using bdrv_do_drained_begin_quiesce() in bdrv_child_cb_drained_begin()
>> is not a good idea: the callback might be called when running
>> a drain in a coroutine, and bdrv_drained_begin_poll() does not
>> handle that case, resulting in assertion failure.
> 
> I remembered that we talked about this only recently on IRC, but it
> didn't make any sense to me again when I read this commit message. So I
> think we need --verbose.
> 
> The .drained_begin callback was always meant to run outside of coroutine
> context, so the unexpected part isn't that it calls a function that
> can't run in coroutine context, but that it is already called itself in
> coroutine context.
> 
> The problematic path is bdrv_replace_child_noperm() which then calls
> bdrv_parent_drained_begin_single(poll=true). Polling in coroutine
> context is dangerous, it can cause deadlocks because the caller of the
> coroutine can't make progress. So I believe this call is already wrong
> in coroutine context.

Ok, you added this assertion in dcf94a23, but at that time there was no
bdrv_parent_drained_begin_single, and the polling was only done in
bdrv_do_drained_begin. So I think that, to keep the same logic, the
assertion should be moved into bdrv_parent_drained_begin_single()? And
even more specifically, only if the poll flag is true.
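
A rough sketch of that idea (illustrative only; it assumes the current
bdrv_parent_drained_begin_single(BdrvChild *c, bool poll) signature and
that the assertion in question is the !qemu_in_coroutine() check):

    void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
    {
        if (poll) {
            /* polling from coroutine context can deadlock the caller */
            assert(!qemu_in_coroutine());
        }
        /* ... existing quiesce logic ... */
    }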

I triggered this by adding additional drains in the callers of
bdrv_replace_child_noperm(), and I think some test (probably unit test)
was failing because of either the drained_begin callback itself called
by the drain, or as you suggested the callbacks called by
bdrv_parent_drained_begin_single from bdrv_replace_child_noperm.

Anyways, I think that in addition to the fix in this patch, we should
also fix bdrv_parent_drained_begin_single(poll=true) in
bdrv_replace_child_noperm, with something similar to what is done in
bdrv_co_yield_to_drain? ie if we are in coroutine, schedule a BH that
runs the same logic but in the main loop, but then somehow wait that it
finishes before continuing?
Even though at that point we would have a coroutine waiting for the main
loop, which I don't think it's something we want.

Alternatively, we would forbid polling in coroutines at all. And the
only place I can see that is using the drain in coroutine is mirror (see
below).


Additional question: I also noticed that there is a bdrv_drained_begin()
call in mirror.c in the JobDriver run() callback. How can this even
work? If a parent uses bdrv_child_cb_drained_begin (which should not be
so rare) it will crash because of the assertion.

Further additional question: I also don't understand the polling logic
of mirror (mirror_drained_poll): if we are draining in the coroutine
with in_drain = true, I think we can have a deadlock if in_flight > 0?

Emanuele

> 
> Now I don't know the call path up to bdrv_replace_child_noperm(), but as
> far as I remember, that was another function that was originally never
> run in coroutine context. Maybe we have good reason to change this, I
> can't point to anything that would be inherently wrong with it, but I
> would still be curious in which context it does run in a coroutine now.
> 
> Anyway, whatever the specific place is, I believe we must drop out of
> coroutine context _before_ calling bdrv_parent_drained_begin_single(),
> not only in callbacks called by it.
> 
>> Instead, bdrv_do_drained_begin with no recursion and poll
>> will accomplish the same thing (invoking bdrv_do_drained_begin_quiesce)
>> but will firstly check if we are already in a coroutine, and exit
>> from that via bdrv_co_yield_to_drain().
>>
>> Signed-off-by: Emanuele Giuseppe Esposito 
> 
> Kevin
> 




Re: [PATCH] ui/cocoa: Set UI information

2022-02-14 Thread Gerd Hoffmann
> (2) A question for Gerd:
> Is it safe to call dpy_set_ui_info() from a non-QEMU-main-thread?

No.

take care,
  Gerd




Re: [PATCH v4 4/4] hw/i386/sgx: Attach SGX-EPC objects to machine

2022-02-14 Thread Daniel P . Berrangé
On Mon, Feb 14, 2022 at 09:21:07AM +0100, Igor Mammedov wrote:
> On Mon, 14 Feb 2022 14:58:57 +0800
> Yang Zhong  wrote:
> 
> > On Mon, Feb 07, 2022 at 09:37:52AM +0100, Igor Mammedov wrote:
> > > On Sat,  5 Feb 2022 13:45:26 +0100
> > > Philippe Mathieu-Daudé  wrote:
> > >   
> > > > Previously SGX-EPC objects were exposed in the QOM tree at a path
> > > > 
> > > >   /machine/unattached/device[nn]
> > > > 
> > > > where the 'nn' varies depending on what devices were already created.
> > > > 
> > > > With this change the SGX-EPC objects are now at
> > > > 
> > > >   /machine/sgx-epc[nn]
> > > > 
> > > > where the 'nn' of the first SGX-EPC object is always zero.  
> > > 
> > > yet again, why it's necessary?  
> > 
> > 
> >   Igor, Sorry for delay feedback because of Chinese New Year holiday.
> > 
> >   This series patches are to fix below issues I reported before,
> >   https://lists.nongnu.org/archive/html/qemu-devel/2021-11/msg05670.html
> > 
> >   Since the /machine/unattached/device[0] is used by vcpu and Libvirt
> >   use this interface to get unavailable-features list. But in the SGX
> >   VM, the device[0] will be occupied by virtual sgx epc device, Libvirt
> >   can't get unavailable-features from this device[0].
> > 
> >   Although patch 2 in this series already fixed "unavailable-features" 
> > issue,
> 
> I've seen patches on libvirt fixing "unavailable-features" in another way
> without dependence on  /machine/unattached/device[0].
> see:
>  https://www.mail-archive.com/libvir-list@redhat.com/msg226244.html
> 
> >   this patch can move sgx virtual device from /machine/unattached/device[nn]
> >   to /machine/sgx-epc[nn], which seems more clear. Thanks!
> 
> with those patches device[0] becomes a non-issue, and this patch also
> becomes unnecessary.
> I don't mind putting sgx-epc under machine, but that shall be justified
> somehow. A drawback I noticed in this case is an extra manual
> plumbing/wiring without apparent need for it.

This is effectively questioning why we have a QOM hierarchy with
named devices at all. IMHO we don't need to justify giving explicitly
named nodes under QOM beyond  "this is normal QOM modelling", and
anything under '/unattached' is subject to being fixed in this way.

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH] qapi, i386/sev: Add debug-launch-digest to launch-measure response

2022-02-14 Thread Dr. David Alan Gilbert
* Daniel P. Berrangé (berra...@redhat.com) wrote:
> On Thu, Feb 10, 2022 at 07:39:01PM +, Dr. David Alan Gilbert wrote:
> > * Daniel P. Berrangé (berra...@redhat.com) wrote:
> > > I wonder if we're thinking of this at the wrong level though. Does
> > > it actually need to be QEMU providing this info to the guest owner ?
> > > 
> > > Guest owners aren't going to be interacting with QEMU / QMP directly,
> > > nor are they likely to be interacting with libvirt directly. Their
> > > way into the public cloud will be via some high level API. eg the
> > > OpenStack Nova REST API, or the IBM Cloud API (whatever that may
> > > be). This high level mgmt infra is likely what is deciding which
> > > of the 'N' possible OVMF builds to pick for a given VM launch. It
> > > could easily just expose the full OVMF data to the user via its
> > > own API regardless of what query-sev does.
> > > 
> > > Similarly if the cloud is choosing which kernel, out of N possible
> > > kernels to boot with, they could expose the raw kernel data somewhere
> > > in their API - we don't neccessarily need to expose that from QEMU.
> > 
> > It gets more interesting where it's the guest which picks the
> > kernel/initrd; imagine the setup where the cloud reads the kernel/initrd
> > from the guest disk and passes that to qemu; one of the update ideas
> > would be just to let the guest update from a repo at it's own pace;
> > so the attestor doesn't know whether to expect a new or old kernel
> > from the guest; but it does know it should be one of the approved
> > set of kernels.
> 
> So that scenario would effectively be the old Xen style pygrub where
> you have some script on the host to pull the kernel/initrd out of
> the guest /boot.

Right.

> On the plus side that would enable you to use a "normal" guest disk
> image with unencrypted /boot, instead of encrypting everything.

Yes; the other plus is that you don't need the guest to send update
information back to the attestor to tell it when it's done an update;
that's potentially a big simplification on an untrusted interface.

> The risk though is that you need a strong guarantee that the *only* data
> from /boot that is used is the kernel+initrd+cmdline that get included
> in the measurement. If the guest boot process reads anything else from
> /boot then your confidentiality is potentially doomed. This feels like
> quite a risky setup, as I don't know how you'd achieve the high level of
> confidence that stuff in /boot isn't going to cause danger to the guest
> during boot, or after boot.

Which I think pygrub type things found out the hard way; but as long as
you copy from /boot, attest on the data you use and then boot using the
data you attested you should be OK.

Dave

> Regards,
> Daniel
> -- 
> |: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org -o-https://fstop138.berrange.com :|
> |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
> 
-- 
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: [PATCH 3/6] block.c: bdrv_replace_child_noperm: first call ->attach(), and then add child

2022-02-14 Thread Emanuele Giuseppe Esposito



On 11/02/2022 13:34, Kevin Wolf wrote:
> Am 08.02.2022 um 16:36 hat Emanuele Giuseppe Esposito geschrieben:
>> Doing the opposite can make adding the child node to a non-drained node,
>> as apply_subtree_drain is only done in ->attach() and thus make
>> assert_bdrv_graph_writable fail.
>>
>> This can happen for example during a transaction rollback (test 245,
>> test_io_with_graph_changes):
>> 1. a node is removed from the graph, thus it is undrained
>> 2. then something happens, and we need to roll back the transactions
>>through tran_abort()
>> 3. at this point, the current code would first attach the undrained node
>>to the graph via QLIST_INSERT_HEAD, and then call ->attach() that
>>will take care of restoring the drain with apply_subtree_drain(),
>>leaving the node undrained between the two operations.
>>
>> Signed-off-by: Emanuele Giuseppe Esposito 
>> ---
>>  block.c | 6 --
>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/block.c b/block.c
>> index ec346a7e2e..08a6e3a4ef 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -2872,8 +2872,6 @@ static void bdrv_replace_child_noperm(BdrvChild 
>> **childp,
>>  }
>>  
>>  if (new_bs) {
>> -assert_bdrv_graph_writable(new_bs);
>> -QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
>>  
>>  /*
>>   * Detaching the old node may have led to the new node's
>> @@ -2890,6 +2888,10 @@ static void bdrv_replace_child_noperm(BdrvChild 
>> **childp,
>>  if (child->klass->attach) {
>>  child->klass->attach(child);
>>  }
>> +
>> +assert_bdrv_graph_writable(new_bs);
>> +QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
>> +
>>  }
> 
> Extra empty line. Looks good otherwise.
> 
> Does this also mean that the order in bdrv_child_cb_attach/detach() is
> wrong? Or maybe adding a new node to bs->children is okay even when the
> child node isn't drained.

No I don't think it's wrong. In fact, if we are just replacing a node
(so old_bs and new_bs are both != NULL), the child will be just removed
and then re-added to the same children's list of the same parent
(child->opaque).

Whether adding a new node to bs->children requires a drain or not is
still under debate in the other series with Vladimir. We'll see about
that, but in the meanwhile this is just a safe fix that makes sure that
*if* drains are added, everything will always stay under proper drain.

Emanuele

> 
> Kevin
> 




Re: [PULL 0/5] 9p queue 2022-02-10

2022-02-14 Thread Greg Kurz
On Mon, 14 Feb 2022 10:47:43 +0100
Christian Schoenebeck  wrote:

> On Sonntag, 13. Februar 2022 21:33:10 CET Peter Maydell wrote:
> > On Thu, 10 Feb 2022 at 11:33, Christian Schoenebeck
> > 
> >  wrote:
> > > The following changes since commit 
> > > 0a301624c2f4ced3331ffd5bce85b4274fe132af:
> > >   Merge remote-tracking branch
> > >   'remotes/pmaydell/tags/pull-target-arm-20220208' into staging
> > >   (2022-02-08 11:40:08 +)> 
> > > are available in the Git repository at:
> > >   https://github.com/cschoenebeck/qemu.git tags/pull-9p-20220210
> > > 
> > > for you to fetch changes up to de19c79dad6a2cad54ae04ce754d47c07bf9bc93:
> > >   9pfs: Fix segfault in do_readdir_many caused by struct dirent overread
> > >   (2022-02-10 11:56:01 +0100)> 
> > > 9d82f6a3e68c2
> > > 9pfs: fixes and cleanup
> > > 
> > > * Fifth patch fixes a 9pfs server crash that happened on some systems due
> > > 
> > >   to incorrect (system dependant) handling of struct dirent size.
> > > 
> > > * Tests: Second patch fixes a test error that happened on some systems due
> > > 
> > >   mkdir() being called twice for creating the test directory for the 9p
> > >   'local' tests.
> > > 
> > > * Tests: Third patch fixes a memory leak.
> > > 
> > > * Tests: The remaining two patches are code cleanup.
> > > 
> > > 
> > 
> > Hi; this fails CI for the build-oss-fuzz job, which finds
> > a heap-buffer-overflow:
> > https://gitlab.com/qemu-project/qemu/-/jobs/2087610013
> 
> So this is about the 'dirent' patch:
> https://github.com/cschoenebeck/qemu/commit/de19c79dad6a2cad54ae04ce754d47c07bf9bc93
> 
> In conjunction with the 9p fuzzing tests:
> https://wiki.qemu.org/Documentation/9p#Fuzzing
> 
> I first thought it might be a false positive due to the unorthodox handling of
> dirent duplication by that patch, but from the ASan output below I am not
> really sure about that.
> 

No, this is an actual bug. See below.

> Is there a way to get the content of local variables?
> 
> Would it be possible that the following issue (g_memdup vs. g_memdup2) might
> apply here?
> https://discourse.gnome.org/t/port-your-module-from-g-memdup-to-g-memdup2-now/5538
> 
> Best regards,
> Christian Schoenebeck
> 
> > 
> > 8/152 qemu:qtest+qtest-i386 / qtest-i386/qos-test ERROR 66.74s killed
> > by signal 6 SIGABRT
> > 
> > >>> QTEST_QEMU_BINARY=./qemu-system-i386
> > >>> QTEST_QEMU_STORAGE_DAEMON_BINARY=./storage-daemon/qemu-storage-daemon
> > >>> MALLOC_PERTURB_=120
> > >>> G_TEST_DBUS_DAEMON=/builds/qemu-project/qemu/tests/dbus-vmstate-daemon.
> > >>> sh QTEST_QEMU_IMG=./qemu-img
> > >>> /builds/qemu-project/qemu/build-oss-fuzz/tests/qtest/qos-test --tap -k
> > ― ✀
> > ― Listing only the last 100 lines from
> > a long log.
> > For details see https://github.com/google/sanitizers/issues/189
> > ==7270==WARNING: ASan doesn't fully support makecontext/swapcontext
> > functions and may produce false positives in some cases!
> > ==7270==WARNING: ASan is ignoring requested __asan_handle_no_return:
> > stack type: default top: 0x7ffc79fb; bottom 0x7ff908ffd000; size:
> > 0x000370fb3000 (14780411904)
> > False positive error reports may follow
> > For details see https://github.com/google/sanitizers/issues/189
> > ==7276==WARNING: ASan doesn't fully support makecontext/swapcontext
> > functions and may produce false positives in some cases!
> > ==7276==WARNING: ASan is ignoring requested __asan_handle_no_return:
> > stack type: default top: 0x7fff7e4a8000; bottom 0x7fd6363fd000; size:
> > 0x0029480ab000 (177302319104)
> > False positive error reports may follow
> > For details see https://github.com/google/sanitizers/issues/189
> > ==7282==WARNING: ASan doesn't fully support makecontext/swapcontext
> > functions and may produce false positives in some cases!
> > ==7282==WARNING: ASan is ignoring requested __asan_handle_no_return:
> > stack type: default top: 0x7ffee6e7f000; bottom 0x7f32fb5fd000; size:
> > 0x00cbeb882000 (875829927936)
> > False positive error reports may follow
> > For details see https://github.com/google/sanitizers/issues/189
> > ==7288==WARNING: ASan doesn't fully support makecontext/swapcontext
> > functions and may produce false positives in some cases!
> > ==7288==WARNING: ASan is ignoring requested __asan_handle_no_return:
> > stack type: default top: 0x7ffc6118e000; bottom 0x7f6391cfd000; size:
> > 0x0098cf491000 (656312700928)
> > False positive error reports may follow
> > For details see https://github.com/google/sanitizers/issues/189
> > ==7294==WARNING: ASan doesn't fully support makecontext/swapcontext
> > functions and may produce false positives in some cases!
> > ==7294==WARNING: ASan is ignoring requested __asan_handle_no_return:
> > stack type: default top: 0x7ffef665d000; bottom 0x7f69dc8fd000; size:
> > 0x009519d6 (640383582

Re: [RFC PATCH 01/25] virtio-snd: Add virtio sound header file

2022-02-14 Thread Gerd Hoffmann
On Sat, Feb 12, 2022 at 03:42:55AM +0530, Shreyansh Chouhan wrote:
> Added device configuration and common definitions to the header
> file.
> 
> Signed-off-by: Shreyansh Chouhan 
> ---
>  include/hw/virtio/virtio-snd.h | 97 ++
>  1 file changed, 97 insertions(+)
>  create mode 100644 include/hw/virtio/virtio-snd.h
> 
> diff --git a/include/hw/virtio/virtio-snd.h b/include/hw/virtio/virtio-snd.h
> new file mode 100644
> index 00..bbbf174c51
> --- /dev/null
> +++ b/include/hw/virtio/virtio-snd.h

We already have include/standard-headers/linux/virtio_snd.h (synced from
linux kernel), you can (and should) just use that.

take care,
  Gerd




Re: [PATCH] ui/cocoa: Set UI information

2022-02-14 Thread Peter Maydell
On Mon, 14 Feb 2022 at 10:27, Gerd Hoffmann  wrote:
>
> > (2) A question for Gerd:
> > Is it safe to call dpy_set_ui_info() from a non-QEMU-main-thread?
>
> No.

Is it OK to do so if the other thread is holding the iothread lock?
(This is how we do a lot of the other "need to call a QEMU function"
work from the cocoa UI thread.)

thanks
-- PMM



Re: [RFC PATCH v2 00/25] Virtio Sound card Implementation

2022-02-14 Thread Gerd Hoffmann
  Hi,

> IMHO, all your patches can be merged in only one.

For the most part yes.  I'd keep the pci wrapper (aka -device
virtio-snd-pci) separate though.  Possibly also patches adding
significant functionality in the future (i.e. one patch with all
basics and playback support, one patch adding recording
functionality, ...).

> Moreover it would help for
> review as some patches remove code done in previous patches.

Yes, squashing the incremental fixes at the end of the series makes
sense.

take care,
  Gerd




Re: [RFC PATCH v2 00/25] Virtio Sound card Implementation

2022-02-14 Thread Laurent Vivier

On 14/02/2022 11:44, Gerd Hoffmann wrote:

   Hi,


IMHO, all your patches can be merged in only one.


For the most part yes.  I'd keep the pci wrapper (aka -device
virtio-snd-pci) separate though.  Possibly also patches adding
significant functionality in the future (i.e. one patch with all
basics and playback support, one patch adding recording
functionality, ...).



I agree.

Laurent



Re: [PATCH 5/6] test-bdrv-drain.c: remove test_detach_by_parent_cb()

2022-02-14 Thread Emanuele Giuseppe Esposito



On 11/02/2022 16:38, Kevin Wolf wrote:
> Am 08.02.2022 um 16:36 hat Emanuele Giuseppe Esposito geschrieben:
>> This test uses a callback of an I/O function (blk_aio_preadv)
>> to modify the graph, using bdrv_attach_child.
>> This is simply not allowed anymore. I/O cannot change the graph.
> 
> The callback of an I/O function isn't I/O, though. It is code _after_
> the I/O has completed. If this doesn't work any more, it feels like this
> is a bug.
> 
> I think it becomes a bit more obvious when you translate the AIO into
> the equivalent coroutine function:
> 
> void coroutine_fn foo(...)
> {
> GLOBAL_STATE_CODE();
> 
> blk_co_preadv(...);
> detach_by_parent_aio_cb(...);
> }
> 
> This is obviously correct code that must work. Calling an I/O function
> from a GS function is allowed, and of course the GS function may
> continue to do GS stuff after the I/O function returned.
> 
> (Actually, I'm not sure if it really works when blk is not in the main
> AioContext, but your API split patches claim that it does, so let's
> assume for the moment that this is already true :-))

Uhmm, why does my split claim that it is? All blk_aio_* are IO_CODE.
Also, as you said, if blk is not in the main loop, the callback is not
GS either.

> 
>> Before "block/io.c: make bdrv_do_drained_begin_quiesce static
>> and introduce bdrv_drained_begin_no_poll", the test would simply
>> be at risk of failure, because if bdrv_replace_child_noperm()
>> (called to modify the graph) would call a drain,
>> then one callback of .drained_begin() is bdrv_do_drained_begin_quiesce,
>> that specifically asserts that we are not in a coroutine.
>>
>> Now that we fixed the behavior, the drain will invoke a bh in the
>> main loop, so we don't have such problem. However, this test is still
>> illegal and fails because we forbid graph changes from I/O paths.
>>
>> Once we add the required subtree_drains to protect
>> bdrv_replace_child_noperm(), the key problem in this test is in:
> 
> Probably a question for a different patch, but why is a subtree drain
> required instead of just a normal node drain? It feels like a bigger
> hammer than what is needed for replacing a single child.
> 
>> acb = blk_aio_preadv(blk, 0, &qiov, 0, detach_by_parent_aio_cb, NULL);
>> /* Drain and check the expected result */
>> bdrv_subtree_drained_begin(parent_b);
>>
>> because the detach_by_parent_aio_cb calls detach_indirect_bh(), that
>> modifies the graph and is invoked during bdrv_subtree_drained_begin().
>> The call stack is the following:
>> 1. blk_aio_preadv() creates a coroutine, increments in_flight counter
>> and enters the coroutine running blk_aio_read_entry()
>> 2. blk_aio_read_entry() performs the read and then schedules a bh to
>>complete (blk_aio_complete)
>> 3. at this point, subtree_drained_begin() kicks in and waits for all
>>in_flight requests, polling
>> 4. polling allows the bh to be scheduled, so blk_aio_complete runs
>> 5. blk_aio_complete *first* invokes the callback
>>(detach_by_parent_aio_cb) and then decrements the in_flight counter
>> 6. Here we have the problem: detach_by_parent_aio_cb modifies the graph,
>>so both bdrv_unref_child() and bdrv_attach_child() will have
>>subtree_drains inside. And this causes a deadlock, because the
>>nested drain will wait for in_flight counter to go to zero, which
>>is only happening once the drain itself finishes.
> 
> So the problem has nothing to do with detach_by_parent_aio_cb() being
> an I/O function in the sense of locking rules (which it isn't), but with
> calling a drain operation while having the in_flight counter increased.
> 
> In other words, an AIO callback like detach_by_parent_aio_cb() must
> never call drain - and it doesn't before your changes to
> bdrv_replace_child_noperm() break it. How confident are we that this
> only breaks tests and not real code?
I am not sure. From a quick look, the AIO callback is really used pretty
much everywhere. Maybe we should really find a way to assert
GLOBAL_STATE_CODE and friends?

> 
> Part of the problem is probably that drain is serving two slightly
> different purposes inside the block layer (just make sure that nothing
> touches the node any more) and callers outside of the block layer (make
> sure that everything has completed and no more callbacks will be
> called). This bug sits exactly in the difference between those two
> concepts - we're waiting for more than we would have to wait for, and it
> causes a deadlock in this combination.
> 
> I guess it could be fixed if BlockBackend accounted for requests that
> are already completed, but their callback hasn't yet. blk_drain() would
> then have to wait for those requests, but blk_root_drained_poll()
> wouldn't because these requests don't affect the root node any more.
> 
>> Different story is test_detach_by_driver_cb(): in this case,
>> detach_by_parent_aio_cb() does not call detach_indirect_bh(),
>> but it is instead called as a bh r

Re: [PATCH v2] Deprecate C virtiofsd

2022-02-14 Thread Stefan Hajnoczi
On Thu, Feb 10, 2022 at 05:47:14PM +, Dr. David Alan Gilbert (git) wrote:
> From: "Dr. David Alan Gilbert" 
> 
> There's a nice new Rust implementation out there; recommend people
> do new work on that.
> 
> Signed-off-by: Dr. David Alan Gilbert 
> ---
>  docs/about/deprecated.rst | 17 +
>  1 file changed, 17 insertions(+)

Thanks, applied to my block tree:
https://gitlab.com/stefanha/qemu/commits/block

Stefan


signature.asc
Description: PGP signature


[PATCH 1/2] spapr: Add SPAPR_CAP_AIL_MODE_3 for AIL mode 3 support for H_SET_MODE hcall

2022-02-14 Thread Nicholas Piggin
The behaviour of the Address Translation Mode on Interrupt resource is
not consistently supported by all CPU versions or all KVM versions. In
particular KVM HV only supports mode 0 on POWER7 processors and some
early POWER9 processors, and does not support mode 2 anywhere. KVM PR
only supports mode 0. TCG supports all modes (0, 2, 3).

This leads to inconsistencies in guest behaviour and could cause
problems migrating guests. This was not noticable for Linux guests for a
long time because the kernel only uses modes 0 and 3, and it used to
consider AIL-3 to be advisory in that it would always keep the AIL-0
vectors around. KVM for a long time would not always honor the AIL mode,
contrary to architecture.

Recent Linux guests depend on the AIL mode working as specified in order
to support the SCV facility interrupt. If AIL-3 can not be provided,
then Linux must be given an error so it can disable the SCV facility,
rather than silently failing.

Add the ail-mode-3 capability to specify that AIL-3 is supported. AIL-0
is implied as the baseline. AIL-2 is no longer supported by spapr. It
is not known to be used by any software, but support in TCG could be
restored with an ail-mode-2 capability quite easily if a regression is
reported.

Modify the H_SET_MODE Address Translation Mode on Interrupt resource
handler to check capabilities and correctly return error if not
supported.

A heuristic is added for KVM to determine AIL-3 support before the
introduction of a new KVM CAP.

Signed-off-by: Nicholas Piggin 
---
Since the RFC, I made this a single mode cap for ail-3, suggested
by David. Also split out the KVM CAP into patch 2 because that is
not ready for merge yet (this patch can go in ahead of it).

Thanks,
Nick

 hw/ppc/spapr.c |  6 ++
 hw/ppc/spapr_caps.c| 23 +++
 hw/ppc/spapr_hcall.c   | 23 +--
 include/hw/ppc/spapr.h |  4 +++-
 target/ppc/kvm.c   | 23 +++
 target/ppc/kvm_ppc.h   |  6 ++
 6 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 3d6ec309dd..15a02d3e78 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -4606,6 +4606,12 @@ static void spapr_machine_class_init(ObjectClass *oc, 
void *data)
 smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON;
 smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON;
 smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF;
+
+/* This cap specifies whether the AIL 3 mode for H_SET_RESOURCE is
+ * supported. Default to true, (PR KVM, POWER7 and earlier, and KVM on
+ * some early POWER9s only support 0).
+ */
+smc->default_caps.caps[SPAPR_CAP_AIL_MODE_3] = SPAPR_CAP_ON;
 spapr_caps_add_properties(smc);
 smc->irq = &spapr_irq_dual;
 smc->dr_phb_enabled = true;
diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
index ed7c077a0d..5fd4a53c33 100644
--- a/hw/ppc/spapr_caps.c
+++ b/hw/ppc/spapr_caps.c
@@ -613,6 +613,20 @@ static void cap_rpt_invalidate_apply(SpaprMachineState 
*spapr,
 }
 }
 
+static void cap_ail_mode_3_apply(SpaprMachineState *spapr,
+ uint8_t val, Error **errp)
+{
+ERRP_GUARD();
+
+if (kvm_enabled()) {
+if (!kvmppc_supports_ail_3()) {
+error_setg(errp, "KVM implementation does not support 
cap-ail-mode-3");
+error_append_hint(errp, "Try appending -machine 
cap-ail-mode-3=off\n");
+return;
+}
+}
+}
+
 SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
 [SPAPR_CAP_HTM] = {
 .name = "htm",
@@ -730,6 +744,15 @@ SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
 .type = "bool",
 .apply = cap_rpt_invalidate_apply,
 },
+[SPAPR_CAP_AIL_MODE_3] = {
+.name = "ail-mode-3",
+.description = "Alternate Interrupt Location (AIL) mode 3 support",
+.index = SPAPR_CAP_AIL_MODE_3,
+.get = spapr_cap_get_bool,
+.set = spapr_cap_set_bool,
+.type = "bool",
+.apply = cap_ail_mode_3_apply,
+},
 };
 
 static SpaprCapabilities default_caps_with_cpu(SpaprMachineState *spapr,
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 222c1b6bbd..5dec056796 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -811,32 +811,35 @@ static target_ulong h_set_mode_resource_le(PowerPCCPU 
*cpu,
 }
 
 static target_ulong h_set_mode_resource_addr_trans_mode(PowerPCCPU *cpu,
+SpaprMachineState 
*spapr,
 target_ulong mflags,
 target_ulong value1,
 target_ulong value2)
 {
-PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
-
-if (!(pcc->insns_flags2 & PPC2_ISA207S)) {
-return H_P2;
-}
 if (value1) {
 return H_P3;
 }
+
 if (

[PATCH 2/2] target/ppc/kvm: Use KVM_CAP_PPC_AIL_MODE_3 to determine cap-ail-mode-3 support

2022-02-14 Thread Nicholas Piggin
Use KVM_CAP_PPC_AIL_MODE_3 to determine cap-ail-mode-3 support for KVM
guests. Keep the fallback heuristic for KVM hosts that pre-date this
CAP.

This is only a proposal, as the KVM CAP has not yet been allocated. I will
ask to merge the new KVM cap when there are no objections on the QEMU
side.

not-yet-Signed-off-by: Nicholas Piggin 
---
 hw/ppc/spapr_caps.c   |  2 +-
 linux-headers/linux/kvm.h |  1 +
 target/ppc/kvm.c  | 18 +-
 target/ppc/kvm_ppc.h  |  4 ++--
 4 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
index 5fd4a53c33..5cc80776d0 100644
--- a/hw/ppc/spapr_caps.c
+++ b/hw/ppc/spapr_caps.c
@@ -619,7 +619,7 @@ static void cap_ail_mode_3_apply(SpaprMachineState *spapr,
 ERRP_GUARD();
 
 if (kvm_enabled()) {
-if (!kvmppc_supports_ail_3()) {
+if (!kvmppc_has_cap_ail_3()) {
 error_setg(errp, "KVM implementation does not support 
cap-ail-mode-3");
 error_append_hint(errp, "Try appending -machine 
cap-ail-mode-3=off\n");
 return;
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index 02c5e7b7bb..d91f578200 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -1130,6 +1130,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_BINARY_STATS_FD 203
 #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
 #define KVM_CAP_ARM_MTE 205
+#define KVM_CAP_PPC_AIL_MODE_3 210
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 128bc530d4..d0d0bdaac4 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -90,6 +90,7 @@ static int cap_ppc_nested_kvm_hv;
 static int cap_large_decr;
 static int cap_fwnmi;
 static int cap_rpt_invalidate;
+static int cap_ail_mode_3;
 
 static uint32_t debug_inst_opcode;
 
@@ -154,6 +155,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
 }
 
 cap_rpt_invalidate = kvm_vm_check_extension(s, KVM_CAP_PPC_RPT_INVALIDATE);
+cap_ail_mode_3 = kvm_vm_check_extension(s, KVM_CAP_PPC_AIL_MODE_3);
 kvm_ppc_register_host_cpu_type();
 
 return 0;
@@ -2563,10 +2565,24 @@ int kvmppc_has_cap_rpt_invalidate(void)
 return cap_rpt_invalidate;
 }
 
-int kvmppc_supports_ail_3(void)
+int kvmppc_has_cap_ail_3(void)
 {
 PowerPCCPUClass *pcc = kvm_ppc_get_host_cpu_class();
 
+if (cap_ail_mode_3) {
+return 1;
+}
+
+if (kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_PPC_AIL_MODE_3) == 
0) {
+return 0;
+}
+
+/*
+ * For KVM hosts that pre-date this cap, special-case the test because
+ * the performance cost for disabling the feature unconditionally is
+ * prohibitive.
+ */
+
 /*
  * KVM PR only supports AIL-0
  */
diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
index d9d1c54955..efafa67b83 100644
--- a/target/ppc/kvm_ppc.h
+++ b/target/ppc/kvm_ppc.h
@@ -73,7 +73,7 @@ int kvmppc_set_cap_nested_kvm_hv(int enable);
 int kvmppc_get_cap_large_decr(void);
 int kvmppc_enable_cap_large_decr(PowerPCCPU *cpu, int enable);
 int kvmppc_has_cap_rpt_invalidate(void);
-int kvmppc_supports_ail_3(void);
+int kvmppc_has_cap_ail_3(void);
 int kvmppc_enable_hwrng(void);
 int kvmppc_put_books_sregs(PowerPCCPU *cpu);
 PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void);
@@ -394,7 +394,7 @@ static inline int kvmppc_has_cap_rpt_invalidate(void)
 return false;
 }
 
-static inline int kvmppc_supports_ail_3(void)
+static inline int kvmppc_has_cap_ail_3(void)
 {
 return false;
 }
-- 
2.23.0




Re: [PATCH v2] Deprecate C virtiofsd

2022-02-14 Thread Dr. David Alan Gilbert
* Richard W.M. Jones (rjo...@redhat.com) wrote:
> On Thu, Feb 10, 2022 at 05:47:14PM +, Dr. David Alan Gilbert (git) wrote:
> > From: "Dr. David Alan Gilbert" 
> > 
> > There's a nice new Rust implementation out there; recommend people
> > do new work on that.
> > 
> > Signed-off-by: Dr. David Alan Gilbert 
> > ---
> >  docs/about/deprecated.rst | 17 +
> >  1 file changed, 17 insertions(+)
> > 
> > diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
> > index 47a594a3b6..3c73d22729 100644
> > --- a/docs/about/deprecated.rst
> > +++ b/docs/about/deprecated.rst
> > @@ -454,3 +454,20 @@ nanoMIPS ISA
> >  
> >  The ``nanoMIPS`` ISA has never been upstreamed to any compiler toolchain.
> >  As it is hard to generate binaries for it, declare it deprecated.
> > +
> > +Tools
> > +-
> > +
> > +virtiofsd
> > +'
> > +
> > +There is a new Rust implementation of ``virtiofsd`` at
> > +``https://gitlab.com/virtio-fs/virtiofsd``;
> > +since this is now marked stable, new development should be done on that
> > +rather than the existing C version in the QEMU tree.
> > +The C version will still accept fixes and patches that
> > +are already in development for the moment, but will eventually
> > +be deleted from this tree.
> > +New deployments should use the Rust version, and existing systems
> > +should consider moving to it.  The command line and feature set
> > +is very close and moving should be simple.
> 
> I'm not qualified to say if the Rust impl is complete enough
> to replace the C version, so I won't add a reviewed tag.

We believe it is a complete replacement at this point, with compatible
command line.

Dave

> However I want to say that from the point of view of downstream
> packagers of qemu -- especially Fedora -- it would be helpful if we
> could direct both upstream development effort and downstream packaging
> into just the one virtiofsd.  So I agree in principle with this.
> 
> Rich.
> 
> -- 
> Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
> Read my programming and virtualization blog: http://rwmj.wordpress.com
> virt-p2v converts physical machines to virtual machines.  Boot with a
> live CD or over the network (PXE) and turn machines into KVM guests.
> http://libguestfs.org/virt-v2v
> 
-- 
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: [RFC] thread-pool: Add option to fix the pool size

2022-02-14 Thread Kevin Wolf
Am 14.02.2022 um 11:09 hat Stefan Hajnoczi geschrieben:
> On Fri, Feb 11, 2022 at 12:32:57PM +0100, Kevin Wolf wrote:
> > Am 03.02.2022 um 15:19 hat Stefan Hajnoczi geschrieben:
> > > On Thu, Feb 03, 2022 at 10:56:49AM +, Daniel P. Berrangé wrote:
> > > > On Thu, Feb 03, 2022 at 10:53:07AM +, Stefan Hajnoczi wrote:
> > > > > On Wed, Feb 02, 2022 at 06:52:34PM +0100, Nicolas Saenz Julienne 
> > > > > wrote:
> > > 1. A global default value that all new AioContext take. The QEMU main
> > >loop's qemu_aio_context will use this and all IOThread AioContext
> > >will use it (unless they have been overridden).
> > > 
> > >I would define it on --machine because that's the "global" object for
> > >a guest, but that's not very satisfying.
> > 
> > Semantically, -machine is about the virtual hardware where as iothreads
> > are about the backend, so I agree it's not a good fit.
> > 
> > For the main thread, you may want to configure all the same options that
> > you can configure for an iothread. So to me that sounds like we would
> > want to allow using an iothread object for the main thread, too.
> > 
> > That would still require us to tell QEMU which iothread object should be
> > used for the main thread, though.
> 
> Making the main loop thread an IOThread is an interesting direction but
> not an easy change to make.
> 
> The main loop thread has a custom event loop that is not interchangeable
> with the IOThread event loop:
> - The main loop has a poll notifier interface for libslirp fd monitoring
>   integration.
> - The main loop is a GLib event loop but manually polls to get
>   nanosecond resolution timers.
> - The main loop has icount integration.
> - The main loop has the special iohandler AioContext
> 
> The IOThread event loop runs an optimized AioContext event loop instead.
> It falls back to regular g_main_loop_run() if there is a GSource user.
> 
> It would definitely be nice to unify the main loop with IOThread and
> then use --object iothread,... to configure main loop parameters.
> 
> I'm not sure if requiring that of Nicolas is fair though. The event
> loops in QEMU are complex and changes are likely to introduce subtle
> bugs or performance regressions.

I'm not suggesting actually running the iothread event loop instead,
merely using the properties of an object to configure the main thread as
the external user interface.
Whether this uses the same main loop code as today or is moved to the
regular iothread event loop is an implementation detail that can be
changed later.

Or we could maybe use a different object type like 'mainthread' and
share the properties using QOM inheritance.
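
Purely to illustrate the user-visible side (the 'mainthread' object type and
the thread-pool property names below are made up for this example, nothing of
this exists today):

  -object mainthread,id=main,thread-pool-min=8,thread-pool-max=64 \
  -object iothread,id=io1,thread-pool-min=8,thread-pool-max=64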

Kevin


signature.asc
Description: PGP signature


Re: [PATCH 5/6] test-bdrv-drain.c: remove test_detach_by_parent_cb()

2022-02-14 Thread Paolo Bonzini

On 2/11/22 16:38, Kevin Wolf wrote:

The callback of an I/O function isn't I/O, though. It is code _after_
the I/O has completed. If this doesn't work any more, it feels like this
is a bug.


The issue is that the I/O has *not* completed yet.  blk_aio_preadv(..., 
cb, opaque) is not equivalent to


ret = blk_co_preadv(...);
cb(ret, opaque);

but rather to

blk_inc_in_flight(blk);
ret = blk_co_preadv(...);
cb(ret, opaque);
blk_dec_in_flight(blk);

Your own commit message (yeah I know we've all been through that :)) 
explains why, and notes that it is now invalid to drain in a callback:


commit 46aaf2a566e364a62315219255099cbf1c9b990d
Author: Kevin Wolf 
Date:   Thu Sep 6 17:47:22 2018 +0200

block-backend: Decrease in_flight only after callback

Request callbacks can do pretty much anything, including operations
that will yield from the coroutine (such as draining the backend).
In that case, a decreased in_flight would be visible to other code
and could lead to a drain completing while the callback hasn't
actually completed yet.

Note that reordering these operations forbids calling drain directly
inside an AIO callback.

So the questions are:

1) is the above commit wrong though well-intentioned?

2) is it unexpected that bdrv_replace_child_noperm() drains (thus 
becoming invalid from the callback, albeit valid through a bottom half)?



My answer is respectively 1) it's correct, many coroutines do 
inc_in_flight before creation and dec_in_flight at the end, we're just 
saying that it's _always_ the case for callback-based operations; 2) no, 
it's not unexpected and therefore the test is the incorrect one.


Paolo



Re: [PULL 0/5] 9p queue 2022-02-10

2022-02-14 Thread Christian Schoenebeck
On Monday, 14 February 2022 11:36:53 CET Greg Kurz wrote:
> On Mon, 14 Feb 2022 10:47:43 +0100
> 
> Christian Schoenebeck  wrote:
> > On Sonntag, 13. Februar 2022 21:33:10 CET Peter Maydell wrote:
> > > On Thu, 10 Feb 2022 at 11:33, Christian Schoenebeck
> > > 
> > >  wrote:
> > > > The following changes since commit 
0a301624c2f4ced3331ffd5bce85b4274fe132af:
> > > >   Merge remote-tracking branch
> > > >   'remotes/pmaydell/tags/pull-target-arm-20220208' into staging
> > > >   (2022-02-08 11:40:08 +)>
> > > > 
> > > > are available in the Git repository at:
> > > >   https://github.com/cschoenebeck/qemu.git tags/pull-9p-20220210
> > > > 
> > > > for you to fetch changes up to 
de19c79dad6a2cad54ae04ce754d47c07bf9bc93:
> > > >   9pfs: Fix segfault in do_readdir_many caused by struct dirent
> > > >   overread
> > > >   (2022-02-10 11:56:01 +0100)>
> > > > 
> > > > 9d82f6a3e68c2
> > > > 9pfs: fixes and cleanup
> > > > 
> > > > * Fifth patch fixes a 9pfs server crash that happened on some systems
> > > > due
> > > > 
> > > >   to incorrect (system dependant) handling of struct dirent size.
> > > > 
> > > > * Tests: Second patch fixes a test error that happened on some systems
> > > > due
> > > > 
> > > >   mkdir() being called twice for creating the test directory for the
> > > >   9p
> > > >   'local' tests.
> > > > 
> > > > * Tests: Third patch fixes a memory leak.
> > > > 
> > > > * Tests: The remaining two patches are code cleanup.
> > > > 
> > > > 
> > > 
> > > Hi; this fails CI for the build-oss-fuzz job, which finds
> > > a heap-buffer-overflow:
> > > https://gitlab.com/qemu-project/qemu/-/jobs/2087610013
> > 
> > So this is about the 'dirent' patch:
> > https://github.com/cschoenebeck/qemu/commit/de19c79dad6a2cad54ae04ce754d47
> > c07bf9bc93
> > 
> > In conjunction with the 9p fuzzing tests:
> > https://wiki.qemu.org/Documentation/9p#Fuzzing
> > 
> > I first thought it might be a false positive due to the unorthodox
> > handling of dirent duplication by that patch, but from the ASan output
> > below I am not really sure about that.
> 
> No, this is an actual bug. See below.

Yep, the patch would make the 9p tests' synth driver buggy. :/ Vitaly, I fear 
the patch needs a v5.

> > Is there a way to get the content of local variables?
> > 
> > Would it be possible that the following issue (g_memdup vs. g_memdup2)
> > might apply here?
> > https://discourse.gnome.org/t/port-your-module-from-g-memdup-to-g-memdup2-> 
> > > now/5538
> > 
> > Best regards,
> > Christian Schoenebeck
> > 
> > > 8/152 qemu:qtest+qtest-i386 / qtest-i386/qos-test ERROR 66.74s killed
> > > by signal 6 SIGABRT
> > > 
> > > >>> QTEST_QEMU_BINARY=./qemu-system-i386
> > > >>> QTEST_QEMU_STORAGE_DAEMON_BINARY=./storage-daemon/qemu-storage-daemo
> > > >>> n
> > > >>> MALLOC_PERTURB_=120
> > > >>> G_TEST_DBUS_DAEMON=/builds/qemu-project/qemu/tests/dbus-vmstate-daem
> > > >>> on.
> > > >>> sh QTEST_QEMU_IMG=./qemu-img
> > > >>> /builds/qemu-project/qemu/build-oss-fuzz/tests/qtest/qos-test --tap
> > > >>> -k
> > > 
> > > ― ✀
> > > ― Listing only the last 100 lines
> > > from
> > > a long log.
> > > For details see https://github.com/google/sanitizers/issues/189
> > > ==7270==WARNING: ASan doesn't fully support makecontext/swapcontext
> > > functions and may produce false positives in some cases!
> > > ==7270==WARNING: ASan is ignoring requested __asan_handle_no_return:
> > > stack type: default top: 0x7ffc79fb; bottom 0x7ff908ffd000; size:
> > > 0x000370fb3000 (14780411904)
> > > False positive error reports may follow
> > > For details see https://github.com/google/sanitizers/issues/189
> > > ==7276==WARNING: ASan doesn't fully support makecontext/swapcontext
> > > functions and may produce false positives in some cases!
> > > ==7276==WARNING: ASan is ignoring requested __asan_handle_no_return:
> > > stack type: default top: 0x7fff7e4a8000; bottom 0x7fd6363fd000; size:
> > > 0x0029480ab000 (177302319104)
> > > False positive error reports may follow
> > > For details see https://github.com/google/sanitizers/issues/189
> > > ==7282==WARNING: ASan doesn't fully support makecontext/swapcontext
> > > functions and may produce false positives in some cases!
> > > ==7282==WARNING: ASan is ignoring requested __asan_handle_no_return:
> > > stack type: default top: 0x7ffee6e7f000; bottom 0x7f32fb5fd000; size:
> > > 0x00cbeb882000 (875829927936)
> > > False positive error reports may follow
> > > For details see https://github.com/google/sanitizers/issues/189
> > > ==7288==WARNING: ASan doesn't fully support makecontext/swapcontext
> > > functions and may produce false positives in some cases!
> > > ==7288==WARNING: ASan is ignoring requested __asan_handle_no_return:
> > > stack type: default top: 0x7ffc6118e000; bottom 0x7f6391cfd000; size:
> 

Re: Call for GSoC and Outreachy project ideas for summer 2022

2022-02-14 Thread Stefano Garzarella

On Mon, Feb 14, 2022 at 03:11:20PM +0800, Jason Wang wrote:

On Fri, Jan 28, 2022 at 11:47 PM Stefan Hajnoczi  wrote:


Dear QEMU, KVM, and rust-vmm communities,
QEMU will apply for Google Summer of Code 2022
(https://summerofcode.withgoogle.com/) and has been accepted into
Outreachy May-August 2022 (https://www.outreachy.org/). You can now
submit internship project ideas for QEMU, KVM, and rust-vmm!

If you have experience contributing to QEMU, KVM, or rust-vmm you can
be a mentor. It's a great way to give back and you get to work with
people who are just starting out in open source.

Please reply to this email by February 21st with your project ideas.

Good project ideas are suitable for remote work by a competent
programmer who is not yet familiar with the codebase. In
addition, they are:
- Well-defined - the scope is clear
- Self-contained - there are few dependencies
- Uncontroversial - they are acceptable to the community
- Incremental - they produce deliverables along the way

Feel free to post ideas even if you are unable to mentor the project.
It doesn't hurt to share the idea!


Implementing the VIRTIO_F_IN_ORDER feature for both Qemu and kernel
(vhost/virtio drivers) would be an interesting idea.

It satisfies all the points above since it's supported by virtio spec.


Yep, I agree!



(Unfortunately, I won't have time for mentoring)


I can offer my time to mentor this idea.

Thanks,
Stefano




Re: [PATCH] ui/cocoa: Set UI information

2022-02-14 Thread Gerd Hoffmann
On Mon, Feb 14, 2022 at 10:43:32AM +, Peter Maydell wrote:
> On Mon, 14 Feb 2022 at 10:27, Gerd Hoffmann  wrote:
> >
> > > (2) A question for Gerd:
> > > Is it safe to call dpy_set_ui_info() from a non-QEMU-main-thread?
> >
> > No.
> 
> Is it OK to do so if the other thread is holding the iothread lock?
> (This is how we do a lot of the other "need to call a QEMU function"
> work from the cocoa UI thread.)

That should work.

take care,
  Gerd




[PATCH v4 1/1] util: adjust coroutine pool size to virtio block queue

2022-02-14 Thread Hiroki Narukawa
Coroutine pool size has been 64 since long ago; the rationale is described in 
the commit message of 4d68e86b.

At that time, virtio-blk queue-size and num-queue were not configurable, and 
equivalent values were 128 and 1.

Coroutine pool size 64 was fine then.

Later queue-size and num-queue got configurable, and default values were 
increased.

A coroutine pool of size 64 is exhausted frequently with random disk IO at the 
new sizes, and slows things down.

This commit adjusts coroutine pool size adaptively with new values.

This commit still adds 64 by default, because coroutines are no longer used 
only by block devices, and 64 is not too much of a burden compared with the 
new default pool size of 128 * vCPUs.
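
For illustration (this paragraph assumes the current virtio-blk defaults of
queue-size=256 and num-queues equal to the vCPU count): each device then adds
num_queues * queue_size / 2 = 128 * vCPUs to the pool batch size, which is
where the figure above comes from.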

Signed-off-by: Hiroki Narukawa 
---
 hw/block/virtio-blk.c|  5 +
 include/qemu/coroutine.h | 10 ++
 util/qemu-coroutine.c| 20 
 3 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 82676cdd01..540c38f829 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -32,6 +32,7 @@
 #include "hw/virtio/virtio-bus.h"
 #include "migration/qemu-file-types.h"
 #include "hw/virtio/virtio-access.h"
+#include "qemu/coroutine.h"
 
 /* Config size before the discard support (hide associated config fields) */
 #define VIRTIO_BLK_CFG_SIZE offsetof(struct virtio_blk_config, \
@@ -1214,6 +1215,8 @@ static void virtio_blk_device_realize(DeviceState *dev, 
Error **errp)
 for (i = 0; i < conf->num_queues; i++) {
 virtio_add_queue(vdev, conf->queue_size, virtio_blk_handle_output);
 }
+qemu_coroutine_increase_pool_batch_size(conf->num_queues * conf->queue_size
+/ 2);
 virtio_blk_data_plane_create(vdev, conf, &s->dataplane, &err);
 if (err != NULL) {
 error_propagate(errp, err);
@@ -1250,6 +1253,8 @@ static void virtio_blk_device_unrealize(DeviceState *dev)
 for (i = 0; i < conf->num_queues; i++) {
 virtio_del_queue(vdev, i);
 }
+qemu_coroutine_decrease_pool_batch_size(conf->num_queues * conf->queue_size
+/ 2);
 qemu_del_vm_change_state_handler(s->change);
 blockdev_mark_auto_del(s->blk);
 virtio_cleanup(vdev);
diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index 4829ff373d..c828a95ee0 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -331,6 +331,16 @@ void qemu_co_sleep_wake(QemuCoSleep *w);
  */
 void coroutine_fn yield_until_fd_readable(int fd);
 
+/**
+ * Increase coroutine pool size
+ */
+void qemu_coroutine_increase_pool_batch_size(unsigned int 
additional_pool_size);
+
+/**
+ * Decrease coroutine pool size
+ */
+void qemu_coroutine_decrease_pool_batch_size(unsigned int 
additional_pool_size);
+
 #include "qemu/lockable.h"
 
 #endif /* QEMU_COROUTINE_H */
diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
index 38fb6d3084..c03b2422ff 100644
--- a/util/qemu-coroutine.c
+++ b/util/qemu-coroutine.c
@@ -20,12 +20,14 @@
 #include "qemu/coroutine_int.h"
 #include "block/aio.h"
 
+/** Initial batch size is 64, and is increased on demand */
 enum {
-POOL_BATCH_SIZE = 64,
+POOL_INITIAL_BATCH_SIZE = 64,
 };
 
 /** Free list to speed up creation */
 static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
+static unsigned int pool_batch_size = POOL_INITIAL_BATCH_SIZE;
 static unsigned int release_pool_size;
 static __thread QSLIST_HEAD(, Coroutine) alloc_pool = 
QSLIST_HEAD_INITIALIZER(pool);
 static __thread unsigned int alloc_pool_size;
@@ -49,7 +51,7 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void 
*opaque)
 if (CONFIG_COROUTINE_POOL) {
 co = QSLIST_FIRST(&alloc_pool);
 if (!co) {
-if (release_pool_size > POOL_BATCH_SIZE) {
+if (release_pool_size > qatomic_read(&pool_batch_size)) {
 /* Slow path; a good place to register the destructor, too.  */
 if (!coroutine_pool_cleanup_notifier.notify) {
 coroutine_pool_cleanup_notifier.notify = 
coroutine_pool_cleanup;
@@ -86,12 +88,12 @@ static void coroutine_delete(Coroutine *co)
 co->caller = NULL;
 
 if (CONFIG_COROUTINE_POOL) {
-if (release_pool_size < POOL_BATCH_SIZE * 2) {
+if (release_pool_size < qatomic_read(&pool_batch_size) * 2) {
 QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
 qatomic_inc(&release_pool_size);
 return;
 }
-if (alloc_pool_size < POOL_BATCH_SIZE) {
+if (alloc_pool_size < qatomic_read(&pool_batch_size)) {
 QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
 alloc_pool_size++;
 return;
@@ -202,3 +204,13 @@ AioContext *coroutine_fn 
qemu_coroutine_get_aio_context(Coroutine *co)
 {
 return co->ctx;
 }
+
+void qemu_coroutine_increase_pool_batch_size(unsigned int additional_pool_size)
+{
+qatomic_add(&pool_batch_size, additional_

[PATCH v4 0/1] Patch to adjust coroutine pool size adaptively

2022-02-14 Thread Hiroki Narukawa
Resending with correct commit message

Resending patch with decreasing coroutine pool size on device remove

We encountered a random disk IO performance drop since qemu-5.0.0, and this patch 
fixes it.

The commit message in 4d68e86b implied that the coroutine pool size should be 
adjusted adaptively, so I tried to implement this.

Changes from v3:
No code changed. Changed commit message so that first line indicates to
correct commit ID.

Changes from v2:
Decrease coroutine pool size on device remove

Changes from v1:
Use qatomic_read properly

Hiroki Narukawa (1):
  util: adjust coroutine pool size to virtio block queue

 hw/block/virtio-blk.c|  5 +
 include/qemu/coroutine.h | 10 ++
 util/qemu-coroutine.c| 20 
 3 files changed, 31 insertions(+), 4 deletions(-)

-- 
2.17.1




RE: [PATCH v3 1/1] util: adjust coroutine pool size to virtio block queue

2022-02-14 Thread Hiroki Narukawa
> Coroutine pool size was 64 from long ago, and the basis was organized in the
> commit message in c740ad92.

Sorry, I noticed that the commit ID mentioned here was incorrect.
The correct one is 4d68e86b.

https://gitlab.com/qemu-project/qemu/-/commit/4d68e86bb10159099da0798f74e7512955f15eec

I have resent this patch as v4 with exactly the same code as v3, just changing 
this commit message.

> 
> At that time, virtio-blk queue-size and num-queue were not configuable, and
> equivalent values were 128 and 1.
> 
> Coroutine pool size 64 was fine then.
> 
> Later queue-size and num-queue got configuable, and default values were
> increased.
> 
> Coroutine pool with size 64 exhausts frequently with random disk IO in new 
> size,
> and slows down.
> 
> This commit adjusts coroutine pool size adaptively with new values.
> 
> This commit adds 64 by default, but now coroutine is not only for block 
> devices,
> 
> and is not too much burdon comparing with new default.
> 
> pool size of 128 * vCPUs.
> 
> Signed-off-by: Hiroki Narukawa 
> ---
>  hw/block/virtio-blk.c|  5 +
>  include/qemu/coroutine.h | 10 ++
>  util/qemu-coroutine.c| 20 
>  3 files changed, 31 insertions(+), 4 deletions(-)
> 




Re: [PATCH 1/6] block/io.c: fix bdrv_child_cb_drained_begin invocations from a coroutine

2022-02-14 Thread Paolo Bonzini

On 2/14/22 11:27, Emanuele Giuseppe Esposito wrote:

Anyways, I think that in addition to the fix in this patch, we should
also fix bdrv_parent_drained_begin_single(poll=true) in
bdrv_replace_child_noperm, with something similar to what is done in
bdrv_co_yield_to_drain? ie if we are in coroutine, schedule a BH that
runs the same logic but in the main loop, but then somehow wait that it
finishes before continuing?

Alternatively, we would forbid polling in coroutines at all. And the
only place I can see that is using the drain in coroutine is mirror (see
below).


I think you should first of all see what breaks if you forbid 
bdrv_replace_child_noperm() from coroutine context.


Drain in coroutines does not poll, it gets out of the coroutine through 
a bottom half before polling.  So if bdrv_replace_child_noperm() doesn't 
require it, polling in coroutines can still be forbidden.
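
For reference, the existing mechanism is roughly the following sketch (struct
and helper names are simplified here; the real code is bdrv_co_yield_to_drain()
in block/io.c):

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
} DrainBHData;

static void drain_bh_cb(void *opaque)
{
    DrainBHData *data = opaque;

    /* we are in the main loop now, so the drain may poll safely */
    bdrv_drained_begin(data->bs);
    data->done = true;
    aio_co_wake(data->co);              /* re-enter the coroutine */
}

static void coroutine_fn co_yield_to_drain(BlockDriverState *bs)
{
    DrainBHData data = {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
    };

    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), drain_bh_cb, &data);
    qemu_coroutine_yield();             /* woken up by drain_bh_cb() */
    assert(data.done);
}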


This patch is correct nevertheless.

Paolo



Re: [PATCH] Add --with-branding-prefix and QEMU_BRANDING_PREFIX

2022-02-14 Thread Liviu Ionescu



> On 8 Feb 2022, at 21:58, Peter Maydell  wrote:
> 
> I've cc'd some people who might have an opinion on whether this
> something we want to add upstream. ...

Well, given the comments received, it is obvious that we have different use 
cases, and, in my opinion, a one-size-fits-all approach cannot be expected to 
satisfy all of them. :-(


Since QEMU is now a hard dependency in all my projects (for running unit 
tests), I had to commit myself to continuing to maintain the xPack QEMU binary 
distribution used by these tests.

Thus my desire to minimise maintenance during updates, for example by keeping 
as few things as possible in a local fork.


So, if the Linux maintainers do not find a compelling reason for adding 
`--with-branding-prefix` and are concerned that this might break their 
distributions (which it will not, since they will not use this option), what 
would be an acceptable solution to allow more flexibility in the main QEMU 
greeting message? Personally I use only qemu-system-arm and qemu-system-riscv, 
so we are talking only about a change in `vl.c`; I do not use the other tools, 
so not having them updated is not a concern.


Would you agree on a multi step approach, first a minimal patch that would 
solve my use case, then, if there will be others needing it, a more elaborate 
solution?

For example I don't mind having to pass a preprocessor definition to my build; 
so how about something like:


```c
static void version(void)
{
+#if defined(QEMU_BRANDING_PREFIX)
+printf("%s ", QEMU_BRANDING_PREFIX);
+#endif
printf("QEMU emulator version " QEMU_FULL_VERSION "\n"
   QEMU_COPYRIGHT "\n");
}
```


This would harm no existing distributions, and would add no maintenance efforts 
for any of you, but would save me some recurrent maintenance effort with each 
release.


Would this be fine with you?



Regards,

Liviu






[PATCH] ui/clipboard: fix use-after-free regression

2022-02-14 Thread marcandre . lureau
From: Marc-André Lureau 

The same info may be passed again to update the clipboard; if it is the one
already stored for that selection, the unconditional unref may free it before
it is ref'ed again.

Fixes: 70a54b01693ed ("ui: avoid compiler warnings from unused clipboard info 
variable")

Signed-off-by: Marc-André Lureau 
---
 ui/clipboard.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ui/clipboard.c b/ui/clipboard.c
index 5f15cf853d07..9079ef829b51 100644
--- a/ui/clipboard.c
+++ b/ui/clipboard.c
@@ -66,8 +66,10 @@ void qemu_clipboard_update(QemuClipboardInfo *info)
 
notifier_list_notify(&clipboard_notifiers, &notify);
 
-qemu_clipboard_info_unref(cbinfo[info->selection]);
-cbinfo[info->selection] = qemu_clipboard_info_ref(info);
+if (cbinfo[info->selection] != info) {
+qemu_clipboard_info_unref(cbinfo[info->selection]);
+cbinfo[info->selection] = qemu_clipboard_info_ref(info);
+}
 }
 
 QemuClipboardInfo *qemu_clipboard_info(QemuClipboardSelection selection)
-- 
2.34.1.428.gdcc0cd074f0c




[PATCH] MAINTAINERS: no need to add my name explicitly as a reviewer for VIOT tables

2022-02-14 Thread Ani Sinha
I am already listed as a reviewer for ACPI/SMBIOS subsystem. There is no need to
again add me as a reviewer for ACPI/VIOT.

Signed-off-by: Ani Sinha 
---
 MAINTAINERS | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index b0b845f445..d92a262947 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1812,7 +1812,6 @@ F: docs/specs/acpi_hw_reduced_hotplug.rst
 
 ACPI/VIOT
 M: Jean-Philippe Brucker 
-R: Ani Sinha 
 S: Supported
 F: hw/acpi/viot.c
 F: hw/acpi/viot.h
-- 
2.25.1




Re: [PATCH 0/6] ui/dbus: Share one listener for a console

2022-02-14 Thread Marc-André Lureau
Hi Akihiko

On Sun, Feb 13, 2022 at 6:44 AM Akihiko Odaki 
wrote:

> ui/dbus required to have multiple DisplayChangeListeners (possibly with
> OpenGL)
> for a console but that caused several problems:
> - It broke egl-headless, an unusual display which implements OpenGL
> rendering
>   for non-OpenGL displays. The code to support multiple
> DisplayChangeListeners
>   does not consider the case where non-OpenGL displays listens OpenGL
> consoles.
>

Can you provide instructions on what broke? Even better write a test,
please.

"make check-avocado AVOCADO_TESTS=tests/avocado/virtio-gpu.py", which
covers egl-headless usage, works.


> - Multiple OpenGL DisplayChangeListeners of dbus shares one DisplaySurface
> and
>   modifies its texture field, causing OpenGL texture leak and
> use-after-free.
>

Again, please provide instructions. I have regularly run -display dbus with
multiple clients and qemu compiled with sanitizers. I don't see any leak or
use after free.


> - Introduced extra code to check the compatibility of OpenGL context
> providers
>   and OpenGL texture renderers where those are often inherently tightly
> coupled
>   since they must share the same hardware.
>

So code checks are meant to prevent misusage. They might be too limited or
broken in some ways, but reverting is likely going to introduce other
regressions I was trying to fix.

- Needed extra work to broadcast the same change to multiple dbus listeners.
>
>
Compared to what?


> This series solve them by implementing the change broadcast in ui/dbus,
> which
> knows exactly what is needed. Changes for the common code to support
> multiple
> OpenGL DisplayChangeListeners were reverted to fix egl-headless and reduce
> the code size.
>

Thanks a lot for your work, I am going to take a look at your approach. But
please help us understand better what the problem actually is, by giving
examples & tests to avoid future regressions and document the expected
behaviour.


> Akihiko Odaki (6):
>   ui/dbus: Share one listener for a console
>   Revert "console: save current scanout details"
>   Revert "ui: split the GL context in a different object"
>   Revert "ui: dispatch GL events to all listeners"
>   Revert "ui: associate GL context outside of display listener
> registration"
>   Revert "ui: factor out qemu_console_set_display_gl_ctx()"
>
>  include/ui/console.h   |  60 +-
>  include/ui/egl-context.h   |   6 +-
>  include/ui/gtk.h   |  11 +-
>  include/ui/sdl2.h  |   7 +-
>  include/ui/spice-display.h |   1 -
>  ui/console.c   | 258 +++
>  ui/dbus-console.c  | 109 ++
>  ui/dbus-listener.c | 417 +++--
>  ui/dbus.c  |  22 --
>  ui/dbus.h  |  36 +++-
>  ui/egl-context.c   |   6 +-
>  ui/egl-headless.c  |  20 +-
>  ui/gtk-egl.c   |  10 +-
>  ui/gtk-gl-area.c   |   8 +-
>  ui/gtk.c   |  25 +--
>  ui/sdl2-gl.c   |  10 +-
>  ui/sdl2.c  |  14 +-
>  ui/spice-display.c |  19 +-
>  18 files changed, 498 insertions(+), 541 deletions(-)
>
> --
> 2.32.0 (Apple Git-132)
>
>
>

-- 
Marc-André Lureau


Re: [PATCH] ui/clipboard: fix use-after-free regression

2022-02-14 Thread Daniel P . Berrangé
On Mon, Feb 14, 2022 at 03:59:17PM +0400, marcandre.lur...@redhat.com wrote:
> From: Marc-André Lureau 
> 
> The same info may be used to update the clipboard, and may be freed
> before being ref'ed again.

Ewww, subtle.

> Fixes: 70a54b01693ed ("ui: avoid compiler warnings from unused clipboard info 
> variable")
> 
> Signed-off-by: Marc-André Lureau 
> ---
>  ui/clipboard.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/ui/clipboard.c b/ui/clipboard.c
> index 5f15cf853d07..9079ef829b51 100644
> --- a/ui/clipboard.c
> +++ b/ui/clipboard.c
> @@ -66,8 +66,10 @@ void qemu_clipboard_update(QemuClipboardInfo *info)
>  
>  notifier_list_notify(&clipboard_notifiers, &notify);
>  
> -qemu_clipboard_info_unref(cbinfo[info->selection]);
> -cbinfo[info->selection] = qemu_clipboard_info_ref(info);
> +if (cbinfo[info->selection] != info) {
> +qemu_clipboard_info_unref(cbinfo[info->selection]);
> +cbinfo[info->selection] = qemu_clipboard_info_ref(info);
> +}
>  }

Reviewed-by: Daniel P. Berrangé 


Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH 1/6] block/io.c: fix bdrv_child_cb_drained_begin invocations from a coroutine

2022-02-14 Thread Kevin Wolf
Am 14.02.2022 um 11:27 hat Emanuele Giuseppe Esposito geschrieben:
> 
> 
> On 11/02/2022 12:54, Kevin Wolf wrote:
> > Am 08.02.2022 um 16:36 hat Emanuele Giuseppe Esposito geschrieben:
> >> Using bdrv_do_drained_begin_quiesce() in bdrv_child_cb_drained_begin()
> >> is not a good idea: the callback might be called when running
> >> a drain in a coroutine, and bdrv_drained_begin_poll() does not
> >> handle that case, resulting in assertion failure.
> > 
> > I remembered that we talked about this only recently on IRC, but it
> > didn't make any sense to me again when I read this commit message. So I
> > think we need --verbose.
> > 
> > The .drained_begin callback was always meant to run outside of coroutine
> > context, so the unexpected part isn't that it calls a function that
> > can't run in coroutine context, but that it is already called itself in
> > coroutine context.
> > 
> > The problematic path is bdrv_replace_child_noperm() which then calls
> > bdrv_parent_drained_begin_single(poll=true). Polling in coroutine
> > context is dangerous, it can cause deadlocks because the caller of the
> > coroutine can't make progress. So I believe this call is already wrong
> > in coroutine context.
> 
> Ok, you added this assertion in dcf94a23, but at that time there was no
> bdrv_parent_drained_begin_single, and the polling was only done in
> bdrv_do_drained_begin. So I think that to keep the same logic, the
> assertion should be moved in bdrv_parent_drained_begin_single()? And
> even more specifically, only if the poll flag is true.

I wouldn't necessarily say move, but copying it there makes sense to me.
In order to keep the interface constraints simple, I would assert it
independent of the poll parameter.
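
Something like this at the top of bdrv_parent_drained_begin_single(), i.e. the
same check that bdrv_do_drained_begin_quiesce() already does (sketch only):

    /* polling (and the parent callbacks) must not run in coroutine context */
    assert(!qemu_in_coroutine());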

> I triggered this by adding additional drains in the callers of
> bdrv_replace_child_noperm(), and I think some test (probably unit test)
> was failing because of either the drained_begin callback itself called
> by the drain, or as you suggested the callbacks called by
> bdrv_parent_drained_begin_single from bdrv_replace_child_noperm.
> 
> Anyways, I think that in addition to the fix in this patch, we should
> also fix bdrv_parent_drained_begin_single(poll=true) in
> bdrv_replace_child_noperm, with something similar to what is done in
> bdrv_co_yield_to_drain? ie if we are in coroutine, schedule a BH that
> runs the same logic but in the main loop, but then somehow wait that it
> finishes before continuing?
> Even though at that point we would have a coroutine waiting for the main
> loop, which I don't think it's something we want.

Coroutines are waiting for the main loop all the time, why would this be
a problem?

Yes, I think a mechanism similar to bdrv_co_yield_to_drain() is needed
if we want to allow callers to be in coroutine context.

And once we have this mechanism, it's actually not in addition to this
patch, but instead of it, because this patch isn't needed any more when
we know that we can't be in coroutine context.

> Alternatively, we would forbid polling in coroutines at all. And the
> only place I can see that is using the drain in coroutine is mirror
> (see below).

Well, my point is that it is already forbidden because it can deadlock.
Code that polls in coroutine context anyway is probably buggy, unless it
can guarantee very specific circumstances that make a deadlock
impossible.

Maybe we can actually assert this in AIO_WAIT_WHILE().
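
Something along these lines, at the top of the macro (untested sketch):

    /* waiting/polling from coroutine context can deadlock the caller */
    assert(!qemu_in_coroutine());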

> Additional question: I also noticed that there is a bdrv_drained_begin()
> call in mirror.c in the JobDriver run() callback. How can this even
> work? If a parent uses bdrv_child_cb_drained_begin (which should not be
> so rare) it will crash because of the assertion.

bdrv_co_yield_to_drain() lets this code run in the main loop.

> Further additional question: actually I also don't understand the
> polling logic of mirror (mirror_drained_poll): if we are draining in
> the coroutine with in_drain = true, I think we can have a deadlock if
> in_flight > 0?

You mean for a drain issued by the mirror job itself? The in-flight
requests are still processed by the polling loop, so eventually
in_flight should become 0.

Kevin




Re: [PATCH] Add --with-branding-prefix and QEMU_BRANDING_PREFIX

2022-02-14 Thread Peter Maydell
On Mon, 14 Feb 2022 at 11:54, Liviu Ionescu  wrote:
> Would you agree to a multi-step approach: first a minimal patch that would
> solve my use case, then, if others turn out to need it, a more elaborate
> solution?
>
> For example I don't mind having to pass a preprocessor definition to my 
> build; so how about something like:
>
>
> ```c
> static void version(void)
> {
> +#if defined(QEMU_BRANDING_PREFIX)
> +printf("%s ", QEMU_BRANDING_PREFIX);
> +#endif
> printf("QEMU emulator version " QEMU_FULL_VERSION "\n"
>QEMU_COPYRIGHT "\n");
> }
> ```

Either we want the feature upstream, or we don't. If we do, then
we want the whole thing including the configure option. If we don't,
then we don't want any of it, and that includes not having this kind
of ifdef.

So far you haven't really described a use case that --with-pkgversion
wouldn't satisfy. You say:

> Seeing the branded greeting in a console log is a visual confirmation
> that the test script used the desired version, and not another version
> picked up by mistake when using an incorrect PATH.

but the "v6.2.0-1-xpack-arm" pkgversion suffix works for this
purpose too.

thanks
-- PMM



Re: [PATCH] hw/i386: Improve bounds checking in OVMF table parsing

2022-02-14 Thread Philippe Mathieu-Daudé via

On 14/2/22 13:08, Dov Murik wrote:

When pc_system_parse_ovmf_flash() parses the optional GUIDed table in
the end of the OVMF flash memory area, the table length field is checked
for sizes that are too small, but doesn't error on sizes that are too
big (bigger than the flash content itself).

Add a check for maximal size of the OVMF table, and add an error report
in case the size is invalid.

Signed-off-by: Dov Murik 
---
  hw/i386/pc_sysfw_ovmf.c | 9 -
  1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/hw/i386/pc_sysfw_ovmf.c b/hw/i386/pc_sysfw_ovmf.c
index f4dd92c588..0663f3f54a 100644
--- a/hw/i386/pc_sysfw_ovmf.c
+++ b/hw/i386/pc_sysfw_ovmf.c
@@ -24,6 +24,7 @@
   */
  
  #include "qemu/osdep.h"

+#include "qemu/error-report.h"
  #include "hw/i386/pc.h"
  #include "cpu.h"
  
@@ -66,7 +67,13 @@ void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t flash_size)

  ptr -= sizeof(uint16_t);
  tot_len = le16_to_cpu(*(uint16_t *)ptr) - sizeof(guid) - sizeof(uint16_t);
  
-if (tot_len <= 0) {

+if (tot_len < 0 || tot_len > flash_size - 50) {


Please use a definition instead of this magic '50' number.


+error_report("OVMF table has invalid size %d", tot_len);
+return;
+}
+
+if (tot_len == 0) {
+/* no entries in the OVMF table */
  return;
  }
  


base-commit: 48033ad678ae2def43bf0d543a2c4c3d2a93feaf
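
To make the point about the magic '50' concrete, the check could grow a named
constant along these lines (the name and the breakdown in the comment are
assumptions for illustration, not taken from the patch):

#include "qemu/osdep.h"
#include "qemu/error-report.h"

/*
 * Presumed meaning of the 50: the bytes kept after the table footer, plus
 * the 16-byte footer GUID, plus the 16-bit table-length field itself.
 */
#define OVMF_TABLE_FOOTER_AREA 50

static bool ovmf_table_len_valid(int tot_len, size_t flash_size)
{
    if (tot_len < 0 || tot_len > flash_size - OVMF_TABLE_FOOTER_AREA) {
        error_report("OVMF table has invalid size %d", tot_len);
        return false;
    }
    return true;
}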





Re: [PATCH] MAINTAINERS: no need to add my name explicitly as a reviewer for VIOT tables

2022-02-14 Thread Philippe Mathieu-Daudé via

On 14/2/22 13:01, Ani Sinha wrote:

I am already listed as a reviewer for the ACPI/SMBIOS subsystem. There is no
need to add me again as a reviewer for ACPI/VIOT.

Signed-off-by: Ani Sinha 
---
  MAINTAINERS | 1 -
  1 file changed, 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index b0b845f445..d92a262947 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1812,7 +1812,6 @@ F: docs/specs/acpi_hw_reduced_hotplug.rst


Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH] MAINTAINERS: Add Akihiko Odaki to macOS-relateds

2022-02-14 Thread Philippe Mathieu-Daudé via

On 13/2/22 03:12, Akihiko Odaki wrote:

Signed-off-by: Akihiko Odaki 
---
  MAINTAINERS | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 2fd74c46426..5aefb5b431a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2333,6 +2333,7 @@ F: audio/alsaaudio.c
  Core Audio framework backend
  M: Gerd Hoffmann 
  R: Christian Schoenebeck 
+R: Akihiko Odaki 
  S: Odd Fixes
  F: audio/coreaudio.c
  
@@ -2585,6 +2586,7 @@ F: util/drm.c
  
  Cocoa graphics

  M: Peter Maydell 
+R: Akihiko Odaki 
  S: Odd Fixes
  F: ui/cocoa.m
  


Thanks!

Reviewed-by: Philippe Mathieu-Daudé 



[PATCH] hw/i386: Improve bounds checking in OVMF table parsing

2022-02-14 Thread Dov Murik
When pc_system_parse_ovmf_flash() parses the optional GUIDed table in
the end of the OVMF flash memory area, the table length field is checked
for sizes that are too small, but doesn't error on sizes that are too
big (bigger than the flash content itself).

Add a check for maximal size of the OVMF table, and add an error report
in case the size is invalid.

Signed-off-by: Dov Murik 
---
 hw/i386/pc_sysfw_ovmf.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/hw/i386/pc_sysfw_ovmf.c b/hw/i386/pc_sysfw_ovmf.c
index f4dd92c588..0663f3f54a 100644
--- a/hw/i386/pc_sysfw_ovmf.c
+++ b/hw/i386/pc_sysfw_ovmf.c
@@ -24,6 +24,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/error-report.h"
 #include "hw/i386/pc.h"
 #include "cpu.h"
 
@@ -66,7 +67,13 @@ void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t 
flash_size)
 ptr -= sizeof(uint16_t);
 tot_len = le16_to_cpu(*(uint16_t *)ptr) - sizeof(guid) - sizeof(uint16_t);
 
-if (tot_len <= 0) {
+if (tot_len < 0 || tot_len > flash_size - 50) {
+error_report("OVMF table has invalid size %d", tot_len);
+return;
+}
+
+if (tot_len == 0) {
+/* no entries in the OVMF table */
 return;
 }
 

base-commit: 48033ad678ae2def43bf0d543a2c4c3d2a93feaf
-- 
2.25.1




Re: [PATCH] Add --with-branding-prefix and QEMU_BRANDING_PREFIX

2022-02-14 Thread Liviu Ionescu



> On 14 Feb 2022, at 14:06, Peter Maydell  wrote:
> 
> ... but the "v6.2.0-1-xpack-arm" pkgversion suffix works for this
> purpose too.

Currently I do not use any --pkgversion in my builds, so I guess that this is
automatically generated by the meson script, and reflects the name of a custom
branch in my fork; if I switch to using the upstream repo, this will probably
be replaced by some tag or commit id, which identifies the exact version,
won't it?


Liviu




Re: [PATCH] ui/cocoa: Add Services menu

2022-02-14 Thread Philippe Mathieu-Daudé via

On 14/2/22 10:13, Akihiko Odaki wrote:

Services menu functionality of Cocoa is described at:
https://developer.apple.com/design/human-interface-guidelines/macos/extensions/services/

Signed-off-by: Akihiko Odaki 
---
  ui/cocoa.m | 4 
  1 file changed, 4 insertions(+)


Reviewed-by: Philippe Mathieu-Daudé 
Tested-by: Philippe Mathieu-Daudé 




Re: [PATCH v6 03/10] virtiofsd: Parse extended "struct fuse_init_in"

2022-02-14 Thread Dr. David Alan Gilbert
* Vivek Goyal (vgo...@redhat.com) wrote:
> Add some code to parse extended "struct fuse_init_in". And use a local
> variable "flag" to represent 64 bit flags. This will make it easier
> to add more features without having to worry about two 32bit flags (->flags
> and ->flags2) in "fuse_struct_in".
> 
> Signed-off-by: Vivek Goyal 

Reviewed-by: Dr. David Alan Gilbert 

> ---
>  tools/virtiofsd/fuse_lowlevel.c | 61 +
>  1 file changed, 39 insertions(+), 22 deletions(-)
> 
> diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
> index ce29a70253..b6712b763a 100644
> --- a/tools/virtiofsd/fuse_lowlevel.c
> +++ b/tools/virtiofsd/fuse_lowlevel.c
> @@ -1881,11 +1881,14 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid,
>  {
>  size_t compat_size = offsetof(struct fuse_init_in, max_readahead);
>  size_t compat2_size = offsetof(struct fuse_init_in, flags) + 
> sizeof(uint32_t);
> +/* Fuse structure extended with minor version 36 */
> +size_t compat3_size = endof(struct fuse_init_in, unused);
>  struct fuse_init_in *arg;
>  struct fuse_init_out outarg;
>  struct fuse_session *se = req->se;
>  size_t bufsize = se->bufsize;
>  size_t outargsize = sizeof(outarg);
> +uint64_t flags = 0;
>  
>  (void)nodeid;
>  
> @@ -1902,11 +1905,25 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid,
>  fuse_reply_err(req, EINVAL);
>  return;
>  }
> +flags |= arg->flags;
> +}
> +
> +/*
> + * fuse_init_in was extended again with minor version 36. Just read
> + * current known size of fuse_init so that future extension and
> + * header rebase does not cause breakage.
> + */
> +if (sizeof(*arg) > compat2_size && (arg->flags & FUSE_INIT_EXT)) {
> +if (!fuse_mbuf_iter_advance(iter, compat3_size - compat2_size)) {
> +fuse_reply_err(req, EINVAL);
> +return;
> +}
> +flags |= (uint64_t) arg->flags2 << 32;
>  }
>  
>  fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor);
>  if (arg->major == 7 && arg->minor >= 6) {
> -fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags);
> +fuse_log(FUSE_LOG_DEBUG, "flags=0x%016llx\n", flags);
>  fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", 
> arg->max_readahead);
>  }
>  se->conn.proto_major = arg->major;
> @@ -1934,68 +1951,68 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid,
>  if (arg->max_readahead < se->conn.max_readahead) {
>  se->conn.max_readahead = arg->max_readahead;
>  }
> -if (arg->flags & FUSE_ASYNC_READ) {
> +if (flags & FUSE_ASYNC_READ) {
>  se->conn.capable |= FUSE_CAP_ASYNC_READ;
>  }
> -if (arg->flags & FUSE_POSIX_LOCKS) {
> +if (flags & FUSE_POSIX_LOCKS) {
>  se->conn.capable |= FUSE_CAP_POSIX_LOCKS;
>  }
> -if (arg->flags & FUSE_ATOMIC_O_TRUNC) {
> +if (flags & FUSE_ATOMIC_O_TRUNC) {
>  se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC;
>  }
> -if (arg->flags & FUSE_EXPORT_SUPPORT) {
> +if (flags & FUSE_EXPORT_SUPPORT) {
>  se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT;
>  }
> -if (arg->flags & FUSE_DONT_MASK) {
> +if (flags & FUSE_DONT_MASK) {
>  se->conn.capable |= FUSE_CAP_DONT_MASK;
>  }
> -if (arg->flags & FUSE_FLOCK_LOCKS) {
> +if (flags & FUSE_FLOCK_LOCKS) {
>  se->conn.capable |= FUSE_CAP_FLOCK_LOCKS;
>  }
> -if (arg->flags & FUSE_AUTO_INVAL_DATA) {
> +if (flags & FUSE_AUTO_INVAL_DATA) {
>  se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA;
>  }
> -if (arg->flags & FUSE_DO_READDIRPLUS) {
> +if (flags & FUSE_DO_READDIRPLUS) {
>  se->conn.capable |= FUSE_CAP_READDIRPLUS;
>  }
> -if (arg->flags & FUSE_READDIRPLUS_AUTO) {
> +if (flags & FUSE_READDIRPLUS_AUTO) {
>  se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO;
>  }
> -if (arg->flags & FUSE_ASYNC_DIO) {
> +if (flags & FUSE_ASYNC_DIO) {
>  se->conn.capable |= FUSE_CAP_ASYNC_DIO;
>  }
> -if (arg->flags & FUSE_WRITEBACK_CACHE) {
> +if (flags & FUSE_WRITEBACK_CACHE) {
>  se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE;
>  }
> -if (arg->flags & FUSE_NO_OPEN_SUPPORT) {
> +if (flags & FUSE_NO_OPEN_SUPPORT) {
>  se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT;
>  }
> -if (arg->flags & FUSE_PARALLEL_DIROPS) {
> +if (flags & FUSE_PARALLEL_DIROPS) {
>  se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS;
>  }
> -if (arg->flags & FUSE_POSIX_ACL) {
> +if (flags & FUSE_POSIX_ACL) {
>  se->conn.capable |= FUSE_CAP_POSIX_ACL;
>  }
> -if (arg->flags & FUSE_HANDLE_KILLPRIV) {
> +if (flags & FUSE_HANDLE_KILLPRIV) {
>  se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV;
>  }
> -if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) {
> +if (flags & FUSE_NO_OPENDIR_SUPPORT) {
>  

Re: [PATCH] hw/ide: implement ich6 ide controller support

2022-02-14 Thread BALATON Zoltan

On Sat, 5 Feb 2022, BALATON Zoltan wrote:

Hello,


Ping? John, do you agree with my comments? Should Liav proceed to send a 
v2?


Thanks,
BALATON Zoltan


On Sat, 5 Feb 2022, Liav Albani wrote:

On 2/5/22 17:48, BALATON Zoltan wrote:

On Sat, 5 Feb 2022, Liav Albani wrote:

This type of IDE controller has support for relocating the IO ports and
doesn't use IRQ 14 and 15 but one allocated PCI IRQ for the controller.


I haven't looked at it in detail, so these are only a few comments I've got
while reading it. What machine needs this? In QEMU I think we only have PIIX
and ICH9 emulated, for the pc and q35 machines, but maybe ICH6 is also used by
some machine I don't know about. Otherwise it looks odd to have the IDE part
of ICH6 but not the other parts of this chip.



Hi BALATON,

This is my first patch to QEMU and the first time I am sending patches over
email. I sent my github tree to John Snow (the maintainer of the IDE code in
QEMU) to ask whether I should send them here, and I was encouraged to do
that.


Welcome and thanks a lot for taking time to contribute and share your 
results. In case you're not yet aware, these docs should explain how patches 
are handled on the list:


https://www.qemu.org/docs/master/devel/submitting-a-patch.html

For the next patch I'll make a point of writing a descriptive cover letter,
as it could have given more valuable details on why I sent this patch.


There's no machine type emulating the ICH6 chipset in QEMU. However, I wrote
this emulation component as a test for the SerenityOS kernel: I have a machine
from 2009 which has an ICH7 southbridge, so I wanted to emulate such a device
with QEMU to ease development on it.


I found that Linux with libata was using the controller without any
noticeable problems, but the SerenityOS kernel struggled to use this device,
so I decided to send this patch to get it merged; then I can use it locally,
and maybe other people will benefit from it.


In regard to other components of the ICH6 chipset - I don't think it's worth
anybody's time to actually implement them, as the ICH9 chipset is quite close
to what the ICH6 chipset offers, as far as I can tell.
The idea of implementing the ich6-ide controller was to give people like me
and other OS developers the option to ensure their kernels operate correctly
on this type of device, which is legacy-free in the aspect of PCI bus resource
management but is still a legacy device belonging to chipsets of the late
2000s.


That's OK, maybe a short mention (just one sentence) in the commit message 
explaining this would help to understand why this device model was added.



Signed-off-by: Liav Albani 
---
hw/i386/Kconfig  |   2 +
hw/ide/Kconfig   |   5 +
hw/ide/bmdma.c   |  83 +++
hw/ide/ich6.c    | 211 +++
hw/ide/meson.build   |   3 +-
hw/ide/piix.c    |  50 +-
include/hw/ide/pci.h |   5 +
include/hw/pci/pci_ids.h |   1 +
8 files changed, 311 insertions(+), 49 deletions(-)
create mode 100644 hw/ide/bmdma.c
create mode 100644 hw/ide/ich6.c

diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
index d22ac4a4b9..a18de2d962 100644
--- a/hw/i386/Kconfig
+++ b/hw/i386/Kconfig
@@ -75,6 +75,7 @@ config I440FX
    select PCI_I440FX
    select PIIX3
    select IDE_PIIX
+    select IDE_ICH6
    select DIMM
    select SMBIOS
    select FW_CFG_DMA
@@ -101,6 +102,7 @@ config Q35
    select PCI_EXPRESS_Q35
    select LPC_ICH9
    select AHCI_ICH9
+    select IDE_ICH6
    select DIMM
    select SMBIOS
    select FW_CFG_DMA
diff --git a/hw/ide/Kconfig b/hw/ide/Kconfig
index dd85fa3619..63304325a5 100644
--- a/hw/ide/Kconfig
+++ b/hw/ide/Kconfig
@@ -38,6 +38,11 @@ config IDE_VIA
    select IDE_PCI
    select IDE_QDEV

+config IDE_ICH6
+    bool
+    select IDE_PCI
+    select IDE_QDEV
+
config MICRODRIVE
    bool
    select IDE_QDEV
diff --git a/hw/ide/bmdma.c b/hw/ide/bmdma.c
new file mode 100644
index 00..979f5974fd
--- /dev/null
+++ b/hw/ide/bmdma.c
@@ -0,0 +1,83 @@
+/*
+ * QEMU IDE Emulation: PCI PIIX3/4 support.
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ * Copyright (c) 2006 Openedhand Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
a copy
+ * of this software and associated documentation files (the "Software"), 
to deal
+ * in the Software without restriction, including without limitation the 
rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell

+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
included in

+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
MERCHANTABILITY,
+ * FITNESS FOR A PARTICULA

Re: [PULL 0/5] 9p queue 2022-02-10

2022-02-14 Thread Christian Schoenebeck
On Montag, 14. Februar 2022 10:55:17 CET Peter Maydell wrote:
> On Mon, 14 Feb 2022 at 09:47, Christian Schoenebeck
> 
>  wrote:
> > So this is about the 'dirent' patch:
> > https://github.com/cschoenebeck/qemu/commit/de19c79dad6a2cad54ae04ce754d47
> > c07bf9bc93
> > 
> > In conjunction with the 9p fuzzing tests:
> > https://wiki.qemu.org/Documentation/9p#Fuzzing
> > 
> > I first thought it might be a false positive due to the unorthodox
> > handling of dirent duplication by that patch, but from the ASan output
> > below I am not really sure about that.
> > 
> > Is there a way to get the content of local variables?
> 
> Yes. You can build locally with the clang sanitizers enabled and then
> run under gdb and with the appropriate environment variables to tell the
> sanitizer to abort() on failures.

Well, it no longer matters for this particular issue, but it would be useful
if the CI scripts already dumped the local variables to the logs.

E.g. because the patch in question was about system-dependent variations.

Another reason is the random-data nature of fuzzing tests, even though the
latter could probably be reproduced with an appropriate seed.
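
For reference, the kind of local run Peter describes might look like this
(the configure flags and environment variables below are assumptions about
the usual approach, not something taken from this thread):

./configure --enable-sanitizers --enable-debug
make
ASAN_OPTIONS=abort_on_error=1 gdb --args ./build/qemu-system-x86_64 ...

With abort_on_error=1 the sanitizer calls abort(), so gdb stops right at the
failing report and the local variables can be inspected in the backtrace.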

Best regards,
Christian Schoenebeck





[PATCH 3/6] hw/nvme: move format parameter parsing

2022-02-14 Thread Klaus Jensen
From: Klaus Jensen 

There is no need to extract the format command parameters for each
namespace. Move it to the entry point.

Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 31 ++-
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 71c60482c75f..d8701ebf2fa8 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -5452,6 +5452,11 @@ typedef struct NvmeFormatAIOCB {
 uint32_t nsid;
 bool broadcast;
 int64_t offset;
+
+uint8_t lbaf;
+uint8_t mset;
+uint8_t pi;
+uint8_t pil;
 } NvmeFormatAIOCB;
 
 static void nvme_format_bh(void *opaque);
@@ -5471,14 +5476,9 @@ static const AIOCBInfo nvme_format_aiocb_info = {
 .get_aio_context = nvme_get_aio_context,
 };
 
-static void nvme_format_set(NvmeNamespace *ns, NvmeCmd *cmd)
+static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
+uint8_t pi, uint8_t pil)
 {
-uint32_t dw10 = le32_to_cpu(cmd->cdw10);
-uint8_t lbaf = dw10 & 0xf;
-uint8_t pi = (dw10 >> 5) & 0x7;
-uint8_t mset = (dw10 >> 4) & 0x1;
-uint8_t pil = (dw10 >> 8) & 0x1;
-
 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
 
 ns->id_ns.dps = (pil << 3) | pi;
@@ -5490,7 +5490,6 @@ static void nvme_format_set(NvmeNamespace *ns, NvmeCmd 
*cmd)
 static void nvme_format_ns_cb(void *opaque, int ret)
 {
 NvmeFormatAIOCB *iocb = opaque;
-NvmeRequest *req = iocb->req;
 NvmeNamespace *ns = iocb->ns;
 int bytes;
 
@@ -5512,7 +5511,7 @@ static void nvme_format_ns_cb(void *opaque, int ret)
 return;
 }
 
-nvme_format_set(ns, &req->cmd);
+nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil);
 ns->status = 0x0;
 iocb->ns = NULL;
 iocb->offset = 0;
@@ -5548,9 +5547,6 @@ static void nvme_format_bh(void *opaque)
 NvmeFormatAIOCB *iocb = opaque;
 NvmeRequest *req = iocb->req;
 NvmeCtrl *n = nvme_ctrl(req);
-uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
-uint8_t lbaf = dw10 & 0xf;
-uint8_t pi = (dw10 >> 5) & 0x7;
 uint16_t status;
 int i;
 
@@ -5572,7 +5568,7 @@ static void nvme_format_bh(void *opaque)
 goto done;
 }
 
-status = nvme_format_check(iocb->ns, lbaf, pi);
+status = nvme_format_check(iocb->ns, iocb->lbaf, iocb->pi);
 if (status) {
 req->status = status;
 goto done;
@@ -5595,6 +5591,11 @@ static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest 
*req)
 {
 NvmeFormatAIOCB *iocb;
 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
+uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
+uint8_t lbaf = dw10 & 0xf;
+uint8_t mset = (dw10 >> 4) & 0x1;
+uint8_t pi = (dw10 >> 5) & 0x7;
+uint8_t pil = (dw10 >> 8) & 0x1;
 uint16_t status;
 
 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
@@ -5604,6 +5605,10 @@ static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest 
*req)
 iocb->ret = 0;
 iocb->ns = NULL;
 iocb->nsid = 0;
+iocb->lbaf = lbaf;
+iocb->mset = mset;
+iocb->pi = pi;
+iocb->pil = pil;
 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
 iocb->offset = 0;
 
-- 
2.35.1




Re: [PATCH] Add --with-branding-prefix and QEMU_BRANDING_PREFIX

2022-02-14 Thread Daniel P . Berrangé
On Mon, Feb 14, 2022 at 02:18:13PM +0200, Liviu Ionescu wrote:
> 
> 
> > On 14 Feb 2022, at 14:06, Peter Maydell  wrote:
> > 
> > ... but the "v6.2.0-1-xpack-arm" pkgversion suffix works for this
> > purpose too.
> 
> Currently I do not use any --pkgversion in my builds, so I guess that
> this is automatically generated by the meson script, and reflects the
> name of a custom branch in my fork; if I switch to using the upstream
> repo, this will probably be replace by some tag or commit id, which
> identifies the exact version, isn't it?

In a .git checkout, pkgversion defaults to "git describe --match 'v*' --dirty"
See scripts/qemu-version.sh
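
For example, on a checkout some way past the v6.2.0 tag the default looks
something like this (the commit count and hash are illustrative):

$ git describe --match 'v*' --dirty
v6.2.0-648-g48033ad678

i.e. the nearest v* tag, the number of commits on top of it and the
abbreviated commit id, with a "-dirty" suffix appended if the working tree
has local modifications.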

With Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




[PATCH 5/6] hw/nvme: add pi tuple size helper

2022-02-14 Thread Klaus Jensen
From: Klaus Jensen 

A subsequent patch will introduce a new tuple size, so add a helper and
use that instead of sizeof() and magic numbers.

Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 14 --
 hw/nvme/dif.c  | 16 
 hw/nvme/dif.h  |  5 +
 3 files changed, 21 insertions(+), 14 deletions(-)
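
The helper itself lands in dif.h and is outside the hunks quoted below; a
sketch of what it presumably looks like at this point in the series (the
later 64-bit guard patch then extends it for the larger tuple):

static inline size_t nvme_pi_tuple_size(NvmeNamespace *ns)
{
    /*
     * Only one tuple size exists at this stage; keeping the namespace
     * parameter means call sites need no change when the 16-byte tuple
     * for the 64-bit guard format is added later.
     */
    return sizeof(NvmeDifTuple);
}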

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 52ab3450b975..f1683960b87e 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1068,7 +1068,8 @@ static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, 
NvmeRequest *req)
 size_t len = nvme_l2b(ns, nlb);
 uint16_t status;
 
-if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
+if (nvme_ns_ext(ns) &&
+!(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
 NvmeSg sg;
 
 len += nvme_m2b(ns, nlb);
@@ -1247,7 +1248,8 @@ uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, 
uint32_t len,
 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
 
-if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
+if (nvme_ns_ext(ns) &&
+!(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
ns->lbaf.ms, 0, dir);
 }
@@ -2184,7 +2186,7 @@ static void nvme_compare_mdata_cb(void *opaque, int ret)
  * tuple.
  */
 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
-pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
+pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
 }
 
 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += 
ns->lbaf.ms) {
@@ -3167,7 +3169,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
 bool pract = prinfo & NVME_PRINFO_PRACT;
 
-if (pract && ns->lbaf.ms == 8) {
+if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
 mapped_size = data_size;
 }
 }
@@ -3244,7 +3246,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest 
*req, bool append,
 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
 bool pract = prinfo & NVME_PRINFO_PRACT;
 
-if (pract && ns->lbaf.ms == 8) {
+if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
 mapped_size -= nvme_m2b(ns, nlb);
 }
 }
@@ -5553,7 +,7 @@ static uint16_t nvme_format_check(NvmeNamespace *ns, 
uint8_t lbaf, uint8_t pi)
 return NVME_INVALID_FORMAT | NVME_DNR;
 }
 
-if (pi && (ns->id_ns.lbaf[lbaf].ms < sizeof(NvmeDifTuple))) {
+if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) {
 return NVME_INVALID_FORMAT | NVME_DNR;
 }
 
diff --git a/hw/nvme/dif.c b/hw/nvme/dif.c
index cd0cea2b5ebd..891385f33f20 100644
--- a/hw/nvme/dif.c
+++ b/hw/nvme/dif.c
@@ -48,7 +48,7 @@ void nvme_dif_pract_generate_dif(NvmeNamespace *ns, uint8_t 
*buf, size_t len,
 int16_t pil = 0;
 
 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
-pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
+pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
 }
 
 trace_pci_nvme_dif_pract_generate_dif(len, ns->lbasz, ns->lbasz + pil,
@@ -145,7 +145,7 @@ uint16_t nvme_dif_check(NvmeNamespace *ns, uint8_t *buf, 
size_t len,
 }
 
 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
-pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
+pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
 }
 
 trace_pci_nvme_dif_check(prinfo, ns->lbasz + pil);
@@ -184,7 +184,7 @@ uint16_t nvme_dif_mangle_mdata(NvmeNamespace *ns, uint8_t 
*mbuf, size_t mlen,
 
 
 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
-pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
+pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
 }
 
 do {
@@ -210,7 +210,7 @@ uint16_t nvme_dif_mangle_mdata(NvmeNamespace *ns, uint8_t 
*mbuf, size_t mlen,
 end = mbufp + mlen;
 
 for (; mbufp < end; mbufp += ns->lbaf.ms) {
-memset(mbufp + pil, 0xff, sizeof(NvmeDifTuple));
+memset(mbufp + pil, 0xff, nvme_pi_tuple_size(ns));
 }
 }
 
@@ -284,7 +284,7 @@ static void nvme_dif_rw_check_cb(void *opaque, int ret)
 goto out;
 }
 
-if (prinfo & NVME_PRINFO_PRACT && ns->lbaf.ms == 8) {
+if (prinfo & NVME_PRINFO_PRACT && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
 goto out;
 }
 
@@ -388,7 +388,7 @@ uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req)
 
 if (pract) {
 uint8_t *mbuf, *end;
-int16_t pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
+int16_t pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
 
 status = nvme_check_prinfo(ns, prinfo, slba, reftag);
 if (status) {
@@ -428,7 +428,7 @@ uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req)

[PATCH 0/6] hw/nvme: enhanced protection information (64-bit guard)

2022-02-14 Thread Klaus Jensen
From: Klaus Jensen 

This adds support for one possible new protection information format
introduced in TP4068 (and integrated in NVMe 2.0): the 64-bit CRC guard
and 48-bit reference tag. This version does not support storage tags.

Like the CRC16 support already present, this uses a software
implementation of CRC64 (so it is naturally pretty slow). But it's good
enough for verification purposes.

This goes hand-in-hand with the support that Keith submitted for the
Linux kernel[1].

[1]: 
https://lore.kernel.org/linux-nvme/20220201190128.3075065-1-kbu...@kernel.org/

Klaus Jensen (3):
  hw/nvme: move dif/pi prototypes into dif.h
  hw/nvme: move format parameter parsing
  hw/nvme: add pi tuple size helper

Naveen Nagar (3):
  hw/nvme: add host behavior support feature
  hw/nvme: add support for the lbafee hbs feature
  hw/nvme: 64-bit pi support

 hw/nvme/ctrl.c   | 235 +--
 hw/nvme/dif.c| 378 +--
 hw/nvme/dif.h| 191 ++
 hw/nvme/ns.c |  27 +++-
 hw/nvme/nvme.h   |  58 +--
 hw/nvme/trace-events |  12 +-
 include/block/nvme.h |  81 --
 7 files changed, 774 insertions(+), 208 deletions(-)
 create mode 100644 hw/nvme/dif.h

-- 
2.35.1




[PATCH 2/6] hw/nvme: add host behavior support feature

2022-02-14 Thread Klaus Jensen
From: Naveen Nagar 

Add support for getting and setting the Host Behavior Support feature.

Signed-off-by: Naveen Nagar 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 8 
 hw/nvme/nvme.h   | 4 +++-
 include/block/nvme.h | 9 +
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index d08af3bdc1a2..71c60482c75f 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -196,6 +196,7 @@ static const bool nvme_feature_support[NVME_FID_MAX] = {
 [NVME_WRITE_ATOMICITY]  = true,
 [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
 [NVME_TIMESTAMP]= true,
+[NVME_HOST_BEHAVIOR_SUPPORT]= true,
 [NVME_COMMAND_SET_PROFILE]  = true,
 };
 
@@ -206,6 +207,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
 [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
 [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE,
+[NVME_HOST_BEHAVIOR_SUPPORT]= NVME_FEAT_CAP_CHANGE,
 [NVME_COMMAND_SET_PROFILE]  = NVME_FEAT_CAP_CHANGE,
 };
 
@@ -5091,6 +5093,9 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest 
*req)
 goto out;
 case NVME_TIMESTAMP:
 return nvme_get_feature_timestamp(n, req);
+case NVME_HOST_BEHAVIOR_SUPPORT:
+return nvme_c2h(n, (uint8_t *)&n->features.hbs,
+sizeof(n->features.hbs), req);
 default:
 break;
 }
@@ -5281,6 +5286,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest 
*req)
 break;
 case NVME_TIMESTAMP:
 return nvme_set_feature_timestamp(n, req);
+case NVME_HOST_BEHAVIOR_SUPPORT:
+return nvme_h2c(n, (uint8_t *)&n->features.hbs,
+sizeof(n->features.hbs), req);
 case NVME_COMMAND_SET_PROFILE:
 if (dw11 & 0x1ff) {
 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 801176a2bd5e..103407038e74 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -468,7 +468,9 @@ typedef struct NvmeCtrl {
 uint16_t temp_thresh_hi;
 uint16_t temp_thresh_low;
 };
-uint32_tasync_config;
+
+uint32_tasync_config;
+NvmeHostBehaviorSupport hbs;
 } features;
 } NvmeCtrl;
 
diff --git a/include/block/nvme.h b/include/block/nvme.h
index cd068ac89142..e527c728f975 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1216,6 +1216,7 @@ enum NvmeFeatureIds {
 NVME_WRITE_ATOMICITY= 0xa,
 NVME_ASYNCHRONOUS_EVENT_CONF= 0xb,
 NVME_TIMESTAMP  = 0xe,
+NVME_HOST_BEHAVIOR_SUPPORT  = 0x16,
 NVME_COMMAND_SET_PROFILE= 0x19,
 NVME_SOFTWARE_PROGRESS_MARKER   = 0x80,
 NVME_FID_MAX= 0x100,
@@ -1257,6 +1258,13 @@ typedef struct QEMU_PACKED NvmeRangeType {
 uint8_t rsvd48[16];
 } NvmeRangeType;
 
+typedef struct NvmeHostBehaviorSupport {
+uint8_t acre;
+uint8_t etdas;
+uint8_t lbafee;
+uint8_t rsvd3[509];
+} NvmeHostBehaviorSupport;
+
 typedef struct QEMU_PACKED NvmeLBAF {
 uint16_tms;
 uint8_t ds;
@@ -1520,6 +1528,7 @@ static inline void _nvme_check_size(void)
 QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeCopyCmd) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64);
+QEMU_BUILD_BUG_ON(sizeof(NvmeHostBehaviorSupport) != 512);
 QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
 QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
-- 
2.35.1




[PATCH 4/6] hw/nvme: add support for the lbafee hbs feature

2022-02-14 Thread Klaus Jensen
From: Naveen Nagar 

Add support for up to 64 LBA formats through the LBAFEE field of the
Host Behavior Support feature.

Signed-off-by: Naveen Nagar 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 34 +++---
 hw/nvme/ns.c | 15 +--
 hw/nvme/nvme.h   |  1 +
 include/block/nvme.h |  7 +--
 4 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index d8701ebf2fa8..52ab3450b975 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -5165,6 +5165,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest 
*req)
 uint32_t nsid = le32_to_cpu(cmd->nsid);
 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
 uint8_t save = NVME_SETFEAT_SAVE(dw10);
+uint16_t status;
 int i;
 
 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
@@ -5287,8 +5288,26 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, 
NvmeRequest *req)
 case NVME_TIMESTAMP:
 return nvme_set_feature_timestamp(n, req);
 case NVME_HOST_BEHAVIOR_SUPPORT:
-return nvme_h2c(n, (uint8_t *)&n->features.hbs,
-sizeof(n->features.hbs), req);
+status = nvme_h2c(n, (uint8_t *)&n->features.hbs,
+  sizeof(n->features.hbs), req);
+if (status) {
+return status;
+}
+
+for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
+ns = nvme_ns(n, i);
+
+if (!ns) {
+continue;
+}
+
+ns->id_ns.nlbaf = ns->nlbaf - 1;
+if (!n->features.hbs.lbafee) {
+ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15);
+}
+}
+
+return status;
 case NVME_COMMAND_SET_PROFILE:
 if (dw11 & 0x1ff) {
 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
@@ -5479,10 +5498,13 @@ static const AIOCBInfo nvme_format_aiocb_info = {
 static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
 uint8_t pi, uint8_t pil)
 {
+uint8_t lbafl = lbaf & 0xf;
+uint8_t lbafu = lbaf >> 4;
+
 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
 
 ns->id_ns.dps = (pil << 3) | pi;
-ns->id_ns.flbas = lbaf | (mset << 4);
+ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl;
 
 nvme_ns_init_format(ns);
 }
@@ -5596,6 +5618,7 @@ static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
 uint8_t mset = (dw10 >> 4) & 0x1;
 uint8_t pi = (dw10 >> 5) & 0x7;
 uint8_t pil = (dw10 >> 8) & 0x1;
+uint8_t lbafu = (dw10 >> 12) & 0x3;
 uint16_t status;
 
 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
@@ -5612,6 +5635,10 @@ static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest 
*req)
 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
 iocb->offset = 0;
 
+if (n->features.hbs.lbafee) {
+iocb->lbaf |= lbafu << 4;
+}
+
 if (!iocb->broadcast) {
 if (!nvme_nsid_valid(n, nsid)) {
 status = NVME_INVALID_NSID | NVME_DNR;
@@ -6587,6 +6614,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 id->cntlid = cpu_to_le16(n->cntlid);
 
 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
+id->ctratt |= cpu_to_le32(NVME_CTRATT_ELBAS);
 
 id->rab = 6;
 
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index ee673f1a5bef..8dfb55130beb 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -112,10 +112,11 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 [7] = { .ds = 12, .ms = 64 },
 };
 
+ns->nlbaf = 8;
+
 memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf));
-id_ns->nlbaf = 7;
 
-for (i = 0; i <= id_ns->nlbaf; i++) {
+for (i = 0; i < ns->nlbaf; i++) {
 NvmeLBAF *lbaf = &id_ns->lbaf[i];
 if (lbaf->ds == ds) {
 if (lbaf->ms == ms) {
@@ -126,12 +127,14 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 }
 
 /* add non-standard lba format */
-id_ns->nlbaf++;
-id_ns->lbaf[id_ns->nlbaf].ds = ds;
-id_ns->lbaf[id_ns->nlbaf].ms = ms;
-id_ns->flbas |= id_ns->nlbaf;
+id_ns->lbaf[ns->nlbaf].ds = ds;
+id_ns->lbaf[ns->nlbaf].ms = ms;
+ns->nlbaf++;
+
+id_ns->flbas |= i;
 
 lbaf_found:
+id_ns->nlbaf = ns->nlbaf - 1;
 nvme_ns_init_format(ns);
 
 return 0;
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 103407038e74..e715c3255a29 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -128,6 +128,7 @@ typedef struct NvmeNamespace {
 int64_t  moff;
 NvmeIdNs id_ns;
 NvmeLBAF lbaf;
+unsigned int nlbaf;
 size_t   lbasz;
 const uint32_t *iocs;
 uint8_t  csi;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index e527c728f975..37afc9be9b18 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -,6 +,10 @@ enum NvmeIdCtrlOaes {
 NVME_OAES_NS_ATTR   = 1 << 8,
 };
 
+enum NvmeIdCtrlCtratt {
+NVME_CTRATT_ELBAS   = 1 << 15,
+};
+
 enum NvmeIdCtrlOacs

[PATCH 6/6] hw/nvme: 64-bit pi support

2022-02-14 Thread Klaus Jensen
From: Naveen Nagar 

This adds support for one possible new protection information format
introduced in TP4068 (and integrated in NVMe 2.0): the 64-bit CRC guard
and 48-bit reference tag. This version does not support storage tags.

Like the CRC16 support already present, this uses a software
implementation of CRC64 (so it is naturally pretty slow). But it's good
enough for verification purposes.

This may go nicely hand-in-hand with the support that Keith submitted
for the Linux kernel[1].

  [1]: 
https://lore.kernel.org/linux-nvme/20220126165214.ga1782...@dhcp-10-100-145-180.wdc.com/T/

Signed-off-by: Naveen Nagar 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 163 +++
 hw/nvme/dif.c| 363 +--
 hw/nvme/dif.h| 143 -
 hw/nvme/ns.c |  12 ++
 hw/nvme/nvme.h   |   3 +
 hw/nvme/trace-events |  12 +-
 include/block/nvme.h |  67 ++--
 7 files changed, 629 insertions(+), 134 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index f1683960b87e..03760ddeae8c 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -2050,9 +2050,12 @@ static void nvme_verify_cb(void *opaque, int ret)
 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
 uint16_t apptag = le16_to_cpu(rw->apptag);
 uint16_t appmask = le16_to_cpu(rw->appmask);
-uint32_t reftag = le32_to_cpu(rw->reftag);
+uint64_t reftag = le32_to_cpu(rw->reftag);
+uint64_t cdw3 = le32_to_cpu(rw->cdw3);
 uint16_t status;
 
+reftag |= cdw3 << 32;
+
 trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
 
 if (ret) {
@@ -2141,7 +2144,8 @@ static void nvme_compare_mdata_cb(void *opaque, int ret)
 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
 uint16_t apptag = le16_to_cpu(rw->apptag);
 uint16_t appmask = le16_to_cpu(rw->appmask);
-uint32_t reftag = le32_to_cpu(rw->reftag);
+uint64_t reftag = le32_to_cpu(rw->reftag);
+uint64_t cdw3 = le32_to_cpu(rw->cdw3);
 struct nvme_compare_ctx *ctx = req->opaque;
 g_autofree uint8_t *buf = NULL;
 BlockBackend *blk = ns->blkconf.blk;
@@ -2149,6 +2153,8 @@ static void nvme_compare_mdata_cb(void *opaque, int ret)
 BlockAcctStats *stats = blk_get_stats(blk);
 uint16_t status = NVME_SUCCESS;
 
+reftag |= cdw3 << 32;
+
 trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
 
 if (ret) {
@@ -2527,7 +2533,8 @@ typedef struct NvmeCopyAIOCB {
 QEMUBH *bh;
 int ret;
 
-NvmeCopySourceRange *ranges;
+void *ranges;
+unsigned int format;
 int nr;
 int idx;
 
@@ -2538,7 +2545,7 @@ typedef struct NvmeCopyAIOCB {
 BlockAcctCookie write;
 } acct;
 
-uint32_t reftag;
+uint64_t reftag;
 uint64_t slba;
 
 NvmeZone *zone;
@@ -2592,13 +2599,101 @@ static void nvme_copy_bh(void *opaque)
 
 static void nvme_copy_cb(void *opaque, int ret);
 
+static void nvme_copy_source_range_parse_format0(void *ranges, int idx,
+ uint64_t *slba, uint32_t *nlb,
+ uint16_t *apptag,
+ uint16_t *appmask,
+ uint64_t *reftag)
+{
+NvmeCopySourceRangeFormat0 *_ranges = ranges;
+
+if (slba) {
+*slba = le64_to_cpu(_ranges[idx].slba);
+}
+
+if (nlb) {
+*nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
+}
+
+if (apptag) {
+*apptag = le16_to_cpu(_ranges[idx].apptag);
+}
+
+if (appmask) {
+*appmask = le16_to_cpu(_ranges[idx].appmask);
+}
+
+if (reftag) {
+*reftag = le32_to_cpu(_ranges[idx].reftag);
+}
+}
+
+static void nvme_copy_source_range_parse_format1(void *ranges, int idx,
+ uint64_t *slba, uint32_t *nlb,
+ uint16_t *apptag,
+ uint16_t *appmask,
+ uint64_t *reftag)
+{
+NvmeCopySourceRangeFormat1 *_ranges = ranges;
+
+if (slba) {
+*slba = le64_to_cpu(_ranges[idx].slba);
+}
+
+if (nlb) {
+*nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
+}
+
+if (apptag) {
+*apptag = le16_to_cpu(_ranges[idx].apptag);
+}
+
+if (appmask) {
+*appmask = le16_to_cpu(_ranges[idx].appmask);
+}
+
+if (reftag) {
+*reftag = 0;
+
+*reftag |= (uint64_t)_ranges[idx].sr[4] << 40;
+*reftag |= (uint64_t)_ranges[idx].sr[5] << 32;
+*reftag |= (uint64_t)_ranges[idx].sr[6] << 24;
+*reftag |= (uint64_t)_ranges[idx].sr[7] << 16;
+*reftag |= (uint64_t)_ranges[idx].sr[8] << 8;
+*reftag |= (uint64_t)_ranges[idx].sr[9];
+}
+}
+
+static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format,
+ uint64_t *s

Re: [PATCH] ui/cocoa: Do not alert even without block devices

2022-02-14 Thread Philippe Mathieu-Daudé via

On 13/2/22 03:14, Akihiko Odaki wrote:

Signed-off-by: Akihiko Odaki 
---
  ui/cocoa.m | 5 -
  1 file changed, 5 deletions(-)

diff --git a/ui/cocoa.m b/ui/cocoa.m
index ac18e14ce01..271a2676026 100644
--- a/ui/cocoa.m
+++ b/ui/cocoa.m
@@ -1715,11 +1715,6 @@ static void addRemovableDevicesMenuItems(void)
  
  currentDevice = qmp_query_block(NULL);

  pointerToFree = currentDevice;
-if(currentDevice == NULL) {
-NSBeep();
-QEMU_Alert(@"Failed to query for block devices!");
-return;
-}
  
  menu = [[[NSApp mainMenu] itemWithTitle:@"Machine"] submenu];
  


Cc'ing qemu-block@ and Markus (QMP).

I always wondered the point of this annoying warning but never
found out.

Is this menu updated when removable blkdev are hot-plugged from
the monitor or QMP?

Tested-by: Philippe Mathieu-Daudé 



[PATCH 1/6] hw/nvme: move dif/pi prototypes into dif.h

2022-02-14 Thread Klaus Jensen
From: Klaus Jensen 

Move dif/pi data structures and inlines to dif.h.

Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c |  1 +
 hw/nvme/dif.c  |  1 +
 hw/nvme/dif.h  | 53 ++
 hw/nvme/nvme.h | 50 ---
 4 files changed, 55 insertions(+), 50 deletions(-)
 create mode 100644 hw/nvme/dif.h

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 98aac98bef5f..d08af3bdc1a2 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -163,6 +163,7 @@
 #include "migration/vmstate.h"
 
 #include "nvme.h"
+#include "dif.h"
 #include "trace.h"
 
 #define NVME_MAX_IOQPAIRS 0x
diff --git a/hw/nvme/dif.c b/hw/nvme/dif.c
index 5dbd18b2a4a5..cd0cea2b5ebd 100644
--- a/hw/nvme/dif.c
+++ b/hw/nvme/dif.c
@@ -13,6 +13,7 @@
 #include "sysemu/block-backend.h"
 
 #include "nvme.h"
+#include "dif.h"
 #include "trace.h"
 
 uint16_t nvme_check_prinfo(NvmeNamespace *ns, uint8_t prinfo, uint64_t slba,
diff --git a/hw/nvme/dif.h b/hw/nvme/dif.h
new file mode 100644
index ..e36fea30e71e
--- /dev/null
+++ b/hw/nvme/dif.h
@@ -0,0 +1,53 @@
+#ifndef HW_NVME_DIF_H
+#define HW_NVME_DIF_H
+
+/* from Linux kernel (crypto/crct10dif_common.c) */
+static const uint16_t t10_dif_crc_table[256] = {
+0x, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
+0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
+0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
+0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
+0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1,
+0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C,
+0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C,
+0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781,
+0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8,
+0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255,
+0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925,
+0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698,
+0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472,
+0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF,
+0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF,
+0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02,
+0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA,
+0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067,
+0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17,
+0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA,
+0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640,
+0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD,
+0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D,
+0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30,
+0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759,
+0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4,
+0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394,
+0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29,
+0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3,
+0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E,
+0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E,
+0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
+};
+
+uint16_t nvme_check_prinfo(NvmeNamespace *ns, uint8_t prinfo, uint64_t slba,
+   uint32_t reftag);
+uint16_t nvme_dif_mangle_mdata(NvmeNamespace *ns, uint8_t *mbuf, size_t mlen,
+   uint64_t slba);
+void nvme_dif_pract_generate_dif(NvmeNamespace *ns, uint8_t *buf, size_t len,
+ uint8_t *mbuf, size_t mlen, uint16_t apptag,
+ uint32_t *reftag);
+uint16_t nvme_dif_check(NvmeNamespace *ns, uint8_t *buf, size_t len,
+uint8_t *mbuf, size_t mlen, uint8_t prinfo,
+uint64_t slba, uint16_t apptag,
+uint16_t appmask, uint32_t *reftag);
+uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req);
+
+#endif /* HW_NVME_DIF_H */
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 90c0bb7ce236..801176a2bd5e 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -513,54 +513,4 @@ void nvme_rw_complete_cb(void *opaque, int ret);
 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
NvmeCmd *cmd);
 
-/* from Linux kernel (crypto/crct10dif_common.c) */
-static const uint16_t t10_dif_crc_table[256] = {
-0x, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
-0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
-0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
-0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
-0xA99A, 0x222D, 0x3543, 0xBEF4, 

[PATCH v3 0/4] virtio-iommu: Support VIRTIO_IOMMU_F_BYPASS_CONFIG

2022-02-14 Thread Jean-Philippe Brucker
Replace the VIRTIO_IOMMU_F_BYPASS feature with
VIRTIO_IOMMU_F_BYPASS_CONFIG, which enables a config space bit to switch
global bypass on and off.

Add a boot-bypass option, which defaults to 'on' to be in line with
other vIOMMUs and to allow running firmware/bootloaders that are unaware
of the IOMMU. x86 doesn't need a workaround to boot with virtio-iommu
anymore.
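
On the command line this would presumably be used as (the property name
follows this cover letter; the exact syntax is an assumption):

    -device virtio-iommu-pci,boot-bypass=off

with the default of boot-bypass=on letting IOMMU-unaware firmware and
bootloaders access guest memory during boot.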

Since v2 [1]:
* Added the new bypass bits to the migration stream.
  As discussed on the v2 thread, we assume that cross-version
  compatibility is not required for live migration at the moment, so we
  only increase the version number. Patch 2 says: "We add the bypass
  field to the migration stream without introducing subsections, based
  on the assumption that this virtio-iommu device isn't being used in
  production enough to require cross-version migration at the moment
  (all previous versions required workarounds since they didn't support
  ACPI and boot-bypass)."

[1] 
https://lore.kernel.org/qemu-devel/20220127142940.671333-1-jean-phili...@linaro.org/

Jean-Philippe Brucker (4):
  linux-headers: update to v5.17-rc1
  virtio-iommu: Default to bypass during boot
  virtio-iommu: Support bypass domain
  tests/qtest/virtio-iommu-test: Check bypass config

 include/hw/virtio/virtio-iommu.h  |   1 +
 include/standard-headers/asm-x86/kvm_para.h   |   1 +
 include/standard-headers/drm/drm_fourcc.h |  11 ++
 include/standard-headers/linux/ethtool.h  |   1 +
 include/standard-headers/linux/fuse.h |  60 +++-
 include/standard-headers/linux/pci_regs.h | 142 +-
 include/standard-headers/linux/virtio_gpio.h  |  72 +
 include/standard-headers/linux/virtio_i2c.h   |  47 ++
 include/standard-headers/linux/virtio_iommu.h |   8 +-
 .../standard-headers/linux/virtio_pcidev.h|  65 
 include/standard-headers/linux/virtio_scmi.h  |  24 +++
 linux-headers/asm-generic/unistd.h|   5 +-
 linux-headers/asm-mips/unistd_n32.h   |   2 +
 linux-headers/asm-mips/unistd_n64.h   |   2 +
 linux-headers/asm-mips/unistd_o32.h   |   2 +
 linux-headers/asm-powerpc/unistd_32.h |   2 +
 linux-headers/asm-powerpc/unistd_64.h |   2 +
 linux-headers/asm-riscv/bitsperlong.h |  14 ++
 linux-headers/asm-riscv/mman.h|   1 +
 linux-headers/asm-riscv/unistd.h  |  44 ++
 linux-headers/asm-s390/unistd_32.h|   2 +
 linux-headers/asm-s390/unistd_64.h|   2 +
 linux-headers/asm-x86/kvm.h   |  16 +-
 linux-headers/asm-x86/unistd_32.h |   1 +
 linux-headers/asm-x86/unistd_64.h |   1 +
 linux-headers/asm-x86/unistd_x32.h|   1 +
 linux-headers/linux/kvm.h |  17 +++
 hw/virtio/virtio-iommu.c  |  99 ++--
 tests/qtest/virtio-iommu-test.c   |   2 +
 hw/virtio/trace-events|   4 +-
 30 files changed, 561 insertions(+), 90 deletions(-)
 create mode 100644 include/standard-headers/linux/virtio_gpio.h
 create mode 100644 include/standard-headers/linux/virtio_i2c.h
 create mode 100644 include/standard-headers/linux/virtio_pcidev.h
 create mode 100644 include/standard-headers/linux/virtio_scmi.h
 create mode 100644 linux-headers/asm-riscv/bitsperlong.h
 create mode 100644 linux-headers/asm-riscv/mman.h
 create mode 100644 linux-headers/asm-riscv/unistd.h

-- 
2.35.1




Re: [PATCH v2 2/3] hw/smbios: fix table memory corruption with large memory vms

2022-02-14 Thread Igor Mammedov
On Mon,  7 Feb 2022 17:01:28 +0530
Ani Sinha  wrote:

> With the current smbios table assignment code, we can have only 512 DIMM slots
it's a bit confusing, since these are not DIMM slots in the QEMU sense (we do
not expose DIMM devices via SMBIOS/E820). So maybe clarify here that initial
RAM is split into 16 GiB chunks/entries (of 'DIMM' type) when it is described
in SMBIOS table 17.

> (each DIMM of 16 GiB in size) before tables 17 and 19 conflict with their
> addresses.

Are you sure it's addresses that are wrong? 

> A guest with more than 8 TiB of memory will hit this limitation and
> would fail with the following assertion in isa-debugcon:
> 
> ASSERT_EFI_ERROR (Status = Already started)
> ASSERT 
> /builddir/build/BUILD/edk2-ca407c7246bf/OvmfPkg/SmbiosPlatformDxe/SmbiosPlatformDxe.c(125):
>  !EFI_ERROR (Status)
> 
> This change adds an additional offset between tables 17 and 19 when 
> configuring
> VMs larger than 8 TiB of memory. The value of the offset is calculated
> to be equal to the additional space required to be reserved between the tables
> in order to accomodate more DIMM devices without the table memories colliding.

s/DIMM devices/DIMM entries/

not sure what 'table memories colliding' means; maybe rephrase and
be more accurate about what happens here.

> In normal cases where the VM memory is smaller or equal to 8 TiB, this offset
> value is 0. Hence in this case, no additional memory space is reserved and
> table addresses remain as before.
> 
> Since table addresses are altered for large memory VMs, this change can break
> migration in those cases. However, in those situations, qemu crashes anyway
> without this fix and hence we do not preserve the old bug by introducing
> compat knobs/machine types.
> 
> Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2023977
> 
> Signed-off-by: Ani Sinha 
> ---
>  hw/smbios/smbios.c | 19 +++
>  1 file changed, 15 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c
> index 56b412ce35..d7de740363 100644
> --- a/hw/smbios/smbios.c
> +++ b/hw/smbios/smbios.c
> @@ -799,12 +799,13 @@ static void smbios_build_type_17_table(unsigned 
> instance, uint64_t size)
>  SMBIOS_BUILD_TABLE_POST;
>  }
>  
> -static void smbios_build_type_19_table(unsigned instance,
> +static void smbios_build_type_19_table(unsigned instance, unsigned offset,
> uint64_t start, uint64_t size)
>  {
>  uint64_t end, start_kb, end_kb;
>  
> -SMBIOS_BUILD_TABLE_PRE(19, T19_BASE + instance, true); /* required */
> +SMBIOS_BUILD_TABLE_PRE(19, T19_BASE + offset + instance,
> +   true); /* required */
>  
>  end = start + size - 1;
>  assert(end > start);
> @@ -996,7 +997,7 @@ void smbios_get_tables(MachineState *ms,
> uint8_t **anchor, size_t *anchor_len,
> Error **errp)
>  {
> -unsigned i, dimm_cnt;
> +unsigned i, dimm_cnt, offset;
>  
>  if (smbios_legacy) {
>  *tables = *anchor = NULL;
> @@ -1026,6 +1027,16 @@ void smbios_get_tables(MachineState *ms,
>  
>  dimm_cnt = QEMU_ALIGN_UP(current_machine->ram_size, MAX_DIMM_SZ) / 
> MAX_DIMM_SZ;
>  
> +/*
> + * The offset determines if we need to keep additional space betweeen
> + * table 17 and table 19 so that they do not overlap. For example,

it's not tables that overlap, but something else

> + * for a VM with larger than 8 TB guest memory and DIMM size of 16 
> GiB,
> + * the default space between the two tables (T19_BASE - T17_BASE = 
> 512)
> + * is not enough.
> + */
> +offset = (dimm_cnt > (T19_BASE - T17_BASE)) ? \
> + dimm_cnt - (T19_BASE - T17_BASE) : 0;
> +
>  smbios_build_type_16_table(dimm_cnt);
>  
>  for (i = 0; i < dimm_cnt; i++) {
> @@ -1033,7 +1044,7 @@ void smbios_get_tables(MachineState *ms,
>  }
>  
>  for (i = 0; i < mem_array_size; i++) {
> -smbios_build_type_19_table(i, mem_array[i].address,
> +smbios_build_type_19_table(i, offset, mem_array[i].address,
> mem_array[i].length);
>  }
>  

Other than the language/terminology used, the rest looks fine to me.
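
As a worked example of the offset computation above, using the patch's
16 GiB MAX_DIMM_SZ and T19_BASE - T17_BASE = 512: a 9 TiB guest gives

    dimm_cnt = 9 TiB / 16 GiB = 576 type-17 entries
    offset   = (576 > 512) ? 576 - 512 : 0 = 64

so the type-19 handles start 64 higher and no longer run into the type-17
handle range; for guests of 8 TiB or less, dimm_cnt <= 512 and offset stays 0.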




[PATCH v3 1/4] linux-headers: update to v5.17-rc1

2022-02-14 Thread Jean-Philippe Brucker
Update Linux headers to v5.17-rc1

Reviewed-by: Eric Auger 
Signed-off-by: Jean-Philippe Brucker 
---
 include/standard-headers/asm-x86/kvm_para.h   |   1 +
 include/standard-headers/drm/drm_fourcc.h |  11 ++
 include/standard-headers/linux/ethtool.h  |   1 +
 include/standard-headers/linux/fuse.h |  60 +++-
 include/standard-headers/linux/pci_regs.h | 142 +-
 include/standard-headers/linux/virtio_gpio.h  |  72 +
 include/standard-headers/linux/virtio_i2c.h   |  47 ++
 include/standard-headers/linux/virtio_iommu.h |   8 +-
 .../standard-headers/linux/virtio_pcidev.h|  65 
 include/standard-headers/linux/virtio_scmi.h  |  24 +++
 linux-headers/asm-generic/unistd.h|   5 +-
 linux-headers/asm-mips/unistd_n32.h   |   2 +
 linux-headers/asm-mips/unistd_n64.h   |   2 +
 linux-headers/asm-mips/unistd_o32.h   |   2 +
 linux-headers/asm-powerpc/unistd_32.h |   2 +
 linux-headers/asm-powerpc/unistd_64.h |   2 +
 linux-headers/asm-riscv/bitsperlong.h |  14 ++
 linux-headers/asm-riscv/mman.h|   1 +
 linux-headers/asm-riscv/unistd.h  |  44 ++
 linux-headers/asm-s390/unistd_32.h|   2 +
 linux-headers/asm-s390/unistd_64.h|   2 +
 linux-headers/asm-x86/kvm.h   |  16 +-
 linux-headers/asm-x86/unistd_32.h |   1 +
 linux-headers/asm-x86/unistd_64.h |   1 +
 linux-headers/asm-x86/unistd_x32.h|   1 +
 linux-headers/linux/kvm.h |  17 +++
 26 files changed, 469 insertions(+), 76 deletions(-)
 create mode 100644 include/standard-headers/linux/virtio_gpio.h
 create mode 100644 include/standard-headers/linux/virtio_i2c.h
 create mode 100644 include/standard-headers/linux/virtio_pcidev.h
 create mode 100644 include/standard-headers/linux/virtio_scmi.h
 create mode 100644 linux-headers/asm-riscv/bitsperlong.h
 create mode 100644 linux-headers/asm-riscv/mman.h
 create mode 100644 linux-headers/asm-riscv/unistd.h

diff --git a/include/standard-headers/asm-x86/kvm_para.h 
b/include/standard-headers/asm-x86/kvm_para.h
index 204cfb8640..f0235e58a1 100644
--- a/include/standard-headers/asm-x86/kvm_para.h
+++ b/include/standard-headers/asm-x86/kvm_para.h
@@ -8,6 +8,7 @@
  * should be used to determine that a VM is running under KVM.
  */
 #define KVM_CPUID_SIGNATURE0x4000
+#define KVM_SIGNATURE "KVMKVMKVM\0\0\0"
 
 /* This CPUID returns two feature bitmaps in eax, edx. Before enabling
  * a particular paravirtualization, the appropriate feature bit should
diff --git a/include/standard-headers/drm/drm_fourcc.h 
b/include/standard-headers/drm/drm_fourcc.h
index 2c025cb4fe..4888f85f69 100644
--- a/include/standard-headers/drm/drm_fourcc.h
+++ b/include/standard-headers/drm/drm_fourcc.h
@@ -313,6 +313,13 @@ extern "C" {
  */
 #define DRM_FORMAT_P016fourcc_code('P', '0', '1', '6') /* 2x2 
subsampled Cr:Cb plane 16 bits per channel */
 
+/* 2 plane YCbCr420.
+ * 3 10 bit components and 2 padding bits packed into 4 bytes.
+ * index 0 = Y plane, [31:0] x:Y2:Y1:Y0 2:10:10:10 little endian
+ * index 1 = Cr:Cb plane, [63:0] x:Cr2:Cb2:Cr1:x:Cb1:Cr0:Cb0 
[2:10:10:10:2:10:10:10] little endian
+ */
+#define DRM_FORMAT_P030fourcc_code('P', '0', '3', '0') /* 2x2 
subsampled Cr:Cb plane 10 bits per channel packed */
+
 /* 3 plane non-subsampled (444) YCbCr
  * 16 bits per component, but only 10 bits are used and 6 bits are padded
  * index 0: Y plane, [15:0] Y:x [10:6] little endian
@@ -853,6 +860,10 @@ drm_fourcc_canonicalize_nvidia_format_mod(uint64_t 
modifier)
  * and UV.  Some SAND-using hardware stores UV in a separate tiled
  * image from Y to reduce the column height, which is not supported
  * with these modifiers.
+ *
+ * The DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT modifier is also
+ * supported for DRM_FORMAT_P030 where the columns remain as 128 bytes
+ * wide, but as this is a 10 bpp format that translates to 96 pixels.
  */
 
 #define DRM_FORMAT_MOD_BROADCOM_SAND32_COL_HEIGHT(v) \
diff --git a/include/standard-headers/linux/ethtool.h 
b/include/standard-headers/linux/ethtool.h
index 688eb8dc39..38d5a4cd6e 100644
--- a/include/standard-headers/linux/ethtool.h
+++ b/include/standard-headers/linux/ethtool.h
@@ -231,6 +231,7 @@ enum tunable_id {
ETHTOOL_RX_COPYBREAK,
ETHTOOL_TX_COPYBREAK,
ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */
+   ETHTOOL_TX_COPYBREAK_BUF_SIZE,
/*
 * Add your fresh new tunable attribute above and remember to update
 * tunable_strings[] in net/ethtool/common.c
diff --git a/include/standard-headers/linux/fuse.h 
b/include/standard-headers/linux/fuse.h
index 23ea31708b..bda06258be 100644
--- a/include/standard-headers/linux/fuse.h
+++ b/include/standard-headers/linux/fuse.h
@@ -184,6 +184,16 @@
  *
  *  7.34
  *  - add FUSE_SYNCFS
+ *
+ *  7.35
+ *  - add FOPEN_NOFLUSH
+ *
+ *  7.36

Re: 9 TiB vm memory creation

2022-02-14 Thread Igor Mammedov
On Mon, 14 Feb 2022 10:54:22 +0530 (IST)
Ani Sinha  wrote:

> Hi Igor:
> 
> I failed to spawn a 9 TiB VM. The max I could do was a 2 TiB VM on my
> system with the following command line before either the system
> destabilized or the OOM killer killed qemu:
> 
> -m 2T,maxmem=9T,slots=1 \
> -object 
> memory-backend-file,id=mem0,size=2T,mem-path=/data/temp/memfile,prealloc=off \
> -machine memory-backend=mem0 \
> -chardev file,path=/tmp/debugcon2.txt,id=debugcon \
> -device isa-debugcon,iobase=0x402,chardev=debugcon \
> 
> I have attached the debugcon output from 2 TiB vm.
> Are there any other command-line parameters or options I should try?
> 
> thanks
> ani

$ truncate -s 9T 9tb_sparse_disk.img
$ qemu-system-x86_64 -m 9T \
  -object 
memory-backend-file,id=mem0,size=9T,mem-path=9tb_sparse_disk.img,prealloc=off,share=on
 \
  -machine memory-backend=mem0

works for me up to the GRUB menu; with sufficient guest kernel persuasion
(i.e. limiting the RAM size to something reasonable on the kernel command
line) you can boot a Linux guest on it and inspect SMBIOS tables comfortably.
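
For example (illustrative only, pick whatever limit suits you), a "mem="
limit appended to the guest kernel command line keeps the guest from
actually touching most of that sparse backing file:

    # appended to the guest kernel command line (the GRUB "linux" line)
    mem=4G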


With KVM enabled it bails out with:
   qemu-system-x86_64: kvm_set_user_memory_region: KVM_SET_USER_MEMORY_REGION 
failed, slot=1, start=0x1, size=0x8ff4000: Invalid argument

all of that on a host with 32G of RAM/no swap.




[PATCH] target/ppc: raise HV interrupts for partition table entry problems

2022-02-14 Thread Nicholas Piggin
Invalid or missing partition table entry exceptions should cause HV
interrupts. HDSISR is set to bad MMU config, which is consistent with
the ISA and experimentally matches what POWER9 generates.

Signed-off-by: Nicholas Piggin 
---
 target/ppc/mmu-radix64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
index 040c055bff..54fb3ce98d 100644
--- a/target/ppc/mmu-radix64.c
+++ b/target/ppc/mmu-radix64.c
@@ -560,13 +560,13 @@ static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr 
eaddr,
 } else {
 if (!ppc64_v3_get_pate(cpu, lpid, &pate)) {
 if (guest_visible) {
-ppc_radix64_raise_si(cpu, access_type, eaddr, DSISR_NOPTE);
+ppc_radix64_raise_hsi(cpu, access_type, eaddr, eaddr, 
DSISR_R_BADCONFIG);
 }
 return false;
 }
 if (!validate_pate(cpu, lpid, &pate)) {
 if (guest_visible) {
-ppc_radix64_raise_si(cpu, access_type, eaddr, 
DSISR_R_BADCONFIG);
+ppc_radix64_raise_hsi(cpu, access_type, eaddr, eaddr, 
DSISR_R_BADCONFIG);
 }
 return false;
 }
-- 
2.23.0




[PATCH] spapr: prevent hdec timer being set up under virtual hypervisor

2022-02-14 Thread Nicholas Piggin
The spapr virtual hypervisor does not require the hdecr timer.
Remove it.

Signed-off-by: Nicholas Piggin 
---
 hw/ppc/ppc.c| 2 +-
 hw/ppc/spapr_cpu_core.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index 462c87dba8..a7c262db93 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -1072,7 +1072,7 @@ clk_setup_cb cpu_ppc_tb_init (CPUPPCState *env, uint32_t 
freq)
 }
 /* Create new timer */
 tb_env->decr_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, &cpu_ppc_decr_cb, 
cpu);
-if (env->has_hv_mode) {
+if (env->has_hv_mode && !cpu->vhyp) {
 tb_env->hdecr_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, 
&cpu_ppc_hdecr_cb,
 cpu);
 } else {
diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c
index a781e97f8d..ed84713960 100644
--- a/hw/ppc/spapr_cpu_core.c
+++ b/hw/ppc/spapr_cpu_core.c
@@ -261,12 +261,12 @@ static bool spapr_realize_vcpu(PowerPCCPU *cpu, 
SpaprMachineState *spapr,
 return false;
 }
 
-/* Set time-base frequency to 512 MHz */
-cpu_ppc_tb_init(env, SPAPR_TIMEBASE_FREQ);
-
 cpu_ppc_set_vhyp(cpu, PPC_VIRTUAL_HYPERVISOR(spapr));
 kvmppc_set_papr(cpu);
 
+/* Set time-base frequency to 512 MHz. vhyp must be set first. */
+cpu_ppc_tb_init(env, SPAPR_TIMEBASE_FREQ);
+
 if (spapr_irq_cpu_intc_create(spapr, cpu, errp) < 0) {
 qdev_unrealize(DEVICE(cpu));
 return false;
-- 
2.23.0




Re: [PATCH v4 01/13] lcitool: refresh

2022-02-14 Thread Philippe Mathieu-Daudé via

On 12/2/22 16:14, Akihiko Odaki wrote:

On 2022/02/12 1:34, Philippe Mathieu-Daudé via wrote:

Signed-off-by: Philippe Mathieu-Daudé 
---
  tests/docker/dockerfiles/ubuntu1804.docker | 2 --
  tests/docker/dockerfiles/ubuntu2004.docker | 2 --
  2 files changed, 4 deletions(-)


diff --git a/tests/docker/dockerfiles/ubuntu2004.docker 
b/tests/docker/dockerfiles/ubuntu2004.docker

index 87513125b8..159e7f60c9 100644
--- a/tests/docker/dockerfiles/ubuntu2004.docker
+++ b/tests/docker/dockerfiles/ubuntu2004.docker
@@ -66,7 +66,6 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
  libpam0g-dev \
  libpcre2-dev \
  libpixman-1-dev \
-    libpmem-dev \
  libpng-dev \
  libpulse-dev \
  librbd-dev \
@@ -91,7 +90,6 @@ RUN export DEBIAN_FRONTEND=noninteractive && \
  libvdeplug-dev \
  libvirglrenderer-dev \
  libvte-2.91-dev \
-    libxen-dev \
  libzstd-dev \
  llvm \
  locales \


This can't be applied to master.

% git am ~/mbox.txt
Applying: lcitool: refresh
error: patch failed: tests/docker/dockerfiles/ubuntu1804.docker:89
error: tests/docker/dockerfiles/ubuntu1804.docker: patch does not apply
error: patch failed: tests/docker/dockerfiles/ubuntu2004.docker:91
error: tests/docker/dockerfiles/ubuntu2004.docker: patch does not apply
Patch failed at 0001 lcitool: refresh


This was based on the testing/next tree, which is now merged,
so this should now apply directly.




Re: [PATCH 0/6] ui/dbus: Share one listener for a console

2022-02-14 Thread Akihiko Odaki
On Mon, Feb 14, 2022 at 9:07 PM Marc-André Lureau
 wrote:
>
> Hi Akihiko
>
> On Sun, Feb 13, 2022 at 6:44 AM Akihiko Odaki  wrote:
>>
>> ui/dbus required to have multiple DisplayChangeListeners (possibly with 
>> OpenGL)
>> for a console but that caused several problems:
>> - It broke egl-headless, an unusual display which implements OpenGL rendering
>>   for non-OpenGL displays. The code to support multiple 
>> DisplayChangeListeners
>>   does not consider the case where non-OpenGL displays listen to OpenGL
>> consoles.
>
>
> Can you provide instructions on what broke? Even better write a test, please.

The following command segfaults. Adding a test would be nice, but it
would need a binary which uses OpenGL.
qemu-system-x86_64 -device virtio-gpu-gl-pci -display egl-headless
-vnc :0 -m 8G -cdrom Fedora-Workstation-Live-x86_64-34-1.2.iso -accel
kvm

>
> "make check-avocado AVOCADO_TESTS=tests/avocado/virtio-gpu.py", which covers 
> egl-headless usage, works.
>
>>
>> - Multiple OpenGL DisplayChangeListeners of dbus shares one DisplaySurface 
>> and
>>   modifies its texture field, causing OpenGL texture leak and use-after-free.
>
>
> Again, please provide instructions. I have regularly run -display dbus with 
> multiple clients and qemu compiled with sanitizers. I don't see any leak or 
> use after free.

I doubt sanitizers can find this because it is an OpenGL texture. You
may add a probe around surface_gl_create_texture and
surface_gl_destroy_texture.
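
One low-tech way to observe it (a debugging sketch, not part of any
patch) is to break on those two helpers and compare how often they are
hit:

    (gdb) break surface_gl_create_texture
    (gdb) break surface_gl_destroy_texture
    (gdb) run
    # a growing imbalance between the two hit counts points at leaked textures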

>
>>
>> - Introduced extra code to check the compatibility of OpenGL context 
>> providers
>>   and OpenGL texture renderers where those are often inherently tightly 
>> coupled
>>   since they must share the same hardware.
>
>
> So code checks are meant to prevent misusage. They might be too limited or 
> broken in some ways, but reverting is likely going to introduce other 
> regressions I was trying to fix.

The misuse will not occur because DisplayChangeListeners will be
merged with OpenGL context providers.

>
>> - Needed extra work to broadcast the same change to multiple dbus listeners.
>>
>
> Compared to what?

Compared to sharing one DisplayChangeListener for multiple dbus listeners.

>
>>
>> This series solve them by implementing the change broadcast in ui/dbus, which
>> knows exactly what is needed. Changes for the common code to support multiple
>> OpenGL DisplayChangeListeners were reverted to fix egl-headless and reduce
>> the code size.
>
>
> Thanks a lot for your work, I am going to take a look at your approach. But 
> please help us understand better what the problem actually is, by giving 
> examples & tests to avoid future regressions and document the expected 
> behaviour.

This is all rather complicated and I may have missed details, so please
feel free to ask questions or provide suggestions.

Regards,
Akihiko Odaki


>
>>
>> Akihiko Odaki (6):
>>   ui/dbus: Share one listener for a console
>>   Revert "console: save current scanout details"
>>   Revert "ui: split the GL context in a different object"
>>   Revert "ui: dispatch GL events to all listeners"
>>   Revert "ui: associate GL context outside of display listener
>> registration"
>>   Revert "ui: factor out qemu_console_set_display_gl_ctx()"
>>
>>  include/ui/console.h   |  60 +-
>>  include/ui/egl-context.h   |   6 +-
>>  include/ui/gtk.h   |  11 +-
>>  include/ui/sdl2.h  |   7 +-
>>  include/ui/spice-display.h |   1 -
>>  ui/console.c   | 258 +++
>>  ui/dbus-console.c  | 109 ++
>>  ui/dbus-listener.c | 417 +++--
>>  ui/dbus.c  |  22 --
>>  ui/dbus.h  |  36 +++-
>>  ui/egl-context.c   |   6 +-
>>  ui/egl-headless.c  |  20 +-
>>  ui/gtk-egl.c   |  10 +-
>>  ui/gtk-gl-area.c   |   8 +-
>>  ui/gtk.c   |  25 +--
>>  ui/sdl2-gl.c   |  10 +-
>>  ui/sdl2.c  |  14 +-
>>  ui/spice-display.c |  19 +-
>>  18 files changed, 498 insertions(+), 541 deletions(-)
>>
>> --
>> 2.32.0 (Apple Git-132)
>>
>>
>
>
> --
> Marc-André Lureau



Re: [PATCH v4 10/13] audio/coreaudio: Remove a deprecation warning on macOS 12

2022-02-14 Thread Philippe Mathieu-Daudé via

On 12/2/22 16:23, Akihiko Odaki wrote:

On 2022/02/12 1:34, Philippe Mathieu-Daudé via wrote:

When building on macOS 12 we get:

   audio/coreaudio.c:50:5: error: 'kAudioObjectPropertyElementMaster' 
is deprecated: first deprecated in macOS 12.0 
[-Werror,-Wdeprecated-declarations]

   kAudioObjectPropertyElementMaster
   ^
   kAudioObjectPropertyElementMain
   
/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks/CoreAudio.framework/Headers/AudioHardwareBase.h:208:5: 
note: 'kAudioObjectPropertyElementMaster' has been explicitly marked 
deprecated here
   kAudioObjectPropertyElementMaster 
API_DEPRECATED_WITH_REPLACEMENT("kAudioObjectPropertyElementMain", 
macos(10.0, 12.0), ios(2.0, 15.0), watchos(1.0, 8.0), tvos(9.0, 15.0)) 
= kAudioObjectPropertyElementMain

   ^

Replace by kAudioObjectPropertyElementMain, redefining it to
kAudioObjectPropertyElementMaster if not available.

Suggested-by: Akihiko Odaki 
Suggested-by: Christian Schoenebeck 
Suggested-by: Roman Bolshakov 
Signed-off-by: Philippe Mathieu-Daudé 
Reviewed-by: Christian Schoenebeck 
---
  audio/coreaudio.c | 17 +++--
  1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/audio/coreaudio.c b/audio/coreaudio.c
index d8a21d3e50..5b3aeaced0 100644
--- a/audio/coreaudio.c
+++ b/audio/coreaudio.c
@@ -44,10 +44,15 @@ typedef struct coreaudioVoiceOut {
  bool enabled;
  } coreaudioVoiceOut;
+#if !defined(MAC_OS_VERSION_12_0) \
+    || (MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_VERSION_12_0)
+#define kAudioObjectPropertyElementMain 
kAudioObjectPropertyElementMaster

+#endif
+


Unless I have missed something, we have found 
MAC_OS_X_VERSION_MIN_REQUIRED is better even for a constant in the 
following thread:

https://lore.kernel.org/all/524515d6-2fb5-15c1-0aaf-bcda3684c...@gmail.com/


I was reading the v3 comments and missed the v2 ones.
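
For reference, the MAC_OS_X_VERSION_MIN_REQUIRED variant discussed there
would look roughly like this (illustrative sketch only):

    #if !defined(MAC_OS_VERSION_12_0) \
        || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_VERSION_12_0)
    #define kAudioObjectPropertyElementMain kAudioObjectPropertyElementMaster
    #endif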



Re: [PATCH v4 05/13] hvf: Fix OOB write in RDTSCP instruction decode

2022-02-14 Thread Philippe Mathieu-Daudé via

Hi Cameron,

On 11/2/22 17:34, Philippe Mathieu-Daudé wrote:

From: Cameron Esfahani 

A guest could craft a specific stream of instructions that will have QEMU
write 0xF9 to inappropriate locations in memory.  Add additional asserts
to check for this.  Generate a #UD if there are more than 14 prefix bytes.

Found by Julian Stecklina 

Signed-off-by: Cameron Esfahani 
Signed-off-by: Philippe Mathieu-Daudé 
---
  target/i386/hvf/x86_decode.c | 11 +--
  target/i386/hvf/x86hvf.c |  8 
  target/i386/hvf/x86hvf.h |  1 +
  3 files changed, 18 insertions(+), 2 deletions(-)



@@ -1847,7 +1849,8 @@ void calc_modrm_operand(CPUX86State *env, struct 
x86_decode *decode,
  
  static void decode_prefix(CPUX86State *env, struct x86_decode *decode)

  {
-while (1) {
+/* At most 14 prefix bytes. */
+for (int i = 0; i < 14; i++) {


Could we have a definition instead of this magic '14' number?
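
Something along these lines, for instance (the name is only a
suggestion):

    /* An x86 instruction carries at most 14 prefix bytes. */
    #define X86_MAX_INSN_PREFIX_LEN 14
    ...
    for (int i = 0; i < X86_MAX_INSN_PREFIX_LEN; i++) {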


  /*
   * REX prefix must come after legacy prefixes.
   * REX before legacy is ignored.
@@ -1892,6 +1895,8 @@ static void decode_prefix(CPUX86State *env, struct 
x86_decode *decode)
  return;
  }
  }
+/* Too many prefixes!  Generate #UD. */
+hvf_inject_ud(env);
  }




[PATCH v3 2/4] virtio-iommu: Default to bypass during boot

2022-02-14 Thread Jean-Philippe Brucker
Currently the virtio-iommu device must be programmed before it allows
DMA from any PCI device. This can make the VM entirely unusable when a
virtio-iommu driver isn't present, for example in a bootloader that
loads the OS from storage.

Similarly to the other vIOMMU implementations, default to DMA bypassing
the IOMMU during boot. Add a "boot-bypass" property, defaulting to true,
that lets users change this behavior.
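
For example, users who want the stricter pre-boot behaviour back could
turn it off explicitly (illustrative command line, assuming the
virtio-iommu-pci device name and omitting the other device options):

    -device virtio-iommu-pci,boot-bypass=off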

Replace the VIRTIO_IOMMU_F_BYPASS feature, which didn't support bypass
before feature negotiation, with VIRTIO_IOMMU_F_BYPASS_CONFIG.

We add the bypass field to the migration stream without introducing
subsections, based on the assumption that this virtio-iommu device isn't
being used in production enough to require cross-version migration at
the moment (all previous versions required workarounds since they didn't
support ACPI and boot-bypass).

Reviewed-by: Eric Auger 
Signed-off-by: Jean-Philippe Brucker 
---
 include/hw/virtio/virtio-iommu.h |  1 +
 hw/virtio/virtio-iommu.c | 60 +++-
 hw/virtio/trace-events   |  4 ++-
 3 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h
index e2339e5b72..84391f8448 100644
--- a/include/hw/virtio/virtio-iommu.h
+++ b/include/hw/virtio/virtio-iommu.h
@@ -58,6 +58,7 @@ struct VirtIOIOMMU {
 GTree *domains;
 QemuMutex mutex;
 GTree *endpoints;
+bool boot_bypass;
 };
 
 #endif
diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index aa9c16a17b..4ca36db4ac 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -24,6 +24,7 @@
 #include "hw/qdev-properties.h"
 #include "hw/virtio/virtio.h"
 #include "sysemu/kvm.h"
+#include "sysemu/reset.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "trace.h"
@@ -728,8 +729,7 @@ static IOMMUTLBEntry 
virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
 .perm = IOMMU_NONE,
 };
 
-bypass_allowed = virtio_vdev_has_feature(&s->parent_obj,
- VIRTIO_IOMMU_F_BYPASS);
+bypass_allowed = s->config.bypass;
 
 sid = virtio_iommu_get_bdf(sdev);
 
@@ -831,13 +831,37 @@ static void virtio_iommu_get_config(VirtIODevice *vdev, 
uint8_t *config_data)
 out_config->domain_range.start = 
cpu_to_le32(dev_config->domain_range.start);
 out_config->domain_range.end = cpu_to_le32(dev_config->domain_range.end);
 out_config->probe_size = cpu_to_le32(dev_config->probe_size);
+out_config->bypass = dev_config->bypass;
 
 trace_virtio_iommu_get_config(dev_config->page_size_mask,
   dev_config->input_range.start,
   dev_config->input_range.end,
   dev_config->domain_range.start,
   dev_config->domain_range.end,
-  dev_config->probe_size);
+  dev_config->probe_size,
+  dev_config->bypass);
+}
+
+static void virtio_iommu_set_config(VirtIODevice *vdev,
+const uint8_t *config_data)
+{
+VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev);
+struct virtio_iommu_config *dev_config = &dev->config;
+const struct virtio_iommu_config *in_config = (void *)config_data;
+
+if (in_config->bypass != dev_config->bypass) {
+if (!virtio_vdev_has_feature(vdev, VIRTIO_IOMMU_F_BYPASS_CONFIG)) {
+virtio_error(vdev, "cannot set config.bypass");
+return;
+} else if (in_config->bypass != 0 && in_config->bypass != 1) {
+virtio_error(vdev, "invalid config.bypass value '%u'",
+ in_config->bypass);
+return;
+}
+dev_config->bypass = in_config->bypass;
+}
+
+trace_virtio_iommu_set_config(in_config->bypass);
 }
 
 static uint64_t virtio_iommu_get_features(VirtIODevice *vdev, uint64_t f,
@@ -963,6 +987,19 @@ static int 
virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr,
 return 0;
 }
 
+static void virtio_iommu_system_reset(void *opaque)
+{
+VirtIOIOMMU *s = opaque;
+
+trace_virtio_iommu_system_reset();
+
+/*
+ * config.bypass is sticky across device reset, but should be restored on
+ * system reset
+ */
+s->config.bypass = s->boot_bypass;
+}
+
 static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
 {
 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
@@ -988,9 +1025,9 @@ static void virtio_iommu_device_realize(DeviceState *dev, 
Error **errp)
 virtio_add_feature(&s->features, VIRTIO_IOMMU_F_INPUT_RANGE);
 virtio_add_feature(&s->features, VIRTIO_IOMMU_F_DOMAIN_RANGE);
 virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MAP_UNMAP);
-virtio_add_feature(&s->features, VIRTIO_IOMMU_F_BYPASS);
 virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MMIO);
 virtio_add_feature(&s->features, VI

Re: [PATCH] Add --with-branding-prefix and QEMU_BRANDING_PREFIX

2022-02-14 Thread Daniel P . Berrangé
On Mon, Feb 14, 2022 at 03:05:58PM +0200, Liviu Ionescu wrote:
> 
> 
> > On 14 Feb 2022, at 14:28, Daniel P. Berrangé  wrote:
> > 
> > In a .git checkout, pkgversion defaults to "git describe --match 'v*' 
> > --dirty"
> > See scripts/qemu-version.sh
> 
> I see. I would say that this is a different use case.

That's only a default; you can set it to whatever value you want it to
be with the configure arg, if you don't like that info.

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH v2 3/3] hw/smbios: add an assertion to ensure tables 19 and 32 do not collide

2022-02-14 Thread Igor Mammedov
On Mon,  7 Feb 2022 17:01:29 +0530
Ani Sinha  wrote:

> Since change b3cddba9c14b034 ("hw/smbios: fix table memory corruption with 
> large memory vms")
> we reserve additional memory space between tables 17 and 19 for large VMs.
> This may cause table 19 to collide with table 32 for those VMs. This change
> adds an assertion to make sure table 19 does not extend into the memory used
> by table 32.
> 
> Signed-off-by: Ani Sinha 
> ---
>  hw/smbios/smbios.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c
> index d7de740363..800a35e9a5 100644
> --- a/hw/smbios/smbios.c
> +++ b/hw/smbios/smbios.c
> @@ -1048,6 +1048,9 @@ void smbios_get_tables(MachineState *ms,
> mem_array[i].length);
>  }
>  
> +/* we need to make sure table 19 and table 32 do not overlap */
same as in 2/3 (here and commit message), tables do not overlap

> +assert((mem_array_size + offset) < (T32_BASE - T19_BASE));
> +
>  smbios_build_type_32_table();
>  smbios_build_type_38_table();
>  smbios_build_type_41_table(errp);




Re: [PATCH v6 05/10] virtiofsd, fuse_lowlevel.c: Add capability to parse security context

2022-02-14 Thread Dr. David Alan Gilbert
* Vivek Goyal (vgo...@redhat.com) wrote:
> Add capability to enable and parse security context as sent by client
> and put into fuse_req. Filesystems now can get security context from
> request and set it on files during creation.
> 
> Signed-off-by: Vivek Goyal 

I'd be tempted to move the secctx_enabled check into
parse_secctx_fill_req - but OK.
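
A rough sketch of what moving that check would look like (illustrative
only, not what this patch does):

    static int parse_secctx_fill_req(fuse_req_t req, struct fuse_mbuf_iter *iter)
    {
        if (!(req->se->conn.want & FUSE_CAP_SECURITY_CTX)) {
            /* Security contexts were not negotiated; nothing to parse. */
            return 0;
        }
        ...
    }

with the callers in do_mknod()/do_mkdir() then invoking it unconditionally.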

Reviewed-by: Dr. David Alan Gilbert 

> ---
>  tools/virtiofsd/fuse_common.h   |   5 ++
>  tools/virtiofsd/fuse_i.h|   7 +++
>  tools/virtiofsd/fuse_lowlevel.c | 102 +++-
>  3 files changed, 113 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h
> index 6f8a988202..bf46954dab 100644
> --- a/tools/virtiofsd/fuse_common.h
> +++ b/tools/virtiofsd/fuse_common.h
> @@ -377,6 +377,11 @@ struct fuse_file_info {
>   */
>  #define FUSE_CAP_SETXATTR_EXT (1 << 29)
>  
> +/**
> + * Indicates that file server supports creating file security context
> + */
> +#define FUSE_CAP_SECURITY_CTX (1ULL << 32)
> +
>  /**
>   * Ioctl flags
>   *
> diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h
> index 492e002181..a5572fa4ae 100644
> --- a/tools/virtiofsd/fuse_i.h
> +++ b/tools/virtiofsd/fuse_i.h
> @@ -15,6 +15,12 @@
>  struct fv_VuDev;
>  struct fv_QueueInfo;
>  
> +struct fuse_security_context {
> +const char *name;
> +uint32_t ctxlen;
> +const void *ctx;
> +};
> +
>  struct fuse_req {
>  struct fuse_session *se;
>  uint64_t unique;
> @@ -35,6 +41,7 @@ struct fuse_req {
>  } u;
>  struct fuse_req *next;
>  struct fuse_req *prev;
> +struct fuse_security_context secctx;
>  };
>  
>  struct fuse_notify_req {
> diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
> index d91cd9743a..2909122b23 100644
> --- a/tools/virtiofsd/fuse_lowlevel.c
> +++ b/tools/virtiofsd/fuse_lowlevel.c
> @@ -886,11 +886,63 @@ static void do_readlink(fuse_req_t req, fuse_ino_t 
> nodeid,
>  }
>  }
>  
> +static int parse_secctx_fill_req(fuse_req_t req, struct fuse_mbuf_iter *iter)
> +{
> +struct fuse_secctx_header *fsecctx_header;
> +struct fuse_secctx *fsecctx;
> +const void *secctx;
> +const char *name;
> +
> +fsecctx_header = fuse_mbuf_iter_advance(iter, sizeof(*fsecctx_header));
> +if (!fsecctx_header) {
> +return -EINVAL;
> +}
> +
> +/*
> + * As of now maximum of one security context is supported. It can
> + * change in future though.
> + */
> +if (fsecctx_header->nr_secctx > 1) {
> +return -EINVAL;
> +}
> +
> +/* No security context sent. Maybe no LSM supports it */
> +if (!fsecctx_header->nr_secctx) {
> +return 0;
> +}
> +
> +fsecctx = fuse_mbuf_iter_advance(iter, sizeof(*fsecctx));
> +if (!fsecctx) {
> +return -EINVAL;
> +}
> +
> +/* struct fsecctx with zero sized context is not expected */
> +if (!fsecctx->size) {
> +return -EINVAL;
> +}
> +name = fuse_mbuf_iter_advance_str(iter);
> +if (!name) {
> +return -EINVAL;
> +}
> +
> +secctx = fuse_mbuf_iter_advance(iter, fsecctx->size);
> +if (!secctx) {
> +return -EINVAL;
> +}
> +
> +req->secctx.name = name;
> +req->secctx.ctx = secctx;
> +req->secctx.ctxlen = fsecctx->size;
> +return 0;
> +}
> +
>  static void do_mknod(fuse_req_t req, fuse_ino_t nodeid,
>   struct fuse_mbuf_iter *iter)
>  {
>  struct fuse_mknod_in *arg;
>  const char *name;
> +bool secctx_enabled = req->se->conn.want & FUSE_CAP_SECURITY_CTX;
> +int err;
>  
>  arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
>  name = fuse_mbuf_iter_advance_str(iter);
> @@ -901,6 +953,14 @@ static void do_mknod(fuse_req_t req, fuse_ino_t nodeid,
>  
>  req->ctx.umask = arg->umask;
>  
> +if (secctx_enabled) {
> +err = parse_secctx_fill_req(req, iter);
> +if (err) {
> +fuse_reply_err(req, -err);
> +return;
> +}
> +}
> +
>  if (req->se->op.mknod) {
>  req->se->op.mknod(req, nodeid, name, arg->mode, arg->rdev);
>  } else {
> @@ -913,6 +973,8 @@ static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid,
>  {
>  struct fuse_mkdir_in *arg;
>  const char *name;
> +bool secctx_enabled = req->se->conn.want & FUSE_CAP_SECURITY_CTX;
> +int err;
>  
>  arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
>  name = fuse_mbuf_iter_advance_str(iter);
> @@ -923,6 +985,14 @@ static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid,
>  
>  req->ctx.umask = arg->umask;
>  
> +if (secctx_enabled) {
> +err = parse_secctx_fill_req(req, iter);
> +if (err) {
> +fuse_reply_err(req, err);
> +return;
> +}
> +}
> +
>  if (req->se->op.mkdir) {
>  req->se->op.mkdir(req, nodeid, name, arg->mode);
>  } else {
> @@ -969,12 +1039,22 @@ static void do_symlink(fuse_req_

[PATCH v3 4/4] tests/qtest/virtio-iommu-test: Check bypass config

2022-02-14 Thread Jean-Philippe Brucker
The bypass config field should be initialized to 1 by default.

Reviewed-by: Eric Auger 
Signed-off-by: Jean-Philippe Brucker 
---
 tests/qtest/virtio-iommu-test.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/qtest/virtio-iommu-test.c b/tests/qtest/virtio-iommu-test.c
index 47e68388a0..068e7a9e6c 100644
--- a/tests/qtest/virtio-iommu-test.c
+++ b/tests/qtest/virtio-iommu-test.c
@@ -31,11 +31,13 @@ static void pci_config(void *obj, void *data, 
QGuestAllocator *t_alloc)
 uint64_t input_range_end = qvirtio_config_readq(dev, 16);
 uint32_t domain_range_start = qvirtio_config_readl(dev, 24);
 uint32_t domain_range_end = qvirtio_config_readl(dev, 28);
+uint8_t bypass = qvirtio_config_readb(dev, 36);
 
 g_assert_cmpint(input_range_start, ==, 0);
 g_assert_cmphex(input_range_end, ==, UINT64_MAX);
 g_assert_cmpint(domain_range_start, ==, 0);
 g_assert_cmpint(domain_range_end, ==, UINT32_MAX);
+g_assert_cmpint(bypass, ==, 1);
 }
 
 static int read_tail_status(struct virtio_iommu_req_tail *buffer)
-- 
2.35.1




Re: [RFC] thread-pool: Add option to fix the pool size

2022-02-14 Thread Stefan Hajnoczi
On Mon, Feb 14, 2022 at 12:38:22PM +0100, Kevin Wolf wrote:
> Am 14.02.2022 um 11:09 hat Stefan Hajnoczi geschrieben:
> > On Fri, Feb 11, 2022 at 12:32:57PM +0100, Kevin Wolf wrote:
> > > Am 03.02.2022 um 15:19 hat Stefan Hajnoczi geschrieben:
> > > > On Thu, Feb 03, 2022 at 10:56:49AM +, Daniel P. Berrangé wrote:
> > > > > On Thu, Feb 03, 2022 at 10:53:07AM +, Stefan Hajnoczi wrote:
> > > > > > On Wed, Feb 02, 2022 at 06:52:34PM +0100, Nicolas Saenz Julienne 
> > > > > > wrote:
> > > > 1. A global default value that all new AioContext take. The QEMU main
> > > >loop's qemu_aio_context will use this and all IOThread AioContext
> > > >will use it (unless they have been overridden).
> > > > 
> > > >I would define it on --machine because that's the "global" object for
> > > >a guest, but that's not very satisfying.
> > > 
> > > Semantically, -machine is about the virtual hardware, whereas iothreads
> > > are about the backend, so I agree it's not a good fit.
> > > 
> > > For the main thread, you may want to configure all the same options that
> > > you can configure for an iothread. So to me that sounds like we would
> > > want to allow using an iothread object for the main thread, too.
> > > 
> > > That would still require us to tell QEMU which iothread object should be
> > > used for the main thread, though.
> > 
> > Making the main loop thread an IOThread is an interesting direction but
> > not an easy change to make.
> > 
> > The main loop thread has a custom event loop that is not interchangeable
> > with the IOThread event loop:
> > - The main loop has a poll notifier interface for libslirp fd monitoring
> >   integration.
> > - The main loop is a GLib event loop but manually polls to get
> >   nanosecond resolution timers.
> > - The main loop has icount integration.
> > - The main loop has the special iohandler AioContext
> > 
> > The IOThread event loop runs an optimized AioContext event loop instead.
> > It falls back to regular g_main_loop_run() if there is a GSource user.
> > 
> > It would definitely be nice to unify the main loop with IOThread and
> > then use --object iothread,... to configure main loop parameters.
> > 
> > I'm not sure if requiring that of Nicolas is fair though. The event
> > loops in QEMU are complex and changes are likely to introduce subtle
> > bugs or performance regressions.
> 
> I'm not suggesting actually running the iothread event loop instead,
> merely using the properties of an object to configure the main thread as
> the external user interface.
> Whether this uses the same main loop code as today or is moved to the
> regular iothread event loop is an implementation detail that can be
> changed later.
> 
> Or we could maybe use a different object type like 'mainthread' and
> share the properties using QOM inheritance.

That seems cleaner to me than trying to fake an IOThread, since I don't
see a concrete plan to unify the two event loops.

The main loop code is in util/main-loop.c. Maybe call it --object
main-loop? Attempting to instantiate more than one main-loop object
should fail.
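
A sketch of how that could look on the command line (the property name
here is hypothetical, purely to illustrate the shape of the interface):

    qemu-system-x86_64 -object main-loop,id=main,thread-pool-size=16 ...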

Stefan




Re: [PATCH] Add --with-branding-prefix and QEMU_BRANDING_PREFIX

2022-02-14 Thread Liviu Ionescu



> On 14 Feb 2022, at 14:28, Daniel P. Berrangé  wrote:
> 
> In a .git checkout, pkgversion defaults to "git describe --match 'v*' --dirty"
> See scripts/qemu-version.sh

I see. I would say that this is a different use case.

Thank you,

Liviu




Re: [PATCH 2/2] target/ppc/kvm: Use KVM_CAP_PPC_AIL_MODE_3 to determine cap-ail-mode-3 support

2022-02-14 Thread Fabiano Rosas
Nicholas Piggin  writes:

> Use KVM_CAP_PPC_AIL_MODE_3 to determine cap-ail-mode-3 support for KVM
> guests. Keep the fallback heuristic for KVM hosts that pre-date this
> CAP.
>
> This is only proposed; the KVM CAP has not yet been allocated. I will
> ask to merge the new KVM cap when there are no objections on the QEMU
> side.
>
> not-yet-Signed-off-by: Nicholas Piggin 
> ---
>  hw/ppc/spapr_caps.c   |  2 +-
>  linux-headers/linux/kvm.h |  1 +
>  target/ppc/kvm.c  | 18 +-
>  target/ppc/kvm_ppc.h  |  4 ++--
>  4 files changed, 21 insertions(+), 4 deletions(-)
>
> diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c
> index 5fd4a53c33..5cc80776d0 100644
> --- a/hw/ppc/spapr_caps.c
> +++ b/hw/ppc/spapr_caps.c
> @@ -619,7 +619,7 @@ static void cap_ail_mode_3_apply(SpaprMachineState *spapr,
>  ERRP_GUARD();
>  
>  if (kvm_enabled()) {
> -if (!kvmppc_supports_ail_3()) {
> +if (!kvmppc_has_cap_ail_3()) {
>  error_setg(errp, "KVM implementation does not support 
> cap-ail-mode-3");
>  error_append_hint(errp, "Try appending -machine 
> cap-ail-mode-3=off\n");
>  return;
> diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
> index 02c5e7b7bb..d91f578200 100644
> --- a/linux-headers/linux/kvm.h
> +++ b/linux-headers/linux/kvm.h
> @@ -1130,6 +1130,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_BINARY_STATS_FD 203
>  #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
>  #define KVM_CAP_ARM_MTE 205
> +#define KVM_CAP_PPC_AIL_MODE_3 210
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> index 128bc530d4..d0d0bdaac4 100644
> --- a/target/ppc/kvm.c
> +++ b/target/ppc/kvm.c
> @@ -90,6 +90,7 @@ static int cap_ppc_nested_kvm_hv;
>  static int cap_large_decr;
>  static int cap_fwnmi;
>  static int cap_rpt_invalidate;
> +static int cap_ail_mode_3;
>  
>  static uint32_t debug_inst_opcode;
>  
> @@ -154,6 +155,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>  }
>  
>  cap_rpt_invalidate = kvm_vm_check_extension(s, 
> KVM_CAP_PPC_RPT_INVALIDATE);
> +cap_ail_mode_3 = kvm_vm_check_extension(s, KVM_CAP_PPC_AIL_MODE_3);
>  kvm_ppc_register_host_cpu_type();
>  
>  return 0;
> @@ -2563,10 +2565,24 @@ int kvmppc_has_cap_rpt_invalidate(void)
>  return cap_rpt_invalidate;
>  }
>  
> -int kvmppc_supports_ail_3(void)
> +int kvmppc_has_cap_ail_3(void)
>  {
>  PowerPCCPUClass *pcc = kvm_ppc_get_host_cpu_class();
>  
> +if (cap_ail_mode_3) {
> +return 1;
> +}
> +
> +if (kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_PPC_AIL_MODE_3) == 
> 0) {
> +return 0;
> +}

This is not needed here it seems.

> +
> +/*
> + * For KVM hosts that pre-date this cap, special-case the test because
> + * the performance cost for disabling the feature unconditionally is
> + * prohibitive.
> + */
> +
>  /*
>   * KVM PR only supports AIL-0
>   */
> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> index d9d1c54955..efafa67b83 100644
> --- a/target/ppc/kvm_ppc.h
> +++ b/target/ppc/kvm_ppc.h
> @@ -73,7 +73,7 @@ int kvmppc_set_cap_nested_kvm_hv(int enable);
>  int kvmppc_get_cap_large_decr(void);
>  int kvmppc_enable_cap_large_decr(PowerPCCPU *cpu, int enable);
>  int kvmppc_has_cap_rpt_invalidate(void);
> -int kvmppc_supports_ail_3(void);
> +int kvmppc_has_cap_ail_3(void);
>  int kvmppc_enable_hwrng(void);
>  int kvmppc_put_books_sregs(PowerPCCPU *cpu);
>  PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void);
> @@ -394,7 +394,7 @@ static inline int kvmppc_has_cap_rpt_invalidate(void)
>  return false;
>  }
>  
> -static inline int kvmppc_supports_ail_3(void)
> +static inline int kvmppc_has_cap_ail_3(void)
>  {
>  return false;
>  }



[PATCH v3 3/4] virtio-iommu: Support bypass domain

2022-02-14 Thread Jean-Philippe Brucker
The driver can create a bypass domain by passing the
VIRTIO_IOMMU_ATTACH_F_BYPASS flag on the ATTACH request. Bypass domains
perform slightly better than domains with identity mappings since they
skip translation.
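
A minimal sketch of the driver-side request, assuming the field and flag
names from the virtio-iommu spec headers (tail and error handling
omitted):

    struct virtio_iommu_req_attach req = {
        .head.type = VIRTIO_IOMMU_T_ATTACH,
        .domain    = cpu_to_le32(domain_id),
        .endpoint  = cpu_to_le32(ep_id),
        .flags     = cpu_to_le32(VIRTIO_IOMMU_ATTACH_F_BYPASS),
    };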

Signed-off-by: Jean-Philippe Brucker 
---
 hw/virtio/virtio-iommu.c | 39 ++-
 1 file changed, 34 insertions(+), 5 deletions(-)

diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
index 4ca36db4ac..239fe97b12 100644
--- a/hw/virtio/virtio-iommu.c
+++ b/hw/virtio/virtio-iommu.c
@@ -43,6 +43,7 @@
 
 typedef struct VirtIOIOMMUDomain {
 uint32_t id;
+bool bypass;
 GTree *mappings;
 QLIST_HEAD(, VirtIOIOMMUEndpoint) endpoint_list;
 } VirtIOIOMMUDomain;
@@ -258,12 +259,16 @@ static void virtio_iommu_put_endpoint(gpointer data)
 }
 
 static VirtIOIOMMUDomain *virtio_iommu_get_domain(VirtIOIOMMU *s,
-  uint32_t domain_id)
+  uint32_t domain_id,
+  bool bypass)
 {
 VirtIOIOMMUDomain *domain;
 
 domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
 if (domain) {
+if (domain->bypass != bypass) {
+return NULL;
+}
 return domain;
 }
 domain = g_malloc0(sizeof(*domain));
@@ -271,6 +276,7 @@ static VirtIOIOMMUDomain 
*virtio_iommu_get_domain(VirtIOIOMMU *s,
 domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp,
NULL, (GDestroyNotify)g_free,
(GDestroyNotify)g_free);
+domain->bypass = bypass;
 g_tree_insert(s->domains, GUINT_TO_POINTER(domain_id), domain);
 QLIST_INIT(&domain->endpoint_list);
 trace_virtio_iommu_get_domain(domain_id);
@@ -334,11 +340,16 @@ static int virtio_iommu_attach(VirtIOIOMMU *s,
 {
 uint32_t domain_id = le32_to_cpu(req->domain);
 uint32_t ep_id = le32_to_cpu(req->endpoint);
+uint32_t flags = le32_to_cpu(req->flags);
 VirtIOIOMMUDomain *domain;
 VirtIOIOMMUEndpoint *ep;
 
 trace_virtio_iommu_attach(domain_id, ep_id);
 
+if (flags & ~VIRTIO_IOMMU_ATTACH_F_BYPASS) {
+return VIRTIO_IOMMU_S_INVAL;
+}
+
 ep = virtio_iommu_get_endpoint(s, ep_id);
 if (!ep) {
 return VIRTIO_IOMMU_S_NOENT;
@@ -356,7 +367,12 @@ static int virtio_iommu_attach(VirtIOIOMMU *s,
 }
 }
 
-domain = virtio_iommu_get_domain(s, domain_id);
+domain = virtio_iommu_get_domain(s, domain_id,
+ flags & VIRTIO_IOMMU_ATTACH_F_BYPASS);
+if (!domain) {
+/* Incompatible bypass flag */
+return VIRTIO_IOMMU_S_INVAL;
+}
 QLIST_INSERT_HEAD(&domain->endpoint_list, ep, next);
 
 ep->domain = domain;
@@ -419,6 +435,10 @@ static int virtio_iommu_map(VirtIOIOMMU *s,
 return VIRTIO_IOMMU_S_NOENT;
 }
 
+if (domain->bypass) {
+return VIRTIO_IOMMU_S_INVAL;
+}
+
 interval = g_malloc0(sizeof(*interval));
 
 interval->low = virt_start;
@@ -464,6 +484,11 @@ static int virtio_iommu_unmap(VirtIOIOMMU *s,
 if (!domain) {
 return VIRTIO_IOMMU_S_NOENT;
 }
+
+if (domain->bypass) {
+return VIRTIO_IOMMU_S_INVAL;
+}
+
 interval.low = virt_start;
 interval.high = virt_end;
 
@@ -780,6 +805,9 @@ static IOMMUTLBEntry 
virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
 entry.perm = flag;
 }
 goto unlock;
+} else if (ep->domain->bypass) {
+entry.perm = flag;
+goto unlock;
 }
 
 found = g_tree_lookup_extended(ep->domain->mappings, (gpointer)(&interval),
@@ -1139,8 +1167,8 @@ static const VMStateDescription vmstate_endpoint = {
 
 static const VMStateDescription vmstate_domain = {
 .name = "domain",
-.version_id = 1,
-.minimum_version_id = 1,
+.version_id = 2,
+.minimum_version_id = 2,
 .pre_load = domain_preload,
 .fields = (VMStateField[]) {
 VMSTATE_UINT32(id, VirtIOIOMMUDomain),
@@ -1149,6 +1177,7 @@ static const VMStateDescription vmstate_domain = {
 VirtIOIOMMUInterval, VirtIOIOMMUMapping),
 VMSTATE_QLIST_V(endpoint_list, VirtIOIOMMUDomain, 1,
 vmstate_endpoint, VirtIOIOMMUEndpoint, next),
+VMSTATE_BOOL_V(bypass, VirtIOIOMMUDomain, 2),
 VMSTATE_END_OF_LIST()
 }
 };
@@ -1186,7 +1215,7 @@ static const VMStateDescription 
vmstate_virtio_iommu_device = {
 .version_id = 2,
 .post_load = iommu_post_load,
 .fields = (VMStateField[]) {
-VMSTATE_GTREE_DIRECT_KEY_V(domains, VirtIOIOMMU, 1,
+VMSTATE_GTREE_DIRECT_KEY_V(domains, VirtIOIOMMU, 2,
&vmstate_domain, VirtIOIOMMUDomain),
 VMSTATE_UINT8_V(config.bypass, VirtIOIOMMU, 2),
 VMSTATE_END_OF_LIST()
-- 
2.35.1




RE: [RFC v4 01/21] vfio-user: introduce vfio-user protocol specification

2022-02-14 Thread Thanos Makatos


> -Original Message-
> From: Qemu-devel  bounces+thanos.makatos=nutanix@nongnu.org> On Behalf Of John
> Johnson
> Sent: 12 January 2022 00:44
> To: qemu-devel@nongnu.org
> Subject: [RFC v4 01/21] vfio-user: introduce vfio-user protocol specification
> 
> From: Thanos Makatos 
> 
> This patch introduces the vfio-user protocol specification (formerly
> known as VFIO-over-socket), which is designed to allow devices to be
> emulated outside QEMU, in a separate process. vfio-user reuses the
> existing VFIO defines, structs and concepts.
> 
> It has been earlier discussed as an RFC in:
> "RFC: use VFIO over a UNIX domain socket to implement device offloading"
> 
> Signed-off-by: John G Johnson 
> Signed-off-by: Thanos Makatos 
> Signed-off-by: John Levon 
> ---
>  docs/devel/index.rst |1 +
>  docs/devel/vfio-user.rst | 1810
> ++
>  MAINTAINERS  |6 +
>  3 files changed, 1817 insertions(+)
>  create mode 100644 docs/devel/vfio-user.rst
> 
> diff --git a/docs/devel/index.rst b/docs/devel/index.rst
> index afd9375..23d2c30 100644
> --- a/docs/devel/index.rst
> +++ b/docs/devel/index.rst
> @@ -48,3 +48,4 @@ modifying QEMU's source code.
> trivial-patches
> submitting-a-patch
> submitting-a-pull-request
> +   vfio-user
> diff --git a/docs/devel/vfio-user.rst b/docs/devel/vfio-user.rst
> new file mode 100644
> index 000..97a7506
> --- /dev/null
> +++ b/docs/devel/vfio-user.rst
> @@ -0,0 +1,1810 @@
> +.. include:: 
> +
> +vfio-user Protocol Specification
> +
> +
> +--
> +Version_ 0.9.1
> +--
> +
> +.. contents:: Table of Contents
> +
> +Introduction
> +
> +vfio-user is a protocol that allows a device to be emulated in a separate
> +process outside of a Virtual Machine Monitor (VMM). vfio-user devices consist
> +of a generic VFIO device type, living inside the VMM, which we call the 
> client,
> +and the core device implementation, living outside the VMM, which we call the
> +server.
> +
> +The vfio-user specification is partly based on the
> +`Linux VFIO ioctl interface <https://www.kernel.org/doc/html/latest/driver-api/vfio.html>`_.
> +
> +VFIO is a mature and stable API, backed by an extensively used framework. The
> +existing VFIO client implementation in QEMU (``qemu/hw/vfio/``) can be 
> largely
> +re-used, though there is nothing in this specification that requires that
> +particular implementation. None of the VFIO kernel modules are required for
> +supporting the protocol, on either the client or server side. Some source
> +definitions in VFIO are re-used for vfio-user.
> +
> +The main idea is to allow a virtual device to function in a separate process 
> in
> +the same host over a UNIX domain socket. A UNIX domain socket (``AF_UNIX``)
> is
> +chosen because file descriptors can be trivially sent over it, which in turn
> +allows:
> +
> +* Sharing of client memory for DMA with the server.
> +* Sharing of server memory with the client for fast MMIO.
> +* Efficient sharing of eventfd's for triggering interrupts.
> +
> +Other socket types could be used which allow the server to run in a separate
> +guest in the same host (``AF_VSOCK``) or remotely (``AF_INET``). 
> Theoretically
> +the underlying transport does not necessarily have to be a socket, however we
> do
> +not examine such alternatives. In this protocol version we focus on using a
> UNIX
> +domain socket and introduce basic support for the other two types of sockets
> +without considering performance implications.
> +
> +While passing of file descriptors is desirable for performance reasons, 
> support
> +is not necessary for either the client or the server in order to implement 
> the
> +protocol. There is always an in-band, message-passing fall back mechanism.
> +
> +Overview
> +
> +
> +VFIO is a framework that allows a physical device to be securely passed
> through
> +to a user space process; the device-specific kernel driver does not drive the
> +device at all.  Typically, the user space process is a VMM and the device is
> +passed through to it in order to achieve high performance. VFIO provides an
> API
> +and the required functionality in the kernel. QEMU has adopted VFIO to allow 
> a
> +guest to directly access physical devices, instead of emulating them in
> +software.
> +
> +vfio-user reuses the core VFIO concepts defined in its API, but implements 
> them
> +as messages to be sent over a socket. It does not change the kernel-based
> VFIO
> +in any way, in fact none of the VFIO kernel modules need to be loaded to use
> +vfio-user. It is also possible for the client to concurrently use 

Re: [PATCH RFCv2 3/4] i386/pc: warn if phys-bits is too low

2022-02-14 Thread David Edmondson
On Monday, 2022-02-07 at 20:24:21 GMT, Joao Martins wrote:

> Default phys-bits on Qemu is TCG_PHYS_BITS (40) which is enough
> to address 1Tb (0xffffffffff). On AMD platforms, if a
> ram-above-4g relocation happens and the CPU wasn't configured
> with a big enough phys-bits, warn the user. There isn't a
> catastrophic failure exactly, the guest will still boot, but
> most likely won't be able to use more than ~4G of RAM.
>
> Signed-off-by: Joao Martins 
> ---
>  hw/i386/pc.c | 7 +++
>  1 file changed, 7 insertions(+)
>
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index b060aedd38f3..f8712eb8427e 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -842,6 +842,7 @@ static void relocate_4g(MachineState *machine, 
> PCMachineState *pcms)
>  X86MachineState *x86ms = X86_MACHINE(pcms);
>  ram_addr_t device_mem_size = 0;
>  uint32_t eax, vendor[3];
> +hwaddr maxphysaddr;
>
>  host_cpuid(0x0, 0, &eax, &vendor[0], &vendor[2], &vendor[1]);
>  if (!IS_AMD_VENDOR(vendor)) {
> @@ -858,6 +859,12 @@ static void relocate_4g(MachineState *machine, 
> PCMachineState *pcms)
>  return;
>  }
>
> +maxphysaddr = ((hwaddr)1 << X86_CPU(first_cpu)->phys_bits) - 1;
> +if (maxphysaddr < AMD_ABOVE_1TB_START)

Braces around the block are required, I believe.

> +warn_report("Relocated RAM above 4G to start at %lu "

Should use PRIu64?

> +"phys-bits too low (%u)",
> +AMD_ABOVE_1TB_START, X86_CPU(first_cpu)->phys_bits);
> +
>  x86ms->above_4g_mem_start = AMD_ABOVE_1TB_START;

And a real nit - until above_4g_mem_start is modified, the number of
phys_bits is fine, so I would have put the warning after the assignment.

>  }
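
With both comments applied, the hunk might end up looking roughly like
this (purely illustrative):

    maxphysaddr = ((hwaddr)1 << X86_CPU(first_cpu)->phys_bits) - 1;
    x86ms->above_4g_mem_start = AMD_ABOVE_1TB_START;
    if (maxphysaddr < AMD_ABOVE_1TB_START) {
        warn_report("Relocating RAM above 4G to start at %" PRIu64 ", "
                    "phys-bits too low (%u)",
                    AMD_ABOVE_1TB_START, X86_CPU(first_cpu)->phys_bits);
    }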

dme.
-- 
Tonight I'm gonna bury that horse in the ground.



Re: [PATCH] Add --with-branding-prefix and QEMU_BRANDING_PREFIX

2022-02-14 Thread Peter Maydell
On Mon, 14 Feb 2022 at 13:06, Liviu Ionescu  wrote:
>
>
>
> > On 14 Feb 2022, at 14:28, Daniel P. Berrangé  wrote:
> >
> > In a .git checkout, pkgversion defaults to "git describe --match 'v*' 
> > --dirty"
> > See scripts/qemu-version.sh
>
> I see. I would say that this is a different use case.

It's just a handy default for git checkouts. The primary use case
of pkgversion is exactly "this is a downstream packaging of QEMU
that wants to indicate that in the version string".
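
For example, a downstream build would typically do something like
(package string made up):

    ./configure --with-pkgversion="MyDistro 6.2.0-3" ...

and that string then shows up in "qemu-system-* --version" and in the
QMP query-version reply.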

-- PMM



Re: [PATCH] target/ppc: raise HV interrupts for partition table entry problems

2022-02-14 Thread Daniel Henrique Barboza




On 2/14/22 09:31, Nicholas Piggin wrote:

Invalid or missing partition table entry exceptions should cause HV
interrupts. HDSISR is set to bad MMU config, which is consistent with
the ISA and experimentally matches what POWER9 generates.

Signed-off-by: Nicholas Piggin 
---


Reviewed-by: Daniel Henrique Barboza 


  target/ppc/mmu-radix64.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
index 040c055bff..54fb3ce98d 100644
--- a/target/ppc/mmu-radix64.c
+++ b/target/ppc/mmu-radix64.c
@@ -560,13 +560,13 @@ static bool ppc_radix64_xlate_impl(PowerPCCPU *cpu, vaddr 
eaddr,
  } else {
  if (!ppc64_v3_get_pate(cpu, lpid, &pate)) {
  if (guest_visible) {
-ppc_radix64_raise_si(cpu, access_type, eaddr, DSISR_NOPTE);
+ppc_radix64_raise_hsi(cpu, access_type, eaddr, eaddr, 
DSISR_R_BADCONFIG);
  }
  return false;
  }
  if (!validate_pate(cpu, lpid, &pate)) {
  if (guest_visible) {
-ppc_radix64_raise_si(cpu, access_type, eaddr, 
DSISR_R_BADCONFIG);
+ppc_radix64_raise_hsi(cpu, access_type, eaddr, eaddr, 
DSISR_R_BADCONFIG);
  }
  return false;
  }



