date:20150325

Re: [Qemu-devel] [RFC PATCH v2 19/23] spapr: CPU hot unplug support

2015-03-25 Thread David Gibson

On Mon, Mar 23, 2015 at 07:06:00PM +0530, Bharata B Rao wrote:
> Support hot removal of CPU for sPAPR guests by sending the hot
> unplug notification to the guest via EPOW interrupt.
> 
> Signed-off-by: Bharata B Rao 
> ---
>  hw/ppc/spapr.c| 78 
> ++-
>  linux-headers/linux/kvm.h |  1 +
>  target-ppc/kvm.c  |  7 +
>  target-ppc/kvm_ppc.h  |  6 
>  4 files changed, 91 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index b48994b..7b8784d 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1468,6 +1468,12 @@ static void spapr_cpu_init(PowerPCCPU *cpu)
>  qemu_register_reset(spapr_cpu_reset, cpu);
>  }
>  
> +static void spapr_cpu_destroy(PowerPCCPU *cpu)
> +{
> +xics_cpu_destroy(spapr->icp, cpu);
> +qemu_unregister_reset(spapr_cpu_reset, cpu);
> +}
> +
>  /* pSeries LPAR / sPAPR hardware init */
>  static void ppc_spapr_init(MachineState *machine)
>  {
> @@ -1880,6 +1886,18 @@ static void spapr_cpu_hotplug_add(DeviceState *dev, 
> CPUState *cs, Error **errp)
>  }
>  }
>  
> +static void spapr_cpu_hotplug_remove(DeviceState *dev, CPUState *cs,
> + Error **errp)
> +{
> +PowerPCCPU *cpu = POWERPC_CPU(cs);
> +int id = ppc_get_vcpu_dt_id(cpu);
> +sPAPRDRConnector *drc =
> +spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_CPU, id);
> +sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> +
> +drck->detach(drc, dev, NULL, NULL, errp);
> +}
> +
>  static void spapr_cpu_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>  Error **errp)
>  {
> @@ -1911,6 +1929,51 @@ static void spapr_cpu_plug(HotplugHandler 
> *hotplug_dev, DeviceState *dev,
>  return;
>  }
>  
> +static int spapr_cpu_unplug(Object *obj, void *opaque)
> +{
> +Error **errp = opaque;
> +DeviceState *dev = DEVICE(obj);
> +CPUState *cs = CPU(dev);
> +PowerPCCPU *cpu = POWERPC_CPU(cs);
> +int id = ppc_get_vcpu_dt_id(cpu);
> +int smt = kvmppc_smt_threads();
> +sPAPRDRConnector *drc =
> +spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_CPU, id);
> +
> +spapr_cpu_destroy(cpu);

I may be misunderstanding something here, but don't you need to signal
the removal to the guest (and get its ack) before you "physically"
remove the cpu?

> +/*
> + * SMT threads return from here, only main thread (core) will
> + * continue and signal hot unplug event to the guest.
> + */
> +if ((id % smt) != 0) {
> +return 0;
> +}
> +g_assert(drc);
> +
> +spapr_cpu_hotplug_remove(dev, cs, errp);
> +if (*errp) {
> +return -1;
> +}
> +spapr_hotplug_req_remove_event(drc);
> +
> +return 0;
> +}
> +
> +static int spapr_cpu_core_unplug(Object *obj, void *opaque)
> +{
> +Error **errp = opaque;
> +
> +object_child_foreach(obj, spapr_cpu_unplug, errp);
> +return 0;
> +}
> +
> +static void spapr_cpu_socket_unplug(HotplugHandler *hotplug_dev,
> +DeviceState *dev, Error **errp)
> +{
> +object_child_foreach(OBJECT(dev), spapr_cpu_core_unplug, errp);
> +}
> +
>  static void spapr_machine_device_plug(HotplugHandler *hotplug_dev,
>DeviceState *dev, Error **errp)
>  {
> @@ -1926,10 +1989,21 @@ static void spapr_machine_device_plug(HotplugHandler 
> *hotplug_dev,
>  }
>  }
>  
> +static void spapr_machine_device_unplug(HotplugHandler *hotplug_dev,
> +  DeviceState *dev, Error **errp)
> +{
> +if (object_dynamic_cast(OBJECT(dev), TYPE_CPU_SOCKET)) {
> +if (dev->hotplugged && spapr->dr_cpu_enabled) {
> +spapr_cpu_socket_unplug(hotplug_dev, dev, errp);
> +}
> +}
> +}
> +
>  static HotplugHandler *spapr_get_hotpug_handler(MachineState *machine,
>   DeviceState *dev)
>  {
> -if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
> +if (object_dynamic_cast(OBJECT(dev), TYPE_CPU) ||
> +object_dynamic_cast(OBJECT(dev), TYPE_CPU_SOCKET)) {
>  return HOTPLUG_HANDLER(machine);
>  }
>  return NULL;
> @@ -1953,6 +2027,8 @@ static void spapr_machine_class_init(ObjectClass *oc, 
> void *data)
>  mc->has_dynamic_sysbus = true;
>  mc->get_hotplug_handler = spapr_get_hotpug_handler;
>  hc->plug = spapr_machine_device_plug;
> +hc->unplug = spapr_machine_device_unplug;
> +
>  smc->dr_phb_enabled = false;
>  smc->dr_cpu_enabled = false;
>  smc->dr_lmb_enabled = false;
> diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
> index 12045a1..0c1be07 100644
> --- a/linux-headers/linux/kvm.h
> +++ b/linux-headers/linux/kvm.h
> @@ -761,6 +761,7 @@ struct kvm_ppc_smmu_info {
>  #define KVM_CAP_PPC_FIXUP_HCALL 103
>  #define KVM_CAP_PPC_ENABLE_HCALL 104
>  #define KVM_CAP_CHECK_EXTENSION_VM 105
> +#define KVM_CAP_SPAPR_REU

Re: [Qemu-devel] [RFC PATCH v2 20/23] spapr: Remove vCPU objects after CPU hot unplug

2015-03-25 Thread David Gibson

On Mon, Mar 23, 2015 at 07:06:01PM +0530, Bharata B Rao wrote:
> Release the vCPU objects after CPU hot unplug so that vCPU fd
> can be parked and reused.
> 
> Signed-off-by: Bharata B Rao 

I think this patch is simple enough it should just be folded in with
the previous one.

> ---
>  hw/ppc/spapr.c | 19 ++-
>  1 file changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 7b8784d..3e56d9e 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1886,6 +1886,23 @@ static void spapr_cpu_hotplug_add(DeviceState *dev, 
> CPUState *cs, Error **errp)
>  }
>  }
>  
> +static void spapr_cpu_release(DeviceState *dev, void *opaque)
> +{
> +CPUState *cs;
> +int i;
> +int id = ppc_get_vcpu_dt_id(POWERPC_CPU(CPU(dev)));
> +
> +for (i = id; i < id + smp_threads; i++) {
> +CPU_FOREACH(cs) {
> +PowerPCCPU *cpu = POWERPC_CPU(cs);
> +
> +if (i == ppc_get_vcpu_dt_id(cpu)) {
> +cpu_remove(cs);
> +}
> +}
> +}
> +}
> +
>  static void spapr_cpu_hotplug_remove(DeviceState *dev, CPUState *cs,
>   Error **errp)
>  {
> @@ -1895,7 +1912,7 @@ static void spapr_cpu_hotplug_remove(DeviceState *dev, 
> CPUState *cs,
>  spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_CPU, id);
>  sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
>  
> -drck->detach(drc, dev, NULL, NULL, errp);
> +drck->detach(drc, dev, spapr_cpu_release, NULL, errp);
>  }
>  
>  static void spapr_cpu_plug(HotplugHandler *hotplug_dev, DeviceState *dev,

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


pgpUOB6KVI4vf.pgp
Description: PGP signature

Re: [Qemu-devel] [RFC PATCH v2 21/23] spapr: Initialize hotplug memory address space

2015-03-25 Thread David Gibson

On Mon, Mar 23, 2015 at 07:06:02PM +0530, Bharata B Rao wrote:
> Initialize a hotplug memory region under which all the hotplugged
> memory is accommodated. Also enable memory hotplug by setting
> CONFIG_MEM_HOTPLUG.
> 
> Modelled on i386 memory hotplug.
> 
> Signed-off-by: Bharata B Rao 
> ---
>  default-configs/ppc64-softmmu.mak |  1 +
>  hw/ppc/spapr.c| 50 
> +++
>  include/hw/ppc/spapr.h| 12 ++
>  3 files changed, 63 insertions(+)
> 
> diff --git a/default-configs/ppc64-softmmu.mak 
> b/default-configs/ppc64-softmmu.mak
> index 22ef132..16b3011 100644
> --- a/default-configs/ppc64-softmmu.mak
> +++ b/default-configs/ppc64-softmmu.mak
> @@ -51,3 +51,4 @@ CONFIG_XICS_KVM=$(and $(CONFIG_PSERIES),$(CONFIG_KVM))
>  # For PReP
>  CONFIG_MC146818RTC=y
>  CONFIG_ISA_TESTDEV=y
> +CONFIG_MEM_HOTPLUG=y
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 3e56d9e..e43bb49 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -125,8 +125,13 @@ struct sPAPRMachineState {
>  
>  /*< public >*/
>  char *kvm_type;
> +ram_addr_t hotplug_memory_base;
> +MemoryRegion hotplug_memory;
> +bool enforce_aligned_dimm;
>  };
>  
> +#define SPAPR_MACHINE_ENFORCE_ALIGNED_DIMM "enforce-aligned-dimm"

What's the rationale for including this option?

> +
>  sPAPREnvironment *spapr;
>  
>  static XICSState *try_create_xics(const char *type, int nr_servers,
> @@ -1499,6 +1504,7 @@ static void ppc_spapr_init(MachineState *machine)
>  int smt = kvmppc_smt_threads();
>  Object *socket;
>  int sockets;
> +sPAPRMachineState *ms = SPAPR_MACHINE(machine);
>  
>  msi_supported = true;
>  
> @@ -1585,6 +1591,36 @@ static void ppc_spapr_init(MachineState *machine)
>  memory_region_add_subregion(sysmem, 0, rma_region);
>  }
>  
> +/* initialize hotplug memory address space */
> +if (machine->ram_size < machine->maxram_size) {
> +ram_addr_t hotplug_mem_size =
> +machine->maxram_size - machine->ram_size;
> +
> +if (machine->ram_slots > SPAPR_MAX_RAM_SLOTS) {
> +error_report("unsupported amount of memory slots: %"PRIu64,
> + machine->ram_slots);
> +exit(EXIT_FAILURE);
> +}
> +
> +ms->hotplug_memory_base = ROUND_UP(machine->ram_size,
> +SPAPR_HOTPLUG_MEM_ALIGN);
> +
> +if (ms->enforce_aligned_dimm) {
> +hotplug_mem_size += SPAPR_HOTPLUG_MEM_ALIGN * machine->ram_slots;
> +}
> +
> +if ((ms->hotplug_memory_base + hotplug_mem_size) < hotplug_mem_size) 
> {
> +error_report("unsupported amount of maximum memory: " 
> RAM_ADDR_FMT,
> + machine->maxram_size);
> +exit(EXIT_FAILURE);
> +}
> +
> +memory_region_init(&ms->hotplug_memory, OBJECT(ms),
> +   "hotplug-memory", hotplug_mem_size);
> +memory_region_add_subregion(sysmem, ms->hotplug_memory_base,
> +&ms->hotplug_memory);
> +}
> +
>  filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, "spapr-rtas.bin");
>  spapr->rtas_size = get_image_size(filename);
>  spapr->rtas_blob = g_malloc(spapr->rtas_size);
> @@ -1827,13 +1863,27 @@ static void spapr_set_kvm_type(Object *obj, const 
> char *value, Error **errp)
>  sm->kvm_type = g_strdup(value);
>  }
>  
> +static bool spapr_machine_get_aligned_dimm(Object *obj, Error **errp)
> +{
> +sPAPRMachineState *ms = SPAPR_MACHINE(obj);
> +
> +return ms->enforce_aligned_dimm;
> +}
> +
>  static void spapr_machine_initfn(Object *obj)
>  {
> +sPAPRMachineState *ms = SPAPR_MACHINE(obj);
> +
>  object_property_add_str(obj, "kvm-type",
>  spapr_get_kvm_type, spapr_set_kvm_type, NULL);
>  object_property_set_description(obj, "kvm-type",
>  "Specifies the KVM virtualization mode 
> (HV, PR)",
>  NULL);
> +
> +ms->enforce_aligned_dimm = true;
> +object_property_add_bool(obj, SPAPR_MACHINE_ENFORCE_ALIGNED_DIMM,
> + spapr_machine_get_aligned_dimm,
> + NULL, NULL);
>  }
>  
>  static void ppc_cpu_do_nmi_on_cpu(void *arg)
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index ecac6e3..53560e9 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -542,6 +542,18 @@ struct sPAPREventLogEntry {
>  
>  #define SPAPR_MEMORY_BLOCK_SIZE (1 << 28) /* 256MB */
>  
> +/*
> + * This defines the maximum number of DIMM slots we can have for sPAPR
> + * guest. This is not defined by sPAPR but we are defining it to 4096 slots
> + * here. With the worst case addition of SPAPR_MEMORY_BLOCK_SIZE
> + * (256MB) memory per slot, we should be able to support 1TB of guest
> + * hotpluggable memory.
> + */
> +#define SPAPR_MAX_RAM_SLOTS (1ULL <<

Re: [Qemu-devel] [RFC PATCH v2 16/23] cpus: Reclaim vCPU objects

2015-03-25 Thread David Gibson

On Mon, Mar 23, 2015 at 07:05:57PM +0530, Bharata B Rao wrote:
> From: Gu Zheng 
> 
> In order to deal well with the kvm vcpus (which can not be removed without any
> protection), we do not close KVM vcpu fd, just record and mark it as stopped
> into a list, so that we can reuse it for the appending cpu hot-add request if
> possible. It is also the approach that kvm guys suggested:
> https://www.mail-archive.com/kvm@vger.kernel.org/msg102839.html
> 
> Signed-off-by: Chen Fan 
> Signed-off-by: Gu Zheng 
> Signed-off-by: Zhu Guihua 
> Signed-off-by: Bharata B Rao 

Reviewed-by: David Gibson 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


pgp6pLZf9Ihxi.pgp
Description: PGP signature

Re: [Qemu-devel] [RFC PATCH v2 18/23] xics_kvm: Add cpu_destroy method to XICS

2015-03-25 Thread David Gibson

On Mon, Mar 23, 2015 at 07:05:59PM +0530, Bharata B Rao wrote:
> XICS is setup for each CPU during initialization. Provide a routine
> to undo the same when CPU is unplugged.
> 
> This allows reboot of a VM that has undergone CPU hotplug and unplug
> to work correctly.
> 
> Signed-off-by: Bharata B Rao 

Reviewed-by: David Gibson 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


pgpYDfkTkCLVs.pgp
Description: PGP signature

Re: [Qemu-devel] [RFC PATCH v2 17/23] xics_kvm: Don't enable KVM_CAP_IRQ_XICS if already enabled

2015-03-25 Thread David Gibson

On Mon, Mar 23, 2015 at 07:05:58PM +0530, Bharata B Rao wrote:
> When supporting CPU hot removal by parking the vCPU fd and reusing
> it during hotplug again, there can be cases where we try to reenable
> KVM_CAP_IRQ_XICS CAP for the vCPU for which it was already enabled.
> Introduce a boolean member in ICPState to track this and don't
> reenable the CAP if it was already enabled earlier.
> 
> This change allows CPU hot removal to work for sPAPR.
> 
> Signed-off-by: Bharata B Rao 

Why does double enabling the capability cause problems?  I would have
expected it to be unnecessary, but harmless.

> ---
>  hw/intc/xics_kvm.c| 10 ++
>  include/hw/ppc/xics.h |  1 +
>  2 files changed, 11 insertions(+)
> 
> diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c
> index c15453f..5b27bf8 100644
> --- a/hw/intc/xics_kvm.c
> +++ b/hw/intc/xics_kvm.c
> @@ -331,6 +331,15 @@ static void xics_kvm_cpu_setup(XICSState *icp, 
> PowerPCCPU *cpu)
>  abort();
>  }
>  
> +/*
> + * If we are reusing a parked vCPU fd corresponding to the CPU
> + * which was hot-removed earlier we don't have to renable
> + * KVM_CAP_IRQ_XICS capability again.
> + */
> +if (ss->cap_irq_xics_enabled) {
> +return;
> +}
> +
>  if (icpkvm->kernel_xics_fd != -1) {
>  int ret;
>  
> @@ -343,6 +352,7 @@ static void xics_kvm_cpu_setup(XICSState *icp, PowerPCCPU 
> *cpu)
>  kvm_arch_vcpu_id(cs), strerror(errno));
>  exit(1);
>  }
> +ss->cap_irq_xics_enabled = true;
>  }
>  }
>  
> diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h
> index a214dd7..355a966 100644
> --- a/include/hw/ppc/xics.h
> +++ b/include/hw/ppc/xics.h
> @@ -109,6 +109,7 @@ struct ICPState {
>  uint8_t pending_priority;
>  uint8_t mfrr;
>  qemu_irq output;
> +bool cap_irq_xics_enabled;
>  };
>  
>  #define TYPE_ICS "ics"

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


pgpTXuzJE3h6k.pgp
Description: PGP signature

Re: [Qemu-devel] [PATCH v3] block: Switch to host monotonic clock for IO throttling

2015-03-25 Thread Alberto Garcia

On Wed, Mar 25, 2015 at 10:42:32AM +0800, Fam Zheng wrote:

> Block jobs are confusingly inconsistent between with and without
> throttling: if user sets a bps limit, starts a block job, then stops
> vm, the block job will not make any progress; in contrary, if user
> unsets the bps limit, the block job will run normally.

Wait, that's not what happens:

a) If you start a job then stop the vm then the job will be
   completed immediately.
b) If you stop the vm then start the job then it won't make any
   progress.

This patch fixes b), but a) still happens.

Berto

Re: [Qemu-devel] [PATCH v3] block: Switch to host monotonic clock for IO throttling

2015-03-25 Thread Fam Zheng

On Wed, 03/25 08:16, Alberto Garcia wrote:
> On Wed, Mar 25, 2015 at 10:42:32AM +0800, Fam Zheng wrote:
> 
> > Block jobs are confusingly inconsistent between with and without
> > throttling: if user sets a bps limit, starts a block job, then stops
> > vm, the block job will not make any progress; in contrary, if user
> > unsets the bps limit, the block job will run normally.
> 
> Wait, that's not what happens:
> 
> a) If you start a job then stop the vm then the job will be
>completed immediately.
> b) If you stop the vm then start the job then it won't make any
>progress.
> 
> This patch fixes b), but a) still happens.
> 

Ah you're right, I put the order wrong. I'v just started looking at issue a).

I'll fix the text.

Fam

[Qemu-devel] [PATCH v4] block: Switch to host monotonic clock for IO throttling

2015-03-25 Thread Fam Zheng

Currently, throttle timers won't make any progress when VCPU is not
running, which would stall the request queue in utils, qtest, vm
suspending, and live migration, without special handling.

Block jobs are confusingly inconsistent between with and without
throttling: if user sets a bps limit, stops the vm, then start a block
job, the block job will not make any progress; in contrary, if user
unsets the bps limit, or if it's not set, the block job will run
normally.

After this patch, with the host clock, even if the VCPUs are stopped,
the throttle queues will be processed.

This patch also enables potential to add throttle to bdrv_drain_all.
Currently all requests are drained immediately. In other words whenever
it is called, IO throttling goes ineffective (examples: system reset,
migration and many block job operations.). This is a loophole that guest
could exploit. If we use the host clock, we can later just trust the
nested poll. This could be done on top.

Note that for qemu-iotests case 093, which uses qtest, we still keep vm
clock so the script can control the clock stepping in order to be
deterministic.

Reviewed-by: Paolo Bonzini 
Reviewed-by: Alberto Garcia 
Signed-off-by: Fam Zheng 

---
v4: Fix the description. [Alberto]
v3: More justification in commit message. [Stefan]
Add Paolo's and Alberto's rev-bys.
v2: Don't break qemu-iotests 093.
---
 block.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 0fe97de..89a1d5b 100644
--- a/block.c
+++ b/block.c
@@ -30,6 +30,7 @@
 #include "qapi/qmp/qjson.h"
 #include "sysemu/block-backend.h"
 #include "sysemu/sysemu.h"
+#include "sysemu/qtest.h"
 #include "qemu/notify.h"
 #include "block/coroutine.h"
 #include "block/qapi.h"
@@ -181,10 +182,16 @@ static void bdrv_throttle_write_timer_cb(void *opaque)
 /* should be called before bdrv_set_io_limits if a limit is set */
 void bdrv_io_limits_enable(BlockDriverState *bs)
 {
+int clock_type = QEMU_CLOCK_REALTIME;
+
+if (qtest_enabled()) {
+/* For testing block IO throttling only */
+clock_type = QEMU_CLOCK_VIRTUAL;
+}
 assert(!bs->io_limits_enabled);
 throttle_init(&bs->throttle_state,
   bdrv_get_aio_context(bs),
-  QEMU_CLOCK_VIRTUAL,
+  clock_type,
   bdrv_throttle_read_timer_cb,
   bdrv_throttle_write_timer_cb,
   bs);
-- 
1.9.3

[Qemu-devel] [Bug 1385934] Re: USB with passthrougth guest cannot enumerate USB host

2015-03-25 Thread montjoie

With qemu 2.2.0 I see now the usb dongle but using it give me some problem seen 
on the host:
libusb: error [submit_control_transfer] submiturb failed error -1 errno=22
Disabling caps and seccomp change nothing.
I strace the process and I see:
26170 ioctl(20, USBDEVFS_SUBMITURB, 0x7f00c0539eb0) = -1 EINVAL (Invalid 
argument)
26170 write(2, "libusb: error [submit_control_transfer] submiturb failed error 
-1 errno=22\n", 75) = 75

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1385934

Title:
  USB with passthrougth guest cannot enumerate USB host

Status in QEMU:
  New

Bug description:
  Following the guide at 
http://www.linux-kvm.org/page/USB_Host_Device_Assigned_to_Guest
  Qemu is launched with qemu-system-x86_64 /dev/vgstripe/kvm_wifi -enable-kvm 
-m 512 -k fr -net nic -net tap,ifname=tap1,script=/bin/ifup.script -kernel 
/usr/src/linux_git/arch/x86_64/boot/bzImage -append root=/dev/sda -usb -device 
usb-host,hostbus=1,hostaddr=6
  The USB device does not show and USB stack seems not working
  On the guest:
  dmesg |grep -i usb
  [1.416966] hub 1-0:1.0: USB hub found
  [1.420431] usbcore: registered new interface driver usb-storage
  [1.445374] usbcore: registered new interface driver usbhid
  [1.446839] usbhid: USB HID core driver
  [1.863226] usb 1-1: new low-speed USB device number 2 using uhci_hcd
  [2.126173] usb 1-1: Invalid ep0 maxpacket: 64
  [2.373161] usb 1-1: new low-speed USB device number 3 using uhci_hcd
  [2.648112] usb 1-1: Invalid ep0 maxpacket: 64
  [2.892404] usb 1-1: new low-speed USB device number 4 using uhci_hcd
  [2.913001] usb 1-1: Invalid ep0 maxpacket: 64
  [3.161367] usb 1-1: new low-speed USB device number 5 using uhci_hcd
  [3.180070] usb 1-1: Invalid ep0 maxpacket: 64
  [3.181633] usb usb1-port1: unable to enumerate USB device
  lsusb
  Bus 001 Device 001: ID 1d6b:0001 Linux Foundation 1.1 root hub

  On the host:
  lsusb
  Bus 001 Device 006: ID 0457:0163 Silicon Integrated Systems Corp. SiS163U 
802.11 Wireless LAN Adapter
  qemu-system-x86_64 --version
  QEMU emulator version 2.1.2, Copyright (c) 2003-2008 Fabrice Bellard

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1385934/+subscriptions

Re: [Qemu-devel] [RFC PATCH v2 05/23] spapr: Reorganize CPU dt generation code

2015-03-25 Thread Bharata B Rao

On Wed, Mar 25, 2015 at 12:36:38PM +1100, David Gibson wrote:
> On Mon, Mar 23, 2015 at 07:05:46PM +0530, Bharata B Rao wrote:
> > Reorganize CPU device tree generation code so that it be reused from
> > hotplug path. CPU dt entries are now generated from spapr_finalize_fdt()
> > instead of spapr_create_fdt_skel().
> > 
> > Signed-off-by: Bharata B Rao 
> > ---
> >  hw/ppc/spapr.c | 288 
> > ++---
> >  1 file changed, 154 insertions(+), 134 deletions(-)
> > 
> > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > index 36ff754..1a25cc0 100644
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -206,24 +206,50 @@ static int spapr_fixup_cpu_smt_dt(void *fdt, int 
> > offset, PowerPCCPU *cpu,
> >  return ret;
> >  }
> >  
> > +static int spapr_fixup_cpu_numa_smt_dt(void *fdt, int offset, CPUState *cs,
> > +sPAPREnvironment *spapr)
> > +{
> > +int ret;
> > +PowerPCCPU *cpu = POWERPC_CPU(cs);
> > +int index = ppc_get_vcpu_dt_id(cpu);
> > +uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
> > +uint32_t associativity[] = {cpu_to_be32(0x5),
> > +cpu_to_be32(0x0),
> > +cpu_to_be32(0x0),
> > +cpu_to_be32(0x0),
> > +cpu_to_be32(cs->numa_node),
> > +cpu_to_be32(index)};
> > +
> > +/* Advertise NUMA via ibm,associativity */
> > +if (nb_numa_nodes > 1) {
> > +ret = fdt_setprop(fdt, offset, "ibm,associativity", associativity,
> > +  sizeof(associativity));
> > +if (ret < 0) {
> > +return ret;
> > +}
> > +}
> > +
> > +ret = fdt_setprop(fdt, offset, "ibm,pft-size",
> > +  pft_size_prop, sizeof(pft_size_prop));
> > +if (ret < 0) {
> > +return ret;
> > +}
> 
> The pft-size property isn't actually related to NUMA, so it doesn't
> really belong in this function.

Right, let me find an appropriate place to set this.

> 
> > +return spapr_fixup_cpu_smt_dt(fdt, offset, cpu,
> > + ppc_get_compat_smt_threads(cpu));
> 
> Likewise calling the smt fixup function from the numa fixup function
> just seems odd to me; just be explicit and call the two sequentially.

It's just poor naming, I meant numa-and-smt. So essentially it is
fixing up NUMA and SMT related properties.

> 
> Overall this seems an odd way to split things.  Why not just make a
> spapr_fixup_one_cpu_dt() function, or similar, which should do all the
> necessary pieces.

Just one routine to setup everything related to CPU DT will not work
because some parts (NUMA and SMT bits) can be potentially fixed up
later as part of ibm,client-architecture-support call. This is the
reason why I have split up the reorg in this manner. Anyway will take
another stab at this and see if I can improve this.

> 
> > +}
> > +
> >  static int spapr_fixup_cpu_dt(void *fdt, sPAPREnvironment *spapr)
> >  {
> >  int ret = 0, offset, cpus_offset;
> >  CPUState *cs;
> >  char cpu_model[32];
> >  int smt = kvmppc_smt_threads();
> > -uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
> >  
> >  CPU_FOREACH(cs) {
> >  PowerPCCPU *cpu = POWERPC_CPU(cs);
> >  DeviceClass *dc = DEVICE_GET_CLASS(cs);
> >  int index = ppc_get_vcpu_dt_id(cpu);
> > -uint32_t associativity[] = {cpu_to_be32(0x5),
> > -cpu_to_be32(0x0),
> > -cpu_to_be32(0x0),
> > -cpu_to_be32(0x0),
> > -cpu_to_be32(cs->numa_node),
> > -cpu_to_be32(index)};
> >  
> >  if ((index % smt) != 0) {
> >  continue;
> > @@ -247,22 +273,7 @@ static int spapr_fixup_cpu_dt(void *fdt, 
> > sPAPREnvironment *spapr)
> >  }
> >  }
> >  
> > -if (nb_numa_nodes > 1) {
> > -ret = fdt_setprop(fdt, offset, "ibm,associativity", 
> > associativity,
> > -  sizeof(associativity));
> > -if (ret < 0) {
> > -return ret;
> > -}
> > -}
> > -
> > -ret = fdt_setprop(fdt, offset, "ibm,pft-size",
> > -  pft_size_prop, sizeof(pft_size_prop));
> > -if (ret < 0) {
> > -return ret;
> > -}
> > -
> > -ret = spapr_fixup_cpu_smt_dt(fdt, offset, cpu,
> > - ppc_get_compat_smt_threads(cpu));
> > +ret = spapr_fixup_cpu_numa_smt_dt(fdt, offset, cs, spapr);
> >  if (ret < 0) {
> >  return ret;
> >  }
> > @@ -341,18 +352,13 @@ static void *spapr_create_fdt_skel(hwaddr initrd_base,
> > uint32_t epow_irq)
> >  {

Re: [Qemu-devel] [PATCH] target-tricore: Fix check which was always false

2015-03-25 Thread Bastian Koppelmann


Hi mjt,

On 03/21/2015 02:44 PM, Stefan Weil wrote:

With a mask value of 0x0040, the result will never be 1.
This fixes a Coverity warning.

Signed-off-by: Stefan Weil 
---
  target-tricore/op_helper.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target-tricore/op_helper.c b/target-tricore/op_helper.c
index 0e7b4e8..b7d18b2 100644
--- a/target-tricore/op_helper.c
+++ b/target-tricore/op_helper.c
@@ -2593,7 +2593,7 @@ void helper_rslcx(CPUTriCoreState *env)
  /* CSU trap */
  }
  /* if (PCXI.UL == 1) then trap(CTYP); */
-if ((env->PCXI & MASK_PCXI_UL) == 1) {
+if ((env->PCXI & MASK_PCXI_UL) != 0) {
  /* CTYP trap */
  }
  /* EA = {PCXI.PCXS, 6'b0, PCXI.PCXO, 6'b0}; */


are you picking this one up? It looks good to me.

Cheers,
Bastian

[Qemu-devel] AioContext of block jobs

2015-03-25 Thread Fam Zheng

I was looking at block jobs' AioContext and realized that the block job
coroutines are actually started in main loop.

I'm confused because 5a7e7a0bad17c96e03f55ed7019e2d7545e21a96 and friends in
the series [1] seem to move the coroutines to the BDS's iothreads, but it
didn't do that.

(Although after the first block_job_yield or sleep, the coroutines ARE resumed
in the right AioContext.)

Why is it safe to start the jobs from the main thread where QMP command is
handled? I see no guarantee that the jobs won't access BDS before first yield
but after releasing the AioContext.

Is this a bug?

[1]: http://lists.gnu.org/archive/html/qemu-devel/2014-10/msg02191.html

Thanks,
Fam

Re: [Qemu-devel] [RFC PATCH v2 11/23] ppc: Create sockets and cores for CPUs

2015-03-25 Thread Bharata B Rao

On Wed, Mar 25, 2015 at 01:39:02PM +1100, David Gibson wrote:
> On Mon, Mar 23, 2015 at 07:05:52PM +0530, Bharata B Rao wrote:
> > ppc machine init functions create individual CPU threads. Change this
> > for sPAPR by switching to socket creation. CPUs are created recursively
> > by socket and core instance init routines.
> > 
> > TODO: Switching to socket level CPU creation is done only for sPAPR
> > target now.
> > 
> > Signed-off-by: Bharata B Rao 
> > ---
> >  hw/ppc/cpu-core.c   | 17 +
> >  hw/ppc/cpu-socket.c | 15 +++
> >  hw/ppc/spapr.c  | 15 ---
> >  target-ppc/cpu.h|  1 +
> >  target-ppc/translate_init.c | 46 
> > +
> >  5 files changed, 87 insertions(+), 7 deletions(-)
> > 
> > diff --git a/hw/ppc/cpu-core.c b/hw/ppc/cpu-core.c
> > index ed0481f..f60646d 100644
> > --- a/hw/ppc/cpu-core.c
> > +++ b/hw/ppc/cpu-core.c
> > @@ -7,6 +7,8 @@
> >  
> >  #include "hw/qdev.h"
> >  #include "hw/ppc/cpu-core.h"
> > +#include "hw/boards.h"
> > +#include 
> >  
> >  static int ppc_cpu_core_realize_child(Object *child, void *opaque)
> >  {
> > @@ -32,10 +34,25 @@ static void ppc_cpu_core_class_init(ObjectClass *oc, 
> > void *data)
> >  dc->realize = ppc_cpu_core_realize;
> >  }
> >  
> > +static void ppc_cpu_core_instance_init(Object *obj)
> > +{
> > +int i;
> > +PowerPCCPU *cpu = NULL;
> > +MachineState *machine = MACHINE(qdev_get_machine());
> > +
> > +for (i = 0; i < smp_threads; i++) {
> > +cpu = POWERPC_CPU(cpu_ppc_create(TYPE_POWERPC_CPU, 
> > machine->cpu_model));
> > +object_property_add_child(obj, "thread[*]", OBJECT(cpu), 
> > &error_abort);
> > +object_unref(OBJECT(cpu));
> > +}
> > +}
> > +
> >  static const TypeInfo ppc_cpu_core_type_info = {
> >  .name = TYPE_POWERPC_CPU_CORE,
> >  .parent = TYPE_DEVICE,
> >  .class_init = ppc_cpu_core_class_init,
> > +.instance_init = ppc_cpu_core_instance_init,
> > +.instance_size = sizeof(PowerPCCPUCore),
> 
> The PowerPCCPUCore structure isn't defined in this patch (I assume it
> already existed), which suggests that setting the instance_size should
> have already been in an earlier patch.

PowerPCCPUCore is already defined, but I put the instance_size here
since I needed instance_init only here. I thought it is better to
have instance_init and instance_size populated together.

> 
> >  };
> >  
> >  static void ppc_cpu_core_register_types(void)
> > diff --git a/hw/ppc/cpu-socket.c b/hw/ppc/cpu-socket.c
> > index 602a060..f901336 100644
> > --- a/hw/ppc/cpu-socket.c
> > +++ b/hw/ppc/cpu-socket.c
> > @@ -8,6 +8,7 @@
> >  #include "hw/qdev.h"
> >  #include "hw/ppc/cpu-socket.h"
> >  #include "sysemu/cpus.h"
> > +#include "cpu.h"
> >  
> >  static int ppc_cpu_socket_realize_child(Object *child, void *opaque)
> >  {
> > @@ -33,10 +34,24 @@ static void ppc_cpu_socket_class_init(ObjectClass *oc, 
> > void *data)
> >  dc->realize = ppc_cpu_socket_realize;
> >  }
> >  
> > +static void ppc_cpu_socket_instance_init(Object *obj)
> > +{
> > +int i;
> > +Object *core;
> > +
> > +for (i = 0; i < smp_cores; i++) {
> > +core = object_new(TYPE_POWERPC_CPU_CORE);
> > +object_property_add_child(obj, "core[*]", core, &error_abort);
> > +object_unref(core);
> > +}
> > +}
> > +
> >  static const TypeInfo ppc_cpu_socket_type_info = {
> >  .name = TYPE_POWERPC_CPU_SOCKET,
> >  .parent = TYPE_CPU_SOCKET,
> >  .class_init = ppc_cpu_socket_class_init,
> > +.instance_init = ppc_cpu_socket_instance_init,
> > +.instance_size = sizeof(PowerPCCPUSocket),
> 
> Likewise for PowerPCCPUSocket.
> >  
> > +/*
> > + * This is essentially same as cpu_generic_init() but without a set
> > + * realize call.
> > + */
> 
> In which case it would probably make more sense to have this be a
> generic function, and implement cpu_generic_init() in terms of it.

Actually multiple people are touching that part of the code, I so I
figured it will be a bit easier for now to contain the changes within ppc.
But yes, eventually we should do what you are suggesting.

Re: [Qemu-devel] [RFC PATCH v2 12/23] spapr: CPU hotplug support

2015-03-25 Thread Bharata B Rao

On Wed, Mar 25, 2015 at 02:03:45PM +1100, David Gibson wrote:
> On Mon, Mar 23, 2015 at 07:05:53PM +0530, Bharata B Rao wrote:
> > Support CPU hotplug via device-add command. Set up device tree
> > entries for the hotplugged CPU core and use the exising EPOW event
> > infrastructure to send CPU hotplug notification to the guest.
> > 
> > Signed-off-by: Bharata B Rao 
> > ---
> >  hw/ppc/spapr.c| 75 
> > +++
> >  hw/ppc/spapr_events.c |  8 +++---
> >  hw/ppc/spapr_rtas.c   | 11 
> >  3 files changed, 91 insertions(+), 3 deletions(-)
> > 
> > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > index f52d38f..b48994b 100644
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -33,6 +33,7 @@
> >  #include "sysemu/block-backend.h"
> >  #include "sysemu/cpus.h"
> >  #include "sysemu/kvm.h"
> > +#include "sysemu/device_tree.h"
> >  #include "kvm_ppc.h"
> >  #include "mmu-hash64.h"
> >  #include "qom/cpu.h"
> > @@ -660,6 +661,10 @@ static void spapr_populate_cpu_dt(CPUState *cs, void 
> > *fdt, int offset)
> >  QemuOpts *opts = qemu_opts_find(qemu_find_opts("smp-opts"), NULL);
> >  unsigned sockets = opts ? qemu_opt_get_number(opts, "sockets", 0) : 0;
> >  uint32_t cpus_per_socket = sockets ? (smp_cpus / sockets) : 1;
> > +sPAPRDRConnector *drc =
> > +spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_CPU, index);
> > +sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> > +int drc_index = drck->get_index(drc);
> >  
> >  _FDT((fdt_setprop_cell(fdt, offset, "reg", index)));
> >  _FDT((fdt_setprop_string(fdt, offset, "device_type", "cpu")));
> > @@ -728,6 +733,8 @@ static void spapr_populate_cpu_dt(CPUState *cs, void 
> > *fdt, int offset)
> >  
> >  _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id",
> >  cs->cpu_index / cpus_per_socket)));
> > +_FDT((fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index)));
> > +
> 
> What effect will this have when running with DR disabled?

I realize now that I shouldn't probably populate this at all in that case.
This routine is now common for both bootpath and hotplug path. Will take
care of this in next post.

> 
> >  _FDT(spapr_fixup_cpu_numa_smt_dt(fdt, offset, cs, spapr));
> >  }
> > @@ -1840,6 +1847,70 @@ static void spapr_nmi(NMIState *n, int cpu_index, 
> > Error **errp)
> >  }
> >  }
> >  
> > +static void spapr_cpu_hotplug_add(DeviceState *dev, CPUState *cs, Error 
> > **errp)
> > +{
> > +PowerPCCPU *cpu = POWERPC_CPU(cs);
> > +DeviceClass *dc = DEVICE_GET_CLASS(cs);
> > +int id = ppc_get_vcpu_dt_id(cpu);
> > +sPAPRDRConnector *drc =
> > +spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_CPU, id);
> > +sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
> > +void *fdt;
> > +int offset, i, fdt_size;
> > +char *nodename;
> > +
> > +fdt = create_device_tree(&fdt_size);
> > +nodename = g_strdup_printf("%s@%x", dc->fw_name, id);
> > +offset = fdt_add_subnode(fdt, 0, nodename);
> > +
> > +/* Set NUMA node for the added CPU core */
> > +for (i = 0; i < nb_numa_nodes; i++) {
> > +if (test_bit(cs->cpu_index, numa_info[i].node_cpu)) {
> > +cs->numa_node = i;
> > +break;
> > +}
> > +}
> > +
> > +spapr_populate_cpu_dt(cs, fdt, offset);
> > +g_free(nodename);
> > +
> > +drck->attach(drc, dev, fdt, offset, !dev->hotplugged, errp);
> > +if (*errp) {
> > +g_free(fdt);
> > +}
> > +}
> > +
> > +static void spapr_cpu_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> > +Error **errp)
> > +{
> > +CPUState *cs = CPU(dev);
> > +PowerPCCPU *cpu = POWERPC_CPU(cs);
> > +int id = ppc_get_vcpu_dt_id(cpu);
> > +sPAPRDRConnector *drc =
> > +spapr_dr_connector_by_id(SPAPR_DR_CONNECTOR_TYPE_CPU, id);
> > +int smt = kvmppc_smt_threads();
> > +Error *local_err = NULL;
> > +
> > +/*
> > + * SMT threads return from here, only main thread (core) will
> > + * continue and signal hotplug event to the guest.
> > + */
> > +if ((id % smt) != 0) {
> > +return;
> > +}
> > +
> > +g_assert(drc);
> > +
> > +spapr_cpu_hotplug_add(dev, cs, &local_err);
> > +if (local_err) {
> > +error_propagate(errp, local_err);
> > +return;
> > +}
> > +spapr_hotplug_req_add_event(drc);
> > +
> > +return;
> > +}
> > +
> >  static void spapr_machine_device_plug(HotplugHandler *hotplug_dev,
> >DeviceState *dev, Error **errp)
> >  {
> > @@ -1848,6 +1919,10 @@ static void spapr_machine_device_plug(HotplugHandler 
> > *hotplug_dev,
> >  PowerPCCPU *cpu = POWERPC_CPU(cs);
> >  
> >  spapr_cpu_init(cpu);
> > +spapr_cpu_reset(cpu);
> > +if (dev->hotplugged && spapr->dr_cpu_enabled) {
> > +spapr_cpu_plug(hotplug_dev, dev

Re: [Qemu-devel] [RFC PATCH v2 14/23] cpus: Convert cpu_index into a bitmap

2015-03-25 Thread Bharata B Rao

On Wed, Mar 25, 2015 at 02:23:29PM +1100, David Gibson wrote:
> On Mon, Mar 23, 2015 at 07:05:55PM +0530, Bharata B Rao wrote:
> > Currently CPUState.cpu_index is monotonically increasing and a newly
> > created CPU always gets the next higher index. The next available
> > index is calculated by counting the existing number of CPUs. This is
> > fine as long as we only add CPUs, but there are architectures which
> > are starting to support CPU removal too. For an architecture like PowerPC
> > which derives its CPU identifier (device tree ID) from cpu_index, the
> > existing logic of generating cpu_index values causes problems.
> > 
> > With the currently proposed method of handling vCPU removal by parking
> > the vCPU fd in QEMU
> > (Ref: http://lists.gnu.org/archive/html/qemu-devel/2015-02/msg02604.html),
> > generating cpu_index this way will not work for PowerPC.
> > 
> > This patch changes the way cpu_index is handed out by maintaining
> > a bit map of the CPUs that tracks both addition and removal of CPUs.
> > 
> > Signed-off-by: Bharata B Rao 
> > ---
> >  exec.c| 37 ++---
> >  include/qom/cpu.h |  8 
> >  2 files changed, 42 insertions(+), 3 deletions(-)
> > 
> > diff --git a/exec.c b/exec.c
> > index e1ff6b0..9bbab02 100644
> > --- a/exec.c
> > +++ b/exec.c
> > @@ -527,21 +527,52 @@ void tcg_cpu_address_space_init(CPUState *cpu, 
> > AddressSpace *as)
> >  }
> >  #endif
> >  
> > +#ifndef CONFIG_USER_ONLY
> > +static DECLARE_BITMAP(cpu_index_map, MAX_CPUMASK_BITS);
> > +
> > +static int cpu_get_free_index(Error **errp)
> > +{
> > +int cpu = find_first_zero_bit(cpu_index_map, max_cpus);
> > +
> > +if (cpu == max_cpus) {
> > +error_setg(errp, "Trying to use more CPUs than allowed max of 
> > %d\n",
> > +max_cpus);
> > +return max_cpus;
> > +} else {
> > +bitmap_set(cpu_index_map, cpu, 1);
> > +return cpu;
> > +}
> > +}
> > +
> > +void cpu_exec_exit(CPUState *cpu)
> > +{
> > +bitmap_clear(cpu_index_map, cpu->cpu_index, 1);
> > +}
> 
> AFAICT, this function is never called, which seems like a bug.

It is called in subsequent patch. If you are suggesting that a function
shouldn't be defined in a patch where it not used, I can move this
down to the other patch.

Regards,
Bharata.

Re: [Qemu-devel] [RFC PATCH v2 15/23] ppc: Move cpu_exec_init() call to realize function

2015-03-25 Thread Bharata B Rao

On Wed, Mar 25, 2015 at 02:25:09PM +1100, David Gibson wrote:
> On Mon, Mar 23, 2015 at 07:05:56PM +0530, Bharata B Rao wrote:
> > Move cpu_exec_init() call from instance_init to realize. This allows
> > any failures from cpu_exec_init() to be handled appropriately.
> > 
> > Also add cpu_exec_exit() call from unrealize.
> 
> This still leaves all non-ppc archs not ever calling cpu_exec_exit().

Oops missed that. Will fix.

Regards,
Bharata.

Re: [Qemu-devel] [PATCH RFC for-2.3 1/1] block: New command line option --no-format-probing

2015-03-25 Thread Markus Armbruster

Paolo Bonzini  writes:

> On 24/03/2015 17:49, Markus Armbruster wrote:
>>> But what about migration from newer to older QEMU?  Libvirt even
>>> supports QEMU versions where the only way to specify disks is "-hda
>>> XYZ", so it is _impossible_ to honor the format=raw specifier.
>> 
>> If you migrate to a QEMU that doesn't understand the new option, libvirt
>> simply won't set it.  You lose the protection against libvirt bugs it
>> provides.  Guest won't notice.
>> 
>> If you somehow manage to find a QEMU old enough to make libvirt use
>> format-incapable interfaces, you'll be using insecure format probing on
>> the destination.  My patch makes no difference.  Good luck migrating to
>> such an old QEMU.
>
> (Didn't mean live migration---sorry, could have simply said "switch").
>
> Based on my reading of the code, libvirt will actually ignore the
> allow_disk_format_probing setting, and not do anything about the format
> when driving such an old QEMU.  By contrast, if you specify a format and
> libvirt invokes an old qemu-nbd without --format, libvirt fails hard.
> That's already CVE worthy, isn't it?
>
> So I think an option like this is premature.  libvirt should _first of
> all_ ensure that it completely abides by the allow_disk_format_probing
> setting, including refusing to drive old QEMUs when format probing is
> disabled.  Once libvirt is consistent within itself, we can talk about
> what help QEMU can provide.

If we had a "no format probing" option in qemu-nbd (not in my patch, but
that's a flaw in the patch, not the idea), then libvirt developers
would've put it to use right away, and then their insecure usage
would've failed hard, making it immediately obvious.

My point is: we don't have to perfect libvirt before we can provide it a
safe failure mode.  The safe failure mode will actually assist with
perfecting libvirt, by making the failures immediately fatal.

> Perfect is not the enemy of good here.  Good is the enemy of secure.
>
>> As to "near misses don't count": for me, what counts is actual users
>> telling me about their difficulties using QEMU securely.  Secure usage
>> shouldn't be hard.
>
> The right answer for them is probably "use libvirt" or "use Boxes"
> depending on the actual usecase.  Invoking QEMU manually is almost never
> the right answer for random untrusted images download from the Internet.

I agree secure usage is currently is too hard for casual command line
use.

Unfortunately, it has proven too hard even for sophisticated
programmatic users like libvirt.  I refuse to blame libvirt for that.
Secure usage really shouldn't be that hard.

>  Also, I suspect any advice to QEMU users about adding
> --no-format-probing would be quickly forgotten.

When we discussed security issues with probing last year, my first line
of defense was "the default should be secure".  Overwhelming opposition
from the backward compatibility camp forced me to retreat from that line
to my second line of defense "secure shouldn't be hard".  That line
needs to be held at all costs :)

"Always specify --no-format-probing" (or however else we label the knob)
is workable advice for programmatic users like libvirt.  I agree it's
hardly helpful advice for human users.  For them, the advice needs to be
something like "add the following stanza to ~/.qemu.conf, then forget
about it".

Same thing, different user interface.

~/.qemu.conf isn't currently used, but it could be.

> That said, if _humans_ have interest in secure use of QEMU, that's a
> much better and more interesting use case than libvirt's, because
> libvirt is itself providing a secure management layer.

Debating which use case is "better" and more interesting doesn't seem to
be wortwhile.  I think both are interesting enough.

> We have other security options, for example seccomp and FIPS mode.
> Format probing definitely falls in this category.  Let's add first of
> all a -security grouping, where "-security [all=]on" enables all of them
> but it's also possible to control the suboptions individually.  Then we
> can add format probing to this category.  The same options can be added
> to the utilities.

Putting it in a security option group is certainly fine with me.  I
picked "misc" only because I couldn't find a better fit, and I could see
other readconfig-incapable options where I couldn't find a better fit,
either.

> Let's iron out the kinks and do it for 2.4.  It's a very useful feature
> indeed.
>
> But it's something we do for _users_, not for libvirt.

For whom you want it to get done isn't as important to me as getting it
done :)

> But I still
> maintain that for libvirt this is basically security theater, and the
> priorities are others.

To be honest, I find your use of "security theater" offensive.

I'd readily accept the moniker if the feature would provide libvirt
developers an excuse not to fix their bugs, or reduce the incentives to
fix their bugs.

But the feature in

[Qemu-devel] [PATCH] nbd: Fix up comment after commit e140177

2015-03-25 Thread Markus Armbruster

Signed-off-by: Markus Armbruster 
---
 blockdev-nbd.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/blockdev-nbd.c b/blockdev-nbd.c
index b29e456..85cda4c 100644
--- a/blockdev-nbd.c
+++ b/blockdev-nbd.c
@@ -47,8 +47,9 @@ void qmp_nbd_server_start(SocketAddress *addr, Error **errp)
 }
 }
 
-/* Hook into the BlockDriverState notifiers to close the export when
- * the file is closed.
+/*
+ * Hook into the BlockBackend notifiers to close the export when the
+ * backend is closed.
  */
 typedef struct NBDCloseNotifier {
 Notifier n;
-- 
1.9.3

Re: [Qemu-devel] [RFC PATCH v2 17/23] xics_kvm: Don't enable KVM_CAP_IRQ_XICS if already enabled

2015-03-25 Thread Bharata B Rao

On Wed, Mar 25, 2015 at 04:24:39PM +1100, David Gibson wrote:
> On Mon, Mar 23, 2015 at 07:05:58PM +0530, Bharata B Rao wrote:
> > When supporting CPU hot removal by parking the vCPU fd and reusing
> > it during hotplug again, there can be cases where we try to reenable
> > KVM_CAP_IRQ_XICS CAP for the vCPU for which it was already enabled.
> > Introduce a boolean member in ICPState to track this and don't
> > reenable the CAP if it was already enabled earlier.
> > 
> > This change allows CPU hot removal to work for sPAPR.
> > 
> > Signed-off-by: Bharata B Rao 
> 
> Why does double enabling the capability cause problems?  I would have
> expected it to be unnecessary, but harmless.

We are reusing the vCPU here w/o closing its fd.

As things stand currently, enabling this cap again will result in
kernel trying to create and associate ICP with this vCPU and that
fails since there is already an ICP associated with it.

Ref: arch/powerpc/kvm/book3s_xics.c:kvmppc_xics_connect_vcpu() kernel code.

So this patch will ensure that we don't renable this cap.

Regards,
Bharata.

[Qemu-devel] [Migration Bug? ] Occasionally, the content of VM's memory is inconsistent between Source and Destination of migration

2015-03-25 Thread zhanghailiang


Hi all,

We found that, sometimes, the content of VM's memory is inconsistent between 
Source side and Destination side
when we check it just after finishing migration but before VM continue to Run.

We use a patch like bellow to find this issue, you can find it from affix,
and Steps to reproduce:

(1) Compile QEMU:
 ./configure --target-list=x86_64-softmmu  --extra-ldflags="-lssl" && make

(2) Command and output:
SRC: # x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu qemu64,-kvmclock 
-netdev tap,id=hn0-device virtio-net-pci,id=net-pci0,netdev=hn0 -boot c -drive 
file=/mnt/sdb/pure_IMG/sles/sles11_sp3.img,if=none,id=drive-virtio-disk0,cache=unsafe
 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -vnc 
:7 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio
(qemu) migrate tcp:192.168.3.8:3004
before saving ram complete
ff703f6889ab8701e4e040872d079a28
md_host : after saving ram complete
ff703f6889ab8701e4e040872d079a28

DST: # x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu qemu64,-kvmclock 
-netdev tap,id=hn0,vhost=on -device virtio-net-pci,id=net-pci0,netdev=hn0 -boot 
c -drive 
file=/mnt/sdb/pure_IMG/sles/sles11_sp3.img,if=none,id=drive-virtio-disk0,cache=unsafe
 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -vnc 
:7 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio 
-incoming tcp:0:3004
(qemu) QEMU_VM_SECTION_END, after loading ram
230e1e68ece9cd4e769630e1bcb5ddfb
md_host : after loading all vmstate
230e1e68ece9cd4e769630e1bcb5ddfb
md_host : after cpu_synchronize_all_post_init
230e1e68ece9cd4e769630e1bcb5ddfb

This happens occasionally, and it is more easy to reproduce when issue 
migration command during VM's startup time.

We have done further test and found that some pages has been dirtied but its 
corresponding migration_bitmap is not set.
We can't figure out which modules of QEMU has missed setting bitmap when dirty 
page of VM,
it is very difficult for us to trace all the actions of dirtying VM's pages.

Actually, the first time we found this problem was in the COLO FT development, 
and it triggered some strange issues in
VM which all pointed to the issue of inconsistent of VM's memory. (We have try 
to save all memory of VM to slave side every time
when do checkpoint in COLO FT, and everything will be OK.)

Is it OK for some pages that not transferred to destination when do migration ? 
Or is it a bug?

This issue has blocked our COLO development... :(

Any help will be greatly appreciated!

Thanks,
zhanghailiang

--- a/savevm.c
+++ b/savevm.c
@@ -51,6 +51,26 @@
 #define ARP_PTYPE_IP 0x0800
 #define ARP_OP_REQUEST_REV 0x3

+#include "qemu/rcu_queue.h"
+#include 
+
+static void check_host_md5(void)
+{
+int i;
+unsigned char md[MD5_DIGEST_LENGTH];
+MD5_CTX ctx;
+RAMBlock *block = QLIST_FIRST_RCU(&ram_list.blocks);/* Only check 'pc.ram' 
block */
+
+MD5_Init(&ctx);
+MD5_Update(&ctx, (void *)block->host, block->used_length);
+MD5_Final(md, &ctx);
+printf("md_host : ");
+for(i = 0; i < MD5_DIGEST_LENGTH; i++) {
+fprintf(stderr, "%02x", md[i]);
+}
+fprintf(stderr, "\n");
+}
+
 static int announce_self_create(uint8_t *buf,
 uint8_t *mac_addr)
 {
@@ -741,7 +761,13 @@ void qemu_savevm_state_complete(QEMUFile *f)
 qemu_put_byte(f, QEMU_VM_SECTION_END);
 qemu_put_be32(f, se->section_id);

+printf("before saving %s complete\n", se->idstr);
+check_host_md5();
+
 ret = se->ops->save_live_complete(f, se->opaque);
+printf("after saving %s complete\n", se->idstr);
+check_host_md5();
+
 trace_savevm_section_end(se->idstr, se->section_id, ret);
 if (ret < 0) {
 qemu_file_set_error(f, ret);
@@ -1030,6 +1063,11 @@ int qemu_loadvm_state(QEMUFile *f)
 }

 ret = vmstate_load(f, le->se, le->version_id);
+if (section_type == QEMU_VM_SECTION_END) {
+printf("QEMU_VM_SECTION_END, after loading %s\n", 
le->se->idstr);
+check_host_md5();
+}
+
 if (ret < 0) {
 error_report("error while loading state section id %d(%s)",
  section_id, le->se->idstr);
@@ -1061,7 +1099,11 @@ int qemu_loadvm_state(QEMUFile *f)
 g_free(buf);
 }

+printf("after loading all vmstate\n");
+check_host_md5();
 cpu_synchronize_all_post_init();
+printf("after cpu_synchronize_all_post_init\n");
+check_host_md5();

 ret = 0;

--
>From ecb789cf7f383b112da3cce33eb9822a94b9497a Mon Sep 17 00:00:00 2001
From: Li Zhijian 
Date: Tue, 24 Mar 2015 21:53:26 -0400
Subject: [PATCH] check pc.ram block md5sum between migration Source and
 Destination

Signed-off-by: Li Zhijian 
---
 savevm.c | 42 ++
 1 file changed, 42 insertions(+)
 mode change 100644 => 100755 savevm.c

diff --

[Qemu-devel] [RFC PATCH COLO v2 00/13] Block replication for continuous checkpoints

2015-03-25 Thread Wen Congyang

Block replication is a very important feature which is used for
continuous checkpoints(for example: COLO).

Usage:
Please refer to docs/block-replication.txt

You can get the patch here:
https://github.com/wencongyang/qemu-colo/commits/block-replication-v2

Changs Log:
V2:
1. Redesign the secondary qemu(use image-fleecing)
2. Use Error objects to return error message
3. Address the comments from Max Reitz and Eric Blake

Wen Congyang (13):
  docs: block replication's description
  quorum: allow ignoring child errors
  NBD client: connect to nbd server later
  Add new block driver interfaces to control block replication
  quorum: implement block driver interfaces for block replication
  NBD client: implement block driver interfaces for block replication
  allow writing to the backing file
  Allow creating backup jobs when opening BDS
  block: Parse "backing_reference" option to reference existing BDS
  Backup: clear all bitmap when doing block checkpoint
  qcow2: support colo
  skip nbd_target when starting block replication
  Don't allow a disk use backing reference target

 block.c| 242 +++-
 block/Makefile.objs|   2 +-
 block/backup.c |  12 ++
 block/nbd.c| 171 +++--
 block/qcow2.c  | 447 -
 block/qcow2.h  |   6 +
 block/quorum.c | 143 ++-
 docs/block-replication.txt | 147 +++
 include/block/block.h  |   5 +
 include/block/block_int.h  |  13 ++
 include/qemu/hbitmap.h |   8 +
 qapi/block.json|  16 ++
 tests/qemu-iotests/051 |  13 ++
 tests/qemu-iotests/051.out |  13 ++
 util/hbitmap.c |  19 ++
 15 files changed, 1230 insertions(+), 27 deletions(-)
 create mode 100644 docs/block-replication.txt

-- 
2.1.0

[Qemu-devel] [RFC PATCH COLO v2 01/13] docs: block replication's description

2015-03-25 Thread Wen Congyang

Signed-off-by: Wen Congyang 
Signed-off-by: Paolo Bonzini 
Signed-off-by: Yang Hongyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
---
 docs/block-replication.txt | 147 +
 1 file changed, 147 insertions(+)
 create mode 100644 docs/block-replication.txt

diff --git a/docs/block-replication.txt b/docs/block-replication.txt
new file mode 100644
index 000..874ed8e
--- /dev/null
+++ b/docs/block-replication.txt
@@ -0,0 +1,147 @@
+Block replication
+
+Copyright Fujitsu, Corp. 2015
+Copyright (c) 2015 Intel Corporation
+Copyright (c) 2015 HUAWEI TECHNOLOGIES CO.,LTD.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later.
+See the COPYING file in the top-level directory.
+
+The block replication is used for continuous checkpoints. It is designed
+for COLO that Secondary VM is running. It can also be applied for FT/HA
+scene that Secondary VM is not running.
+
+This document gives an overview of block replication's design.
+
+== Background ==
+High availability solutions such as micro checkpoint and COLO will do
+consecutive checkpoint. The VM state of Primary VM and Secondary VM is
+identical right after a VM checkpoint, but becomes different as the VM
+executes till the next checkpoint. To support disk contents checkpoint,
+the modified disk contents in the Secondary VM must be buffered, and are
+only dropped at next checkpoint time. To reduce the network transportation
+effort at the time of checkpoint, the disk modification operations of
+Primary disk are asynchronously forwarded to the Secondary node.
+
+== Workflow ==
+The following is the image of block replication workflow:
+
++--+++
+|Primary Write Requests||Secondary Write Requests|
++--+++
+  |   |
+  |  (4)
+  |   V
+  |  /-\
+  |  Copy and Forward| |
+  |-(1)--+   | Disk Buffer |
+  |  |   | |
+  | (3)  \-/
+  | speculative  ^
+  |write through(2)
+  |  |   |
+  V  V   |
+   +--+   ++
+   | Primary Disk |   | Secondary Disk |
+   +--+   ++
+
+1) Primary write requests will be copied and forwarded to Secondary
+   QEMU.
+2) Before Primary write requests are written to Secondary disk, the
+   original sector content will be read from Secondary disk and
+   buffered in the Disk buffer, but it will not overwrite the existing
+   sector content in the Disk buffer.
+3) Primary write requests will be written to Secondary disk.
+4) Secondary write requests will be buffered in the Disk buffer and it
+   will overwrite the existing sector content in the buffer.
+
+== Architecture ==
+We are going to implement COLO block replication from many basic
+blocks that are already in QEMU.
+
+ virtio-blk   ||
+ ^||.--
+ |||| Secondary
+1 Quorum  ||'--
+ /  \ ||
+/\||
+   Primary  2 NBD  --->  2 NBD
+ disk   client|| server
 virtio-blk
+  ||^  
  ^
+. |||  
  |
+Primary | ||  Secondary disk <- hidden-disk 4 
<- active-disk 3
+' |||  backing^   backing
+  ||| |
+  ||| |
+  ||'-'
+  ||   drive-backup sync=none
+
+1) The disk on the primary is represented by a block device with two
+children, providing replication between a primary disk and the host that
+runs the secondary VM. The read pattern for quorum can be extended to
+make the primary always read from the local disk instead of going through
+NBD.
+
+2) The secondary disk receives writes from the primary VM through QEMU's
+embedded NBD server (speculative write-through)

[Qemu-devel] [RFC PATCH COLO v2 08/13] Allow creating backup jobs when opening BDS

2015-03-25 Thread Wen Congyang

When opening BDS, we need to create backup jobs for
image-fleecing.

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Cc: Jeff Cody 
---
 block/Makefile.objs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/Makefile.objs b/block/Makefile.objs
index db2933e..0950973 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -21,10 +21,10 @@ block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
 block-obj-$(CONFIG_LIBSSH2) += ssh.o
 block-obj-y += accounting.o
 block-obj-y += write-threshold.o
+block-obj-y += backup.o
 
 common-obj-y += stream.o
 common-obj-y += commit.o
-common-obj-y += backup.o
 
 iscsi.o-cflags := $(LIBISCSI_CFLAGS)
 iscsi.o-libs   := $(LIBISCSI_LIBS)
-- 
2.1.0

[Qemu-devel] [RFC PATCH COLO v2 05/13] quorum: implement block driver interfaces for block replication

2015-03-25 Thread Wen Congyang

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
---
 block/quorum.c | 79 ++
 1 file changed, 79 insertions(+)

diff --git a/block/quorum.c b/block/quorum.c
index d087135..eee829d 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -84,6 +84,8 @@ typedef struct BDRVQuorumState {
 */
 
 QuorumReadPattern read_pattern;
+
+int colo_index; /* store which child supports block replication */
 } BDRVQuorumState;
 
 typedef struct QuorumAIOCB QuorumAIOCB;
@@ -1024,6 +1026,7 @@ static int quorum_open(BlockDriverState *bs, QDict 
*options, int flags,
 }
 
 g_free(opened);
+s->colo_index = -1;
 goto exit;
 
 close_exit:
@@ -1114,6 +1117,78 @@ static void quorum_refresh_filename(BlockDriverState *bs)
 bs->full_open_options = opts;
 }
 
+static void quorum_stop_replication(BlockDriverState *bs, Error **errp);
+static void quorum_start_replication(BlockDriverState *bs, COLOMode mode,
+ Error **errp)
+{
+BDRVQuorumState *s = bs->opaque;
+int count = 0, i, index;
+Error *local_err = NULL;
+
+/*
+ * TODO: support COLO_SECONDARY_MODE if we allow secondary
+ * QEMU becoming primary QEMU.
+ */
+if (mode != COLO_MODE_PRIMARY) {
+error_set(errp, QERR_INVALID_PARAMETER, "mode");
+return;
+}
+
+if (s->read_pattern != QUORUM_READ_PATTERN_FIFO) {
+error_set(errp, QERR_INVALID_PARAMETER, "read pattern");
+return;
+}
+
+for (i = 0; i < s->num_children; i++) {
+bdrv_start_replication(s->bs[i], mode, &local_err);
+if (local_err) {
+error_free(local_err);
+local_err = NULL;
+} else {
+count++;
+index = i;
+}
+}
+
+if (count == 0) {
+/* No child supports block replication */
+error_set(errp, QERR_UNSUPPORTED);
+} else if (count > 1) {
+quorum_stop_replication(bs, NULL);
+error_setg(errp, "too many children support block replication");
+} else {
+s->colo_index = index;
+}
+}
+
+static void quorum_do_checkpoint(BlockDriverState *bs, Error **errp)
+{
+BDRVQuorumState *s = bs->opaque;
+
+if (s->colo_index < 0) {
+error_setg(errp, "Block replication is not started");
+return;
+}
+
+bdrv_do_checkpoint(s->bs[s->colo_index], errp);
+}
+
+static void quorum_stop_replication(BlockDriverState *bs, Error **errp)
+{
+BDRVQuorumState *s = bs->opaque;
+int i;
+
+if (s->colo_index >= 0) {
+bdrv_stop_replication(s->bs[s->colo_index], errp);
+s->colo_index = -1;
+return;
+}
+
+for (i = 0; i < s->num_children; i++) {
+bdrv_stop_replication(s->bs[i], NULL);
+}
+}
+
 static BlockDriver bdrv_quorum = {
 .format_name= "quorum",
 .protocol_name  = "quorum",
@@ -1137,6 +1212,10 @@ static BlockDriver bdrv_quorum = {
 
 .is_filter  = true,
 .bdrv_recurse_is_first_non_filter   = quorum_recurse_is_first_non_filter,
+
+.bdrv_start_replication = quorum_start_replication,
+.bdrv_do_checkpoint = quorum_do_checkpoint,
+.bdrv_stop_replication  = quorum_stop_replication,
 };
 
 static void bdrv_quorum_init(void)
-- 
2.1.0

[Qemu-devel] [RFC PATCH COLO v2 02/13] quorum: allow ignoring child errors

2015-03-25 Thread Wen Congyang

If the child is not ready, read/write/getlength/flush will
return -errno. It is not critical error, and can be ignored:
1. read/write:
   Just not report the error event.
2. getlength:
   just ignore it. If all children's getlength return -errno,
   and be ignored, return -EIO.
3. flush:
   Just ignore it. If all children's getlength return -errno,
   and be ignored, return 0.

Usage: children.x.ignore-errors=true

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
---
 block/quorum.c | 64 +++---
 1 file changed, 57 insertions(+), 7 deletions(-)

diff --git a/block/quorum.c b/block/quorum.c
index 437b122..d087135 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -30,6 +30,7 @@
 #define QUORUM_OPT_BLKVERIFY  "blkverify"
 #define QUORUM_OPT_REWRITE"rewrite-corrupted"
 #define QUORUM_OPT_READ_PATTERN   "read-pattern"
+#define QUORUM_CHILDREN_OPT_IGNORE_ERRORS   "ignore-errors"
 
 /* This union holds a vote hash value */
 typedef union QuorumVoteValue {
@@ -65,6 +66,7 @@ typedef struct QuorumVotes {
 /* the following structure holds the state of one quorum instance */
 typedef struct BDRVQuorumState {
 BlockDriverState **bs; /* children BlockDriverStates */
+bool *ignore_errors;   /* ignore children's error? */
 int num_children;  /* children count */
 int threshold; /* if less than threshold children reads gave the
 * same result a quorum error occurs.
@@ -97,6 +99,7 @@ typedef struct QuorumChildRequest {
 uint8_t *buf;
 int ret;
 QuorumAIOCB *parent;
+int index;
 } QuorumChildRequest;
 
 /* Quorum will use the following structure to track progress of each read/write
@@ -209,6 +212,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
 acb->qcrs[i].buf = NULL;
 acb->qcrs[i].ret = 0;
 acb->qcrs[i].parent = acb;
+acb->qcrs[i].index = i;
 }
 
 return acb;
@@ -305,7 +309,7 @@ static void quorum_aio_cb(void *opaque, int ret)
 acb->count++;
 if (ret == 0) {
 acb->success_count++;
-} else {
+} else if (!s->ignore_errors[sacb->index]) {
 quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret);
 }
 assert(acb->count <= s->num_children);
@@ -720,19 +724,31 @@ static BlockAIOCB *quorum_aio_writev(BlockDriverState *bs,
 static int64_t quorum_getlength(BlockDriverState *bs)
 {
 BDRVQuorumState *s = bs->opaque;
-int64_t result;
+int64_t result = -EIO;
 int i;
 
 /* check that all file have the same length */
-result = bdrv_getlength(s->bs[0]);
-if (result < 0) {
-return result;
-}
-for (i = 1; i < s->num_children; i++) {
+for (i = 0; i < s->num_children; i++) {
 int64_t value = bdrv_getlength(s->bs[i]);
+
 if (value < 0) {
 return value;
 }
+
+if (value == 0 && s->ignore_errors[i]) {
+/*
+ * If the child is not ready, it cannot return -errno,
+ * otherwise refresh_total_sectors() will fail when
+ * we open the child.
+ */
+continue;
+}
+
+if (result == -EIO) {
+result = value;
+continue;
+}
+
 if (value != result) {
 return -EIO;
 }
@@ -770,6 +786,9 @@ static coroutine_fn int quorum_co_flush(BlockDriverState 
*bs)
 
 for (i = 0; i < s->num_children; i++) {
 result = bdrv_co_flush(s->bs[i]);
+if (result < 0 && s->ignore_errors[i]) {
+result = 0;
+}
 result_value.l = result;
 quorum_count_vote(&error_votes, &result_value, i);
 }
@@ -844,6 +863,19 @@ static QemuOptsList quorum_runtime_opts = {
 },
 };
 
+static QemuOptsList quorum_children_common_opts = {
+.name = "qcow2 children",
+.head = QTAILQ_HEAD_INITIALIZER(quorum_children_common_opts.head),
+.desc = {
+{
+.name = QUORUM_CHILDREN_OPT_IGNORE_ERRORS,
+.type = QEMU_OPT_BOOL,
+.help = "ignore child I/O error",
+},
+{ /* end of list */ }
+},
+};
+
 static int parse_read_pattern(const char *opt)
 {
 int i;
@@ -939,11 +971,14 @@ static int quorum_open(BlockDriverState *bs, QDict 
*options, int flags,
 /* allocate the children BlockDriverState array */
 s->bs = g_new0(BlockDriverState *, s->num_children);
 opened = g_new0(bool, s->num_children);
+s->ignore_errors = g_new0(bool, s->num_children);
 
 for (i = 0, lentry = qlist_first(list); lentry;
  lentry = qlist_next(lentry), i++) {
 QDict *d;
 QString *string;
+QemuOpts *children_opts = NULL;
+bool value;
 
 switch (qobject_type(lentry->value))
 {
@@ -951,6 +986,20 @@ static int quorum_open(BlockDriverState *bs, QDict 
*options, int flags,
 case QTYPE_QDICT:
 d = qobject_to_qdict(lentry->value);

[Qemu-devel] [RFC PATCH COLO v2 06/13] NBD client: implement block driver interfaces for block replication

2015-03-25 Thread Wen Congyang

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
---
 block/nbd.c | 49 +
 1 file changed, 49 insertions(+)

diff --git a/block/nbd.c b/block/nbd.c
index 3faf865..753b322 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -458,6 +458,52 @@ static void nbd_refresh_filename(BlockDriverState *bs)
 bs->full_open_options = opts;
 }
 
+static void nbd_start_replication(BlockDriverState *bs, COLOMode mode,
+  Error **errp)
+{
+BDRVNBDState *s = bs->opaque;
+
+/*
+ * TODO: support COLO_SECONDARY_MODE if we allow secondary
+ * QEMU becoming primary QEMU.
+ */
+if (mode != COLO_MODE_PRIMARY) {
+error_set(errp, QERR_INVALID_PARAMETER, "mode");
+return;
+}
+
+if (s->connected) {
+error_setg(errp, "The connection is established");
+return;
+}
+
+/* TODO: NBD client should be one child of quorum, how to verify it? */
+nbd_connect_server(bs, errp);
+}
+
+static void nbd_do_checkpoint(BlockDriverState *bs, Error **errp)
+{
+BDRVNBDState *s = bs->opaque;
+
+if (!s->connected) {
+error_setg(errp, "The connection is not established");
+return;
+}
+}
+
+static void nbd_stop_replication(BlockDriverState *bs, Error **errp)
+{
+BDRVNBDState *s = bs->opaque;
+
+if (!s->connected) {
+error_setg(errp, "The connection is not established");
+return;
+}
+
+nbd_client_close(bs);
+s->connected = false;
+}
+
 static BlockDriver bdrv_nbd = {
 .format_name= "nbd",
 .protocol_name  = "nbd",
@@ -527,6 +573,9 @@ static BlockDriver bdrv_nbd_colo = {
 .bdrv_detach_aio_context= nbd_detach_aio_context,
 .bdrv_attach_aio_context= nbd_attach_aio_context,
 .bdrv_refresh_filename  = nbd_refresh_filename,
+.bdrv_start_replication = nbd_start_replication,
+.bdrv_do_checkpoint = nbd_do_checkpoint,
+.bdrv_stop_replication  = nbd_stop_replication,
 
 .has_variable_length= true,
 };
-- 
2.1.0

[Qemu-devel] [RFC PATCH COLO v2 12/13] skip nbd_target when starting block replication

2015-03-25 Thread Wen Congyang

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
---
 block.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/block.c b/block.c
index bd7fa9c..3af5ad4 100644
--- a/block.c
+++ b/block.c
@@ -6368,6 +6368,12 @@ BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
 void bdrv_start_replication(BlockDriverState *bs, COLOMode mode, Error **errp)
 {
 BlockDriver *drv = bs->drv;
+Error *local_err = NULL;
+
+if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKING_REFERENCE, &local_err)) {
+error_free(local_err);
+return;
+}
 
 if (drv && drv->bdrv_start_replication) {
 drv->bdrv_start_replication(bs, mode, errp);
@@ -6381,6 +6387,12 @@ void bdrv_start_replication(BlockDriverState *bs, 
COLOMode mode, Error **errp)
 void bdrv_do_checkpoint(BlockDriverState *bs, Error **errp)
 {
 BlockDriver *drv = bs->drv;
+Error *local_err = NULL;
+
+if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKING_REFERENCE, &local_err)) {
+error_free(local_err);
+return;
+}
 
 if (drv && drv->bdrv_do_checkpoint) {
 drv->bdrv_do_checkpoint(bs, errp);
@@ -6394,6 +6406,12 @@ void bdrv_do_checkpoint(BlockDriverState *bs, Error 
**errp)
 void bdrv_stop_replication(BlockDriverState *bs, Error **errp)
 {
 BlockDriver *drv = bs->drv;
+Error *local_err = NULL;
+
+if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKING_REFERENCE, &local_err)) {
+error_free(local_err);
+return;
+}
 
 if (drv && drv->bdrv_stop_replication) {
 drv->bdrv_stop_replication(bs, errp);
-- 
2.1.0

[Qemu-devel] [RFC PATCH COLO v2 09/13] block: Parse "backing_reference" option to reference existing BDS

2015-03-25 Thread Wen Congyang

Usage:
-drive file=xxx,id=Y, \
-drive 
file=,id=X,backing_reference.drive_id=Y,backing_reference.hidden-disk.*

It will create such backing chain:
   {virtio-blk dev 'Y'}  
{virtio-blk dev 'X'}
 |  
|
 |  
|
 v  
v

[base] <- [mid] <- ( Y )  <- (hidden target) 
<--- ( X )

 v  ^
 v  ^
 v  ^
 v  ^
  drive-backup sync=none 

X's backing file is hidden-disk, and hidden-disk's backing file is Y.
Disk Y may be opened or reopened in read-write mode, so A block backup
job is automatically created: source is Y and target is hidden-disk.

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
---
 block.c| 145 -
 include/block/block.h  |   1 +
 include/block/block_int.h  |   1 +
 tests/qemu-iotests/051 |  13 
 tests/qemu-iotests/051.out |  13 
 5 files changed, 170 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index b4d629e..bd7fa9c 100644
--- a/block.c
+++ b/block.c
@@ -1351,6 +1351,113 @@ free_exit:
 return ret;
 }
 
+static void backing_reference_completed(void *opaque, int ret)
+{
+BlockDriverState *hidden_disk = opaque;
+
+assert(!hidden_disk->backing_reference);
+}
+
+static int bdrv_open_backing_reference_file(BlockDriverState *bs,
+QDict *options, Error **errp)
+{
+const char *backing_name;
+QDict *hidden_disk_options = NULL;
+BlockDriverState *backing_hd, *hidden_disk;
+BlockBackend *backing_blk;
+Error *local_err = NULL;
+int ret = 0;
+
+backing_name = qdict_get_try_str(options, "drive_id");
+if (!backing_name) {
+error_setg(errp, "Backing reference needs option drive_id");
+ret = -EINVAL;
+goto free_exit;
+}
+qdict_del(options, "drive_id");
+
+qdict_extract_subqdict(options, &hidden_disk_options, "hidden-disk.");
+if (!qdict_size(hidden_disk_options)) {
+error_setg(errp, "Backing reference needs option hidden-disk.*");
+ret = -EINVAL;
+goto free_exit;
+}
+
+if (qdict_size(options)) {
+const QDictEntry *entry = qdict_first(options);
+error_setg(errp, "Backing reference used by '%s' doesn't support "
+   "the option '%s'", bdrv_get_device_name(bs), entry->key);
+ret = -EINVAL;
+goto free_exit;
+}
+
+backing_blk = blk_by_name(backing_name);
+if (!backing_blk) {
+error_set(errp, QERR_DEVICE_NOT_FOUND, backing_name);
+ret = -ENOENT;
+goto free_exit;
+}
+
+backing_hd = blk_bs(backing_blk);
+/* Backing reference itself? */
+if (backing_hd == bs || bdrv_find_overlay(backing_hd, bs)) {
+error_setg(errp, "Backing reference itself");
+ret = -EINVAL;
+goto free_exit;
+}
+
+if (bdrv_op_is_blocked(backing_hd, BLOCK_OP_TYPE_BACKING_REFERENCE,
+   errp)) {
+ret = -EBUSY;
+goto free_exit;
+}
+
+/* hidden-disk is bs's backing file */
+ret = bdrv_open_backing_file(bs, hidden_disk_options, errp);
+hidden_disk_options = NULL;
+if (ret < 0) {
+goto free_exit;
+}
+
+hidden_disk = bs->backing_hd;
+if (!hidden_disk->drv || !hidden_disk->drv->supports_backing) {
+ret = -EINVAL;
+error_setg(errp, "Hidden disk's driver doesn't support backing files");
+goto free_exit;
+}
+
+bdrv_set_backing_hd(hidden_disk, backing_hd);
+bdrv_ref(backing_hd);
+
+/*
+ * backing hd may be opened or reopened in read-write mode, so we
+ * should backup backing hd to hidden disk
+ */
+bdrv_op_unblock(hidden_disk, BLOCK_OP_TYPE_BACKUP_TARGET,
+bs->backing_blocker);
+bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_SOURCE,
+hidden_disk->backing_blocker);
+
+bdrv_ref(hidden_disk);
+backup_start(backing_hd, hidden_disk, 0, MIRROR_SYNC_MODE_NONE,
+ BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
+ backing_reference_completed, hidden_disk, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+bdrv_unref(hidden_disk);
+/* FIXME, use which errno? */
+ret = -EIO;
+goto free_exit;
+}
+
+bs->backing_reference = true;
+
+free_exit:
+QDECREF(hidden_disk_options);
+QDECREF(options);
+return ret;
+}
+
 /*
  * Ope

[Qemu-devel] [RFC PATCH COLO v2 11/13] qcow2: support colo

2015-03-25 Thread Wen Congyang

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
---
 block/qcow2.c | 447 +-
 block/qcow2.h |   6 +
 2 files changed, 452 insertions(+), 1 deletion(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 32bdf75..cc10af0 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -35,6 +35,7 @@
 #include "qapi-event.h"
 #include "trace.h"
 #include "qemu/option_int.h"
+#include "block/blockjob.h"
 
 /*
   Differences with QCOW:
@@ -1496,7 +1497,7 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, 
Error **errp)
 memset(s, 0, sizeof(BDRVQcowState));
 options = qdict_clone_shallow(bs->options);
 
-ret = qcow2_open(bs, options, flags, &local_err);
+ret = bs->drv->bdrv_open(bs, options, flags, &local_err);
 QDECREF(options);
 if (local_err) {
 error_setg(errp, "Could not reopen qcow2 layer: %s",
@@ -2947,9 +2948,453 @@ BlockDriver bdrv_qcow2 = {
 .bdrv_amend_options  = qcow2_amend_options,
 };
 
+/*/
+/*
+ * qcow2 colo functions.
+ *
+ * Note:
+ * 1. The image format is qcow2, but it is only used for block replication.
+ * 2. The image is created by qcow2, not qcow+colo.
+ * 3. The image is an empty image.
+ * 4. The image doesn't contain any snapshot.
+ * 5. The image doesn't contain backing_file in image header.
+ * 6. Active disk and hidden disk use this driver.
+ * 7. The size of Active disk, hidden disk, nbd target should be the same.
+ */
+
+enum {
+COLO_NONE,  /* block replication is not started */
+COLO_RUNNING,   /* block replication is running */
+COLO_DONE,  /* block replication is done(failover) */
+};
+
+static int qcow2_colo_probe(const uint8_t *buf, int buf_size,
+const char *filename)
+{
+/* Use qcow2 as default */
+return 0;
+}
+
+#define COLO_OPT_EXPORT "export"
+static QemuOptsList qcow2_colo_runtime_opts = {
+.name = "qcow2+colo",
+.head = QTAILQ_HEAD_INITIALIZER(qcow2_colo_runtime_opts.head),
+.desc = {
+{
+.name = COLO_OPT_EXPORT,
+.type = QEMU_OPT_STRING,
+.help = "The NBD server name",
+},
+{ /* end of list */ }
+},
+};
+
+/*
+ * usage: -drive if=xxx,driver=qcow2+colo,export=xxx,\
+ *backing_reference.drive_id=,backing_reference.hidden-disk.*
+ */
+static int qcow2_colo_open(BlockDriverState *bs, QDict *options, int flags,
+   Error **errp)
+{
+int ret;
+BDRVQcowState *s = bs->opaque;;
+Error *local_err = NULL;
+QemuOpts *opts = NULL;
+
+ret = qcow2_open(bs, options, flags, errp);
+if (ret < 0) {
+return ret;
+}
+
+ret = -ENOTSUP;
+if (s->nb_snapshots) {
+error_setg(errp, "qcow2+colo doesn't support snapshot");
+goto fail;
+}
+
+if (!bs->backing_hd && bs->backing_file[0] != '\0') {
+error_setg(errp,
+   "qcow2+colo doesn't support backing_file in image header");
+goto fail;
+}
+
+opts = qemu_opts_create(&qcow2_colo_runtime_opts, NULL, 0, &error_abort);
+qemu_opts_absorb_qdict(opts, options, &local_err);
+if (local_err) {
+ret = -EINVAL;
+goto fail;
+}
+
+s->export_name = g_strdup(qemu_opt_get(opts, COLO_OPT_EXPORT));
+if (!s->export_name) {
+error_setg(&local_err, "Missing the option export");
+ret = -EINVAL;
+goto fail;
+}
+
+return 0;
+
+fail:
+qcow2_close(bs);
+qemu_opts_del(opts);
+/* propagate error */
+if (local_err) {
+error_propagate(errp, local_err);
+}
+return ret;
+}
+
+static coroutine_fn int qcow2_colo_co_readv(BlockDriverState *bs,
+int64_t sector_num,
+int remaining_sectors,
+QEMUIOVector *qiov)
+{
+BDRVQcowState *s = bs->opaque;
+BlockDriverState *nbd_target;
+
+switch (s->colo_state) {
+case COLO_NONE:
+return -EIO;
+case COLO_RUNNING:
+return qcow2_co_readv(bs, sector_num, remaining_sectors, qiov);
+case COLO_DONE:
+nbd_target = bs->backing_hd->backing_hd;
+return bdrv_co_readv(nbd_target, sector_num, remaining_sectors, qiov);
+default:
+abort();
+}
+}
+
+static coroutine_fn int qcow2_colo_co_writev(BlockDriverState *bs,
+ int64_t sector_num,
+ int remaining_sectors,
+ QEMUIOVector *qiov)
+{
+BDRVQcowState *s = bs->opaque;
+BlockDriverState *nbd_target;
+
+switch (s->colo_state) {
+case COLO_NONE:
+return -EIO;
+case COLO_RUNNING:
+return qcow2_co_writev(bs, sector_num, remaining_sectors, qiov);
+case COLO_DONE:
+nbd_target = bs->backing_hd-

[Qemu-devel] [RFC PATCH COLO v2 03/13] NBD client: connect to nbd server later

2015-03-25 Thread Wen Congyang

The secondary qemu starts later than the primary qemu, so we
cannot connect to nbd server in bdrv_open().

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
---
 block/nbd.c | 122 +---
 1 file changed, 108 insertions(+), 14 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index 2176186..3faf865 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -44,6 +44,8 @@
 typedef struct BDRVNBDState {
 NbdClientSession client;
 QemuOpts *socket_opts;
+char *export;
+bool connected;
 } BDRVNBDState;
 
 static int nbd_parse_uri(const char *filename, QDict *options)
@@ -254,50 +256,95 @@ static int nbd_establish_connection(BlockDriverState *bs, 
Error **errp)
 return sock;
 }
 
-static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
-Error **errp)
+static int nbd_connect_server(BlockDriverState *bs, Error **errp)
 {
 BDRVNBDState *s = bs->opaque;
-char *export = NULL;
 int result, sock;
-Error *local_err = NULL;
-
-/* Pop the config into our state object. Exit if invalid. */
-nbd_config(s, options, &export, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-return -EINVAL;
-}
 
 /* establish TCP connection, return error if it fails
  * TODO: Configurable retry-until-timeout behaviour.
  */
 sock = nbd_establish_connection(bs, errp);
 if (sock < 0) {
-g_free(export);
+g_free(s->export);
 return sock;
 }
 
 /* NBD handshake */
-result = nbd_client_init(bs, sock, export, errp);
-g_free(export);
+result = nbd_client_init(bs, sock, s->export, errp);
+g_free(s->export);
+s->export = NULL;
+if (!result) {
+s->connected = true;
+}
+
 return result;
 }
 
+static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
+Error **errp)
+{
+BDRVNBDState *s = bs->opaque;
+Error *local_err = NULL;
+
+/* Pop the config into our state object. Exit if invalid. */
+nbd_config(s, options, &s->export, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+return -EINVAL;
+}
+
+return nbd_connect_server(bs, errp);
+}
+
+static int nbd_open_colo(BlockDriverState *bs, QDict *options, int flags,
+ Error **errp)
+{
+BDRVNBDState *s = bs->opaque;
+Error *local_err = NULL;
+
+/* Pop the config into our state object. Exit if invalid. */
+nbd_config(s, options, &s->export, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+return -EINVAL;
+}
+
+return 0;
+}
+
 static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
 int nb_sectors, QEMUIOVector *qiov)
 {
+BDRVNBDState *s = bs->opaque;
+
+if (!s->connected) {
+return -EIO;
+}
+
 return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov);
 }
 
 static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num,
  int nb_sectors, QEMUIOVector *qiov)
 {
+BDRVNBDState *s = bs->opaque;
+
+if (!s->connected) {
+return -EIO;
+}
+
 return nbd_client_co_writev(bs, sector_num, nb_sectors, qiov);
 }
 
 static int nbd_co_flush(BlockDriverState *bs)
 {
+BDRVNBDState *s = bs->opaque;
+
+if (!s->connected) {
+return -EIO;
+}
+
 return nbd_client_co_flush(bs);
 }
 
@@ -310,6 +357,12 @@ static void nbd_refresh_limits(BlockDriverState *bs, Error 
**errp)
 static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
   int nb_sectors)
 {
+BDRVNBDState *s = bs->opaque;
+
+if (!s->connected) {
+return -EIO;
+}
+
 return nbd_client_co_discard(bs, sector_num, nb_sectors);
 }
 
@@ -319,23 +372,44 @@ static void nbd_close(BlockDriverState *bs)
 
 qemu_opts_del(s->socket_opts);
 nbd_client_close(bs);
+s->connected = false;
 }
 
 static int64_t nbd_getlength(BlockDriverState *bs)
 {
 BDRVNBDState *s = bs->opaque;
 
+if (!s->connected) {
+/*
+ * We cannot return -ENOTCONN, otherwise refresh_total_sectors()
+ * will fail, and we cannot open nbd client.
+ */
+return 0;
+}
+
 return s->client.size;
 }
 
 static void nbd_detach_aio_context(BlockDriverState *bs)
 {
+BDRVNBDState *s = bs->opaque;
+
+if (!s->connected) {
+return;
+}
+
 nbd_client_detach_aio_context(bs);
 }
 
 static void nbd_attach_aio_context(BlockDriverState *bs,
AioContext *new_context)
 {
+BDRVNBDState *s = bs->opaque;
+
+if (!s->connected) {
+return;
+}
+
 nbd_client_attach_aio_context(bs, new_context);
 }
 
@@ -438,11 +512,31 @@ static BlockDriver bdrv_nbd_unix = {
 .bdrv_refresh_filename  = nbd_refresh_filename,
 };
 
+static BlockDriver bdrv_nbd_colo = {
+.format_

[Qemu-devel] [RFC PATCH COLO v2 10/13] Backup: clear all bitmap when doing block checkpoint

2015-03-25 Thread Wen Congyang

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Cc: Jeff Cody 
---
 block/backup.c| 12 
 include/block/block_int.h |  1 +
 include/qemu/hbitmap.h|  8 
 util/hbitmap.c| 19 +++
 4 files changed, 40 insertions(+)

diff --git a/block/backup.c b/block/backup.c
index 1c535b1..4e9d535 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -435,3 +435,15 @@ void backup_start(BlockDriverState *bs, BlockDriverState 
*target,
 job->common.co = qemu_coroutine_create(backup_run);
 qemu_coroutine_enter(job->common.co, job);
 }
+
+void backup_do_checkpoint(BlockJob *job, Error **errp)
+{
+BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
+
+if (job->driver != &backup_job_driver) {
+error_setg(errp, "It is not backup job");
+return;
+}
+
+hbitmap_reset_all(backup_job->bitmap);
+}
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 624945d..45d547b 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -628,6 +628,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState 
*target,
   BlockdevOnError on_target_error,
   BlockCompletionFunc *cb, void *opaque,
   Error **errp);
+void backup_do_checkpoint(BlockJob *job, Error **errp);
 
 void blk_dev_change_media_cb(BlockBackend *blk, bool load);
 bool blk_dev_has_removable_media(BlockBackend *blk);
diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h
index 550d7ce..95a55e4 100644
--- a/include/qemu/hbitmap.h
+++ b/include/qemu/hbitmap.h
@@ -109,6 +109,14 @@ void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t 
count);
 void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t count);
 
 /**
+ * hbitmap_reset_all:
+ * @hb: HBitmap to operate on.
+ *
+ * Reset all bits in an HBitmap.
+ */
+void hbitmap_reset_all(HBitmap *hb);
+
+/**
  * hbitmap_get:
  * @hb: HBitmap to operate on.
  * @item: Bit to query (0-based).
diff --git a/util/hbitmap.c b/util/hbitmap.c
index ab13971..4111ca5 100644
--- a/util/hbitmap.c
+++ b/util/hbitmap.c
@@ -353,6 +353,25 @@ void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t 
count)
 hb_reset_between(hb, HBITMAP_LEVELS - 1, start, last);
 }
 
+void hbitmap_reset_all(HBitmap *hb)
+{
+#if 0
+hbitmap_reset(hb, 0, hb->size << hb->granularity);
+#else
+uint64_t size = hb->size;
+unsigned int i;
+
+/* Same as hbitmap_alloc() except memset() */
+for (i = HBITMAP_LEVELS; i-- > 0; ) {
+size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1);
+memset(hb->levels[i], 0, size * sizeof(unsigned long));
+}
+
+assert(size == 1);
+hb->levels[0][0] |= 1UL << (BITS_PER_LONG - 1);
+#endif
+}
+
 bool hbitmap_get(const HBitmap *hb, uint64_t item)
 {
 /* Compute position and bit in the last layer.  */
-- 
2.1.0

[Qemu-devel] [RFC PATCH COLO v2 13/13] Don't allow a disk use backing reference target

2015-03-25 Thread Wen Congyang

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
---
 block.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/block.c b/block.c
index 3af5ad4..548cafa 100644
--- a/block.c
+++ b/block.c
@@ -1399,6 +1399,14 @@ static int 
bdrv_open_backing_reference_file(BlockDriverState *bs,
 }
 
 backing_hd = blk_bs(backing_blk);
+/* Don't allow a disk use backing reference target */
+ret = blk_attach_dev(backing_hd->blk, bs);
+if (ret < 0) {
+error_setg(errp, "backing_hd %s is used by the other device model",
+   backing_name);
+goto free_exit;
+}
+
 /* Backing reference itself? */
 if (backing_hd == bs || bdrv_find_overlay(backing_hd, bs)) {
 error_setg(errp, "Backing reference itself");
@@ -2077,6 +2085,7 @@ void bdrv_close(BlockDriverState *bs)
 if (backing_hd->backing_hd->job) {
 block_job_cancel(backing_hd->backing_hd->job);
 }
+blk_detach_dev(backing_hd->backing_hd->blk, bs);
 bdrv_set_backing_hd(backing_hd, NULL);
 bdrv_unref(backing_hd->backing_hd);
 }
-- 
2.1.0

[Qemu-devel] [RFC PATCH COLO v2 04/13] Add new block driver interfaces to control block replication

2015-03-25 Thread Wen Congyang

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Cc: Luiz Capitulino 
Cc: Michael Roth 
---
 block.c   | 39 +++
 include/block/block.h |  4 
 include/block/block_int.h | 11 +++
 qapi/block.json   | 16 
 4 files changed, 70 insertions(+)

diff --git a/block.c b/block.c
index 0fe97de..0ff5cf8 100644
--- a/block.c
+++ b/block.c
@@ -6196,3 +6196,42 @@ BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
 {
 return &bs->stats;
 }
+
+void bdrv_start_replication(BlockDriverState *bs, COLOMode mode, Error **errp)
+{
+BlockDriver *drv = bs->drv;
+
+if (drv && drv->bdrv_start_replication) {
+drv->bdrv_start_replication(bs, mode, errp);
+} else if (bs->file) {
+bdrv_start_replication(bs->file, mode, errp);
+} else {
+error_set(errp, QERR_UNSUPPORTED);
+}
+}
+
+void bdrv_do_checkpoint(BlockDriverState *bs, Error **errp)
+{
+BlockDriver *drv = bs->drv;
+
+if (drv && drv->bdrv_do_checkpoint) {
+drv->bdrv_do_checkpoint(bs, errp);
+} else if (bs->file) {
+bdrv_do_checkpoint(bs->file, errp);
+} else {
+error_set(errp, QERR_UNSUPPORTED);
+}
+}
+
+void bdrv_stop_replication(BlockDriverState *bs, Error **errp)
+{
+BlockDriver *drv = bs->drv;
+
+if (drv && drv->bdrv_stop_replication) {
+drv->bdrv_stop_replication(bs, errp);
+} else if (bs->file) {
+bdrv_stop_replication(bs->file, errp);
+} else {
+error_set(errp, QERR_UNSUPPORTED);
+}
+}
diff --git a/include/block/block.h b/include/block/block.h
index 4c57d63..68f3b1a 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -569,4 +569,8 @@ void bdrv_flush_io_queue(BlockDriverState *bs);
 
 BlockAcctStats *bdrv_get_stats(BlockDriverState *bs);
 
+void bdrv_start_replication(BlockDriverState *bs, COLOMode mode, Error **errp);
+void bdrv_do_checkpoint(BlockDriverState *bs, Error **errp);
+void bdrv_stop_replication(BlockDriverState *bs, Error **errp);
+
 #endif
diff --git a/include/block/block_int.h b/include/block/block_int.h
index dccb092..08dd8ba 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -290,6 +290,17 @@ struct BlockDriver {
  */
 int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo);
 
+
+void (*bdrv_start_replication)(BlockDriverState *bs, COLOMode mode,
+   Error **errp);
+/* Drop Disk buffer when doing checkpoint. */
+void (*bdrv_do_checkpoint)(BlockDriverState *bs, Error **errp);
+/*
+ * After failover, we should flush Disk buffer into secondary disk
+ * and stop block replication.
+ */
+void (*bdrv_stop_replication)(BlockDriverState *bs, Error **errp);
+
 QLIST_ENTRY(BlockDriver) list;
 };
 
diff --git a/qapi/block.json b/qapi/block.json
index e313465..e640566 100644
--- a/qapi/block.json
+++ b/qapi/block.json
@@ -40,6 +40,22 @@
   'data': ['auto', 'none', 'lba', 'large', 'rechs']}
 
 ##
+# @COLOMode
+#
+# An enumeration of COLO mode.
+#
+# @unprotected: COLO is not started or after failover
+#
+# @primary: Primary mode, the vm's state will be sent to secondary QEMU.
+#
+# @secondary: Secondary mode, receive the vm's state from primary QEMU.
+#
+# Since: 2.4
+##
+{ 'enum' : 'COLOMode',
+  'data' : ['unprotected', 'primary', 'secondary']}
+
+##
 # @BlockdevSnapshotInternal
 #
 # @device: the name of the device to generate the snapshot from
-- 
2.1.0

[Qemu-devel] [RFC PATCH COLO v2 07/13] allow writing to the backing file

2015-03-25 Thread Wen Congyang

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
---
 block.c | 31 ++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 0ff5cf8..b4d629e 100644
--- a/block.c
+++ b/block.c
@@ -1247,6 +1247,20 @@ out:
 bdrv_refresh_limits(bs, NULL);
 }
 
+#define ALLOW_WRITE_BACKING_FILE"allow-write-backing-file"
+static QemuOptsList backing_file_opts = {
+.name = "backing_file",
+.head = QTAILQ_HEAD_INITIALIZER(backing_file_opts.head),
+.desc = {
+{
+.name = ALLOW_WRITE_BACKING_FILE,
+.type = QEMU_OPT_BOOL,
+.help = "allow write to backing file",
+},
+{ /* end of list */ }
+},
+};
+
 /*
  * Opens the backing file for a BlockDriverState if not yet open
  *
@@ -1261,6 +1275,8 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
*options, Error **errp)
 int ret = 0;
 BlockDriverState *backing_hd;
 Error *local_err = NULL;
+QemuOpts *opts = NULL;
+int flags;
 
 if (bs->backing_hd != NULL) {
 QDECREF(options);
@@ -1273,6 +1289,19 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
*options, Error **errp)
 }
 
 bs->open_flags &= ~BDRV_O_NO_BACKING;
+flags = bdrv_backing_flags(bs->open_flags);
+
+opts = qemu_opts_create(&backing_file_opts, NULL, 0, &error_abort);
+qemu_opts_absorb_qdict(opts, options, &local_err);
+if (local_err) {
+ret = -EINVAL;
+goto free_exit;
+}
+if (qemu_opt_get_bool(opts, ALLOW_WRITE_BACKING_FILE, false)) {
+flags |= BDRV_O_RDWR;
+}
+qemu_opts_del(opts);
+
 if (qdict_haskey(options, "file.filename")) {
 backing_filename[0] = '\0';
 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
@@ -1305,7 +1334,7 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
*options, Error **errp)
 assert(bs->backing_hd == NULL);
 ret = bdrv_open(&backing_hd,
 *backing_filename ? backing_filename : NULL, NULL, options,
-bdrv_backing_flags(bs->open_flags), NULL, &local_err);
+flags, NULL, &local_err);
 if (ret < 0) {
 bdrv_unref(backing_hd);
 backing_hd = NULL;
-- 
2.1.0

Re: [Qemu-devel] [Migration Bug? ] Occasionally, the content of VM's memory is inconsistent between Source and Destination of migration

2015-03-25 Thread Dr. David Alan Gilbert

* zhanghailiang (zhang.zhanghaili...@huawei.com) wrote:
> Hi all,
> 
> We found that, sometimes, the content of VM's memory is inconsistent between 
> Source side and Destination side
> when we check it just after finishing migration but before VM continue to Run.
> 
> We use a patch like bellow to find this issue, you can find it from affix,
> and Steps to reproduce:
> 
> (1) Compile QEMU:
>  ./configure --target-list=x86_64-softmmu  --extra-ldflags="-lssl" && make
> 
> (2) Command and output:
> SRC: # x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu qemu64,-kvmclock 
> -netdev tap,id=hn0-device virtio-net-pci,id=net-pci0,netdev=hn0 -boot c 
> -drive 
> file=/mnt/sdb/pure_IMG/sles/sles11_sp3.img,if=none,id=drive-virtio-disk0,cache=unsafe
>  -device 
> virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 
> -vnc :7 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor 
> stdio
> (qemu) migrate tcp:192.168.3.8:3004
> before saving ram complete
> ff703f6889ab8701e4e040872d079a28
> md_host : after saving ram complete
> ff703f6889ab8701e4e040872d079a28
> 
> DST: # x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu qemu64,-kvmclock 
> -netdev tap,id=hn0,vhost=on -device virtio-net-pci,id=net-pci0,netdev=hn0 
> -boot c -drive 
> file=/mnt/sdb/pure_IMG/sles/sles11_sp3.img,if=none,id=drive-virtio-disk0,cache=unsafe
>  -device 
> virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 
> -vnc :7 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor 
> stdio -incoming tcp:0:3004
> (qemu) QEMU_VM_SECTION_END, after loading ram
> 230e1e68ece9cd4e769630e1bcb5ddfb
> md_host : after loading all vmstate
> 230e1e68ece9cd4e769630e1bcb5ddfb
> md_host : after cpu_synchronize_all_post_init
> 230e1e68ece9cd4e769630e1bcb5ddfb
> 
> This happens occasionally, and it is more easy to reproduce when issue 
> migration command during VM's startup time.
> 
> We have done further test and found that some pages has been dirtied but its 
> corresponding migration_bitmap is not set.
> We can't figure out which modules of QEMU has missed setting bitmap when 
> dirty page of VM,
> it is very difficult for us to trace all the actions of dirtying VM's pages.
> 
> Actually, the first time we found this problem was in the COLO FT 
> development, and it triggered some strange issues in
> VM which all pointed to the issue of inconsistent of VM's memory. (We have 
> try to save all memory of VM to slave side every time
> when do checkpoint in COLO FT, and everything will be OK.)
> 
> Is it OK for some pages that not transferred to destination when do migration 
> ? Or is it a bug?

That does sound like a bug.
The only other explanation I have is that memory is being changed by a device 
emulation
that happens after the end of a saving the vm, or after loading the memory.  
That's
certainly possible - especially if a device (say networking) hasn't been 
properly
stopped.

> This issue has blocked our COLO development... :(
> 
> Any help will be greatly appreciated!

I suggest:
   1) Does it happen with devices other than virtio?
   2) Strip the devices down - e.g. just run with serial and no video/usb
   3) Try doing the md5 comparison at the end of ram_save_complete
   4) mprotect RAM after the ram_save_complete and see if anything faults.
   5) Can you trigger this with normal migration or just COLO?
  I'm wondering if something is doing something on a running/paused/etc 
state
  change and isn't expecting the new COLO states.

Dave


> 
> Thanks,
> zhanghailiang
> 
> --- a/savevm.c
> +++ b/savevm.c
> @@ -51,6 +51,26 @@
>  #define ARP_PTYPE_IP 0x0800
>  #define ARP_OP_REQUEST_REV 0x3
> 
> +#include "qemu/rcu_queue.h"
> +#include 
> +
> +static void check_host_md5(void)
> +{
> +int i;
> +unsigned char md[MD5_DIGEST_LENGTH];
> +MD5_CTX ctx;
> +RAMBlock *block = QLIST_FIRST_RCU(&ram_list.blocks);/* Only check 
> 'pc.ram' block */
> +
> +MD5_Init(&ctx);
> +MD5_Update(&ctx, (void *)block->host, block->used_length);
> +MD5_Final(md, &ctx);
> +printf("md_host : ");
> +for(i = 0; i < MD5_DIGEST_LENGTH; i++) {
> +fprintf(stderr, "%02x", md[i]);
> +}
> +fprintf(stderr, "\n");
> +}
> +
>  static int announce_self_create(uint8_t *buf,
>  uint8_t *mac_addr)
>  {
> @@ -741,7 +761,13 @@ void qemu_savevm_state_complete(QEMUFile *f)
>  qemu_put_byte(f, QEMU_VM_SECTION_END);
>  qemu_put_be32(f, se->section_id);
> 
> +printf("before saving %s complete\n", se->idstr);
> +check_host_md5();
> +
>  ret = se->ops->save_live_complete(f, se->opaque);
> +printf("after saving %s complete\n", se->idstr);
> +check_host_md5();
> +
>  trace_savevm_section_end(se->idstr, se->section_id, ret);
>  if (ret < 0) {
>  qemu_file_set_error(f, ret);
> @@ -1030,6 +1063,11 @@ int qemu_loadvm_state(QEMUFile *f)
>  }
> 
>  ret =

Re: [Qemu-devel] [Migration Bug? ] Occasionally, the content of VM's memory is inconsistent between Source and Destination of migration

2015-03-25 Thread Juan Quintela

zhanghailiang  wrote:
> Hi all,
>
> We found that, sometimes, the content of VM's memory is inconsistent between 
> Source side and Destination side
> when we check it just after finishing migration but before VM continue to Run.
>
> We use a patch like bellow to find this issue, you can find it from affix,
> and Steps to reprduce:
>
> (1) Compile QEMU:
>  ./configure --target-list=x86_64-softmmu  --extra-ldflags="-lssl" && make
>
> (2) Command and output:
> SRC: # x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu qemu64,-kvmclock 
> -netdev tap,id=hn0-device virtio-net-pci,id=net-pci0,netdev=hn0 -boot c 
> -drive 
> file=/mnt/sdb/pure_IMG/sles/sles11_sp3.img,if=none,id=drive-virtio-disk0,cache=unsafe
>  -device 
> virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 
> -vnc :7 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor 
> stdio

Could you try to reproduce:
- without vhost
- without virtio-net
- cache=unsafe is going to give you trouble, but trouble should only
  happen after migration of pages have finished.

What kind of load were you having when reproducing this issue?
Just to confirm, you have been able to reproduce this without COLO
patches, right?

> (qemu) migrate tcp:192.168.3.8:3004
> before saving ram complete
> ff703f6889ab8701e4e040872d079a28
> md_host : after saving ram complete
> ff703f6889ab8701e4e040872d079a28
>
> DST: # x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu qemu64,-kvmclock 
> -netdev tap,id=hn0,vhost=on -device virtio-net-pci,id=net-pci0,netdev=hn0 
> -boot c -drive 
> file=/mnt/sdb/pure_IMG/sles/sles11_sp3.img,if=none,id=drive-virtio-disk0,cache=unsafe
>  -device 
> virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 
> -vnc :7 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor 
> stdio -incoming tcp:0:3004
> (qemu) QEMU_VM_SECTION_END, after loading ram
> 230e1e68ece9cd4e769630e1bcb5ddfb
> md_host : after loading all vmstate
> 230e1e68ece9cd4e769630e1bcb5ddfb
> md_host : after cpu_synchronize_all_post_init
> 230e1e68ece9cd4e769630e1bcb5ddfb
>
> This happens occasionally, and it is more easy to reproduce when issue 
> migration command during VM's startup time.

OK, a couple of things.  Memory don't have to be exactly identical.
Virtio devices in particular do funny things on "post-load".  There
aren't warantees for that as far as I know, we should end with an
equivalent device state in memory.

> We have done further test and found that some pages has been dirtied but its 
> corresponding migration_bitmap is not set.
> We can't figure out which modules of QEMU has missed setting bitmap when 
> dirty page of VM,
> it is very difficult for us to trace all the actions of dirtying VM's pages.

This seems to point to a bug in one of the devices.

> Actually, the first time we found this problem was in the COLO FT 
> development, and it triggered some strange issues in
> VM which all pointed to the issue of inconsistent of VM's memory. (We have 
> try to save all memory of VM to slave side every time
> when do checkpoint in COLO FT, and everything will be OK.)
>
> Is it OK for some pages that not transferred to destination when do migration 
> ? Or is it a bug?

Pages transferred should be the same, after device state transmission is
when things could change.

> This issue has blocked our COLO development... :(
>
> Any help will be greatly appreciated!

Later, Juan.

>
> Thanks,
> zhanghailiang
>
> --- a/savevm.c
> +++ b/savevm.c
> @@ -51,6 +51,26 @@
>  #define ARP_PTYPE_IP 0x0800
>  #define ARP_OP_REQUEST_REV 0x3
>
> +#include "qemu/rcu_queue.h"
> +#include 
> +
> +static void check_host_md5(void)
> +{
> +int i;
> +unsigned char md[MD5_DIGEST_LENGTH];
> +MD5_CTX ctx;
> +RAMBlock *block = QLIST_FIRST_RCU(&ram_list.blocks);/* Only check 
> 'pc.ram' block */
> +
> +MD5_Init(&ctx);
> +MD5_Update(&ctx, (void *)block->host, block->used_length);
> +MD5_Final(md, &ctx);
> +printf("md_host : ");
> +for(i = 0; i < MD5_DIGEST_LENGTH; i++) {
> +fprintf(stderr, "%02x", md[i]);
> +}
> +fprintf(stderr, "\n");
> +}
> +
>  static int announce_self_create(uint8_t *buf,
>  uint8_t *mac_addr)
>  {
> @@ -741,7 +761,13 @@ void qemu_savevm_state_complete(QEMUFile *f)
>  qemu_put_byte(f, QEMU_VM_SECTION_END);
>  qemu_put_be32(f, se->section_id);
>
> +printf("before saving %s complete\n", se->idstr);
> +check_host_md5();
> +
>  ret = se->ops->save_live_complete(f, se->opaque);
> +printf("after saving %s complete\n", se->idstr);
> +check_host_md5();
> +
>  trace_savevm_section_end(se->idstr, se->section_id, ret);
>  if (ret < 0) {
>  qemu_file_set_error(f, ret);
> @@ -1030,6 +1063,11 @@ int qemu_loadvm_state(QEMUFile *f)
>  }
>
>  ret = vmstate_load(f, le->se, le->version_id);
> +if (section_type == QEMU_VM_SECTION_END) {

Re: [Qemu-devel] [PATCH for-2.3] arm: memory: Replace memory_region_init_ram with memory_region_allocate_system_memory

2015-03-25 Thread Paolo Bonzini



On 25/03/2015 01:10, Peter Maydell wrote:
> So why do only the ARM boards get fixes here?
> (Why did we allow ourselves to break half our boards
> in commit 0b183fc871 in the first place?)

Because the patches to convert them were in my stolen laptop. :/

Paolo

Re: [Qemu-devel] [Migration Bug? ] Occasionally, the content of VM's memory is inconsistent between Source and Destination of migration

2015-03-25 Thread Wen Congyang

On 03/25/2015 05:50 PM, Juan Quintela wrote:
> zhanghailiang  wrote:
>> Hi all,
>>
>> We found that, sometimes, the content of VM's memory is inconsistent between 
>> Source side and Destination side
>> when we check it just after finishing migration but before VM continue to 
>> Run.
>>
>> We use a patch like bellow to find this issue, you can find it from affix,
>> and Steps to reprduce:
>>
>> (1) Compile QEMU:
>>  ./configure --target-list=x86_64-softmmu  --extra-ldflags="-lssl" && make
>>
>> (2) Command and output:
>> SRC: # x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu qemu64,-kvmclock 
>> -netdev tap,id=hn0-device virtio-net-pci,id=net-pci0,netdev=hn0 -boot c 
>> -drive 
>> file=/mnt/sdb/pure_IMG/sles/sles11_sp3.img,if=none,id=drive-virtio-disk0,cache=unsafe
>>  -device 
>> virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 
>> -vnc :7 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor 
>> stdio
> 
> Could you try to reproduce:
> - without vhost
> - without virtio-net
> - cache=unsafe is going to give you trouble, but trouble should only
>   happen after migration of pages have finished.

I can use e1000 to reproduce this problem.

> 
> What kind of load were you having when reproducing this issue?
> Just to confirm, you have been able to reproduce this without COLO
> patches, right?

I can reproduce it without COLO patches. The newest commit is:
commit 054903a832b865eb5432d79b5c9d1e1ff31b58d7
Author: Peter Maydell 
Date:   Tue Mar 24 16:34:16 2015 +

Update version for v2.3.0-rc1 release

Signed-off-by: Peter Maydell 

> 
>> (qemu) migrate tcp:192.168.3.8:3004
>> before saving ram complete
>> ff703f6889ab8701e4e040872d079a28
>> md_host : after saving ram complete
>> ff703f6889ab8701e4e040872d079a28
>>
>> DST: # x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu qemu64,-kvmclock 
>> -netdev tap,id=hn0,vhost=on -device virtio-net-pci,id=net-pci0,netdev=hn0 
>> -boot c -drive 
>> file=/mnt/sdb/pure_IMG/sles/sles11_sp3.img,if=none,id=drive-virtio-disk0,cache=unsafe
>>  -device 
>> virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 
>> -vnc :7 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor 
>> stdio -incoming tcp:0:3004
>> (qemu) QEMU_VM_SECTION_END, after loading ram
>> 230e1e68ece9cd4e769630e1bcb5ddfb
>> md_host : after loading all vmstate
>> 230e1e68ece9cd4e769630e1bcb5ddfb
>> md_host : after cpu_synchronize_all_post_init
>> 230e1e68ece9cd4e769630e1bcb5ddfb
>>
>> This happens occasionally, and it is more easy to reproduce when issue 
>> migration command during VM's startup time.
> 
> OK, a couple of things.  Memory don't have to be exactly identical.
> Virtio devices in particular do funny things on "post-load".  There
> aren't warantees for that as far as I know, we should end with an
> equivalent device state in memory.
> 
>> We have done further test and found that some pages has been dirtied but its 
>> corresponding migration_bitmap is not set.
>> We can't figure out which modules of QEMU has missed setting bitmap when 
>> dirty page of VM,
>> it is very difficult for us to trace all the actions of dirtying VM's pages.
> 
> This seems to point to a bug in one of the devices.
> 
>> Actually, the first time we found this problem was in the COLO FT 
>> development, and it triggered some strange issues in
>> VM which all pointed to the issue of inconsistent of VM's memory. (We have 
>> try to save all memory of VM to slave side every time
>> when do checkpoint in COLO FT, and everything will be OK.)
>>
>> Is it OK for some pages that not transferred to destination when do 
>> migration ? Or is it a bug?
> 
> Pages transferred should be the same, after device state transmission is
> when things could change.
> 
>> This issue has blocked our COLO development... :(
>>
>> Any help will be greatly appreciated!
> 
> Later, Juan.
>

Re: [Qemu-devel] [PATCH for-2.3 0/3] Contain drive_get() misuse

2015-03-25 Thread Markus Armbruster

Markus Armbruster  writes:

[...]
> * "allwinner-a10"
>
>"-nodefaults -serial null -device allwinner-a10" doesn't explode,
>which means I can't exclude the possibility that this might actually
>do something useful for someone.  I'd say we treat it just like
>"pc87312": leave alone now, document incompatible change.
[...]
> How could this kind of mistake could look like in NIC device models?
> "allwinner-a10"'s instance_init aw_a10_init() provides a clue: it messes
> with qemu_check_nic_model():
>
> if (nd_table[0].used) {
> qemu_check_nic_model(&nd_table[0], TYPE_AW_EMAC);
> qdev_set_nic_properties(DEVICE(&s->emac), &nd_table[0]);
> }
>
> No other device does that.  Without -nodefaults, this fails:
>
> -device allwinner-a10: Unsupported NIC model: xgmac

Even better:

$  qemu-system-arm -S -M highbank -monitor stdio
QEMU 2.2.90 monitor - type 'help' for more information
(qemu) device_add allwinner-a10
Unsupported NIC model: xgmac
[Exit 1 ]

exit() in monitor command, big no-no.

Machine type doesn't matter.  To get past this, nd_table[0] must have
model=allwinner-emac.  With -net nic,model=allwinner-emac, it has, and
we get to the next round of failures:

* cubieboard

  Board already creates an allwinner-a10, and creating a second one with
  -device or device_add aborts.

* Any other board with an onboard NIC, e.g. highbank

  Refuses to start, as board doesn't support model=allwinner-emac:

  qemu-system-arm: Unsupported NIC model: allwinner-emac

* Any other board without an onboard NIC, e.g. virt

  Warns on startup:

  Warning: requested NIC (anonymous, model allwinner-emac) was not created 
(not supported by this machine?

  -device / device_add allwinner-a10 succeeds, as long as serial_hds[0]
  is set.  If you suppress it with -nodefaults, -device / device_add
  exit()s (no-no again) with

  Can't create serial device, empty char device

  If the board has an onboard serial (e.g. collie), both the onboard
  serial and the allwinner-a10 get are now connected to it.  Not going
  to work.  We normally catch this error, but bypassing qdev properties
  also bypasses the check.

Actual use of -device / device_add allwinner-a10 seems vanishingly
unlikely.  Thus, setting cannot_instantiate_with_device_add_yet is
unlikely to break anything.  Let's do it to mask the "device_add
exit()s" bug.

> I haven't checked all uses of nd_table[].

I have now, this is the only offender.

[Qemu-devel] [PATCH v2 0/9] mips/kvm: Support FPU & SIMD (MSA) in MIPS KVM guests

2015-03-25 Thread James Hogan

Review on v1 has gone quiet, so here's v2 which addresses the feedback
received for v1. Thanks to all who have taken the time to review it so
far.

This patchset primarily adds support for FPU and MIPS SIMD Architecture
(MSA) in MIPS KVM guests to QEMU. It depends on the KVM patchset which I
recently submitted to add the corresponding hypervisor support to KVM
("[PATCH 00/20] MIPS: KVM: Guest FPU & SIMD (MSA) support").

All comments welcome.

Changes in v2:
- Moved most of patch 7 and updates to linux-headers/linux/kvm.h from
  patches 8 and 9 into a new patch 1, which is purely for reference
  (Paolo).
- Add the changes to MIPS_CP0_{32,64} macros from v1 patch 7 to patch 2,
  since the rest of that patch is now unnecessary and the change is
  along the same lines as patch 2 (not added Leon's Reviewed-by to this
  patch due to that non-reviewed change).
- Fix line wrapping of kvm_mips_get_one_reg() calls from Config4 and
  Config5 in patch 5 (Leon).
- Change (1 << x) to (1U << x) in important places in patch 5, 8 & 9 to
  avoid compiler defined behaviour (Leon).

James Hogan (9):
  DONT APPLY: linux-headers: Update MIPS KVM headers
  mips/kvm: Sync with newer MIPS KVM headers
  mips/kvm: Remove a couple of noisy DPRINTFs
  mips/kvm: Implement PRid CP0 register
  mips/kvm: Implement Config CP0 registers
  mips/kvm: Support unsigned KVM registers
  mips/kvm: Support signed 64-bit KVM registers
  mips/kvm: Support FPU in MIPS KVM guests
  mips/kvm: Support MSA in MIPS KVM guests

 linux-headers/asm-mips/kvm.h | 160 ++---
 linux-headers/linux/kvm.h|   2 +
 target-mips/cpu.h|   2 +
 target-mips/kvm.c| 410 ---
 4 files changed, 487 insertions(+), 87 deletions(-)

-- 
2.0.5

[Qemu-devel] [PATCH v2 3/9] mips/kvm: Remove a couple of noisy DPRINTFs

2015-03-25 Thread James Hogan

The DPRINTFs in cpu_mips_io_interrupts_pending() and kvm_arch_pre_run()
are particularly noisy during normal execution, and also not
particularly helpful. Remove them so that more important debug messages
can be more easily seen.

Signed-off-by: James Hogan 
Reviewed-by: Leon Alrae 
Cc: Paolo Bonzini 
Cc: Aurelien Jarno 
---
 target-mips/kvm.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/target-mips/kvm.c b/target-mips/kvm.c
index bbdaccc9d729..1722a6a40598 100644
--- a/target-mips/kvm.c
+++ b/target-mips/kvm.c
@@ -87,7 +87,6 @@ static inline int cpu_mips_io_interrupts_pending(MIPSCPU *cpu)
 {
 CPUMIPSState *env = &cpu->env;
 
-DPRINTF("%s: %#x\n", __func__, env->CP0_Cause & (1 << (2 + CP0Ca_IP)));
 return env->CP0_Cause & (0x1 << (2 + CP0Ca_IP));
 }
 
@@ -112,7 +111,6 @@ void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run)
 
 void kvm_arch_post_run(CPUState *cs, struct kvm_run *run)
 {
-DPRINTF("%s\n", __func__);
 }
 
 int kvm_arch_process_async_events(CPUState *cs)
-- 
2.0.5

[Qemu-devel] [PATCH v2 5/9] mips/kvm: Implement Config CP0 registers

2015-03-25 Thread James Hogan

Implement saving and restoring to KVM state of the Config CP0 registers
(namely Config, Config1, Config2, Config3, Config4, and Config5). These
control the features available to a guest, and a few of the fields will
soon be writeable by a guest so QEMU needs to know about them so as not
to clobber them on migration/savevm.

Signed-off-by: James Hogan 
Cc: Paolo Bonzini 
Cc: Leon Alrae 
Cc: Aurelien Jarno 
---
Changes in v2:
- Fix line wrapping of kvm_mips_get_one_reg() calls from Config4 and
  Config5 (Leon).
- Change (1 << x) to (1U << x) in important places to avoid compiler
  defined behaviour (Leon).
---
 target-mips/kvm.c | 106 ++
 1 file changed, 106 insertions(+)

diff --git a/target-mips/kvm.c b/target-mips/kvm.c
index d41facfca0d5..ead8c5f73930 100644
--- a/target-mips/kvm.c
+++ b/target-mips/kvm.c
@@ -223,6 +223,12 @@ int kvm_mips_set_ipi_interrupt(MIPSCPU *cpu, int irq, int 
level)
 #define KVM_REG_MIPS_CP0_CAUSE  MIPS_CP0_32(13, 0)
 #define KVM_REG_MIPS_CP0_EPCMIPS_CP0_64(14, 0)
 #define KVM_REG_MIPS_CP0_PRID   MIPS_CP0_32(15, 0)
+#define KVM_REG_MIPS_CP0_CONFIG MIPS_CP0_32(16, 0)
+#define KVM_REG_MIPS_CP0_CONFIG1MIPS_CP0_32(16, 1)
+#define KVM_REG_MIPS_CP0_CONFIG2MIPS_CP0_32(16, 2)
+#define KVM_REG_MIPS_CP0_CONFIG3MIPS_CP0_32(16, 3)
+#define KVM_REG_MIPS_CP0_CONFIG4MIPS_CP0_32(16, 4)
+#define KVM_REG_MIPS_CP0_CONFIG5MIPS_CP0_32(16, 5)
 #define KVM_REG_MIPS_CP0_ERROREPC   MIPS_CP0_64(30, 0)
 
 static inline int kvm_mips_put_one_reg(CPUState *cs, uint64_t reg_id,
@@ -305,6 +311,34 @@ static inline int kvm_mips_get_one_reg64(CPUState *cs, 
uint64 reg_id,
 return kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &cp0reg);
 }
 
+#define KVM_REG_MIPS_CP0_CONFIG_MASK(1U << CP0C0_M)
+#define KVM_REG_MIPS_CP0_CONFIG1_MASK   (1U << CP0C1_M)
+#define KVM_REG_MIPS_CP0_CONFIG2_MASK   (1U << CP0C2_M)
+#define KVM_REG_MIPS_CP0_CONFIG3_MASK   (1U << CP0C3_M)
+#define KVM_REG_MIPS_CP0_CONFIG4_MASK   (1U << CP0C4_M)
+#define KVM_REG_MIPS_CP0_CONFIG5_MASK   0
+
+static inline int kvm_mips_change_one_reg(CPUState *cs, uint64_t reg_id,
+  int32_t *addr, int32_t mask)
+{
+int err;
+int32_t tmp, change;
+
+err = kvm_mips_get_one_reg(cs, reg_id, &tmp);
+if (err < 0) {
+return err;
+}
+
+/* only change bits in mask */
+change = (*addr ^ tmp) & mask;
+if (!change) {
+return 0;
+}
+
+tmp = tmp ^ change;
+return kvm_mips_put_one_reg(cs, reg_id, &tmp);
+}
+
 /*
  * We freeze the KVM timer when either the VM clock is stopped or the state is
  * saved (the state is dirty).
@@ -527,6 +561,48 @@ static int kvm_mips_put_cp0_registers(CPUState *cs, int 
level)
 DPRINTF("%s: Failed to put CP0_PRID (%d)\n", __func__, err);
 ret = err;
 }
+err = kvm_mips_change_one_reg(cs, KVM_REG_MIPS_CP0_CONFIG,
+  &env->CP0_Config0,
+  KVM_REG_MIPS_CP0_CONFIG_MASK);
+if (err < 0) {
+DPRINTF("%s: Failed to change CP0_CONFIG (%d)\n", __func__, err);
+ret = err;
+}
+err = kvm_mips_change_one_reg(cs, KVM_REG_MIPS_CP0_CONFIG1,
+  &env->CP0_Config1,
+  KVM_REG_MIPS_CP0_CONFIG1_MASK);
+if (err < 0) {
+DPRINTF("%s: Failed to change CP0_CONFIG1 (%d)\n", __func__, err);
+ret = err;
+}
+err = kvm_mips_change_one_reg(cs, KVM_REG_MIPS_CP0_CONFIG2,
+  &env->CP0_Config2,
+  KVM_REG_MIPS_CP0_CONFIG2_MASK);
+if (err < 0) {
+DPRINTF("%s: Failed to change CP0_CONFIG2 (%d)\n", __func__, err);
+ret = err;
+}
+err = kvm_mips_change_one_reg(cs, KVM_REG_MIPS_CP0_CONFIG3,
+  &env->CP0_Config3,
+  KVM_REG_MIPS_CP0_CONFIG3_MASK);
+if (err < 0) {
+DPRINTF("%s: Failed to change CP0_CONFIG3 (%d)\n", __func__, err);
+ret = err;
+}
+err = kvm_mips_change_one_reg(cs, KVM_REG_MIPS_CP0_CONFIG4,
+  &env->CP0_Config4,
+  KVM_REG_MIPS_CP0_CONFIG4_MASK);
+if (err < 0) {
+DPRINTF("%s: Failed to change CP0_CONFIG4 (%d)\n", __func__, err);
+ret = err;
+}
+err = kvm_mips_change_one_reg(cs, KVM_REG_MIPS_CP0_CONFIG5,
+  &env->CP0_Config5,
+  KVM_REG_MIPS_CP0_CONFIG5_MASK);
+if (err < 0) {
+DPRINTF("%s: Failed to change CP0_CONFIG5 (%d)\n", __func__, err);
+ret = err;
+}
 err = kvm_mips_put_one_ulreg(cs, KVM_REG_MIPS_CP0_ERROREPC,
  &env->CP0_ErrorEPC);
 if (err < 0) {
@@ -618,6 +694,36 @@ static int kvm_mips_get_cp0_registers(CPUState *cs)
 DPRINTF("%s: Faile

[Qemu-devel] [PATCH v2 1/9] DONT APPLY: linux-headers: Update MIPS KVM headers

2015-03-25 Thread James Hogan

This patch updates the linux headers based solely on the changes in my
"MIPS: KVM: Guest FPU & SIMD (MSA) support" KVM patchset. It is provided
for reference since those changes haven't been merged yet. The stored
headers will need to be synced after my KVM patches are merged and
before the rest of this patchset is applied.

The individual KVM patches that make up this change are:
- "[PATCH 07/20] MIPS: KVM: Clean up register definitions a little"
  https://patchwork.kernel.org/patch/5986171/
- "[PATCH 14/20] MIPS: KVM: Expose FPU registers"
  https://patchwork.kernel.org/patch/5986211/
- "[PATCH 15/20] MIPS: KVM: Wire up FPU capability"
  https://patchwork.kernel.org/patch/5986201/
- "[PATCH 19/20] MIPS: KVM: Expose MSA registers"
  https://patchwork.kernel.org/patch/5986191/
- "[PATCH 20/20] MIPS: KVM: Wire up MSA capability"
  https://patchwork.kernel.org/patch/5986151/

Signed-off-by: James Hogan 
Cc: Paolo Bonzini 
Cc: Leon Alrae 
Cc: Aurelien Jarno 
---
Changes in v2:
- Removed most of patch 7 and updates to linux-headers/linux/kvm.h from
  patches 8 and 9, and put in this patch for reference (Paolo).
---
 linux-headers/asm-mips/kvm.h | 160 ++-
 linux-headers/linux/kvm.h|   2 +
 2 files changed, 101 insertions(+), 61 deletions(-)

diff --git a/linux-headers/asm-mips/kvm.h b/linux-headers/asm-mips/kvm.h
index 2c04b6d9ff85..6985eb59b085 100644
--- a/linux-headers/asm-mips/kvm.h
+++ b/linux-headers/asm-mips/kvm.h
@@ -36,77 +36,85 @@ struct kvm_regs {
 
 /*
  * for KVM_GET_FPU and KVM_SET_FPU
- *
- * If Status[FR] is zero (32-bit FPU), the upper 32-bits of the FPRs
- * are zero filled.
  */
 struct kvm_fpu {
-   __u64 fpr[32];
-   __u32 fir;
-   __u32 fccr;
-   __u32 fexr;
-   __u32 fenr;
-   __u32 fcsr;
-   __u32 pad;
 };
 
 
 /*
- * For MIPS, we use KVM_SET_ONE_REG and KVM_GET_ONE_REG to access CP0
+ * For MIPS, we use KVM_SET_ONE_REG and KVM_GET_ONE_REG to access various
  * registers.  The id field is broken down as follows:
  *
- *  bits[2..0]   - Register 'sel' index.
- *  bits[7..3]   - Register 'rd'  index.
- *  bits[15..8]  - Must be zero.
- *  bits[31..16] - 1 -> CP0 registers.
- *  bits[51..32] - Must be zero.
  *  bits[63..52] - As per linux/kvm.h
+ *  bits[51..32] - Must be zero.
+ *  bits[31..16] - Register set.
+ *
+ * Register set = 0: GP registers from kvm_regs (see definitions below).
+ *
+ * Register set = 1: CP0 registers.
+ *  bits[15..8]  - Must be zero.
+ *  bits[7..3]   - Register 'rd'  index.
+ *  bits[2..0]   - Register 'sel' index.
+ *
+ * Register set = 2: KVM specific registers (see definitions below).
+ *
+ * Register set = 3: FPU / MSA registers (see definitions below).
  *
  * Other sets registers may be added in the future.  Each set would
  * have its own identifier in bits[31..16].
- *
- * The registers defined in struct kvm_regs are also accessible, the
- * id values for these are below.
  */
 
-#define KVM_REG_MIPS_R0 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 0)
-#define KVM_REG_MIPS_R1 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 1)
-#define KVM_REG_MIPS_R2 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 2)
-#define KVM_REG_MIPS_R3 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 3)
-#define KVM_REG_MIPS_R4 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 4)
-#define KVM_REG_MIPS_R5 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 5)
-#define KVM_REG_MIPS_R6 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 6)
-#define KVM_REG_MIPS_R7 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 7)
-#define KVM_REG_MIPS_R8 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 8)
-#define KVM_REG_MIPS_R9 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 9)
-#define KVM_REG_MIPS_R10 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 10)
-#define KVM_REG_MIPS_R11 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 11)
-#define KVM_REG_MIPS_R12 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 12)
-#define KVM_REG_MIPS_R13 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 13)
-#define KVM_REG_MIPS_R14 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 14)
-#define KVM_REG_MIPS_R15 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 15)
-#define KVM_REG_MIPS_R16 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 16)
-#define KVM_REG_MIPS_R17 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 17)
-#define KVM_REG_MIPS_R18 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 18)
-#define KVM_REG_MIPS_R19 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 19)
-#define KVM_REG_MIPS_R20 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 20)
-#define KVM_REG_MIPS_R21 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 21)
-#define KVM_REG_MIPS_R22 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 22)
-#define KVM_REG_MIPS_R23 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 23)
-#define KVM_REG_MIPS_R24 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 24)
-#define KVM_REG_MIPS_R25 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 25)
-#define KVM_REG_MIPS_R26 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 26)
-#define KVM_REG_MIPS_R27 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 27)
-#define KVM_REG_MIPS_R28 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 28)
-#define KVM_REG_MIPS_R29 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 29)
-#define KVM_REG_MIPS_R30 (KVM_REG_MIPS | KVM_REG_SIZE_U64 | 30)
-#define KVM_REG_MIPS_R31 (KVM_REG_MIPS | KVM_RE

Re: [Qemu-devel] One question to lowlevel/xl/xl.c and lowlevel/xc/xc.c

2015-03-25 Thread Ian Campbell

On Wed, 2015-03-25 at 09:18 +0800, Chen, Tiejun wrote:
> Actually my problem is that, I need to add a new parameter, 'flag', like 
> this, xc_assign_device(xxx,xxx,flag). So if I don't refine xc.c, tools 
> can't be compiled successfully. Or maybe you're suggesting I may isolate 
> this file while building tools, right?

I answered this in my original reply:
 it is acceptable for the new
parameter to not be plumbed up to the Python bindings until someone
comes along with a requirement to use it from Python. IOW you can just
pass whatever the nop value is for the new argument.

The "nop value" is whatever value should be passed to retain the current
behaviour.

Ian.

[Qemu-devel] [PATCH v2 4/9] mips/kvm: Implement PRid CP0 register

2015-03-25 Thread James Hogan

Implement saving and restoring to KVM state of the Processor ID (PRid)
CP0 register. This allows QEMU to control the PRid exposed to the guest
instead of using the default set by KVM.

Signed-off-by: James Hogan 
Reviewed-by: Leon Alrae 
Cc: Paolo Bonzini 
Cc: Aurelien Jarno 
---
 target-mips/kvm.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/target-mips/kvm.c b/target-mips/kvm.c
index 1722a6a40598..d41facfca0d5 100644
--- a/target-mips/kvm.c
+++ b/target-mips/kvm.c
@@ -222,6 +222,7 @@ int kvm_mips_set_ipi_interrupt(MIPSCPU *cpu, int irq, int 
level)
 #define KVM_REG_MIPS_CP0_STATUS MIPS_CP0_32(12, 0)
 #define KVM_REG_MIPS_CP0_CAUSE  MIPS_CP0_32(13, 0)
 #define KVM_REG_MIPS_CP0_EPCMIPS_CP0_64(14, 0)
+#define KVM_REG_MIPS_CP0_PRID   MIPS_CP0_32(15, 0)
 #define KVM_REG_MIPS_CP0_ERROREPC   MIPS_CP0_64(30, 0)
 
 static inline int kvm_mips_put_one_reg(CPUState *cs, uint64_t reg_id,
@@ -521,6 +522,11 @@ static int kvm_mips_put_cp0_registers(CPUState *cs, int 
level)
 DPRINTF("%s: Failed to put CP0_EPC (%d)\n", __func__, err);
 ret = err;
 }
+err = kvm_mips_put_one_reg(cs, KVM_REG_MIPS_CP0_PRID, &env->CP0_PRid);
+if (err < 0) {
+DPRINTF("%s: Failed to put CP0_PRID (%d)\n", __func__, err);
+ret = err;
+}
 err = kvm_mips_put_one_ulreg(cs, KVM_REG_MIPS_CP0_ERROREPC,
  &env->CP0_ErrorEPC);
 if (err < 0) {
@@ -607,6 +613,11 @@ static int kvm_mips_get_cp0_registers(CPUState *cs)
 DPRINTF("%s: Failed to get CP0_EPC (%d)\n", __func__, err);
 ret = err;
 }
+err = kvm_mips_get_one_reg(cs, KVM_REG_MIPS_CP0_PRID, &env->CP0_PRid);
+if (err < 0) {
+DPRINTF("%s: Failed to get CP0_PRID (%d)\n", __func__, err);
+ret = err;
+}
 err = kvm_mips_get_one_ulreg(cs, KVM_REG_MIPS_CP0_ERROREPC,
  &env->CP0_ErrorEPC);
 if (err < 0) {
-- 
2.0.5

[Qemu-devel] [PATCH v2 7/9] mips/kvm: Support signed 64-bit KVM registers

2015-03-25 Thread James Hogan

Rename kvm_mips_{get,put}_one_reg64() to kvm_mips_{get,put}_one_ureg64()
since they take an int64_t pointer, and add separate signed 64-bit
accessors. These will be used for double precision floating point
registers.

Signed-off-by: James Hogan 
Cc: Paolo Bonzini 
Cc: Leon Alrae 
Cc: Aurelien Jarno 
---
 target-mips/kvm.c | 40 +++-
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/target-mips/kvm.c b/target-mips/kvm.c
index 4e5c8ba3d10c..e8a8858f0cfb 100644
--- a/target-mips/kvm.c
+++ b/target-mips/kvm.c
@@ -268,7 +268,18 @@ static inline int kvm_mips_put_one_ulreg(CPUState *cs, 
uint64_t reg_id,
 }
 
 static inline int kvm_mips_put_one_reg64(CPUState *cs, uint64_t reg_id,
- uint64_t *addr)
+ int64_t *addr)
+{
+struct kvm_one_reg cp0reg = {
+.id = reg_id,
+.addr = (uintptr_t)addr
+};
+
+return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &cp0reg);
+}
+
+static inline int kvm_mips_put_one_ureg64(CPUState *cs, uint64_t reg_id,
+  uint64_t *addr)
 {
 struct kvm_one_reg cp0reg = {
 .id = reg_id,
@@ -330,7 +341,18 @@ static inline int kvm_mips_get_one_ulreg(CPUState *cs, 
uint64 reg_id,
 }
 
 static inline int kvm_mips_get_one_reg64(CPUState *cs, uint64 reg_id,
- uint64_t *addr)
+ int64_t *addr)
+{
+struct kvm_one_reg cp0reg = {
+.id = reg_id,
+.addr = (uintptr_t)addr
+};
+
+return kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &cp0reg);
+}
+
+static inline int kvm_mips_get_one_ureg64(CPUState *cs, uint64 reg_id,
+  uint64_t *addr)
 {
 struct kvm_one_reg cp0reg = {
 .id = reg_id,
@@ -385,13 +407,13 @@ static int kvm_mips_save_count(CPUState *cs)
 int err, ret = 0;
 
 /* freeze KVM timer */
-err = kvm_mips_get_one_reg64(cs, KVM_REG_MIPS_COUNT_CTL, &count_ctl);
+err = kvm_mips_get_one_ureg64(cs, KVM_REG_MIPS_COUNT_CTL, &count_ctl);
 if (err < 0) {
 DPRINTF("%s: Failed to get COUNT_CTL (%d)\n", __func__, err);
 ret = err;
 } else if (!(count_ctl & KVM_REG_MIPS_COUNT_CTL_DC)) {
 count_ctl |= KVM_REG_MIPS_COUNT_CTL_DC;
-err = kvm_mips_put_one_reg64(cs, KVM_REG_MIPS_COUNT_CTL, &count_ctl);
+err = kvm_mips_put_one_ureg64(cs, KVM_REG_MIPS_COUNT_CTL, &count_ctl);
 if (err < 0) {
 DPRINTF("%s: Failed to set COUNT_CTL.DC=1 (%d)\n", __func__, err);
 ret = err;
@@ -427,14 +449,14 @@ static int kvm_mips_restore_count(CPUState *cs)
 int err_dc, err, ret = 0;
 
 /* check the timer is frozen */
-err_dc = kvm_mips_get_one_reg64(cs, KVM_REG_MIPS_COUNT_CTL, &count_ctl);
+err_dc = kvm_mips_get_one_ureg64(cs, KVM_REG_MIPS_COUNT_CTL, &count_ctl);
 if (err_dc < 0) {
 DPRINTF("%s: Failed to get COUNT_CTL (%d)\n", __func__, err_dc);
 ret = err_dc;
 } else if (!(count_ctl & KVM_REG_MIPS_COUNT_CTL_DC)) {
 /* freeze timer (sets COUNT_RESUME for us) */
 count_ctl |= KVM_REG_MIPS_COUNT_CTL_DC;
-err = kvm_mips_put_one_reg64(cs, KVM_REG_MIPS_COUNT_CTL, &count_ctl);
+err = kvm_mips_put_one_ureg64(cs, KVM_REG_MIPS_COUNT_CTL, &count_ctl);
 if (err < 0) {
 DPRINTF("%s: Failed to set COUNT_CTL.DC=1 (%d)\n", __func__, err);
 ret = err;
@@ -458,7 +480,7 @@ static int kvm_mips_restore_count(CPUState *cs)
 /* resume KVM timer */
 if (err_dc >= 0) {
 count_ctl &= ~KVM_REG_MIPS_COUNT_CTL_DC;
-err = kvm_mips_put_one_reg64(cs, KVM_REG_MIPS_COUNT_CTL, &count_ctl);
+err = kvm_mips_put_one_ureg64(cs, KVM_REG_MIPS_COUNT_CTL, &count_ctl);
 if (err < 0) {
 DPRINTF("%s: Failed to set COUNT_CTL.DC=0 (%d)\n", __func__, err);
 ret = err;
@@ -491,8 +513,8 @@ static void kvm_mips_update_state(void *opaque, int 
running, RunState state)
 } else {
 /* Set clock restore time to now */
 count_resume = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-ret = kvm_mips_put_one_reg64(cs, KVM_REG_MIPS_COUNT_RESUME,
- &count_resume);
+ret = kvm_mips_put_one_ureg64(cs, KVM_REG_MIPS_COUNT_RESUME,
+  &count_resume);
 if (ret < 0) {
 fprintf(stderr, "Failed setting COUNT_RESUME\n");
 return;
-- 
2.0.5

[Qemu-devel] [PATCH v2 6/9] mips/kvm: Support unsigned KVM registers

2015-03-25 Thread James Hogan

Add KVM register access functions for the uint32_t type. This is
required for FP and MSA control registers, which are represented as
unsigned 32-bit integers.

Signed-off-by: James Hogan 
Cc: Paolo Bonzini 
Cc: Leon Alrae 
Cc: Aurelien Jarno 
---
 target-mips/kvm.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/target-mips/kvm.c b/target-mips/kvm.c
index ead8c5f73930..4e5c8ba3d10c 100644
--- a/target-mips/kvm.c
+++ b/target-mips/kvm.c
@@ -243,6 +243,18 @@ static inline int kvm_mips_put_one_reg(CPUState *cs, 
uint64_t reg_id,
 return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &cp0reg);
 }
 
+static inline int kvm_mips_put_one_ureg(CPUState *cs, uint64_t reg_id,
+uint32_t *addr)
+{
+uint64_t val64 = *addr;
+struct kvm_one_reg cp0reg = {
+.id = reg_id,
+.addr = (uintptr_t)&val64
+};
+
+return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &cp0reg);
+}
+
 static inline int kvm_mips_put_one_ulreg(CPUState *cs, uint64_t reg_id,
  target_ulong *addr)
 {
@@ -283,6 +295,23 @@ static inline int kvm_mips_get_one_reg(CPUState *cs, 
uint64_t reg_id,
 return ret;
 }
 
+static inline int kvm_mips_get_one_ureg(CPUState *cs, uint64_t reg_id,
+uint32_t *addr)
+{
+int ret;
+uint64_t val64 = 0;
+struct kvm_one_reg cp0reg = {
+.id = reg_id,
+.addr = (uintptr_t)&val64
+};
+
+ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &cp0reg);
+if (ret >= 0) {
+*addr = val64;
+}
+return ret;
+}
+
 static inline int kvm_mips_get_one_ulreg(CPUState *cs, uint64 reg_id,
  target_ulong *addr)
 {
-- 
2.0.5

[Qemu-devel] [PATCH v2 2/9] mips/kvm: Sync with newer MIPS KVM headers

2015-03-25 Thread James Hogan

The KVM_REG_MIPS_COUNT_* definitions are now included in
linux-headers/asm-mips/kvm.h since commit b061808d39fa ("linux-headers:
update linux headers to kvm/next"), therefore the duplicate definitions
in target-mips/kvm.c can now be dropped.

The MIPS_C0_{32,64} macros are also updated to utilise definitions more
recently added to the asm-mips/kvm.h header.

Signed-off-by: James Hogan 
Cc: Paolo Bonzini 
Cc: Leon Alrae 
Cc: Aurelien Jarno 
---
Changes in v2:
- Add the changes to MIPS_CP0_{32,64} macros from v1 patch 7, since the
  rest of that patch is now unnecessary and the change is along the same
  lines as this patch.
- (not added Leon's Reviewed-by due to above non-reviewed change).
---
 target-mips/kvm.c | 15 ++-
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/target-mips/kvm.c b/target-mips/kvm.c
index 4d1f7ead8142..bbdaccc9d729 100644
--- a/target-mips/kvm.c
+++ b/target-mips/kvm.c
@@ -206,10 +206,10 @@ int kvm_mips_set_ipi_interrupt(MIPSCPU *cpu, int irq, int 
level)
 }
 
 #define MIPS_CP0_32(_R, _S) \
-(KVM_REG_MIPS | KVM_REG_SIZE_U32 | 0x1 | (8 * (_R) + (_S)))
+(KVM_REG_MIPS_CP0 | KVM_REG_SIZE_U32 | (8 * (_R) + (_S)))
 
 #define MIPS_CP0_64(_R, _S) \
-(KVM_REG_MIPS | KVM_REG_SIZE_U64 | 0x1 | (8 * (_R) + (_S)))
+(KVM_REG_MIPS_CP0 | KVM_REG_SIZE_U64 | (8 * (_R) + (_S)))
 
 #define KVM_REG_MIPS_CP0_INDEX  MIPS_CP0_32(0, 0)
 #define KVM_REG_MIPS_CP0_CONTEXTMIPS_CP0_64(4, 0)
@@ -226,17 +226,6 @@ int kvm_mips_set_ipi_interrupt(MIPSCPU *cpu, int irq, int 
level)
 #define KVM_REG_MIPS_CP0_EPCMIPS_CP0_64(14, 0)
 #define KVM_REG_MIPS_CP0_ERROREPC   MIPS_CP0_64(30, 0)
 
-/* CP0_Count control */
-#define KVM_REG_MIPS_COUNT_CTL  (KVM_REG_MIPS | KVM_REG_SIZE_U64 | \
- 0x2 | 0)
-#define KVM_REG_MIPS_COUNT_CTL_DC   0x0001  /* master disable */
-/* CP0_Count resume monotonic nanoseconds */
-#define KVM_REG_MIPS_COUNT_RESUME   (KVM_REG_MIPS | KVM_REG_SIZE_U64 | \
- 0x2 | 1)
-/* CP0_Count rate in Hz */
-#define KVM_REG_MIPS_COUNT_HZ   (KVM_REG_MIPS | KVM_REG_SIZE_U64 | \
- 0x2 | 2)
-
 static inline int kvm_mips_put_one_reg(CPUState *cs, uint64_t reg_id,
int32_t *addr)
 {
-- 
2.0.5

[Qemu-devel] [PATCH v2 8/9] mips/kvm: Support FPU in MIPS KVM guests

2015-03-25 Thread James Hogan

Support the new KVM_CAP_MIPS_FPU capability, which allows the host's FPU
to be exposed to the KVM guest.

The capability is enabled if the guest core has an FPU according to its
Config1 register. Various config bits are now writeable so that KVM is
aware of the configuration (Config1.FP) and so that QEMU can
save/restore the guest modifiable bits (Config5.FRE, Config5.UFR,
Config5.UFE). The FCSR/FIR registers and the floating point registers
are now saved/restored (depending on the FR mode bit).

Signed-off-by: James Hogan 
Cc: Paolo Bonzini 
Cc: Leon Alrae 
Cc: Aurelien Jarno 
---
Changes in v2:
- Change (1 << x) to (1U << x) in important places to avoid compiler
  defined behaviour (Leon).
- Removed update of linux-headers/linux/kvm.h, see patch 1 (Paolo).
---
 target-mips/cpu.h |   2 +
 target-mips/kvm.c | 124 --
 2 files changed, 122 insertions(+), 4 deletions(-)

diff --git a/target-mips/cpu.h b/target-mips/cpu.h
index f9d2b4c5af80..68c8260da0e6 100644
--- a/target-mips/cpu.h
+++ b/target-mips/cpu.h
@@ -462,6 +462,8 @@ struct CPUMIPSState {
 #define CP0C5_CV 29
 #define CP0C5_EVA28
 #define CP0C5_MSAEn  27
+#define CP0C5_UFE9
+#define CP0C5_FRE8
 #define CP0C5_SBRI   6
 #define CP0C5_UFR2
 #define CP0C5_NFExists   0
diff --git a/target-mips/kvm.c b/target-mips/kvm.c
index e8a8858f0cfb..4920244c161a 100644
--- a/target-mips/kvm.c
+++ b/target-mips/kvm.c
@@ -29,6 +29,8 @@
 #define DPRINTF(fmt, ...) \
 do { if (DEBUG_KVM) { fprintf(stderr, fmt, ## __VA_ARGS__); } } while (0)
 
+static int kvm_mips_fpu_cap;
+
 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
 KVM_CAP_LAST_INFO
 };
@@ -45,16 +47,29 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
 /* MIPS has 128 signals */
 kvm_set_sigmask_len(s, 16);
 
+kvm_mips_fpu_cap = kvm_check_extension(s, KVM_CAP_MIPS_FPU);
+
 DPRINTF("%s\n", __func__);
 return 0;
 }
 
 int kvm_arch_init_vcpu(CPUState *cs)
 {
+MIPSCPU *cpu = MIPS_CPU(cs);
+CPUMIPSState *env = &cpu->env;
 int ret = 0;
 
 qemu_add_vm_change_state_handler(kvm_mips_update_state, cs);
 
+if (kvm_mips_fpu_cap && env->CP0_Config1 & (1 << CP0C1_FP)) {
+ret = kvm_vcpu_enable_cap(cs, KVM_CAP_MIPS_FPU, 0, 0);
+if (ret < 0) {
+/* mark unsupported so it gets disabled on reset */
+kvm_mips_fpu_cap = 0;
+ret = 0;
+}
+}
+
 DPRINTF("%s\n", __func__);
 return ret;
 }
@@ -63,8 +78,8 @@ void kvm_mips_reset_vcpu(MIPSCPU *cpu)
 {
 CPUMIPSState *env = &cpu->env;
 
-if (env->CP0_Config1 & (1 << CP0C1_FP)) {
-fprintf(stderr, "Warning: FPU not supported with KVM, disabling\n");
+if (!kvm_mips_fpu_cap && env->CP0_Config1 & (1 << CP0C1_FP)) {
+fprintf(stderr, "Warning: KVM does not support FPU, disabling\n");
 env->CP0_Config1 &= ~(1 << CP0C1_FP);
 }
 
@@ -363,11 +378,14 @@ static inline int kvm_mips_get_one_ureg64(CPUState *cs, 
uint64 reg_id,
 }
 
 #define KVM_REG_MIPS_CP0_CONFIG_MASK(1U << CP0C0_M)
-#define KVM_REG_MIPS_CP0_CONFIG1_MASK   (1U << CP0C1_M)
+#define KVM_REG_MIPS_CP0_CONFIG1_MASK   ((1U << CP0C1_M) | \
+ (1U << CP0C1_FP))
 #define KVM_REG_MIPS_CP0_CONFIG2_MASK   (1U << CP0C2_M)
 #define KVM_REG_MIPS_CP0_CONFIG3_MASK   (1U << CP0C3_M)
 #define KVM_REG_MIPS_CP0_CONFIG4_MASK   (1U << CP0C4_M)
-#define KVM_REG_MIPS_CP0_CONFIG5_MASK   0
+#define KVM_REG_MIPS_CP0_CONFIG5_MASK   ((1U << CP0C5_UFE) | \
+ (1U << CP0C5_FRE) | \
+ (1U << CP0C5_UFR))
 
 static inline int kvm_mips_change_one_reg(CPUState *cs, uint64_t reg_id,
   int32_t *addr, int32_t mask)
@@ -529,6 +547,98 @@ static void kvm_mips_update_state(void *opaque, int 
running, RunState state)
 }
 }
 
+static int kvm_mips_put_fpu_registers(CPUState *cs, int level)
+{
+MIPSCPU *cpu = MIPS_CPU(cs);
+CPUMIPSState *env = &cpu->env;
+int err, ret = 0;
+unsigned int i;
+
+/* Only put FPU state if we're emulating a CPU with an FPU */
+if (env->CP0_Config1 & (1 << CP0C1_FP)) {
+/* FPU Control Registers */
+if (level == KVM_PUT_FULL_STATE) {
+err = kvm_mips_put_one_ureg(cs, KVM_REG_MIPS_FCR_IR,
+&env->active_fpu.fcr0);
+if (err < 0) {
+DPRINTF("%s: Failed to put FCR_IR (%d)\n", __func__, err);
+ret = err;
+}
+}
+err = kvm_mips_put_one_ureg(cs, KVM_REG_MIPS_FCR_CSR,
+&env->active_fpu.fcr31);
+if (err < 0) {
+DPRINTF("%s: Failed to put FCR_CSR (%d)\n", __func__, err);
+ret = err;
+}
+
+/* Floating point registers */
+for (i = 0; i < 32; ++i) {
+if (env->CP0_Status & (1 << CP0

[Qemu-devel] [PATCH v2 9/9] mips/kvm: Support MSA in MIPS KVM guests

2015-03-25 Thread James Hogan

Support the new KVM_CAP_MIPS_MSA capability, which allows MIPS SIMD
Architecture (MSA) to be exposed to the KVM guest.

The capability is enabled if the guest core has MSA according to its
Config3 register. Various config bits are now writeable so that KVM is
aware of the configuration (Config3.MSAP) and so that QEMU can
save/restore the guest modifiable bits (Config5.MSAEn). The MSACSR/MSAIR
registers and the MSA vector registers are now saved/restored. Since the
FP registers are a subset of the vector registers, they are omitted if
the guest has MSA.

Signed-off-by: James Hogan 
Cc: Paolo Bonzini 
Cc: Leon Alrae 
Cc: Aurelien Jarno 
---
Changes in v2:
- Change (1 << x) to (1U << x) in important places to avoid compiler
  defined behaviour (Leon).
- Removed update of linux-headers/linux/kvm.h, see patch 1 (Paolo).
---
 target-mips/kvm.c | 127 +-
 1 file changed, 107 insertions(+), 20 deletions(-)

diff --git a/target-mips/kvm.c b/target-mips/kvm.c
index 4920244c161a..b9e7a97afca0 100644
--- a/target-mips/kvm.c
+++ b/target-mips/kvm.c
@@ -30,6 +30,7 @@
 do { if (DEBUG_KVM) { fprintf(stderr, fmt, ## __VA_ARGS__); } } while (0)
 
 static int kvm_mips_fpu_cap;
+static int kvm_mips_msa_cap;
 
 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
 KVM_CAP_LAST_INFO
@@ -48,6 +49,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
 kvm_set_sigmask_len(s, 16);
 
 kvm_mips_fpu_cap = kvm_check_extension(s, KVM_CAP_MIPS_FPU);
+kvm_mips_msa_cap = kvm_check_extension(s, KVM_CAP_MIPS_MSA);
 
 DPRINTF("%s\n", __func__);
 return 0;
@@ -70,6 +72,15 @@ int kvm_arch_init_vcpu(CPUState *cs)
 }
 }
 
+if (kvm_mips_msa_cap && env->CP0_Config3 & (1 << CP0C3_MSAP)) {
+ret = kvm_vcpu_enable_cap(cs, KVM_CAP_MIPS_MSA, 0, 0);
+if (ret < 0) {
+/* mark unsupported so it gets disabled on reset */
+kvm_mips_msa_cap = 0;
+ret = 0;
+}
+}
+
 DPRINTF("%s\n", __func__);
 return ret;
 }
@@ -82,6 +93,10 @@ void kvm_mips_reset_vcpu(MIPSCPU *cpu)
 fprintf(stderr, "Warning: KVM does not support FPU, disabling\n");
 env->CP0_Config1 &= ~(1 << CP0C1_FP);
 }
+if (!kvm_mips_msa_cap && env->CP0_Config3 & (1 << CP0C3_MSAP)) {
+fprintf(stderr, "Warning: KVM does not support MSA, disabling\n");
+env->CP0_Config3 &= ~(1 << CP0C3_MSAP);
+}
 
 DPRINTF("%s\n", __func__);
 }
@@ -381,9 +396,11 @@ static inline int kvm_mips_get_one_ureg64(CPUState *cs, 
uint64 reg_id,
 #define KVM_REG_MIPS_CP0_CONFIG1_MASK   ((1U << CP0C1_M) | \
  (1U << CP0C1_FP))
 #define KVM_REG_MIPS_CP0_CONFIG2_MASK   (1U << CP0C2_M)
-#define KVM_REG_MIPS_CP0_CONFIG3_MASK   (1U << CP0C3_M)
+#define KVM_REG_MIPS_CP0_CONFIG3_MASK   ((1U << CP0C3_M) | \
+ (1U << CP0C3_MSAP))
 #define KVM_REG_MIPS_CP0_CONFIG4_MASK   (1U << CP0C4_M)
-#define KVM_REG_MIPS_CP0_CONFIG5_MASK   ((1U << CP0C5_UFE) | \
+#define KVM_REG_MIPS_CP0_CONFIG5_MASK   ((1U << CP0C5_MSAEn) | \
+ (1U << CP0C5_UFE) | \
  (1U << CP0C5_FRE) | \
  (1U << CP0C5_UFR))
 
@@ -572,17 +589,53 @@ static int kvm_mips_put_fpu_registers(CPUState *cs, int 
level)
 ret = err;
 }
 
-/* Floating point registers */
+/*
+ * FPU register state is a subset of MSA vector state, so don't put FPU
+ * registers if we're emulating a CPU with MSA.
+ */
+if (!(env->CP0_Config3 & (1 << CP0C3_MSAP))) {
+/* Floating point registers */
+for (i = 0; i < 32; ++i) {
+if (env->CP0_Status & (1 << CP0St_FR)) {
+err = kvm_mips_put_one_ureg64(cs, KVM_REG_MIPS_FPR_64(i),
+  &env->active_fpu.fpr[i].d);
+} else {
+err = kvm_mips_get_one_ureg(cs, KVM_REG_MIPS_FPR_32(i),
+&env->active_fpu.fpr[i].w[FP_ENDIAN_IDX]);
+}
+if (err < 0) {
+DPRINTF("%s: Failed to put FPR%u (%d)\n", __func__, i, 
err);
+ret = err;
+}
+}
+}
+}
+
+/* Only put MSA state if we're emulating a CPU with MSA */
+if (env->CP0_Config3 & (1 << CP0C3_MSAP)) {
+/* MSA Control Registers */
+if (level == KVM_PUT_FULL_STATE) {
+err = kvm_mips_put_one_reg(cs, KVM_REG_MIPS_MSA_IR,
+   &env->msair);
+if (err < 0) {
+DPRINTF("%s: Failed to put MSA_IR (%d)\n", __func__, err);
+ret = err;
+}
+}
+err = kvm_mips_put_one_reg(cs, KVM_REG_MIPS_MSA_CSR,
+   &env->active_tc.msacsr);
+

[Qemu-devel] [PATCH] rdma: Fix cleanup in error paths

2015-03-25 Thread Padmanabh Ratnakar

As part of commit e325b49a320b493cc5d69e263751ff716dc458fe,
order in which resources are destroyed was changed for fixing
a seg fault. Due to this change, CQ will never get destroyed as
CQ should be destroyed after QP destruction. Seg fault is caused
improper cleanup when connection fails. Fixing cleanup after
connection failure and order in which resources are destroyed
in qemu_rdma_cleanup() routine.

Signed-off-by: Meghana Cheripady 
Signed-off-by: Padmanabh Ratnakar 
---
 migration/rdma.c |   22 --
 1 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index e6c3a67..77e3444 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -2194,6 +2194,10 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
 }
 }
 
+if (rdma->qp) {
+rdma_destroy_qp(rdma->cm_id);
+rdma->qp = NULL;
+}
 if (rdma->cq) {
 ibv_destroy_cq(rdma->cq);
 rdma->cq = NULL;
@@ -2206,18 +2210,14 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
 ibv_dealloc_pd(rdma->pd);
 rdma->pd = NULL;
 }
-if (rdma->listen_id) {
-rdma_destroy_id(rdma->listen_id);
-rdma->listen_id = NULL;
-}
 if (rdma->cm_id) {
-if (rdma->qp) {
-rdma_destroy_qp(rdma->cm_id);
-rdma->qp = NULL;
-}
 rdma_destroy_id(rdma->cm_id);
 rdma->cm_id = NULL;
 }
+if (rdma->listen_id) {
+rdma_destroy_id(rdma->listen_id);
+rdma->listen_id = NULL;
+}
 if (rdma->channel) {
 rdma_destroy_event_channel(rdma->channel);
 rdma->channel = NULL;
@@ -2309,8 +2309,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error 
**errp)
 if (ret) {
 perror("rdma_connect");
 ERROR(errp, "connecting to destination!");
-rdma_destroy_id(rdma->cm_id);
-rdma->cm_id = NULL;
 goto err_rdma_source_connect;
 }
 
@@ -2319,8 +2317,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error 
**errp)
 perror("rdma_get_cm_event after rdma_connect");
 ERROR(errp, "connecting to destination!");
 rdma_ack_cm_event(cm_event);
-rdma_destroy_id(rdma->cm_id);
-rdma->cm_id = NULL;
 goto err_rdma_source_connect;
 }
 
@@ -2328,8 +2324,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error 
**errp)
 perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
 ERROR(errp, "connecting to destination!");
 rdma_ack_cm_event(cm_event);
-rdma_destroy_id(rdma->cm_id);
-rdma->cm_id = NULL;
 goto err_rdma_source_connect;
 }
 rdma->connected = true;
-- 
1.7.1

[Qemu-devel] [PULL for v2.3 01/01] seccomp: update libseccomp version and remove arch restriction

2015-03-25 Thread Eduardo Otubo

Libseccomp version updated to 2.2.0 and arch restriction to x86/x86_64
is now removed. It's supposed to work on armv7l as well.

Related bug: https://bugs.launchpad.net/qemu/+bug/1363641

Signed-off-by: Eduardo Otubo 
---
 configure | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/configure b/configure
index 589798e..cbe6495 100755
--- a/configure
+++ b/configure
@@ -1848,14 +1848,13 @@ fi
 # libseccomp check
 
 if test "$seccomp" != "no" ; then
-if test "$cpu" = "i386" || test "$cpu" = "x86_64" &&
-$pkg_config --atleast-version=2.1.1 libseccomp; then
+if $pkg_config --atleast-version=2.2.0 libseccomp; then
 libs_softmmu="$libs_softmmu `$pkg_config --libs libseccomp`"
 QEMU_CFLAGS="$QEMU_CFLAGS `$pkg_config --cflags libseccomp`"
seccomp="yes"
 else
if test "$seccomp" = "yes"; then
-feature_not_found "libseccomp" "Install libseccomp devel >= 2.1.1"
+feature_not_found "libseccomp" "Install libseccomp devel >= 2.2.0"
fi
seccomp="no"
 fi
-- 
1.9.1

Re: [Qemu-devel] [v3][PATCH 2/2] libxl: introduce gfx_passthru_kind

2015-03-25 Thread Ian Campbell

On Wed, 2015-03-25 at 09:10 +0800, Chen, Tiejun wrote:

> +But when given as a string the B option describes the type
> +of device to enable. Not this behavior is only supported with upstream

"Note" and "the upstream..."

> +=item "igd"
> +
> +Enables graphics device PCI passthrough but force set the type of device
> +with the Intel Graphics Device.

"but force set the type" doesn't scan very well. "... forcing the type
of device to Intel Graphics Device" works I think.

> I understand what you mean but that table just includes IGDs existed on 
> BDW and HSW. Because in the case of qemu upstream we're just covering 
> these platforms, and with our discussion we don't have any plan to add 
> those legacy platforms in the future. But qemu-xen-traditional still 
> covers those platforms. So I'm afraid its not good to check this with 
> that table as well.

Hrm, OK. I suppose we can live with autodetect and igd both meaning igd
and whoever adds a new type will have to remember to add a check for
qemu-trad then.

Ian.

[Qemu-devel] [PULL for v2.3 00/01] seccomp branch queue

2015-03-25 Thread Eduardo Otubo

The following changes since commit 054903a832b865eb5432d79b5c9d1e1ff31b58d7:

  Update version for v2.3.0-rc1 release (2015-03-24 16:34:16 +)

are available in the git repository at:

  git://github.com/otubo/qemu.git tags/pull-seccomp-20150325

for you to fetch changes up to 8e27fc200457e3f2473d0069263774d4ba17bd85:

  seccomp: update libseccomp version and remove arch restriction (2015-03-25 
11:03:27 +0100)


seccomp: update libseccomp version and remove arch restriction

Eduardo Otubo (1):
  seccomp: update libseccomp version and remove arch restriction

 configure | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

-- 
1.9.1

Re: [Qemu-devel] [RESEND PATCH v4 6/6] acpi: Add hardware implementation for memory hot unplug

2015-03-25 Thread Igor Mammedov

On Tue, 24 Mar 2015 18:48:02 +0800
Zhu Guihua  wrote:

> 
> On 03/24/2015 06:31 PM, Igor Mammedov wrote:
> > On Tue, 24 Mar 2015 17:38:53 +0800
> > Zhu Guihua  wrote:
> >
> >> On 03/16/2015 10:59 PM, Igor Mammedov wrote:
> >>> On Mon, 16 Mar 2015 16:58:18 +0800
> >>> Zhu Guihua  wrote:
> >>>
>  This patch adds a new bit to memory hotplug IO port indicating that
> >>> actually bit was added in 2/6 where is_removing had been added.
> >>>
>  EJ0 has been evaluated by guest OS. And call pc-dimm unplug cb to do
>  the real removal.
> 
>  Signed-off-by: Zhu Guihua 
>  ---
> docs/specs/acpi_mem_hotplug.txt   | 11 +--
> hw/acpi/memory_hotplug.c  | 21 +++--
> hw/core/qdev.c|  2 +-
> hw/i386/acpi-build.c  |  9 +
> hw/i386/acpi-dsdt-mem-hotplug.dsl | 10 ++
> include/hw/acpi/pc-hotplug.h  |  2 ++
> include/hw/qdev-core.h|  1 +
> trace-events  |  1 +
> 8 files changed, 52 insertions(+), 5 deletions(-)
> 
>  diff --git a/docs/specs/acpi_mem_hotplug.txt 
>  b/docs/specs/acpi_mem_hotplug.txt
>  index 1290994..85cd4b8 100644
>  --- a/docs/specs/acpi_mem_hotplug.txt
>  +++ b/docs/specs/acpi_mem_hotplug.txt
>  @@ -19,7 +19,9 @@ Memory hot-plug interface (IO port 0xa00-0xa17, 1-4 
>  byte access):
>   1: Device insert event, used to distinguish device for 
>  which
>  no device check event to OSPM was issued.
>  It's valid only when bit 1 is set.
>  -  2-7: reserved and should be ignored by OSPM
>  +  2: Device remove event, used to distinguish device for 
>  which
>  + no device check event to OSPM was issued.
>  +  3-7: reserved and should be ignored by OSPM
>   [0x15-0x17] reserved
> 
>   write access:
>  @@ -35,7 +37,12 @@ Memory hot-plug interface (IO port 0xa00-0xa17, 1-4 
>  byte access):
>   1: if set to 1 clears device insert event, set by OSPM
>  after it has emitted device check event for the
>  selected memory device
>  -  2-7: reserved, OSPM must clear them before writing to 
>  register
>  +  2: if set to 1 clears device remove event, set by OSPM
>  + after it has emitted device check event for the
>  + selected memory device. if guest fails to eject 
>  device, it
>  + should send OST event about it and forget about device
>  + removal.
>  +  3-7: reserved, OSPM must clear them before writing to 
>  register
> 
> Selecting memory device slot beyond present range has no effect on 
>  platform:
>    - write accesses to memory hot-plug registers not documented above 
>  are
>  diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c
>  index 687b2f1..d6b8c89 100644
>  --- a/hw/acpi/memory_hotplug.c
>  +++ b/hw/acpi/memory_hotplug.c
>  @@ -2,6 +2,7 @@
> #include "hw/acpi/pc-hotplug.h"
> #include "hw/mem/pc-dimm.h"
> #include "hw/boards.h"
>  +#include "hw/qdev-core.h"
> #include "trace.h"
> #include "qapi-event.h"
> 
>  @@ -91,6 +92,8 @@ static void acpi_memory_hotplug_write(void *opaque, 
>  hwaddr addr, uint64_t data,
> MemHotplugState *mem_st = opaque;
> MemStatus *mdev;
> ACPIOSTInfo *info;
>  +DeviceState *dev = NULL;
>  +HotplugHandler *hotplug_ctrl = NULL;
> 
> if (!mem_st->dev_count) {
> return;
>  @@ -122,19 +125,33 @@ static void acpi_memory_hotplug_write(void 
>  *opaque, hwaddr addr, uint64_t data,
> mdev = &mem_st->devs[mem_st->selector];
> mdev->ost_status = data;
> trace_mhp_acpi_write_ost_status(mem_st->selector, 
>  mdev->ost_status);
>  -/* TODO: implement memory removal on guest signal */
> 
> info = acpi_memory_device_status(mem_st->selector, mdev);
> qapi_event_send_acpi_device_ost(info, &error_abort);
> qapi_free_ACPIOSTInfo(info);
> break;
>  -case 0x14:
>  +case 0x14: /* set is_* fields */
> mdev = &mem_st->devs[mem_st->selector];
> if (data & 2) { /* clear insert event */
> mdev->is_inserting  = false;
> trace_mhp_acpi_clear_insert_evt(mem_st->selector);
>  +} else if (data & 4) { /* request removal of device */
> >>> fix comment to match docs above.
> >>>
>  +mdev->is_removing = false;
>  +trace

Re: [Qemu-devel] [PATCH RFC for-2.3 1/1] block: New command line option --no-format-probing

2015-03-25 Thread Paolo Bonzini

On 25/03/2015 09:10, Markus Armbruster wrote:
>> Based on my reading of the code, libvirt will actually ignore the
>> allow_disk_format_probing setting, and not do anything about the format
>> when driving such an old QEMU.  By contrast, if you specify a format and
>> libvirt invokes an old qemu-nbd without --format, libvirt fails hard.
>> That's already CVE worthy, isn't it?
>>
>> So I think an option like this is premature.  libvirt should _first of
>> all_ ensure that it completely abides by the allow_disk_format_probing
>> setting, including refusing to drive old QEMUs when format probing is
>> disabled.  Once libvirt is consistent within itself, we can talk about
>> what help QEMU can provide.
> 
> If we had a "no format probing" option in qemu-nbd (not in my patch, but
> that's a flaw in the patch, not the idea), then libvirt developers
> would've put it to use right away, and then their insecure usage
> would've failed hard, making it immediately obvious.

It wouldn't have fixed the bug in _old QEMU without -drive_ (and thus
without the option!), where allow_disk_format_probing does nothing.
Which is a vulnerability!  I noticed now that Eric is not CCed, adding him.

qemu-nbd invocations always require an explicit disk format in libvirt,
and fail hard on old qemu-nbd.

In the end we have:

- one bug for old QEMUs, so libvirt would not use the option

- one case where libvirt is stricter and requires a format, so the
option does not add anything

> I agree secure usage is currently is too hard for casual command line
> use.  Unfortunately, it has proven too hard even for sophisticated
> programmatic users like libvirt.

I disagree.  Libvirt has been doing well.  Hard---yes; but not too hard.

> When we discussed security issues with probing last year, my first line
> of defense was "the default should be secure".  Overwhelming opposition
> from the backward compatibility camp forced me to retreat from that line
> to my second line of defense "secure shouldn't be hard".  That line
> needs to be held at all costs :)

I agree.  Making the default secure is difficult because the old-style
options (-hda, -cdrom, etc.) are still good for casual use.  When I'm
boot-testing 20 different QEMUs to check that a series is bisectable,
it's generally okay to use IDE disks and cache=writeback, and it also
shouldn't matter if my image is raw or qcow2.

And making "secure" as easy as "--security on" is a worthwhile goal indeed.

That said, I'm not sure it's attainable.  For example, most downloadable
images are provided in qcow2 format.  The question then is: if users
can't be bothered to use -drive format=raw all the time, why would they
be bothered to use -drive format=qcow2 instead?

Users want security _and_ DWIM.  Anvil, meet hammer.

>> But I still
>> maintain that for libvirt this is basically security theater, and the
>> priorities are others.
> 
> To be honest, I find your use of "security theater" offensive.
> 
> I'd readily accept the moniker if the feature would provide libvirt
> developers an excuse not to fix their bugs, or reduce the incentives to
> fix their bugs.

I say it again: libvirt should first be consistent within itself.
Either it should drop support for old QEMUs, or it should be audited to
ensure that old QEMUs are invoked securely.

Once old QEMUs are deemed okay, we can add smarties to new QEMUs.  If
you don't do it in this order, the smarties indeed reduce the incentive
to fix bugs.

Also, I'm calling it security theater because on the surface the option
seems to provide a mitigation strategy, but in the end doesn't provide
much.  Because...

> But the feature in fact does the opposite: it makes such bugs blatantly
> obvious and release-blocker-painful.

... they are only obvious inasmuch as there is a testcase for "no
explicitly specified format".  Otherwise, you have a hidden failure for
new QEMU and a vulnerability for old QEMU, so you don't gain anything.

Problem is, the same person write code and testcases.  If you are smart
enough to add testcases that catch possible vulnerability, you probably
are already catching it in the code.  (And the testcase is a way of
praising yourself for your forethought. :)).

Paolo

Re: [Qemu-devel] [PATCH] nbd: Fix up comment after commit e140177

2015-03-25 Thread Paolo Bonzini



On 25/03/2015 09:18, Markus Armbruster wrote:
> Signed-off-by: Markus Armbruster 
> ---
>  blockdev-nbd.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/blockdev-nbd.c b/blockdev-nbd.c
> index b29e456..85cda4c 100644
> --- a/blockdev-nbd.c
> +++ b/blockdev-nbd.c
> @@ -47,8 +47,9 @@ void qmp_nbd_server_start(SocketAddress *addr, Error **errp)
>  }
>  }
>  
> -/* Hook into the BlockDriverState notifiers to close the export when
> - * the file is closed.
> +/*
> + * Hook into the BlockBackend notifiers to close the export when the
> + * backend is closed.
>   */
>  typedef struct NBDCloseNotifier {
>  Notifier n;
> 

Applied, thanks.

Paolo

Re: [Qemu-devel] [PATCH] rdma: Fix cleanup in error paths

2015-03-25 Thread Dr. David Alan Gilbert

> As part of commit e325b49a320b493cc5d69e263751ff716dc458fe,
> order in which resources are destroyed was changed for fixing
> a seg fault. Due to this change, CQ will never get destroyed as
> CQ should be destroyed after QP destruction. Seg fault is caused
> improper cleanup when connection fails. Fixing cleanup after
> connection failure and order in which resources are destroyed
> in qemu_rdma_cleanup() routine.

Do you have a test case that triggers the seg fault?

> Signed-off-by: Meghana Cheripady 
> Signed-off-by: Padmanabh Ratnakar 
> ---
>  migration/rdma.c |   22 --
>  1 files changed, 8 insertions(+), 14 deletions(-)
> 
> diff --git a/migration/rdma.c b/migration/rdma.c
> index e6c3a67..77e3444 100644
> --- a/migration/rdma.c
> +++ b/migration/rdma.c
> @@ -2194,6 +2194,10 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
>  }
>  }
>  
> +if (rdma->qp) {
> +rdma_destroy_qp(rdma->cm_id);
> +rdma->qp = NULL;
> +}

OK, that makes sense, the manpage for rdma_destroy_qp says
'Users must destroy any QP associated with an rdma_cm_id before destroying the 
ID.'
so it's probably good to have this early.

>  if (rdma->cq) {
>  ibv_destroy_cq(rdma->cq);
>  rdma->cq = NULL;
> @@ -2206,18 +2210,14 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
>  ibv_dealloc_pd(rdma->pd);
>  rdma->pd = NULL;
>  }
> -if (rdma->listen_id) {
> -rdma_destroy_id(rdma->listen_id);
> -rdma->listen_id = NULL;
> -}
>  if (rdma->cm_id) {
> -if (rdma->qp) {
> -rdma_destroy_qp(rdma->cm_id);
> -rdma->qp = NULL;
> -}
>  rdma_destroy_id(rdma->cm_id);
>  rdma->cm_id = NULL;
>  }
> +if (rdma->listen_id) {
> +rdma_destroy_id(rdma->listen_id);
> +rdma->listen_id = NULL;
> +}

Can you explain this reorder - why is the order of listen_id and cm_id 
important?
is that a failure on the destination?

>  if (rdma->channel) {
>  rdma_destroy_event_channel(rdma->channel);
>  rdma->channel = NULL;
> @@ -2309,8 +2309,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error 
> **errp)
>  if (ret) {
>  perror("rdma_connect");
>  ERROR(errp, "connecting to destination!");
> -rdma_destroy_id(rdma->cm_id);
> -rdma->cm_id = NULL;
>  goto err_rdma_source_connect;
>  }
>  
> @@ -2319,8 +2317,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error 
> **errp)
>  perror("rdma_get_cm_event after rdma_connect");
>  ERROR(errp, "connecting to destination!");
>  rdma_ack_cm_event(cm_event);
> -rdma_destroy_id(rdma->cm_id);
> -rdma->cm_id = NULL;
>  goto err_rdma_source_connect;
>  }
>  
> @@ -2328,8 +2324,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error 
> **errp)
>  perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
>  ERROR(errp, "connecting to destination!");
>  rdma_ack_cm_event(cm_event);
> -rdma_destroy_id(rdma->cm_id);
> -rdma->cm_id = NULL;
>  goto err_rdma_source_connect;
>  }
>  rdma->connected = true;
> -- 
> 1.7.1

Dave
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK

Re: [Qemu-devel] [RESEND PATCH v4 6/6] acpi: Add hardware implementation for memory hot unplug

2015-03-25 Thread Igor Mammedov

On Wed, 25 Mar 2015 14:13:23 +0800
Zhu Guihua  wrote:

> 
> On 03/24/2015 06:26 PM, Igor Mammedov wrote:
> > On Tue, 24 Mar 2015 17:34:29 +0800
> > Zhu Guihua  wrote:
> >
> >> On 03/23/2015 08:47 PM, Igor Mammedov wrote:
> >>> On Mon, 23 Mar 2015 18:59:28 +0800
> >>> Zhu Guihua  wrote:
> >>>
>  On 03/16/2015 10:59 PM, Igor Mammedov wrote:
>  [...]
> > 
> > diff --git a/hw/i386/acpi-dsdt-mem-hotplug.dsl 
> > b/hw/i386/acpi-dsdt-mem-hotplug.dsl
> > index 1e9ec39..ef847e2 100644
> > --- a/hw/i386/acpi-dsdt-mem-hotplug.dsl
> > +++ b/hw/i386/acpi-dsdt-mem-hotplug.dsl
> > @@ -29,6 +29,7 @@
> > External(MEMORY_SLOT_PROXIMITY, FieldUnitObj) // read 
> > only
> > External(MEMORY_SLOT_ENABLED, FieldUnitObj) // 1 if 
> > enabled, read only
> > External(MEMORY_SLOT_INSERT_EVENT, FieldUnitObj) // 
> > (read) 1 if has a insert event. (write) 1 to clear event
> > +External(MEMORY_SLOT_REMOVE_EVENT, FieldUnitObj) // (read) 
> > 1 if has a remove event. (write) 1 to clear event
> > External(MEMORY_SLOT_SLECTOR, FieldUnitObj) // DIMM 
> > selector, write only
> > External(MEMORY_SLOT_OST_EVENT, FieldUnitObj) // _OST 
> > event code, write only
> > External(MEMORY_SLOT_OST_STATUS, FieldUnitObj) // _OST 
> > status code, write only
> > @@ -55,6 +56,8 @@
> > If (LEqual(MEMORY_SLOT_INSERT_EVENT, One)) { // 
> > Memory device needs check
> > MEMORY_SLOT_NOTIFY_METHOD(Local0, 1)
> > Store(1, MEMORY_SLOT_INSERT_EVENT)
> > +} Elseif (LEqual(MEMORY_SLOT_REMOVE_EVENT, One)) { 
> > // Ejection request
> > +MEMORY_SLOT_NOTIFY_METHOD(Local0, 3)
> > clear removing field here.
>  You mean clear remove event here?
> >>> yes
> >> I tested this method, clear remove event here will lead to guest
> >> kernel panic.
> > it shouldn't cause panic if it only clears flag in QEMU
> > (that's what it should do).
> >
> >
> >> }
> >> // TODO: handle memory eject request
> >> Add(Local0, One, Local0) // goto next DIMM
> >> @@ -156,5 +159,12 @@
> >> Store(Arg2, MEMORY_SLOT_OST_STATUS)
> >> Release(MEMORY_SLOT_LOCK)
> >> }
> >> +
> >> +Method(MEMORY_SLOT_EJECT_METHOD, 2) {
> >> +Acquire(MEMORY_SLOT_LOCK, 0x)
> >> +Store(ToInteger(Arg0), MEMORY_SLOT_SLECTOR) // select 
> >> DIMM
> >> +Store(One, MEMORY_SLOT_REMOVE_EVENT)
> > redo it using enabled field. Otherwise it could cause guest/QEMU crash:
> >
> > - if 1st memory was asked to be removed
> > - then OSPM processes it but has not called _EJ0 yet leaving is_removed 
> > set
> > - then QEMU adds/removes another DIMM triggering slots scan
> >   which would issue second Notify(remove) since is_removed is still 
> > set
> > - as result it will cause failure in OSPM or in QEMU if OSPM issues 
> > double EJ0()
> >
>  If OSPM processed the ejection request but not called _EJ0, the device
>  will still be present in qemu,
>  we must handle this.
> >>> There is nothing to handle in this case, if OSPM hasn't called _EJ0 then
> >>> nothing happens and device stays in QEMU.
> >>>
>  So I think OSPM issues double EJ0 maybe reasonable
>  in this situation.
>  What's your opinion?
> >>> the first _EJ0 must do ejection, as for the second I think it should be 
> >>> NOP.
> >> So we should judge the enabled field to check whether the device is
> >> present before
> >> issuing Notify(remove)?
> > I wouldn't check if device is present.
> > I'd unconditionally clear it and make sure on QEMU side that
> > operation is NOP if device is not present.
> 
> I'm sorry that I have not fully understood your meaning about
> 'redo it using enabled field'.  How to do it?
> 
> MEMORY_SLOT_ENABLED is read only, how can I use it to handle EJ0?
it is reserved in write side if you look at spec,
but appears that we can't reuse MEMORY_SLOT_ENABLED due to bug in
already released QEMU versions.
See another reply where I explained it in more details and suggested
how QSPM<->QEMU protocol should be amended.

> 
> Thanks,
> Zhu
> 
> 
>

Re: [Qemu-devel] [PATCH] migration: flush the bdrv before stopping VM

2015-03-25 Thread Juan Quintela

"Li, Liang Z"  wrote:
>> >> > Right now, we don't have an interface to detect that cases and got
>> >> > back to the iterative stage.
>> >>
>> >> How about go back to the iterative stage when detect that the
>> >> pending_size is larger Than max_size, like this:
>> >>
>> >> +/* do flush here is aimed to shorten the VM downtime,
>> >> + * bdrv_flush_all is a time consuming operation
>> >> + * when the guest has done some file writing */
>> >> +bdrv_flush_all();
>> >> +pending_size = qemu_savevm_state_pending(s->file, 
>> >> max_size);
>> >> +if (pending_size && pending_size >= max_size) {
>> >> +qemu_mutex_unlock_iothread();
>> >> +continue;
>> >> +}
>> >>   ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
>> >>   if (ret >= 0) {
>> >>   qemu_file_set_rate_limit(s->file, INT64_MAX);
>> >>
>> >> and this is quite simple.
>> >
>> > Yes, but it is too simple. If you hold all the locks during
>> > bdrv_flush_all(), your VM will effectively stop as soon as it performs
>> > the next I/O access, so you don't win much. And you still don't have a
>> > timeout for cases where the flush takes really long.
>> 
>> This is probably better than what we had now (basically we are "meassuring"
>> after bdrv_flush_all how much the amount of dirty memory has changed,
>> and return to iterative stage if it took too much.  A timeout would be better
>> anyways.  And an interface te start the synchronization sooner
>> asynchronously would be also good.
>> 
>> Notice that my understanding is that any proper fix for this is 2.4 material.
>
> Then, how to deal with this issue in 2.3, leave it here? or make an
> incomplete fix like I do above?

I think it is better to leave it here for 2.3. With a patch like this
one, we improve in one load and we got worse in a different load (depens
a lot in the ratio of dirtying memory vs disk).  I have no data which
load is more common, so I prefer to be conservative so late in the
cycle.  What do you think?

Later, Juan.


>
> Liang
>
>> Thanks, Juan.

Re: [Qemu-devel] [PATCH] migration: flush the bdrv before stopping VM

2015-03-25 Thread Kevin Wolf

Am 25.03.2015 um 11:50 hat Juan Quintela geschrieben:
> "Li, Liang Z"  wrote:
> >> >> > Right now, we don't have an interface to detect that cases and got
> >> >> > back to the iterative stage.
> >> >>
> >> >> How about go back to the iterative stage when detect that the
> >> >> pending_size is larger Than max_size, like this:
> >> >>
> >> >> +/* do flush here is aimed to shorten the VM downtime,
> >> >> + * bdrv_flush_all is a time consuming operation
> >> >> + * when the guest has done some file writing */
> >> >> +bdrv_flush_all();
> >> >> +pending_size = qemu_savevm_state_pending(s->file, 
> >> >> max_size);
> >> >> +if (pending_size && pending_size >= max_size) {
> >> >> +qemu_mutex_unlock_iothread();
> >> >> +continue;
> >> >> +}
> >> >>   ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
> >> >>   if (ret >= 0) {
> >> >>   qemu_file_set_rate_limit(s->file, INT64_MAX);
> >> >>
> >> >> and this is quite simple.
> >> >
> >> > Yes, but it is too simple. If you hold all the locks during
> >> > bdrv_flush_all(), your VM will effectively stop as soon as it performs
> >> > the next I/O access, so you don't win much. And you still don't have a
> >> > timeout for cases where the flush takes really long.
> >> 
> >> This is probably better than what we had now (basically we are "meassuring"
> >> after bdrv_flush_all how much the amount of dirty memory has changed,
> >> and return to iterative stage if it took too much.  A timeout would be 
> >> better
> >> anyways.  And an interface te start the synchronization sooner
> >> asynchronously would be also good.
> >> 
> >> Notice that my understanding is that any proper fix for this is 2.4 
> >> material.
> >
> > Then, how to deal with this issue in 2.3, leave it here? or make an
> > incomplete fix like I do above?
> 
> I think it is better to leave it here for 2.3. With a patch like this
> one, we improve in one load and we got worse in a different load (depens
> a lot in the ratio of dirtying memory vs disk).  I have no data which
> load is more common, so I prefer to be conservative so late in the
> cycle.  What do you think?

I agree, it's too late in the release cycle for such a change.

Kevin

Re: [Qemu-devel] RFC: memory API changes

2015-03-25 Thread Igor Mammedov

On Mon, 23 Mar 2015 17:32:21 +0100
Andreas Färber  wrote:

> Am 23.03.2015 um 16:11 schrieb Peter Maydell:
> > On 23 March 2015 at 14:39, Paolo Bonzini  wrote:
> >> On 23/03/2015 13:24, Peter Maydell wrote:
> >>>  * cpu_physical_memory_rw are obsolete and should be replaced
> >>>with uses of the as_* functions -- we should at least note
> >>>this in the header file. (Can't do this as an automated change
> >>>really because the correct AS to use is callsite dependent.)
> >>
> >> All users that should _not_ be using address_space_memory have been
> >> already changed to address_space_rw, or should have, so it can be done
> >> automatically.  Same for cpu_physical_memory_map/unmap, BTW.
> > 
> > Hmm. Checking a few, I notice that for instance the kvm-all.c
> > cpu_physical_memory_rw() should probably be using cpu->as.
> [snip]
> 
> Igor, does that impact our APIC discussion?
it should work but it would still be the same as we do now
i.e. APIC MMIO regions would be mapped overlapped in 
address_space_memory (see 09daed84). But dispatch wouldn't
go through cpu->as listener (it's initialized only for TCG).

My hope was that with cpu->as we can get rid of 'current' usage
in APIC code. But even without that it would be nice cleanup
that removes not needed anymore icc bus and moves default APIC
mapping from board to a place where it belongs (i.e. into CPU).

> 
> Regards,
> Andreas
>

Re: [Qemu-devel] [PATCH] Avoid crashing on multiple -incoming

2015-03-25 Thread Juan Quintela

"Dr. David Alan Gilbert (git)"  wrote:
> From: "Dr. David Alan Gilbert" 
>
> Passing multiple -incoming options used to crash qemu (due to
> an invalid state transition incoming->incoming).  Instead we now
> take the last -incoming option, e.g.:
>
> qemu-system-x86_64 -nographic -incoming tcp:: -incoming defer
>
> ends up doing the defer.
>
> Signed-off-by: Dr. David Alan Gilbert 

Applied, thanks.

Re: [Qemu-devel] [PATCH 0/3] hw/arm: add Fixed Virtual Platform VE support

2015-03-25 Thread Peter Maydell

On 25 March 2015 at 01:43, Sergey Fedorov  wrote:
> On 24.03.2015 18:30, Sergey Fedorov wrote:
>> So if I understand you correctly, it would be suitable to implement a
>> model like Juno ARM Development Platform in order to get AArch64 VE
>> model with SMP support in system mode?
>
> As you can guess, my objective is to get a model based on Versatile
> Express with Cortex-A57 CPU which could be used to run an SMP Linux
> kernel on system mode QEMU. So I decided to use the same model as ARM
> FastModels provides. I could get ARMv8-A Foundation Platform model as
> well, but it also uses spin table boot method. Seems there's no much choice.

Basically I'd rather we modelled some bit of real hardware.
I don't see the point in providing a model of somebody
else's model -- if you don't care about matching real
hardware we have 'virt' already. So I don't really want
to model either the Foundation Platform or the AEM,
unless there's a really compelling reason to do that.

thanks
-- PMM

Re: [Qemu-devel] [PATCH for-2.3] arm: memory: Replace memory_region_init_ram with memory_region_allocate_system_memory

2015-03-25 Thread Dirk Müller

Hi Peter,

>> So why do only the ARM boards get fixes here?
> ...ah, I see you've submitted patches for other boards too,
> you just didn't put them together into a single series. Sorry.

Some other architectures were already fixed (x86 and ppc) and I'll be
working through the other arches shortly. I've tested this on vexpress
and virt.

Greetings,
Dirk

Re: [Qemu-devel] [PATCH] rdma: Fix cleanup in error paths

2015-03-25 Thread Juan Quintela

Padmanabh Ratnakar  wrote:
> As part of commit e325b49a320b493cc5d69e263751ff716dc458fe,
> order in which resources are destroyed was changed for fixing
> a seg fault. Due to this change, CQ will never get destroyed as
> CQ should be destroyed after QP destruction. Seg fault is caused
> improper cleanup when connection fails. Fixing cleanup after
> connection failure and order in which resources are destroyed
> in qemu_rdma_cleanup() routine.
>
> Signed-off-by: Meghana Cheripady 
> Signed-off-by: Padmanabh Ratnakar 
> ---
>  migration/rdma.c |   22 --
>  1 files changed, 8 insertions(+), 14 deletions(-)
>
> diff --git a/migration/rdma.c b/migration/rdma.c
> index e6c3a67..77e3444 100644
> --- a/migration/rdma.c
> +++ b/migration/rdma.c
> @@ -2194,6 +2194,10 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
>  }
>  }
>  
> +if (rdma->qp) {
> +rdma_destroy_qp(rdma->cm_id);
> +rdma->qp = NULL;
> +}

Agreed with this change.


>  if (rdma->cq) {
>  ibv_destroy_cq(rdma->cq);
>  rdma->cq = NULL;
> @@ -2206,18 +2210,14 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
>  ibv_dealloc_pd(rdma->pd);
>  rdma->pd = NULL;
>  }
> -if (rdma->listen_id) {
> -rdma_destroy_id(rdma->listen_id);
> -rdma->listen_id = NULL;
> -}

I am not sure about this one.  We have (receiving side)

create listen_id
cm_id = rdma_accept(listen_id)

So, it looks better to do the cleanup in the other order, but notice
that with current code, I think this don't matter (i.e. we can't call
qemu_rdma_cleanup() from other place that accept).


>  if (rdma->cm_id) {
> -if (rdma->qp) {
> -rdma_destroy_qp(rdma->cm_id);
> -rdma->qp = NULL;
> -}

This is the "companion" of the 1st chunk, and I agree with the c"

>  rdma_destroy_id(rdma->cm_id);
>  rdma->cm_id = NULL;
>  }
> +if (rdma->listen_id) {
> +rdma_destroy_id(rdma->listen_id);
> +rdma->listen_id = NULL;
> +}

Companion of the second chunk that I can't understand why it is moved.

>  if (rdma->channel) {
>  rdma_destroy_event_channel(rdma->channel);
>  rdma->channel = NULL;
> @@ -2309,8 +2309,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error 
> **errp)
>  if (ret) {
>  perror("rdma_connect");
>  ERROR(errp, "connecting to destination!");
> -rdma_destroy_id(rdma->cm_id);
> -rdma->cm_id = NULL;
>  goto err_rdma_source_connect;
>  }

This other three are nice, remove code, and make it correct with the
case that qp has to be removed first.

So, should we drop the listen_id part, or there is a reason for it?

Later, Juan.


> @@ -2319,8 +2317,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error 
> **errp)
>  perror("rdma_get_cm_event after rdma_connect");
>  ERROR(errp, "connecting to destination!");
>  rdma_ack_cm_event(cm_event);
> -rdma_destroy_id(rdma->cm_id);
> -rdma->cm_id = NULL;
>  goto err_rdma_source_connect;
>  }
>  
> @@ -2328,8 +2324,6 @@ static int qemu_rdma_connect(RDMAContext *rdma, Error 
> **errp)
>  perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
>  ERROR(errp, "connecting to destination!");
>  rdma_ack_cm_event(cm_event);
> -rdma_destroy_id(rdma->cm_id);
> -rdma->cm_id = NULL;
>  goto err_rdma_source_connect;
>  }
>  rdma->connected = true;

Re: [Qemu-devel] [PATCH 01/15] migration: remove last_sent_block from save_page_header

2015-03-25 Thread Dr. David Alan Gilbert

* Liang Li (liang.z...@intel.com) wrote:
> From: Juan Quintela 
> 
> Compression code (still not on tree) want to call this funtion from
> outside the migration thread, so we can't write to last_sent_block.
> 
> Instead of reverting full patch:
> 
> [PULL 07/11] save_block_hdr: we can recalculate
> 
> Just revert the parts that touch last_sent_block.
> 
> Signed-off-by: Juan Quintela 

Reviewed-by: Dr. David Alan Gilbert 

> ---
>  arch_init.c | 12 ++--
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/arch_init.c b/arch_init.c
> index fcfa328..4c8fcee 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -332,19 +332,14 @@ static size_t save_page_header(QEMUFile *f, RAMBlock 
> *block, ram_addr_t offset)
>  {
>  size_t size;
>  
> -if (block == last_sent_block) {
> -offset |= RAM_SAVE_FLAG_CONTINUE;
> -}
> -
>  qemu_put_be64(f, offset);
>  size = 8;
>  
> -if (block != last_sent_block) {
> +if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
>  qemu_put_byte(f, strlen(block->idstr));
>  qemu_put_buffer(f, (uint8_t *)block->idstr,
>  strlen(block->idstr));
>  size += 1 + strlen(block->idstr);
> -last_sent_block = block;
>  }
>  return size;
>  }
> @@ -644,6 +639,10 @@ static int ram_save_page(QEMUFile *f, RAMBlock* block, 
> ram_addr_t offset,
>  XBZRLE_cache_lock();
>  
>  current_addr = block->offset + offset;
> +
> +if (block == last_sent_block) {
> +offset |= RAM_SAVE_FLAG_CONTINUE;
> +}
>  if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
>  if (ret != RAM_SAVE_CONTROL_DELAYED) {
>  if (bytes_xmit > 0) {
> @@ -739,6 +738,7 @@ static int ram_find_and_save_block(QEMUFile *f, bool 
> last_stage,
>  
>  /* if page is unmodified, continue to the next */
>  if (pages > 0) {
> +last_sent_block = block;
>  break;
>  }
>  }
> -- 
> 1.9.1
> 
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK

Re: [Qemu-devel] AioContext of block jobs

2015-03-25 Thread Paolo Bonzini

On 25/03/2015 09:31, Fam Zheng wrote:
> I was looking at block jobs' AioContext and realized that the block job
> coroutines are actually started in main loop.
> 
> I'm confused because 5a7e7a0bad17c96e03f55ed7019e2d7545e21a96 and friends in
> the series [1] seem to move the coroutines to the BDS's iothreads, but it
> didn't do that.
> 
> (Although after the first block_job_yield or sleep, the coroutines ARE resumed
> in the right AioContext.)
> 
> Why is it safe to start the jobs from the main thread where QMP command is
> handled? I see no guarantee that the jobs won't access BDS before first yield
> but after releasing the AioContext.
> 
> Is this a bug?

It's okay because the coroutine is started while the main thread is
holding the AioContext.  So the first "stint" of the coroutine, until
the first yield, is done in the main thread but still with the
AioContext held.

After the first yield, the main thread releases the AioContext, the
dataplane thread gets it back and the coroutine is moved to the
dataplane thread.

Paolo

Re: [Qemu-devel] Support for NetLogic XLP Processors

2015-03-25 Thread Duarte Silva

Hi guys,

could anybody help out? Is there a guide on how to implement new CPU's in QEMU 
(or that at least helps in that task) or on how to debug this kind of stuff?

Cheers,
Duarte

On Sunday 22 March 2015 11:13:37 Duarte Silva wrote:
> Hi guys,
> 
> I have been struggling to get some binaries compiled for NetLogic XLP
> processor to run under QEMU. I have tried a bunch of things (most going back
> and forth) and always get the following error message:
> 
> qemu: uncaught target signal 4 (Illegal instruction) - core dumped
> Illegal instruction
> 
> I tried to debug it using GDB but to no avail. Does anybody have ideas? I'm
> running QEMU 2.2.1.
> 
> Thanks for any help, cheers,
> Duarte

Re: [Qemu-devel] [Migration Bug? ] Occasionally, the content of VM's memory is inconsistent between Source and Destination of migration

2015-03-25 Thread zhanghailiang


On 2015/3/25 17:46, Dr. David Alan Gilbert wrote:

* zhanghailiang (zhang.zhanghaili...@huawei.com) wrote:

Hi all,

We found that, sometimes, the content of VM's memory is inconsistent between 
Source side and Destination side
when we check it just after finishing migration but before VM continue to Run.

We use a patch like bellow to find this issue, you can find it from affix,
and Steps to reproduce:

(1) Compile QEMU:
  ./configure --target-list=x86_64-softmmu  --extra-ldflags="-lssl" && make

(2) Command and output:
SRC: # x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu qemu64,-kvmclock 
-netdev tap,id=hn0-device virtio-net-pci,id=net-pci0,netdev=hn0 -boot c -drive 
file=/mnt/sdb/pure_IMG/sles/sles11_sp3.img,if=none,id=drive-virtio-disk0,cache=unsafe
 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -vnc 
:7 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio
(qemu) migrate tcp:192.168.3.8:3004
before saving ram complete
ff703f6889ab8701e4e040872d079a28
md_host : after saving ram complete
ff703f6889ab8701e4e040872d079a28

DST: # x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu qemu64,-kvmclock 
-netdev tap,id=hn0,vhost=on -device virtio-net-pci,id=net-pci0,netdev=hn0 -boot 
c -drive 
file=/mnt/sdb/pure_IMG/sles/sles11_sp3.img,if=none,id=drive-virtio-disk0,cache=unsafe
 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -vnc 
:7 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio 
-incoming tcp:0:3004
(qemu) QEMU_VM_SECTION_END, after loading ram
230e1e68ece9cd4e769630e1bcb5ddfb
md_host : after loading all vmstate
230e1e68ece9cd4e769630e1bcb5ddfb
md_host : after cpu_synchronize_all_post_init
230e1e68ece9cd4e769630e1bcb5ddfb

This happens occasionally, and it is more easy to reproduce when issue 
migration command during VM's startup time.

We have done further test and found that some pages has been dirtied but its 
corresponding migration_bitmap is not set.
We can't figure out which modules of QEMU has missed setting bitmap when dirty 
page of VM,
it is very difficult for us to trace all the actions of dirtying VM's pages.

Actually, the first time we found this problem was in the COLO FT development, 
and it triggered some strange issues in
VM which all pointed to the issue of inconsistent of VM's memory. (We have try 
to save all memory of VM to slave side every time
when do checkpoint in COLO FT, and everything will be OK.)

Is it OK for some pages that not transferred to destination when do migration ? 
Or is it a bug?


That does sound like a bug.
The only other explanation I have is that memory is being changed by a device 
emulation
that happens after the end of a saving the vm, or after loading the memory.  
That's
certainly possible - especially if a device (say networking) hasn't been 
properly
stopped.


This issue has blocked our COLO development... :(

Any help will be greatly appreciated!




Hi Dave，


I suggest:
1) Does it happen with devices other than virtio?
2) Strip the devices down - e.g. just run with serial and no video/usb


I will try this ...


3) Try doing the md5 comparison at the end of ram_save_complete


Yes, we have done this test at the end of ram_save_complete.


4) mprotect RAM after the ram_save_complete and see if anything faults.


I have tried, and there will be kvm error reports. like bellow:

KVM: entry failed, hardware error 0x7
EAX= EBX=e000 ECX=9578 EDX=434f
ESI=fc10 EDI=434f EBP= ESP=1fca
EIP=9594 EFL=00010246 [---Z-P-] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =0040 0400  9300
CS =f000 000f  9b00
SS =434f 000434f0  9300
DS =434f 000434f0  9300
FS =   9300
GS =   9300
LDT=   8200
TR =   8b00
GDT= 0002dcc8 0047
IDT=  
CR0=0010 CR2= CR3= CR4=
DR0= DR1= DR2= 
DR3=
DR6=0ff0 DR7=0400
EFER=
Code=c0 74 0f 66 b9 78 95 00 00 66 31 d2 66 31 c0 e9 47 e0 fb 90  90 fa fc 
66 c3 66 53 66 89 c3 66 e8 9d e8 ff ff 66 01 c3 66 89 d8 66 e8 40 e9 ff ff 66
ERROR: invalid runstate transition: 'internal-error' -> 'colo'


5) Can you trigger this with normal migration or just COLO?


Yes, we have reproduced it without COLO, just in normal migration


   I'm wondering if something is doing something on a running/paused/etc 
state
   change and isn't expecting the new COLO states.



Actually, we can also reproduce this with COLO, only in master side, we store 
all RAM to ram-cache, and also duplicate migration_bitmap,
and after a checkpoint, before VM continue to run, we compare cache memory and 
current memory, if they are different but whose
dirty bitmap is not set, we thought this page is missing tra

Re: [Qemu-devel] [Migration Bug? ] Occasionally, the content of VM's memory is inconsistent between Source and Destination of migration

2015-03-25 Thread zhanghailiang


On 2015/3/25 17:50, Juan Quintela wrote:

zhanghailiang  wrote:

Hi all,

We found that, sometimes, the content of VM's memory is inconsistent between 
Source side and Destination side
when we check it just after finishing migration but before VM continue to Run.

We use a patch like bellow to find this issue, you can find it from affix,
and Steps to reprduce:

(1) Compile QEMU:
  ./configure --target-list=x86_64-softmmu  --extra-ldflags="-lssl" && make

(2) Command and output:
SRC: # x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu qemu64,-kvmclock 
-netdev tap,id=hn0-device virtio-net-pci,id=net-pci0,netdev=hn0 -boot c -drive 
file=/mnt/sdb/pure_IMG/sles/sles11_sp3.img,if=none,id=drive-virtio-disk0,cache=unsafe
 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -vnc 
:7 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio


Could you try to reproduce:
- without vhost
- without virtio-net


Yes, with e1000, the problem is still exist, i will test it with dropping all 
the unnecessary device configure,
just as Dave's suggestion.


- cache=unsafe is going to give you trouble, but trouble should only
   happen after migration of pages have finished.

What kind of load were you having when reproducing this issue?
Just to confirm, you have been able to reproduce this without COLO
patches, right?



Yes, we reproduce this only in normal migration.


(qemu) migrate tcp:192.168.3.8:3004
before saving ram complete
ff703f6889ab8701e4e040872d079a28
md_host : after saving ram complete
ff703f6889ab8701e4e040872d079a28

DST: # x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu qemu64,-kvmclock 
-netdev tap,id=hn0,vhost=on -device virtio-net-pci,id=net-pci0,netdev=hn0 -boot 
c -drive 
file=/mnt/sdb/pure_IMG/sles/sles11_sp3.img,if=none,id=drive-virtio-disk0,cache=unsafe
 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -vnc 
:7 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio 
-incoming tcp:0:3004
(qemu) QEMU_VM_SECTION_END, after loading ram
230e1e68ece9cd4e769630e1bcb5ddfb
md_host : after loading all vmstate
230e1e68ece9cd4e769630e1bcb5ddfb
md_host : after cpu_synchronize_all_post_init
230e1e68ece9cd4e769630e1bcb5ddfb

This happens occasionally, and it is more easy to reproduce when issue 
migration command during VM's startup time.


OK, a couple of things.  Memory don't have to be exactly identical.
Virtio devices in particular do funny things on "post-load".  There
aren't warantees for that as far as I know, we should end with an
equivalent device state in memory.


We have done further test and found that some pages has been dirtied but its 
corresponding migration_bitmap is not set.
We can't figure out which modules of QEMU has missed setting bitmap when dirty 
page of VM,
it is very difficult for us to trace all the actions of dirtying VM's pages.


This seems to point to a bug in one of the devices.


Actually, the first time we found this problem was in the COLO FT development, 
and it triggered some strange issues in
VM which all pointed to the issue of inconsistent of VM's memory. (We have try 
to save all memory of VM to slave side every time
when do checkpoint in COLO FT, and everything will be OK.)

Is it OK for some pages that not transferred to destination when do migration ? 
Or is it a bug?


Pages transferred should be the same, after device state transmission is
when things could change.


This issue has blocked our COLO development... :(

Any help will be greatly appreciated!


Later, Juan.



Thanks,
zhanghailiang

--- a/savevm.c
+++ b/savevm.c
@@ -51,6 +51,26 @@
  #define ARP_PTYPE_IP 0x0800
  #define ARP_OP_REQUEST_REV 0x3

+#include "qemu/rcu_queue.h"
+#include 
+
+static void check_host_md5(void)
+{
+int i;
+unsigned char md[MD5_DIGEST_LENGTH];
+MD5_CTX ctx;
+RAMBlock *block = QLIST_FIRST_RCU(&ram_list.blocks);/* Only check 'pc.ram' 
block */
+
+MD5_Init(&ctx);
+MD5_Update(&ctx, (void *)block->host, block->used_length);
+MD5_Final(md, &ctx);
+printf("md_host : ");
+for(i = 0; i < MD5_DIGEST_LENGTH; i++) {
+fprintf(stderr, "%02x", md[i]);
+}
+fprintf(stderr, "\n");
+}
+
  static int announce_self_create(uint8_t *buf,
  uint8_t *mac_addr)
  {
@@ -741,7 +761,13 @@ void qemu_savevm_state_complete(QEMUFile *f)
  qemu_put_byte(f, QEMU_VM_SECTION_END);
  qemu_put_be32(f, se->section_id);

+printf("before saving %s complete\n", se->idstr);
+check_host_md5();
+
  ret = se->ops->save_live_complete(f, se->opaque);
+printf("after saving %s complete\n", se->idstr);
+check_host_md5();
+
  trace_savevm_section_end(se->idstr, se->section_id, ret);
  if (ret < 0) {
  qemu_file_set_error(f, ret);
@@ -1030,6 +1063,11 @@ int qemu_loadvm_state(QEMUFile *f)
  }

  ret = vmstate_load(f, le->se, le->version_id);
+

Re: [Qemu-devel] RFC: memory API changes

2015-03-25 Thread Paolo Bonzini



On 25/03/2015 00:41, Peter Maydell wrote:
> On 24 March 2015 at 20:00, Paolo Bonzini  wrote:
>> I agree with that.  I just want to keep ld/st*_phys _in addition_ as the
>> short forms of address_space_ld/st*, and keep ld/st*_phys instead of
>> address_space_ld/st* for those uses that have cs->as as the first argument.
> 
> ...but for ARM I want to be able to specify the memory
> attribute argument (and possibly also get the behaviour
> right on failure). So I definitely don't want the short
> forms for my cs->as uses.

You're free to move ARM to the longer versions, and/or to push the short
versions to all cpu.h files except ARM's.

> And it seems to me at best
> uncertain that anybody does, in the long run.

I disagree: most CPUs are in odd fixes/unmaintained state (so attributes
probably won't matter), and most don't even define an unassigned_access
callback (so result won't matter either).

>> The rationale is to evolve ld/st*_phys into CPU-specific accessors
>> paralleling the bus-specific accessors.
> 
> I don't think this is any harder starting from
> address_space_ld/st* than if we leave ld/st*_phys
> around.

It does cause unnecessary churn though.

Paolo

Re: [Qemu-devel] [Migration Bug? ] Occasionally, the content of VM's memory is inconsistent between Source and Destination of migration

2015-03-25 Thread Dr. David Alan Gilbert

* zhanghailiang (zhang.zhanghaili...@huawei.com) wrote:
> On 2015/3/25 17:46, Dr. David Alan Gilbert wrote:
> >* zhanghailiang (zhang.zhanghaili...@huawei.com) wrote:
> >>Hi all,
> >>
> >>We found that, sometimes, the content of VM's memory is inconsistent 
> >>between Source side and Destination side
> >>when we check it just after finishing migration but before VM continue to 
> >>Run.
> >>
> >>We use a patch like bellow to find this issue, you can find it from affix,
> >>and Steps to reproduce:
> >>
> >>(1) Compile QEMU:
> >>  ./configure --target-list=x86_64-softmmu  --extra-ldflags="-lssl" && make
> >>
> >>(2) Command and output:
> >>SRC: # x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu qemu64,-kvmclock 
> >>-netdev tap,id=hn0-device virtio-net-pci,id=net-pci0,netdev=hn0 -boot c 
> >>-drive 
> >>file=/mnt/sdb/pure_IMG/sles/sles11_sp3.img,if=none,id=drive-virtio-disk0,cache=unsafe
> >> -device 
> >>virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 
> >>-vnc :7 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor 
> >>stdio
> >>(qemu) migrate tcp:192.168.3.8:3004
> >>before saving ram complete
> >>ff703f6889ab8701e4e040872d079a28
> >>md_host : after saving ram complete
> >>ff703f6889ab8701e4e040872d079a28
> >>
> >>DST: # x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu qemu64,-kvmclock 
> >>-netdev tap,id=hn0,vhost=on -device virtio-net-pci,id=net-pci0,netdev=hn0 
> >>-boot c -drive 
> >>file=/mnt/sdb/pure_IMG/sles/sles11_sp3.img,if=none,id=drive-virtio-disk0,cache=unsafe
> >> -device 
> >>virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 
> >>-vnc :7 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor 
> >>stdio -incoming tcp:0:3004
> >>(qemu) QEMU_VM_SECTION_END, after loading ram
> >>230e1e68ece9cd4e769630e1bcb5ddfb
> >>md_host : after loading all vmstate
> >>230e1e68ece9cd4e769630e1bcb5ddfb
> >>md_host : after cpu_synchronize_all_post_init
> >>230e1e68ece9cd4e769630e1bcb5ddfb
> >>
> >>This happens occasionally, and it is more easy to reproduce when issue 
> >>migration command during VM's startup time.
> >>
> >>We have done further test and found that some pages has been dirtied but 
> >>its corresponding migration_bitmap is not set.
> >>We can't figure out which modules of QEMU has missed setting bitmap when 
> >>dirty page of VM,
> >>it is very difficult for us to trace all the actions of dirtying VM's pages.
> >>
> >>Actually, the first time we found this problem was in the COLO FT 
> >>development, and it triggered some strange issues in
> >>VM which all pointed to the issue of inconsistent of VM's memory. (We have 
> >>try to save all memory of VM to slave side every time
> >>when do checkpoint in COLO FT, and everything will be OK.)
> >>
> >>Is it OK for some pages that not transferred to destination when do 
> >>migration ? Or is it a bug?
> >
> >That does sound like a bug.
> >The only other explanation I have is that memory is being changed by a 
> >device emulation
> >that happens after the end of a saving the vm, or after loading the memory.  
> >That's
> >certainly possible - especially if a device (say networking) hasn't been 
> >properly
> >stopped.
> >
> >>This issue has blocked our COLO development... :(
> >>
> >>Any help will be greatly appreciated!
> >
> 
> Hi Dave???
> 
> >I suggest:
> >1) Does it happen with devices other than virtio?
> >2) Strip the devices down - e.g. just run with serial and no video/usb
> 
> I will try this ...
> 
> >3) Try doing the md5 comparison at the end of ram_save_complete
> 
> Yes, we have done this test at the end of ram_save_complete.
> 
> >4) mprotect RAM after the ram_save_complete and see if anything faults.
> 
> I have tried, and there will be kvm error reports. like bellow:
> 
> KVM: entry failed, hardware error 0x7

If the RAM is mprotect'd with PROT_READ|PROT_EXEC I'm not sure why this
would happen; but then the question is why is it trying to do a KVM entry
after the end of ram_save_complete - that suggests the CPUs are trying to
run again, and that shouldn't happen in a normal migration.
If you can trigger this KVM entry on a non-colo migration, then I suggest
get the backtrace from where the KVM entry failed, because then you
should be able to see why the CPU was being restarted.

Dave

> EAX= EBX=e000 ECX=9578 EDX=434f
> ESI=fc10 EDI=434f EBP= ESP=1fca
> EIP=9594 EFL=00010246 [---Z-P-] CPL=0 II=0 A20=1 SMM=0 HLT=0
> ES =0040 0400  9300
> CS =f000 000f  9b00
> SS =434f 000434f0  9300
> DS =434f 000434f0  9300
> FS =   9300
> GS =   9300
> LDT=   8200
> TR =   8b00
> GDT= 0002dcc8 0047
> IDT=  
> CR0=0010 CR2= CR3= CR4=
> DR0= DR1= DR2= 
> DR3=00

[Qemu-devel] [PATCH for-2.3] virtio-serial: fix virtio config size

2015-03-25 Thread Michael S. Tsirkin

commit 9b70c1790acacae54d559d38ca69186a85040bb8
virtio-serial: switch to standard-headers

changes virtio_console_config size from 8 to 12 bytes:
it adds an optional 4 byte emerg_wr field.

As this crosses a power of two boundary, this changes the PCI BAR size,
which breaks migration compatibility with old qemu machine types.
It's probably a problem for other transports as well.

As a temporary fix, as we don't yet support this new field anyway,
simply make the config size smaller at init time.

Long terms we probably want something along the lines
of virtio_net_set_config_size.

Reported-by: Cole Robinson 
Signed-off-by: Michael S. Tsirkin 
---

Cole, could you pls confirm the following fixes the
issue that you reported?

Thanks!

 hw/char/virtio-serial-bus.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
index 9a029d2..d05473f 100644
--- a/hw/char/virtio-serial-bus.c
+++ b/hw/char/virtio-serial-bus.c
@@ -949,8 +949,10 @@ static void virtio_serial_device_realize(DeviceState *dev, 
Error **errp)
 return;
 }
 
+/* We don't support emergency write, skip it for now. */
+/* TODO: cleaner fix, depending on host features. */
 virtio_init(vdev, "virtio-serial", VIRTIO_ID_CONSOLE,
-sizeof(struct virtio_console_config));
+offsetof(struct virtio_console_config, emerg_wr));
 
 /* Spawn a new virtio-serial bus on which the ports will ride as devices */
 qbus_create_inplace(&vser->bus, sizeof(vser->bus), TYPE_VIRTIO_SERIAL_BUS,
-- 
MST

Re: [Qemu-devel] RFC: memory API changes

2015-03-25 Thread Peter Maydell

On 25 March 2015 at 11:34, Paolo Bonzini  wrote:
>
>
> On 25/03/2015 00:41, Peter Maydell wrote:
>> On 24 March 2015 at 20:00, Paolo Bonzini  wrote:
>>> I agree with that.  I just want to keep ld/st*_phys _in addition_ as the
>>> short forms of address_space_ld/st*, and keep ld/st*_phys instead of
>>> address_space_ld/st* for those uses that have cs->as as the first argument.
>>
>> ...but for ARM I want to be able to specify the memory
>> attribute argument (and possibly also get the behaviour
>> right on failure). So I definitely don't want the short
>> forms for my cs->as uses.
>
> You're free to move ARM to the longer versions, and/or to push the short
> versions to all cpu.h files except ARM's.
>
>> And it seems to me at best
>> uncertain that anybody does, in the long run.
>
> I disagree: most CPUs are in odd fixes/unmaintained state (so attributes
> probably won't matter), and most don't even define an unassigned_access
> callback (so result won't matter either).

I was trying to avoid leaving us with yet another half-finished
set of API transitions: because many of our CPUs are in this
odd-fixes state, it's unlikely anybody will get round to
updating them in the near future, so we'll be carrying a
duplicate set of functions around for a long time. Doing
an automated update to change everything to the new style
seemed a better plan to me.

If you insist I can leave the ldl_phys&c around as wrappers
with a comment saying /* Do not use these in new code;
use address_space_* instead. */,
though.

-- PMM

Re: [Qemu-devel] [PATCH for-2.3] virtio-serial: fix virtio config size

2015-03-25 Thread Cole Robinson

On 03/25/2015 07:41 AM, Michael S. Tsirkin wrote:
> commit 9b70c1790acacae54d559d38ca69186a85040bb8
> virtio-serial: switch to standard-headers
> 
> changes virtio_console_config size from 8 to 12 bytes:
> it adds an optional 4 byte emerg_wr field.
> 
> As this crosses a power of two boundary, this changes the PCI BAR size,
> which breaks migration compatibility with old qemu machine types.
> It's probably a problem for other transports as well.
> 
> As a temporary fix, as we don't yet support this new field anyway,
> simply make the config size smaller at init time.
> 
> Long terms we probably want something along the lines
> of virtio_net_set_config_size.
> 
> Reported-by: Cole Robinson 
> Signed-off-by: Michael S. Tsirkin 
> ---
> 
> Cole, could you pls confirm the following fixes the
> issue that you reported?
> 
> Thanks!
> 
>  hw/char/virtio-serial-bus.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c
> index 9a029d2..d05473f 100644
> --- a/hw/char/virtio-serial-bus.c
> +++ b/hw/char/virtio-serial-bus.c
> @@ -949,8 +949,10 @@ static void virtio_serial_device_realize(DeviceState 
> *dev, Error **errp)
>  return;
>  }
>  
> +/* We don't support emergency write, skip it for now. */
> +/* TODO: cleaner fix, depending on host features. */
>  virtio_init(vdev, "virtio-serial", VIRTIO_ID_CONSOLE,
> -sizeof(struct virtio_console_config));
> +offsetof(struct virtio_console_config, emerg_wr));
>  
>  /* Spawn a new virtio-serial bus on which the ports will ride as devices 
> */
>  qbus_create_inplace(&vser->bus, sizeof(vser->bus), 
> TYPE_VIRTIO_SERIAL_BUS,
> 

Fixes the migration failure for me, thanks!

Tested-by: Cole Robinson 

- Cole

Re: [Qemu-devel] [v6 08/14] migration: Add the core code of multi-thread compression

2015-03-25 Thread Juan Quintela

Liang Li  wrote:
> Implement the core logic of the multiple thread compression. At this
> point, multiple thread compression can't co-work with xbzrle yet.
>
> Signed-off-by: Liang Li 
> Signed-off-by: Yang Zhang 

> ---
>  arch_init.c | 184 
> +---
>  1 file changed, 177 insertions(+), 7 deletions(-)
>
> diff --git a/arch_init.c b/arch_init.c
> index 48cae22..9f63c0f 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -355,12 +355,33 @@ static DecompressParam *decomp_param;
>  static QemuThread *decompress_threads;
>  static uint8_t *compressed_data_buf;
>  
> +static int do_compress_ram_page(CompressParam *param);
> +
>  static void *do_data_compress(void *opaque)
>  {
> -while (!quit_comp_thread) {
> +CompressParam *param = opaque;
>  
> -/* To be done */
What is the different with changing this loop to:


> +while (!quit_comp_thread) {

Here we don't have quit_comp_thread protected by anything.

> +qemu_mutex_lock(¶m->mutex);
> +/* Re-check the quit_comp_thread in case of
> + * terminate_compression_threads is called just before
> + * qemu_mutex_lock(¶m->mutex) and after
> + * while(!quit_comp_thread), re-check it here can make
> + * sure the compression thread terminate as expected.
> + */
> +while (!param->start && !quit_comp_thread) {

Here and next use is protected by param->mutex, but param is per
compression thread, so, it is not really protected.

> +qemu_cond_wait(¶m->cond, ¶m->mutex);
> +}
> +if (!quit_comp_thread) {
> +do_compress_ram_page(param);
> +}
> +param->start = false;

param->start is pretected by param->mutex everywhere

> +qemu_mutex_unlock(¶m->mutex);
>  
> +qemu_mutex_lock(comp_done_lock);
> +param->done = true;

param->done protected by comp_done_lock

> +qemu_cond_signal(comp_done_cond);
> +qemu_mutex_unlock(comp_done_lock);
>  }
>  
>  return NULL;
> @@ -368,9 +389,15 @@ static void *do_data_compress(void *opaque)
>  
>  static inline void terminate_compression_threads(void)
>  {
> -quit_comp_thread = true;
> +int idx, thread_count;
>  
> -/* To be done */
> +thread_count = migrate_compress_threads();
> +quit_comp_thread = true;

quite_comp_thread not protected again.

> +for (idx = 0; idx < thread_count; idx++) {
> +qemu_mutex_lock(&comp_param[idx].mutex);
> +qemu_cond_signal(&comp_param[idx].cond);
> +qemu_mutex_unlock(&comp_param[idx].mutex);
> +}
>  }
>  
>  void migrate_compress_threads_join(void)
> @@ -420,6 +447,7 @@ void migrate_compress_threads_create(void)
>   * it's ops to empty.
>   */
>  comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
> +comp_param[i].done = true;
>  qemu_mutex_init(&comp_param[i].mutex);
>  qemu_cond_init(&comp_param[i].cond);
>  qemu_thread_create(compress_threads + i, "compress",
> @@ -829,6 +857,97 @@ static int ram_save_page(QEMUFile *f, RAMBlock* block, 
> ram_addr_t offset,
>  return pages;
>  }
>  
> +static int do_compress_ram_page(CompressParam *param)
> +{
> +int bytes_sent, blen;
> +uint8_t *p;
> +RAMBlock *block = param->block;
> +ram_addr_t offset = param->offset;
> +
> +p = memory_region_get_ram_ptr(block->mr) + (offset & TARGET_PAGE_MASK);
> +
> +bytes_sent = save_page_header(param->file, block, offset |
> +  RAM_SAVE_FLAG_COMPRESS_PAGE);
> +blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
> + migrate_compress_level());
> +bytes_sent += blen;
> +atomic_inc(&acct_info.norm_pages);
> +
> +return bytes_sent;
> +}
> +
> +static inline void start_compression(CompressParam *param)
> +{
> +param->done = false;

Not protected (well, its caller have protected it by comp_done_lock.

> +qemu_mutex_lock(¶m->mutex);
> +param->start = true;
> +qemu_cond_signal(¶m->cond);
> +qemu_mutex_unlock(¶m->mutex);
> +}
> +
> +
> +static uint64_t bytes_transferred;
> +
> +static void flush_compressed_data(QEMUFile *f)
> +{
> +int idx, len, thread_count;
> +
> +if (!migrate_use_compression()) {
> +return;
> +}
> +thread_count = migrate_compress_threads();
> +for (idx = 0; idx < thread_count; idx++) {
> +if (!comp_param[idx].done) {

done is not protected here.

> +qemu_mutex_lock(comp_done_lock);
> +while (!comp_param[idx].done && !quit_comp_thread) {


Now, it is under comp_done_lock.  Bun none of its other uses is
protected by it.

And here done is proteced by comp_done_cond


> +qemu_cond_wait(comp_done_cond, comp_done_lock);
> +}
> +qemu_mutex_unlock(comp_done_lock);
> +}
> +if (!quit_comp_thread) {

Here, it is unprotected again.

> +

Re: [Qemu-devel] RFC: memory API changes

2015-03-25 Thread Paolo Bonzini

On 25/03/2015 12:43, Peter Maydell wrote:
> I was trying to avoid leaving us with yet another half-finished
> set of API transitions: because many of our CPUs are in this
> odd-fixes state, it's unlikely anybody will get round to
> updating them in the near future, so we'll be carrying a
> duplicate set of functions around for a long time.

They're not duplicate, they're shortcuts.  Using longer function names
with more arguments is unnecessary if there's no need for the extra
features.

> If you insist I can leave the ldl_phys&c around as wrappers
> with a comment saying /* Do not use these in new code;
> use address_space_* instead. */,
> though.

I'll take care of changing the wrappers to take a CPUState instead of
AddressSpace, and move them to cpu.h.

If you convert ARM to address_space_*, ARM's cpu.h obviously won't get them.

Paolo

Re: [Qemu-devel] [Migration Bug? ] Occasionally, the content of VM's memory is inconsistent between Source and Destination of migration

2015-03-25 Thread zhanghailiang


On 2015/3/25 19:36, Dr. David Alan Gilbert wrote:

* zhanghailiang (zhang.zhanghaili...@huawei.com) wrote:

On 2015/3/25 17:46, Dr. David Alan Gilbert wrote:

* zhanghailiang (zhang.zhanghaili...@huawei.com) wrote:

Hi all,

We found that, sometimes, the content of VM's memory is inconsistent between 
Source side and Destination side
when we check it just after finishing migration but before VM continue to Run.

We use a patch like bellow to find this issue, you can find it from affix,
and Steps to reproduce:

(1) Compile QEMU:
  ./configure --target-list=x86_64-softmmu  --extra-ldflags="-lssl" && make

(2) Command and output:
SRC: # x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu qemu64,-kvmclock 
-netdev tap,id=hn0-device virtio-net-pci,id=net-pci0,netdev=hn0 -boot c -drive 
file=/mnt/sdb/pure_IMG/sles/sles11_sp3.img,if=none,id=drive-virtio-disk0,cache=unsafe
 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -vnc 
:7 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio
(qemu) migrate tcp:192.168.3.8:3004
before saving ram complete
ff703f6889ab8701e4e040872d079a28
md_host : after saving ram complete
ff703f6889ab8701e4e040872d079a28

DST: # x86_64-softmmu/qemu-system-x86_64 -enable-kvm -cpu qemu64,-kvmclock 
-netdev tap,id=hn0,vhost=on -device virtio-net-pci,id=net-pci0,netdev=hn0 -boot 
c -drive 
file=/mnt/sdb/pure_IMG/sles/sles11_sp3.img,if=none,id=drive-virtio-disk0,cache=unsafe
 -device 
virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -vnc 
:7 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio 
-incoming tcp:0:3004
(qemu) QEMU_VM_SECTION_END, after loading ram
230e1e68ece9cd4e769630e1bcb5ddfb
md_host : after loading all vmstate
230e1e68ece9cd4e769630e1bcb5ddfb
md_host : after cpu_synchronize_all_post_init
230e1e68ece9cd4e769630e1bcb5ddfb

This happens occasionally, and it is more easy to reproduce when issue 
migration command during VM's startup time.

We have done further test and found that some pages has been dirtied but its 
corresponding migration_bitmap is not set.
We can't figure out which modules of QEMU has missed setting bitmap when dirty 
page of VM,
it is very difficult for us to trace all the actions of dirtying VM's pages.

Actually, the first time we found this problem was in the COLO FT development, 
and it triggered some strange issues in
VM which all pointed to the issue of inconsistent of VM's memory. (We have try 
to save all memory of VM to slave side every time
when do checkpoint in COLO FT, and everything will be OK.)

Is it OK for some pages that not transferred to destination when do migration ? 
Or is it a bug?


That does sound like a bug.
The only other explanation I have is that memory is being changed by a device 
emulation
that happens after the end of a saving the vm, or after loading the memory.  
That's
certainly possible - especially if a device (say networking) hasn't been 
properly
stopped.


This issue has blocked our COLO development... :(

Any help will be greatly appreciated!




Hi Dave???


I suggest:
1) Does it happen with devices other than virtio?
2) Strip the devices down - e.g. just run with serial and no video/usb


I will try this ...


3) Try doing the md5 comparison at the end of ram_save_complete


Yes, we have done this test at the end of ram_save_complete.


4) mprotect RAM after the ram_save_complete and see if anything faults.


I have tried, and there will be kvm error reports. like bellow:

KVM: entry failed, hardware error 0x7


If the RAM is mprotect'd with PROT_READ|PROT_EXEC I'm not sure why this
would happen; but then the question is why is it trying to do a KVM entry
after the end of ram_save_complete - that suggests the CPUs are trying to
run again, and that shouldn't happen in a normal migration.
If you can trigger this KVM entry on a non-colo migration, then I suggest
get the backtrace from where the KVM entry failed, because then you
should be able to see why the CPU was being restarted.



Er, sorry, the follow report is reproduced with COLO, i will test this with 
mprotect
in normal migration.


Dave


EAX= EBX=e000 ECX=9578 EDX=434f
ESI=fc10 EDI=434f EBP= ESP=1fca
EIP=9594 EFL=00010246 [---Z-P-] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =0040 0400  9300
CS =f000 000f  9b00
SS =434f 000434f0  9300
DS =434f 000434f0  9300
FS =   9300
GS =   9300
LDT=   8200
TR =   8b00
GDT= 0002dcc8 0047
IDT=  
CR0=0010 CR2= CR3= CR4=
DR0= DR1= DR2= 
DR3=
DR6=0ff0 DR7=0400
EFER=
Code=c0 74 0f 66 b9 78 95 00 00 66 31 d2 66 31 c0 e9 47 e0 fb 90  90 fa fc 
66 c3 66 53 66 89 c3 66 e8 9d e8 ff f

Re: [Qemu-devel] [v6 10/14] migration: Add the core code for decompression

2015-03-25 Thread Juan Quintela

Liang Li  wrote:
> Implement the core logic of multiple thread decompression,
> the decompression can work now.
>
> Signed-off-by: Liang Li 
> Signed-off-by: Yang Zhang 
> ---
>  arch_init.c | 50 --
>  1 file changed, 48 insertions(+), 2 deletions(-)
>
> diff --git a/arch_init.c b/arch_init.c
> index b81acc9..6a0d709 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -888,6 +888,13 @@ static inline void start_compression(CompressParam 
> *param)
>  qemu_mutex_unlock(¶m->mutex);
>  }
>  
> +static inline void start_decompression(DecompressParam *param)
> +{
> +qemu_mutex_lock(¶m->mutex);
> +param->start = true;

start protucetd by param->mutex

> +qemu_cond_signal(¶m->cond);
> +qemu_mutex_unlock(¶m->mutex);
> +}
>  
>  static uint64_t bytes_transferred;
>  
> @@ -1459,8 +1466,26 @@ void ram_handle_compressed(void *host, uint8_t ch, 
> uint64_t size)
>  
>  static void *do_data_decompress(void *opaque)
>  {
> +DecompressParam *param = opaque;
> +size_t pagesize;
> +
>  while (!quit_decomp_thread) {

quit_decomp_thread locking (or lack of it is equivalent to the one of 
quit_comp_thread)



> -/* To be done */
> +qemu_mutex_lock(¶m->mutex);
> +while (!param->start && !quit_decomp_thread) {

start protected by param->mutex.


> +qemu_cond_wait(¶m->cond, ¶m->mutex);
> +pagesize = TARGET_PAGE_SIZE;
> +if (!quit_decomp_thread) {
> +/* uncompress() will return failed in some case, especially
> + * when the page is dirted when doing the compression, it's
> + * not a problem because the dirty page will be retransferred
> + * and uncompress() won't break the data in other pages.
> + */
> +uncompress((Bytef *)param->des, &pagesize,
> +   (const Bytef *)param->compbuf, param->len);
> +}
> +param->start = false;
> +}
> +qemu_mutex_unlock(¶m->mutex);
>  }
>  
>  return NULL;
> @@ -1492,6 +1517,11 @@ void migrate_decompress_threads_join(void)
>  quit_decomp_thread = true;
>  thread_count = migrate_decompress_threads();
>  for (i = 0; i < thread_count; i++) {
> +qemu_mutex_lock(&decomp_param[i].mutex);
> +qemu_cond_signal(&decomp_param[i].cond);
> +qemu_mutex_unlock(&decomp_param[i].mutex);
> +}
> +for (i = 0; i < thread_count; i++) {
>  qemu_thread_join(decompress_threads + i);
>  qemu_mutex_destroy(&decomp_param[i].mutex);
>  qemu_cond_destroy(&decomp_param[i].cond);
> @@ -1508,7 +1538,23 @@ void migrate_decompress_threads_join(void)
>  static void decompress_data_with_multi_threads(uint8_t *compbuf,
> void *host, int len)
>  {
> -/* To be done */
> +int idx, thread_count;
> +
> +thread_count = migrate_decompress_threads();
> +while (true) {
> +for (idx = 0; idx < thread_count; idx++) {
> +if (!decomp_param[idx].start) {

start not protected

> +memcpy(decomp_param[idx].compbuf, compbuf, len);
> +decomp_param[idx].des = host;
> +decomp_param[idx].len = len;
> +start_decompression(&decomp_param[idx]);
> +break;
> +}
> +}
> +if (idx < thread_count) {
> +break;
> +}
> +}
>  }
>  
>  static int ram_load(QEMUFile *f, void *opaque, int version_id)

Re: [Qemu-devel] [PATCH 2/6] throttle: Add throttle group infrastructure

2015-03-25 Thread Stefan Hajnoczi

On Tue, Mar 24, 2015 at 04:33:48PM +0100, Alberto Garcia wrote:
> On Tue, Mar 24, 2015 at 03:03:07PM +, Stefan Hajnoczi wrote:
> 
> > > +typedef struct ThrottleGroup {
> > > +char *name; /* This is constant during the lifetime of the group */
> > 
> > Is this also protected by throttle_groups_lock?
> > 
> > I guess throttle_groups_lock must be held in order to read this
> > field - otherwise there is a risk that the object is freed unless
> > you have already incremented the refcount.
> 
> The creation and destruction of ThrottleGroup objects are protected by
> throttle_groups_lock. That includes handling the memory for that field
> and its contents.
> 
> Once the ThrottleGroup is created the 'name' field doesn't need any
> additional locking since it remains constant during the lifetime of
> the group.
> 
> The only way to read it from the outside is throttle_group_get_name()
> and that's safe (until you release the reference to the group, that
> is).

Right, the race condition is when the group is released.

Looking at this again, the assumption isn't that throttle_groups_lock is
held.  The AioContext lock is held by throttle_group_get_name() users
and that's why there is no race when releasing the reference.

If someone adds a throttle_group_get_name() caller in the future without
holding AioContext, then we'd be in trouble.  That is why documenting
the locking constraints is useful.

Stefan


pgph8pXXwxUOR.pgp
Description: PGP signature

Re: [Qemu-devel] [PATCH 4/6] throttle: Add throttle group support

2015-03-25 Thread Stefan Hajnoczi

On Tue, Mar 24, 2015 at 06:48:37PM +0100, Alberto Garcia wrote:
> On Tue, Mar 24, 2015 at 04:31:45PM +, Stefan Hajnoczi wrote:
> 
> > > +/* get next bs round in round robin style */
> > > +token = throttle_group_next_bs(token);
> > > +while (token != start  &&
> > > +   qemu_co_queue_empty(&token->throttled_reqs[is_write])) {
> > 
> > It's not safe to access bs->throttled_reqs[].  There are many of
> > other places that access bs->throttled_reqs[] without holding
> > tg->lock (e.g. block.c).
> >
> > throttled_reqs needs to be protected by tg->lock in order for this
> > to work.
> 
> Good catch, but I don't think that's the solution. throttled_reqs
> cannot be protected by that lock because it must release it before
> calling qemu_co_queue_wait(&bs->throttled_reqs[is_write]). Otherwise
> it will cause a deadlock.
> 
> My assumption was that throttled_reqs would be protected by the BDS's
> AioContext lock, but I overlooked the fact that this part of the
> algorithm needs to access other BDSs' queues so we indeed have a
> problem.
> 
> I will give it a thought to see what I can come up with. Since we only
> need to check whether other BDSs have pending requests maybe I can
> implement this with a flag.

A flag is a good solution since it is not possible to acquire other BDS'
AioContext lock from arbitrary block layer code (it's only allowed when
the QEMU global mutex is held).

Stefan


pgpnSFCaVPpRy.pgp
Description: PGP signature

Re: [Qemu-devel] AioContext of block jobs

2015-03-25 Thread Fam Zheng

On Wed, 03/25 12:25, Paolo Bonzini wrote:
> 
> 
> On 25/03/2015 09:31, Fam Zheng wrote:
> > I was looking at block jobs' AioContext and realized that the block job
> > coroutines are actually started in main loop.
> > 
> > I'm confused because 5a7e7a0bad17c96e03f55ed7019e2d7545e21a96 and friends in
> > the series [1] seem to move the coroutines to the BDS's iothreads, but it
> > didn't do that.
> > 
> > (Although after the first block_job_yield or sleep, the coroutines ARE 
> > resumed
> > in the right AioContext.)
> > 
> > Why is it safe to start the jobs from the main thread where QMP command is
> > handled? I see no guarantee that the jobs won't access BDS before first 
> > yield
> > but after releasing the AioContext.
> > 
> > Is this a bug?
> 
> It's okay because the coroutine is started while the main thread is
> holding the AioContext.  So the first "stint" of the coroutine, until
> the first yield, is done in the main thread but still with the
> AioContext held.

I see! That's what I missed. Thanks!

Fam

> 
> After the first yield, the main thread releases the AioContext, the
> dataplane thread gets it back and the coroutine is moved to the
> dataplane thread.
>

Re: [Qemu-devel] [PATCH 2/6] throttle: Add throttle group infrastructure

2015-03-25 Thread Alberto Garcia


> > > > +typedef struct ThrottleGroup {
> > > > +char *name; /* This is constant during the lifetime of the group */
> > > 
> > > Is this also protected by throttle_groups_lock?
> > > 
> > > I guess throttle_groups_lock must be held in order to read this
> > > field - otherwise there is a risk that the object is freed
> > > unless you have already incremented the refcount.
> > 
> > The creation and destruction of ThrottleGroup objects are
> > protected by throttle_groups_lock. That includes handling the
> > memory for that field and its contents.
> > 
> > Once the ThrottleGroup is created the 'name' field doesn't need
> > any additional locking since it remains constant during the
> > lifetime of the group.
> > 
> > The only way to read it from the outside is
> > throttle_group_get_name() and that's safe (until you release the
> > reference to the group, that is).
> 
> Right, the race condition is when the group is released.
> 
> Looking at this again, the assumption isn't that
> throttle_groups_lock is held.  The AioContext lock is held by
> throttle_group_get_name() users and that's why there is no race when
> releasing the reference.
> 
> If someone adds a throttle_group_get_name() caller in the future
> without holding AioContext, then we'd be in trouble.  That is why
> documenting the locking constraints is useful.

But that would only happen if you access bs->throttle_state without
holding bs's AioContext. I understand that it's implicit that you
should hold the context there.

Maybe I can update the throttle_group_* API to use BlockDriverState
in all cases instead of ThrottleState, it's probably more clear like
that.

Berto

Re: [Qemu-devel] [RFC PATCH 01/14] docs: block replication's description

2015-03-25 Thread Paolo Bonzini



On 16/03/2015 07:19, Wen Congyang wrote:
> On 03/13/2015 05:05 PM, Fam Zheng wrote:
>> On Fri, 03/13 17:01, Wen Congyang wrote:
>>> On 03/11/2015 02:49 PM, Fam Zheng wrote:
 On Wed, 03/11 14:44, Wen Congyang wrote:
> On 03/03/2015 03:59 PM, Fam Zheng wrote:
>> On Tue, 03/03 15:53, Wen Congyang wrote:
>>> I test qcow2_make_empty()'s performance. The result shows that it may
>>> take about 100ms(normal sata disk). It is not acceptable for COLO. So
>>> I think disk buff is necessary(just use it to replace qcow2).
>>
>> Why not tmpfs or ramdisk?
>
> Another problem:
> After failover, secondary write request will be written in (active disk)?
> It is better to write request to (nbd target). Is there any feature can
> be reused to implement it?

 You can use block commit or stream to move the data.
>>>
>>> Can the job stream move the data? I don't find the write ops in 
>>> block/stream.c.
>>
>> It is bdrv_co_copy_on_readv that moves data.
> 
> Does the job stream move the data from base to top?

Yes.  block-commit goes in the other direction.

Paolo

Re: [Qemu-devel] [RFC PATCH COLO v2 02/13] quorum: allow ignoring child errors

2015-03-25 Thread Paolo Bonzini



On 25/03/2015 10:36, Wen Congyang wrote:
> +static QemuOptsList quorum_children_common_opts = {
> +.name = "qcow2 children",

.name should be "quorum children".

Paolo

> +.head = QTAILQ_HEAD_INITIALIZER(quorum_children_common_opts.head),
> +.desc = {
> +{
> +.name = QUORUM_CHILDREN_OPT_IGNORE_ERRORS,
> +.type = QEMU_OPT_BOOL,
> +.help = "ignore child I/O error",
> +},
> +{ /* end of list */ }
> +},
> +};
> +

Re: [Qemu-devel] [RFC PATCH COLO v2 03/13] NBD client: connect to nbd server later

2015-03-25 Thread Paolo Bonzini



On 25/03/2015 10:36, Wen Congyang wrote:
> The secondary qemu starts later than the primary qemu, so we
> cannot connect to nbd server in bdrv_open().
> 
> Signed-off-by: Wen Congyang 
> Signed-off-by: zhanghailiang 
> Signed-off-by: Gonglei 
> ---
>  block/nbd.c | 122 
> +---
>  1 file changed, 108 insertions(+), 14 deletions(-)
> 
> diff --git a/block/nbd.c b/block/nbd.c
> index 2176186..3faf865 100644
> --- a/block/nbd.c
> +++ b/block/nbd.c
> @@ -44,6 +44,8 @@
>  typedef struct BDRVNBDState {
>  NbdClientSession client;
>  QemuOpts *socket_opts;
> +char *export;
> +bool connected;
>  } BDRVNBDState;
>  
>  static int nbd_parse_uri(const char *filename, QDict *options)
> @@ -254,50 +256,95 @@ static int nbd_establish_connection(BlockDriverState 
> *bs, Error **errp)
>  return sock;
>  }
>  
> -static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
> -Error **errp)
> +static int nbd_connect_server(BlockDriverState *bs, Error **errp)
>  {
>  BDRVNBDState *s = bs->opaque;
> -char *export = NULL;
>  int result, sock;
> -Error *local_err = NULL;
> -
> -/* Pop the config into our state object. Exit if invalid. */
> -nbd_config(s, options, &export, &local_err);
> -if (local_err) {
> -error_propagate(errp, local_err);
> -return -EINVAL;
> -}
>  
>  /* establish TCP connection, return error if it fails
>   * TODO: Configurable retry-until-timeout behaviour.
>   */
>  sock = nbd_establish_connection(bs, errp);
>  if (sock < 0) {
> -g_free(export);
> +g_free(s->export);
>  return sock;
>  }
>  
>  /* NBD handshake */
> -result = nbd_client_init(bs, sock, export, errp);
> -g_free(export);
> +result = nbd_client_init(bs, sock, s->export, errp);
> +g_free(s->export);
> +s->export = NULL;
> +if (!result) {
> +s->connected = true;
> +}
> +
>  return result;
>  }
>  
> +static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
> +Error **errp)
> +{
> +BDRVNBDState *s = bs->opaque;
> +Error *local_err = NULL;
> +
> +/* Pop the config into our state object. Exit if invalid. */
> +nbd_config(s, options, &s->export, &local_err);
> +if (local_err) {
> +error_propagate(errp, local_err);
> +return -EINVAL;
> +}
> +
> +return nbd_connect_server(bs, errp);
> +}
> +
> +static int nbd_open_colo(BlockDriverState *bs, QDict *options, int flags,
> + Error **errp)
> +{
> +BDRVNBDState *s = bs->opaque;
> +Error *local_err = NULL;
> +
> +/* Pop the config into our state object. Exit if invalid. */
> +nbd_config(s, options, &s->export, &local_err);
> +if (local_err) {
> +error_propagate(errp, local_err);
> +return -EINVAL;
> +}
> +
> +return 0;
> +}
> +
>  static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
>  int nb_sectors, QEMUIOVector *qiov)
>  {
> +BDRVNBDState *s = bs->opaque;
> +
> +if (!s->connected) {
> +return -EIO;
> +}
> +
>  return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov);
>  }
>  
>  static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num,
>   int nb_sectors, QEMUIOVector *qiov)
>  {
> +BDRVNBDState *s = bs->opaque;
> +
> +if (!s->connected) {
> +return -EIO;
> +}
> +
>  return nbd_client_co_writev(bs, sector_num, nb_sectors, qiov);
>  }
>  
>  static int nbd_co_flush(BlockDriverState *bs)
>  {
> +BDRVNBDState *s = bs->opaque;
> +
> +if (!s->connected) {
> +return -EIO;
> +}
> +
>  return nbd_client_co_flush(bs);
>  }
>  
> @@ -310,6 +357,12 @@ static void nbd_refresh_limits(BlockDriverState *bs, 
> Error **errp)
>  static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
>int nb_sectors)
>  {
> +BDRVNBDState *s = bs->opaque;
> +
> +if (!s->connected) {
> +return -EIO;
> +}
> +
>  return nbd_client_co_discard(bs, sector_num, nb_sectors);
>  }
>  
> @@ -319,23 +372,44 @@ static void nbd_close(BlockDriverState *bs)
>  
>  qemu_opts_del(s->socket_opts);
>  nbd_client_close(bs);
> +s->connected = false;
>  }
>  
>  static int64_t nbd_getlength(BlockDriverState *bs)
>  {
>  BDRVNBDState *s = bs->opaque;
>  
> +if (!s->connected) {
> +/*
> + * We cannot return -ENOTCONN, otherwise refresh_total_sectors()
> + * will fail, and we cannot open nbd client.
> + */
> +return 0;
> +}
> +
>  return s->client.size;
>  }
>  
>  static void nbd_detach_aio_context(BlockDriverState *bs)
>  {
> +BDRVNBDState *s = bs->opaque;
> +
> +if (!s->connected) {
> +return;
> +}
> +
>  nbd_client_detach_aio_context(bs);
>  }
>  
>  static void nbd_attach_aio_cont

Re: [Qemu-devel] [RFC PATCH COLO v2 04/13] Add new block driver interfaces to control block replication

2015-03-25 Thread Paolo Bonzini



On 25/03/2015 10:36, Wen Congyang wrote:
> Signed-off-by: Wen Congyang 
> Signed-off-by: zhanghailiang 
> Signed-off-by: Gonglei 
> Cc: Luiz Capitulino 
> Cc: Michael Roth 
> ---
>  block.c   | 39 +++
>  include/block/block.h |  4 
>  include/block/block_int.h | 11 +++
>  qapi/block.json   | 16 
>  4 files changed, 70 insertions(+)
> 
> diff --git a/block.c b/block.c
> index 0fe97de..0ff5cf8 100644
> --- a/block.c
> +++ b/block.c
> @@ -6196,3 +6196,42 @@ BlockAcctStats *bdrv_get_stats(BlockDriverState *bs)
>  {
>  return &bs->stats;
>  }
> +
> +void bdrv_start_replication(BlockDriverState *bs, COLOMode mode, Error 
> **errp)
> +{
> +BlockDriver *drv = bs->drv;
> +
> +if (drv && drv->bdrv_start_replication) {
> +drv->bdrv_start_replication(bs, mode, errp);
> +} else if (bs->file) {
> +bdrv_start_replication(bs->file, mode, errp);
> +} else {
> +error_set(errp, QERR_UNSUPPORTED);
> +}
> +}
> +
> +void bdrv_do_checkpoint(BlockDriverState *bs, Error **errp)
> +{
> +BlockDriver *drv = bs->drv;
> +
> +if (drv && drv->bdrv_do_checkpoint) {
> +drv->bdrv_do_checkpoint(bs, errp);
> +} else if (bs->file) {
> +bdrv_do_checkpoint(bs->file, errp);
> +} else {
> +error_set(errp, QERR_UNSUPPORTED);
> +}
> +}
> +
> +void bdrv_stop_replication(BlockDriverState *bs, Error **errp)
> +{
> +BlockDriver *drv = bs->drv;
> +
> +if (drv && drv->bdrv_stop_replication) {
> +drv->bdrv_stop_replication(bs, errp);
> +} else if (bs->file) {
> +bdrv_stop_replication(bs->file, errp);
> +} else {
> +error_set(errp, QERR_UNSUPPORTED);
> +}
> +}
> diff --git a/include/block/block.h b/include/block/block.h
> index 4c57d63..68f3b1a 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -569,4 +569,8 @@ void bdrv_flush_io_queue(BlockDriverState *bs);
>  
>  BlockAcctStats *bdrv_get_stats(BlockDriverState *bs);
>  
> +void bdrv_start_replication(BlockDriverState *bs, COLOMode mode, Error 
> **errp);
> +void bdrv_do_checkpoint(BlockDriverState *bs, Error **errp);
> +void bdrv_stop_replication(BlockDriverState *bs, Error **errp);
> +
>  #endif
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index dccb092..08dd8ba 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -290,6 +290,17 @@ struct BlockDriver {
>   */
>  int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo);
>  
> +
> +void (*bdrv_start_replication)(BlockDriverState *bs, COLOMode mode,
> +   Error **errp);
> +/* Drop Disk buffer when doing checkpoint. */
> +void (*bdrv_do_checkpoint)(BlockDriverState *bs, Error **errp);
> +/*
> + * After failover, we should flush Disk buffer into secondary disk
> + * and stop block replication.
> + */
> +void (*bdrv_stop_replication)(BlockDriverState *bs, Error **errp);
> +
>  QLIST_ENTRY(BlockDriver) list;
>  };
>  
> diff --git a/qapi/block.json b/qapi/block.json
> index e313465..e640566 100644
> --- a/qapi/block.json
> +++ b/qapi/block.json
> @@ -40,6 +40,22 @@
>'data': ['auto', 'none', 'lba', 'large', 'rechs']}
>  
>  ##
> +# @COLOMode
> +#
> +# An enumeration of COLO mode.
> +#
> +# @unprotected: COLO is not started or after failover
> +#
> +# @primary: Primary mode, the vm's state will be sent to secondary QEMU.
> +#
> +# @secondary: Secondary mode, receive the vm's state from primary QEMU.
> +#
> +# Since: 2.4
> +##
> +{ 'enum' : 'COLOMode',
> +  'data' : ['unprotected', 'primary', 'secondary']}

Perhaps replace COLOMode with ReplicationMode or FaultToleranceMode?

> +##
>  # @BlockdevSnapshotInternal
>  #
>  # @device: the name of the device to generate the snapshot from
> 

Apart from this,

Reviewed-by: Paolo Bonzini

Re: [Qemu-devel] [RFC PATCH COLO v2 05/13] quorum: implement block driver interfaces for block replication

2015-03-25 Thread Paolo Bonzini



On 25/03/2015 10:36, Wen Congyang wrote:
> Signed-off-by: Wen Congyang 
> Signed-off-by: zhanghailiang 
> Signed-off-by: Gonglei 
> ---
>  block/quorum.c | 79 
> ++
>  1 file changed, 79 insertions(+)
> 
> diff --git a/block/quorum.c b/block/quorum.c
> index d087135..eee829d 100644
> --- a/block/quorum.c
> +++ b/block/quorum.c
> @@ -84,6 +84,8 @@ typedef struct BDRVQuorumState {
>  */
>  
>  QuorumReadPattern read_pattern;
> +
> +int colo_index; /* store which child supports block replication */
>  } BDRVQuorumState;
>  
>  typedef struct QuorumAIOCB QuorumAIOCB;
> @@ -1024,6 +1026,7 @@ static int quorum_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  }
>  
>  g_free(opened);
> +s->colo_index = -1;
>  goto exit;
>  
>  close_exit:
> @@ -1114,6 +1117,78 @@ static void quorum_refresh_filename(BlockDriverState 
> *bs)
>  bs->full_open_options = opts;
>  }
>  
> +static void quorum_stop_replication(BlockDriverState *bs, Error **errp);
> +static void quorum_start_replication(BlockDriverState *bs, COLOMode mode,
> + Error **errp)
> +{
> +BDRVQuorumState *s = bs->opaque;
> +int count = 0, i, index;
> +Error *local_err = NULL;
> +
> +/*
> + * TODO: support COLO_SECONDARY_MODE if we allow secondary
> + * QEMU becoming primary QEMU.
> + */
> +if (mode != COLO_MODE_PRIMARY) {
> +error_set(errp, QERR_INVALID_PARAMETER, "mode");
> +return;
> +}
> +
> +if (s->read_pattern != QUORUM_READ_PATTERN_FIFO) {
> +error_set(errp, QERR_INVALID_PARAMETER, "read pattern");
> +return;
> +}
> +
> +for (i = 0; i < s->num_children; i++) {
> +bdrv_start_replication(s->bs[i], mode, &local_err);
> +if (local_err) {
> +error_free(local_err);
> +local_err = NULL;
> +} else {
> +count++;
> +index = i;
> +}
> +}
> +
> +if (count == 0) {
> +/* No child supports block replication */
> +error_set(errp, QERR_UNSUPPORTED);
> +} else if (count > 1) {
> +quorum_stop_replication(bs, NULL);
> +error_setg(errp, "too many children support block replication");
> +} else {
> +s->colo_index = index;
> +}
> +}
> +
> +static void quorum_do_checkpoint(BlockDriverState *bs, Error **errp)
> +{
> +BDRVQuorumState *s = bs->opaque;
> +
> +if (s->colo_index < 0) {
> +error_setg(errp, "Block replication is not started");
> +return;
> +}
> +
> +bdrv_do_checkpoint(s->bs[s->colo_index], errp);
> +}
> +
> +static void quorum_stop_replication(BlockDriverState *bs, Error **errp)
> +{
> +BDRVQuorumState *s = bs->opaque;
> +int i;
> +
> +if (s->colo_index >= 0) {
> +bdrv_stop_replication(s->bs[s->colo_index], errp);
> +s->colo_index = -1;
> +return;
> +}
> +
> +for (i = 0; i < s->num_children; i++) {
> +bdrv_stop_replication(s->bs[i], NULL);
> +}

Why do anything at all if s->colo_index < 0?  So, with the for loop
removed (and possibly the if condition changed to "s->colo_index < 0",
similar to quorum_do_checkpoint:

Reviewed-by: Paolo Bonzini 

Paolo

> +}
> +
>  static BlockDriver bdrv_quorum = {
>  .format_name= "quorum",
>  .protocol_name  = "quorum",
> @@ -1137,6 +1212,10 @@ static BlockDriver bdrv_quorum = {
>  
>  .is_filter  = true,
>  .bdrv_recurse_is_first_non_filter   = quorum_recurse_is_first_non_filter,
> +
> +.bdrv_start_replication = quorum_start_replication,
> +.bdrv_do_checkpoint = quorum_do_checkpoint,
> +.bdrv_stop_replication  = quorum_stop_replication,
>  };
>  
>  static void bdrv_quorum_init(void)
>

Re: [Qemu-devel] [RFC PATCH COLO v2 06/13] NBD client: implement block driver interfaces for block replication

2015-03-25 Thread Paolo Bonzini



On 25/03/2015 10:36, Wen Congyang wrote:
> Signed-off-by: Wen Congyang 
> Signed-off-by: zhanghailiang 
> Signed-off-by: Gonglei 
> ---
>  block/nbd.c | 49 +
>  1 file changed, 49 insertions(+)
> 
> diff --git a/block/nbd.c b/block/nbd.c
> index 3faf865..753b322 100644
> --- a/block/nbd.c
> +++ b/block/nbd.c
> @@ -458,6 +458,52 @@ static void nbd_refresh_filename(BlockDriverState *bs)
>  bs->full_open_options = opts;
>  }
>  
> +static void nbd_start_replication(BlockDriverState *bs, COLOMode mode,
> +  Error **errp)
> +{
> +BDRVNBDState *s = bs->opaque;
> +
> +/*
> + * TODO: support COLO_SECONDARY_MODE if we allow secondary
> + * QEMU becoming primary QEMU.
> + */
> +if (mode != COLO_MODE_PRIMARY) {
> +error_set(errp, QERR_INVALID_PARAMETER, "mode");
> +return;
> +}
> +
> +if (s->connected) {
> +error_setg(errp, "The connection is established");
> +return;
> +}
> +
> +/* TODO: NBD client should be one child of quorum, how to verify it? */
> +nbd_connect_server(bs, errp);
> +}
> +
> +static void nbd_do_checkpoint(BlockDriverState *bs, Error **errp)
> +{
> +BDRVNBDState *s = bs->opaque;
> +
> +if (!s->connected) {
> +error_setg(errp, "The connection is not established");
> +return;
> +}
> +}
> +
> +static void nbd_stop_replication(BlockDriverState *bs, Error **errp)
> +{
> +BDRVNBDState *s = bs->opaque;
> +
> +if (!s->connected) {
> +error_setg(errp, "The connection is not established");
> +return;
> +}
> +
> +nbd_client_close(bs);
> +s->connected = false;
> +}
> +
>  static BlockDriver bdrv_nbd = {
>  .format_name= "nbd",
>  .protocol_name  = "nbd",
> @@ -527,6 +573,9 @@ static BlockDriver bdrv_nbd_colo = {
>  .bdrv_detach_aio_context= nbd_detach_aio_context,
>  .bdrv_attach_aio_context= nbd_attach_aio_context,
>  .bdrv_refresh_filename  = nbd_refresh_filename,
> +.bdrv_start_replication = nbd_start_replication,
> +.bdrv_do_checkpoint = nbd_do_checkpoint,
> +.bdrv_stop_replication  = nbd_stop_replication,
>  
>  .has_variable_length= true,
>  };
> 

Reviewed-by: Paolo Bonzini

Re: [Qemu-devel] [RFC PATCH COLO v2 10/13] Backup: clear all bitmap when doing block checkpoint

2015-03-25 Thread Paolo Bonzini



On 25/03/2015 10:36, Wen Congyang wrote:
> 
> +void backup_do_checkpoint(BlockJob *job, Error **errp)
> +{
> +BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
> +
> +if (job->driver != &backup_job_driver) {
> +error_setg(errp, "It is not backup job");
> +return;
> +}
> +
> +hbitmap_reset_all(backup_job->bitmap);
> +}

Please add instead a block_job_do_checkpoint API, and a do_checkpoint
function pointer to BlockJobDriver.

> +{
> +#if 0
> +hbitmap_reset(hb, 0, hb->size << hb->granularity);
> +#else
> +uint64_t size = hb->size;
> +unsigned int i;
> +
> +/* Same as hbitmap_alloc() except memset() */
> +for (i = HBITMAP_LEVELS; i-- > 0; ) {

This can be "--i >= 1"...

> +size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1);
> +memset(hb->levels[i], 0, size * sizeof(unsigned long));
> +}
> +
> +assert(size == 1);
> +hb->levels[0][0] |= 1UL << (BITS_PER_LONG - 1);

... if you use "=" instead of "|=" here.

> +#endif
> +}

Please pick one implementation (no #if), and also add a testcase to
tests/test-hbitmap.c.

Paolo

Re: [Qemu-devel] [PATCH v2 0/2] ahci: test varying sector offsets

2015-03-25 Thread Stefan Hajnoczi

On Fri, Mar 13, 2015 at 03:22:01PM -0400, John Snow wrote:
> This is a re-send of patches 7 & 8 from an earlier series,
> "[PATCH v2 0/8] ahci: add more IO tests" which ultimately got bounced
> back because I used some glib functions that were too new.
> 
> v2:
> - Patchew caught a pathing problem with the qemu-img binary;
>   the relative path produced by the Makefile does not prepend
>   "./", so I was relying on the /distro's/ qemu-img by accident.
>   Fix that by using realpath().
> 
> v1:
> - Removed "./" from the execution CLI. Now you can set an absolute or
>   relative path for QTEST_QEMU_IMG and it will work either way. The default
>   as generated by the Makefile will be a relative path.
> 
> - Removed the g_spawn_check_exit_status glib call from mkimg(). See the
>   in-line comments in patch 1/2 for correctness justification.
> 
> John Snow (2):
>   qtest/ahci: add qcow2 support to ahci-test
>   qtest/ahci: test different disk sectors
> 
>  tests/Makefile|  1 +
>  tests/ahci-test.c | 84 
> +--
>  tests/libqos/ahci.c   | 10 +++---
>  tests/libqos/ahci.h   |  4 +--
>  tests/libqos/libqos.c | 44 +++
>  tests/libqos/libqos.h |  2 ++
>  6 files changed, 116 insertions(+), 29 deletions(-)

Acked-by: Stefan Hajnoczi 

As in traditional Linux Acked-by.  I've looked briefly at the patches
and am happy.


pgpERhjp8dgb0.pgp
Description: PGP signature

Re: [Qemu-devel] [RFC PATCH COLO v2 00/13] Block replication for continuous checkpoints

2015-03-25 Thread Paolo Bonzini



On 25/03/2015 10:36, Wen Congyang wrote:
> Block replication is a very important feature which is used for
> continuous checkpoints(for example: COLO).
> 
> Usage:
> Please refer to docs/block-replication.txt
> 
> You can get the patch here:
> https://github.com/wencongyang/qemu-colo/commits/block-replication-v2
> 
> Changs Log:
> V2:
> 1. Redesign the secondary qemu(use image-fleecing)
> 2. Use Error objects to return error message
> 3. Address the comments from Max Reitz and Eric Blake
> 
> Wen Congyang (13):
>   docs: block replication's description
>   quorum: allow ignoring child errors
>   NBD client: connect to nbd server later
>   Add new block driver interfaces to control block replication
>   quorum: implement block driver interfaces for block replication
>   NBD client: implement block driver interfaces for block replication
>   allow writing to the backing file
>   Allow creating backup jobs when opening BDS
>   block: Parse "backing_reference" option to reference existing BDS
>   Backup: clear all bitmap when doing block checkpoint
>   qcow2: support colo
>   skip nbd_target when starting block replication
>   Don't allow a disk use backing reference target
> 
>  block.c| 242 +++-
>  block/Makefile.objs|   2 +-
>  block/backup.c |  12 ++
>  block/nbd.c| 171 +++--
>  block/qcow2.c  | 447 
> -
>  block/qcow2.h  |   6 +
>  block/quorum.c | 143 ++-
>  docs/block-replication.txt | 147 +++
>  include/block/block.h  |   5 +
>  include/block/block_int.h  |  13 ++
>  include/qemu/hbitmap.h |   8 +
>  qapi/block.json|  16 ++
>  tests/qemu-iotests/051 |  13 ++
>  tests/qemu-iotests/051.out |  13 ++
>  util/hbitmap.c |  19 ++
>  15 files changed, 1230 insertions(+), 27 deletions(-)
>  create mode 100644 docs/block-replication.txt
> 

Looks nice!

I've reviewed the patches where I'm competent.

Paolo

Re: [Qemu-devel] [PATCH v2 0/2] AHCI: avoid mapping stale guest memory

2015-03-25 Thread Stefan Hajnoczi

On Fri, Mar 13, 2015 at 05:50:52PM -0400, John Snow wrote:
> Currently, the AHCI device tries to re-map guest memory every time
> the low or high address registers are written to, whether or not the
> AHCI device is currently active. If the other register has stale
> information in it, this may lead to runtime failures.
> 
> Reconfigure the AHCI device to ignore writes to these registers while
> the device is active, and otherwise postpone the dma memory map until
> the device becomes active.
> 
> If the mappings should for whatever reason fail, do not activate the
> bits that tell the user the device has been started successfully.
> 
> v2:
>  - ahci_map_[clb|fis]_address now returns true on success
>  - PORT_CMD_LIST_ON and PORT_CMD_FIS_ON only turn on if the map succeeds
>  - Fix compiler warning due to changing context.
> 
> John Snow (2):
>   AHCI: Do not (re)map FB/CLB buffers while not running
>   AHCI: Protect cmd register
> 
>  hw/ide/ahci.c | 76 
> +--
>  hw/ide/ahci.h |  2 ++
>  2 files changed, 60 insertions(+), 18 deletions(-)
> 
> -- 
> 1.9.3
> 

Reviewed-by: Stefan Hajnoczi 


pgpt9WDYzjbDI.pgp
Description: PGP signature

Re: [Qemu-devel] [PATCH for-2.3 2/2] i440fx-test: Fix test paths to include architecture

2015-03-25 Thread Paolo Bonzini



On 25/03/2015 00:20, Andreas Färber wrote:
>>> >> -g_test_add(testpath, FirmwareTestFixture, NULL, setup_fixture,
>>> >> +char *path = g_strdup_printf("/%s%s", qtest_get_arch(), testpath);
>>> >> +g_test_add(path, FirmwareTestFixture, NULL, setup_fixture,
>>> >>  test_i440fx_firmware, NULL);
>>> >> +g_free(path);
>>> >>   }
>>> >>
>> > 
>> > Is it not worth adding an even more generic wrapper to prevent future
>> > desynch from our preferred path format?
> As mentioned in the commit message, g_test_add() is a macro, not a
> function, so seemed more complicated to wrap. Can you post a patch if
> you have an idea? :)
> 

You would have to wrap g_test_add_vtable with qtest_add_vtable, and then
add a macro that mimicks g_test_add:

/* hook up a test with fixture under test path */
#define qtest_add(testpath, Fixture, tdata, fsetup, ftest, fteardown) \
G_STMT_START {  \
 void (*add_vtable) (const char*,   
\
gsize, \
gconstpointer, \
void (*) (Fixture*, 
gconstpointer),   \
void (*) (Fixture*, 
gconstpointer),   \
void (*) (Fixture*, 
gconstpointer)) =  (void (*) (const gchar *, gsize, gconstpointer, void (
*) (Fixture*, gconstpointer), void (*) (Fixture*, gconstpointer), void (*) 
(Fixture*, gconstpointer))) qtest_add_vtable; \
 add_vtable \
  (testpath, sizeof (Fixture), tdata, 
fsetup, ftest, fteardown); \
} G_STMT_END


Paolo

Re: [Qemu-devel] question about live migration with storage

2015-03-25 Thread Paolo Bonzini

On 10/03/2015 03:01, Zhang Haoyu wrote:
> My test results show that when using thin-provisioning qcow2
> image(created by qemu-img create -f qcow2 preallocation=metadata), 
> even the unallocated sectors will be transferred to destination, so
> much data is transferred, so the qcow2 image in destination is full
> allocated.

Yes, metadata preallocation actually allocates all sectors.  Only, the
qcow2 image is sparse so the sectors do not consume space on the source.

I think you can avoid the problem by passing file.detect-zeroes=unmap in
the -drive option of the destination QEMU.  Kevin/Peter, can you confirm
this should work?

Paolo

Re: [Qemu-devel] [PATCH v3 0/9] Convert ffs(3) to ctz32()

2015-03-25 Thread Stefan Hajnoczi

On Mon, Mar 23, 2015 at 03:29:22PM +, Stefan Hajnoczi wrote:
> v3:
>  * Use Paolo's much cleaner rewrite of the omap intc loop [Paolo]
>  * Rebased on qemu.git/master
> 
> v2:
>  * Audit coccinelle patch and split out two cases where ctz32(0) == 32 must be
>handled [Peter]
>  * Cc qemu-sta...@nongnu.org on first patch [Eric]
> 
> MinGW does not always provide ffs(3).  My MinGW 4.9.2 cannot link QEMU when
> ./configure --enable-debug was used due to the missing ffs(3) function.
> 
> In the past we already got rid of ffsl(3) calls, so getting rid of ffs(3) is a
> logical next step.
> 
> This series replaces ffs(3) calls with ctz32().  Their semantics differ as 
> follows:
> 
> 1. ffs(0) == 0 but ctz32(0) == 32
> 
> 2. Otherwise, ffs(val) - 1 == ctz32(val)
> 
> The first patch fixes a bug that was discovered when auditing ffs(3) callers.
> 
> The final patch adds checkpatch.pl support to prevent ffs(3), ffsl(3), and
> ffsll(3) from being introduced into the codebase again in the future.
> 
> Note that there are instances of 64-bit values being passed to ffs(3).  I have
> mechanically converted them to ctz32() and not worried about whether the
> original code is buggy or not.
> 
> Paolo Bonzini (1):
>   omap_intc: convert ffs(3) to ctz32() in omap_inth_sir_update()
> 
> Stefan Hajnoczi (8):
>   bt-sdp: fix broken uuids power-of-2 calculation
>   hw/arm/nseries: convert ffs(3) to ctz32()
>   uninorth: convert ffs(3) to ctz32()
>   Convert (ffs(val) - 1) to ctz32(val)
>   Convert ffs() != 0 callers to ctz32()
>   sd: convert sd_normal_command() ffs(3) call to ctz32()
>   os-win32: drop ffs(3) prototype
>   checkpatch: complain about ffs(3) calls
> 
>  block.c |  2 +-
>  block/qcow2-refcount.c  |  2 +-
>  block/qcow2.c   |  4 ++--
>  block/qed.c |  4 ++--
>  block/rbd.c |  2 +-
>  block/sheepdog.c|  2 +-
>  hw/acpi/pcihp.c |  2 +-
>  hw/arm/nseries.c|  5 -
>  hw/arm/omap1.c  |  6 ++
>  hw/arm/pxa2xx_gpio.c|  2 +-
>  hw/arm/strongarm.c  |  4 ++--
>  hw/bt/sdp.c |  2 +-
>  hw/char/virtio-serial-bus.c |  8 
>  hw/display/tc6393xb.c   |  2 +-
>  hw/gpio/max7310.c   |  2 +-
>  hw/gpio/omap_gpio.c | 13 +
>  hw/gpio/zaurus.c|  2 +-
>  hw/i2c/omap_i2c.c   | 10 +++---
>  hw/intc/allwinner-a10-pic.c |  8 
>  hw/intc/omap_intc.c |  9 +
>  hw/pci-host/bonito.c|  2 +-
>  hw/pci-host/uninorth.c  |  5 -
>  hw/pci/msi.c| 12 ++--
>  hw/pci/pcie_aer.c   |  2 +-
>  hw/pci/shpc.c   | 10 +-
>  hw/pci/slotid_cap.c |  2 +-
>  hw/ppc/ppce500_spin.c   |  2 +-
>  hw/scsi/megasas.c   |  2 +-
>  hw/sd/sd.c  |  3 ++-
>  include/hw/pci/pci.h| 16 
>  include/hw/pci/pcie_regs.h  | 18 +-
>  include/sysemu/os-win32.h   |  3 ---
>  kvm-all.c   |  8 
>  scripts/checkpatch.pl   | 11 +++
>  target-ppc/cpu.h|  4 ++--
>  35 files changed, 103 insertions(+), 88 deletions(-)

Applied to my block-next tree:
https://github.com/stefanha/qemu/commits/block-next

Stefan


pgpI4hUumOVPK.pgp
Description: PGP signature

Re: [Qemu-devel] [Migration Bug? ] Occasionally, the content of VM's memory is inconsistent between Source and Destination of migration

2015-03-25 Thread Paolo Bonzini

On 25/03/2015 11:21, Wen Congyang wrote:
> > What kind of load were you having when reproducing this issue?
> > Just to confirm, you have been able to reproduce this without COLO
> > patches, right?
>
> I can reproduce it without COLO patches. 

Thanks.  Please try doing mprotect (using qemu_ram_foreach_block to
mprotect all the areas) after ram_save_complete.

Paolo

Re: [Qemu-devel] Support for NetLogic XLP Processors

2015-03-25 Thread James Hogan

Hi Duarte,

On 22/03/15 11:13, Duarte Silva wrote:
> Hi guys,
> 
> I have been struggling to get some binaries compiled for NetLogic XLP 
> processor to run under QEMU. I have tried a bunch of things (most going back 
> and forth) and always get the following error message:
> 
> qemu: uncaught target signal 4 (Illegal instruction) - core dumped
> Illegal instruction
> 
> I tried to debug it using GDB but to no avail. Does anybody have ideas? I'm 
> running QEMU 2.2.1.

It sounds like the program had an instruction that QEMU doesn't
recognise, or doesn't think should be allowed on the current CPU which
you've set with -cpu. You might be able to find out what that
instruction is by putting this on your qemu command line:

 -singlestep -d in_asm

that will cause it to only translate one instruction at a time, and to
dump each one as it is translated. The last one printed will very likely
to be the one that is causing the problem.

Cheers
James

signature.asc
Description: OpenPGP digital signature

1 2 3 >

1 - 100 of 279 matches

Mail list logo