date:20170120

Re: [Qemu-devel] [PATCH RFC v3 02/14] intel_iommu: simplify irq region translation

2017-01-20 Thread Tian, Kevin

> From: Peter Xu [mailto:pet...@redhat.com]
> Sent: Friday, January 13, 2017 11:06 AM
> 
> Before we have int-remap, we need to bypass interrupt write requests.
> That's not necessary now - we have supported int-remap, and all the irq
> region requests should be redirected there. Cleaning up the block with
> an assertion instead.

This comment is not accurate. According to code, the reason why you
can do such simplification is because we have standalone memory
region now for interrupt addresses. There should be nothing to do 
with int-remap, which can be disabled by guest... Maybe the standalone
region was added when developing int-remap, but functionally they
are not related. :-)

> 
> Signed-off-by: Peter Xu 
> ---
>  hw/i386/intel_iommu.c | 28 ++--
>  1 file changed, 6 insertions(+), 22 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 2868e37..77d467a 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -818,28 +818,12 @@ static void vtd_do_iommu_translate(VTDAddressSpace 
> *vtd_as,
> PCIBus *bus,
>  bool writes = true;
>  VTDIOTLBEntry *iotlb_entry;
> 
> -/* Check if the request is in interrupt address range */
> -if (vtd_is_interrupt_addr(addr)) {
> -if (is_write) {
> -/* FIXME: since we don't know the length of the access here, we
> - * treat Non-DWORD length write requests without PASID as
> - * interrupt requests, too. Withoud interrupt remapping support,
> - * we just use 1:1 mapping.
> - */
> -VTD_DPRINTF(MMU, "write request to interrupt address "
> -"gpa 0x%"PRIx64, addr);
> -entry->iova = addr & VTD_PAGE_MASK_4K;
> -entry->translated_addr = addr & VTD_PAGE_MASK_4K;
> -entry->addr_mask = ~VTD_PAGE_MASK_4K;
> -entry->perm = IOMMU_WO;
> -return;
> -} else {
> -VTD_DPRINTF(GENERAL, "error: read request from interrupt address 
> "
> -"gpa 0x%"PRIx64, addr);
> -vtd_report_dmar_fault(s, source_id, addr, VTD_FR_READ, is_write);
> -return;
> -}
> -}
> +/*
> + * We have standalone memory region for interrupt addresses, we
> + * should never receive translation requests in this region.
> + */
> +assert(!vtd_is_interrupt_addr(addr));
> +
>  /* Try to fetch slpte form IOTLB */
>  iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
>  if (iotlb_entry) {
> --
> 2.7.4

Re: [Qemu-devel] [PATCH v6 4/5] migration: disallow migrate_add_blocker during migration

2017-01-20 Thread Greg Kurz

On Mon, 16 Jan 2017 17:01:53 +0530
Ashijeet Acharya  wrote:

> If a migration is already in progress and somebody attempts
> to add a migration blocker, this should rightly fail.
> 
> Add an errp parameter and a retcode return value to migrate_add_blocker.
> 
> Signed-off-by: John Snow 
> Signed-off-by: Ashijeet Acharya 
> ---

For the 9pfs part.

Acked-by: Greg Kurz 

>  block/qcow.c  |  8 +++-
>  block/vdi.c   |  8 +++-
>  block/vhdx.c  | 17 +++--
>  block/vmdk.c  |  9 -
>  block/vpc.c   | 11 ---
>  block/vvfat.c | 19 ---
>  hw/9pfs/9p.c  | 33 ++---
>  hw/display/virtio-gpu.c   | 32 +++-
>  hw/intc/arm_gic_kvm.c | 17 +++--
>  hw/intc/arm_gicv3_its_kvm.c   | 20 +---
>  hw/intc/arm_gicv3_kvm.c   | 19 ---
>  hw/misc/ivshmem.c | 14 ++
>  hw/scsi/vhost-scsi.c  | 25 +++--
>  hw/virtio/vhost.c |  8 +++-
>  include/migration/migration.h |  7 ++-
>  migration/migration.c | 37 +++--
>  stubs/migr-blocker.c  |  3 ++-
>  target/i386/kvm.c | 16 +---
>  18 files changed, 222 insertions(+), 81 deletions(-)
> 
> diff --git a/block/qcow.c b/block/qcow.c
> index 7540f43..fb738fc 100644
> --- a/block/qcow.c
> +++ b/block/qcow.c
> @@ -104,6 +104,7 @@ static int qcow_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  unsigned int len, i, shift;
>  int ret;
>  QCowHeader header;
> +Error *local_err = NULL;
>  
>  ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
>  if (ret < 0) {
> @@ -252,7 +253,12 @@ static int qcow_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  error_setg(&s->migration_blocker, "The qcow format used by node '%s' "
> "does not support live migration",
> bdrv_get_device_or_node_name(bs));
> -migrate_add_blocker(s->migration_blocker);
> +ret = migrate_add_blocker(s->migration_blocker, &local_err);
> +if (local_err) {
> +error_propagate(errp, local_err);
> +error_free(s->migration_blocker);
> +goto fail;
> +}
>  
>  qemu_co_mutex_init(&s->lock);
>  return 0;
> diff --git a/block/vdi.c b/block/vdi.c
> index 96b78d5..0aeb940 100644
> --- a/block/vdi.c
> +++ b/block/vdi.c
> @@ -361,6 +361,7 @@ static int vdi_open(BlockDriverState *bs, QDict *options, 
> int flags,
>  VdiHeader header;
>  size_t bmap_size;
>  int ret;
> +Error *local_err = NULL;
>  
>  logout("\n");
>  
> @@ -471,7 +472,12 @@ static int vdi_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  error_setg(&s->migration_blocker, "The vdi format used by node '%s' "
> "does not support live migration",
> bdrv_get_device_or_node_name(bs));
> -migrate_add_blocker(s->migration_blocker);
> +ret = migrate_add_blocker(s->migration_blocker, &local_err);
> +if (local_err) {
> +error_propagate(errp, local_err);
> +error_free(s->migration_blocker);
> +goto fail_free_bmap;
> +}
>  
>  qemu_co_mutex_init(&s->write_lock);
>  
> diff --git a/block/vhdx.c b/block/vhdx.c
> index 0ba2f0a..68db9e0 100644
> --- a/block/vhdx.c
> +++ b/block/vhdx.c
> @@ -991,6 +991,17 @@ static int vhdx_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  }
>  }
>  
> +/* Disable migration when VHDX images are used */
> +error_setg(&s->migration_blocker, "The vhdx format used by node '%s' "
> +   "does not support live migration",
> +   bdrv_get_device_or_node_name(bs));
> +ret = migrate_add_blocker(s->migration_blocker, &local_err);
> +if (local_err) {
> +error_propagate(errp, local_err);
> +error_free(s->migration_blocker);
> +goto fail;
> +}
> +
>  if (flags & BDRV_O_RDWR) {
>  ret = vhdx_update_headers(bs, s, false, NULL);
>  if (ret < 0) {
> @@ -1000,12 +1011,6 @@ static int vhdx_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  
>  /* TODO: differencing files */
>  
> -/* Disable migration when VHDX images are used */
> -error_setg(&s->migration_blocker, "The vhdx format used by node '%s' "
> -   "does not support live migration",
> -   bdrv_get_device_or_node_name(bs));
> -migrate_add_blocker(s->migration_blocker);
> -
>  return 0;
>  fail:
>  vhdx_close(bs);
> diff --git a/block/vmdk.c b/block/vmdk.c
> index a11c27a..7750212 100644
> --- a/block/vmdk.c
> +++ b/block/vmdk.c
> @@ -941,6 +941,7 @@ static int vmdk_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  int ret;
>  BDRVVmdkState *s = bs->opaque;
>  uint32_t magic;
> +Error *local_err

Re: [Qemu-devel] [PATCH RFC v3 03/14] intel_iommu: renaming gpa to iova where proper

2017-01-20 Thread Tian, Kevin

> From: Peter Xu [mailto:pet...@redhat.com]
> Sent: Friday, January 13, 2017 11:06 AM
> 
> There are lots of places in current intel_iommu.c codes that named
> "iova" as "gpa". It is really confusing to use a name "gpa" in these
> places (which is very easily to be understood as "Guest Physical
> Address", while it's not). To make the codes (much) easier to be read, I
> decided to do this once and for all.
> 
> No functional change is made. Only literal ones.

If looking at VT-d spec (3.2 Domains and Address Translation)

Remapping hardware treats the address in inbound requests as DMA 
Address. Depending on the software usage model, the DMA address 
space may be the Guest-Physical Address (GPA) space of the virtual 
machine to which the device is assigned, or application Virtual Address 
(VA) space defined by the PASID assigned to an application, or some 
abstract I/O virtual address (IOVA) space defined by software.

For simplicity, this document refers to address in requests-without-
PASID as GPA, and address in requests-with-PASID as Virtual Address 
(VA) (or Guest Virtual Address (GVA), if such request is from a device 
assigned to a virtual machine). The translated address is referred to 
as 
HPA.

It will add more readability if similar comment is added in this file - you
can say choosing iova to represent address in requests-without-PASID.

> 
> Signed-off-by: Peter Xu 
> ---
>  hw/i386/intel_iommu.c | 36 ++--
>  1 file changed, 18 insertions(+), 18 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 77d467a..275c3db 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -259,7 +259,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t
> source_id,
>  uint64_t *key = g_malloc(sizeof(*key));
>  uint64_t gfn = vtd_get_iotlb_gfn(addr, level);
> 
> -VTD_DPRINTF(CACHE, "update iotlb sid 0x%"PRIx16 " gpa 0x%"PRIx64
> +VTD_DPRINTF(CACHE, "update iotlb sid 0x%"PRIx16 " iova 0x%"PRIx64
>  " slpte 0x%"PRIx64 " did 0x%"PRIx16, source_id, addr, slpte,
>  domain_id);
>  if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) {
> @@ -575,12 +575,12 @@ static uint64_t vtd_get_slpte(dma_addr_t base_addr, 
> uint32_t
> index)
>  return slpte;
>  }
> 
> -/* Given a gpa and the level of paging structure, return the offset of 
> current
> - * level.
> +/* Given an iova and the level of paging structure, return the offset
> + * of current level.
>   */
> -static inline uint32_t vtd_gpa_level_offset(uint64_t gpa, uint32_t level)
> +static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level)
>  {
> -return (gpa >> vtd_slpt_level_shift(level)) &
> +return (iova >> vtd_slpt_level_shift(level)) &
>  ((1ULL << VTD_SL_LEVEL_BITS) - 1);
>  }
> 
> @@ -628,10 +628,10 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, 
> uint32_t
> level)
>  }
>  }
> 
> -/* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
> +/* Given the @iova, get relevant @slptep. @slpte_level will be the last level
>   * of the translation, can be used for deciding the size of large page.
>   */
> -static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
> +static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t iova, bool 
> is_write,
>  uint64_t *slptep, uint32_t *slpte_level,
>  bool *reads, bool *writes)
>  {
> @@ -642,11 +642,11 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, 
> uint64_t gpa,
> bool is_write,
>  uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce);
>  uint64_t access_right_check;
> 
> -/* Check if @gpa is above 2^X-1, where X is the minimum of MGAW in 
> CAP_REG
> - * and AW in context-entry.
> +/* Check if @iova is above 2^X-1, where X is the minimum of MGAW
> + * in CAP_REG and AW in context-entry.
>   */
> -if (gpa & ~((1ULL << MIN(ce_agaw, VTD_MGAW)) - 1)) {
> -VTD_DPRINTF(GENERAL, "error: gpa 0x%"PRIx64 " exceeds limits", gpa);
> +if (iova & ~((1ULL << MIN(ce_agaw, VTD_MGAW)) - 1)) {
> +VTD_DPRINTF(GENERAL, "error: iova 0x%"PRIx64 " exceeds limits", 
> iova);
>  return -VTD_FR_ADDR_BEYOND_MGAW;
>  }
> 
> @@ -654,13 +654,13 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, 
> uint64_t gpa,
> bool is_write,
>  access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
> 
>  while (true) {
> -offset = vtd_gpa_level_offset(gpa, level);
> +offset = vtd_iova_level_offset(iova, level);
>  slpte = vtd_get_slpte(addr, offset);
> 
>  if (slpte == (uint64_t)-1) {
>  VTD_DPRINTF(GENERAL, "error: fail to access second-level paging "
> -"entry at level %"PRIu32 " for gpa 0x%"PRIx64,
> -level, gpa);
> +

Re: [Qemu-devel] [PATCH RFC v3 01/14] IOMMU: add option to enable VTD_CAP_CM to vIOMMU capility exposoed to guest

2017-01-20 Thread Tian, Kevin

> From: Peter Xu [mailto:pet...@redhat.com]
> Sent: Friday, January 13, 2017 11:06 AM
> 
> From: Aviv Ben-David 
> 
> This capability asks the guest to invalidate cache before each map operation.
> We can use this invalidation to trap map operations in the hypervisor.
> 
> Signed-off-by: Aviv Ben-David 
> Signed-off-by: Peter Xu 
> ---
>  hw/i386/intel_iommu.c  | 5 +
>  hw/i386/intel_iommu_internal.h | 1 +
>  include/hw/i386/intel_iommu.h  | 2 ++
>  3 files changed, 8 insertions(+)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index ec62239..2868e37 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -2107,6 +2107,7 @@ static Property vtd_properties[] = {
>  DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
>  ON_OFF_AUTO_AUTO),
>  DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
> +DEFINE_PROP_BOOL("cache-mode", IntelIOMMUState, cache_mode_enabled,
> FALSE),
>  DEFINE_PROP_END_OF_LIST(),
>  };
> 
> @@ -2488,6 +2489,10 @@ static void vtd_init(IntelIOMMUState *s)
>  s->ecap |= VTD_ECAP_DT;
>  }
> 
> +if (s->cache_mode_enabled) {
> +s->cap |= VTD_CAP_CM;
> +}
> +

I think some of my old comments are not answered:

1) Better to use caching_mode to follow spec

2) Does it make sense to automatically set this flag if any VFIO device
has been statically assigned when starting Qemu? Also for hot-add
device path, some check of caching mode is required. If not set, 
should we fail hot-add operation? I don't think we have such physical
platform with some devices behind IOMMU while others not.

>  vtd_reset_context_cache(s);
>  vtd_reset_iotlb(s);
> 
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index 356f188..4104121 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -202,6 +202,7 @@
>  #define VTD_CAP_MAMV(VTD_MAMV << 48)
>  #define VTD_CAP_PSI (1ULL << 39)
>  #define VTD_CAP_SLLPS   ((1ULL << 34) | (1ULL << 35))
> +#define VTD_CAP_CM  (1ULL << 7)
> 
>  /* Supported Adjusted Guest Address Widths */
>  #define VTD_CAP_SAGAW_SHIFT 8
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index 405c9d1..749eef9 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -257,6 +257,8 @@ struct IntelIOMMUState {
>  uint8_t womask[DMAR_REG_SIZE];  /* WO (write only - read returns 0) */
>  uint32_t version;
> 
> +bool cache_mode_enabled;/* RO - is cap CM enabled? */
> +
>  dma_addr_t root;/* Current root table pointer */
>  bool root_extended; /* Type of root table (extended or not) 
> */
>  bool dmar_enabled;  /* Set if DMA remapping is enabled */
> --
> 2.7.4

Re: [Qemu-devel] [PATCH RFC v3 01/14] IOMMU: add option to enable VTD_CAP_CM to vIOMMU capility exposoed to guest

2017-01-20 Thread Peter Xu

On Fri, Jan 20, 2017 at 08:32:06AM +, Tian, Kevin wrote:
> > From: Peter Xu [mailto:pet...@redhat.com]
> > Sent: Friday, January 13, 2017 11:06 AM
> > 
> > From: Aviv Ben-David 
> > 
> > This capability asks the guest to invalidate cache before each map 
> > operation.
> > We can use this invalidation to trap map operations in the hypervisor.
> > 
> > Signed-off-by: Aviv Ben-David 
> > Signed-off-by: Peter Xu 
> > ---
> >  hw/i386/intel_iommu.c  | 5 +
> >  hw/i386/intel_iommu_internal.h | 1 +
> >  include/hw/i386/intel_iommu.h  | 2 ++
> >  3 files changed, 8 insertions(+)
> > 
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index ec62239..2868e37 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -2107,6 +2107,7 @@ static Property vtd_properties[] = {
> >  DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
> >  ON_OFF_AUTO_AUTO),
> >  DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
> > +DEFINE_PROP_BOOL("cache-mode", IntelIOMMUState, cache_mode_enabled,
> > FALSE),
> >  DEFINE_PROP_END_OF_LIST(),
> >  };
> > 
> > @@ -2488,6 +2489,10 @@ static void vtd_init(IntelIOMMUState *s)
> >  s->ecap |= VTD_ECAP_DT;
> >  }
> > 
> > +if (s->cache_mode_enabled) {
> > +s->cap |= VTD_CAP_CM;
> > +}
> > +
> 
> I think some of my old comments are not answered:
> 
> 1) Better to use caching_mode to follow spec

Sure.

> 
> 2) Does it make sense to automatically set this flag if any VFIO device
> has been statically assigned when starting Qemu?

I'm okay with both, considering that people using this flag will be
possibly advanced users. So I would like to hear others' opinion.

> Also for hot-add
> device path, some check of caching mode is required. If not set, 
> should we fail hot-add operation? I don't think we have such physical
> platform with some devices behind IOMMU while others not.

Could you explain in what case will we fail a hot plug?

Thanks,

-- peterx

Re: [Qemu-devel] [PATCH RFC v3 01/14] IOMMU: add option to enable VTD_CAP_CM to vIOMMU capility exposoed to guest

2017-01-20 Thread Tian, Kevin

> From: Peter Xu [mailto:pet...@redhat.com]
> Sent: Friday, January 20, 2017 4:55 PM
> 
> On Fri, Jan 20, 2017 at 08:32:06AM +, Tian, Kevin wrote:
> > > From: Peter Xu [mailto:pet...@redhat.com]
> > > Sent: Friday, January 13, 2017 11:06 AM
> > >
> > > From: Aviv Ben-David 
> > >
> > > This capability asks the guest to invalidate cache before each map 
> > > operation.
> > > We can use this invalidation to trap map operations in the hypervisor.
> > >
> > > Signed-off-by: Aviv Ben-David 
> > > Signed-off-by: Peter Xu 
> > > ---
> > >  hw/i386/intel_iommu.c  | 5 +
> > >  hw/i386/intel_iommu_internal.h | 1 +
> > >  include/hw/i386/intel_iommu.h  | 2 ++
> > >  3 files changed, 8 insertions(+)
> > >
> > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > > index ec62239..2868e37 100644
> > > --- a/hw/i386/intel_iommu.c
> > > +++ b/hw/i386/intel_iommu.c
> > > @@ -2107,6 +2107,7 @@ static Property vtd_properties[] = {
> > >  DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
> > >  ON_OFF_AUTO_AUTO),
> > >  DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
> > > +DEFINE_PROP_BOOL("cache-mode", IntelIOMMUState, cache_mode_enabled,
> > > FALSE),
> > >  DEFINE_PROP_END_OF_LIST(),
> > >  };
> > >
> > > @@ -2488,6 +2489,10 @@ static void vtd_init(IntelIOMMUState *s)
> > >  s->ecap |= VTD_ECAP_DT;
> > >  }
> > >
> > > +if (s->cache_mode_enabled) {
> > > +s->cap |= VTD_CAP_CM;
> > > +}
> > > +
> >
> > I think some of my old comments are not answered:
> >
> > 1) Better to use caching_mode to follow spec
> 
> Sure.
> 
> >
> > 2) Does it make sense to automatically set this flag if any VFIO device
> > has been statically assigned when starting Qemu?
> 
> I'm okay with both, considering that people using this flag will be
> possibly advanced users. So I would like to hear others' opinion.
> 
> > Also for hot-add
> > device path, some check of caching mode is required. If not set,
> > should we fail hot-add operation? I don't think we have such physical
> > platform with some devices behind IOMMU while others not.
> 
> Could you explain in what case will we fail a hot plug?
> 

user enables intel-iommu, but don't set caching mode.

Then later user hot-add a PCI device to the VM. Guest will assume
newly assigned device also behind the default vIOMMU, and thus
needs to setup IOVA mappings, which is then broken...

Thanks
Kevin

Re: [Qemu-devel] [PATCH RFC v3 02/14] intel_iommu: simplify irq region translation

2017-01-20 Thread Peter Xu

On Fri, Jan 20, 2017 at 08:22:14AM +, Tian, Kevin wrote:
> > From: Peter Xu [mailto:pet...@redhat.com]
> > Sent: Friday, January 13, 2017 11:06 AM
> > 
> > Before we have int-remap, we need to bypass interrupt write requests.
> > That's not necessary now - we have supported int-remap, and all the irq
> > region requests should be redirected there. Cleaning up the block with
> > an assertion instead.
> 
> This comment is not accurate. According to code, the reason why you
> can do such simplification is because we have standalone memory
> region now for interrupt addresses. There should be nothing to do 
> with int-remap, which can be disabled by guest... Maybe the standalone
> region was added when developing int-remap, but functionally they
> are not related. :-)

IMHO the above commit message is fairly clear. :-)

But sure I can add some more emphasise like:

  "Before we have int-remap memory region, ..."

Do you think it's okay? Or any better suggestion?

(Just to mention that even guest disables IR, the MSI region will
 still be there.)

Thanks,

-- peterx

Re: [Qemu-devel] [PATCH RFC v3 01/14] IOMMU: add option to enable VTD_CAP_CM to vIOMMU capility exposoed to guest

2017-01-20 Thread Peter Xu

On Fri, Jan 20, 2017 at 08:59:01AM +, Tian, Kevin wrote:

[...]

> > > Also for hot-add
> > > device path, some check of caching mode is required. If not set,
> > > should we fail hot-add operation? I don't think we have such physical
> > > platform with some devices behind IOMMU while others not.
> > 
> > Could you explain in what case will we fail a hot plug?
> > 
> 
> user enables intel-iommu, but don't set caching mode.
> 
> Then later user hot-add a PCI device to the VM. Guest will assume
> newly assigned device also behind the default vIOMMU, and thus
> needs to setup IOVA mappings, which is then broken...

Is the newly added device a vfio-pci device? If so, we should hit
this and VM will stops to work:

if (!s->cache_mode_enabled && new & IOMMU_NOTIFIER_MAP) {
error_report("We need to set cache_mode=1 for intel-iommu to enable "
 "device assignment with IOMMU protection.");
exit(1);
}

I admit this is not user-friendly, and a better way may be that we
disallow the hot-plug in that case, telling the user about the error,
rather than crashing the VM. But, I think that can be a patch outside
this series, considering (again) that this only affects advanced
users.

Thanks,

-- peterx

Re: [Qemu-devel] [PATCH RFC v3 02/14] intel_iommu: simplify irq region translation

2017-01-20 Thread Tian, Kevin

> From: Peter Xu [mailto:pet...@redhat.com]
> Sent: Friday, January 20, 2017 5:05 PM
> 
> On Fri, Jan 20, 2017 at 08:22:14AM +, Tian, Kevin wrote:
> > > From: Peter Xu [mailto:pet...@redhat.com]
> > > Sent: Friday, January 13, 2017 11:06 AM
> > >
> > > Before we have int-remap, we need to bypass interrupt write requests.
> > > That's not necessary now - we have supported int-remap, and all the irq
> > > region requests should be redirected there. Cleaning up the block with
> > > an assertion instead.
> >
> > This comment is not accurate. According to code, the reason why you
> > can do such simplification is because we have standalone memory
> > region now for interrupt addresses. There should be nothing to do
> > with int-remap, which can be disabled by guest... Maybe the standalone
> > region was added when developing int-remap, but functionally they
> > are not related. :-)
> 
> IMHO the above commit message is fairly clear. :-)
> 
> But sure I can add some more emphasise like:
> 
>   "Before we have int-remap memory region, ..."
> 
> Do you think it's okay? Or any better suggestion?
> 
> (Just to mention that even guest disables IR, the MSI region will
>  still be there.)
> 

My option is simple - this patch has nothing to do with int-remap.
It's not necessary, not because we supported int-remap. It's because
we have a standalone memory region for interrupt addresses, as you
described in the code. :-)

Thanks
Kevin

[Qemu-devel] [PULL v2 05/12] s390x/pci: dynamically allocate iommu

2017-01-20 Thread Cornelia Huck

From: Yi Min Zhao 

When initializing a PCI device, an address space is required during PCI
core initialization and before the call to the embedding object hotplug
callback. To provide this AS, we allocate a S390PCIIOMMU object
containing this AS. Initialization of S390PCIIOMMU object is done
before the PCI device is completely created. So that we cannot
associate the IOMMU with the device at the moment. To track the IOMMU
object, we use g_hash functions with the PCI device's bus address as a
key to provide an array of pointers indexed by the PCI device's devfn
to the allocated IOMMU.

Signed-off-by: Yi Min Zhao 
Reviewed-by: Pierre Morel 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/s390-pci-bus.c | 72 +++--
 hw/s390x/s390-pci-bus.h |  7 -
 2 files changed, 64 insertions(+), 15 deletions(-)

diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index 11244fc93c..f3afb17f6a 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -432,11 +432,48 @@ static const MemoryRegionIOMMUOps s390_iommu_ops = {
 .translate = s390_translate_iommu,
 };
 
+static S390PCIIOMMU *s390_pci_get_iommu(S390pciState *s, PCIBus *bus,
+int devfn)
+{
+uint64_t key = (uintptr_t)bus;
+S390PCIIOMMUTable *table = g_hash_table_lookup(s->iommu_table, &key);
+S390PCIIOMMU *iommu;
+
+if (!table) {
+table = g_malloc0(sizeof(S390PCIIOMMUTable));
+table->key = key;
+g_hash_table_insert(s->iommu_table, &table->key, table);
+}
+
+iommu = table->iommu[PCI_SLOT(devfn)];
+if (!iommu) {
+iommu = S390_PCI_IOMMU(object_new(TYPE_S390_PCI_IOMMU));
+
+char *mr_name = g_strdup_printf("iommu-root-%02x:%02x.%01x",
+pci_bus_num(bus),
+PCI_SLOT(devfn),
+PCI_FUNC(devfn));
+char *as_name = g_strdup_printf("iommu-pci-%02x:%02x.%01x",
+pci_bus_num(bus),
+PCI_SLOT(devfn),
+PCI_FUNC(devfn));
+memory_region_init(&iommu->mr, OBJECT(iommu), mr_name, UINT64_MAX);
+address_space_init(&iommu->as, &iommu->mr, as_name);
+table->iommu[PCI_SLOT(devfn)] = iommu;
+
+g_free(mr_name);
+g_free(as_name);
+}
+
+return iommu;
+}
+
 static AddressSpace *s390_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
 {
 S390pciState *s = opaque;
+S390PCIIOMMU *iommu = s390_pci_get_iommu(s, bus, devfn);
 
-return &s->iommu[PCI_SLOT(devfn)]->as;
+return &iommu->as;
 }
 
 static uint8_t set_ind_atomic(uint64_t ind_loc, uint8_t to_be_set)
@@ -520,19 +557,21 @@ void s390_pci_iommu_disable(S390PCIIOMMU *iommu)
 object_unparent(OBJECT(&iommu->iommu_mr));
 }
 
-static void s390_pcihost_init_as(S390pciState *s)
+static void s390_pci_iommu_free(PCIBus *bus, int32_t devfn)
 {
-int i;
-S390PCIIOMMU *iommu;
+uint64_t key = (uintptr_t)bus;
+S390PCIIOMMUTable *table = g_hash_table_lookup(s->iommu_table, &key);
+S390PCIIOMMU *iommu = table ? table->iommu[PCI_SLOT(devfn)] : NULL;
 
-for (i = 0; i < PCI_SLOT_MAX; i++) {
-iommu = g_malloc0(sizeof(S390PCIIOMMU));
-memory_region_init(&iommu->mr, OBJECT(s),
-   "iommu-root-s390", UINT64_MAX);
-address_space_init(&iommu->as, &iommu->mr, "iommu-pci");
-
-s->iommu[i] = iommu;
+if (!table || !iommu) {
+return;
 }
+
+table->iommu[PCI_SLOT(devfn)] = NULL;
+address_space_destroy(&iommu->as);
+object_unparent(OBJECT(&iommu->mr));
+object_unparent(OBJECT(iommu));
+object_unref(OBJECT(iommu));
 }
 
 static int s390_pcihost_init(SysBusDevice *dev)
@@ -548,7 +587,6 @@ static int s390_pcihost_init(SysBusDevice *dev)
  s390_pci_set_irq, s390_pci_map_irq, NULL,
  get_system_memory(), get_system_io(), 0, 64,
  TYPE_PCI_BUS);
-s390_pcihost_init_as(s);
 pci_setup_iommu(b, s390_pci_dma_iommu, s);
 
 bus = BUS(b);
@@ -558,6 +596,8 @@ static int s390_pcihost_init(SysBusDevice *dev)
 s->bus = S390_PCI_BUS(qbus_create(TYPE_S390_PCI_BUS, DEVICE(s), NULL));
 qbus_set_hotplug_handler(BUS(s->bus), DEVICE(s), NULL);
 
+s->iommu_table = g_hash_table_new_full(g_int64_hash, g_int64_equal,
+   NULL, g_free);
 QTAILQ_INIT(&s->pending_sei);
 return 0;
 }
@@ -661,7 +701,7 @@ static void s390_pcihost_hot_plug(HotplugHandler 
*hotplug_dev,
 }
 
 pbdev->pdev = pdev;
-pbdev->iommu = s->iommu[PCI_SLOT(pdev->devfn)];
+pbdev->iommu = s390_pci_get_iommu(s, pdev->bus, pdev->devfn);
 pbdev->iommu->pbdev = pbdev;
 pbdev->state = ZPCI_FS_STANDBY;
 
@@ -708,8 +748,9 @@ static void s390_pcihost_timer_cb(void *opaque)

[Qemu-devel] [PULL v2 01/12] s390x: remove double compat statement

2017-01-20 Thread Cornelia Huck

From: Christian Borntraeger 

We chain our compat handler via the CCW_COMPAT macros and via the
class_init function. (e.g. ccw_machine_2_7_class_options calls
ccw_machine_2_8_class_options). As all class_init functions in that
chain call SET_MACHINE_COMPAT for their compat settings, and
SET_MACHINE_COMPAT will append there is no need to do that again.

Signed-off-by: Christian Borntraeger 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/s390-virtio-ccw.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index e340eab36b..dbf4f01165 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -339,7 +339,6 @@ static const TypeInfo ccw_machine_info = {
 HW_COMPAT_2_7
 
 #define CCW_COMPAT_2_6 \
-CCW_COMPAT_2_7 \
 HW_COMPAT_2_6 \
 {\
 .driver   = TYPE_S390_IPL,\
@@ -352,7 +351,6 @@ static const TypeInfo ccw_machine_info = {
 },
 
 #define CCW_COMPAT_2_5 \
-CCW_COMPAT_2_6 \
 HW_COMPAT_2_5
 
 #define CCW_COMPAT_2_4 \
-- 
2.11.0

[Qemu-devel] [PULL v2 02/12] s390x: add compat machine for 2.9

2017-01-20 Thread Cornelia Huck

Signed-off-by: Cornelia Huck 
Acked-by: Christian Borntraeger 
---
 hw/s390x/s390-virtio-ccw.c | 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index dbf4f01165..e9a676797a 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -335,6 +335,9 @@ static const TypeInfo ccw_machine_info = {
 } \
 type_init(ccw_machine_register_##suffix)
 
+#define CCW_COMPAT_2_8 \
+HW_COMPAT_2_8
+
 #define CCW_COMPAT_2_7 \
 HW_COMPAT_2_7
 
@@ -393,14 +396,26 @@ static const TypeInfo ccw_machine_info = {
 .value= "0",\
 },
 
+static void ccw_machine_2_9_instance_options(MachineState *machine)
+{
+}
+
+static void ccw_machine_2_9_class_options(MachineClass *mc)
+{
+}
+DEFINE_CCW_MACHINE(2_9, "2.9", true);
+
 static void ccw_machine_2_8_instance_options(MachineState *machine)
 {
+ccw_machine_2_9_instance_options(machine);
 }
 
 static void ccw_machine_2_8_class_options(MachineClass *mc)
 {
+ccw_machine_2_9_class_options(mc);
+SET_MACHINE_COMPAT(mc, CCW_COMPAT_2_8);
 }
-DEFINE_CCW_MACHINE(2_8, "2.8", true);
+DEFINE_CCW_MACHINE(2_8, "2.8", false);
 
 static void ccw_machine_2_7_instance_options(MachineState *machine)
 {
-- 
2.11.0

[Qemu-devel] [PULL v2 03/12] s390x/kvm: use kvm_gsi_routing_enabled in flic

2017-01-20 Thread Cornelia Huck

From: Fei Li 

Let's use kvm_gsi_routing_enabled() to check if kvm supports
KVM_CAP_IRQ_ROUTING in order to avoid a needless ioctl invocation.

Signed-off-by: Fei Li 
Signed-off-by: Cornelia Huck 
---
 hw/intc/s390_flic_kvm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/intc/s390_flic_kvm.c b/hw/intc/s390_flic_kvm.c
index 21ac2e2dcd..c313166fbe 100644
--- a/hw/intc/s390_flic_kvm.c
+++ b/hw/intc/s390_flic_kvm.c
@@ -201,7 +201,7 @@ static int kvm_s390_register_io_adapter(S390FLICState *fs, 
uint32_t id,
 .addr = (uint64_t)&adapter,
 };
 
-if (!kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING)) {
+if (!kvm_gsi_routing_enabled()) {
 /* nothing to do */
 return 0;
 }
@@ -226,7 +226,7 @@ static int kvm_s390_io_adapter_map(S390FLICState *fs, 
uint32_t id,
 KVMS390FLICState *flic = KVM_S390_FLIC(fs);
 int r;
 
-if (!kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING)) {
+if (!kvm_gsi_routing_enabled()) {
 /* nothing to do */
 return 0;
 }
-- 
2.11.0

[Qemu-devel] [PULL v2 00/12] s390x update

2017-01-20 Thread Cornelia Huck

The following changes since commit 0f6bcf68a99efdc531b209551f2b760b0bdcc554:

  Merge remote-tracking branch 'remotes/artyom/tags/pull-sun4v-20170118' into 
staging (2017-01-19 18:34:13 +)

are available in the git repository at:

  git://github.com/cohuck/qemu tags/s390x-20170120-v2

for you to fetch changes up to 8c797e758ad41b8dd6ea6f72c6f0194dedb30a2f:

  virtio-ccw: fix ring sizing (2017-01-20 10:02:02 +0100)


First set of s390x patches for 2.9:
- rework of the zpci code, giving us proper multibus support
- introduction of the 2.9 machine
- fixes and improvements



Christian Borntraeger (1):
  s390x: remove double compat statement

Cornelia Huck (1):
  s390x: add compat machine for 2.9

Fei Li (1):
  s390x/kvm: use kvm_gsi_routing_enabled in flic

Michael S. Tsirkin (1):
  virtio-ccw: fix ring sizing

Pierre Morel (3):
  s390x/pci: change the device array to a list
  s390x/pci: PCI multibus bridge handling
  s390x/pci: handle PCIBridge bus number

Yi Min Zhao (5):
  s390x/pci: make S390PCIIOMMU inherit Object
  s390x/pci: dynamically allocate iommu
  s390x/pci: optimize calling s390_get_phb()
  s390x/pci: use hashtable to look up zpci via fh
  s390x/pci: merge msix init functions

 hw/intc/s390_flic_kvm.c|   4 +-
 hw/s390x/s390-pci-bus.c| 361 +
 hw/s390x/s390-pci-bus.h|  46 --
 hw/s390x/s390-pci-inst.c   |  74 +-
 hw/s390x/s390-pci-inst.h   |   2 +-
 hw/s390x/s390-virtio-ccw.c |  19 ++-
 hw/s390x/virtio-ccw.c  |   2 +-
 hw/virtio/virtio.c |   5 +
 include/hw/virtio/virtio.h |   1 +
 target/s390x/kvm.c |   2 +-
 10 files changed, 334 insertions(+), 182 deletions(-)

-- 
2.11.0

[Qemu-devel] [PULL v2 07/12] s390x/pci: optimize calling s390_get_phb()

2017-01-20 Thread Cornelia Huck

From: Yi Min Zhao 

A function may recursively call device search functions or may call
serveral different device search function. Passing the S390pciState to
search functions as an argument instead of looking up it inside the
search functions lowers the number of calling s390_get_phb().

Signed-off-by: Yi Min Zhao 
Reviewed-by: Pierre Morel 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/s390-pci-bus.c  | 66 +++-
 hw/s390x/s390-pci-bus.h  | 10 +---
 hw/s390x/s390-pci-inst.c | 24 ++
 target/s390x/kvm.c   |  2 +-
 4 files changed, 52 insertions(+), 50 deletions(-)

diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index 7ba1ae8b25..70673af591 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -31,7 +31,7 @@
 do { } while (0)
 #endif
 
-static S390pciState *s390_get_phb(void)
+S390pciState *s390_get_phb(void)
 {
 static S390pciState *phb;
 
@@ -91,9 +91,9 @@ int chsc_sei_nt2_have_event(void)
 return !QTAILQ_EMPTY(&s->pending_sei);
 }
 
-S390PCIBusDevice *s390_pci_find_next_avail_dev(S390PCIBusDevice *pbdev)
+S390PCIBusDevice *s390_pci_find_next_avail_dev(S390pciState *s,
+   S390PCIBusDevice *pbdev)
 {
-S390pciState *s = s390_get_phb();
 S390PCIBusDevice *ret = pbdev ? QTAILQ_NEXT(pbdev, link) :
 QTAILQ_FIRST(&s->zpci_devs);
 
@@ -104,10 +104,9 @@ S390PCIBusDevice 
*s390_pci_find_next_avail_dev(S390PCIBusDevice *pbdev)
 return ret;
 }
 
-S390PCIBusDevice *s390_pci_find_dev_by_fid(uint32_t fid)
+S390PCIBusDevice *s390_pci_find_dev_by_fid(S390pciState *s, uint32_t fid)
 {
 S390PCIBusDevice *pbdev;
-S390pciState *s = s390_get_phb();
 
 QTAILQ_FOREACH(pbdev, &s->zpci_devs, link) {
 if (pbdev->fid == fid) {
@@ -121,7 +120,8 @@ S390PCIBusDevice *s390_pci_find_dev_by_fid(uint32_t fid)
 void s390_pci_sclp_configure(SCCB *sccb)
 {
 PciCfgSccb *psccb = (PciCfgSccb *)sccb;
-S390PCIBusDevice *pbdev = 
s390_pci_find_dev_by_fid(be32_to_cpu(psccb->aid));
+S390PCIBusDevice *pbdev = s390_pci_find_dev_by_fid(s390_get_phb(),
+   
be32_to_cpu(psccb->aid));
 uint16_t rc;
 
 if (be16_to_cpu(sccb->h.length) < 16) {
@@ -153,7 +153,8 @@ out:
 void s390_pci_sclp_deconfigure(SCCB *sccb)
 {
 PciCfgSccb *psccb = (PciCfgSccb *)sccb;
-S390PCIBusDevice *pbdev = 
s390_pci_find_dev_by_fid(be32_to_cpu(psccb->aid));
+S390PCIBusDevice *pbdev = s390_pci_find_dev_by_fid(s390_get_phb(),
+   
be32_to_cpu(psccb->aid));
 uint16_t rc;
 
 if (be16_to_cpu(sccb->h.length) < 16) {
@@ -192,10 +193,9 @@ out:
 psccb->header.response_code = cpu_to_be16(rc);
 }
 
-static S390PCIBusDevice *s390_pci_find_dev_by_uid(uint16_t uid)
+static S390PCIBusDevice *s390_pci_find_dev_by_uid(S390pciState *s, uint16_t 
uid)
 {
 S390PCIBusDevice *pbdev;
-S390pciState *s = s390_get_phb();
 
 QTAILQ_FOREACH(pbdev, &s->zpci_devs, link) {
 if (pbdev->uid == uid) {
@@ -206,10 +206,10 @@ static S390PCIBusDevice 
*s390_pci_find_dev_by_uid(uint16_t uid)
 return NULL;
 }
 
-static S390PCIBusDevice *s390_pci_find_dev_by_target(const char *target)
+static S390PCIBusDevice *s390_pci_find_dev_by_target(S390pciState *s,
+ const char *target)
 {
 S390PCIBusDevice *pbdev;
-S390pciState *s = s390_get_phb();
 
 if (!target) {
 return NULL;
@@ -224,10 +224,9 @@ static S390PCIBusDevice *s390_pci_find_dev_by_target(const 
char *target)
 return NULL;
 }
 
-S390PCIBusDevice *s390_pci_find_dev_by_idx(uint32_t idx)
+S390PCIBusDevice *s390_pci_find_dev_by_idx(S390pciState *s, uint32_t idx)
 {
 S390PCIBusDevice *pbdev;
-S390pciState *s = s390_get_phb();
 
 QTAILQ_FOREACH(pbdev, &s->zpci_devs, link) {
 if (pbdev->idx == idx) {
@@ -238,9 +237,8 @@ S390PCIBusDevice *s390_pci_find_dev_by_idx(uint32_t idx)
 return NULL;
 }
 
-S390PCIBusDevice *s390_pci_find_dev_by_fh(uint32_t fh)
+S390PCIBusDevice *s390_pci_find_dev_by_fh(S390pciState *s, uint32_t fh)
 {
-S390pciState *s = s390_get_phb();
 S390PCIBusDevice *pbdev;
 
 QTAILQ_FOREACH(pbdev, &s->zpci_devs, link) {
@@ -544,7 +542,7 @@ void s390_pci_iommu_disable(S390PCIIOMMU *iommu)
 object_unparent(OBJECT(&iommu->iommu_mr));
 }
 
-static void s390_pci_iommu_free(PCIBus *bus, int32_t devfn)
+static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn)
 {
 uint64_t key = (uintptr_t)bus;
 S390PCIIOMMUTable *table = g_hash_table_lookup(s->iommu_table, &key);
@@ -638,10 +636,10 @@ static void s390_pci_msix_free(S390PCIBusDevice *pbdev)
 object_unparent(OBJECT(&pbdev->msix_notify_mr));
 }
 
-static S390PCIBusDevice *s390_pci_device_new(const char *target)
+static S390PCIBusDevice *s390_pci_device_new(S390pciState *s,
+ const

[Qemu-devel] [PULL v2 10/12] s390x/pci: handle PCIBridge bus number

2017-01-20 Thread Cornelia Huck

From: Pierre Morel 

The PCI bus number is usually set by the host during the enumeration.

In the s390 architecture we neither get a Device Tree nor have an
enumeration understanding bridge devices.

Let's fake the enumeration on reset and set the PCI_PRIMARY_BUS,
PCI_SECONDARY_BUS and PCI_SUBORDINATE_BUS config entries for the
bridges.

Let's add the configuration of these three config entries on bridge hot
plug.

The bus number is calculated based on a new entry, bus_num of the
S390pciState device.

This commit is inspired by what spapr pci does.

Signed-off-by: Pierre Morel 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/s390-pci-bus.c | 52 +
 hw/s390x/s390-pci-bus.h |  1 +
 2 files changed, 53 insertions(+)

diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index c2ae01c9a6..5cf97b4f7d 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -576,6 +576,7 @@ static int s390_pcihost_init(SysBusDevice *dev)
 s->iommu_table = g_hash_table_new_full(g_int64_hash, g_int64_equal,
NULL, g_free);
 s->zpci_table = g_hash_table_new_full(g_int_hash, g_int_equal, NULL, NULL);
+s->bus_no = 0;
 QTAILQ_INIT(&s->pending_sei);
 QTAILQ_INIT(&s->zpci_devs);
 return 0;
@@ -673,12 +674,24 @@ static void s390_pcihost_hot_plug(HotplugHandler 
*hotplug_dev,
 if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) {
 BusState *bus;
 PCIBridge *pb = PCI_BRIDGE(dev);
+PCIDevice *pdev = PCI_DEVICE(dev);
 
 pci_bridge_map_irq(pb, dev->id, s390_pci_map_irq);
 pci_setup_iommu(&pb->sec_bus, s390_pci_dma_iommu, s);
 
 bus = BUS(&pb->sec_bus);
 qbus_set_hotplug_handler(bus, DEVICE(s), errp);
+
+if (dev->hotplugged) {
+pci_default_write_config(pdev, PCI_PRIMARY_BUS, s->bus_no, 1);
+s->bus_no += 1;
+pci_default_write_config(pdev, PCI_SECONDARY_BUS, s->bus_no, 1);
+do {
+pdev = pdev->bus->parent_dev;
+pci_default_write_config(pdev, PCI_SUBORDINATE_BUS,
+ s->bus_no, 1);
+} while (pdev->bus && pci_bus_num(pdev->bus));
+}
 } else if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
 pdev = PCI_DEVICE(dev);
 
@@ -812,6 +825,44 @@ out:
 object_unparent(OBJECT(pbdev));
 }
 
+static void s390_pci_enumerate_bridge(PCIBus *bus, PCIDevice *pdev,
+  void *opaque)
+{
+S390pciState *s = opaque;
+unsigned int primary = s->bus_no;
+unsigned int subordinate = 0xff;
+PCIBus *sec_bus = NULL;
+
+if ((pci_default_read_config(pdev, PCI_HEADER_TYPE, 1) !=
+ PCI_HEADER_TYPE_BRIDGE)) {
+return;
+}
+
+(s->bus_no)++;
+pci_default_write_config(pdev, PCI_PRIMARY_BUS, primary, 1);
+pci_default_write_config(pdev, PCI_SECONDARY_BUS, s->bus_no, 1);
+pci_default_write_config(pdev, PCI_SUBORDINATE_BUS, s->bus_no, 1);
+
+sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev));
+if (!sec_bus) {
+return;
+}
+
+pci_default_write_config(pdev, PCI_SUBORDINATE_BUS, subordinate, 1);
+pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
+s390_pci_enumerate_bridge, s);
+pci_default_write_config(pdev, PCI_SUBORDINATE_BUS, s->bus_no, 1);
+}
+
+static void s390_pcihost_reset(DeviceState *dev)
+{
+S390pciState *s = S390_PCI_HOST_BRIDGE(dev);
+PCIBus *bus = s->parent_obj.bus;
+
+s->bus_no = 0;
+pci_for_each_device(bus, pci_bus_num(bus), s390_pci_enumerate_bridge, s);
+}
+
 static void s390_pcihost_class_init(ObjectClass *klass, void *data)
 {
 SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);
@@ -819,6 +870,7 @@ static void s390_pcihost_class_init(ObjectClass *klass, 
void *data)
 HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(klass);
 
 dc->cannot_instantiate_with_device_add_yet = true;
+dc->reset = s390_pcihost_reset;
 k->init = s390_pcihost_init;
 hc->plug = s390_pcihost_hot_plug;
 hc->unplug = s390_pcihost_hot_unplug;
diff --git a/hw/s390x/s390-pci-bus.h b/hw/s390x/s390-pci-bus.h
index b82b18eb07..b0adefa788 100644
--- a/hw/s390x/s390-pci-bus.h
+++ b/hw/s390x/s390-pci-bus.h
@@ -310,6 +310,7 @@ typedef struct S390PCIBus {
 typedef struct S390pciState {
 PCIHostState parent_obj;
 uint32_t next_idx;
+int bus_no;
 S390PCIBus *bus;
 GHashTable *iommu_table;
 GHashTable *zpci_table;
-- 
2.11.0

[Qemu-devel] [PULL v2 04/12] s390x/pci: make S390PCIIOMMU inherit Object

2017-01-20 Thread Cornelia Huck

From: Yi Min Zhao 

Currently S390PCIIOMMU is a normal struct. Let's make it inherit Object
in order to take advantage of QOM. In addition, we move some stuff
related to IOMMU from S390PCIBusDevice to S390PCIIOMMU.

Signed-off-by: Yi Min Zhao 
Acked-by: Pierre Morel 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/s390-pci-bus.c  | 52 +---
 hw/s390x/s390-pci-bus.h  | 20 ---
 hw/s390x/s390-pci-inst.c | 50 --
 hw/s390x/s390-pci-inst.h |  2 +-
 4 files changed, 71 insertions(+), 53 deletions(-)

diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index 63f6248f1d..11244fc93c 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -187,8 +187,8 @@ void s390_pci_sclp_deconfigure(SCCB *sccb)
 if (pbdev->summary_ind) {
 pci_dereg_irqs(pbdev);
 }
-if (pbdev->iommu_enabled) {
-pci_dereg_ioat(pbdev);
+if (pbdev->iommu->enabled) {
+pci_dereg_ioat(pbdev->iommu);
 }
 pbdev->state = ZPCI_FS_STANDBY;
 rc = SCLP_RC_NORMAL_COMPLETION;
@@ -377,12 +377,12 @@ out:
 return pte;
 }
 
-static IOMMUTLBEntry s390_translate_iommu(MemoryRegion *iommu, hwaddr addr,
+static IOMMUTLBEntry s390_translate_iommu(MemoryRegion *mr, hwaddr addr,
   bool is_write)
 {
 uint64_t pte;
 uint32_t flags;
-S390PCIBusDevice *pbdev = container_of(iommu, S390PCIBusDevice, iommu_mr);
+S390PCIIOMMU *iommu = container_of(mr, S390PCIIOMMU, iommu_mr);
 IOMMUTLBEntry ret = {
 .target_as = &address_space_memory,
 .iova = 0,
@@ -391,10 +391,10 @@ static IOMMUTLBEntry s390_translate_iommu(MemoryRegion 
*iommu, hwaddr addr,
 .perm = IOMMU_NONE,
 };
 
-switch (pbdev->state) {
+switch (iommu->pbdev->state) {
 case ZPCI_FS_ENABLED:
 case ZPCI_FS_BLOCKED:
-if (!pbdev->iommu_enabled) {
+if (!iommu->enabled) {
 return ret;
 }
 break;
@@ -404,11 +404,11 @@ static IOMMUTLBEntry s390_translate_iommu(MemoryRegion 
*iommu, hwaddr addr,
 
 DPRINTF("iommu trans addr 0x%" PRIx64 "\n", addr);
 
-if (addr < pbdev->pba || addr > pbdev->pal) {
+if (addr < iommu->pba || addr > iommu->pal) {
 return ret;
 }
 
-pte = s390_guest_io_table_walk(s390_pci_get_table_origin(pbdev->g_iota),
+pte = s390_guest_io_table_walk(s390_pci_get_table_origin(iommu->g_iota),
addr);
 if (!pte) {
 return ret;
@@ -503,19 +503,21 @@ static const MemoryRegionOps s390_msi_ctrl_ops = {
 .endianness = DEVICE_LITTLE_ENDIAN,
 };
 
-void s390_pci_iommu_enable(S390PCIBusDevice *pbdev)
+void s390_pci_iommu_enable(S390PCIIOMMU *iommu)
 {
-memory_region_init_iommu(&pbdev->iommu_mr, OBJECT(&pbdev->iommu->mr),
- &s390_iommu_ops, "iommu-s390", pbdev->pal + 1);
-memory_region_add_subregion(&pbdev->iommu->mr, 0, &pbdev->iommu_mr);
-pbdev->iommu_enabled = true;
+char *name = g_strdup_printf("iommu-s390-%04x", iommu->pbdev->uid);
+memory_region_init_iommu(&iommu->iommu_mr, OBJECT(&iommu->mr),
+ &s390_iommu_ops, name, iommu->pal + 1);
+iommu->enabled = true;
+memory_region_add_subregion(&iommu->mr, 0, &iommu->iommu_mr);
+g_free(name);
 }
 
-void s390_pci_iommu_disable(S390PCIBusDevice *pbdev)
+void s390_pci_iommu_disable(S390PCIIOMMU *iommu)
 {
-memory_region_del_subregion(&pbdev->iommu->mr, &pbdev->iommu_mr);
-object_unparent(OBJECT(&pbdev->iommu_mr));
-pbdev->iommu_enabled = false;
+iommu->enabled = false;
+memory_region_del_subregion(&iommu->mr, &iommu->iommu_mr);
+object_unparent(OBJECT(&iommu->iommu_mr));
 }
 
 static void s390_pcihost_init_as(S390pciState *s)
@@ -660,6 +662,7 @@ static void s390_pcihost_hot_plug(HotplugHandler 
*hotplug_dev,
 
 pbdev->pdev = pdev;
 pbdev->iommu = s->iommu[PCI_SLOT(pdev->devfn)];
+pbdev->iommu->pbdev = pbdev;
 pbdev->state = ZPCI_FS_STANDBY;
 
 s390_pci_msix_init(pbdev);
@@ -692,8 +695,8 @@ static void s390_pcihost_timer_cb(void *opaque)
 if (pbdev->summary_ind) {
 pci_dereg_irqs(pbdev);
 }
-if (pbdev->iommu_enabled) {
-pci_dereg_ioat(pbdev);
+if (pbdev->iommu->enabled) {
+pci_dereg_ioat(pbdev->iommu);
 }
 
 pbdev->state = ZPCI_FS_STANDBY;
@@ -877,8 +880,8 @@ static void s390_pci_device_reset(DeviceState *dev)
 if (pbdev->summary_ind) {
 pci_dereg_irqs(pbdev);
 }
-if (pbdev->iommu_enabled) {
-pci_dereg_ioat(pbdev);
+if (pbdev->iommu->enabled) {
+pci_dereg_ioat(pbdev->iommu);
 }
 
 pbdev->fmb_addr = 0;
@@ -944,11 +947,18 @@ static const TypeInfo s390_pci_device_info = {
 .class_init = s390_pci_device_class_init,
 };
 
+static TypeInfo s390_pci_iommu_info = {
+.name = TYPE_S390_PCI_IOMMU,
+

[Qemu-devel] [PULL v2 09/12] s390x/pci: use hashtable to look up zpci via fh

2017-01-20 Thread Cornelia Huck

From: Yi Min Zhao 

After PCI multibus is supported, more than 32 PCI devices could be
plugged. The current implementation of s390_pci_find_dev_by_fh()
appears low performance if there's a huge number of PCI devices
plugged. Therefore we introduce a hashtable using idx as key to store
zpci device's pointer on account of translating fh to idx very easily.

Signed-off-by: Yi Min Zhao 
Reviewed-by: Pierre Morel 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/s390-pci-bus.c | 22 --
 hw/s390x/s390-pci-bus.h |  1 +
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index ce29320592..c2ae01c9a6 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -227,25 +227,16 @@ static S390PCIBusDevice 
*s390_pci_find_dev_by_target(S390pciState *s,
 
 S390PCIBusDevice *s390_pci_find_dev_by_idx(S390pciState *s, uint32_t idx)
 {
-S390PCIBusDevice *pbdev;
-
-QTAILQ_FOREACH(pbdev, &s->zpci_devs, link) {
-if (pbdev->idx == idx) {
-return pbdev;
-}
-}
-
-return NULL;
+return g_hash_table_lookup(s->zpci_table, &idx);
 }
 
 S390PCIBusDevice *s390_pci_find_dev_by_fh(S390pciState *s, uint32_t fh)
 {
-S390PCIBusDevice *pbdev;
+uint32_t idx = FH_MASK_INDEX & fh;
+S390PCIBusDevice *pbdev = s390_pci_find_dev_by_idx(s, idx);
 
-QTAILQ_FOREACH(pbdev, &s->zpci_devs, link) {
-if (pbdev->fh == fh) {
-return pbdev;
-}
+if (pbdev && pbdev->fh == fh) {
+return pbdev;
 }
 
 return NULL;
@@ -584,6 +575,7 @@ static int s390_pcihost_init(SysBusDevice *dev)
 
 s->iommu_table = g_hash_table_new_full(g_int64_hash, g_int64_equal,
NULL, g_free);
+s->zpci_table = g_hash_table_new_full(g_int_hash, g_int_equal, NULL, NULL);
 QTAILQ_INIT(&s->pending_sei);
 QTAILQ_INIT(&s->zpci_devs);
 return 0;
@@ -735,6 +727,7 @@ static void s390_pcihost_hot_plug(HotplugHandler 
*hotplug_dev,
 }
 pbdev->fh = pbdev->idx;
 QTAILQ_INSERT_TAIL(&s->zpci_devs, pbdev, link);
+g_hash_table_insert(s->zpci_table, &pbdev->idx, pbdev);
 }
 }
 
@@ -815,6 +808,7 @@ static void s390_pcihost_hot_unplug(HotplugHandler 
*hotplug_dev,
 out:
 pbdev->fid = 0;
 QTAILQ_REMOVE(&s->zpci_devs, pbdev, link);
+g_hash_table_remove(s->zpci_table, &pbdev->idx);
 object_unparent(OBJECT(pbdev));
 }
 
diff --git a/hw/s390x/s390-pci-bus.h b/hw/s390x/s390-pci-bus.h
index fbdc64febf..b82b18eb07 100644
--- a/hw/s390x/s390-pci-bus.h
+++ b/hw/s390x/s390-pci-bus.h
@@ -312,6 +312,7 @@ typedef struct S390pciState {
 uint32_t next_idx;
 S390PCIBus *bus;
 GHashTable *iommu_table;
+GHashTable *zpci_table;
 QTAILQ_HEAD(, SeiContainer) pending_sei;
 QTAILQ_HEAD(, S390PCIBusDevice) zpci_devs;
 } S390pciState;
-- 
2.11.0

[Qemu-devel] [PULL v2 12/12] virtio-ccw: fix ring sizing

2017-01-20 Thread Cornelia Huck

From: "Michael S. Tsirkin" 

Current code seems to assume ring size is
always decreased but this is not required by spec:
what spec says is just that size can not exceed
the maximum. Fix it up.

Signed-off-by: Michael S. Tsirkin 
Message-Id: <1484256243-1982-1-git-send-email-...@redhat.com>
Signed-off-by: Cornelia Huck 
---
 hw/s390x/virtio-ccw.c  | 2 +-
 hw/virtio/virtio.c | 5 +
 include/hw/virtio/virtio.h | 1 +
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
index 07650683f7..63c46373fb 100644
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@@ -149,7 +149,7 @@ static int virtio_ccw_set_vqs(SubchDev *sch, VqInfoBlock 
*info,
 } else {
 if (info) {
 /* virtio-1 allows changing the ring size. */
-if (virtio_queue_get_num(vdev, index) < num) {
+if (virtio_queue_get_max_num(vdev, index) < num) {
 /* Fail if we exceed the maximum number. */
 return -EINVAL;
 }
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index aa4f38f50a..94d7dbae6c 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1262,6 +1262,11 @@ int virtio_queue_get_num(VirtIODevice *vdev, int n)
 return vdev->vq[n].vring.num;
 }
 
+int virtio_queue_get_max_num(VirtIODevice *vdev, int n)
+{
+return vdev->vq[n].vring.num_default;
+}
+
 int virtio_get_num_queues(VirtIODevice *vdev)
 {
 int i;
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index e5541c61f7..6523bacd2f 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -228,6 +228,7 @@ void virtio_queue_set_addr(VirtIODevice *vdev, int n, 
hwaddr addr);
 hwaddr virtio_queue_get_addr(VirtIODevice *vdev, int n);
 void virtio_queue_set_num(VirtIODevice *vdev, int n, int num);
 int virtio_queue_get_num(VirtIODevice *vdev, int n);
+int virtio_queue_get_max_num(VirtIODevice *vdev, int n);
 int virtio_get_num_queues(VirtIODevice *vdev);
 void virtio_queue_set_rings(VirtIODevice *vdev, int n, hwaddr desc,
 hwaddr avail, hwaddr used);
-- 
2.11.0

Re: [Qemu-devel] [PATCH RFC v3 02/14] intel_iommu: simplify irq region translation

2017-01-20 Thread Peter Xu

On Fri, Jan 20, 2017 at 09:15:27AM +, Tian, Kevin wrote:
> > From: Peter Xu [mailto:pet...@redhat.com]
> > Sent: Friday, January 20, 2017 5:05 PM
> > 
> > On Fri, Jan 20, 2017 at 08:22:14AM +, Tian, Kevin wrote:
> > > > From: Peter Xu [mailto:pet...@redhat.com]
> > > > Sent: Friday, January 13, 2017 11:06 AM
> > > >
> > > > Before we have int-remap, we need to bypass interrupt write requests.
> > > > That's not necessary now - we have supported int-remap, and all the irq
> > > > region requests should be redirected there. Cleaning up the block with
> > > > an assertion instead.
> > >
> > > This comment is not accurate. According to code, the reason why you
> > > can do such simplification is because we have standalone memory
> > > region now for interrupt addresses. There should be nothing to do
> > > with int-remap, which can be disabled by guest... Maybe the standalone
> > > region was added when developing int-remap, but functionally they
> > > are not related. :-)
> > 
> > IMHO the above commit message is fairly clear. :-)
> > 
> > But sure I can add some more emphasise like:
> > 
> >   "Before we have int-remap memory region, ..."
> > 
> > Do you think it's okay? Or any better suggestion?
> > 
> > (Just to mention that even guest disables IR, the MSI region will
> >  still be there.)
> > 
> 
> My option is simple - this patch has nothing to do with int-remap.
> It's not necessary, not because we supported int-remap. It's because
> we have a standalone memory region for interrupt addresses, as you
> described in the code. :-)

I really think they are the same thing...

How about this:

Now we have a standalone memory region for MSI, all the irq region
requests should be redirected there. Cleaning up the block with an
assertion instead.

-- peterx

[Qemu-devel] [PULL v2 06/12] s390x/pci: change the device array to a list

2017-01-20 Thread Cornelia Huck

From: Pierre Morel 

In order to support a greater number of devices we use a QTAILQ
list of devices instead of a limited array.

This leads us to change:
- every lookup function s390_pci_find_xxx() for QTAILQ
- the FH_MASK_INDEX to index up to 65536 devices

Signed-off-by: Pierre Morel 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/s390-pci-bus.c | 100 
 hw/s390x/s390-pci-bus.h |   7 +++-
 2 files changed, 56 insertions(+), 51 deletions(-)

diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index f3afb17f6a..7ba1ae8b25 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -93,33 +93,24 @@ int chsc_sei_nt2_have_event(void)
 
 S390PCIBusDevice *s390_pci_find_next_avail_dev(S390PCIBusDevice *pbdev)
 {
-int idx = 0;
-S390PCIBusDevice *dev = NULL;
 S390pciState *s = s390_get_phb();
+S390PCIBusDevice *ret = pbdev ? QTAILQ_NEXT(pbdev, link) :
+QTAILQ_FIRST(&s->zpci_devs);
 
-if (pbdev) {
-idx = (pbdev->fh & FH_MASK_INDEX) + 1;
+while (ret && ret->state == ZPCI_FS_RESERVED) {
+ret = QTAILQ_NEXT(ret, link);
 }
 
-for (; idx < PCI_SLOT_MAX; idx++) {
-dev = s->pbdev[idx];
-if (dev && dev->state != ZPCI_FS_RESERVED) {
-return dev;
-}
-}
-
-return NULL;
+return ret;
 }
 
 S390PCIBusDevice *s390_pci_find_dev_by_fid(uint32_t fid)
 {
 S390PCIBusDevice *pbdev;
-int i;
 S390pciState *s = s390_get_phb();
 
-for (i = 0; i < PCI_SLOT_MAX; i++) {
-pbdev = s->pbdev[i];
-if (pbdev && pbdev->fid == fid) {
+QTAILQ_FOREACH(pbdev, &s->zpci_devs, link) {
+if (pbdev->fid == fid) {
 return pbdev;
 }
 }
@@ -203,16 +194,10 @@ out:
 
 static S390PCIBusDevice *s390_pci_find_dev_by_uid(uint16_t uid)
 {
-int i;
 S390PCIBusDevice *pbdev;
 S390pciState *s = s390_get_phb();
 
-for (i = 0; i < PCI_SLOT_MAX; i++) {
-pbdev = s->pbdev[i];
-if (!pbdev) {
-continue;
-}
-
+QTAILQ_FOREACH(pbdev, &s->zpci_devs, link) {
 if (pbdev->uid == uid) {
 return pbdev;
 }
@@ -223,7 +208,6 @@ static S390PCIBusDevice *s390_pci_find_dev_by_uid(uint16_t 
uid)
 
 static S390PCIBusDevice *s390_pci_find_dev_by_target(const char *target)
 {
-int i;
 S390PCIBusDevice *pbdev;
 S390pciState *s = s390_get_phb();
 
@@ -231,12 +215,7 @@ static S390PCIBusDevice *s390_pci_find_dev_by_target(const 
char *target)
 return NULL;
 }
 
-for (i = 0; i < PCI_SLOT_MAX; i++) {
-pbdev = s->pbdev[i];
-if (!pbdev) {
-continue;
-}
-
+QTAILQ_FOREACH(pbdev, &s->zpci_devs, link) {
 if (!strcmp(pbdev->target, target)) {
 return pbdev;
 }
@@ -247,9 +226,16 @@ static S390PCIBusDevice *s390_pci_find_dev_by_target(const 
char *target)
 
 S390PCIBusDevice *s390_pci_find_dev_by_idx(uint32_t idx)
 {
+S390PCIBusDevice *pbdev;
 S390pciState *s = s390_get_phb();
 
-return s->pbdev[idx & FH_MASK_INDEX];
+QTAILQ_FOREACH(pbdev, &s->zpci_devs, link) {
+if (pbdev->idx == idx) {
+return pbdev;
+}
+}
+
+return NULL;
 }
 
 S390PCIBusDevice *s390_pci_find_dev_by_fh(uint32_t fh)
@@ -257,9 +243,10 @@ S390PCIBusDevice *s390_pci_find_dev_by_fh(uint32_t fh)
 S390pciState *s = s390_get_phb();
 S390PCIBusDevice *pbdev;
 
-pbdev = s->pbdev[fh & FH_MASK_INDEX];
-if (pbdev && pbdev->fh == fh) {
-return pbdev;
+QTAILQ_FOREACH(pbdev, &s->zpci_devs, link) {
+if (pbdev->fh == fh) {
+return pbdev;
+}
 }
 
 return NULL;
@@ -599,6 +586,7 @@ static int s390_pcihost_init(SysBusDevice *dev)
 s->iommu_table = g_hash_table_new_full(g_int64_hash, g_int64_equal,
NULL, g_free);
 QTAILQ_INIT(&s->pending_sei);
+QTAILQ_INIT(&s->zpci_devs);
 return 0;
 }
 
@@ -666,6 +654,25 @@ static S390PCIBusDevice *s390_pci_device_new(const char 
*target)
 return S390_PCI_DEVICE(dev);
 }
 
+static bool s390_pci_alloc_idx(S390PCIBusDevice *pbdev)
+{
+uint32_t idx;
+S390pciState *s = s390_get_phb();
+
+idx = s->next_idx;
+while (s390_pci_find_dev_by_idx(idx)) {
+idx = (idx + 1) & FH_MASK_INDEX;
+if (idx == s->next_idx) {
+return false;
+}
+}
+
+pbdev->idx = idx;
+s->next_idx = (idx + 1) & FH_MASK_INDEX;
+
+return true;
+}
+
 static void s390_pcihost_hot_plug(HotplugHandler *hotplug_dev,
   DeviceState *dev, Error **errp)
 {
@@ -713,18 +720,14 @@ static void s390_pcihost_hot_plug(HotplugHandler 
*hotplug_dev,
  pbdev->fh, pbdev->fid);
 }
 } else if (object_dynamic_cast(OBJECT(dev), TYPE_S390_PCI_DEVICE)) {
-int idx;
-
 pbdev = S390_PCI_DEVICE(dev);
-for (idx = 0; idx < PCI_SLOT_MAX; idx++) {
-

[Qemu-devel] [RFC PATCH v1] softfloat: Add round-to-odd rounding mode

2017-01-20 Thread Bharata B Rao

Power ISA 3.0 introduces a few quadruple precision floating point
instructions that support round-to-odd rounding mode. The
round-to-odd mode is explained as under:

Let Z be the intermediate arithmetic result or the operand of a convert
operation. If Z can be represented exactly in the target format, the
result is Z. Otherwise the result is either Z1 or Z2 whichever is odd.
Here Z1 and Z2 are the next larger and smaller numbers representable
in the target format respectively.

Signed-off-by: Bharata B Rao 
---
Changes in v1:
- Addressed rounding to infinity in roundAndPackFloat128().
- Added 64bit round to odd implementation(roundAndPackFloat64()) as
  it is needed by xscvqpdp instruction of Power ISA 3.0.

v0: http://patchwork.ozlabs.org/patch/716956/

 fpu/softfloat.c | 10 ++
 include/fpu/softfloat.h |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index c295f31..b9b36ad 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -623,6 +623,9 @@ static float64 roundAndPackFloat64(flag zSign, int zExp, 
uint64_t zSig,
 case float_round_down:
 roundIncrement = zSign ? 0x3ff : 0;
 break;
+case float_round_to_odd:
+roundIncrement = (zSig & 0x200) ? 0 : 0x3ff;
+break;
 default:
 abort();
 }
@@ -1149,6 +1152,9 @@ static float128 roundAndPackFloat128(flag zSign, int32_t 
zExp,
 case float_round_down:
 increment = zSign && zSig2;
 break;
+case float_round_to_odd:
+increment = !(zSig1 & 0x1) && zSig2;
+break;
 default:
 abort();
 }
@@ -1168,6 +1174,7 @@ static float128 roundAndPackFloat128(flag zSign, int32_t 
zExp,
 if (( roundingMode == float_round_to_zero )
  || ( zSign && ( roundingMode == float_round_up ) )
  || ( ! zSign && ( roundingMode == float_round_down ) )
+ || ( roundingMode == float_round_to_odd )
) {
 return
 packFloat128(
@@ -1215,6 +1222,9 @@ static float128 roundAndPackFloat128(flag zSign, int32_t 
zExp,
 case float_round_down:
 increment = zSign && zSig2;
 break;
+case float_round_to_odd:
+increment = !(zSig1 & 0x1) && zSig2;
+break;
 default:
 abort();
 }
diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index 842ec6b..8a39028 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -180,6 +180,8 @@ enum {
 float_round_up   = 2,
 float_round_to_zero  = 3,
 float_round_ties_away= 4,
+/* Not an IEEE rounding mode: round to the closest odd mantissa value */
+float_round_to_odd   = 5,
 };
 
 /*
-- 
2.7.4

[Qemu-devel] [PULL v2 08/12] s390x/pci: PCI multibus bridge handling

2017-01-20 Thread Cornelia Huck

From: Pierre Morel 

When the hotplug handler detects a PCI bridge, the secondary bus has
been initialized by the core PCI code. We give the secondary bus the
bridge name and associate to it the IOMMU handling and
hotplug/hotunplug callbacks.

Signed-off-by: Pierre Morel 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/s390-pci-bus.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index 70673af591..ce29320592 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -19,6 +19,7 @@
 #include "s390-pci-bus.h"
 #include "s390-pci-inst.h"
 #include "hw/pci/pci_bus.h"
+#include "hw/pci/pci_bridge.h"
 #include "hw/pci/msi.h"
 #include "qemu/error-report.h"
 
@@ -677,7 +678,16 @@ static void s390_pcihost_hot_plug(HotplugHandler 
*hotplug_dev,
 S390PCIBusDevice *pbdev = NULL;
 S390pciState *s = s390_get_phb();
 
-if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
+if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) {
+BusState *bus;
+PCIBridge *pb = PCI_BRIDGE(dev);
+
+pci_bridge_map_irq(pb, dev->id, s390_pci_map_irq);
+pci_setup_iommu(&pb->sec_bus, s390_pci_dma_iommu, s);
+
+bus = BUS(&pb->sec_bus);
+qbus_set_hotplug_handler(bus, DEVICE(s), errp);
+} else if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
 pdev = PCI_DEVICE(dev);
 
 if (!dev->id) {
@@ -754,7 +764,10 @@ static void s390_pcihost_hot_unplug(HotplugHandler 
*hotplug_dev,
 S390PCIBusDevice *pbdev = NULL;
 S390pciState *s = s390_get_phb();
 
-if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
+if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) {
+error_setg(errp, "PCI bridge hot unplug currently not supported");
+return;
+} else if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
 pci_dev = PCI_DEVICE(dev);
 
 QTAILQ_FOREACH(pbdev, &s->zpci_devs, link) {
-- 
2.11.0

Re: [Qemu-devel] [PATCH RFC v3 03/14] intel_iommu: renaming gpa to iova where proper

2017-01-20 Thread Peter Xu

On Fri, Jan 20, 2017 at 08:27:31AM +, Tian, Kevin wrote:
> > From: Peter Xu [mailto:pet...@redhat.com]
> > Sent: Friday, January 13, 2017 11:06 AM
> > 
> > There are lots of places in current intel_iommu.c codes that named
> > "iova" as "gpa". It is really confusing to use a name "gpa" in these
> > places (which is very easily to be understood as "Guest Physical
> > Address", while it's not). To make the codes (much) easier to be read, I
> > decided to do this once and for all.
> > 
> > No functional change is made. Only literal ones.
> 
> If looking at VT-d spec (3.2 Domains and Address Translation)
> 
>   Remapping hardware treats the address in inbound requests as DMA 
>   Address. Depending on the software usage model, the DMA address 
>   space may be the Guest-Physical Address (GPA) space of the virtual 
>   machine to which the device is assigned, or application Virtual Address 
>   (VA) space defined by the PASID assigned to an application, or some 
>   abstract I/O virtual address (IOVA) space defined by software.
> 
>   For simplicity, this document refers to address in requests-without-
>   PASID as GPA, and address in requests-with-PASID as Virtual Address 
>   (VA) (or Guest Virtual Address (GVA), if such request is from a device 
>   assigned to a virtual machine). The translated address is referred to 
> as 
>   HPA.
> 
> It will add more readability if similar comment is added in this file - you
> can say choosing iova to represent address in requests-without-PASID.

I agree that the code will be more readable if we can explain all the
bits in detail.

However this patch is not adding comments, but to "fix" an existing
literal problem, which is very misleading to people reading the codes
for the first time. The places touched in this patch was doing the
namings incorrectly, so I just corrected them. So even if we want to
have more comments on explaining the bits, IMHO it'll be nicer to use
a separate patch, not squashing all the things into a single one.

If you won't disagree, I'd like to keep this single patch as-it-is, to
make sure this series can converge soon (I will be sorry since I'll
extend this series a bit, I hope that won't extend the review process
too long for it). After that, we can add more documentations if
needed.

Thanks,

-- peterx

Re: [Qemu-devel] [PATCH RFC v3 01/14] IOMMU: add option to enable VTD_CAP_CM to vIOMMU capility exposoed to guest

2017-01-20 Thread Tian, Kevin

> From: Peter Xu [mailto:pet...@redhat.com]
> Sent: Friday, January 20, 2017 5:12 PM
> 
> On Fri, Jan 20, 2017 at 08:59:01AM +, Tian, Kevin wrote:
> 
> [...]
> 
> > > > Also for hot-add
> > > > device path, some check of caching mode is required. If not set,
> > > > should we fail hot-add operation? I don't think we have such physical
> > > > platform with some devices behind IOMMU while others not.
> > >
> > > Could you explain in what case will we fail a hot plug?
> > >
> >
> > user enables intel-iommu, but don't set caching mode.
> >
> > Then later user hot-add a PCI device to the VM. Guest will assume
> > newly assigned device also behind the default vIOMMU, and thus
> > needs to setup IOVA mappings, which is then broken...
> 
> Is the newly added device a vfio-pci device? If so, we should hit
> this and VM will stops to work:
> 
> if (!s->cache_mode_enabled && new & IOMMU_NOTIFIER_MAP) {
> error_report("We need to set cache_mode=1 for intel-iommu to enable "
>  "device assignment with IOMMU protection.");
> exit(1);
> }

sorry I didn't found this code. In which code path is it hit?

> 
> I admit this is not user-friendly, and a better way may be that we
> disallow the hot-plug in that case, telling the user about the error,
> rather than crashing the VM. But, I think that can be a patch outside
> this series, considering (again) that this only affects advanced
> users.
> 

Crashing VM is bad but anyway, I'll leave maintainer to decide
whether they'd like it fixed now or later. :-)

Thanks
Kevin

[Qemu-devel] [PULL v2 11/12] s390x/pci: merge msix init functions

2017-01-20 Thread Cornelia Huck

From: Yi Min Zhao 

Currently there're two functions, s390_pci_setup_msix() and
s390_pci_msix_init(), for msix initialization, and being called once
for each zpci device plugging. Let's integrate them.

Moreover msix is mandatory in s390 architecture. So we ensure the pci
device being plugged supports msix. For vfio (which is the only tested
setup so far), nothing changes.

Signed-off-by: Yi Min Zhao 
Reviewed-by: Pierre Morel 
Signed-off-by: Cornelia Huck 
---
 hw/s390x/s390-pci-bus.c | 22 ++
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index 5cf97b4f7d..69b0291e8a 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -582,8 +582,9 @@ static int s390_pcihost_init(SysBusDevice *dev)
 return 0;
 }
 
-static int s390_pci_setup_msix(S390PCIBusDevice *pbdev)
+static int s390_pci_msix_init(S390PCIBusDevice *pbdev)
 {
+char *name;
 uint8_t pos;
 uint16_t ctrl;
 uint32_t table, pba;
@@ -591,7 +592,7 @@ static int s390_pci_setup_msix(S390PCIBusDevice *pbdev)
 pos = pci_find_capability(pbdev->pdev, PCI_CAP_ID_MSIX);
 if (!pos) {
 pbdev->msix.available = false;
-return 0;
+return -1;
 }
 
 ctrl = pci_host_config_read_common(pbdev->pdev, pos + PCI_MSIX_FLAGS,
@@ -607,21 +608,15 @@ static int s390_pci_setup_msix(S390PCIBusDevice *pbdev)
 pbdev->msix.pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
 pbdev->msix.entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
 pbdev->msix.available = true;
-return 0;
-}
-
-static void s390_pci_msix_init(S390PCIBusDevice *pbdev)
-{
-char *name;
 
 name = g_strdup_printf("msix-s390-%04x", pbdev->uid);
-
 memory_region_init_io(&pbdev->msix_notify_mr, OBJECT(pbdev),
   &s390_msi_ctrl_ops, pbdev, name, PAGE_SIZE);
 memory_region_add_subregion(&pbdev->iommu->mr, ZPCI_MSI_ADDR,
 &pbdev->msix_notify_mr);
-
 g_free(name);
+
+return 0;
 }
 
 static void s390_pci_msix_free(S390PCIBusDevice *pbdev)
@@ -724,8 +719,11 @@ static void s390_pcihost_hot_plug(HotplugHandler 
*hotplug_dev,
 pbdev->iommu->pbdev = pbdev;
 pbdev->state = ZPCI_FS_STANDBY;
 
-s390_pci_msix_init(pbdev);
-s390_pci_setup_msix(pbdev);
+if (s390_pci_msix_init(pbdev)) {
+error_setg(errp, "MSI-X support is mandatory "
+   "in the S390 architecture");
+return;
+}
 
 if (dev->hotplugged) {
 s390_pci_generate_plug_event(HP_EVENT_RESERVED_TO_STANDBY,
-- 
2.11.0

Re: [Qemu-devel] [PATCH RFC v3 01/14] IOMMU: add option to enable VTD_CAP_CM to vIOMMU capility exposoed to guest

2017-01-20 Thread Peter Xu

On Fri, Jan 20, 2017 at 09:20:01AM +, Tian, Kevin wrote:
> > From: Peter Xu [mailto:pet...@redhat.com]
> > Sent: Friday, January 20, 2017 5:12 PM
> > 
> > On Fri, Jan 20, 2017 at 08:59:01AM +, Tian, Kevin wrote:
> > 
> > [...]
> > 
> > > > > Also for hot-add
> > > > > device path, some check of caching mode is required. If not set,
> > > > > should we fail hot-add operation? I don't think we have such physical
> > > > > platform with some devices behind IOMMU while others not.
> > > >
> > > > Could you explain in what case will we fail a hot plug?
> > > >
> > >
> > > user enables intel-iommu, but don't set caching mode.
> > >
> > > Then later user hot-add a PCI device to the VM. Guest will assume
> > > newly assigned device also behind the default vIOMMU, and thus
> > > needs to setup IOVA mappings, which is then broken...
> > 
> > Is the newly added device a vfio-pci device? If so, we should hit
> > this and VM will stops to work:
> > 
> > if (!s->cache_mode_enabled && new & IOMMU_NOTIFIER_MAP) {
> > error_report("We need to set cache_mode=1 for intel-iommu to enable 
> > "
> >  "device assignment with IOMMU protection.");
> > exit(1);
> > }
> 
> sorry I didn't found this code. In which code path is it hit?

It's in patch 14/14 of this series.

> 
> > 
> > I admit this is not user-friendly, and a better way may be that we
> > disallow the hot-plug in that case, telling the user about the error,
> > rather than crashing the VM. But, I think that can be a patch outside
> > this series, considering (again) that this only affects advanced
> > users.
> > 
> 
> Crashing VM is bad but anyway, I'll leave maintainer to decide
> whether they'd like it fixed now or later. :-)

Sure. Thanks,

-- peterx

Re: [Qemu-devel] [RFC PATCH v0] softfloat: Add round-to-odd rounding mode

2017-01-20 Thread Bharata B Rao

On Thu, Jan 19, 2017 at 07:29:54AM -0800, Richard Henderson wrote:
> On 01/18/2017 09:14 PM, Bharata B Rao wrote:
> > Power ISA 3.0 introduces a few quadruple precision floating point
> > instructions that support round-to-add rounding mode. The
> > round-to-odd mode is explained as under:
> > 
> > Let Z be the intermediate arithmetic result or the operand of a convert
> > operation. If Z can be represented exactly in the target format, the
> > result is Z. Otherwise the result is either Z1 or Z2 whichever is odd.
> > Here Z1 and Z2 are the next larger and smaller numbers representable
> > in the target format respectively.
> > 
> > Signed-off-by: Bharata B Rao 
> > ---
> > - I am not fully sure if this the correct implementation for the above
> >   described round-to-odd rounding method. Any help is appreciated.
> > - Didn't bother to add round-to-odd to other floating point precision
> >   variants as round-to-odd option is currently supported only for some
> >   instructions that work on quad precision.
> > 
> >  fpu/softfloat.c | 6 ++
> >  include/fpu/softfloat.h | 1 +
> >  2 files changed, 7 insertions(+)
> > 
> > diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> > index c295f31..05932a9 100644
> > --- a/fpu/softfloat.c
> > +++ b/fpu/softfloat.c
> > @@ -1149,6 +1149,9 @@ static float128 roundAndPackFloat128(flag zSign, 
> > int32_t zExp,
> >  case float_round_down:
> >  increment = zSign && zSig2;
> >  break;
> > +case float_round_to_odd:
> > +increment = !(zSig1 & 0x1) && zSig2;
> > +break;
> >  default:
> >  abort();
> >  }
> > @@ -1215,6 +1218,9 @@ static float128 roundAndPackFloat128(flag zSign, 
> > int32_t zExp,
> >  case float_round_down:
> >  increment = zSign && zSig2;
> >  break;
> > +case float_round_to_odd:
> > +increment = !(zSig1 & 0x1) && zSig2;
> > +break;
> >  default:
> >  abort();
> >  }
> 
> I believe you've missed the section in between that deals with
> round-to-largest or to infinity:
> 
> if (( roundingMode == float_round_to_zero )
>  || ( zSign && ( roundingMode == float_round_up ) )
>  || ( ! zSign && ( roundingMode == float_round_down ) )
>) {

Addresed in v1. Thanks Richard and Peter for pointing this out.

> 
> The description in see in the manual on page 387 is more precise than what
> you quote above:

Right. I quoted from page 385 as it is more concise for patch
description. However if you think the more precise definition of page
387 is appropriate in commit, I can put it.

Regards,
Bharata.

Re: [Qemu-devel] [PATCH v3 2/4] compiler: rework BUG_ON using a struct

2017-01-20 Thread Dr. David Alan Gilbert

* Michael S. Tsirkin (m...@redhat.com) wrote:
> There are theoretical concerns that some compilers might not trigger
> build failures on attempts to define an array of size -1 and make it a
> variable sized array instead. Let rewrite using a struct with a negative
> bit field size instead as there are no dynamic bit field sizes.  This is
> similar to what Linux does.
> 
> Signed-off-by: Michael S. Tsirkin 
> ---
>  include/qemu/compiler.h | 9 ++---
>  1 file changed, 6 insertions(+), 3 deletions(-)
> 
> diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h
> index 7512082..c6f673e 100644
> --- a/include/qemu/compiler.h
> +++ b/include/qemu/compiler.h
> @@ -85,9 +85,12 @@
>  #define typeof_field(type, field) typeof(((type *)0)->field)
>  #define type_check(t1,t2) ((t1*)0 - (t2*)0)
>  
> -#define QEMU_BUILD_BUG_ON(x) \
> -typedef char glue(qemu_build_bug_on__, __LINE__)[(x) ? -1 : 1] \
> -__attribute__((unused))
> +#define QEMU_BUILD_BUG_ON_STRUCT(x) \
> +struct { \
> +int qemu_build_bug_on : (x) ? -1 : 1; \
> +}

The problem with this is it can't be used as an expression, where as
your previous version could.
I've got a similar case (see previous reply) that needed an expression
bug-on that would evaluate to zero.

Dave

> +#define QEMU_BUILD_BUG_ON(x) typedef QEMU_BUILD_BUG_ON_STRUCT(x) \
> +glue(qemu_build_bug_on__, __LINE__) __attribute__((unused))
>  
>  #if defined __GNUC__
>  # if !QEMU_GNUC_PREREQ(4, 4)
> -- 
> MST
> 
> 
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK

Re: [Qemu-devel] [PATCH v2 3/3] block: get max_transfer limit for char (scsi-generic) devices

2017-01-20 Thread Fam Zheng

On Thu, 01/19 21:51, Eric Farman wrote:
> Commit 6f607174 introduced a routine to get the maximum number
> of bytes for a single I/O transfer for block devices, however
> scsi generic devices are character devices, not block.  Add
> a condition for this, such that scsi generic devices can view
> the same data.
> 
> Some tweaking of data is required, because the block and sg
> ioctls return values in different scales (sectors versus
> bytes).  So adjust hdev_get_max_transfer_length such that it
> always returns a value in bytes.
> 
> Signed-off-by: Eric Farman 
> ---
>  block/file-posix.c| 10 ++
>  include/block/block.h |  1 +
>  2 files changed, 7 insertions(+), 4 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 2115155..94068ca 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -657,9 +657,11 @@ static int hdev_get_max_transfer_length(BlockDriverState 
> *bs, int fd)
>  int max_sectors = 0;
>  short max_sectors_short = 0;
>  if (bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
> +/* sg returns a value in bytes */
>  return max_sectors;

The variable name is now misleading, maybe use "bytes" instead?

>  } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors_short) == 0) {
> -return max_sectors_short;
> +/* block returns a value in sectors */
> +return max_sectors_short << BDRV_SECTOR_BITS;
>  } else {
>  return -errno;
>  }
> @@ -674,10 +676,10 @@ static void raw_refresh_limits(BlockDriverState *bs, 
> Error **errp)
>  struct stat st;
>  
>  if (!fstat(s->fd, &st)) {
> -if (S_ISBLK(st.st_mode)) {
> +if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
>  int ret = hdev_get_max_transfer_length(bs, s->fd);
> -if (ret > 0 && ret <= BDRV_REQUEST_MAX_SECTORS) {
> -bs->bl.max_transfer = pow2floor(ret << BDRV_SECTOR_BITS);
> +if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
> +bs->bl.max_transfer = pow2floor(ret);
>  }
>  }
>  }
> diff --git a/include/block/block.h b/include/block/block.h
> index 8b0dcda..4e81f20 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -116,6 +116,7 @@ typedef struct HDGeometry {
>  
>  #define BDRV_REQUEST_MAX_SECTORS MIN(SIZE_MAX >> BDRV_SECTOR_BITS, \
>   INT_MAX >> BDRV_SECTOR_BITS)
> +#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
>  
>  /*
>   * Allocation status flags
> -- 
> 2.8.4
> 
>

Re: [Qemu-devel] [PULL 00/12] s390x update

2017-01-20 Thread Cornelia Huck

On Thu, 19 Jan 2017 18:32:44 +
Peter Maydell  wrote:

> Hi; I'm afraid this fails to build on Windows:
> 
> /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c: In
> function ‘s390_pci_get_iommu’:
> /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c:415:20:
> error: cast from pointer to integer of different size
> [-Werror=pointer-to-int-cast]
>  uint64_t key = (unsigned long)bus;
> ^
> /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c: In
> function ‘s390_pci_iommu_free’:
> /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c:539:20:
> error: cast from pointer to integer of different size
> [-Werror=pointer-to-int-cast]
>  uint64_t key = (unsigned long)bus;
> ^
> cc1: all warnings being treated as errors
> 
> 
> You probably wanted uintptr_t.

Fixed and v2 sent.

I'm wondering if there is any way to catch this earlier (without
actually building on Windows)? My mingw 4.8.2 cross-build worked fine...

[Qemu-devel] [PATCH 0/2] block/nfs: fix segfault and naming of runtime opts

2017-01-20 Thread Peter Lieven

commit 94d6a7a accidently left the naming of runtime opts and QAPI
scheme inconsistent. Furthermore a NULL pointer dereference resulted
in a segfault when parsing URI parameters.

Peter Lieven (2):
  block/nfs: fix NULL pointer dereference in URI parsing
  block/nfs: fix naming of runtime opts

 block/nfs.c | 33 +
 1 file changed, 17 insertions(+), 16 deletions(-)

-- 
1.9.1

[Qemu-devel] [PATCH 1/2] block/nfs: fix NULL pointer dereference in URI parsing

2017-01-20 Thread Peter Lieven

parse_uint_full wants to put the parsed value into the
variabled passed via its second argument which is NULL.

Fixes: 94d6a7a76e9df9919629428f6c598e2b97d9426c
Cc: qemu-sta...@nongnu.org
Signed-off-by: Peter Lieven 
---
 block/nfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/nfs.c b/block/nfs.c
index a564340..baaecff 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -108,12 +108,13 @@ static int nfs_parse_uri(const char *filename, QDict 
*options, Error **errp)
 qdict_put(options, "path", qstring_from_str(uri->path));
 
 for (i = 0; i < qp->n; i++) {
+unsigned long long val;
 if (!qp->p[i].value) {
 error_setg(errp, "Value for NFS parameter expected: %s",
qp->p[i].name);
 goto out;
 }
-if (parse_uint_full(qp->p[i].value, NULL, 0)) {
+if (parse_uint_full(qp->p[i].value, &val, 0)) {
 error_setg(errp, "Illegal value for NFS parameter: %s",
qp->p[i].name);
 goto out;
-- 
1.9.1

Re: [Qemu-devel] [PATCH RFC v3 03/14] intel_iommu: renaming gpa to iova where proper

2017-01-20 Thread Tian, Kevin

> From: Peter Xu [mailto:pet...@redhat.com]
> Sent: Friday, January 20, 2017 5:24 PM
> 
> On Fri, Jan 20, 2017 at 08:27:31AM +, Tian, Kevin wrote:
> > > From: Peter Xu [mailto:pet...@redhat.com]
> > > Sent: Friday, January 13, 2017 11:06 AM
> > >
> > > There are lots of places in current intel_iommu.c codes that named
> > > "iova" as "gpa". It is really confusing to use a name "gpa" in these
> > > places (which is very easily to be understood as "Guest Physical
> > > Address", while it's not). To make the codes (much) easier to be read, I
> > > decided to do this once and for all.
> > >
> > > No functional change is made. Only literal ones.
> >
> > If looking at VT-d spec (3.2 Domains and Address Translation)
> >
> > Remapping hardware treats the address in inbound requests as DMA
> > Address. Depending on the software usage model, the DMA address
> > space may be the Guest-Physical Address (GPA) space of the virtual
> > machine to which the device is assigned, or application Virtual Address
> > (VA) space defined by the PASID assigned to an application, or some
> > abstract I/O virtual address (IOVA) space defined by software.
> >
> > For simplicity, this document refers to address in requests-without-
> > PASID as GPA, and address in requests-with-PASID as Virtual Address
> > (VA) (or Guest Virtual Address (GVA), if such request is from a device
> > assigned to a virtual machine). The translated address is referred to as
> > HPA.
> >
> > It will add more readability if similar comment is added in this file - you
> > can say choosing iova to represent address in requests-without-PASID.
> 
> I agree that the code will be more readable if we can explain all the
> bits in detail.
> 
> However this patch is not adding comments, but to "fix" an existing
> literal problem, which is very misleading to people reading the codes
> for the first time. The places touched in this patch was doing the
> namings incorrectly, so I just corrected them. So even if we want to
> have more comments on explaining the bits, IMHO it'll be nicer to use
> a separate patch, not squashing all the things into a single one.
> 
> If you won't disagree, I'd like to keep this single patch as-it-is, to
> make sure this series can converge soon (I will be sorry since I'll
> extend this series a bit, I hope that won't extend the review process
> too long for it). After that, we can add more documentations if
> needed.
> 

fine with me.

Thanks
Kevin

[Qemu-devel] [PATCH 2/2] block/nfs: fix naming of runtime opts

2017-01-20 Thread Peter Lieven

commit 94d6a7a accidently left the naming of runtime opts and QAPI
scheme inconsistent. As one consequence passing of parameters in the
URI is broken. Sync the naming of the runtime opts to the QAPI
scheme.

Fixes: 94d6a7a76e9df9919629428f6c598e2b97d9426c
Cc: qemu-sta...@nongnu.org
Signed-off-by: Peter Lieven 
---
 block/nfs.c | 30 +++---
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/block/nfs.c b/block/nfs.c
index baaecff..464d547 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -359,27 +359,27 @@ static QemuOptsList runtime_opts = {
 .help = "Path of the image on the host",
 },
 {
-.name = "uid",
+.name = "user",
 .type = QEMU_OPT_NUMBER,
 .help = "UID value to use when talking to the server",
 },
 {
-.name = "gid",
+.name = "group",
 .type = QEMU_OPT_NUMBER,
 .help = "GID value to use when talking to the server",
 },
 {
-.name = "tcp-syncnt",
+.name = "tcp-syn-count",
 .type = QEMU_OPT_NUMBER,
 .help = "Number of SYNs to send during the session establish",
 },
 {
-.name = "readahead",
+.name = "readahead-size",
 .type = QEMU_OPT_NUMBER,
 .help = "Set the readahead size in bytes",
 },
 {
-.name = "pagecache",
+.name = "page-cache-size",
 .type = QEMU_OPT_NUMBER,
 .help = "Set the pagecache size in bytes",
 },
@@ -508,29 +508,29 @@ static int64_t nfs_client_open(NFSClient *client, QDict 
*options,
 goto fail;
 }
 
-if (qemu_opt_get(opts, "uid")) {
-client->uid = qemu_opt_get_number(opts, "uid", 0);
+if (qemu_opt_get(opts, "user")) {
+client->uid = qemu_opt_get_number(opts, "user", 0);
 nfs_set_uid(client->context, client->uid);
 }
 
-if (qemu_opt_get(opts, "gid")) {
-client->gid = qemu_opt_get_number(opts, "gid", 0);
+if (qemu_opt_get(opts, "group")) {
+client->gid = qemu_opt_get_number(opts, "group", 0);
 nfs_set_gid(client->context, client->gid);
 }
 
-if (qemu_opt_get(opts, "tcp-syncnt")) {
-client->tcp_syncnt = qemu_opt_get_number(opts, "tcp-syncnt", 0);
+if (qemu_opt_get(opts, "tcp-syn-count")) {
+client->tcp_syncnt = qemu_opt_get_number(opts, "tcp-syn-count", 0);
 nfs_set_tcp_syncnt(client->context, client->tcp_syncnt);
 }
 
 #ifdef LIBNFS_FEATURE_READAHEAD
-if (qemu_opt_get(opts, "readahead")) {
+if (qemu_opt_get(opts, "readahead-size")) {
 if (open_flags & BDRV_O_NOCACHE) {
 error_setg(errp, "Cannot enable NFS readahead "
  "if cache.direct = on");
 goto fail;
 }
-client->readahead = qemu_opt_get_number(opts, "readahead", 0);
+client->readahead = qemu_opt_get_number(opts, "readahead-size", 0);
 if (client->readahead > QEMU_NFS_MAX_READAHEAD_SIZE) {
 error_report("NFS Warning: Truncating NFS readahead "
  "size to %d", QEMU_NFS_MAX_READAHEAD_SIZE);
@@ -545,13 +545,13 @@ static int64_t nfs_client_open(NFSClient *client, QDict 
*options,
 #endif
 
 #ifdef LIBNFS_FEATURE_PAGECACHE
-if (qemu_opt_get(opts, "pagecache")) {
+if (qemu_opt_get(opts, "page-cache-size")) {
 if (open_flags & BDRV_O_NOCACHE) {
 error_setg(errp, "Cannot enable NFS pagecache "
  "if cache.direct = on");
 goto fail;
 }
-client->pagecache = qemu_opt_get_number(opts, "pagecache", 0);
+client->pagecache = qemu_opt_get_number(opts, "page-cache-size", 0);
 if (client->pagecache > QEMU_NFS_MAX_PAGECACHE_SIZE) {
 error_report("NFS Warning: Truncating NFS pagecache "
  "size to %d pages", QEMU_NFS_MAX_PAGECACHE_SIZE);
-- 
1.9.1

Re: [Qemu-devel] [PATCH RFC v3 02/14] intel_iommu: simplify irq region translation

2017-01-20 Thread Tian, Kevin

> From: Peter Xu [mailto:pet...@redhat.com]
> Sent: Friday, January 20, 2017 5:28 PM
> 
> On Fri, Jan 20, 2017 at 09:15:27AM +, Tian, Kevin wrote:
> > > From: Peter Xu [mailto:pet...@redhat.com]
> > > Sent: Friday, January 20, 2017 5:05 PM
> > >
> > > On Fri, Jan 20, 2017 at 08:22:14AM +, Tian, Kevin wrote:
> > > > > From: Peter Xu [mailto:pet...@redhat.com]
> > > > > Sent: Friday, January 13, 2017 11:06 AM
> > > > >
> > > > > Before we have int-remap, we need to bypass interrupt write requests.
> > > > > That's not necessary now - we have supported int-remap, and all the 
> > > > > irq
> > > > > region requests should be redirected there. Cleaning up the block with
> > > > > an assertion instead.
> > > >
> > > > This comment is not accurate. According to code, the reason why you
> > > > can do such simplification is because we have standalone memory
> > > > region now for interrupt addresses. There should be nothing to do
> > > > with int-remap, which can be disabled by guest... Maybe the standalone
> > > > region was added when developing int-remap, but functionally they
> > > > are not related. :-)
> > >
> > > IMHO the above commit message is fairly clear. :-)
> > >
> > > But sure I can add some more emphasise like:
> > >
> > >   "Before we have int-remap memory region, ..."
> > >
> > > Do you think it's okay? Or any better suggestion?
> > >
> > > (Just to mention that even guest disables IR, the MSI region will
> > >  still be there.)
> > >
> >
> > My option is simple - this patch has nothing to do with int-remap.
> > It's not necessary, not because we supported int-remap. It's because
> > we have a standalone memory region for interrupt addresses, as you
> > described in the code. :-)
> 
> I really think they are the same thing...
> 
> How about this:
> 
> Now we have a standalone memory region for MSI, all the irq region
> requests should be redirected there. Cleaning up the block with an
> assertion instead.
> 

btw what about guest setups a valid mapping at 0xFEEx_ in
its remapping structure, which is then programmed to virtual
device as DMA destination? Then when emulating that virtual DMA,
vtd_do_iommu_translate should simply return (maybe throw out
a warning for diagnostic purpose) instead of assert here. 

VT-d spec defines as below:

Software must ensure the second-level paging-structure entries 
are programmed not to remap input addresses to the interrupt 
address range. Hardware behavior is undefined for memory 
requests remapped to the interrupt address range.

I don't think "hardware behavior is undefined" is equal to "assert
thus kill VM"...

Thanks
Kevin

Re: [Qemu-devel] [PULL 00/12] s390x update

2017-01-20 Thread Thomas Huth

On 20.01.2017 10:21, Cornelia Huck wrote:
> On Thu, 19 Jan 2017 18:32:44 +
> Peter Maydell  wrote:
> 
>> Hi; I'm afraid this fails to build on Windows:
>>
>> /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c: In
>> function ‘s390_pci_get_iommu’:
>> /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c:415:20:
>> error: cast from pointer to integer of different size
>> [-Werror=pointer-to-int-cast]
>>  uint64_t key = (unsigned long)bus;
>> ^
>> /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c: In
>> function ‘s390_pci_iommu_free’:
>> /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c:539:20:
>> error: cast from pointer to integer of different size
>> [-Werror=pointer-to-int-cast]
>>  uint64_t key = (unsigned long)bus;
>> ^
>> cc1: all warnings being treated as errors
>>
>>
>> You probably wanted uintptr_t.
> 
> Fixed and v2 sent.
> 
> I'm wondering if there is any way to catch this earlier (without
> actually building on Windows)? My mingw 4.8.2 cross-build worked fine...

You likely got to use the 32-bit version of MinGW (or compile with -m32
with any other version of GCC) to catch this issue, so that you get
sizeof(long) != sizeof(void *).

 Thomas

Re: [Qemu-devel] [PATCH] display: cirrus: check vga bits per pixel(bpp) value

2017-01-20 Thread Wolfgang Bumiller

On Wed, Jan 11, 2017 at 09:43:41PM +0100, Gerd Hoffmann wrote:
> On Mi, 2017-01-11 at 16:59 +0200, Alberto Garcia wrote:
> > On Mon, Nov 28, 2016 at 11:52:08AM +0530, P J P wrote:
> > > | > --- a/hw/display/cirrus_vga.c
> > > | > +++ b/hw/display/cirrus_vga.c
> > > | > @@ -272,6 +272,9 @@ static void 
> > > cirrus_update_memory_access(CirrusVGAState
> > > | > *s);
> > > | >  static bool blit_region_is_unsafe(struct CirrusVGAState *s,
> > > | >int32_t pitch, int32_t addr)
> > > | >  {
> > > | > +if (!pitch) {
> > > | > +return true;
> > > | > +}
> > > | >
> > > | 
> > > | That doesn't look directly related to 'cirrus_get_bpp', care to explain?
> > > 
> > >   'blit_region_is_unsafe' is called from 'blit_is_unsafe' to check if 
> > > blit 
> > > parameters (cirrus_blt_srcpitch/cirrus_blt_dstpitch)  are safe for 
> > > 'cirrus_do_copy'. These too could lead to div by zero in cirrus_do_copy
> > 
> > This change is causing display artifacts in QEMU 2.8.
> > 
> > What seems to happen is that blit_is_unsafe() is also called for
> > CIRRUS_BLTMODE_PATTERNCOPY, but in this case cirrus_blt_srcpitch is
> > not used. However, because of this new check if its value is 0 then
> > cirrus_bitblt_common_patterncopy() returns early and becomes a no-op.
> 
> inflight vga queue pull request has a fix for that.

Do you mean:
 [PATCH] display: cirrus: ignore source pitch value as needed in blit_is_unsafe
 (Message-Id: <20170109203520.5619-1-brog...@suse.com>)

Because I'm still seeing artifacts on some setups (eg. on win XP).
As far as I can tell the check is still too strong:
The rops used by cirrus_bitblt_common_patterncopy seem to only be using
the destination pitch as far as I can see (all functions in
cirrus_vga_rop2.h) and in my tests only the destination pitch got
filled in, the source pitch was left as zero. Adapting the check when
coming from cirrus_bitblt_common_patterncopy seems to fix the issue for
me.

Additionally (but this didn't have any visible effect in my test (and
shouldn't)) the cirrus_fill rops called from cirrus_bitblt_solidfill
don't actually divide by the pitch (as far as I can see) but just add
it to their destination offset (cirrus_vga_rop2.h around line 276?),
not sure if it makes sense to change how this is handled at all as a
zero pitch there would IMO produce artifacts with or without the check.
I just thought I'd point it out in case someone wanted to know.

What do you think of the patch below? (Applied on top of both other
patches)?

It could definitely use some auditing to see if I missed any of the
code paths, since it involves a bunch of function pointers fetched from
lists depending on parameters. Here's a debug print showing the
situtation in cirrus_bitblt_common_patterncopy() when the artifacts
occured:

s->cirrus_blt_mode   = 0xc0,
s->cirrus_blt_modeext= 0x00,
  Inferred use of s->vga.gr[0x32] from above values:
rop_to_index[s->vga.gr[0x32]]= 5
  (should be ROP2(cirrus_colorexpand_pattern_src) ?)
s->cirrus_blt_pixelwidth = 2
s->cirrus_blt_width  = 1242
s->cirrus_blt_height = 27
s->cirrus_blt_srcpitch   = 0  <-- culprit
s->cirrus_blt_dstpitch   = 2560

 8< 

>From a3be50cc3e3bb0f5eb784d30048b8866bdca Mon Sep 17 00:00:00 2001
From: Wolfgang Bumiller 
Date: Fri, 20 Jan 2017 09:44:39 +0100
Subject: [PATCH] cirrus: allow zero source pitch in pattern fill rops

The rops used by cirrus_bitblt_common_patterncopy only use
the destination pitch, so the source pitch shoul allowed to
be zero.

Signed-off-by: Wolfgang Bumiller 
---
 hw/display/cirrus_vga.c | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/hw/display/cirrus_vga.c b/hw/display/cirrus_vga.c
index 379910d..c2fce8c 100644
--- a/hw/display/cirrus_vga.c
+++ b/hw/display/cirrus_vga.c
@@ -272,9 +272,6 @@ static void cirrus_update_memory_access(CirrusVGAState *s);
 static bool blit_region_is_unsafe(struct CirrusVGAState *s,
   int32_t pitch, int32_t addr)
 {
-if (!pitch) {
-return true;
-}
 if (pitch < 0) {
 int64_t min = addr
 + ((int64_t)s->cirrus_blt_height-1) * pitch;
@@ -294,7 +291,7 @@ static bool blit_region_is_unsafe(struct CirrusVGAState *s,
 return false;
 }

-static bool blit_is_unsafe(struct CirrusVGAState *s, bool dst_only)
+static bool blit_is_unsafe(struct CirrusVGAState *s, bool dst_only, bool 
zero_src_pitch_ok)
 {
 /* should be the case, see cirrus_bitblt_start */
 assert(s->cirrus_blt_width > 0);
@@ -304,6 +301,10 @@ static bool blit_is_unsafe(struct CirrusVGAState *s, bool 
dst_only)
 return true;
 }

+if (!s->cirrus_blt_dstpitch) {
+return true;
+}
+
 if (blit_region_is_unsafe(s, s->cirrus_blt_dstpitch,
   s->cirrus_blt_dstaddr & s->cirrus_addr_mask)) {

Re: [Qemu-devel] [PATCH v2 3/3] block: get max_transfer limit for char (scsi-generic) devices

2017-01-20 Thread Fam Zheng

On Fri, 01/20 17:31, Fam Zheng wrote:
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 2115155..94068ca 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -657,9 +657,11 @@ static int 
> > hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
> >  int max_sectors = 0;
> >  short max_sectors_short = 0;
> >  if (bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
> > +/* sg returns a value in bytes */
> >  return max_sectors;
> 
> The variable name is now misleading, maybe use "bytes" instead?

BTW patch 2 should already make the function return bytes consistently.  Doing
it here is meaningless code churn.

Fam

Re: [Qemu-devel] [PULL 00/12] s390x update

2017-01-20 Thread Thomas Huth

On 20.01.2017 10:52, Thomas Huth wrote:
> On 20.01.2017 10:21, Cornelia Huck wrote:
>> On Thu, 19 Jan 2017 18:32:44 +
>> Peter Maydell  wrote:
>>
>>> Hi; I'm afraid this fails to build on Windows:
>>>
>>> /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c: In
>>> function ‘s390_pci_get_iommu’:
>>> /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c:415:20:
>>> error: cast from pointer to integer of different size
>>> [-Werror=pointer-to-int-cast]
>>>  uint64_t key = (unsigned long)bus;
>>> ^
>>> /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c: In
>>> function ‘s390_pci_iommu_free’:
>>> /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c:539:20:
>>> error: cast from pointer to integer of different size
>>> [-Werror=pointer-to-int-cast]
>>>  uint64_t key = (unsigned long)bus;
>>> ^
>>> cc1: all warnings being treated as errors
>>>
>>>
>>> You probably wanted uintptr_t.
>>
>> Fixed and v2 sent.
>>
>> I'm wondering if there is any way to catch this earlier (without
>> actually building on Windows)? My mingw 4.8.2 cross-build worked fine...
> 
> You likely got to use the 32-bit version of MinGW (or compile with -m32
> with any other version of GCC) to catch this issue, so that you get
> sizeof(long) != sizeof(void *).

... or was it the other way round? ... use the 64-bit version of MinGW?
... that likely makes more sense... sorry, it's been a while that I used
MinGW for the last time.

 Thomas

Re: [Qemu-devel] [PULL 00/12] s390x update

2017-01-20 Thread Cornelia Huck

On Fri, 20 Jan 2017 10:52:26 +0100
Thomas Huth  wrote:

> On 20.01.2017 10:21, Cornelia Huck wrote:
> > On Thu, 19 Jan 2017 18:32:44 +
> > Peter Maydell  wrote:
> > 
> >> Hi; I'm afraid this fails to build on Windows:
> >>
> >> /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c: In
> >> function ‘s390_pci_get_iommu’:
> >> /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c:415:20:
> >> error: cast from pointer to integer of different size
> >> [-Werror=pointer-to-int-cast]
> >>  uint64_t key = (unsigned long)bus;
> >> ^
> >> /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c: In
> >> function ‘s390_pci_iommu_free’:
> >> /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c:539:20:
> >> error: cast from pointer to integer of different size
> >> [-Werror=pointer-to-int-cast]
> >>  uint64_t key = (unsigned long)bus;
> >> ^
> >> cc1: all warnings being treated as errors
> >>
> >>
> >> You probably wanted uintptr_t.
> > 
> > Fixed and v2 sent.
> > 
> > I'm wondering if there is any way to catch this earlier (without
> > actually building on Windows)? My mingw 4.8.2 cross-build worked fine...
> 
> You likely got to use the 32-bit version of MinGW (or compile with -m32
> with any other version of GCC) to catch this issue, so that you get
> sizeof(long) != sizeof(void *).

Well, I already do that (mingw32, and I see -m32 on the command line;
that already caught sizeof(uint64_t) != sizeof(void *) here.) But isn't
sizeof(long) == sizeof(void *) even in the -m32 case?

Re: [Qemu-devel] [PATCH RFC v3 02/14] intel_iommu: simplify irq region translation

2017-01-20 Thread Peter Xu

On Fri, Jan 20, 2017 at 09:52:01AM +, Tian, Kevin wrote:

[...]

> btw what about guest setups a valid mapping at 0xFEEx_ in
> its remapping structure, which is then programmed to virtual
> device as DMA destination? Then when emulating that virtual DMA,
> vtd_do_iommu_translate should simply return (maybe throw out
> a warning for diagnostic purpose) instead of assert here. 
> 
> VT-d spec defines as below:
> 
>   Software must ensure the second-level paging-structure entries 
>   are programmed not to remap input addresses to the interrupt 
>   address range. Hardware behavior is undefined for memory 
>   requests remapped to the interrupt address range.

Thanks for this reference. That's something I was curious about.

> 
> I don't think "hardware behavior is undefined" is equal to "assert
> thus kill VM"...

I don't think it will kill the VM. After we have the MSI region, it
should just use that IR region for everything (read/write/translate).
So iiuc when anyone setups IOVA mapping within range 0xfeex, then
a DMA will trigger an interrupt (rather than memory moves), but in
most cases the interrupt will be illegal since either the data is
invalid (e.g., non-zero reserved bits, or SID verification failure),
further it should trigger a vIOMMU fault (though IR fault reporting is
still incomplete, that's my next thing to do after this series).

Thanks,

-- peterx

Re: [Qemu-devel] [PULL 00/12] s390x update

2017-01-20 Thread Thomas Huth

On 20.01.2017 11:00, Cornelia Huck wrote:
> On Fri, 20 Jan 2017 10:52:26 +0100
> Thomas Huth  wrote:
> 
>> On 20.01.2017 10:21, Cornelia Huck wrote:
>>> On Thu, 19 Jan 2017 18:32:44 +
>>> Peter Maydell  wrote:
>>>
 Hi; I'm afraid this fails to build on Windows:

 /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c: In
 function ‘s390_pci_get_iommu’:
 /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c:415:20:
 error: cast from pointer to integer of different size
 [-Werror=pointer-to-int-cast]
  uint64_t key = (unsigned long)bus;
 ^
 /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c: In
 function ‘s390_pci_iommu_free’:
 /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c:539:20:
 error: cast from pointer to integer of different size
 [-Werror=pointer-to-int-cast]
  uint64_t key = (unsigned long)bus;
 ^
 cc1: all warnings being treated as errors


 You probably wanted uintptr_t.
>>>
>>> Fixed and v2 sent.
>>>
>>> I'm wondering if there is any way to catch this earlier (without
>>> actually building on Windows)? My mingw 4.8.2 cross-build worked fine...
>>
>> You likely got to use the 32-bit version of MinGW (or compile with -m32
>> with any other version of GCC) to catch this issue, so that you get
>> sizeof(long) != sizeof(void *).
> 
> Well, I already do that (mingw32, and I see -m32 on the command line;
> that already caught sizeof(uint64_t) != sizeof(void *) here.) But isn't
> sizeof(long) == sizeof(void *) even in the -m32 case?

Yes, sorry, my bad, it's the 64-bit version of MinGW where sizeof(long)
is different to sizeof(void *), see
https://sourceforge.net/p/mingw-w64/bugs/58/ for example.

 Thomas

Re: [Qemu-devel] [PATCH RFC] vfio error recovery: kernel support

2017-01-20 Thread Cao jin



On 01/20/2017 04:16 AM, Michael S. Tsirkin wrote:
> This is a design and an initial patch for kernel side for AER
> support in VFIO.
> 
> 0. What happens now (PCIE AER only)
>Fatal errors cause a link reset.
>Non fatal errors don't.
>All errors stop the VM eventually, but not immediately
>because it's detected and reported asynchronously.
>Interrupts are forwarded as usual.
>Correctable errors are not reported to guest at all.
>Note: PPC EEH is different. This focuses on AER.
> 
> 1. Correctable errors
>I don't see a need to report these to guest. So let's not.
> 
> 2. Fatal errors
>It's not easy to handle them gracefully since link reset
>is needed. As a first step, let's use the existing mechanism
>in that case.
>
> 2. Non-fatal errors
>Here we could make progress by reporting them to guest
>and have guest handle them.
>Issues:
> a. this behaviour should only be enabled with new userspace
>old userspace should work without changes
> Suggestion: One way to address this would be to add a new eventfd
> non_fatal_err_trigger. If not set, invoke err_trigger.
> 
> b. drivers are supposed to stop MMIO when error is reported
> if vm keeps going, we will keep doing MMIO/config
> Suggestion 1: ignore this. vm stop happens much later when userspace runs 
> anyway,
> so we are not making things much worse
> Suggestion 2: try to stop MMIO/config, resume on resume call
> 
> Patch below implements Suggestion 1.
> 
> c. PF driver might detect that function is completely broken,
> if vm keeps going, we will keep doing MMIO/config
> Suggestion 1: ignore this. vm stop happens much later when userspace runs 
> anyway,
> so we are not making things much worse
> Suggestion 2: detect this and invoke err_trigger to stop VM
> 
> Patch below implements Suggestion 2.
> 
> Aside: we currently return PCI_ERS_RESULT_DISCONNECT when device
> is not attached. This seems bogus, likely based on the confusing name.
> We probably should return PCI_ERS_RESULT_CAN_RECOVER.
> 
> The following patch does not change that.
> 
> Signed-off-by: Michael S. Tsirkin 
> 
> ---
> 
> The patch is completely untested. Let's discuss the design first.
> Cao jin, if this is deemed acceptable please take it from here.
> 

Ok, thanks very much.

> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index dce511f..fdca683 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -1292,7 +1292,9 @@ static pci_ers_result_t 
> vfio_pci_aer_err_detected(struct pci_dev *pdev,
>  
>   mutex_lock(&vdev->igate);
>  
> - if (vdev->err_trigger)
> + if (state == pci_channel_io_normal && vdev->non_fatal_err_trigger)
> + eventfd_signal(vdev->err_trigger, 1);
> + else if (vdev->err_trigger)
>   eventfd_signal(vdev->err_trigger, 1);
>  
>   mutex_unlock(&vdev->igate);
> @@ -1302,8 +1304,38 @@ static pci_ers_result_t 
> vfio_pci_aer_err_detected(struct pci_dev *pdev,
>   return PCI_ERS_RESULT_CAN_RECOVER;
>  }
>  
> +static pci_ers_result_t vfio_pci_aer_slot_reset(struct pci_dev *pdev,
> + pci_channel_state_t state)
> +{
> + struct vfio_pci_device *vdev;
> + struct vfio_device *device;
> +
> + device = vfio_device_get_from_dev(&pdev->dev);
> + if (!device)
> + goto err_dev;
> +
> + vdev = vfio_device_data(device);
> + if (!vdev)
> + goto err_dev;
> +
> + mutex_lock(&vdev->igate);
> +
> + if (vdev->err_trigger)
> + eventfd_signal(vdev->err_trigger, 1);
> +
> + mutex_unlock(&vdev->igate);
> +
> + vfio_device_put(device);
> +
> +err_data:
> + vfio_device_put(device);
> +err_dev:
> + return PCI_ERS_RESULT_RECOVERED;
> +}
> +
>  static const struct pci_error_handlers vfio_err_handlers = {
>   .error_detected = vfio_pci_aer_err_detected,
> + .slot_reset = vfio_pci_aer_slot_reset,
>  };
>  

if .slot_reset wants to be called, .error_detected should return
PCI_ERS_RESULT_NEED_RESET, as pci-error-recovery.txt said, so does code.

Is .slot_reset now just a copy of .error_detected and we are going do
some tricks here? or else don't get why .slot_reset signal user again.

>  static struct pci_driver vfio_pci_driver = {
> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
> b/drivers/vfio/pci/vfio_pci_intrs.c
> index 1c46045..e883db5 100644
> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> @@ -611,6 +611,17 @@ static int vfio_pci_set_err_trigger(struct 
> vfio_pci_device *vdev,
>  count, flags, data);
>  }
>  
> +static int vfio_pci_set_non_fatal_err_trigger(struct vfio_pci_device *vdev,
> + unsigned index, unsigned start,
> + unsigned count, uint32_t flags, void *data)
> +{
> + if (index !=

Re: [Qemu-devel] [PULL 00/12] s390x update

2017-01-20 Thread Cornelia Huck

On Fri, 20 Jan 2017 11:04:29 +0100
Thomas Huth  wrote:

> On 20.01.2017 11:00, Cornelia Huck wrote:
> > On Fri, 20 Jan 2017 10:52:26 +0100
> > Thomas Huth  wrote:
> > 
> >> On 20.01.2017 10:21, Cornelia Huck wrote:
> >>> On Thu, 19 Jan 2017 18:32:44 +
> >>> Peter Maydell  wrote:
> >>>
>  Hi; I'm afraid this fails to build on Windows:
> 
>  /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c: In
>  function ‘s390_pci_get_iommu’:
>  /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c:415:20:
>  error: cast from pointer to integer of different size
>  [-Werror=pointer-to-int-cast]
>   uint64_t key = (unsigned long)bus;
>  ^
>  /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c: In
>  function ‘s390_pci_iommu_free’:
>  /home/petmay01/linaro/qemu-for-merges/hw/s390x/s390-pci-bus.c:539:20:
>  error: cast from pointer to integer of different size
>  [-Werror=pointer-to-int-cast]
>   uint64_t key = (unsigned long)bus;
>  ^
>  cc1: all warnings being treated as errors
> 
> 
>  You probably wanted uintptr_t.
> >>>
> >>> Fixed and v2 sent.
> >>>
> >>> I'm wondering if there is any way to catch this earlier (without
> >>> actually building on Windows)? My mingw 4.8.2 cross-build worked fine...
> >>
> >> You likely got to use the 32-bit version of MinGW (or compile with -m32
> >> with any other version of GCC) to catch this issue, so that you get
> >> sizeof(long) != sizeof(void *).
> > 
> > Well, I already do that (mingw32, and I see -m32 on the command line;
> > that already caught sizeof(uint64_t) != sizeof(void *) here.) But isn't
> > sizeof(long) == sizeof(void *) even in the -m32 case?
> 
> Yes, sorry, my bad, it's the 64-bit version of MinGW where sizeof(long)
> is different to sizeof(void *), see
> https://sourceforge.net/p/mingw-w64/bugs/58/ for example.

Ah, that makes sense. Thanks!

Re: [Qemu-devel] [PATCH v2 07/11] aspeed/smc: handle SPI flash Command mode

2017-01-20 Thread Peter Maydell

On 19 January 2017 at 20:35, Cédric Le Goater  wrote:
> On 01/19/2017 08:26 PM, Peter Maydell wrote:
>> On 9 January 2017 at 16:24, Cédric Le Goater  wrote:
>>> The Aspeed SMC controllers have a mode (Command mode) in which
>>> accesses to the flash content are no different than doing MMIOs. The
>>> controller generates all the necessary commands to load (or store)
>>> data in memory.
>>>
>>> However, accesses are restricted to the segment window assigned the
>>> the flash module by the controller. This window is defined by the
>>> Segment Address Register.
>>>
>>> Signed-off-by: Cédric Le Goater 
>>> Reviewed-by: Andrew Jeffery 
>>> ---
>>>  hw/ssi/aspeed_smc.c | 152 
>>> ++--
>>>  include/hw/ssi/aspeed_smc.h |   2 +-
>>>  2 files changed, 132 insertions(+), 22 deletions(-)
>>
>> This deleted the only call to aspeed_smc_is_usermode() but not
>> its definition, which makes clang complain:
>> /Users/pm215/src/qemu-for-merges/hw/ssi/aspeed_smc.c:409:20: error:
>> unused function 'aspeed_smc_is_usermode' [-Werror,-Wunused-function]
>>
>> Presumably the function itself should be deleted?
>
> yes. This is correct. I will send a patch for it.

I'll just edit the commit in my target-arm tree, that
will be simplest.

thanks
-- PMM

Re: [Qemu-devel] [PATCH 0/6] i386: Add "max" CPU model to TCG and KVM

2017-01-20 Thread Peter Maydell

On 19 January 2017 at 21:04, Eduardo Habkost  wrote:
> This is v2 of the previous series that enabled the "host" CPU
> model on TCG. Now a new "max" CPU is being added, while keeping
> "host" KVM-specific.
>
> In addition to simply adding "max" as a copy of the existing
> "host" CPU model, additional patches change it to not use any
> host CPUID information when in TCG mode.
>
> ---
> Cc: "Richard W.M. Jones" 
> Cc: David Hildenbrand 
> Cc: Cornelia Huck 
> Cc: "Daniel P. Berrange" 
> Cc: Peter Maydell 
> Cc: Igor Mammedov 
> Cc: Jiri Denemark 
> Cc: Richard Henderson 
> Cc: Christian Borntraeger 
> Cc: "Jason J. Herne" 
> Cc: Paolo Bonzini 
>
> Eduardo Habkost (6):
>   i386: Unset cannot_destroy_with_object_finalize_yet on "host" model
>   i386: Add ordering field to CPUClass
>   i386: Rename X86CPU::host_features to X86CPU::max_features
>   i386: Create "max" CPU model
>   i386: Make "max" model not use any host CPUID info on TCG
>   i386: Don't set CPUClass::cpu_def on "max" model
>
>  target/i386/cpu-qom.h |   6 ++-
>  target/i386/cpu.h |   2 +-
>  target/i386/cpu.c | 110 
> +-
>  3 files changed, 71 insertions(+), 47 deletions(-)

Should we be documenting this new -cpu max somewhere?

(If this series is OK I can send patches to add support to
ARM, and also to add "-machine gic-version=max" to allow
picking the best available interrupt controller type, by
analogy.)

thanks
-- PMM

Re: [Qemu-devel] [PATCH v2 07/11] aspeed/smc: handle SPI flash Command mode

2017-01-20 Thread Cédric Le Goater

On 01/20/2017 11:13 AM, Peter Maydell wrote:
> On 19 January 2017 at 20:35, Cédric Le Goater  wrote:
>> On 01/19/2017 08:26 PM, Peter Maydell wrote:
>>> On 9 January 2017 at 16:24, Cédric Le Goater  wrote:
 The Aspeed SMC controllers have a mode (Command mode) in which
 accesses to the flash content are no different than doing MMIOs. The
 controller generates all the necessary commands to load (or store)
 data in memory.

 However, accesses are restricted to the segment window assigned the
 the flash module by the controller. This window is defined by the
 Segment Address Register.

 Signed-off-by: Cédric Le Goater 
 Reviewed-by: Andrew Jeffery 
 ---
  hw/ssi/aspeed_smc.c | 152 
 ++--
  include/hw/ssi/aspeed_smc.h |   2 +-
  2 files changed, 132 insertions(+), 22 deletions(-)
>>>
>>> This deleted the only call to aspeed_smc_is_usermode() but not
>>> its definition, which makes clang complain:
>>> /Users/pm215/src/qemu-for-merges/hw/ssi/aspeed_smc.c:409:20: error:
>>> unused function 'aspeed_smc_is_usermode' [-Werror,-Wunused-function]
>>>
>>> Presumably the function itself should be deleted?
>>
>> yes. This is correct. I will send a patch for it.
> 
> I'll just edit the commit in my target-arm tree, that
> will be simplest.

ok. sure. 

I do push a branch on github before sending for travis to test.
Should that error have been caught by travis ? Just asking, as
I might have missed it.

Thanks,

C.

[Qemu-devel] [qemu PATCH] pc.h: move x-mach-use-reliable-get-clock compat entry to PC_COMPAT_2_8

2017-01-20 Thread Marcelo Tosatti


As noticed by David Gilbert, commit 6053a86 'kvmclock: reduce kvmclock
differences on migration' added 'x-mach-use-reliable-get-clock' and a
compatibility entry that turns it off; however it got merged after 2.8.0
was released but the entry has gone into PC_COMPAT_2_7 where it should
have gone into PC_COMPAT_2_8.

Fix it by moving the entry to PC_COMPAT_2_8.

Signed-off-by: Marcelo Tosatti 


diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index b22e699..738bfd6 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -375,14 +375,15 @@ int e820_get_num_entries(void);
 bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
 
 #define PC_COMPAT_2_8 \
-
-#define PC_COMPAT_2_7 \
-HW_COMPAT_2_7 \
+HW_COMPAT_2_8 \
 {\
 .driver   = "kvmclock",\
 .property = "x-mach-use-reliable-get-clock",\
 .value= "off",\
 },\
+
+#define PC_COMPAT_2_7 \
+HW_COMPAT_2_7 \
 {\
 .driver   = TYPE_X86_CPU,\
 .property = "l3-cache",\

Re: [Qemu-devel] kvmclock in wrong compat macro?

2017-01-20 Thread Marcelo Tosatti

On Wed, Jan 18, 2017 at 04:03:08PM +, Dr. David Alan Gilbert wrote:
> Hi Marcelo,
>In commit 6053a86 'kvmclock: reduce kvmclock differences on migration'
> you add a 'x-mach-use-reliable-get-clock' and a compatibility entry
> that turns it off; if I'm reading it right though it got merged
> after 2.8.0 was released but the entry has gone into PC_COMPAT_2_7
> where I guess it should have gone into PC_COMPAT_2_8.

Hi David, 

Thats right: there was a merge/time conflict (patch was written when 
PC_COMPAT_2_8 made sense). Patch sent.

> Also, why is it x86 specific?

Because kvmclock is.

Re: [Qemu-devel] [RFC PATCH v1] softfloat: Add round-to-odd rounding mode

2017-01-20 Thread Peter Maydell

On 20 January 2017 at 09:17, Bharata B Rao  wrote:
> Power ISA 3.0 introduces a few quadruple precision floating point
> instructions that support round-to-odd rounding mode. The
> round-to-odd mode is explained as under:
>
> Let Z be the intermediate arithmetic result or the operand of a convert
> operation. If Z can be represented exactly in the target format, the
> result is Z. Otherwise the result is either Z1 or Z2 whichever is odd.
> Here Z1 and Z2 are the next larger and smaller numbers representable
> in the target format respectively.
>
> Signed-off-by: Bharata B Rao 
> ---
> Changes in v1:
> - Addressed rounding to infinity in roundAndPackFloat128().
> - Added 64bit round to odd implementation(roundAndPackFloat64()) as
>   it is needed by xscvqpdp instruction of Power ISA 3.0.
>
> v0: http://patchwork.ozlabs.org/patch/716956/
>
>  fpu/softfloat.c | 10 ++
>  include/fpu/softfloat.h |  2 ++
>  2 files changed, 12 insertions(+)
>
> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> index c295f31..b9b36ad 100644
> --- a/fpu/softfloat.c
> +++ b/fpu/softfloat.c
> @@ -623,6 +623,9 @@ static float64 roundAndPackFloat64(flag zSign, int zExp, 
> uint64_t zSig,
>  case float_round_down:
>  roundIncrement = zSign ? 0x3ff : 0;
>  break;
> +case float_round_to_odd:
> +roundIncrement = (zSig & 0x200) ? 0 : 0x3ff;

Are you sure that should be 0x200 and not 0x400 ?

> +break;
>  default:
>  abort();
>  }

This isn't sufficient, because it won't do the right thing
in the code which is picking between "round to infinity" and
"round to largest/smallest representable number". That's
phrased differently from the Float128 code but it's still
there:

return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));

will generate a result with an all-zeros mantissa for
roundIncrement == 0 (ie go to infinity) and an all-ones
mantissa otherwise (ie go to largest-representable).
That works for the existing cases but it doesn't work
for round_to_odd.

How are you testing these patches? You ought to be able
to compare the results against a known-good implementation
for a bunch of these corner cases and a lot of random
input values, which is the best way to be confident there
aren't bugs in them. (risu should be able to do this since
it supports PPC now, though it can miss things depending
on the number of iterations you ask it to do.)

> @@ -1149,6 +1152,9 @@ static float128 roundAndPackFloat128(flag zSign, 
> int32_t zExp,
>  case float_round_down:
>  increment = zSign && zSig2;
>  break;
> +case float_round_to_odd:
> +increment = !(zSig1 & 0x1) && zSig2;
> +break;
>  default:
>  abort();
>  }
> @@ -1168,6 +1174,7 @@ static float128 roundAndPackFloat128(flag zSign, 
> int32_t zExp,
>  if (( roundingMode == float_round_to_zero )
>   || ( zSign && ( roundingMode == float_round_up ) )
>   || ( ! zSign && ( roundingMode == float_round_down ) )
> + || ( roundingMode == float_round_to_odd )
> ) {
>  return
>  packFloat128(
> @@ -1215,6 +1222,9 @@ static float128 roundAndPackFloat128(flag zSign, 
> int32_t zExp,
>  case float_round_down:
>  increment = zSign && zSig2;
>  break;
> +case float_round_to_odd:
> +increment = !(zSig1 & 0x1) && zSig2;
> +break;
>  default:
>  abort();
>  }
> diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
> index 842ec6b..8a39028 100644
> --- a/include/fpu/softfloat.h
> +++ b/include/fpu/softfloat.h
> @@ -180,6 +180,8 @@ enum {
>  float_round_up   = 2,
>  float_round_to_zero  = 3,
>  float_round_ties_away= 4,
> +/* Not an IEEE rounding mode: round to the closest odd mantissa value */
> +float_round_to_odd   = 5,
>  };
>
>  
> /*

thanks
-- PMM

Re: [Qemu-devel] [PATCH v7 26/27] tcg: enable MTTCG by default for ARM on x86 hosts

2017-01-20 Thread Alex Bennée


Pranith Kumar  writes:

> Alex Bennée writes:
>
>> This enables the multi-threaded system emulation by default for ARMv7
>> and ARMv8 guests using the x86_64 TCG backend. This is because on the
>> guest side:
>>
>>   - The ARM translate.c/translate-64.c have been converted to
>> - use MTTCG safe atomic primitives
>> - emit the appropriate barrier ops
>>   - The ARM machine has been updated to
>> - hold the BQL when modifying shared cross-vCPU state
>> - defer cpu_reset to async safe work
>>
>> All the host backends support the barrier and atomic primitives but
>> need to provide same-or-better support for normal load/store
>> operations.
>>
>> Signed-off-by: Alex Bennée 
>
> 
>
>>
>> +/* This defines the natural memory order supported by this
>> + * architecture before guarantees made by various barrier
>> + * instructions.
>> + *
>> + * The x86 has a pretty strong memory ordering which only really
>> + * allows for some stores to be re-ordered after loads.
>> + */
>> +#include "tcg-mo.h"
>> +
>> +static inline int get_tcg_target_mo(void)
>> +{
>> +return TCG_MO_ALL & ~TCG_MO_LD_ST;
>> +}
>> +
>
> Shouldn't this be TCG_MO_ALL & ~TCG_MO_ST_LD?

The case that x86 doesn't handle normally is store-after-load which is
what I assumed TCG_MO_LD_ST was. Perhaps we need some better comments
for each of the enums?

>
> Thanks,


--
Alex Bennée

Re: [Qemu-devel] [qemu PATCH] pc.h: move x-mach-use-reliable-get-clock compat entry to PC_COMPAT_2_8

2017-01-20 Thread Dr. David Alan Gilbert

* Marcelo Tosatti (mtosa...@redhat.com) wrote:
> 
> As noticed by David Gilbert, commit 6053a86 'kvmclock: reduce kvmclock
> differences on migration' added 'x-mach-use-reliable-get-clock' and a
> compatibility entry that turns it off; however it got merged after 2.8.0
> was released but the entry has gone into PC_COMPAT_2_7 where it should
> have gone into PC_COMPAT_2_8.
> 
> Fix it by moving the entry to PC_COMPAT_2_8.
> 
> Signed-off-by: Marcelo Tosatti 

Reviewed-by: Dr. David Alan Gilbert 

> 
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index b22e699..738bfd6 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -375,14 +375,15 @@ int e820_get_num_entries(void);
>  bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
>  
>  #define PC_COMPAT_2_8 \
> -
> -#define PC_COMPAT_2_7 \
> -HW_COMPAT_2_7 \
> +HW_COMPAT_2_8 \
>  {\
>  .driver   = "kvmclock",\
>  .property = "x-mach-use-reliable-get-clock",\
>  .value= "off",\
>  },\
> +
> +#define PC_COMPAT_2_7 \
> +HW_COMPAT_2_7 \
>  {\
>  .driver   = TYPE_X86_CPU,\
>  .property = "l3-cache",\
> 
> 
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK

Re: [Qemu-devel] [PATCH v2 07/11] aspeed/smc: handle SPI flash Command mode

2017-01-20 Thread Cédric Le Goater

On 01/20/2017 11:17 AM, Cédric Le Goater wrote:
> On 01/20/2017 11:13 AM, Peter Maydell wrote:
>> On 19 January 2017 at 20:35, Cédric Le Goater  wrote:
>>> On 01/19/2017 08:26 PM, Peter Maydell wrote:
 On 9 January 2017 at 16:24, Cédric Le Goater  wrote:
> The Aspeed SMC controllers have a mode (Command mode) in which
> accesses to the flash content are no different than doing MMIOs. The
> controller generates all the necessary commands to load (or store)
> data in memory.
>
> However, accesses are restricted to the segment window assigned the
> the flash module by the controller. This window is defined by the
> Segment Address Register.
>
> Signed-off-by: Cédric Le Goater 
> Reviewed-by: Andrew Jeffery 
> ---
>  hw/ssi/aspeed_smc.c | 152 
> ++--
>  include/hw/ssi/aspeed_smc.h |   2 +-
>  2 files changed, 132 insertions(+), 22 deletions(-)

 This deleted the only call to aspeed_smc_is_usermode() but not
 its definition, which makes clang complain:
 /Users/pm215/src/qemu-for-merges/hw/ssi/aspeed_smc.c:409:20: error:
 unused function 'aspeed_smc_is_usermode' [-Werror,-Wunused-function]

 Presumably the function itself should be deleted?
>>>
>>> yes. This is correct. I will send a patch for it.
>>
>> I'll just edit the commit in my target-arm tree, that
>> will be simplest.
> 
> ok. sure. 
> 
> I do push a branch on github before sending for travis to test.
> Should that error have been caught by travis ? Just asking, as
> I might have missed it.

So I did miss it. I will take a closer look at the logs next time.

Thanks,

C.

Re: [Qemu-devel] [PATCH v2 3/3] block: get max_transfer limit for char (scsi-generic) devices

2017-01-20 Thread Eric Farman




On 01/20/2017 04:53 AM, Fam Zheng wrote:

On Fri, 01/20 17:31, Fam Zheng wrote:

diff --git a/block/file-posix.c b/block/file-posix.c
index 2115155..94068ca 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -657,9 +657,11 @@ static int hdev_get_max_transfer_length(BlockDriverState 
*bs, int fd)
 int max_sectors = 0;
 short max_sectors_short = 0;
 if (bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
+/* sg returns a value in bytes */
 return max_sectors;


The variable name is now misleading, maybe use "bytes" instead?


BTW patch 2 should already make the function return bytes consistently.  Doing
it here is meaningless code churn.


Excellent points.  Spinning v3 now.

 - Eric

[Qemu-devel] [PULL v2 00/36] target-arm queue

2017-01-20 Thread Peter Maydell

Changes from v1: just squashed in
http://patchwork.ozlabs.org/patch/717312/
which deletes a now-unused function.

thanks
-- PMM

The following changes since commit 0f6bcf68a99efdc531b209551f2b760b0bdcc554:

  Merge remote-tracking branch 'remotes/artyom/tags/pull-sun4v-20170118' into 
staging (2017-01-19 18:34:13 +)

are available in the git repository at:

  git://git.linaro.org/people/pmaydell/qemu-arm.git 
tags/pull-target-arm-20170120

for you to fetch changes up to f29cacfb5fc0a6e93efc3f6d2900d82d625f143e:

  hw/arm/virt: Add board property to enable EL2 (2017-01-20 11:15:11 +)


target-arm queue:
 * support virtualization in GICv3
 * enable EL2 in AArch64 CPU models
 * allow EL2 to be enabled on 'virt' board via -machine virtualization=on
 * aspeed: SMC improvements
 * m25p80: support die erase command
 * m25p80: Add Quad Page Program 4byte
 * m25p80: Improve 1GiB Micron flash definition
 * arm: Uniquely name imx25 I2C buses


Alastair D'Silva (1):
  arm: Uniquely name imx25 I2C buses.

Andrew Jones (1):
  hw/arm/virt-acpi-build: use SMC if booting in EL2

Ard Biesheuvel (1):
  hw/arm/virt-acpi - reserve ECAM space as PNP0C02 device

Cédric Le Goater (10):
  aspeed/smc: remove call to reset in realize function
  aspeed/smc: remove call to aspeed_smc_update_cs() in reset function
  aspeed/smc: rework the prototype of the AspeedSMCFlash helper routines
  aspeed/smc: autostrap CE0/1 configuration
  aspeed/smc: unfold the AspeedSMCController array
  aspeed/smc: adjust the size of the register region
  aspeed/smc: handle SPI flash Command mode
  aspeed/smc: reset flash after each test
  aspeed/smc: extend tests for Command mode
  aspeed: use first FMC flash as a boot ROM

Marcin Krzeminski (3):
  block: m25p80: Add Quad Page Program 4byte
  block: m25p80: Introduce die erase command
  block: m25p80: Improve 1GiB Micron flash definition

Peter Maydell (19):
  target/arm: Handle VIRQ and VFIQ in arm_cpu_do_interrupt_aarch32()
  target/arm: Implement DBGVCR32_EL2 system register
  hw/intc/arm_gicv3: Add external IRQ lines for VIRQ and VFIQ
  hw/intc/arm_gic: Add external IRQ lines for VIRQ and VFIQ
  target-arm: Expose output GPIO line for VCPU maintenance interrupt
  hw/arm/virt: Wire VIRQ, VFIQ, maintenance irq lines from GIC to CPU
  target-arm: Add ARMCPU fields for GIC CPU i/f config
  hw/intc/gicv3: Add defines for ICH system register fields
  hw/intc/gicv3: Add data fields for virtualization support
  hw/intc/arm_gicv3: Add accessors for ICH_ system registers
  hw/intc/arm_gicv3: Implement ICV_ registers which are just accessors
  hw/intc/arm_gicv3: Implement ICV_ HPPIR, DIR and RPR registers
  hw/intc/arm_gicv3: Implement ICV_ registers EOIR and IAR
  hw/intc/arm_gicv3: Implement gicv3_cpuif_virt_update()
  hw/intc/arm_gicv3: Implement EL2 traps for CPU i/f regs
  hw/arm/virt: Support using SMC for PSCI
  target/arm/psci.c: If EL2 implemented, start CPUs in EL2
  target-arm: Enable EL2 feature bit on A53 and A57
  hw/arm/virt: Add board property to enable EL2

Shannon Zhao (1):
  arm: virt: Fix segmentation fault when specifying an unsupported CPU

 hw/intc/gicv3_internal.h   |   79 +++
 include/hw/arm/virt.h  |5 +-
 include/hw/intc/arm_gic_common.h   |2 +
 include/hw/intc/arm_gicv3_common.h |   21 +
 include/hw/ssi/aspeed_smc.h|4 +-
 target/arm/cpu.h   |9 +
 hw/arm/aspeed.c|   41 ++
 hw/arm/imx25_pdk.c |2 +-
 hw/arm/virt-acpi-build.c   |   36 +-
 hw/arm/virt.c  |   88 ++-
 hw/arm/xlnx-zynqmp.c   |2 +
 hw/block/m25p80.c  |   51 +-
 hw/i2c/imx_i2c.c   |2 +-
 hw/intc/arm_gic_common.c   |6 +
 hw/intc/arm_gicv3_common.c |   31 +
 hw/intc/arm_gicv3_cpuif.c  | 1351 +++-
 hw/ssi/aspeed_smc.c|  325 +++--
 target/arm/cpu.c   |   15 +
 target/arm/cpu64.c |8 +
 target/arm/helper.c|   21 +
 target/arm/psci.c  |   25 +-
 tests/m25p80-test.c|  133 
 hw/intc/trace-events   |   33 +
 23 files changed, 2149 insertions(+), 141 deletions(-)

Re: [Qemu-devel] [RFC PATCH v1] softfloat: Add round-to-odd rounding mode

2017-01-20 Thread Bharata B Rao

On Fri, Jan 20, 2017 at 10:28:22AM +, Peter Maydell wrote:
> On 20 January 2017 at 09:17, Bharata B Rao  wrote:
> > Power ISA 3.0 introduces a few quadruple precision floating point
> > instructions that support round-to-odd rounding mode. The
> > round-to-odd mode is explained as under:
> >
> > Let Z be the intermediate arithmetic result or the operand of a convert
> > operation. If Z can be represented exactly in the target format, the
> > result is Z. Otherwise the result is either Z1 or Z2 whichever is odd.
> > Here Z1 and Z2 are the next larger and smaller numbers representable
> > in the target format respectively.
> >
> > Signed-off-by: Bharata B Rao 
> > ---
> > Changes in v1:
> > - Addressed rounding to infinity in roundAndPackFloat128().
> > - Added 64bit round to odd implementation(roundAndPackFloat64()) as
> >   it is needed by xscvqpdp instruction of Power ISA 3.0.
> >
> > v0: http://patchwork.ozlabs.org/patch/716956/
> >
> >  fpu/softfloat.c | 10 ++
> >  include/fpu/softfloat.h |  2 ++
> >  2 files changed, 12 insertions(+)
> >
> > diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> > index c295f31..b9b36ad 100644
> > --- a/fpu/softfloat.c
> > +++ b/fpu/softfloat.c
> > @@ -623,6 +623,9 @@ static float64 roundAndPackFloat64(flag zSign, int 
> > zExp, uint64_t zSig,
> >  case float_round_down:
> >  roundIncrement = zSign ? 0x3ff : 0;
> >  break;
> > +case float_round_to_odd:
> > +roundIncrement = (zSig & 0x200) ? 0 : 0x3ff;
> 
> Are you sure that should be 0x200 and not 0x400 ?

You are right, it should have been 0x400 as we are discarding lower
10 precision bits. The even or odd check should have been for
the next higher significant bit to that of these 10 discarded bits.

> 
> > +break;
> >  default:
> >  abort();
> >  }
> 
> This isn't sufficient, because it won't do the right thing
> in the code which is picking between "round to infinity" and
> "round to largest/smallest representable number". That's
> phrased differently from the Float128 code but it's still
> there:
> 
> return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
> 
> will generate a result with an all-zeros mantissa for
> roundIncrement == 0 (ie go to infinity) and an all-ones
> mantissa otherwise (ie go to largest-representable).
> That works for the existing cases but it doesn't work
> for round_to_odd.

Based on my understanding of your and Richard's clarification, we
shouldn't overflow to infinity in round-to-odd mode. Like I did for 128 bit
case where we return the max possible value in the similar situation, I
suppose we should explicitly take care of returning max 64bit value here
for round-to-odd case ?

> 
> How are you testing these patches? You ought to be able
> to compare the results against a known-good implementation
> for a bunch of these corner cases and a lot of random
> input values, which is the best way to be confident there
> aren't bugs in them. (risu should be able to do this since
> it supports PPC now, though it can miss things depending
> on the number of iterations you ask it to do.)

I have been manually checking the result of new PPC instructions like
xsdivqp[o], xsmulqp[o], xscvqpdp[0]. But that is clearly not enough.
Let me look at RISU and do a more thorough corner cases testing
before posting the next version.

Thanks for your review.

Regards,
Bharata.

Re: [Qemu-devel] [RFC PATCH v1] softfloat: Add round-to-odd rounding mode

2017-01-20 Thread Peter Maydell

On 20 January 2017 at 11:36, Bharata B Rao  wrote:
> On Fri, Jan 20, 2017 at 10:28:22AM +, Peter Maydell wrote:
>> This isn't sufficient, because it won't do the right thing
>> in the code which is picking between "round to infinity" and
>> "round to largest/smallest representable number". That's
>> phrased differently from the Float128 code but it's still
>> there:
>>
>> return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
>>
>> will generate a result with an all-zeros mantissa for
>> roundIncrement == 0 (ie go to infinity) and an all-ones
>> mantissa otherwise (ie go to largest-representable).
>> That works for the existing cases but it doesn't work
>> for round_to_odd.
>
> Based on my understanding of your and Richard's clarification, we
> shouldn't overflow to infinity in round-to-odd mode. Like I did for 128 bit
> case where we return the max possible value in the similar situation, I
> suppose we should explicitly take care of returning max 64bit value here
> for round-to-odd case ?

I would suggest something like
  bool overflow_to_inf = roundingMode != float_round_to_odd &&
  roundIncrement != 0;

  float_raise(float_flag_overflow | float_flag_inexact, status);
  return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));

as the contents of the if() {} body for overflow detected.

thanks
-- PMM

Re: [Qemu-devel] [PATCH v6 kernel 3/5] virtio-balloon: speed up inflate/deflate process

2017-01-20 Thread Dr. David Alan Gilbert

* Liang Li (liang.z...@intel.com) wrote:

> +static void free_extended_page_bitmap(struct virtio_balloon *vb)
> +{
> + int i, bmap_count = vb->nr_page_bmap;
> +
> + for (i = 1; i < bmap_count; i++) {
> + kfree(vb->page_bitmap[i]);
> + vb->page_bitmap[i] = NULL;
> + vb->nr_page_bmap--;
> + }
> +}
> +
> +static void kfree_page_bitmap(struct virtio_balloon *vb)
> +{
> + int i;
> +
> + for (i = 0; i < vb->nr_page_bmap; i++)
> + kfree(vb->page_bitmap[i]);
> +}

It might be worth commenting that pair of functions to make it clear
why they are so different; I guess the kfree_page_bitmap
is used just before you free the structure above it so you
don't need to keep the count/pointers updated?

Dave
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK

[Qemu-devel] [Bug 1657538] Re: qemu 2.7.x 2.8 softmmu dont work on BE machine

2017-01-20 Thread Thomas Huth

Something else to try: Could you please check whether you get any output
when you run "qemu-system-ppc -nographic" on your system? So we can make
sure that it is not related to GTK or something similar...

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1657538

Title:
  qemu 2.7.x 2.8 softmmu dont work on BE machine

Status in QEMU:
  New

Bug description:
  Build on Be machine qemu 2.7.1 and 2.8 in pure softmmu (tgc) dont work on big 
endian hardware .
  tested with ppc-softmmu,i386-softmmu,arm-softmmu same result:

  with :
   ./qemu-system-i386 
  Gtk-Message: Failed to load module "overlay-scrollbar"
  qemu-system-i386: Trying to execute code outside RAM or ROM at 0x000a
  This usually means one of the following happened:

  (1) You told QEMU to execute a kernel for the wrong machine type, and it 
crashed on startup (eg trying to run a raspberry pi kernel on a versatilepb 
QEMU machine)
  (2) You didn't give QEMU a kernel or BIOS filename at all, and QEMU executed 
a ROM full of no-op instructions until it fell off the end
  (3) Your guest kernel has a bug and crashed by jumping off into nowhere

  This is almost always one of the first two, so check your command line and 
that you are using the right type of kernel for this machine.
  If you think option (3) is likely then you can try debugging your guest with 
the -d debug options; in particular -d guest_errors will cause the log to 
include a dump of the guest register state at this point.

  Execution cannot continue; stopping here.

  
  I try to add the -L option with ../pc-bios/bios.bin 
  and have the same result.

  note the ppc-softmmu and ppc64-softmmu work in kvm mode only emulated
  mode have issue.

  
  tested on my hardware a  Qriq P5040 and G5 4x970MP with Ubuntu Mate 16.10 
  thanks
  Luigi

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1657538/+subscriptions

[Qemu-devel] [Bug 1657538] Re: qemu 2.7.x 2.8 softmmu dont work on BE machine

2017-01-20 Thread Thomas Huth

I can not reproduce this issue. QEMU 2.8 works fine here on a POWER8 big
endian host (running RHEL7). Can you still run older versions of QEMU
that used to work for you in the past, to check whether it is QEMU or
whether it is Ubuntu 16.10 that is causing the trouble here? Could you
please also post the full output of the "configure" run on your system?

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1657538

Title:
  qemu 2.7.x 2.8 softmmu dont work on BE machine

Status in QEMU:
  New

Bug description:
  Build on Be machine qemu 2.7.1 and 2.8 in pure softmmu (tgc) dont work on big 
endian hardware .
  tested with ppc-softmmu,i386-softmmu,arm-softmmu same result:

  with :
   ./qemu-system-i386 
  Gtk-Message: Failed to load module "overlay-scrollbar"
  qemu-system-i386: Trying to execute code outside RAM or ROM at 0x000a
  This usually means one of the following happened:

  (1) You told QEMU to execute a kernel for the wrong machine type, and it 
crashed on startup (eg trying to run a raspberry pi kernel on a versatilepb 
QEMU machine)
  (2) You didn't give QEMU a kernel or BIOS filename at all, and QEMU executed 
a ROM full of no-op instructions until it fell off the end
  (3) Your guest kernel has a bug and crashed by jumping off into nowhere

  This is almost always one of the first two, so check your command line and 
that you are using the right type of kernel for this machine.
  If you think option (3) is likely then you can try debugging your guest with 
the -d debug options; in particular -d guest_errors will cause the log to 
include a dump of the guest register state at this point.

  Execution cannot continue; stopping here.

  
  I try to add the -L option with ../pc-bios/bios.bin 
  and have the same result.

  note the ppc-softmmu and ppc64-softmmu work in kvm mode only emulated
  mode have issue.

  
  tested on my hardware a  Qriq P5040 and G5 4x970MP with Ubuntu Mate 16.10 
  thanks
  Luigi

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1657538/+subscriptions

Re: [Qemu-devel] Towards an ivshmem 2.0?

2017-01-20 Thread Wang, Wei W

On Tuesday, January 17, 2017 5:46 PM, Jan Kiszka wrote:
> On 2017-01-17 10:13, Wang, Wei W wrote:
> > Hi Jan,
> >
> > On Monday, January 16, 2017 9:10 PM, Jan Kiszka wrote:
> >> On 2017-01-16 13:41, Marc-André Lureau wrote:
> >>> On Mon, Jan 16, 2017 at 12:37 PM Jan Kiszka  >>> > wrote:
> >>> some of you may know that we are using a shared memory device similar
> to
> >>> ivshmem in the partitioning hypervisor Jailhouse [1].
> >>>
> >>> We started as being compatible to the original ivshmem that QEMU
> >>> implements, but we quickly deviated in some details, and in the recent
> >>> months even more. Some of the deviations are related to making the
> >>> implementation simpler. The new ivshmem takes <500 LoC - Jailhouse is
> >>> aiming at safety critical systems and, therefore, a small code base.
> >>> Other changes address deficits in the original design, like missing
> >>> life-cycle management.
> >>>
> >>> Now the question is if there is interest in defining a common new
> >>> revision of this device and maybe also of some protocols used on top,
> >>> such as virtual network links. Ideally, this would enable us to share
> >>> Linux drivers. We will definitely go for upstreaming at least a 
> >>> network
> >>> driver such as [2], a UIO driver and maybe also a serial port/console.
> >>>
> >>>
> >>> This sounds like duplicating efforts done with virtio and vhost-pci.
> >>> Have you looked at Wei Wang proposal?
> >>
> >> I didn't follow it recently, but the original concept was about
> >> introducing an IOMMU model to the picture, and that's complexity-wise
> >> a no-go for us (we can do this whole thing in less than 500 lines,
> >> even virtio itself is more complex). IIUC, the alternative to an
> >> IOMMU is mapping the whole frontend VM memory into the backend VM -
> that's security/safety-wise an absolute no-go.
> >
> > Though the virtio based solution might be complex for you, a big advantage 
> > is
> that we have lots of people working to improve virtio. For example, the
> upcoming virtio 1.1 has vring improvement, we can easily upgrade all the 
> virtio
> based solutions, such as vhost-pci, to take advantage of this improvement. 
> From
> the long term perspective, I think this kind of complexity is worthwhile.
> 
> We will adopt virtio 1.1 ring formats. That's one reason why there is also 
> still a
> bidirectional shared memory region: to host the new descriptors (while keeping
> the payload safely in the unidirectional regions).

The vring example I gave might be confusing, sorry about  that. My point is 
that every part of virtio is getting matured and improved from time to time.  
Personally, having a new device developed and maintained in an active and 
popular model is helpful. Also, as new features being gradually added in the 
future, a simple device could become complex. 

Having a theoretical analysis on the performance: 
The traditional shared memory mechanism, sharing an intermediate memory, 
requires 2 copies to get the packet transmitted. It's not just one more copy 
compared to the 1-copy solution, I think some more things we may need to take 
into account:
1) there are extra ring operation overhead  on both the sending and receiving 
side to access the shared memory (i.e. IVSHMEM);
2) extra protocol to use the shared memory;
3) the piece of allocated shared memory from the host = C(n,2), where n is the 
number of VMs. Like for 20 VMs who want to talk to each other, there will be 
190 pieces of memory allocated from the host. 

That being said, if people really want the 2-copy solution, we can also have 
vhost-pci support it that way as a new feature (not sure if you would be 
interested in collaborating on the project):
With the new feature added, the master VM sends only a piece of memory 
(equivalent to IVSHMEM, but allocated by the guest) to the slave over 
vhost-user protocol, and the vhost-pci device on the slave side only hosts that 
piece of shared memory.

Best,
Wei

[Qemu-devel] [PATCH] Introduce DEVICE_CATEGORY_CPU for CPU devices

2017-01-20 Thread Thomas Huth

Now that the hot-pluggable CPUs show up in the help text of
"-device ?", we should group them into an appropriate category.

Signed-off-by: Thomas Huth 
---
 hw/ppc/pnv_core.c   | 1 +
 hw/ppc/spapr_cpu_core.c | 1 +
 include/hw/qdev-core.h  | 1 +
 qdev-monitor.c  | 1 +
 target/i386/cpu.c   | 2 ++
 5 files changed, 6 insertions(+)

diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index d79d530..a9c981e 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -190,6 +190,7 @@ static void pnv_core_class_init(ObjectClass *oc, void *data)
 DeviceClass *dc = DEVICE_CLASS(oc);
 PnvCoreClass *pcc = PNV_CORE_CLASS(oc);
 
+set_bit(DEVICE_CATEGORY_CPU, dc->categories);
 dc->realize = pnv_core_realize;
 dc->props = pnv_core_properties;
 pcc->cpu_oc = cpu_class_by_name(TYPE_POWERPC_CPU, data);
diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c
index c18632b..dd0028e 100644
--- a/hw/ppc/spapr_cpu_core.c
+++ b/hw/ppc/spapr_cpu_core.c
@@ -365,6 +365,7 @@ void spapr_cpu_core_class_init(ObjectClass *oc, void *data)
 DeviceClass *dc = DEVICE_CLASS(oc);
 sPAPRCPUCoreClass *scc = SPAPR_CPU_CORE_CLASS(oc);
 
+set_bit(DEVICE_CATEGORY_CPU, dc->categories);
 dc->realize = spapr_cpu_core_realize;
 scc->cpu_class = cpu_class_by_name(TYPE_POWERPC_CPU, data);
 g_assert(scc->cpu_class);
diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
index 2c97347..b44b476 100644
--- a/include/hw/qdev-core.h
+++ b/include/hw/qdev-core.h
@@ -26,6 +26,7 @@ typedef enum DeviceCategory {
 DEVICE_CATEGORY_DISPLAY,
 DEVICE_CATEGORY_SOUND,
 DEVICE_CATEGORY_MISC,
+DEVICE_CATEGORY_CPU,
 DEVICE_CATEGORY_MAX
 } DeviceCategory;
 
diff --git a/qdev-monitor.c b/qdev-monitor.c
index c73410c..5f2fcdf 100644
--- a/qdev-monitor.c
+++ b/qdev-monitor.c
@@ -136,6 +136,7 @@ static void qdev_print_devinfos(bool show_no_user)
 [DEVICE_CATEGORY_DISPLAY] = "Display",
 [DEVICE_CATEGORY_SOUND]   = "Sound",
 [DEVICE_CATEGORY_MISC]= "Misc",
+[DEVICE_CATEGORY_CPU] = "CPU",
 [DEVICE_CATEGORY_MAX] = "Uncategorized",
 };
 GSList *list, *elt;
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index aba11ae..7b8ca5f 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -2381,8 +2381,10 @@ static void x86_cpu_cpudef_class_init(ObjectClass *oc, 
void *data)
 {
 X86CPUDefinition *cpudef = data;
 X86CPUClass *xcc = X86_CPU_CLASS(oc);
+DeviceClass *dc = DEVICE_CLASS(oc);
 
 xcc->cpu_def = cpudef;
+set_bit(DEVICE_CATEGORY_CPU, dc->categories);
 }
 
 static void x86_register_cpudef_type(X86CPUDefinition *def)
-- 
1.8.3.1

Re: [Qemu-devel] [PATCH v3 4/4] ARRAY_SIZE: check that argument is an array

2017-01-20 Thread Paolo Bonzini



On 20/01/2017 08:34, Markus Armbruster wrote:
> Eric Blake  writes:
> 
>> On 01/19/2017 04:11 PM, Michael S. Tsirkin wrote:
>>
> +#define QEMU_IS_ARRAY(x) (!__builtin_types_compatible_p(typeof(x), \
> +typeof(&(x)[0])))
>  #ifndef ARRAY_SIZE
> -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
> +#define ARRAY_SIZE(x) ((sizeof(x) / sizeof((x)[0])) + \
> +   QEMU_BUILD_BUG_ON_ZERO(!QEMU_IS_ARRAY(x)))

 We've got some double-negation going on here ("cause a build bug if the
 negation of QEMU_IS_ARRAY() is not 0") which takes some mental
 gymnastics, but it is the correct result.  [I kind of like that gnulib
 uses positive logic in its 'verify(x)' meaning "verify that x is true,
 or cause a build error"; compared to the negative logic in the kernal
 'BUILD_BUG_ON[_ZERO](x)' meaning "cause a build bug if x is non-zero" -
 but that's personal preference and not something for qemu to change]
>>>
>>> I can rename QEMU_IS_ARRAY to QEMU_IS_PTR and reverse the logic - would
>>> this be preferable?
>>
>> No, that's worse. As written, "cause a build bug if x is not an array"
>> is easier than "cause a build bug if x is a pointer", because now you
>> are missing an implicit "(instead of the intended array)".  Keep it the
>> way you have it.  I guess it's the _ZERO as a suffix that's throwing me;
>> a better name might have been QEMU_ZERO_OR_BUILD_BUG_ON(x) ("give me a
>> zero expression, or a build bug if x is non-zero") rather than
>> QEMU_BUILD_BUG_ON_ZERO (my first read was "give me a build bug if x is
>> zero", but a better read is "give me a build bug if x is not zero, else
>> give me x because it is zero") - but our choice of naming in patch 3/4
>> mirrors the kernel naming, so it's not worth changing.
> 
> Two ways to skin the assertion cat:
> 
> assert must_be_true
> bug_on must_be_false
> 
> The C language picks the first one, both with assert() and with C11's
> _Static_assert().  I'd prefer we stick to that, but I'm not asking you
> to change your series.

We should probably change it to QEMU_STATIC_ASSERT and
QEMU_STATIC_ASSERT_VALUE, but that shouldn't be in this series.

Paolo

Re: [Qemu-devel] [PATCH] bitmap: assert that start and nr are non negative

2017-01-20 Thread Paolo Bonzini



On 19/01/2017 17:43, Peter Lieven wrote:
> commit e1123a3b introduced a data corruption regression
> in the iscsi driver because it passed -1 as nr to bitmap_set
> and bitmap_clear. Add an assertion to catch such flaws earlier.
> 
> Suggested-by: Fam Zheng 
> Signed-off-by: Peter Lieven 
> ---
>  util/bitmap.c | 8 
>  1 file changed, 8 insertions(+)
> 
> diff --git a/util/bitmap.c b/util/bitmap.c
> index 43ed011..c1a84ca 100644
> --- a/util/bitmap.c
> +++ b/util/bitmap.c
> @@ -164,6 +164,8 @@ void bitmap_set(unsigned long *map, long start, long nr)
>  int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
>  unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
>  
> +assert(start >= 0 && nr >= 0);
> +
>  while (nr - bits_to_set >= 0) {
>  *p |= mask_to_set;
>  nr -= bits_to_set;
> @@ -184,6 +186,8 @@ void bitmap_set_atomic(unsigned long *map, long start, 
> long nr)
>  int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
>  unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
>  
> +assert(start >= 0 && nr >= 0);
> +
>  /* First word */
>  if (nr - bits_to_set > 0) {
>  atomic_or(p, mask_to_set);
> @@ -221,6 +225,8 @@ void bitmap_clear(unsigned long *map, long start, long nr)
>  int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
>  unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
>  
> +assert(start >= 0 && nr >= 0);
> +
>  while (nr - bits_to_clear >= 0) {
>  *p &= ~mask_to_clear;
>  nr -= bits_to_clear;
> @@ -243,6 +249,8 @@ bool bitmap_test_and_clear_atomic(unsigned long *map, 
> long start, long nr)
>  unsigned long dirty = 0;
>  unsigned long old_bits;
>  
> +assert(start >= 0 && nr >= 0);
> +
>  /* First word */
>  if (nr - bits_to_clear > 0) {
>  old_bits = atomic_fetch_and(p, ~mask_to_clear);
> 

Queued, thanks.

Paolo

Re: [Qemu-devel] [qemu PATCH] pc.h: move x-mach-use-reliable-get-clock compat entry to PC_COMPAT_2_8

2017-01-20 Thread Paolo Bonzini



On 18/01/2017 18:53, Marcelo Tosatti wrote:
> 
> As noticed by David Gilbert, commit 6053a86 'kvmclock: reduce kvmclock
> differences on migration' added 'x-mach-use-reliable-get-clock' and a
> compatibility entry that turns it off; however it got merged after 2.8.0
> was released but the entry has gone into PC_COMPAT_2_7 where it should
> have gone into PC_COMPAT_2_8.
> 
> Fix it by moving the entry to PC_COMPAT_2_8.
> 
> Signed-off-by: Marcelo Tosatti 
> 
> 
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index b22e699..738bfd6 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -375,14 +375,15 @@ int e820_get_num_entries(void);
>  bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
>  
>  #define PC_COMPAT_2_8 \
> -
> -#define PC_COMPAT_2_7 \
> -HW_COMPAT_2_7 \
> +HW_COMPAT_2_8 \
>  {\
>  .driver   = "kvmclock",\
>  .property = "x-mach-use-reliable-get-clock",\
>  .value= "off",\
>  },\
> +
> +#define PC_COMPAT_2_7 \
> +HW_COMPAT_2_7 \
>  {\
>  .driver   = TYPE_X86_CPU,\
>  .property = "l3-cache",\
> 
> 

Queued, thanks.

Paolo

Re: [Qemu-devel] [PATCH] Introduce DEVICE_CATEGORY_CPU for CPU devices

2017-01-20 Thread Paolo Bonzini

On 20/01/2017 13:18, Thomas Huth wrote:
> Now that the hot-pluggable CPUs show up in the help text of
> "-device ?", we should group them into an appropriate category.
> 
> Signed-off-by: Thomas Huth 
> ---
>  hw/ppc/pnv_core.c   | 1 +
>  hw/ppc/spapr_cpu_core.c | 1 +
>  include/hw/qdev-core.h  | 1 +
>  qdev-monitor.c  | 1 +
>  target/i386/cpu.c   | 2 ++
>  5 files changed, 6 insertions(+)
> 
> diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
> index d79d530..a9c981e 100644
> --- a/hw/ppc/pnv_core.c
> +++ b/hw/ppc/pnv_core.c
> @@ -190,6 +190,7 @@ static void pnv_core_class_init(ObjectClass *oc, void 
> *data)
>  DeviceClass *dc = DEVICE_CLASS(oc);
>  PnvCoreClass *pcc = PNV_CORE_CLASS(oc);
>  
> +set_bit(DEVICE_CATEGORY_CPU, dc->categories);
>  dc->realize = pnv_core_realize;
>  dc->props = pnv_core_properties;
>  pcc->cpu_oc = cpu_class_by_name(TYPE_POWERPC_CPU, data);
> diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c
> index c18632b..dd0028e 100644
> --- a/hw/ppc/spapr_cpu_core.c
> +++ b/hw/ppc/spapr_cpu_core.c
> @@ -365,6 +365,7 @@ void spapr_cpu_core_class_init(ObjectClass *oc, void 
> *data)
>  DeviceClass *dc = DEVICE_CLASS(oc);
>  sPAPRCPUCoreClass *scc = SPAPR_CPU_CORE_CLASS(oc);
>  
> +set_bit(DEVICE_CATEGORY_CPU, dc->categories);
>  dc->realize = spapr_cpu_core_realize;
>  scc->cpu_class = cpu_class_by_name(TYPE_POWERPC_CPU, data);
>  g_assert(scc->cpu_class);
> diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
> index 2c97347..b44b476 100644
> --- a/include/hw/qdev-core.h
> +++ b/include/hw/qdev-core.h
> @@ -26,6 +26,7 @@ typedef enum DeviceCategory {
>  DEVICE_CATEGORY_DISPLAY,
>  DEVICE_CATEGORY_SOUND,
>  DEVICE_CATEGORY_MISC,
> +DEVICE_CATEGORY_CPU,
>  DEVICE_CATEGORY_MAX
>  } DeviceCategory;
>  
> diff --git a/qdev-monitor.c b/qdev-monitor.c
> index c73410c..5f2fcdf 100644
> --- a/qdev-monitor.c
> +++ b/qdev-monitor.c
> @@ -136,6 +136,7 @@ static void qdev_print_devinfos(bool show_no_user)
>  [DEVICE_CATEGORY_DISPLAY] = "Display",
>  [DEVICE_CATEGORY_SOUND]   = "Sound",
>  [DEVICE_CATEGORY_MISC]= "Misc",
> +[DEVICE_CATEGORY_CPU] = "CPU",
>  [DEVICE_CATEGORY_MAX] = "Uncategorized",
>  };
>  GSList *list, *elt;
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index aba11ae..7b8ca5f 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -2381,8 +2381,10 @@ static void x86_cpu_cpudef_class_init(ObjectClass *oc, 
> void *data)
>  {
>  X86CPUDefinition *cpudef = data;
>  X86CPUClass *xcc = X86_CPU_CLASS(oc);
> +DeviceClass *dc = DEVICE_CLASS(oc);
>  
>  xcc->cpu_def = cpudef;
> +set_bit(DEVICE_CATEGORY_CPU, dc->categories);
>  }
>  
>  static void x86_register_cpudef_type(X86CPUDefinition *def)
> 

Queued, thanks.

Paolo

Re: [Qemu-devel] [PATCH RFC 0/3] vfio: allow to notify unmap for very big region

2017-01-20 Thread Peter Xu

On Fri, Jan 20, 2017 at 11:43:28AM +0800, Peter Xu wrote:

[...]

> > What I don't want to see is for this API bug to leak out into the rest
> > of the QEMU code such that intel_iommu code, or iommu code in general
> > subtly avoids it by artificially using a smaller range.  VT-d hardware
> > has an actual physical address space of either 2^39 or 2^48 bits, so if
> > you want to make the iommu address space match the device we're trying
> > to emulate, that's perfectly fine.  AIUI, AMD-Vi does actually have a
> > 64-bit address space on the IOMMU, so to handle that case I'd expect
> > the simplest solution would be to track the and mapped iova high water
> > mark per container in vfio and truncate unmaps to that high water end
> > address.  Realistically we're probably not going to see iovas at the end
> > of the 64-bit address space, but we can come up with some other
> > workaround in the vfio code or update the kernel API if we do.  Thanks,
> 
> Agree that high watermark can be a good solution for VT-d. I'll use
> that instead of 2^63-1.

Okay when I replied I didn't notice this "watermark" may need more
than several (even tens of) LOCs. :(

Considering that I see no further usage of this watermark, I'm
thinking whether it's okay I directly use (1ULL << VTD_MGAW) here as
the watermark - it's simple, efficient and secure imho.

Thanks,

-- peterx

[Qemu-devel] [PATCH v2 0/2] hw/usb/dev-hid: Make usb-tablet work with OS X/macOS guests

2017-01-20 Thread Phil Dennis-Jordan

This series makes the Qemu usb-tablet work correctly with OS X/macOS guests 
without the need for a special guest driver.

 * The usb-tablet should not have a boot protocol of 2. Other OSes seem to 
ignore this, but the IOHIDFamily driver stack chokes on it for anything but 
conventional (relative motion) mice.
 * A "mac_compat" boolean option is added to the usb-tablet, which changes its 
report descriptor to specify a usage of 0x02 (mouse) instead of 0x01 (pointer). 
This is required for correct operation in the Mac HID driver stack.

Changelog
=

v1 -> v2:
 * v1 Thread was "[PATCH] hw/usb/dev-hid: add a Mac guest compatibility option 
to usb-tablet"
 * Always apply the boot protocol (bInterfaceProtocol) change to usb-tablet, 
not just when the Mac compatibility option is active. The original value of 
0x02 was determined to be incorrect according to the spec anyway.
 * As the boot protocol change is permanent, separate interface and device 
descriptor constants for the Mac/non-Mac variants of the tablet are no longer 
required, and have been removed.

Phil Dennis-Jordan (2):
  hw/usb/dev-hid: set bInterfaceProtocol to 0x00 for usb-tablet
  hw/usb/dev-hid: add a usb-tablet Mac guest compatibility option

 hw/usb/dev-hid.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

-- 
2.3.2 (Apple Git-55)

[Qemu-devel] [PATCH v2 2/2] hw/usb/dev-hid: add a usb-tablet Mac guest compatibility option

2017-01-20 Thread Phil Dennis-Jordan

Darwin/OS X/macOS's HID driver stack previously did not correctly drive Qemu's 
simulated USB Tablet. This adds a boolean option "mac_compat" to the device 
which subtly changes the device's report descriptor so it behaves in a way that 
Mac guests can handle.

Absolute pointing devices with HID Report Descriptor usage page of 0x01 
(pointing) are handled by the macOS HID driver as analog sticks, and absolute 
coordinates are not directly translated to absolute mouse cursor positions.

The workaround is to report a usage page of 0x02 (mouse) instead. In 
combination with the previous commit's boot protocol fix, this allows the 
usb-tablet to correctly control the mouse cursor in OS X/macOS guest systems.

Signed-off-by: Phil Dennis-Jordan 
---
 hw/usb/dev-hid.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/hw/usb/dev-hid.c b/hw/usb/dev-hid.c
index a23e5d4..c4a0d22 100644
--- a/hw/usb/dev-hid.c
+++ b/hw/usb/dev-hid.c
@@ -51,6 +51,7 @@ typedef struct USBHIDState {
 uint32_t usb_version;
 char *display;
 uint32_t head;
+bool mac_compat;
 } USBHIDState;
 
 #define TYPE_USB_HID "usb-hid"
@@ -599,6 +600,9 @@ static void usb_hid_handle_control(USBDevice *dev, 
USBPacket *p,
 memcpy(data, qemu_tablet_hid_report_descriptor,
   sizeof(qemu_tablet_hid_report_descriptor));
 p->actual_length = sizeof(qemu_tablet_hid_report_descriptor);
+if (us->mac_compat) {
+data[3] = 0x02; /* Set usage to mouse, not pointing (1) */
+}
 } else if (hs->kind == HID_KEYBOARD) {
 memcpy(data, qemu_keyboard_hid_report_descriptor,
sizeof(qemu_keyboard_hid_report_descriptor));
@@ -801,6 +805,7 @@ static Property usb_tablet_properties[] = {
 DEFINE_PROP_UINT32("usb_version", USBHIDState, usb_version, 2),
 DEFINE_PROP_STRING("display", USBHIDState, display),
 DEFINE_PROP_UINT32("head", USBHIDState, head, 0),
+DEFINE_PROP_BOOL("mac_compat", USBHIDState, mac_compat, false),
 DEFINE_PROP_END_OF_LIST(),
 };
 
-- 
2.3.2 (Apple Git-55)

Re: [Qemu-devel] [PATCH] Introduce DEVICE_CATEGORY_CPU for CPU devices

2017-01-20 Thread Peter Maydell

On 20 January 2017 at 12:18, Thomas Huth  wrote:
> Now that the hot-pluggable CPUs show up in the help text of
> "-device ?", we should group them into an appropriate category.
>
> Signed-off-by: Thomas Huth 
> ---
>  hw/ppc/pnv_core.c   | 1 +
>  hw/ppc/spapr_cpu_core.c | 1 +
>  include/hw/qdev-core.h  | 1 +
>  qdev-monitor.c  | 1 +
>  target/i386/cpu.c   | 2 ++
>  5 files changed, 6 insertions(+)

Can we just set the category bit in the class init functions
for the TYPE_CPU and TYPE_CPU_CORE superclasses, rather
than having to do it in a subclass for each architecture
(and thus potentially forgetting to do it for other archs) ?

thanks
-- PMM

[Qemu-devel] [PATCH v2 1/2] hw/usb/dev-hid: set bInterfaceProtocol to 0x00 for usb-tablet

2017-01-20 Thread Phil Dennis-Jordan

This should be non-zero for boot protocol devices only, which the usb-tablet is 
not.

A boot protocol of 0x02 specifically confuses OS X/macOS' HID driver stack, 
causing it to generate additional bogus HID events with relative motion in 
addition to the tablet's absolute coordinate events.

Signed-off-by: Phil Dennis-Jordan 
---
 hw/usb/dev-hid.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/usb/dev-hid.c b/hw/usb/dev-hid.c
index 24d05f7..a23e5d4 100644
--- a/hw/usb/dev-hid.c
+++ b/hw/usb/dev-hid.c
@@ -144,7 +144,7 @@ static const USBDescIface desc_iface_tablet = {
 .bInterfaceNumber  = 0,
 .bNumEndpoints = 1,
 .bInterfaceClass   = USB_CLASS_HID,
-.bInterfaceProtocol= 0x02,
+.bInterfaceProtocol= 0x00,
 .ndesc = 1,
 .descs = (USBDescOther[]) {
 {
@@ -174,7 +174,7 @@ static const USBDescIface desc_iface_tablet2 = {
 .bInterfaceNumber  = 0,
 .bNumEndpoints = 1,
 .bInterfaceClass   = USB_CLASS_HID,
-.bInterfaceProtocol= 0x02,
+.bInterfaceProtocol= 0x00,
 .ndesc = 1,
 .descs = (USBDescOther[]) {
 {
-- 
2.3.2 (Apple Git-55)

Re: [Qemu-devel] [PULL v2 00/36] target-arm queue

2017-01-20 Thread Peter Maydell

On 20 January 2017 at 11:36, Peter Maydell  wrote:
> Changes from v1: just squashed in
> http://patchwork.ozlabs.org/patch/717312/
> which deletes a now-unused function.
>
> thanks
> -- PMM
>
> The following changes since commit 0f6bcf68a99efdc531b209551f2b760b0bdcc554:
>
>   Merge remote-tracking branch 'remotes/artyom/tags/pull-sun4v-20170118' into 
> staging (2017-01-19 18:34:13 +)
>
> are available in the git repository at:
>
>   git://git.linaro.org/people/pmaydell/qemu-arm.git 
> tags/pull-target-arm-20170120
>
> for you to fetch changes up to f29cacfb5fc0a6e93efc3f6d2900d82d625f143e:
>
>   hw/arm/virt: Add board property to enable EL2 (2017-01-20 11:15:11 +)
>
> 
> target-arm queue:
>  * support virtualization in GICv3
>  * enable EL2 in AArch64 CPU models
>  * allow EL2 to be enabled on 'virt' board via -machine virtualization=on
>  * aspeed: SMC improvements
>  * m25p80: support die erase command
>  * m25p80: Add Quad Page Program 4byte
>  * m25p80: Improve 1GiB Micron flash definition
>  * arm: Uniquely name imx25 I2C buses
>

Applied, thanks.

-- PMM

Re: [Qemu-devel] [PATCH] Introduce DEVICE_CATEGORY_CPU for CPU devices

2017-01-20 Thread Thomas Huth

On 20.01.2017 13:31, Peter Maydell wrote:
> On 20 January 2017 at 12:18, Thomas Huth  wrote:
>> Now that the hot-pluggable CPUs show up in the help text of
>> "-device ?", we should group them into an appropriate category.
>>
>> Signed-off-by: Thomas Huth 
>> ---
>>  hw/ppc/pnv_core.c   | 1 +
>>  hw/ppc/spapr_cpu_core.c | 1 +
>>  include/hw/qdev-core.h  | 1 +
>>  qdev-monitor.c  | 1 +
>>  target/i386/cpu.c   | 2 ++
>>  5 files changed, 6 insertions(+)
> 
> Can we just set the category bit in the class init functions
> for the TYPE_CPU and TYPE_CPU_CORE superclasses, rather
> than having to do it in a subclass for each architecture
> (and thus potentially forgetting to do it for other archs) ?

Good idea, that seems to work, too. I'll send a v2 ...

 Thomas

[Qemu-devel] [PATCH v2] Introduce DEVICE_CATEGORY_CPU for CPU devices

2017-01-20 Thread Thomas Huth

Now that CPUs show up in the help text of "-device ?",
we should group them into an appropriate category.

Signed-off-by: Thomas Huth 
---
 v2:
 - set_bit in the TYPE_CPU and TYPE_CPU_CORES directly instead
   of doing this in the child classes

 hw/cpu/core.c  | 8 
 include/hw/qdev-core.h | 1 +
 qdev-monitor.c | 1 +
 qom/cpu.c  | 1 +
 4 files changed, 11 insertions(+)

diff --git a/hw/cpu/core.c b/hw/cpu/core.c
index eff90c1..2bf960d 100644
--- a/hw/cpu/core.c
+++ b/hw/cpu/core.c
@@ -72,10 +72,18 @@ static void cpu_core_instance_init(Object *obj)
 core->nr_threads = smp_threads;
 }
 
+static void cpu_core_class_init(ObjectClass *oc, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(oc);
+
+set_bit(DEVICE_CATEGORY_CPU, dc->categories);
+}
+
 static const TypeInfo cpu_core_type_info = {
 .name = TYPE_CPU_CORE,
 .parent = TYPE_DEVICE,
 .abstract = true,
+.class_init = cpu_core_class_init,
 .instance_size = sizeof(CPUCore),
 .instance_init = cpu_core_instance_init,
 };
diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
index 2c97347..b44b476 100644
--- a/include/hw/qdev-core.h
+++ b/include/hw/qdev-core.h
@@ -26,6 +26,7 @@ typedef enum DeviceCategory {
 DEVICE_CATEGORY_DISPLAY,
 DEVICE_CATEGORY_SOUND,
 DEVICE_CATEGORY_MISC,
+DEVICE_CATEGORY_CPU,
 DEVICE_CATEGORY_MAX
 } DeviceCategory;
 
diff --git a/qdev-monitor.c b/qdev-monitor.c
index c73410c..5f2fcdf 100644
--- a/qdev-monitor.c
+++ b/qdev-monitor.c
@@ -136,6 +136,7 @@ static void qdev_print_devinfos(bool show_no_user)
 [DEVICE_CATEGORY_DISPLAY] = "Display",
 [DEVICE_CATEGORY_SOUND]   = "Sound",
 [DEVICE_CATEGORY_MISC]= "Misc",
+[DEVICE_CATEGORY_CPU] = "CPU",
 [DEVICE_CATEGORY_MAX] = "Uncategorized",
 };
 GSList *list, *elt;
diff --git a/qom/cpu.c b/qom/cpu.c
index cee4e6f..2120aaa 100644
--- a/qom/cpu.c
+++ b/qom/cpu.c
@@ -415,6 +415,7 @@ static void cpu_class_init(ObjectClass *klass, void *data)
 k->cpu_exec_enter = cpu_common_noop;
 k->cpu_exec_exit = cpu_common_noop;
 k->cpu_exec_interrupt = cpu_common_exec_interrupt;
+set_bit(DEVICE_CATEGORY_CPU, dc->categories);
 dc->realize = cpu_common_realizefn;
 dc->unrealize = cpu_common_unrealizefn;
 /*
-- 
1.8.3.1

Re: [Qemu-devel] [PATCH v2 23/25] console: make screendump async

2017-01-20 Thread Marc-André Lureau

Hi

On Thu, Jan 19, 2017 at 12:21 PM Gerd Hoffmann  wrote:

> On Mi, 2017-01-18 at 20:03 +0400, Marc-André Lureau wrote:
> > +surface = qemu_console_surface(con);
> > +
> > +/* FIXME: async save with coroutine? it would have to copy or
> > lock
> > + * the surface. */
> > +ppm_save(filename, surface, &err);
> > +
>
> No need to lock or copy.
>
> ppm_save() uses the pixman image (surface->image) only anyway, so
> changing it to accept a pixman image instead of the surface is easy.
>
> pixman images are reference counted, so you can just grab a reference
> using pixman_image_ref() and run with it, without risking it'll be
> released underneath your feet, then unref when done.
>
>
If you only ref, you could have the image being modified while it is being
saved asynchronously, this could result in tearing artefacts, no?

-- 
Marc-André Lureau

[Qemu-devel] [PATCH RFC v4 00/20] VT-d: vfio enablement and misc enhances

2017-01-20 Thread Peter Xu

This is v4 of vt-d vfio enablement series.

Sorry that v4 growed to 20 patches. Some newly added patches (which
are quite necessary):

[01/20] vfio: trace map/unmap for notify as well
[02/20] vfio: introduce vfio_get_vaddr()
[03/20] vfio: allow to notify unmap for very large region

  Patches from RFC series:

  "[PATCH RFC 0/3] vfio: allow to notify unmap for very big region"

  Which is required by patch [19/20].

[11/20] memory: provide IOMMU_NOTIFIER_FOREACH macro

  A helper only.

[19/20] intel_iommu: unmap existing pages before replay

  This solves Alex's concern that there might have existing mappings
  in previous domain when replay happens.

[20/20] intel_iommu: replay even with DSI/GLOBAL inv desc

  This solves Jason/Kevin's concern by handling DSI/GLOBAL
  invalidations as well.

Each individual patch will have more detailed explanation on itself.
Please refer to each of them.

Here I did separate work on patch 19/20 rather than squashing them
into patch 18 for easier modification and review. I prefer we have
them separately so we can see each problem separately, after all,
patch 18 survives in most use cases. Please let me know if we want to
squash them in some way. I can respin when necessary.

Besides the big things, lots of tiny tweaks as well. Here's the
changelog.

v4:
- convert all error_report()s into traces (in the two patches that did
  that)
- rebased to Jason's DMAR series (master + one more patch:
  "[PATCH V4 net-next] vhost_net: device IOTLB support")
- let vhost use the new api iommu_notifier_init() so it won't break
  vhost dmar [Jason]
- touch commit message of the patch:
  "intel_iommu: provide its own replay() callback"
  old replay is not a dead loop, but it will just consume lots of time
  [Jason]
- add comment for patch:
  "intel_iommu: do replay when context invalidate"
  telling why replay won't be a problem even without CM=1 [Jason]
- remove a useless comment line [Jason]
- remove dmar_enabled parameter for vtd_switch_address_space() and
  vtd_switch_address_space_all() [Mst, Jason]
- merged the vfio patches in, to support unmap of big ranges at the
  beginning ("[PATCH RFC 0/3] vfio: allow to notify unmap for very big
  region")
- using caching_mode instead of cache_mode_enabled, and "caching-mode"
  instead of "cache-mode" [Kevin]
- when receive context entry invalidation, we unmap the entire region
  first, then replay [Alex]
- fix commit message for patch:
  "intel_iommu: simplify irq region translation" [Kevin]
- handle domain/global invalidation, and notify where proper [Jason,
  Kevin]

v3:
- fix style error reported by patchew
- fix comment in domain switch patch: use "IOMMU address space" rather
  than "IOMMU region" [Kevin]
- add ack-by for Paolo in patch:
  "memory: add section range info for IOMMU notifier"
  (this is seperately collected besides this thread)
- remove 3 patches which are merged already (from Jason)
- rebase to master b6c0897

v2:
- change comment for "end" parameter in vtd_page_walk() [Tianyu]
- change comment for "a iova" to "an iova" [Yi]
- fix fault printed val for GPA address in vtd_page_walk_level (debug
  only)
- rebased to master (rather than Aviv's v6 series) and merged Aviv's
  series v6: picked patch 1 (as patch 1 in this series), dropped patch
  2, re-wrote patch 3 (as patch 17 of this series).
- picked up two more bugfix patches from Jason's DMAR series
- picked up the following patch as well:
  "[PATCH v3] intel_iommu: allow dynamic switch of IOMMU region"

This RFC series is a re-work for Aviv B.D.'s vfio enablement series
with vt-d:

  https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg01452.html

Aviv has done a great job there, and what we still lack there are
mostly the following:

(1) VFIO got duplicated IOTLB notifications due to splitted VT-d IOMMU
memory region.

(2) VT-d still haven't provide a correct replay() mechanism (e.g.,
when IOMMU domain switches, things will broke).

This series should have solved the above two issues.

Online repo:

  https://github.com/xzpeter/qemu/tree/vtd-vfio-enablement-v4

I would be glad to hear about any review comments for above patches.

=
Test Done
=

Build test passed for x86_64/arm/ppc64.

Simply tested with x86_64, assigning two PCI devices to a single VM,
boot the VM using:

bin=x86_64-softmmu/qemu-system-x86_64
$bin -M q35,accel=kvm,kernel-irqchip=split -m 1G \
 -device intel-iommu,intremap=on,eim=off,caching-mode=on \
 -netdev user,id=net0,hostfwd=tcp::-:22 \
 -device virtio-net-pci,netdev=net0 \
 -device vfio-pci,host=03:00.0 \
 -device vfio-pci,host=02:00.0 \
 -trace events=".trace.vfio" \
 /var/lib/libvirt/images/vm1.qcow2

pxdev:bin [vtd-vfio-enablement]# cat .trace.vfio
vtd_page_walk*
vtd_replay*
vtd_inv_desc*

Then, in the guest, run the following tool:

  
https://github.com/xzpeter/clibs/blob/master/gpl/userspace/vfio-bind-group/vfio-bind-group.c

With parameter:

  ./vfio-bind-group 00:03.0 00:04.0

Chec

[Qemu-devel] [PATCH RFC v4 01/20] vfio: trace map/unmap for notify as well

2017-01-20 Thread Peter Xu

We traces its range, but we don't know whether it's a MAP/UNMAP. Let's
dump it as well.

Signed-off-by: Peter Xu 
---
 hw/vfio/common.c | 3 ++-
 hw/vfio/trace-events | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 801578b..174f351 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -305,7 +305,8 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 void *vaddr;
 int ret;
 
-trace_vfio_iommu_map_notify(iova, iova + iotlb->addr_mask);
+trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
+iova, iova + iotlb->addr_mask);
 
 if (iotlb->target_as != &address_space_memory) {
 error_report("Wrong target AS \"%s\", only system memory is allowed",
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index ef81609..7ae8233 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -84,7 +84,7 @@ vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s"
 # hw/vfio/common.c
 vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, 
unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
 vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t 
data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
-vfio_iommu_map_notify(uint64_t iova_start, uint64_t iova_end) "iommu map @ 
%"PRIx64" - %"PRIx64
+vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) 
"iommu %s @ %"PRIx64" - %"PRIx64
 vfio_listener_region_add_skip(uint64_t start, uint64_t end) "SKIPPING 
region_add %"PRIx64" - %"PRIx64
 vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add 
[iommu] %"PRIx64" - %"PRIx64
 vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void 
*vaddr) "region_add [ram] %"PRIx64" - %"PRIx64" [%p]"
-- 
2.7.4

[Qemu-devel] [PATCH RFC v4 03/20] vfio: allow to notify unmap for very large region

2017-01-20 Thread Peter Xu

Linux vfio driver supports to do VFIO_IOMMU_UNMAP_DMA for a very big
region. This can be leveraged by QEMU IOMMU implementation to cleanup
existing page mappings for an entire iova address space (by notifying
with an IOTLB with extremely huge addr_mask). However current
vfio_iommu_map_notify() does not allow that. It make sure that all the
translated address in IOTLB is falling into RAM range.

The check makes sense, but it should only be a sensible checker for
mapping operations, and mean little for unmap operations.

This patch moves this check into map logic only, so that we'll get
faster unmap handling (no need to translate again), and also we can then
better support unmapping a very big region when it covers non-ram ranges
or even not-existing ranges.

Signed-off-by: Peter Xu 
---
 hw/vfio/common.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index ce55dff..4d90844 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -354,11 +354,10 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 return;
 }
 
-if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) {
-return;
-}
-
 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
+if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) {
+return;
+}
 ret = vfio_dma_map(container, iova,
iotlb->addr_mask + 1, vaddr,
read_only);
-- 
2.7.4

[Qemu-devel] [PATCH RFC v4 04/20] IOMMU: add option to enable VTD_CAP_CM to vIOMMU capility exposoed to guest

2017-01-20 Thread Peter Xu

From: Aviv Ben-David 

This capability asks the guest to invalidate cache before each map operation.
We can use this invalidation to trap map operations in the hypervisor.

Signed-off-by: Aviv Ben-David 
[using "caching-mode" instead of "cache-mode" to align with spec]
Signed-off-by: Peter Xu 
---
 hw/i386/intel_iommu.c  | 5 +
 hw/i386/intel_iommu_internal.h | 1 +
 include/hw/i386/intel_iommu.h  | 2 ++
 3 files changed, 8 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index ec62239..e58f1de 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2107,6 +2107,7 @@ static Property vtd_properties[] = {
 DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
 ON_OFF_AUTO_AUTO),
 DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
+DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
 DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -2488,6 +2489,10 @@ static void vtd_init(IntelIOMMUState *s)
 s->ecap |= VTD_ECAP_DT;
 }
 
+if (s->caching_mode) {
+s->cap |= VTD_CAP_CM;
+}
+
 vtd_reset_context_cache(s);
 vtd_reset_iotlb(s);
 
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 356f188..4104121 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -202,6 +202,7 @@
 #define VTD_CAP_MAMV(VTD_MAMV << 48)
 #define VTD_CAP_PSI (1ULL << 39)
 #define VTD_CAP_SLLPS   ((1ULL << 34) | (1ULL << 35))
+#define VTD_CAP_CM  (1ULL << 7)
 
 /* Supported Adjusted Guest Address Widths */
 #define VTD_CAP_SAGAW_SHIFT 8
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 405c9d1..fe645aa 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -257,6 +257,8 @@ struct IntelIOMMUState {
 uint8_t womask[DMAR_REG_SIZE];  /* WO (write only - read returns 0) */
 uint32_t version;
 
+bool caching_mode;  /* RO - is cap CM enabled? */
+
 dma_addr_t root;/* Current root table pointer */
 bool root_extended; /* Type of root table (extended or not) */
 bool dmar_enabled;  /* Set if DMA remapping is enabled */
-- 
2.7.4

[Qemu-devel] [PATCH RFC v4 02/20] vfio: introduce vfio_get_vaddr()

2017-01-20 Thread Peter Xu

A cleanup for vfio_iommu_map_notify(). Should have no functional change,
just to make the function shorter and easier to understand.

Signed-off-by: Peter Xu 
---
 hw/vfio/common.c | 58 +---
 1 file changed, 38 insertions(+), 20 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 174f351..ce55dff 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -294,25 +294,14 @@ static bool 
vfio_listener_skipped_section(MemoryRegionSection *section)
section->offset_within_address_space & (1ULL << 63);
 }
 
-static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
+static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
+   bool *read_only)
 {
-VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
-VFIOContainer *container = giommu->container;
-hwaddr iova = iotlb->iova + giommu->iommu_offset;
 MemoryRegion *mr;
 hwaddr xlat;
 hwaddr len = iotlb->addr_mask + 1;
-void *vaddr;
-int ret;
-
-trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
-iova, iova + iotlb->addr_mask);
-
-if (iotlb->target_as != &address_space_memory) {
-error_report("Wrong target AS \"%s\", only system memory is allowed",
- iotlb->target_as->name ? iotlb->target_as->name : "none");
-return;
-}
+bool ret = false;
+bool writable = iotlb->perm & IOMMU_WO;
 
 /*
  * The IOMMU TLB entry we have just covers translation through
@@ -322,12 +311,13 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 rcu_read_lock();
 mr = address_space_translate(&address_space_memory,
  iotlb->translated_addr,
- &xlat, &len, iotlb->perm & IOMMU_WO);
+ &xlat, &len, writable);
 if (!memory_region_is_ram(mr)) {
 error_report("iommu map to non memory area %"HWADDR_PRIx"",
  xlat);
 goto out;
 }
+
 /*
  * Translation truncates length to the IOMMU page size,
  * check that it did not truncate too much.
@@ -337,11 +327,41 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
 goto out;
 }
 
+*vaddr = memory_region_get_ram_ptr(mr) + xlat;
+*read_only = !writable || mr->readonly;
+ret = true;
+
+out:
+rcu_read_unlock();
+return ret;
+}
+
+static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
+{
+VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
+VFIOContainer *container = giommu->container;
+hwaddr iova = iotlb->iova + giommu->iommu_offset;
+bool read_only;
+void *vaddr;
+int ret;
+
+trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
+iova, iova + iotlb->addr_mask);
+
+if (iotlb->target_as != &address_space_memory) {
+error_report("Wrong target AS \"%s\", only system memory is allowed",
+ iotlb->target_as->name ? iotlb->target_as->name : "none");
+return;
+}
+
+if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) {
+return;
+}
+
 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
-vaddr = memory_region_get_ram_ptr(mr) + xlat;
 ret = vfio_dma_map(container, iova,
iotlb->addr_mask + 1, vaddr,
-   !(iotlb->perm & IOMMU_WO) || mr->readonly);
+   read_only);
 if (ret) {
 error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
  "0x%"HWADDR_PRIx", %p) = %d (%m)",
@@ -357,8 +377,6 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
  iotlb->addr_mask + 1, ret);
 }
 }
-out:
-rcu_read_unlock();
 }
 
 static void vfio_listener_region_add(MemoryListener *listener,
-- 
2.7.4

[Qemu-devel] [PATCH RFC v4 05/20] intel_iommu: simplify irq region translation

2017-01-20 Thread Peter Xu

Now we have a standalone memory region for MSI, all the irq region
requests should be redirected there. Cleaning up the block with an
assertion instead.

Signed-off-by: Peter Xu 
---
 hw/i386/intel_iommu.c | 28 ++--
 1 file changed, 6 insertions(+), 22 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index e58f1de..55b8ff4 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -818,28 +818,12 @@ static void vtd_do_iommu_translate(VTDAddressSpace 
*vtd_as, PCIBus *bus,
 bool writes = true;
 VTDIOTLBEntry *iotlb_entry;
 
-/* Check if the request is in interrupt address range */
-if (vtd_is_interrupt_addr(addr)) {
-if (is_write) {
-/* FIXME: since we don't know the length of the access here, we
- * treat Non-DWORD length write requests without PASID as
- * interrupt requests, too. Withoud interrupt remapping support,
- * we just use 1:1 mapping.
- */
-VTD_DPRINTF(MMU, "write request to interrupt address "
-"gpa 0x%"PRIx64, addr);
-entry->iova = addr & VTD_PAGE_MASK_4K;
-entry->translated_addr = addr & VTD_PAGE_MASK_4K;
-entry->addr_mask = ~VTD_PAGE_MASK_4K;
-entry->perm = IOMMU_WO;
-return;
-} else {
-VTD_DPRINTF(GENERAL, "error: read request from interrupt address "
-"gpa 0x%"PRIx64, addr);
-vtd_report_dmar_fault(s, source_id, addr, VTD_FR_READ, is_write);
-return;
-}
-}
+/*
+ * We have standalone memory region for interrupt addresses, we
+ * should never receive translation requests in this region.
+ */
+assert(!vtd_is_interrupt_addr(addr));
+
 /* Try to fetch slpte form IOTLB */
 iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
 if (iotlb_entry) {
-- 
2.7.4

[Qemu-devel] [PATCH RFC v4 08/20] intel_iommu: fix trace for addr translation

2017-01-20 Thread Peter Xu

Another patch to convert the DPRINTF() stuffs. This patch focuses on the
address translation path and caching.

Signed-off-by: Peter Xu 
---
 hw/i386/intel_iommu.c | 84 ---
 hw/i386/trace-events  |  7 +
 2 files changed, 39 insertions(+), 52 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 343a2ad..2c13b7b 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -260,11 +260,9 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t 
source_id,
 uint64_t *key = g_malloc(sizeof(*key));
 uint64_t gfn = vtd_get_iotlb_gfn(addr, level);
 
-VTD_DPRINTF(CACHE, "update iotlb sid 0x%"PRIx16 " iova 0x%"PRIx64
-" slpte 0x%"PRIx64 " did 0x%"PRIx16, source_id, addr, slpte,
-domain_id);
+trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id);
 if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) {
-VTD_DPRINTF(CACHE, "iotlb exceeds size limit, forced to reset");
+trace_vtd_iotlb_reset("iotlb exceeds size limit");
 vtd_reset_iotlb(s);
 }
 
@@ -505,8 +503,7 @@ static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t 
index,
 
 addr = s->root + index * sizeof(*re);
 if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) {
-VTD_DPRINTF(GENERAL, "error: fail to access root-entry at 0x%"PRIx64
-" + %"PRIu8, s->root, index);
+trace_vtd_err("Fail to access root-entry");
 re->val = 0;
 return -VTD_FR_ROOT_TABLE_INV;
 }
@@ -525,14 +522,12 @@ static int vtd_get_context_entry_from_root(VTDRootEntry 
*root, uint8_t index,
 dma_addr_t addr;
 
 if (!vtd_root_entry_present(root)) {
-VTD_DPRINTF(GENERAL, "error: root-entry is not present");
+trace_vtd_err("Root-entry is not present");
 return -VTD_FR_ROOT_ENTRY_P;
 }
 addr = (root->val & VTD_ROOT_ENTRY_CTP) + index * sizeof(*ce);
 if (dma_memory_read(&address_space_memory, addr, ce, sizeof(*ce))) {
-VTD_DPRINTF(GENERAL, "error: fail to access context-entry at 0x%"PRIx64
-" + %"PRIu8,
-(uint64_t)(root->val & VTD_ROOT_ENTRY_CTP), index);
+trace_vtd_err("Fail to access context-entry");
 return -VTD_FR_CONTEXT_TABLE_INV;
 }
 ce->lo = le64_to_cpu(ce->lo);
@@ -644,7 +639,7 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
iova, bool is_write,
  * in CAP_REG and AW in context-entry.
  */
 if (iova & ~((1ULL << MIN(ce_agaw, VTD_MGAW)) - 1)) {
-VTD_DPRINTF(GENERAL, "error: iova 0x%"PRIx64 " exceeds limits", iova);
+trace_vtd_err("IOVA exceeds limits");
 return -VTD_FR_ADDR_BEYOND_MGAW;
 }
 
@@ -656,9 +651,7 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
iova, bool is_write,
 slpte = vtd_get_slpte(addr, offset);
 
 if (slpte == (uint64_t)-1) {
-VTD_DPRINTF(GENERAL, "error: fail to access second-level paging "
-"entry at level %"PRIu32 " for iova 0x%"PRIx64,
-level, iova);
+trace_vtd_err("Fail to access second-level paging entry");
 if (level == vtd_get_level_from_context_entry(ce)) {
 /* Invalid programming of context-entry */
 return -VTD_FR_CONTEXT_ENTRY_INV;
@@ -669,15 +662,11 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
iova, bool is_write,
 *reads = (*reads) && (slpte & VTD_SL_R);
 *writes = (*writes) && (slpte & VTD_SL_W);
 if (!(slpte & access_right_check)) {
-VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
-"iova 0x%"PRIx64 " slpte 0x%"PRIx64,
-(is_write ? "write" : "read"), iova, slpte);
+trace_vtd_err("Lack of permission for page");
 return is_write ? -VTD_FR_WRITE : -VTD_FR_READ;
 }
 if (vtd_slpte_nonzero_rsvd(slpte, level)) {
-VTD_DPRINTF(GENERAL, "error: non-zero reserved field in second "
-"level paging entry level %"PRIu32 " slpte 0x%"PRIx64,
-level, slpte);
+trace_vtd_err_nonzero_reserved("second level paging entry");
 return -VTD_FR_PAGING_ENTRY_RSVD;
 }
 
@@ -704,12 +693,11 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, 
uint8_t bus_num,
 }
 
 if (!vtd_root_entry_present(&re)) {
-VTD_DPRINTF(GENERAL, "error: root-entry #%"PRIu8 " is not present",
-bus_num);
+/* Not error - it's okay we don't have root entry. */
+trace_vtd_re_not_present(bus_num);
 return -VTD_FR_ROOT_ENTRY_P;
 } else if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD)) {
-VTD_DPRINTF(GENERAL, "error: non-zero reserved field in root-entry "
-"hi 0x%"PRIx64 " lo 0x%"PRIx64, re.rsvd, re.val);
+trace_vtd_err_

[Qemu-devel] [PATCH RFC v4 10/20] memory: add section range info for IOMMU notifier

2017-01-20 Thread Peter Xu

In this patch, IOMMUNotifier.{start|end} are introduced to store section
information for a specific notifier. When notification occurs, we not
only check the notification type (MAP|UNMAP), but also check whether the
notified iova is in the range of specific IOMMU notifier, and skip those
notifiers if not in the listened range.

When removing an region, we need to make sure we removed the correct
VFIOGuestIOMMU by checking the IOMMUNotifier.start address as well.

Suggested-by: David Gibson 
Signed-off-by: Peter Xu 
---
changelog (start from vt-d vfio enablement series v3):
v4:
- introduce memory_region_iommu_notifier_init() [Jason]
---
 hw/vfio/common.c  | 12 +---
 hw/virtio/vhost.c |  4 ++--
 include/exec/memory.h | 19 ++-
 memory.c  |  5 -
 4 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 4d90844..49dc035 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -471,8 +471,13 @@ static void vfio_listener_region_add(MemoryListener 
*listener,
 giommu->iommu_offset = section->offset_within_address_space -
section->offset_within_region;
 giommu->container = container;
-giommu->n.notify = vfio_iommu_map_notify;
-giommu->n.notifier_flags = IOMMU_NOTIFIER_ALL;
+llend = int128_add(int128_make64(section->offset_within_region),
+   section->size);
+llend = int128_sub(llend, int128_one());
+iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
+IOMMU_NOTIFIER_ALL,
+section->offset_within_region,
+int128_get64(llend));
 QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
 
 memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
@@ -543,7 +548,8 @@ static void vfio_listener_region_del(MemoryListener 
*listener,
 VFIOGuestIOMMU *giommu;
 
 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
-if (giommu->iommu == section->mr) {
+if (giommu->iommu == section->mr &&
+giommu->n.start == section->offset_within_region) {
 memory_region_unregister_iommu_notifier(giommu->iommu,
 &giommu->n);
 QLIST_REMOVE(giommu, giommu_next);
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 9cacf55..cc99c6a 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1242,8 +1242,8 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
 .priority = 10
 };
 
-hdev->n.notify = vhost_iommu_unmap_notify;
-hdev->n.notifier_flags = IOMMU_NOTIFIER_UNMAP;
+iommu_notifier_init(&hdev->n, vhost_iommu_unmap_notify,
+IOMMU_NOTIFIER_UNMAP, 0, ~0ULL);
 
 if (hdev->migration_blocker == NULL) {
 if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
diff --git a/include/exec/memory.h b/include/exec/memory.h
index bec9756..ae4c9a9 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -81,13 +81,30 @@ typedef enum {
 
 #define IOMMU_NOTIFIER_ALL (IOMMU_NOTIFIER_MAP | IOMMU_NOTIFIER_UNMAP)
 
+struct IOMMUNotifier;
+typedef void (*IOMMUNotify)(struct IOMMUNotifier *notifier,
+IOMMUTLBEntry *data);
+
 struct IOMMUNotifier {
-void (*notify)(struct IOMMUNotifier *notifier, IOMMUTLBEntry *data);
+IOMMUNotify notify;
 IOMMUNotifierFlag notifier_flags;
+/* Notify for address space range start <= addr <= end */
+hwaddr start;
+hwaddr end;
 QLIST_ENTRY(IOMMUNotifier) node;
 };
 typedef struct IOMMUNotifier IOMMUNotifier;
 
+static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
+   IOMMUNotifierFlag flags,
+   hwaddr start, hwaddr end)
+{
+n->notify = fn;
+n->notifier_flags = flags;
+n->start = start;
+n->end = end;
+}
+
 /* New-style MMIO accessors can indicate that the transaction failed.
  * A zero (MEMTX_OK) response means success; anything else is a failure
  * of some kind. The memory subsystem will bitwise-OR together results
diff --git a/memory.c b/memory.c
index 2bfc37f..89104b1 100644
--- a/memory.c
+++ b/memory.c
@@ -1610,6 +1610,7 @@ void memory_region_register_iommu_notifier(MemoryRegion 
*mr,
 
 /* We need to register for at least one bitfield */
 assert(n->notifier_flags != IOMMU_NOTIFIER_NONE);
+assert(n->start <= n->end);
 QLIST_INSERT_HEAD(&mr->iommu_notify, n, node);
 memory_region_update_iommu_notify_flags(mr);
 }
@@ -1671,7 +1672,9 @@ void memory_region_notify_iommu(MemoryRegion *mr,
 }
 
 QLIST_FOREACH(iommu_notifier, &mr->iommu_notify, node) {
-if (iommu_notifier->notifier_flags & request_flags) {
+if (iommu_notifier->notifier_flags & request_flags &&
+iommu_notifie

[Qemu-devel] [PATCH RFC v4 12/20] memory: provide iommu_replay_all()

2017-01-20 Thread Peter Xu

This is an "global" version of exising memory_region_iommu_replay() - we
announce the translations to all the registered notifiers, instead of a
specific one.

Signed-off-by: Peter Xu 
---
 include/exec/memory.h | 8 
 memory.c  | 9 +
 2 files changed, 17 insertions(+)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index f0cb631..885c05f 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -711,6 +711,14 @@ void memory_region_iommu_replay(MemoryRegion *mr, 
IOMMUNotifier *n,
 bool is_write);
 
 /**
+ * memory_region_iommu_replay_all: replay existing IOMMU translations
+ * to all the notifiers registered.
+ *
+ * @mr: the memory region to observe
+ */
+void memory_region_iommu_replay_all(MemoryRegion *mr);
+
+/**
  * memory_region_unregister_iommu_notifier: unregister a notifier for
  * changes to IOMMU translation entries.
  *
diff --git a/memory.c b/memory.c
index d1ee3e0..068666a 100644
--- a/memory.c
+++ b/memory.c
@@ -1646,6 +1646,15 @@ void memory_region_iommu_replay(MemoryRegion *mr, 
IOMMUNotifier *n,
 }
 }
 
+void memory_region_iommu_replay_all(MemoryRegion *mr)
+{
+IOMMUNotifier *notifier;
+
+IOMMU_NOTIFIER_FOREACH(notifier, mr) {
+memory_region_iommu_replay(mr, notifier, false);
+}
+}
+
 void memory_region_unregister_iommu_notifier(MemoryRegion *mr,
  IOMMUNotifier *n)
 {
-- 
2.7.4

[Qemu-devel] [PATCH RFC v4 09/20] intel_iommu: vtd_slpt_level_shift check level

2017-01-20 Thread Peter Xu

This helps in debugging incorrect level passed in.

Signed-off-by: Peter Xu 
---
 hw/i386/intel_iommu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 2c13b7b..6f5f68a 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -168,6 +168,7 @@ static gboolean vtd_hash_remove_by_domain(gpointer key, 
gpointer value,
 /* The shift of an addr for a certain level of paging structure */
 static inline uint32_t vtd_slpt_level_shift(uint32_t level)
 {
+assert(level != 0);
 return VTD_PAGE_SHIFT_4K + (level - 1) * VTD_SL_LEVEL_BITS;
 }
 
-- 
2.7.4

[Qemu-devel] [PATCH RFC v4 06/20] intel_iommu: renaming gpa to iova where proper

2017-01-20 Thread Peter Xu

There are lots of places in current intel_iommu.c codes that named
"iova" as "gpa". It is really confusing to use a name "gpa" in these
places (which is very easily to be understood as "Guest Physical
Address", while it's not). To make the codes (much) easier to be read, I
decided to do this once and for all.

No functional change is made. Only literal ones.

Signed-off-by: Peter Xu 
---
 hw/i386/intel_iommu.c | 36 ++--
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 55b8ff4..b934b56 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -259,7 +259,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t 
source_id,
 uint64_t *key = g_malloc(sizeof(*key));
 uint64_t gfn = vtd_get_iotlb_gfn(addr, level);
 
-VTD_DPRINTF(CACHE, "update iotlb sid 0x%"PRIx16 " gpa 0x%"PRIx64
+VTD_DPRINTF(CACHE, "update iotlb sid 0x%"PRIx16 " iova 0x%"PRIx64
 " slpte 0x%"PRIx64 " did 0x%"PRIx16, source_id, addr, slpte,
 domain_id);
 if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) {
@@ -575,12 +575,12 @@ static uint64_t vtd_get_slpte(dma_addr_t base_addr, 
uint32_t index)
 return slpte;
 }
 
-/* Given a gpa and the level of paging structure, return the offset of current
- * level.
+/* Given an iova and the level of paging structure, return the offset
+ * of current level.
  */
-static inline uint32_t vtd_gpa_level_offset(uint64_t gpa, uint32_t level)
+static inline uint32_t vtd_iova_level_offset(uint64_t iova, uint32_t level)
 {
-return (gpa >> vtd_slpt_level_shift(level)) &
+return (iova >> vtd_slpt_level_shift(level)) &
 ((1ULL << VTD_SL_LEVEL_BITS) - 1);
 }
 
@@ -628,10 +628,10 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, 
uint32_t level)
 }
 }
 
-/* Given the @gpa, get relevant @slptep. @slpte_level will be the last level
+/* Given the @iova, get relevant @slptep. @slpte_level will be the last level
  * of the translation, can be used for deciding the size of large page.
  */
-static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool is_write,
+static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
 uint64_t *slptep, uint32_t *slpte_level,
 bool *reads, bool *writes)
 {
@@ -642,11 +642,11 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, bool is_write,
 uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce);
 uint64_t access_right_check;
 
-/* Check if @gpa is above 2^X-1, where X is the minimum of MGAW in CAP_REG
- * and AW in context-entry.
+/* Check if @iova is above 2^X-1, where X is the minimum of MGAW
+ * in CAP_REG and AW in context-entry.
  */
-if (gpa & ~((1ULL << MIN(ce_agaw, VTD_MGAW)) - 1)) {
-VTD_DPRINTF(GENERAL, "error: gpa 0x%"PRIx64 " exceeds limits", gpa);
+if (iova & ~((1ULL << MIN(ce_agaw, VTD_MGAW)) - 1)) {
+VTD_DPRINTF(GENERAL, "error: iova 0x%"PRIx64 " exceeds limits", iova);
 return -VTD_FR_ADDR_BEYOND_MGAW;
 }
 
@@ -654,13 +654,13 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, bool is_write,
 access_right_check = is_write ? VTD_SL_W : VTD_SL_R;
 
 while (true) {
-offset = vtd_gpa_level_offset(gpa, level);
+offset = vtd_iova_level_offset(iova, level);
 slpte = vtd_get_slpte(addr, offset);
 
 if (slpte == (uint64_t)-1) {
 VTD_DPRINTF(GENERAL, "error: fail to access second-level paging "
-"entry at level %"PRIu32 " for gpa 0x%"PRIx64,
-level, gpa);
+"entry at level %"PRIu32 " for iova 0x%"PRIx64,
+level, iova);
 if (level == vtd_get_level_from_context_entry(ce)) {
 /* Invalid programming of context-entry */
 return -VTD_FR_CONTEXT_ENTRY_INV;
@@ -672,8 +672,8 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
gpa, bool is_write,
 *writes = (*writes) && (slpte & VTD_SL_W);
 if (!(slpte & access_right_check)) {
 VTD_DPRINTF(GENERAL, "error: lack of %s permission for "
-"gpa 0x%"PRIx64 " slpte 0x%"PRIx64,
-(is_write ? "write" : "read"), gpa, slpte);
+"iova 0x%"PRIx64 " slpte 0x%"PRIx64,
+(is_write ? "write" : "read"), iova, slpte);
 return is_write ? -VTD_FR_WRITE : -VTD_FR_READ;
 }
 if (vtd_slpte_nonzero_rsvd(slpte, level)) {
@@ -827,7 +827,7 @@ static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, 
PCIBus *bus,
 /* Try to fetch slpte form IOTLB */
 iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
 if (iotlb_entry) {
-VTD_DPRINTF(CACHE, "hit iotlb sid 0x%"PRIx16 " gpa 0x%"PRIx64
+VTD_DPRINTF(CACHE, "hit iotlb sid 0x%"PRIx16 "

[Qemu-devel] [PATCH RFC v4 13/20] memory: introduce memory_region_notify_one()

2017-01-20 Thread Peter Xu

Generalizing the notify logic in memory_region_notify_iommu() into a
single function. This can be further used in customized replay()
functions for IOMMUs.

Signed-off-by: Peter Xu 
---
 include/exec/memory.h | 15 +++
 memory.c  | 29 ++---
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 885c05f..75371e9 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -686,6 +686,21 @@ void memory_region_notify_iommu(MemoryRegion *mr,
 IOMMUTLBEntry entry);
 
 /**
+ * memory_region_notify_one: notify a change in an IOMMU translation
+ *   entry to a single notifier
+ *
+ * This works just like memory_region_notify_iommu(), but it only
+ * notifies a specific notifier, not all of them.
+ *
+ * @notifier: the notifier to be notified
+ * @entry: the new entry in the IOMMU translation table.  The entry
+ * replaces all old entries for the same virtual I/O address range.
+ * Deleted entries have .@perm == 0.
+ */
+void memory_region_notify_one(IOMMUNotifier *notifier,
+  IOMMUTLBEntry *entry);
+
+/**
  * memory_region_register_iommu_notifier: register a notifier for changes to
  * IOMMU translation entries.
  *
diff --git a/memory.c b/memory.c
index 068666a..a4affda 100644
--- a/memory.c
+++ b/memory.c
@@ -1666,26 +1666,33 @@ void 
memory_region_unregister_iommu_notifier(MemoryRegion *mr,
 memory_region_update_iommu_notify_flags(mr);
 }
 
-void memory_region_notify_iommu(MemoryRegion *mr,
-IOMMUTLBEntry entry)
+void memory_region_notify_one(IOMMUNotifier *notifier,
+  IOMMUTLBEntry *entry)
 {
-IOMMUNotifier *iommu_notifier;
 IOMMUNotifierFlag request_flags;
 
-assert(memory_region_is_iommu(mr));
-
-if (entry.perm & IOMMU_RW) {
+if (entry->perm & IOMMU_RW) {
 request_flags = IOMMU_NOTIFIER_MAP;
 } else {
 request_flags = IOMMU_NOTIFIER_UNMAP;
 }
 
+if (notifier->notifier_flags & request_flags &&
+notifier->start <= entry->iova &&
+notifier->end >= entry->iova) {
+notifier->notify(notifier, entry);
+}
+}
+
+void memory_region_notify_iommu(MemoryRegion *mr,
+IOMMUTLBEntry entry)
+{
+IOMMUNotifier *iommu_notifier;
+
+assert(memory_region_is_iommu(mr));
+
 IOMMU_NOTIFIER_FOREACH(iommu_notifier, mr) {
-if (iommu_notifier->notifier_flags & request_flags &&
-iommu_notifier->start <= entry.iova &&
-iommu_notifier->end >= entry.iova) {
-iommu_notifier->notify(iommu_notifier, &entry);
-}
+memory_region_notify_one(iommu_notifier, &entry);
 }
 }
 
-- 
2.7.4

[Qemu-devel] [PATCH RFC v4 16/20] intel_iommu: do replay when context invalidate

2017-01-20 Thread Peter Xu

Before this one we only invalidate context cache when we receive context
entry invalidations. However it's possible that the invalidation also
contains a domain switch (only if cache-mode is enabled for vIOMMU). In
that case we need to notify all the registered components about the new
mapping.

Signed-off-by: Peter Xu 
---
 hw/i386/intel_iommu.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index f9c5142..4b08b4d 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1146,6 +1146,16 @@ static void 
vtd_context_device_invalidate(IntelIOMMUState *s,
 trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it),
  VTD_PCI_FUNC(devfn_it));
 vtd_as->context_cache_entry.context_cache_gen = 0;
+/*
+ * So a device is moving out of (or moving into) a
+ * domain, a replay() suites here to notify all the
+ * IOMMU_NOTIFIER_MAP registers about this change.
+ * This won't bring bad even if we have no such
+ * notifier registered - the IOMMU notification
+ * framework will skip MAP notifications if that
+ * happened.
+ */
+memory_region_iommu_replay_all(&vtd_as->iommu);
 }
 }
 }
-- 
2.7.4

[Qemu-devel] [PATCH RFC v4 07/20] intel_iommu: fix trace for inv desc handling

2017-01-20 Thread Peter Xu

VT-d codes are still using static DEBUG_INTEL_IOMMU macro. That's not
good, and we should end the day when we need to recompile the code
before getting useful debugging information for vt-d. Time to switch to
the trace system.

This is the first patch to do it.

Generally, the rule of mine is:

- for the old GENERAL typed message, I use trace_vtd_err*() in general.

- for the non-GENERAL typed messages, convert into specified trace_*().

- for useless DPRINTFs, I removed them.

Signed-off-by: Peter Xu 
---
 hw/i386/intel_iommu.c | 97 +--
 hw/i386/trace-events  | 15 
 2 files changed, 54 insertions(+), 58 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index b934b56..343a2ad 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -35,6 +35,7 @@
 #include "sysemu/kvm.h"
 #include "hw/i386/apic_internal.h"
 #include "kvm_i386.h"
+#include "trace.h"
 
 /*#define DEBUG_INTEL_IOMMU*/
 #ifdef DEBUG_INTEL_IOMMU
@@ -474,22 +475,19 @@ static void vtd_handle_inv_queue_error(IntelIOMMUState *s)
 /* Set the IWC field and try to generate an invalidation completion interrupt 
*/
 static void vtd_generate_completion_event(IntelIOMMUState *s)
 {
-VTD_DPRINTF(INV, "completes an invalidation wait command with "
-"Interrupt Flag");
 if (vtd_get_long_raw(s, DMAR_ICS_REG) & VTD_ICS_IWC) {
-VTD_DPRINTF(INV, "there is a previous interrupt condition to be "
-"serviced by software, "
-"new invalidation event is not generated");
+trace_vtd_inv_desc_wait_irq("One pending, skip current");
 return;
 }
 vtd_set_clear_mask_long(s, DMAR_ICS_REG, 0, VTD_ICS_IWC);
 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, 0, VTD_IECTL_IP);
 if (vtd_get_long_raw(s, DMAR_IECTL_REG) & VTD_IECTL_IM) {
-VTD_DPRINTF(INV, "IM filed in IECTL_REG is set, new invalidation "
-"event is not generated");
+trace_vtd_inv_desc_wait_irq("IM in IECTL_REG is set, "
+"new event not generated");
 return;
 } else {
 /* Generate the interrupt event */
+trace_vtd_inv_desc_wait_irq("Generating complete event");
 vtd_generate_interrupt(s, DMAR_IEADDR_REG, DMAR_IEDATA_REG);
 vtd_set_clear_mask_long(s, DMAR_IECTL_REG, VTD_IECTL_IP, 0);
 }
@@ -923,6 +921,7 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState 
*s)
 
 static void vtd_context_global_invalidate(IntelIOMMUState *s)
 {
+trace_vtd_inv_desc_cc_global();
 s->context_cache_gen++;
 if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) {
 vtd_reset_context_cache(s);
@@ -962,9 +961,11 @@ static void vtd_context_device_invalidate(IntelIOMMUState 
*s,
 uint16_t mask;
 VTDBus *vtd_bus;
 VTDAddressSpace *vtd_as;
-uint16_t devfn;
+uint8_t bus_n, devfn;
 uint16_t devfn_it;
 
+trace_vtd_inv_desc_cc_devices(source_id, func_mask);
+
 switch (func_mask & 3) {
 case 0:
 mask = 0;   /* No bits in the SID field masked */
@@ -980,16 +981,16 @@ static void vtd_context_device_invalidate(IntelIOMMUState 
*s,
 break;
 }
 mask = ~mask;
-VTD_DPRINTF(INV, "device-selective invalidation source 0x%"PRIx16
-" mask %"PRIu16, source_id, mask);
-vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id));
+
+bus_n = VTD_SID_TO_BUS(source_id);
+vtd_bus = vtd_find_as_from_bus_num(s, bus_n);
 if (vtd_bus) {
 devfn = VTD_SID_TO_DEVFN(source_id);
 for (devfn_it = 0; devfn_it < X86_IOMMU_PCI_DEVFN_MAX; ++devfn_it) {
 vtd_as = vtd_bus->dev_as[devfn_it];
 if (vtd_as && ((devfn_it & mask) == (devfn & mask))) {
-VTD_DPRINTF(INV, "invalidate context-cahce of devfn 0x%"PRIx16,
-devfn_it);
+trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it),
+ VTD_PCI_FUNC(devfn_it));
 vtd_as->context_cache_entry.context_cache_gen = 0;
 }
 }
@@ -1302,9 +1303,7 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, 
VTDInvDesc *inv_desc)
 {
 if ((inv_desc->hi & VTD_INV_DESC_WAIT_RSVD_HI) ||
 (inv_desc->lo & VTD_INV_DESC_WAIT_RSVD_LO)) {
-VTD_DPRINTF(GENERAL, "error: non-zero reserved field in Invalidation "
-"Wait Descriptor hi 0x%"PRIx64 " lo 0x%"PRIx64,
-inv_desc->hi, inv_desc->lo);
+trace_vtd_err_nonzero_reserved("invalidation wait desc");
 return false;
 }
 if (inv_desc->lo & VTD_INV_DESC_WAIT_SW) {
@@ -1316,21 +1315,18 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, 
VTDInvDesc *inv_desc)
 
 /* FIXME: need to be masked with HAW? */
 dma_addr_t status_addr = inv_desc->hi;
-VTD_DPRINTF(INV, "status data 0x%x, status addr 0x%"PRIx64,
-

[Qemu-devel] [PATCH RFC v4 19/20] intel_iommu: unmap existing pages before replay

2017-01-20 Thread Peter Xu

Previous replay works for domain switch only if the original domain does
not have mapped pages. For example, if we switch domain from A to B, it
will only work if A has no existing mapping. If there is, then there's
problem - current replay didn't make sure the old mappings are cleared
before replaying the new one.

This patch let the replay go well even if original domain A has existing
mappings.

The idea is, when we replay, we unmap the whole address space first, no
matter what. Then, we replay the region, rebuild the pages.

We are leveraging the feature provided by vfio driver that it allows to
unmap a very big range of region, even if it is bigger than the mapped
area. It'll free all the mapped pages within the range. Here, we
choosed (0, 1ULL << VTD_MGAW) as the range to make sure every mapped
pages are unmapped.

Signed-off-by: Peter Xu 
---
 hw/i386/intel_iommu.c  | 64 --
 hw/i386/intel_iommu_internal.h |  1 +
 hw/i386/trace-events   |  1 +
 3 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 7cbf057..a038651 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2696,6 +2696,63 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus, int devfn)
 return vtd_dev_as;
 }
 
+/*
+ * Unmap the whole range in the notifier's scope. If we have recorded
+ * any high watermark (VTDAddressSpace.iova_max), we use it to limit
+ * the n->end as well.
+ */
+static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
+{
+IOMMUTLBEntry entry;
+hwaddr size;
+hwaddr start = n->start;
+hwaddr end = n->end;
+
+/*
+ * Note: all the codes in this function has a assumption that IOVA
+ * bits are no more than VTD_MGAW bits (which is restricted by
+ * VT-d spec), otherwise we need to consider overflow of 64 bits.
+ */
+
+if (end > VTD_ADDRESS_SIZE) {
+/*
+ * Don't need to unmap regions that is bigger than the whole
+ * VT-d supported address space size
+ */
+end = VTD_ADDRESS_SIZE;
+}
+
+assert(start <= end);
+size = end - start;
+
+if (ctpop64(size) != 1) {
+/*
+ * This size cannot format a correct mask. Let's enlarge it to
+ * suite the minimum available mask.
+ */
+int n = 64 - clz64(size);
+if (n > VTD_MGAW) {
+/* should not happen, but in case it happens, limit it */
+trace_vtd_err("Address space unmap found size too big");
+n = VTD_MGAW;
+}
+size = 1ULL << n;
+}
+
+entry.target_as = &address_space_memory;
+entry.iova = n->start;
+entry.translated_addr = 0;  /* useless for unmap */
+entry.perm = IOMMU_NONE;
+entry.addr_mask = size - 1;
+
+trace_vtd_as_unmap_whole(pci_bus_num(as->bus),
+ VTD_PCI_SLOT(as->devfn),
+ VTD_PCI_FUNC(as->devfn),
+ entry.iova, size);
+
+memory_region_notify_one(n, &entry);
+}
+
 static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private)
 {
 memory_region_notify_one((IOMMUNotifier *)private, entry);
@@ -2711,13 +2768,16 @@ static void vtd_iommu_replay(MemoryRegion *mr, 
IOMMUNotifier *n)
 
 if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
 /*
- * Scanned a valid context entry, walk over the pages and
- * notify when needed.
+ * Scanned a valid context entry, we first make sure to remove
+ * all existing mappings in old domain, by sending UNMAP to
+ * all the notifiers. Then, we walk over the pages and notify
+ * with existing mapped new entries in the new domain.
  */
 trace_vtd_replay_ce_valid(bus_n, PCI_SLOT(vtd_as->devfn),
   PCI_FUNC(vtd_as->devfn),
   VTD_CONTEXT_ENTRY_DID(ce.hi),
   ce.hi, ce.lo);
+vtd_address_space_unmap(vtd_as, n);
 vtd_page_walk(&ce, 0, ~0, vtd_replay_hook, (void *)n, false);
 } else {
 trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 4104121..29d6707 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -197,6 +197,7 @@
 #define VTD_DOMAIN_ID_MASK  ((1UL << VTD_DOMAIN_ID_SHIFT) - 1)
 #define VTD_CAP_ND  (((VTD_DOMAIN_ID_SHIFT - 4) / 2) & 7ULL)
 #define VTD_MGAW39  /* Maximum Guest Address Width */
+#define VTD_ADDRESS_SIZE(1ULL << VTD_MGAW)
 #define VTD_CAP_MGAW(((VTD_MGAW - 1) & 0x3fULL) << 16)
 #define VTD_MAMV18ULL
 #define VTD_CAP_MAMV(VTD_MAMV << 48)
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index bd57b0a..ef725ca 100644
--- a/hw/i3

[Qemu-devel] [PATCH RFC v4 14/20] memory: add MemoryRegionIOMMUOps.replay() callback

2017-01-20 Thread Peter Xu

Originally we have one memory_region_iommu_replay() function, which is
the default behavior to replay the translations of the whole IOMMU
region. However, on some platform like x86, we may want our own replay
logic for IOMMU regions. This patch add one more hook for IOMMUOps for
the callback, and it'll override the default if set.

Signed-off-by: Peter Xu 
---
 include/exec/memory.h | 2 ++
 memory.c  | 6 ++
 2 files changed, 8 insertions(+)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 75371e9..bb4e654 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -195,6 +195,8 @@ struct MemoryRegionIOMMUOps {
 void (*notify_flag_changed)(MemoryRegion *iommu,
 IOMMUNotifierFlag old_flags,
 IOMMUNotifierFlag new_flags);
+/* Set this up to provide customized IOMMU replay function */
+void (*replay)(MemoryRegion *iommu, IOMMUNotifier *notifier);
 };
 
 typedef struct CoalescedMemoryRange CoalescedMemoryRange;
diff --git a/memory.c b/memory.c
index a4affda..169ead6 100644
--- a/memory.c
+++ b/memory.c
@@ -1630,6 +1630,12 @@ void memory_region_iommu_replay(MemoryRegion *mr, 
IOMMUNotifier *n,
 hwaddr addr, granularity;
 IOMMUTLBEntry iotlb;
 
+/* If the IOMMU has its own replay callback, override */
+if (mr->iommu_ops->replay) {
+mr->iommu_ops->replay(mr, n);
+return;
+}
+
 granularity = memory_region_iommu_get_min_page_size(mr);
 
 for (addr = 0; addr < memory_region_size(mr); addr += granularity) {
-- 
2.7.4

[Qemu-devel] [PATCH RFC v4 17/20] intel_iommu: allow dynamic switch of IOMMU region

2017-01-20 Thread Peter Xu

This is preparation work to finally enabled dynamic switching ON/OFF for
VT-d protection. The old VT-d codes is using static IOMMU address space,
and that won't satisfy vfio-pci device listeners.

Let me explain.

vfio-pci devices depend on the memory region listener and IOMMU replay
mechanism to make sure the device mapping is coherent with the guest
even if there are domain switches. And there are two kinds of domain
switches:

  (1) switch from domain A -> B
  (2) switch from domain A -> no domain (e.g., turn DMAR off)

Case (1) is handled by the context entry invalidation handling by the
VT-d replay logic. What the replay function should do here is to replay
the existing page mappings in domain B.

However for case (2), we don't want to replay any domain mappings - we
just need the default GPA->HPA mappings (the address_space_memory
mapping). And this patch helps on case (2) to build up the mapping
automatically by leveraging the vfio-pci memory listeners.

Another important thing that this patch does is to seperate
IR (Interrupt Remapping) from DMAR (DMA Remapping). IR region should not
depend on the DMAR region (like before this patch). It should be a
standalone region, and it should be able to be activated without
DMAR (which is a common behavior of Linux kernel - by default it enables
IR while disabled DMAR).

Signed-off-by: Peter Xu 
---
 hw/i386/intel_iommu.c | 78 ---
 hw/i386/trace-events  |  2 +-
 include/hw/i386/intel_iommu.h |  2 ++
 3 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 4b08b4d..83a2e1f 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1336,9 +1336,49 @@ static void vtd_handle_gcmd_sirtp(IntelIOMMUState *s)
 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, 0, VTD_GSTS_IRTPS);
 }
 
+static void vtd_switch_address_space(VTDAddressSpace *as)
+{
+assert(as);
+
+trace_vtd_switch_address_space(pci_bus_num(as->bus),
+   VTD_PCI_SLOT(as->devfn),
+   VTD_PCI_FUNC(as->devfn),
+   as->iommu_state->dmar_enabled);
+
+/* Turn off first then on the other */
+if (as->iommu_state->dmar_enabled) {
+memory_region_set_enabled(&as->sys_alias, false);
+memory_region_set_enabled(&as->iommu, true);
+} else {
+memory_region_set_enabled(&as->iommu, false);
+memory_region_set_enabled(&as->sys_alias, true);
+}
+}
+
+static void vtd_switch_address_space_all(IntelIOMMUState *s)
+{
+GHashTableIter iter;
+VTDBus *vtd_bus;
+int i;
+
+g_hash_table_iter_init(&iter, s->vtd_as_by_busptr);
+while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) {
+for (i = 0; i < X86_IOMMU_PCI_DEVFN_MAX; i++) {
+if (!vtd_bus->dev_as[i]) {
+continue;
+}
+vtd_switch_address_space(vtd_bus->dev_as[i]);
+}
+}
+}
+
 /* Handle Translation Enable/Disable */
 static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool en)
 {
+if (s->dmar_enabled == en) {
+return;
+}
+
 VTD_DPRINTF(CSR, "Translation Enable %s", (en ? "on" : "off"));
 
 if (en) {
@@ -1353,6 +1393,8 @@ static void vtd_handle_gcmd_te(IntelIOMMUState *s, bool 
en)
 /* Ok - report back to driver */
 vtd_set_clear_mask_long(s, DMAR_GSTS_REG, VTD_GSTS_TES, 0);
 }
+
+vtd_switch_address_space_all(s);
 }
 
 /* Handle Interrupt Remap Enable/Disable */
@@ -2566,15 +2608,43 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus, int devfn)
 vtd_dev_as->devfn = (uint8_t)devfn;
 vtd_dev_as->iommu_state = s;
 vtd_dev_as->context_cache_entry.context_cache_gen = 0;
+
+/*
+ * Memory region relationships looks like (Address range shows
+ * only lower 32 bits to make it short in length...):
+ *
+ * |-+---+--|
+ * | Name| Address range | Priority |
+ * |-+---+--+
+ * | vtd_root| - |0 |
+ * |  intel_iommu| - |1 |
+ * |  vtd_sys_alias  | - |1 |
+ * |  intel_iommu_ir | fee0-feef |   64 |
+ * |-+---+--|
+ *
+ * We enable/disable DMAR by switching enablement for
+ * vtd_sys_alias and intel_iommu regions. IR region is always
+ * enabled.
+ */
 memory_region_init_iommu(&vtd_dev_as->iommu, OBJECT(s),
  &s->iommu_ops, "intel_iommu", UINT64_MAX);
+memory_region_init_alias(&vtd_dev_as->sys_alias, OBJECT(s),
+ "vtd_sys_alias", get_system_memory(),
+ 0, memory_region_size(get_sys

[Qemu-devel] [PATCH RFC v4 11/20] memory: provide IOMMU_NOTIFIER_FOREACH macro

2017-01-20 Thread Peter Xu

Signed-off-by: Peter Xu 
---
 include/exec/memory.h | 3 +++
 memory.c  | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index ae4c9a9..f0cb631 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -243,6 +243,9 @@ struct MemoryRegion {
 IOMMUNotifierFlag iommu_notify_flags;
 };
 
+#define IOMMU_NOTIFIER_FOREACH(n, mr) \
+QLIST_FOREACH((n), &(mr)->iommu_notify, node)
+
 /**
  * MemoryListener: callbacks structure for updates to the physical memory map
  *
diff --git a/memory.c b/memory.c
index 89104b1..d1ee3e0 100644
--- a/memory.c
+++ b/memory.c
@@ -1587,7 +1587,7 @@ static void 
memory_region_update_iommu_notify_flags(MemoryRegion *mr)
 IOMMUNotifierFlag flags = IOMMU_NOTIFIER_NONE;
 IOMMUNotifier *iommu_notifier;
 
-QLIST_FOREACH(iommu_notifier, &mr->iommu_notify, node) {
+IOMMU_NOTIFIER_FOREACH(iommu_notifier, mr) {
 flags |= iommu_notifier->notifier_flags;
 }
 
@@ -1671,7 +1671,7 @@ void memory_region_notify_iommu(MemoryRegion *mr,
 request_flags = IOMMU_NOTIFIER_UNMAP;
 }
 
-QLIST_FOREACH(iommu_notifier, &mr->iommu_notify, node) {
+IOMMU_NOTIFIER_FOREACH(iommu_notifier, mr) {
 if (iommu_notifier->notifier_flags & request_flags &&
 iommu_notifier->start <= entry.iova &&
 iommu_notifier->end >= entry.iova) {
-- 
2.7.4

[Qemu-devel] [PATCH RFC v4 20/20] intel_iommu: replay even with DSI/GLOBAL inv desc

2017-01-20 Thread Peter Xu

We were capturing context entry invalidations to trap IOMMU mapping
changes. This patch listens to domain/global invalidation requests too.

We need this for the sake that guest operating system might send one
domain/global invalidation instead of several PSIs in some cases. To
better survive with that, we'd better replay corresponding regions as
well for these invalidations, even if this will turn the performance
down a bit.

An example in Linux (4.10.0) Intel IOMMU driver:

/*
 * Fallback to domain selective flush if no PSI support or the size is
 * too big.
 * PSI requires page size to be 2 ^ x, and the base address is naturally
 * aligned to the size
 */
if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
iommu->flush.flush_iotlb(iommu, did, 0, 0,
DMA_TLB_DSI_FLUSH);
else
iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
DMA_TLB_PSI_FLUSH);

If we don't have this, when above DSI FLUSH happens, we might have
unaligned mapping.

Signed-off-by: Peter Xu 
---
 hw/i386/intel_iommu.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index a038651..e958f53 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1196,14 +1196,33 @@ static uint64_t 
vtd_context_cache_invalidate(IntelIOMMUState *s, uint64_t val)
 
 static void vtd_iotlb_global_invalidate(IntelIOMMUState *s)
 {
+IntelIOMMUNotifierNode *node;
+
 trace_vtd_iotlb_reset("global invalidation recved");
 vtd_reset_iotlb(s);
+
+QLIST_FOREACH(node, &s->notifiers_list, next) {
+memory_region_iommu_replay_all(&node->vtd_as->iommu);
+}
 }
 
 static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
 {
+IntelIOMMUNotifierNode *node;
+VTDContextEntry ce;
+VTDAddressSpace *vtd_as;
+
 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain,
 &domain_id);
+
+QLIST_FOREACH(node, &s->notifiers_list, next) {
+vtd_as = node->vtd_as;
+if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+  vtd_as->devfn, &ce) &&
+domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
+memory_region_iommu_replay_all(&vtd_as->iommu);
+}
+}
 }
 
 static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry,
-- 
2.7.4

[Qemu-devel] [PATCH RFC v4 15/20] intel_iommu: provide its own replay() callback

2017-01-20 Thread Peter Xu

The default replay() don't work for VT-d since vt-d will have a huge
default memory region which covers address range 0-(2^64-1). This will
normally consumes a lot of time (which looks like a dead loop).

The solution is simple - we don't walk over all the regions. Instead, we
jump over the regions when we found that the page directories are empty.
It'll greatly reduce the time to walk the whole region.

To achieve this, we provided a page walk helper to do that, invoking
corresponding hook function when we found an page we are interested in.
vtd_page_walk_level() is the core logic for the page walking. It's
interface is designed to suite further use case, e.g., to invalidate a
range of addresses.

Signed-off-by: Peter Xu 
---
 hw/i386/intel_iommu.c | 216 --
 hw/i386/trace-events  |   7 ++
 include/exec/memory.h |   2 +
 3 files changed, 220 insertions(+), 5 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 6f5f68a..f9c5142 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -598,6 +598,22 @@ static inline uint32_t 
vtd_get_agaw_from_context_entry(VTDContextEntry *ce)
 return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9;
 }
 
+static inline uint64_t vtd_iova_limit(VTDContextEntry *ce)
+{
+uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce);
+return 1ULL << MIN(ce_agaw, VTD_MGAW);
+}
+
+/* Return true if IOVA passes range check, otherwise false. */
+static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce)
+{
+/*
+ * Check if @iova is above 2^X-1, where X is the minimum of MGAW
+ * in CAP_REG and AW in context-entry.
+ */
+return !(iova & ~(vtd_iova_limit(ce) - 1));
+}
+
 static const uint64_t vtd_paging_entry_rsvd_field[] = {
 [0] = ~0ULL,
 /* For not large page */
@@ -633,13 +649,9 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
iova, bool is_write,
 uint32_t level = vtd_get_level_from_context_entry(ce);
 uint32_t offset;
 uint64_t slpte;
-uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce);
 uint64_t access_right_check;
 
-/* Check if @iova is above 2^X-1, where X is the minimum of MGAW
- * in CAP_REG and AW in context-entry.
- */
-if (iova & ~((1ULL << MIN(ce_agaw, VTD_MGAW)) - 1)) {
+if (!vtd_iova_range_check(iova, ce)) {
 trace_vtd_err("IOVA exceeds limits");
 return -VTD_FR_ADDR_BEYOND_MGAW;
 }
@@ -681,6 +693,168 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t 
iova, bool is_write,
 }
 }
 
+typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);
+
+/**
+ * vtd_page_walk_level - walk over specific level for IOVA range
+ *
+ * @addr: base GPA addr to start the walk
+ * @start: IOVA range start address
+ * @end: IOVA range end address (start <= addr < end)
+ * @hook_fn: hook func to be called when detected page
+ * @private: private data to be passed into hook func
+ * @read: whether parent level has read permission
+ * @write: whether parent level has write permission
+ * @skipped: accumulated skipped ranges
+ * @notify_unmap: whether we should notify invalid entries
+ */
+static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
+   uint64_t end, vtd_page_walk_hook hook_fn,
+   void *private, uint32_t level,
+   bool read, bool write, uint64_t *skipped,
+   bool notify_unmap)
+{
+bool read_cur, write_cur, entry_valid;
+uint32_t offset;
+uint64_t slpte;
+uint64_t subpage_size, subpage_mask;
+IOMMUTLBEntry entry;
+uint64_t iova = start;
+uint64_t iova_next;
+uint64_t skipped_local = 0;
+int ret = 0;
+
+trace_vtd_page_walk_level(addr, level, start, end);
+
+subpage_size = 1ULL << vtd_slpt_level_shift(level);
+subpage_mask = vtd_slpt_level_page_mask(level);
+
+while (iova < end) {
+iova_next = (iova & subpage_mask) + subpage_size;
+
+offset = vtd_iova_level_offset(iova, level);
+slpte = vtd_get_slpte(addr, offset);
+
+/*
+ * When one of the following case happens, we assume the whole
+ * range is invalid:
+ *
+ * 1. read block failed
+ * 3. reserved area non-zero
+ * 2. both read & write flag are not set
+ */
+
+if (slpte == (uint64_t)-1) {
+trace_vtd_page_walk_skip_read(iova, iova_next);
+skipped_local++;
+goto next;
+}
+
+if (vtd_slpte_nonzero_rsvd(slpte, level)) {
+trace_vtd_page_walk_skip_reserve(iova, iova_next);
+skipped_local++;
+goto next;
+}
+
+/* Permissions are stacked with parents' */
+read_cur = read && (slpte & VTD_SL_R);
+write_cur = write && (slpte & VTD_SL_W);
+
+/*
+ * As long as we have either read/write permission, this is
+ * a v

[Qemu-devel] [PULL 02/35] megasas: fix guest-triggered memory leak

2017-01-20 Thread Paolo Bonzini

If the guest sets the sglist size to a value >=2GB, megasas_handle_dcmd
will return MFI_STAT_MEMORY_NOT_AVAILABLE without freeing the memory.
Avoid this by returning only the status from map_dcmd, and loading
cmd->iov_size in the caller.

Reported-by: Li Qiang 
Signed-off-by: Paolo Bonzini 
---
 hw/scsi/megasas.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c
index 67fc1e7..6233865 100644
--- a/hw/scsi/megasas.c
+++ b/hw/scsi/megasas.c
@@ -683,14 +683,14 @@ static int megasas_map_dcmd(MegasasState *s, MegasasCmd 
*cmd)
 trace_megasas_dcmd_invalid_sge(cmd->index,
cmd->frame->header.sge_count);
 cmd->iov_size = 0;
-return -1;
+return -EINVAL;
 }
 iov_pa = megasas_sgl_get_addr(cmd, &cmd->frame->dcmd.sgl);
 iov_size = megasas_sgl_get_len(cmd, &cmd->frame->dcmd.sgl);
 pci_dma_sglist_init(&cmd->qsg, PCI_DEVICE(s), 1);
 qemu_sglist_add(&cmd->qsg, iov_pa, iov_size);
 cmd->iov_size = iov_size;
-return cmd->iov_size;
+return 0;
 }
 
 static void megasas_finish_dcmd(MegasasCmd *cmd, uint32_t iov_size)
@@ -1559,19 +1559,20 @@ static const struct dcmd_cmd_tbl_t {
 
 static int megasas_handle_dcmd(MegasasState *s, MegasasCmd *cmd)
 {
-int opcode, len;
+int opcode;
 int retval = 0;
+size_t len;
 const struct dcmd_cmd_tbl_t *cmdptr = dcmd_cmd_tbl;
 
 opcode = le32_to_cpu(cmd->frame->dcmd.opcode);
 trace_megasas_handle_dcmd(cmd->index, opcode);
-len = megasas_map_dcmd(s, cmd);
-if (len < 0) {
+if (megasas_map_dcmd(s, cmd) < 0) {
 return MFI_STAT_MEMORY_NOT_AVAILABLE;
 }
 while (cmdptr->opcode != -1 && cmdptr->opcode != opcode) {
 cmdptr++;
 }
+len = cmd->iov_size;
 if (cmdptr->opcode == -1) {
 trace_megasas_dcmd_unhandled(cmd->index, opcode, len);
 retval = megasas_dcmd_dummy(s, cmd);
-- 
2.9.3

[Qemu-devel] [PULL v3 00/32] Misc patches for 2017-01-11

2017-01-20 Thread Paolo Bonzini

The following changes since commit 2ccede18bd24fce5db83fef3674563a1f256717b:

  Merge remote-tracking branch 'remotes/vivier/tags/m68k-for-2.9-pull-request' 
into staging (2017-01-16 12:41:35 +)

are available in the git repository at:

  git://github.com/bonzini/qemu.git tags/for-upstream

for you to fetch changes up to abc62c89f3191774dbd600a2caec803cbf557160:

  pc.h: move x-mach-use-reliable-get-clock compat entry to PC_COMPAT_2_8 
(2017-01-20 13:22:57 +0100)

Third time is the charm...

v2->v3 adds "KVM: PPC: eliminate unnecessary duplicate constants"
to fix the build failure, and patches 34-35 just because they're
small and nice.


* QOM interface fix (Eduardo)
* RTC fixes (Gaohuai, Igor)
* Memory leak fixes (Li Qiang, me)
* Ctrl-a b regression (Marc-André)
* Stubs cleanups and fixes (Leif, me)
* hxtool tweak (me)
* HAX support (Vincent)
* QemuThread, exec.c and SCSI fixes (Roman, Xinhua, me)
* PC_COMPAT_2_8 fix (Marcelo)
* stronger bitmap assertions (Peter)


Caoxinhua (1):
  qemu-thread: fix qemu_thread_set_name() race in qemu_thread_create()

Eduardo Habkost (1):
  qom: Make all interface types abstract

Igor Mammedov (1):
  pc: fix crash in rtc_set_memory() if initial cpu is marked as hotplugged

Leif Lindholm (1):
  smbios: filter based on CONFIG_SMBIOS rather than TARGET

Li Qiang (1):
  serial: fix memory leak in serial exit

Marc-André Lureau (2):
  char: fix ctrl-a b not working
  Revert "win32: don't run subprocess tests on Mingw32 platform"

Marcelo Tosatti (1):
  pc.h: move x-mach-use-reliable-get-clock compat entry to PC_COMPAT_2_8

Paolo Bonzini (17):
  megasas: fix guest-triggered memory leak
  stubs: merge all monitor stubs in one file, remove monitor_cur_is_qmp stub
  stubs: move smbios stubs to hw/smbios
  stubs: move acpi stubs to hw/acpi
  stubs: remove unused stub for serial_hd
  hw: move reset handlers from vl.c to hw/core
  stubs: group stubs for user-mode emulation
  stubs: group all monitor_fdset_* functions in a single file
  stubs: move vhost stubs to stubs/vhost.o
  event_notifier: cleanups around event_notifier_set_handler
  build: remove --enable-colo/--disable-colo
  stubs: remove stubs/kvm.c
  acpi: filter based on CONFIG_ACPI_X86 rather than TARGET
  scsi-block: fix direction of BYTCHK test for VERIFY commands
  hxtool: emit Texinfo headings as @subsection
  ramblock-notifier: new
  KVM: PPC: eliminate unnecessary duplicate constants

Peter Lieven (1):
  bitmap: assert that start and nr are non negative

Peter Xu (3):
  x86: ioapic: add traces for ioapic
  x86: ioapic: dump version for "info ioapic"
  x86: ioapic: fix fail migration when irqchip=split

Roman Kapl (1):
  exec: Add missing rcu_read_unlock

Vincent Palatin (4):
  kvm: move cpu synchronization code
  target/i386: Add Intel HAX files
  Plumb the HAXM-based hardware acceleration support
  hax: add Darwin support

hangaohuai (1):
  bugfix: vm halt when in reset looping

 MAINTAINERS|1 +
 Makefile.target|5 +-
 arch_init.c|   21 -
 configure  |   27 +-
 cpus.c |   79 +-
 exec.c |6 +
 gdbstub.c  |1 +
 hax-stub.c |   34 +
 hw/Makefile.objs   |6 +-
 hw/acpi/Makefile.objs  |   17 +-
 hw/acpi/acpi-stub.c|   29 +
 stubs/ipmi.c => hw/acpi/ipmi-stub.c|0
 hw/char/serial.c   |   10 +
 hw/core/Makefile.objs  |2 +-
 hw/core/reset.c|   72 ++
 hw/i386/kvm/apic.c |1 +
 hw/i386/kvmvapic.c |1 +
 hw/i386/pc.c   |4 +-
 hw/intc/apic_common.c  |3 +-
 hw/intc/ioapic.c   |   22 +-
 hw/intc/ioapic_common.c|3 +-
 hw/intc/trace-events   |7 +
 hw/misc/vmport.c   |2 +-
 hw/ppc/pnv_xscom.c |2 +-
 hw/ppc/ppce500_spin.c  |4 +-
 hw/ppc/spapr.c |2 +-
 hw/ppc/spapr_hcall.c   |2 +-
 hw/s390x/s390-pci-inst.c   |1 +
 hw/scsi/megasas.c  |   11 +-
 hw/scsi/scsi-disk.c

[Qemu-devel] [PATCH RFC v4 18/20] intel_iommu: enable vfio devices

2017-01-20 Thread Peter Xu

This patch is based on Aviv Ben-David ()'s patch
upstream:

  "IOMMU: enable intel_iommu map and unmap notifiers"
  https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg01453.html

However I removed/fixed some content, and added my own codes.

Instead of translate() every page for iotlb invalidations (which is
slower), we walk the pages when needed and notify in a hook function.

This patch enables vfio devices for VT-d emulation.

Signed-off-by: Peter Xu 
---
 hw/i386/intel_iommu.c | 66 +--
 include/hw/i386/intel_iommu.h |  8 ++
 2 files changed, 65 insertions(+), 9 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 83a2e1f..7cbf057 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -833,7 +833,8 @@ next:
  * @private: private data for the hook function
  */
 static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
- vtd_page_walk_hook hook_fn, void *private)
+ vtd_page_walk_hook hook_fn, void *private,
+ bool notify_unmap)
 {
 dma_addr_t addr = vtd_get_slpt_base_from_context(ce);
 uint32_t level = vtd_get_level_from_context_entry(ce);
@@ -852,7 +853,7 @@ static int vtd_page_walk(VTDContextEntry *ce, uint64_t 
start, uint64_t end,
 trace_vtd_page_walk_level(addr, level, start, end);
 
 return vtd_page_walk_level(addr, start, end, hook_fn, private,
-   level, true, true, NULL, false);
+   level, true, true, NULL, notify_unmap);
 }
 
 /* Map a device to its corresponding domain (context-entry) */
@@ -1205,6 +1206,33 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState 
*s, uint16_t domain_id)
 &domain_id);
 }
 
+static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry,
+   void *private)
+{
+memory_region_notify_iommu((MemoryRegion *)private, *entry);
+return 0;
+}
+
+static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
+   uint16_t domain_id, hwaddr addr,
+   uint8_t am)
+{
+IntelIOMMUNotifierNode *node;
+VTDContextEntry ce;
+int ret;
+
+QLIST_FOREACH(node, &(s->notifiers_list), next) {
+VTDAddressSpace *vtd_as = node->vtd_as;
+ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+   vtd_as->devfn, &ce);
+if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
+vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE,
+  vtd_page_invalidate_notify_hook,
+  (void *)&vtd_as->iommu, true);
+}
+}
+}
+
 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
   hwaddr addr, uint8_t am)
 {
@@ -1215,6 +1243,7 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, 
uint16_t domain_id,
 info.addr = addr;
 info.mask = ~((1 << am) - 1);
 g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
+vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
 }
 
 /* Flush IOTLB
@@ -2224,15 +2253,33 @@ static void vtd_iommu_notify_flag_changed(MemoryRegion 
*iommu,
   IOMMUNotifierFlag new)
 {
 VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
+IntelIOMMUState *s = vtd_as->iommu_state;
+IntelIOMMUNotifierNode *node = NULL;
+IntelIOMMUNotifierNode *next_node = NULL;
 
-if (new & IOMMU_NOTIFIER_MAP) {
-error_report("Device at bus %s addr %02x.%d requires iommu "
- "notifier which is currently not supported by "
- "intel-iommu emulation",
- vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn),
- PCI_FUNC(vtd_as->devfn));
+if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) {
+error_report("We need to set cache_mode=1 for intel-iommu to enable "
+ "device assignment with IOMMU protection.");
 exit(1);
 }
+
+if (old == IOMMU_NOTIFIER_NONE) {
+node = g_malloc0(sizeof(*node));
+node->vtd_as = vtd_as;
+QLIST_INSERT_HEAD(&s->notifiers_list, node, next);
+return;
+}
+
+/* update notifier node with new flags */
+QLIST_FOREACH_SAFE(node, &s->notifiers_list, next, next_node) {
+if (node->vtd_as == vtd_as) {
+if (new == IOMMU_NOTIFIER_NONE) {
+QLIST_REMOVE(node, next);
+g_free(node);
+}
+return;
+}
+}
 }
 
 static const VMStateDescription vtd_vmstate = {
@@ -2671,7 +2718,7 @@ static void vtd_iommu_replay(MemoryRegion *mr, 
IOMMUNotifier *n)
   PCI_FUNC(vtd_as->devfn),

[Qemu-devel] [PULL 04/35] smbios: filter based on CONFIG_SMBIOS rather than TARGET

2017-01-20 Thread Paolo Bonzini

From: Leif Lindholm 

-smbios command line options were accepted but silently ignored on
TARGET_ARM, due to a test for TARGET_I386 in arch_init.c.

Copy the mechanism of hw/pci/pci-stub.c to implement an smbios-stub
instead, enabled for all targets without CONFIG_SMBIOS.

Signed-off-by: Leif Lindholm 
Message-Id: <20161222151828.28292-1-leif.lindh...@linaro.org>
Signed-off-by: Paolo Bonzini 
---
 arch_init.c|  8 
 hw/Makefile.objs   |  2 +-
 hw/smbios/Makefile.objs|  3 +++
 hw/smbios/smbios-stub.c| 31 +++
 hw/smbios/smbios.c |  2 +-
 include/hw/smbios/smbios.h |  2 +-
 include/sysemu/arch_init.h |  1 -
 vl.c   |  2 +-
 8 files changed, 38 insertions(+), 13 deletions(-)
 create mode 100644 hw/smbios/smbios-stub.c

diff --git a/arch_init.c b/arch_init.c
index 5cc58b2..34e7694 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -28,7 +28,6 @@
 #include "sysemu/arch_init.h"
 #include "hw/pci/pci.h"
 #include "hw/audio/audio.h"
-#include "hw/smbios/smbios.h"
 #include "qemu/config-file.h"
 #include "qemu/error-report.h"
 #include "qmp-commands.h"
@@ -248,13 +247,6 @@ void do_acpitable_option(const QemuOpts *opts)
 #endif
 }
 
-void do_smbios_option(QemuOpts *opts)
-{
-#ifdef TARGET_I386
-smbios_entry_add(opts);
-#endif
-}
-
 int kvm_available(void)
 {
 #ifdef CONFIG_KVM
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 0ffd281..2a73ae5 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -33,7 +33,7 @@ devices-dirs-$(CONFIG_VIRTIO) += virtio/
 devices-dirs-$(CONFIG_SOFTMMU) += watchdog/
 devices-dirs-$(CONFIG_SOFTMMU) += xen/
 devices-dirs-$(CONFIG_MEM_HOTPLUG) += mem/
-devices-dirs-$(CONFIG_SMBIOS) += smbios/
+devices-dirs-$(CONFIG_SOFTMMU) += smbios/
 devices-dirs-y += core/
 common-obj-y += $(devices-dirs-y)
 obj-y += $(devices-dirs-y)
diff --git a/hw/smbios/Makefile.objs b/hw/smbios/Makefile.objs
index c3d3753..ee0712b 100644
--- a/hw/smbios/Makefile.objs
+++ b/hw/smbios/Makefile.objs
@@ -1,2 +1,5 @@
 common-obj-$(CONFIG_SMBIOS) += smbios.o
 common-obj-$(call land,$(CONFIG_SMBIOS),$(CONFIG_IPMI)) += smbios_type_38.o
+
+common-obj-$(call lnot,$(CONFIG_SMBIOS)) += smbios-stub.o
+common-obj-$(CONFIG_ALL) += smbios-stub.o
diff --git a/hw/smbios/smbios-stub.c b/hw/smbios/smbios-stub.c
new file mode 100644
index 000..3087394
--- /dev/null
+++ b/hw/smbios/smbios-stub.c
@@ -0,0 +1,31 @@
+/*
+ * SMBIOS stubs for platforms that don't support SMBIOS.
+ *
+ * Copyright (c) 2010 Isaku Yamahata 
+ *VA Linux Systems Japan K.K.
+ * Copyright (c) 2016 Leif Lindholm 
+ *Linaro Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see .
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/qmp/qerror.h"
+#include "qmp-commands.h"
+#include "hw/smbios/smbios.h"
+
+void smbios_entry_add(QemuOpts *opts, Error **errp)
+{
+error_setg(errp, QERR_UNSUPPORTED);
+}
diff --git a/hw/smbios/smbios.c b/hw/smbios/smbios.c
index 3a96ced..1a5437a 100644
--- a/hw/smbios/smbios.c
+++ b/hw/smbios/smbios.c
@@ -882,7 +882,7 @@ static void save_opt(const char **dest, QemuOpts *opts, 
const char *name)
 }
 }
 
-void smbios_entry_add(QemuOpts *opts)
+void smbios_entry_add(QemuOpts *opts, Error **errp)
 {
 const char *val;
 
diff --git a/include/hw/smbios/smbios.h b/include/hw/smbios/smbios.h
index 1cd53cc..31e8d5f 100644
--- a/include/hw/smbios/smbios.h
+++ b/include/hw/smbios/smbios.h
@@ -257,7 +257,7 @@ struct smbios_type_127 {
 struct smbios_structure_header header;
 } QEMU_PACKED;
 
-void smbios_entry_add(QemuOpts *opts);
+void smbios_entry_add(QemuOpts *opts, Error **errp);
 void smbios_set_cpuid(uint32_t version, uint32_t features);
 void smbios_set_defaults(const char *manufacturer, const char *product,
  const char *version, bool legacy_mode,
diff --git a/include/sysemu/arch_init.h b/include/sysemu/arch_init.h
index 1c9dad1..88dcf77 100644
--- a/include/sysemu/arch_init.h
+++ b/include/sysemu/arch_init.h
@@ -29,7 +29,6 @@ extern const uint32_t arch_type;
 
 void select_soundhw(const char *optarg);
 void do_acpitable_option(const QemuOpts *opts);
-void do_smbios_option(QemuOpts *opts);
 void audio_init(void);
 int kvm_available(void);
 int xen_available(void);
diff --git a/vl.c b/vl.c
index c643d3f..63355e5 100644
--- a/vl.c
+++ b/vl.c
@@ -3707,7 +3707,7 @@ int main(int a

1 2 3 >

1 - 100 of 273 matches

Mail list logo