[Xen-devel] [PATCH 1/3] x86/mm: drop guest_{map, get_eff}_l1e() hooks

2016-02-05 Thread Jan Beulich
Disallow the unmaintained and presumed broken translated-but-not-
external paging mode combination, allowing the respective paging hooks
to go away (which eliminates one pair of NULL callbacks in HAP mode).
As a result of them no longer being generic paging operations, make the
inline functions private to mm.c, dropping their struct vcpu parameters
where suitable.

The enforcement of the proper mode combination gets now done in
paging_enable(), requiring shadow_domctl() to no longer call
shadow_enable() directly.

Also as a result support for XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE gets
removed too.

Signed-off-by: Jan Beulich 

--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -516,6 +516,67 @@ void update_cr3(struct vcpu *v)
 make_cr3(v, cr3_mfn);
 }
 
+/* Get a mapping of a PV guest's l1e for this virtual address. */
+static l1_pgentry_t *guest_map_l1e(unsigned long addr, unsigned long *gl1mfn)
+{
+l2_pgentry_t l2e;
+
+ASSERT(!paging_mode_translate(current->domain));
+ASSERT(!paging_mode_external(current->domain));
+
+if ( unlikely(!__addr_ok(addr)) )
+return NULL;
+
+/* Find this l1e and its enclosing l1mfn in the linear map. */
+if ( __copy_from_user(&l2e,
+  &__linear_l2_table[l2_linear_offset(addr)],
+  sizeof(l2_pgentry_t)) )
+return NULL;
+
+/* Check flags that it will be safe to read the l1e. */
+if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT )
+return NULL;
+
+*gl1mfn = l2e_get_pfn(l2e);
+
+return (l1_pgentry_t *)map_domain_page(_mfn(*gl1mfn)) +
+   l1_table_offset(addr);
+}
+
+/* Pull down the mapping we got from guest_map_l1e(). */
+static inline void guest_unmap_l1e(void *p)
+{
+unmap_domain_page(p);
+}
+
+/* Read a PV guest's l1e that maps this virtual address. */
+static inline void guest_get_eff_l1e(unsigned long addr, l1_pgentry_t *eff_l1e)
+{
+ASSERT(!paging_mode_translate(current->domain));
+ASSERT(!paging_mode_external(current->domain));
+
+if ( unlikely(!__addr_ok(addr)) ||
+ __copy_from_user(eff_l1e,
+  &__linear_l1_table[l1_linear_offset(addr)],
+  sizeof(l1_pgentry_t)) )
+*eff_l1e = l1e_empty();
+}
+
+/*
+ * Read the guest's l1e that maps this address, from the kernel-mode
+ * page tables.
+ */
+static inline void guest_get_eff_kern_l1e(struct vcpu *v, unsigned long addr,
+  void *eff_l1e)
+{
+bool_t user_mode = !(v->arch.flags & TF_kernel_mode);
+#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
+
+TOGGLE_MODE();
+guest_get_eff_l1e(addr, eff_l1e);
+TOGGLE_MODE();
+}
+
 static const char __section(".bss.page_aligned") zero_page[PAGE_SIZE];
 
 static void invalidate_shadow_ldt(struct vcpu *v, int flush)
@@ -3985,7 +4046,7 @@ static int create_grant_va_mapping(
 
 adjust_guest_l1e(nl1e, d);
 
-pl1e = guest_map_l1e(v, va, &gl1mfn);
+pl1e = guest_map_l1e(va, &gl1mfn);
 if ( !pl1e )
 {
 MEM_LOG("Could not find L1 PTE for address %lx", va);
@@ -3994,7 +4055,7 @@ static int create_grant_va_mapping(
 
 if ( !get_page_from_pagenr(gl1mfn, current->domain) )
 {
-guest_unmap_l1e(v, pl1e);
+guest_unmap_l1e(pl1e);
 return GNTST_general_error;
 }
 
@@ -4002,7 +4063,7 @@ static int create_grant_va_mapping(
 if ( !page_lock(l1pg) )
 {
 put_page(l1pg);
-guest_unmap_l1e(v, pl1e);
+guest_unmap_l1e(pl1e);
 return GNTST_general_error;
 }
 
@@ -4010,7 +4071,7 @@ static int create_grant_va_mapping(
 {
 page_unlock(l1pg);
 put_page(l1pg);
-guest_unmap_l1e(v, pl1e);
+guest_unmap_l1e(pl1e);
 return GNTST_general_error;
 }
 
@@ -4019,7 +4080,7 @@ static int create_grant_va_mapping(
 
 page_unlock(l1pg);
 put_page(l1pg);
-guest_unmap_l1e(v, pl1e);
+guest_unmap_l1e(pl1e);
 
 if ( okay && !paging_mode_refcounts(d) )
 put_page_from_l1e(ol1e, d);
@@ -4035,7 +4096,7 @@ static int replace_grant_va_mapping(
 struct page_info *l1pg;
 int rc = 0;
 
-pl1e = guest_map_l1e(v, addr, &gl1mfn);
+pl1e = guest_map_l1e(addr, &gl1mfn);
 if ( !pl1e )
 {
 MEM_LOG("Could not find L1 PTE for address %lx", addr);
@@ -4085,7 +4146,7 @@ static int replace_grant_va_mapping(
 page_unlock(l1pg);
 put_page(l1pg);
  out:
-guest_unmap_l1e(v, pl1e);
+guest_unmap_l1e(pl1e);
 return rc;
 }
 
@@ -4197,7 +4258,7 @@ int replace_grant_host_mapping(
 if ( !new_addr )
 return destroy_grant_va_mapping(addr, frame, curr);
 
-pl1e = guest_map_l1e(curr, new_addr, &gl1mfn);
+pl1e = guest_map_l1e(new_addr, &gl1mfn);
 if ( !pl1e )
 {
 MEM_LOG("Could not find L1 PTE for address %lx",
@@ -4207,7 +4268,7 @@ int replace_grant_host_mapping(
 
 if ( !get_page_from_pagenr(gl1mfn, current->domain)

[Xen-devel] [PATCH 2/3] x86/mm: make {cmpxchg, write}_guest_entry() hook shadow mode specific

2016-02-05 Thread Jan Beulich
... as they're being used for PV guests only, which don't use HAP mode.
This eliminates another pair of NULL callbacks in HAP as well as in 2-
and 3-guest-level shadow modes.

Signed-off-by: Jan Beulich 

--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -369,15 +369,14 @@ static void sh_audit_gw(struct vcpu *v,
 #endif /* audit code */
 
 
-#if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS)
 /*
  * Write a new value into the guest pagetable, and update the shadows
  * appropriately.  Returns 0 if we page-faulted, 1 for success.
  */
-static int
-sh_write_guest_entry(struct vcpu *v, guest_intpte_t *p,
- guest_intpte_t new, mfn_t gmfn)
+static bool_t
+sh_write_guest_entry(struct vcpu *v, intpte_t *p, intpte_t new, mfn_t gmfn)
 {
+#if CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS
 int failed;
 
 paging_lock(v->domain);
@@ -387,6 +386,9 @@ sh_write_guest_entry(struct vcpu *v, gue
 paging_unlock(v->domain);
 
 return !failed;
+#else
+return 0;
+#endif
 }
 
 /*
@@ -395,10 +397,11 @@ sh_write_guest_entry(struct vcpu *v, gue
  * N.B. caller should check the value of "old" to see if the cmpxchg itself
  * was successful.
  */
-static int
-sh_cmpxchg_guest_entry(struct vcpu *v, guest_intpte_t *p,
-   guest_intpte_t *old, guest_intpte_t new, mfn_t gmfn)
+static bool_t
+sh_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p, intpte_t *old,
+   intpte_t new, mfn_t gmfn)
 {
+#if CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS
 int failed;
 guest_intpte_t t = *old;
 
@@ -410,8 +413,10 @@ sh_cmpxchg_guest_entry(struct vcpu *v, g
 paging_unlock(v->domain);
 
 return !failed;
+#else
+return 0;
+#endif
 }
-#endif /* CONFIG == GUEST (== SHADOW) */
 
 /**/
 /* Functions to compute the correct index into a shadow page, given an
@@ -5194,14 +5199,12 @@ const struct paging_mode sh_paging_mode
 .update_cr3= sh_update_cr3,
 .update_paging_modes   = shadow_update_paging_modes,
 .write_p2m_entry   = shadow_write_p2m_entry,
-#if CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS
-.write_guest_entry = sh_write_guest_entry,
-.cmpxchg_guest_entry   = sh_cmpxchg_guest_entry,
-#endif
 .guest_levels  = GUEST_PAGING_LEVELS,
 .shadow.detach_old_tables  = sh_detach_old_tables,
 .shadow.x86_emulate_write  = sh_x86_emulate_write,
 .shadow.x86_emulate_cmpxchg= sh_x86_emulate_cmpxchg,
+.shadow.write_guest_entry  = sh_write_guest_entry,
+.shadow.cmpxchg_guest_entry= sh_cmpxchg_guest_entry,
 .shadow.make_monitor_table = sh_make_monitor_table,
 .shadow.destroy_monitor_table  = sh_destroy_monitor_table,
 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -87,6 +87,11 @@ struct shadow_paging_mode {
 unsigned long new,
 unsigned int bytes,
 struct sh_emulate_ctxt *sh_ctxt);
+bool_t(*write_guest_entry )(struct vcpu *v, intpte_t *p,
+intpte_t new, mfn_t gmfn);
+bool_t(*cmpxchg_guest_entry   )(struct vcpu *v, intpte_t *p,
+intpte_t *old, intpte_t new,
+mfn_t gmfn);
 mfn_t (*make_monitor_table)(struct vcpu *v);
 void  (*destroy_monitor_table )(struct vcpu *v, mfn_t mmfn);
 int   (*guess_wrmap   )(struct vcpu *v, 
@@ -119,11 +124,6 @@ struct paging_mode {
 void  (*write_p2m_entry   )(struct domain *d, unsigned long 
gfn,
 l1_pgentry_t *p, l1_pgentry_t new,
 unsigned int level);
-int   (*write_guest_entry )(struct vcpu *v, intpte_t *p,
-intpte_t new, mfn_t gmfn);
-int   (*cmpxchg_guest_entry   )(struct vcpu *v, intpte_t *p,
-intpte_t *old, intpte_t new,
-mfn_t gmfn);
 
 unsigned int guest_levels;
 
@@ -299,14 +299,15 @@ static inline void paging_update_paging_
 /* Write a new value into the guest pagetable, and update the
  * paging-assistance state appropriately.  Returns 0 if we page-faulted,
  * 1 for success. */
-static inline int paging_write_guest_entry(struct vcpu *v, intpte_t *p,
-   intpte_t new, mfn_t gmfn)
+static inline bool_t paging_write_guest_entry(struct vcpu *v, intpte_t *p,
+  intpte_t new, mfn_t gmfn)
 {
-if ( unlikely(paging_mode_enabled(v->domain) 
-  

[Xen-devel] [PATCH 3/3] x86/shadow: remove a few 32-bit hypervisor leftovers

2016-02-05 Thread Jan Beulich
... related to 8-byte cmpxchg having required special precautions
there.

Signed-off-by: Jan Beulich 

--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -259,10 +259,10 @@ hvm_emulate_cmpxchg(enum x86_segment seg
 struct sh_emulate_ctxt *sh_ctxt =
 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
 struct vcpu *v = current;
-unsigned long addr, old[2], new[2];
+unsigned long addr, old, new;
 int rc;
 
-if ( !is_x86_user_segment(seg) )
+if ( !is_x86_user_segment(seg) || bytes > sizeof(long) )
 return X86EMUL_UNHANDLEABLE;
 
 rc = hvm_translate_linear_addr(
@@ -270,15 +270,12 @@ hvm_emulate_cmpxchg(enum x86_segment seg
 if ( rc )
 return rc;
 
-old[0] = new[0] = 0;
-memcpy(old, p_old, bytes);
-memcpy(new, p_new, bytes);
-
-if ( bytes <= sizeof(long) )
-return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
-v, addr, old[0], new[0], bytes, sh_ctxt);
+old = new = 0;
+memcpy(&old, p_old, bytes);
+memcpy(&new, p_new, bytes);
 
-return X86EMUL_UNHANDLEABLE;
+return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
+   v, addr, old, new, bytes, sh_ctxt);
 }
 
 static const struct x86_emulate_ops hvm_shadow_emulator_ops = {
@@ -335,21 +332,18 @@ pv_emulate_cmpxchg(enum x86_segment seg,
 {
 struct sh_emulate_ctxt *sh_ctxt =
 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
-unsigned long old[2], new[2];
+unsigned long old, new;
 struct vcpu *v = current;
 
-if ( !is_x86_user_segment(seg) )
+if ( !is_x86_user_segment(seg) || bytes > sizeof(long) )
 return X86EMUL_UNHANDLEABLE;
 
-old[0] = new[0] = 0;
-memcpy(old, p_old, bytes);
-memcpy(new, p_new, bytes);
-
-if ( bytes <= sizeof(long) )
-return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
-v, offset, old[0], new[0], bytes, sh_ctxt);
+old = new = 0;
+memcpy(&old, p_old, bytes);
+memcpy(&new, p_new, bytes);
 
-return X86EMUL_UNHANDLEABLE;
+return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
+   v, offset, old, new, bytes, sh_ctxt);
 }
 
 static const struct x86_emulate_ops pv_shadow_emulator_ops = {
--- a/xen/arch/x86/mm/shadow/types.h
+++ b/xen/arch/x86/mm/shadow/types.h
@@ -247,7 +247,6 @@ static inline shadow_l4e_t shadow_l4e_fr
 #define sh_detach_old_tables   INTERNAL_NAME(sh_detach_old_tables)
 #define sh_x86_emulate_write   INTERNAL_NAME(sh_x86_emulate_write)
 #define sh_x86_emulate_cmpxchg INTERNAL_NAME(sh_x86_emulate_cmpxchg)
-#define sh_x86_emulate_cmpxchg8b   INTERNAL_NAME(sh_x86_emulate_cmpxchg8b)
 #define sh_audit_l1_table  INTERNAL_NAME(sh_audit_l1_table)
 #define sh_audit_fl1_table INTERNAL_NAME(sh_audit_fl1_table)
 #define sh_audit_l2_table  INTERNAL_NAME(sh_audit_l2_table)



x86/shadow: remove a few 32-bit hypervisor leftovers

... related to 8-byte cmpxchg having required special precautions
there.

Signed-off-by: Jan Beulich 

--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -259,10 +259,10 @@ hvm_emulate_cmpxchg(enum x86_segment seg
 struct sh_emulate_ctxt *sh_ctxt =
 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
 struct vcpu *v = current;
-unsigned long addr, old[2], new[2];
+unsigned long addr, old, new;
 int rc;
 
-if ( !is_x86_user_segment(seg) )
+if ( !is_x86_user_segment(seg) || bytes > sizeof(long) )
 return X86EMUL_UNHANDLEABLE;
 
 rc = hvm_translate_linear_addr(
@@ -270,15 +270,12 @@ hvm_emulate_cmpxchg(enum x86_segment seg
 if ( rc )
 return rc;
 
-old[0] = new[0] = 0;
-memcpy(old, p_old, bytes);
-memcpy(new, p_new, bytes);
-
-if ( bytes <= sizeof(long) )
-return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
-v, addr, old[0], new[0], bytes, sh_ctxt);
+old = new = 0;
+memcpy(&old, p_old, bytes);
+memcpy(&new, p_new, bytes);
 
-return X86EMUL_UNHANDLEABLE;
+return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
+   v, addr, old, new, bytes, sh_ctxt);
 }
 
 static const struct x86_emulate_ops hvm_shadow_emulator_ops = {
@@ -335,21 +332,18 @@ pv_emulate_cmpxchg(enum x86_segment seg,
 {
 struct sh_emulate_ctxt *sh_ctxt =
 container_of(ctxt, struct sh_emulate_ctxt, ctxt);
-unsigned long old[2], new[2];
+unsigned long old, new;
 struct vcpu *v = current;
 
-if ( !is_x86_user_segment(seg) )
+if ( !is_x86_user_segment(seg) || bytes > sizeof(long) )
 return X86EMUL_UNHANDLEABLE;
 
-old[0] = new[0] = 0;
-memcpy(old, p_old, bytes);
-memcpy(new, p_new, bytes);
-
-if ( bytes <= sizeof(long) )
-return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
-v, offset, old[0], new[0], bytes, sh_ctxt);
+old = new = 0;
+memcpy(&old, p_old, bytes);
+memcpy(&new, p_new, bytes);
 
-return X86EMUL_UNHANDLEABLE;

Re: [Xen-devel] HVMlite ABI specification DRAFT A

2016-02-05 Thread Roger Pau Monné
El 4/2/16 a les 21:17, Boris Ostrovsky ha escrit:
> On 02/04/2016 02:21 PM, Roger Pau Monné wrote:
>> El 4/2/16 a les 19:51, Samuel Thibault ha escrit:
>>> Boris Ostrovsky, on Thu 04 Feb 2016 13:38:02 -0500, wrote:
 On 02/04/2016 12:48 PM, Roger Pau Monné wrote:
> The format of the boot start info structure is the following
> (pointed to
> be %ebx):
>
>  struct hvm_start_info {
>  #define HVM_START_MAGIC_VALUE 0x336ec578
>  uint32_t magic; /* Contains the magic value
> 0x336ec578   */
>  /* ("xEn3" with the 0x80 bit
> of the "E" set).*/
>  uint32_t flags; /* SIF_xxx
> flags.*/
>  uint32_t cmdline_paddr; /* Physical address of the
> command line. */
>  uint32_t nr_modules;/* Number of modules passed to
> the kernel.   */
>  uint32_t modlist_paddr; /* Physical address of an
> array of   */
>  /*
> hvm_modlist_entry.*/
>  };
>
>  struct hvm_modlist_entry {
>  uint32_t paddr; /* Physical address of the
> module.   */
>  uint32_t size;  /* Size of the module in
> bytes.  */
>  };
 If there is more than one module, how is the guest expected to sort out
 which module is what?
>> In general I was expecting this would be done by position, or if that's
>> not enough an additional module (at either position 0 or n) should be
>> passed to contain that information.
> 
> Then we should specify it somehow --- e.g. that first module is always
> the ramdisk.

No, that's how Linux uses it, but it's not part of the spec at all. From
a Xen PoV, these 'modules' are just memory regions; it doesn't know
anything else about them, nor does it need to.

>>> +1
>>> We need that to pass parameters to gnumach modules.
>> Hm, parameters as in a string that's paired with a module, or something
>> more complex like a metadata block?
>>
>> I see that multiboot provides a string associated with each module, we
>> could do the same IMHO. I'm fine with adding it to the boot ABI, but I
>> would prefer if someone with access to such an OS does the actual
>> implementation of this feature.
>>
>> Just to be clear that we are on the same page, then the _entry struct
>> becomes:
>>
>> struct hvm_modlist_entry {
>> uint32_t paddr;
>> uint32_t size;
>> uint32_t cmdline_paddr;
>> };
>>
>> cmdline_paddr would work the same way as it does in the hvm_start_info
>> struct (ie: physical address of a zero-terminated ASCII string).
> 
> Doesn't this imply that strings should be part of this spec? Like "initrd"?

cmdline_paddr needs to be added to the spec, and I will do it in the
next revision (note that this will also require changes to the current
implementation). I'm not sure about your other part of the question,
making the concrete strings part of the implementation is completely out
of the spec, but I guess you mean something else which I don't get.

Roger.


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v3 3/3] tools: introduce parameter max_wp_ram_ranges.

2016-02-05 Thread Jan Beulich
>>> On 04.02.16 at 18:12,  wrote:
> Two angles on this.
> 
> First, assuming that limiting the number of ranges is what we want:  I'm
> not really a fan of using HVM_PARAMs for this, but as long as it's not
> considered a public interface (i.e., it could go away or disappear and
> everything would Just Work), then I wouldn't object.

The parameter setting interface generally is guest exposed, it's just
that many parameters can't be set by guests for themselves. Of
course the new #define could be put inside a Xen/tools conditional.

> Although I would ask: would it instead be suitable for now to just set
> the default limit for WP_RAM to 8196 in the hypervisor, since we do
> expect it to be tracking gpfn ranges rather than IO regions?  And if we
> determine in the future that more ranges are necessary, to then do the
> work of moving it to using p2m types (or exposing a knob to adjust it)?

That's what collides with disaggregation (as pointed out before):
Any tool stack component could then create wp-ram pages in
guests it controls, no matter whether those guests actually have
a need for such. I continue to think that if we indeed went this
route, then the host admin should be required to explicitly state
that the risks are acceptable (by allowing only certain guests to
actually create [many] of such pages).

> But (and this the other angle): is simply marking a numerical limit
> sufficient to avoid memory exhaustion? Is there a danger that after
> creating several guests, such that Xen was now running very low on
> memory, that a guest would (purposely or not) cause memory to be
> exhausted sometime further after boot, causing a system-wide DoS (or
> just general lack of stability)?

The guest itself can't, but other than fully privileged tool stack
components could, and that's still something we need to avoid.

> In the shadow / hap memory case, the memory is pre-allocated up front,
> which makes sure that nothing a guest does can cause Xen to run out of
> memory once it's booted.  Without pre-allocating it, it's still possible
> that the admin might start up enough VMs that exhaustion is *possible*,
> but won't be triggered until later (when the guest starts using more GTTs).
> 
> Although in fact this really points to the need for a general overhaul
> in how memory allocation on behalf of a domain is handled in general;
> that's a bigger chunk of work.

Well, right now there's pretty little allocation happening at run time
afaict, i.e. having a couple of pages available will generally keep
things going smoothly. (Once upon a time it was that there were -
iirc - no active runtime allocations at all, i.e. such had occurred only
in the context of things like domctls, where failure could be dealt
with by ballooning a few pages out of some domain and retrying.
I'm afraid that's not the case anymore nowadays, and even if it
was would collide with disaggregation, as a tool stack component
legitimately issuing a domctl may not have the privileges to
balloon other than the domain it controls, in particular not Dom0.)

> But in any case, it seems to me that we can entirely avoid the question
> of how many ranges might ever be necessary by starting with a fixed
> limit in the hypervisor, and then moving to a p2m-type based
> implementation if and when that becomes unsatisfactory.

With all of the above I think we should rather explore the p2m-type
based approach, in particular the suggestion Kevin has made to
direct all p2m_mmio_write_dm (a name which appears to have been
badly chosen, considering that we're talking about RAM pages
here) write accesses to the default ioreq server (utilizing that
upstream qemu doesn't itself register as such).

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v3 3/3] tools: introduce parameter max_wp_ram_ranges.

2016-02-05 Thread Jan Beulich
>>> On 05.02.16 at 04:44,  wrote:
> This is why Yu mentioned earlier whether we can just set a default
> limit which is good for majority of use cases, while extending our
> device mode to drop/recreate some shadow tables upon the limitation
> is hit. I think this matches how today's CPU shadow page table is
> implemented, which also has a limitation of how many shadow pages
> are allowed per-VM.

Except that un-shadowing some victim page in order to shadow one
to make forward progress can there be done in the page fault handler,
i.e. we don't need to up front determine the set of pages to be
shadowed for a certain operation to complete.

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v7 5/5] PCI: ACPI: Add a generic ACPI based host controller

2016-02-05 Thread Jayachandran Chandrashekaran Nair
Hi Bjorn,

On Fri, Feb 5, 2016 at 5:49 AM, Bjorn Helgaas  wrote:
> Hi Jayachandran,
>
> On Fri, Jan 29, 2016 at 02:35:40PM +0530, Jayachandran C wrote:
>> Add a simple ACPI based PCI host controller under config option
>> ACPI_PCI_HOST_GENERIC. This is done by providing an implementation
>> of pci_acpi_scan_root().
>>
>> The pci_mmcfg_list handling is done by the ACPI code, so we keep a
>> reference to the pci_mmcfg_region in sysdata. The ECAM region will
>> be already mapped, so map_bus can be implemented by using the
>> virt pointer for the pci_mmcfg_region. pci_generic_config_read
>> and pci_generic_config_write are used for config space read/write.
>>
>> Also, we provide implementations of raw_pci_read and raw_pci_write
>> that are needed by ACPI based on the pci_mmcfg_list.
>>
>> pci_acpi_set_companion() and acpi_pci_get_segment() are defined
>> using sysdata of generic ACPI host controller so that PCI domain
>> and ACPI companion are set in PCI code rather than platform code.
>>
>> This code is currently enabled only for ARM64.
>>
>> Signed-off-by: Jayachandran C 
>> ---
>>  drivers/acpi/Kconfig |   8 ++
>>  drivers/acpi/Makefile|   1 +
>>  drivers/acpi/pci_host_acpi.c | 186 
>> +++
>>  include/linux/pci-acpi.h |  17 
>>  4 files changed, 212 insertions(+)
>>  create mode 100644 drivers/acpi/pci_host_acpi.c
>
> I'm speaking a little bit out of turn here, because this is ACPI code,
> but I'm confused about pci_host_acpi.c.  We already have pci_root.c,
> which is *supposed* to be arch-independent.  I know pci_root.c is
> crufty and could be improved, but it does work today on x86 and ia64,
> and it handles some generic things that pci_host_acpi.c does not,
> e.g., _OSC, NUMA, host bridge hotplug, etc.
>
> I'd really like to see pci_root.c improved so it could work on x86,
> ia64, and arm64.  I'm sure that was probably the first thing you
> tried, so likely there are issues there.  Are they insurmountable?

pci_root.c leaves the implementation of pci_acpi_scan_root() to the
architecture. Implementing pci_acpi_scan_root needs a
pci_acpi_root_ops instance and a pci_ops instance and related functions.
The architecture is also expected to implement raw_pci_read and
raw_pci_write.

pci_host_acpi.c is a generic implementation of these using a sysdata
pointing to acpi_pci_root_info, and using a pointer to the pci_mmcfg_region
to access ECAM area, Maybe I can rename this file to
pci_acpi_host_generic.c to reflect this better.

arm64 is the only user of this generic implementation now. The config
option CONFIG_ACPI_PCI_HOST_GENERIC has to be set on the
architecture that chooses to use this generic code instead of its own
implementation.

JC.
[Sorry if the formatting is wrong, using webmail due to internal IT changes]

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v3 3/3] tools: introduce parameter max_wp_ram_ranges.

2016-02-05 Thread Yu, Zhang



On 2/4/2016 7:06 PM, George Dunlap wrote:

On Thu, Feb 4, 2016 at 9:38 AM, Yu, Zhang  wrote:

On 2/4/2016 5:28 PM, Paul Durrant wrote:

I assume this means that the emulator can 'unshadow' GTTs (I guess on an
LRU basis) so that it can shadow new ones when the limit has been exhausted?
If so, how bad is performance likely to be if we live with a lower limit
and take the hit of unshadowing if the guest GTTs become heavily fragmented?


Thank you, Paul.

Well, I was told the emulator have approaches to delay the shadowing of
the GTT till future GPU commands are submitted. By now, I'm not sure
about the performance penalties if the limit is set too low. Although
we are confident 8K is a secure limit, it seems still too high to be
accepted. We will perform more experiments with this new approach to
find a balance between the lowest limit and the XenGT performance.


Just to check some of my assumptions:

I assume that unlike memory accesses, your GPU hardware cannot
'recover' from faults in the GTTs. That is, for memory, you can take a
page fault, fix up the pagetables, and then re-execute the original
instruction; but so far I haven't heard of any devices being able to
seamlessly re-execute a transaction after a fault.  Is my
understanding correct?



Yes


If that is the case, then for every top-level value (whatever the
equivalent of the CR3), you need to be able to shadow the entire GTT
tree below it, yes?  You can't use a trick that the memory shadow
pagetables can use, of unshadowing parts of the tree and reshadowing
them.

So as long as the currently-in-use GTT tree contains no more than
$LIMIT ranges, you can unshadow and reshadow; this will be slow, but
strictly speaking correct.

What do you do if the guest driver switches to a GTT such that the
entire tree takes up more than $LIMIT entries?



Good question. Like the memory virtualization, IIUC, besides wp the
guest page tables, we can also track the updates of them when cr3 is
written or when a tlb flush occurs. We can consider to optimize our GPU
device model to achieve similar goal, e.g. when a root pointer(like
cr3) to the page table is written and when a set of commands is
submitted(Both situations are trigger by MMIO operations). But taking
consideration of performance, we may probably still need to wp all the
page tables when they are created at the first time. It requires a lot
optimization work in the device model side to find a balance between a
minimal wp-ed gpfns and a reasonable performance. We'd like to have a
try. :)

Yu

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v3 3/3] tools: introduce parameter max_wp_ram_ranges.

2016-02-05 Thread Yu, Zhang



On 2/5/2016 1:12 AM, George Dunlap wrote:

On 04/02/16 14:08, Jan Beulich wrote:

On 04.02.16 at 14:33,  wrote:

Jan Beulich writes ("Re: [Xen-devel] [PATCH v3 3/3] tools: introduce parameter
max_wp_ram_ranges."):

On 04.02.16 at 10:38,  wrote:

So another question is, if value of this limit really matters, will a
lower one be more acceptable(the current 256 being not enough)?


If you've carefully read George's replies, [...]


Thanks to George for the very clear explanation, and also to him for
an illuminating in-person discussion.

It is disturbing that as a result of me as a tools maintainer asking
questions about what seems to me to be a troublesome a user-visible
control setting in libxl, we are now apparently revisiting lower
layers of the hypervisor design, which have already been committed.

While I find George's line of argument convincing, neither I nor
George are maintainers of the relevant hypervisor code.  I am not
going to insist that anything in the hypervisor is done different and
am not trying to use my tools maintainer position to that end.

Clearly there has been a failure of our workflow to consider and
review everything properly together.  But given where we are now, I
think that this discussion about hypervisor internals is probably a
distraction.


While I recall George having made that alternative suggestion,
both Yu and Paul having reservations against it made me not
insist on that alternative. Instead I've been trying to limit some
of the bad effects that the variant originally proposed brought
with it. Clearly, with the more detailed reply George has now
given (involving areas where he is the maintainer for), I should
have been more demanding towards the exploration of that
alternative. That's clearly unfortunate, and I apologize for that,
but such things happen.

As to one of the patches already having been committed - I'm not
worried about that at all. We can always revert, that's why the
thing is called "unstable".


It looks like I should have been more careful to catch up on the current
state of things before I started arguing again -- please accept my
apologies.



In fact, I need to say thank you all for your patience and suggestions.
I'm thrilled to see XenGT is receiving so much attention. :)


I see that patch 2/3 addresses the gpfn/io question in the commit
message by saying, "Previously, a new hypercall or subop was suggested
to map write-protected pages into ioreq server. However, it turned out
handler of this new hypercall would be almost the same with the existing
pair - HVMOP_[un]map_io_range_to_ioreq_server, and there's already a
type parameter in this hypercall. So no new hypercall defined, only a
new type is introduced."

And I see that 2/3 internally separates the WP_RAM type into a separate
rangeset, whose size can be adjusted separately.

This addresses my complaint about the interface using gpfns rather than
MMIO ranges as an interface (somewhat anyway).  Sorry for not
acknowledging this at first.

The question of the internal implementation -- whether to use RB tree
rangesets, or radix trees (as apparently ARM memaccess does) or p2m
types -- is an internal implementation question.  I think p2m types is
long-term the best way to go, but it won't hurt to have the current
implementation checked in, as long as it doesn't have any impacts on the
stable interface.

At the moment, as far as I can tell, there's no way for libxl to even
run a version of qemu with XenGT enabled, so there's no real need for
libxl to be involved.



I agree.


The purpose of having the limit would putatively be to prevent a guest
being able to trigger an exhaustion of hypervisor memory by inducing the
device model to mark an arbitrary number of ranges as mmio_dm.

Two angles on this.

First, assuming that limiting the number of ranges is what we want:  I'm
not really a fan of using HVM_PARAMs for this, but as long as it's not
considered a public interface (i.e., it could go away or disappear and
everything would Just Work), then I wouldn't object.

Although I would ask: would it instead be suitable for now to just set
the default limit for WP_RAM to 8196 in the hypervisor, since we do
expect it to be tracking gpfn ranges rather than IO regions?  And if we


That is what we have suggesting in v9. But Jan proposed we leave this
option to the admin. And to some extent, I can understand his concern.


determine in the future that more ranges are necessary, to then do the
work of moving it to using p2m types (or exposing a knob to adjust it)?

But (and this the other angle): is simply marking a numerical limit
sufficient to avoid memory exhaustion? Is there a danger that after
creating several guests, such that Xen was now running very low on
memory, that a guest would (purposely or not) cause memory to be
exhausted sometime further after boot, causing a system-wide DoS (or
just general lack of stability)?



This worry sounds reasonable. So from this point of view, I guess value
of thi

Re: [Xen-devel] [PATCH v3 3/3] tools: introduce parameter max_wp_ram_ranges.

2016-02-05 Thread Yu, Zhang



On 2/5/2016 12:18 PM, Tian, Kevin wrote:

From: George Dunlap [mailto:george.dun...@citrix.com]
Sent: Friday, February 05, 2016 1:12 AM

On 04/02/16 14:08, Jan Beulich wrote:

On 04.02.16 at 14:33,  wrote:

Jan Beulich writes ("Re: [Xen-devel] [PATCH v3 3/3] tools: introduce parameter
max_wp_ram_ranges."):

On 04.02.16 at 10:38,  wrote:

So another question is, if value of this limit really matters, will a
lower one be more acceptable(the current 256 being not enough)?


If you've carefully read George's replies, [...]


Thanks to George for the very clear explanation, and also to him for
an illuminating in-person discussion.

It is disturbing that as a result of me as a tools maintainer asking
questions about what seems to me to be a troublesome a user-visible
control setting in libxl, we are now apparently revisiting lower
layers of the hypervisor design, which have already been committed.

While I find George's line of argument convincing, neither I nor
George are maintainers of the relevant hypervisor code.  I am not
going to insist that anything in the hypervisor is done different and
am not trying to use my tools maintainer position to that end.

Clearly there has been a failure of our workflow to consider and
review everything properly together.  But given where we are now, I
think that this discussion about hypervisor internals is probably a
distraction.


While I recall George having made that alternative suggestion,
both Yu and Paul having reservations against it made me not
insist on that alternative. Instead I've been trying to limit some
of the bad effects that the variant originally proposed brought
with it. Clearly, with the more detailed reply George has now
given (involving areas where he is the maintainer for), I should
have been more demanding towards the exploration of that
alternative. That's clearly unfortunate, and I apologize for that,
but such things happen.

As to one of the patches already having been committed - I'm not
worried about that at all. We can always revert, that's why the
thing is called "unstable".


It looks like I should have been more careful to catch up on the current
state of things before I started arguing again -- please accept my
apologies.


Thanks George for your careful thinking.



I see that patch 2/3 addresses the gpfn/io question in the commit
message by saying, "Previously, a new hypercall or subop was suggested
to map write-protected pages into ioreq server. However, it turned out
handler of this new hypercall would be almost the same with the existing
pair - HVMOP_[un]map_io_range_to_ioreq_server, and there's already a
type parameter in this hypercall. So no new hypercall defined, only a
new type is introduced."

And I see that 2/3 internally separates the WP_RAM type into a separate
rangeset, whose size can be adjusted separately.

This addresses my complaint about the interface using gpfns rather than
MMIO ranges as an interface (somewhat anyway).  Sorry for not
acknowledging this at first.

The question of the internal implementation -- whether to use RB tree
rangesets, or radix trees (as apparently ARM memaccess does) or p2m
types -- is an internal implementation question.  I think p2m types is
long-term the best way to go, but it won't hurt to have the current
implementation checked in, as long as it doesn't have any impacts on the
stable interface.


I'm still trying to understand your suggestion vs. this one. Today we
already have a p2m_mmio_write_dm type. It's there already, and any
write fault hitting that type will be delivered to the ioreq server. The next
open question is how an ioreq server could know whether it should handle this
request or not, which is why some tracking structures (either RB/radix)
are created to maintain that specific information. It's under the assumption
that multiple ioreq servers co-exist, so a loop check on all ioreq servers
is required to identify the right target. And multiple ioreq servers are
a real case in XenGT, because our vGPU device model is in the kernel, as
part of Intel i915 graphics driver. So at least two ioreq servers already
exist, with one routing to XenGT in Dom0 kernel space and the other
to the default Qemu in Dom0 user.

In your long-term approach with p2m types, it looks like you are proposing
encoding ioreq server ID in p2m type directly (e.g. 4bits), which then
eliminates the need of tracking in ioreq server side so the whole
security concern is gone. And no limitation at all. Because available
p2m bits are limited, as Andrew pointed out, so it might be reasonable
to implement this approach when a new p2t structure is added, which
is why we consider it as a long-term approach.

Please correct me if the above understanding is incorrect.



At the moment, as far as I can tell, there's no way for libxl to even
run a version of qemu with XenGT enabled, so there's no real need for
libxl to be involved.


no way because we haven't upstreamed all toolstack changes yet, but
we should still discuss the requirement as we've been d

Re: [Xen-devel] HVMlite ABI specification DRAFT A

2016-02-05 Thread Jan Beulich
>>> On 04.02.16 at 19:22,  wrote:
> On 04/02/16 17:48, Roger Pau Monné wrote:
>>  - HVMlite hardware domain: can we get rid of the PHYSDEV ops and PIRQ 
>>event channels?
>>  - HVMlite PCI-passthrough: can we get rid of pciback/pcifront?
> 
> +1000, for both.

I'm a little lost here: However nice that would be, how do you
envision this to work? For the first one, as pointed out before,
there are physdevops which the hardware domain needs to
issue to assist Xen (as a result of parsing and executing AML).
And for the second one, something needs to translate virtual
guest PCI topology to host physical one as well as mediate
config space accesses.

>>  * `eflags`: bit 17 (VM) must be cleared. Bit 9 (IF) must be cleared.
>>Bit 8 (TF) must be cleared. Other bits are all unspecified.
> 
> I would also specify that the direction flag shall be clear, to prevent
> all kernels needing to `cld` on entry.

In which case IOPL and AC state should perhaps also be nailed down?
Possibly even all of the control ones (leaving only the status flags
unspecified)?

Jan

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH V2] arm: p2m.c bug-fix: hypervisor hang on __p2m_get_mem_access

2016-02-05 Thread Corneliu ZUZU

On 2/3/2016 2:23 PM, Ian Campbell wrote:

On Wed, 2016-02-03 at 13:54 +0200, Corneliu ZUZU wrote:

I thought this mail was not sent properly (didn't find it any longer on
the web (?)) and I resent it just earlier.
I figured it must've been the fact that I forgot to put a "Changed since
v1" section & that I didn't include an
"--in-reply-to" option. Apparently it was actually sent correctly.
Sorry, ignore the last one (which contains a "Changed since v1" section).

OK, please check that what is currently in xen.git#staging is what you
think should be there.

Ian.


Yep, just checked, the changes are there.

Corneliu.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v3 3/3] tools: introduce parameter max_wp_ram_ranges.

2016-02-05 Thread Paul Durrant
> -Original Message-
> From: Jan Beulich [mailto:jbeul...@suse.com]
> Sent: 05 February 2016 08:33
> To: George Dunlap
> Cc: Andrew Cooper; Ian Campbell; Paul Durrant; Stefano Stabellini; Wei Liu;
> Ian Jackson; Kevin Tian; zhiyuan...@intel.com; Zhang Yu; xen-
> de...@lists.xen.org; Keir (Xen.org)
> Subject: Re: [Xen-devel] [PATCH v3 3/3] tools: introduce parameter
> max_wp_ram_ranges.
> 
> >>> On 04.02.16 at 18:12,  wrote:
> > Two angles on this.
> >
> > First, assuming that limiting the number of ranges is what we want:  I'm
> > not really a fan of using HVM_PARAMs for this, but as long as it's not
> > considered a public interface (i.e., it could go away or disappear and
> > everything would Just Work), then I wouldn't object.
> 
> The parameter setting interface generally is guest exposed, it's just
> that many parameters can't be set by guests for themselves. Of
> course the new #define could be put inside a Xen/tools conditional.
> 
> > Although I would ask: would it instead be suitable for now to just set
> > the default limit for WP_RAM to 8196 in the hypervisor, since we do
> > expect it to be tracking gpfn ranges rather than IO regions?  And if we
> > determine in the future that more ranges are necessary, to then do the
> > work of moving it to using p2m types (or exposing a knob to adjust it)?
> 
> That's what collides with disaggregation (as pointed out before):
> Any tool stack component could then create wp-ram pages in
> guests it controls, no matter whether those guests actually have
> a need for such. I continue to think that if we indeed got this
> route, then the host admin should be required to explicitly state
> that the risks are acceptable (by allowing only certain guests to
> actually create [many] of such pages).
> 
> > But (and this the other angle): is simply marking a numerical limit
> > sufficient to avoid memory exhaustion? Is there a danger that after
> > creating several guests, such that Xen was now running very low on
> > memory, that a guest would (purposely or not) cause memory to be
> > exhausted sometime further after boot, causing a system-wide DoS (or
> > just general lack of stability)?
> 
> The guest itself can't, but other than fully privileged tool stack
> components could, and that's still something we need to avoid.
> 
> > In the shadow / hap memory case, the memory is pre-allocated up front,
> > which makes sure that nothing a guest does can cause Xen to run out of
> > memory once it's booted.  Without pre-allocating it, it's still possible
> > that the admin might start up enough VMs that exhaustion is *possible*,
> > but won't be triggered until later (when the guest starts using more GTTs).
> >
> > Although in fact this really points to the need for a general overhaul
> > in how memory allocation on behalf of a domain is handled in general;
> > that's a bigger chunk of work.
> 
> Well, right now there's pretty little allocation happening at run time
> afaict, i.e. having a couple of pages available will generally keep
> things going smoothly. (Once upon a time it was that there were -
> iirc - no active runtime allocations at all, i.e. such had occurred only
> in the context of things like domctls, where failure could be dealt
> with by ballooning a few pages out of some domain and retrying.
> I'm afraid that's not the case anymore nowadays, and even if it
> was would collide with disaggregation, as a tool stack component
> legitimately issuing a domctl may not have the privileges to
> balloon other than the domain it controls, in particular not Dom0.)
> 
> > But in any case, it seems to me that we can entirely avoid the question
> > of how many ranges might ever be necessary by starting with a fixed
> > limit in the hypervisor, and then moving to a p2m-type based
> > implementation if and when that becomes unsatisfactory.
> 
> With all of the above I think we should rather explore the p2m-type
> based approach, in particular the suggestion Kevin has made to
> direct all p2m_mmio_write_dm (a name which appears to have been
> badly chosen, considering that we're talking about RAM pages
> here) write accesses to the default ioreq server (utilizing that
> upstream qemu doesn't itself register as such).
> 

Utilizing the default server is a backwards step. GVT-g would have to use the 
old HVM_PARAM mechanism to cause its emulator to become the default. I think a 
more appropriate mechanism would be p2m_mmio_write_dm to become something like 
'p2m_ioreq_server_write' and then have a hypercall to allow it to be mapped to 
a particular ioreq server.
Obviously only one could claim it but, with a p2t, the bit could be re-purposed 
to simply mean 'go look in the p2t' for more information and then the p2t could 
be structured to allow emulations to be steered to one of many ioreq servers 
(for read and/or write emulation).

  Paul

> Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

[Xen-devel] [PATCH OSSTEST v3 1/2] Move collectversions from ts-xen-build into Osstest::BuildSupport

2016-02-05 Thread Ian Campbell
I'm going to have a need for it elsewhere.

Signed-off-by: Ian Campbell 
Acked-by: Ian Jackson 
---
 Osstest/BuildSupport.pm | 12 
 ts-xen-build| 13 +
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/Osstest/BuildSupport.pm b/Osstest/BuildSupport.pm
index 933f6e1..a183546 100644
--- a/Osstest/BuildSupport.pm
+++ b/Osstest/BuildSupport.pm
@@ -42,6 +42,7 @@ BEGIN {
 
   xendist
   $xendist
+  collect_xen_built_versions
 
   submodulefixup submodule_have submodule_find
 
@@ -84,6 +85,17 @@ sub xendist () {
($ho, 'xendist', '', $r{"buildjob"});
 }
 
+sub collect_xen_built_versions () {
+my $tools="$builddir/xen/tools";
+my $extras="$builddir/xen/extras";
+store_revision($ho, 'qemu', "$tools/ioemu-dir", 1);
+store_revision($ho, 'qemu', "$tools/qemu-xen-traditional-dir", 1);
+store_revision($ho, 'qemuu', "$tools/qemu-xen-dir", 1);
+store_revision($ho, 'seabios', "$tools/firmware/seabios-dir", 1);
+store_revision($ho, 'ovmf', "$tools/firmware/ovmf-dir", 1);
+store_revision($ho, 'minios', "$extras/mini-os", 1);
+}
+
 #- submodules -
 
 sub submodulefixup () {
diff --git a/ts-xen-build b/ts-xen-build
index 8f92729..382fe62 100755
--- a/ts-xen-build
+++ b/ts-xen-build
@@ -151,17 +151,6 @@ END
 }
 }
 
-sub collectversions () {
-my $tools="$builddir/xen/tools";
-my $extras="$builddir/xen/extras";
-store_revision($ho, 'qemu', "$tools/ioemu-dir", 1);
-store_revision($ho, 'qemu', "$tools/qemu-xen-traditional-dir", 1);
-store_revision($ho, 'qemuu', "$tools/qemu-xen-dir", 1);
-store_revision($ho, 'seabios', "$tools/firmware/seabios-dir", 1);
-store_revision($ho, 'ovmf', "$tools/firmware/ovmf-dir", 1);
-store_revision($ho, 'minios', "$extras/mini-os", 1);
-}
-
 sub divide () {
 # Only move hv to xeninstall, so that we can have
 # xenpolicy in tools tarball.
@@ -248,7 +237,7 @@ sub trapping ($) {
 checkout();
 
 trapping(\&build);
-trapping(\&collectversions);
+trapping(\&collect_xen_built_versions);
 
 die "*** something failed:\n\n".(join "\n\n",@probs)."\n** something failed"
 if @probs;
-- 
2.6.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH OSSTEST v3 2/2] Add a weekly coverity flight

2016-02-05 Thread Ian Campbell
This primarily consists of ts-coverity-{build,upload} and
make-coverity-flight which constructs the sole job.

The branch is named "xen-unstable-coverity" which matches various xen*
in the cr-* scripts. Places which needed special treatment are
handled by matching xen-*-coverity, which leaves the possibility of
xen-4.7-testing-coverity etc in the future, but note that care would
be needed so that coverity's tracking of new vs existing issues would
likely be confused by uploading different branches without
differentiating somehow (I don't know how this is supposed to work).

The most recently scanned revision is pushed to a new
coverity-scanned/master branch in the usual xen.git, tests are run on
the master branch.

I initially thought that $c{CoverityEmail} would need to be an actual
account registered with scan, however a manual experiment using
email=secur...@xen.org was accepted by the service. An "analysis
complete" message was sent to security@ while individual results mails
were sent to each member of the coverity project who was configured to
receive them. I think this is what we want. The "analysis complete"
mail contained no sensitive data, but also no real information other
than "success" (or presumably "failure" if that were to be the case).
I think going to security@ is probably OK.

The upload URL defaults to a dummy local URL, which will fail (it
would be possible in principle to put a stunt CGI there though). When
run with "cr-daily-branch --real" (i.e. in full on production mode)
then this is set instead to the value of CoverityUploadUrl from the
config (production-config etc). This means that adhoc and play runs
still exercise all the code (but the curl will fail) while --real runs
upload to a site-configurable location. (Note that the URL includes
the coverity project name, which would likely differ for different
instances).

I have run this via cr-daily-branch --real on the production infra
and it did upload as expected (flight 80516). Since
master==coverity-tested/master at this point it came out as a baseline
test which didn't attempt ap-push, which I would have expected to fail
anyway since it was running as my user in the colo which cannot push
to osstest@xenbits.

In my experiments the curl command took ~35 minutes to complete (rate
in the 100-200k range). Not sure if this is a problem, but use curl
--max-time passing it an hour to bound things. Note that curl is run
on the controller (via system_checked).  timeout etc.

Note that the token must be supplied with 
Cc: Andrew Cooper 
---

v2:
 - Split move of collect_xen_built_versions() into separate patch
 - Implemented support for coverity_upload = true (but don't yet set
   it)
 - Add host_hostflags to the job so it can actually run somewhere.
 - Call tsreadconfig() before referencing $r{coverity_upload} so that
   $r is actually populated.
 - use token=http://www.gnu.org/licenses/>.
+
+
+set -e -o posix
+
+branch=$1
+xenbranch=$2
+blessing=$3
+buildflight=$4
+
+flight=`./cs-flight-create $blessing $branch`
+
+. ./cri-common
+. ./ap-common
+. ./mfi-common
+
+# daily-cron-settings-real will have set this to $c{CoverityUploadUrl}
+# in real runs. .
+: ${OSSTEST_COVERITY_URL:=http://localhost/xen-osstest/stunt-coverity}
+
+defsuite=`getconfig DebianSuite`
+
+arch=amd64
+suite=$defsuite
+
+build_hostflags=share-build-$suite-$arch,arch-$arch,suite-$suite,purpose-build
+
+./cs-job-create $flight coverity-$arch coverity \
+   arch=$arch host_hostflags=$build_hostflags \
+   tree_xen=$TREE_XEN \
+   revision_xen=$REVISION_XEN \
+   coverity_submit_url=${OSSTEST_COVERITY_URL}
+
+echo $flight
+
+# Local variables:
+# mode: sh
+# sh-basic-offset: 2
+# indent-tabs-mode: nil
+# End:
diff --git a/production-config b/production-config
index f2f0584..410893e 100644
--- a/production-config
+++ b/production-config
@@ -100,6 +100,13 @@ TftpGrubVersion -XX-XX
 XenUsePath /usr/groups/xencore/systems/bin/xenuse
 XenUseUser osstest
 
+# Results might include potential vulnerabilities.
+CoverityEmail secur...@xen.org
+# This is only read from daily-cron-settings-real, everything else
+# gets the default/dummy path
+CoverityUploadUrl https://scan.coverity.com/builds?project=XenProject
+CoverityTools cov-analysis-linux64-7.7.0.4.tar.gz
+
 # We use the IP address because Citrix can't manage reliable nameservice
 #DebianMirrorHost debian.uk.xensource.com
 #DebianMirrorHost 10.80.16.196
diff --git a/sg-run-job b/sg-run-job
index 20ebb64..3e0f966 100755
--- a/sg-run-job
+++ b/sg-run-job
@@ -445,6 +445,12 @@ proc prepare-build-host {} {
 run-ts . host-build-prep ts-xen-build-prep
 }
 
+proc need-hosts/coverity {} { return BUILD }
+proc run-job/coverity {} {
+run-ts . = ts-coverity-build + host
+run-ts . = ts-coverity-upload + host
+}
+
 #-- main program --
 
 jobdb::set-flight
diff --git a/ts-coverity-build b/ts-coverity-build
new file mode 100755
index 000..1d8bd0c
--- /dev/null
+++ b/ts-coverity-build
@@ -

Re: [Xen-devel] [PATCH v7 5/5] PCI: ACPI: Add a generic ACPI based host controller

2016-02-05 Thread Lorenzo Pieralisi
On Fri, Feb 05, 2016 at 02:05:37PM +0530, Jayachandran Chandrashekaran Nair 
wrote:

[...]

> pci_host_acpi.c is a generic implementation of these using a sysdata
> pointing to acpi_pci_root_info, and using a pointer to the pci_mmcfg_region
> to access ECAM area, Maybe I can rename this file to
> pci_acpi_host_generic.c to reflect this better.

Maybe you should stop sending this series and work with Tomasz to
get this done, you are confusing everyone and I am really really
annoyed about this.

Do you realize there is no point in having two patch series doing
the same thing and wasting everyone's review time ?

Do you realize he started this work long before you and went through
several rounds of review already (I told you before but in case you
forgot) ?

Tomasz posted a version yesterday, integrating comments following months
of review and testing and I think it is ready to get upstream:

https://lkml.org/lkml/2016/2/4/646

Did you even consider reviewing his code or helping him instead of
churning out more patches doing the *SAME* thing ?

Do you want all of us to go through your code and re-fix what has
already been fixed in Tomasz's series with the end result of missing
yet another merge window ?

This is really annoying, stop it please, really.

Thank you,
Lorenzo

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] HVMlite ABI specification DRAFT A

2016-02-05 Thread Roger Pau Monné
El 5/2/16 a les 10:12, Jan Beulich ha escrit:
 On 04.02.16 at 19:22,  wrote:
>> On 04/02/16 17:48, Roger Pau Monné wrote:
>>>  - HVMlite hardware domain: can we get rid of the PHYSDEV ops and PIRQ 
>>>event channels?
>>>  - HVMlite PCI-passthrough: can we get rid of pciback/pcifront?
>>
>> +1000, for both.
> 
> I'm a little lost here: However nice that would be, how do you
> envision this to work? For the first one, as pointed out before,
> there are physdevops which the hardware domain needs to
> issue to assist Xen (as a result of parsing and executing AML).
> And for the second one, something needs to translate virtual
> guest PCI topology to host physical one as well as mediate
> config space accesses.

I've got a little carried over in this first statement, inside of the
"ACPI" section in the document below there's a list of physdevops that
we cannot get rid of, however that's considerably smaller than the
current set. We are at least going to keep PHYSDEVOP_pci_device_add and
PHYSDEVOP_pci_mmcfg_reserved.

Regarding PIRQs, for MSI/MSI-X I think we already have the ability to
trap and emulate IIRC, which should allow us to detect when the hardware
domain is trying to set them and act consequently. Xen should receive
the native interrupts and inject them to the guest, but I assume this is
quite similar to what's already done for PCI-passthrough.

For legacy PCI interrupts, we can parse the MADT inside of Xen in order
to properly setup the lines/overwrites and inject the interrupts that
are not handled by Xen straight into the hardware domain. This will
require us to be able to emulate the same topology as what is found in
native (eg: if there are two IO APICs in the hardware we should also
provide two emulated ones to the hw domain).

As for PCI config space accesses, don't we already do that? We trap on
access to the 0xcf8 io port.

>>>  * `eflags`: bit 17 (VM) must be cleared. Bit 9 (IF) must be cleared.
>>>Bit 8 (TF) must be cleared. Other bits are all unspecified.
>>
>> I would also specify that the direction flag shall be clear, to prevent
>> all kernels needing to `cld` on entry.
> 
> In which case IOPL and AC state should perhaps also be nailed down?
> Possibly even all of the control ones (leaving only the status flags
> unspecified)?

Status flag? Why don't we just say that all user-settable bits in the
status register will be set to 0 (or cleared)?

Roger.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [BUG?] qemuu only built with i386-softmmu

2016-02-05 Thread Ian Campbell
On Fri, 2016-02-05 at 08:09 +1100, Steven Haigh wrote:
> In building my Xen 4.6.0 packages, I disable qemu-traditional and ONLY
> build qemu-upstream - however as the value for i386-softmmu is not based
> on variables, I'm not sure this makes a difference.

QEMU in a Xen system only provides device model (DM) emulation and not any
CPU instruction emulation, so the nominal arch doesn't actually matter and
Xen builds i386 everywhere as a basically arbitrary choice.

It happens that the Xen DM part of QEMU is quite closely tied to the x86
scaffolding for various historical reasons, so we end up using qemu-system-
i386 even e.g. on ARM!

This comes up a lot, So I've also pasted the two paras above into a new
section in http://wiki.xenproject.org/wiki/QEMU_Upstream . If anyone thinks
the above is inaccurate then please edit the wiki (and post here too if you
like).

One thing I wasn't sure on (so didn't write) is whether the second paragraph
could have an extra sentence:

If you are using a distro supplied QEMU then the qemu-system-x86_64
could also be used, but it makes no practical difference to the
functionality of the system.

I wasn't sure if that was true (I suspect it is) and in any case I think
various bits of libxl etc will look for qemu-system-i386 in various paths
so a user would need to try reasonably hard to do so by giving an explicit
path and there is no real reason to do so maybe better not to muddy the
waters?

Ian.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [distros-debian-jessie test] 38726: trouble: broken/pass

2016-02-05 Thread Platform Team regression test user
flight 38726 distros-debian-jessie real [real]
http://osstest.xs.citrite.net/~osstest/testlogs/logs/38726/

Failures and problems with tests :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 test-amd64-i386-amd64-jessie-netboot-pygrub 3 host-install(3) broken REGR. vs. 
38713

Tests which did not succeed, but are not blocking:
 test-armhf-armhf-armhf-jessie-netboot-pygrub 12 saverestore-support-check fail 
never pass
 test-armhf-armhf-armhf-jessie-netboot-pygrub 11 migrate-support-check fail 
never pass

baseline version:
 flight   38713

jobs:
 build-amd64  pass
 build-armhf  pass
 build-i386   pass
 build-amd64-pvopspass
 build-armhf-pvopspass
 build-i386-pvops pass
 test-amd64-amd64-amd64-jessie-netboot-pvgrub pass
 test-amd64-i386-i386-jessie-netboot-pvgrub   pass
 test-amd64-i386-amd64-jessie-netboot-pygrub  broken  
 test-armhf-armhf-armhf-jessie-netboot-pygrub pass
 test-amd64-amd64-i386-jessie-netboot-pygrub  pass



sg-report-flight on osstest.xs.citrite.net
logs: /home/osstest/logs
images: /home/osstest/images

Logs, config files, etc. are available at
http://osstest.xs.citrite.net/~osstest/testlogs/logs

Test harness code can be found at
http://xenbits.xensource.com/gitweb?p=osstest.git;a=summary


Push not applicable.


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] new idl helper, append to Array

2016-02-05 Thread Olaf Hering
On Thu, Feb 04, Olaf Hering wrote:

> On Thu, Feb 04, Ian Campbell wrote:
> 
> > I think the append_to variant is probably least gross.
> 
> libxl_device_vscsidev_append_to_vscsictrl() would work too.

While looking at the MERGE macro in libxl.c, a _remove_from could be
added as well. I have not checked if there are other users beside the
the MERGE function.

Olaf

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [QUESTION] x86_64 -> i386/i686 CPU translation between xl and qemu binary?

2016-02-05 Thread Ian Campbell
On Fri, 2016-02-05 at 00:12 +, Andrew Cooper wrote:
> On 04/02/2016 23:14, Steven Haigh wrote:
> > On 2016-02-05 09:22, Andrew Cooper wrote:
> > > On 04/02/2016 22:06, Alex Braunegg wrote:
> > > Qemu is only used for device emulation when used with Xen, not CPU
> > > emulation.

I answered Steve's original thread before I saw this and also added
http://wiki.xenproject.org/wiki/QEMU_Upstream#Why_is_qemu-system-i386_used_even_on_x86_64_and_even_non-x86.3F
to the wiki.

If anyone thinks that anything mentioned in this subthread is worth recording
there then please do update the wiki.

Ian.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [BUG?] qemuu only built with i386-softmmu

2016-02-05 Thread Steven Haigh
On 05/02/16 20:51, Ian Campbell wrote:
> On Fri, 2016-02-05 at 08:09 +1100, Steven Haigh wrote:
>> In building my Xen 4.6.0 packages, I disable qemu-traditional and ONLY
>> build qemu-upstream - however as the value for i386-softmmu is not based
>> on variables, I'm not sure this makes a difference.
> 
> QEMU in a Xen system only provides device model (DM) emulation and not any
> CPU instruction emulation, so the nominal arch doesn't actually matter and
> Xen builds i386 everywhere as a basically arbitrary choice.
> 
> It happens that the Xen DM part of QEMU is quite closely tied to the x86
> scaffolding for various historical reasons, so we end up using qemu-system-
> i386 even e.g. on ARM!
> 
> This comes up a lot, So I've also pasted the two paras above into a new
> section in http://wiki.xenproject.org/wiki/QEMU_Upstream . If anyone thinks
> the above is inaccurate then please edit the wiki (and post here too if you
> like).

I think this is a great addition that explains the situation well.
Documenting these is always a good thing.

> 
> One thing I wasn't sure on (so didn't write) is whether the second paragraph
> could have an extra sentence:
> 
> If you are using a distro supplied QEMU then the qemu-system-x86_64
> could also be used, but it makes no practical difference to the
> functionality of the system.
> 
> I wasn't sure if that was true (I suspect it is) and in any case I think
> various bits of libxl etc will look for qemu-system-i386 in various paths
> so a user would need to try reasonably hard to do so by giving an explicit
> path and there is no real reason to do so maybe better not to muddy the
> waters?

Maybe go along the lines of:

"There is no practical difference between qemu-system-i386 and
qemu-system-x86_64 therefore both can be interchanged freely."

-- 
Steven Haigh

Email: net...@crc.id.au
Web: https://www.crc.id.au
Phone: (03) 9001 6090 - 0412 935 897



signature.asc
Description: OpenPGP digital signature
___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] new idl helper, append to Array

2016-02-05 Thread Ian Campbell
On Fri, 2016-02-05 at 10:55 +0100, Olaf Hering wrote:
> On Thu, Feb 04, Olaf Hering wrote:
> 
> > On Thu, Feb 04, Ian Campbell wrote:
> > 
> > > I think the append_to variant is probably least gross.
> > 
> > libxl_device_vscsidev_append_to_vscsictrl() would work too.
> 
> While looking at the MERGE macro in libxl.c, a _remove_from could be
> added as well. I have not checked if there are other users beside the
> the MERGE function.

If you have the tuits please feel free to arrange to autogenerate as much
as you like ;-)

(If you are unsure about the general utility you could always make them
internal only for now)

Ian.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [BUG?] qemuu only built with i386-softmmu

2016-02-05 Thread Ian Campbell
On Fri, 2016-02-05 at 20:57 +1100, Steven Haigh wrote:
> 
> > One thing I wasn't sure on (so didn't write) is whether the second
> > paragraph
> > could have an extra sentence:
> > 
> > If you are using a distro supplied QEMU then the qemu-system-x86_64
> > could also be used, but it makes no practical difference to the
> > functionality of the system.
> > 
> > I wasn't sure if that was true (I suspect it is) and in any case I
> > think
> > various bits of libxl etc will look for qemu-system-i386 in various
> > paths
> > so a user would need to try reasonably hard to do so by giving an
> > explicit
> > path and there is no real reason to do so maybe better not to muddy the
> > waters?
> 
> Maybe go along the lines of:
> 
> "There is no practical difference between qemu-system-i386 and
> qemu-system-x86_64 therefore both can be interchanged freely."

Thanks, this is a good wording if it is indeed true (I'll wait for
confirmation of that before I write it on the wiki).

Ian.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v7 2/5] docs: add vscsi to xenstore-paths.markdown

2016-02-05 Thread Olaf Hering
Signed-off-by: Olaf Hering 
Acked-by: Ian Campbell 
Cc: Ian Campbell 
Cc: Ian Jackson 
Cc: Jan Beulich 
Cc: Keir Fraser 
Cc: Tim Deegan 
---
 docs/misc/xenstore-paths.markdown | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/docs/misc/xenstore-paths.markdown 
b/docs/misc/xenstore-paths.markdown
index 6a6fda9..76f67b1 100644
--- a/docs/misc/xenstore-paths.markdown
+++ b/docs/misc/xenstore-paths.markdown
@@ -265,6 +265,11 @@ A virtual keyboard device frontend. Described by
 A virtual network device frontend. Described by
 [xen/include/public/io/netif.h][NETIF]
 
+ ~/device/vscsi/$DEVID/* []
+
+A virtual scsi device frontend. Described by
+[xen/include/public/io/vscsiif.h][SCSIIF]
+
  ~/console/* []
 
 The primary PV console device. Described in [console.txt](console.txt)
@@ -335,6 +340,10 @@ A virtual keyboard device backend. Described by
 A virtual network device backend. Described by
 [xen/include/public/io/netif.h][NETIF]
 
+ ~/backend/vscsi/$DOMID/$DEVID/* []
+
+A PV SCSI backend.
+
  ~/backend/console/$DOMID/$DEVID/* []
 
 A PV console backend. Described in [console.txt](console.txt)
@@ -525,6 +534,7 @@ domain instead of a daemon in dom0.
 [KBDIF]: 
http://xenbits.xen.org/docs/unstable/hypercall/include,public,io,kbdif.h.html
 [LIBXLMEM]: http://xenbits.xen.org/docs/unstable/misc/libxl_memory.txt
 [NETIF]: 
http://xenbits.xen.org/docs/unstable/hypercall/include,public,io,netif.h.html
+[SCSIIF]: 
http://xenbits.xen.org/docs/unstable/hypercall/include,public,io,vscsiif.h.html
 [SI]: 
http://xenbits.xen.org/docs/unstable/hypercall/include,public,xen.h.html#Struct_start_info
 [VCPU]: 
http://xenbits.xen.org/docs/unstable/hypercall/include,public,vcpu.h.html
 [XSWIRE]: 
http://xenbits.xen.org/docs/unstable/hypercall/include,public,io,xs_wire.h.html

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v7 5/5] Scripts to create and delete xen-scsiback nodes in Linux target framework

2016-02-05 Thread Olaf Hering
Just to make them public, not meant for merging:
The scripts used during development to create a bunch of SCSI devices in
dom0 using the Linux target framework. targetcli3 and rtslib3 is used.

A patch is required for python-rtslib:
http://article.gmane.org/gmane.linux.scsi.target.devel/8146

Signed-off-by: Olaf Hering 
---
 tools/misc/Makefile  |   4 +
 tools/misc/target-create-xen-scsiback.sh | 135 +++
 tools/misc/target-delete-xen-scsiback.sh |  41 ++
 3 files changed, 180 insertions(+)

diff --git a/tools/misc/Makefile b/tools/misc/Makefile
index a2ef0ec..180c9f5 100644
--- a/tools/misc/Makefile
+++ b/tools/misc/Makefile
@@ -35,6 +35,8 @@ INSTALL_SBIN += $(INSTALL_SBIN-y)
 
 # Everything to be installed in a private bin/
 INSTALL_PRIVBIN+= xenpvnetboot
+INSTALL_PRIVBIN+= target-create-xen-scsiback.sh
+INSTALL_PRIVBIN+= target-delete-xen-scsiback.sh
 
 # Everything to be installed
 TARGETS_ALL := $(INSTALL_BIN) $(INSTALL_SBIN) $(INSTALL_PRIVBIN)
@@ -45,6 +47,8 @@ TARGETS_COPY += xen-ringwatch
 TARGETS_COPY += xencons
 TARGETS_COPY += xencov_split
 TARGETS_COPY += xenpvnetboot
+TARGETS_COPY += target-create-xen-scsiback.sh
+TARGETS_COPY += target-delete-xen-scsiback.sh
 
 # Everything which needs to be built
 TARGETS_BUILD := $(filter-out $(TARGETS_COPY),$(TARGETS_ALL))
diff --git a/tools/misc/target-create-xen-scsiback.sh 
b/tools/misc/target-create-xen-scsiback.sh
new file mode 100755
index 000..96d4c39
--- /dev/null
+++ b/tools/misc/target-create-xen-scsiback.sh
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+unset LANG
+unset ${!LC_*}
+set -x
+set -e
+
+modprobe --version
+targetcli --version
+udevadm --version
+blockdev --version
+parted --version
+sfdisk --version
+mkswap --version
+
+configfs=/sys/kernel/config
+target_path=$configfs/target
+
+num_luns=4
+num_hosts=4
+
+case "$1" in
+   -p)
+   backend="pvops"
+   ;;
+   -x)
+   backend="xenlinux"
+   ;;
+   *)
+   : "usage: $0 [-p|-x]"
+   if grep -qw xenfs$ /proc/filesystems
+   then
+   backend="pvops"
+   else
+   backend="xenlinux"
+   fi
+   ;;
+esac
+
+get_wwn() {
+   sed '
+   s@-@@g
+   s@^\(.\{16\}\)\(.*\)@\1@
+   ' /proc/sys/kernel/random/uuid
+}
+
+if test ! -d "${target_path}"
+then
+   modprobe -v configfs
+   mount -vt configfs configfs $configfs
+   modprobe -v target_core_mod
+fi
+if test "${backend}" = "pvops"
+then
+   modprobe -v xen-scsiback
+fi
+
+host=0
+while test $host -lt $num_hosts
+do
+   host=$(( $host + 1 ))
+   lun=0
+   loopback_wwn="naa.`get_wwn`"
+   pvscsi_wwn="naa.`get_wwn`"
+   targetcli /loopback create ${loopback_wwn}
+   if test "${backend}" = "pvops"
+   then
+   targetcli /xen-pvscsi create ${pvscsi_wwn}
+   fi
+   while test $lun -lt $num_luns
+   do
+   : h $host l $lun
+   f_file=/dev/shm/Fileio.${host}.${lun}.file
+   f_uuid=/dev/shm/Fileio.${host}.${lun}.uuid
+   f_link=/dev/shm/Fileio.${host}.${lun}.link
+   fileio_name="fio_${host}.${lun}"
+   pscsi_name="ps_${host}.${lun}"
+
+   targetcli /backstores/fileio create name=${fileio_name} 
"file_or_dev=${f_file}" size=$((1024*1024 * 8 )) sparse=true
+   targetcli /loopback/${loopback_wwn}/luns create 
/backstores/fileio/${fileio_name} $lun
+
+   udevadm settle --timeout=4
+
+   vpd_uuid="`sed -n '/^T10 VPD Unit Serial 
Number:/s@^[^:]\+:[[:blank:]]\+@@p' 
/sys/kernel/config/target/core/fileio_*/${fileio_name}/wwn/vpd_unit_serial`"
+   if test -z "${vpd_uuid}"
+   then
+   exit 1
+   fi
+   echo "${vpd_uuid}" > "${f_uuid}"
+   by_id="`echo ${vpd_uuid} | sed 
's@-@@g;s@^\(.\{25\}\)\(.*\)@scsi-36001405\1@'`"
+   ln -sfvbn "/dev/disk/by-id/${by_id}" "${f_link}"
+
+   f_major=$((`stat --dereference --format=0x%t "${f_link}"`))
+   f_minor=$((`stat --dereference --format=0x%T "${f_link}"`))
+   if test -z "${f_major}" || test -z "${f_minor}"
+   then
+   exit 1
+   fi
+   f_alias=`ls -d 
/sys/dev/block/${f_major}:${f_minor}/device/scsi_device/*:*:*:*`
+   if test -z "${f_alias}"
+   then
+   exit 1
+   fi
+   f_alias=${f_alias##*/}
+
+   blockdev --rereadpt "${f_link}"
+   udevadm settle --timeout=4
+   echo 1,12,S | sfdisk "${f_link}"
+   udevadm settle --timeout=4
+   blockdev --rereadpt "${f_link}"
+   udevadm settle --timeout=4
+   parted -s "${f_link}" unit s print
+
+   d_link="`readlink \"${f_link}\"`"
+   if test -n "${d_link

[Xen-devel] [PATCH v7 3/5] libxl: add support for vscsi

2016-02-05 Thread Olaf Hering
Port pvscsi support from xend to libxl:

 vscsi=['pdev,vdev{,options}']
 xl scsi-attach
 xl scsi-detach
 xl scsi-list

Signed-off-by: Olaf Hering 
Cc: Ian Jackson 
Cc: Stefano Stabellini 
Cc: Ian Campbell 
Cc: Wei Liu 
---
 docs/man/xl.cfg.pod.5|  55 +++
 docs/man/xl.pod.1|  18 +
 tools/libxl/Makefile |   2 +
 tools/libxl/libxl.c  | 578 
 tools/libxl/libxl.h  |  35 ++
 tools/libxl/libxl_create.c   |   1 +
 tools/libxl/libxl_device.c   |   2 +
 tools/libxl/libxl_internal.h |  14 +
 tools/libxl/libxl_types.idl  |  53 +++
 tools/libxl/libxl_types_internal.idl |   1 +
 tools/libxl/libxl_vscsi.c| 344 +
 tools/libxl/libxlu_vscsi.c   | 718 +++
 tools/libxl/libxlutil.h  |  18 +
 tools/libxl/xl.h |   3 +
 tools/libxl/xl_cmdimpl.c | 208 +-
 tools/libxl/xl_cmdtable.c|  15 +
 16 files changed, 2064 insertions(+), 1 deletion(-)

diff --git a/docs/man/xl.cfg.pod.5 b/docs/man/xl.cfg.pod.5
index 8899f75..3b5b1c5 100644
--- a/docs/man/xl.cfg.pod.5
+++ b/docs/man/xl.cfg.pod.5
@@ -517,6 +517,61 @@ value is optional if this is a guest domain.
 
 =back
 
+=item B
+
+Specifies the PVSCSI devices to be provided to the guest. PVSCSI passes
+SCSI devices from the backend domain to the guest.
+
+Each VSCSI_SPEC_STRING consists of "pdev,vdev[,options]".
+'pdev' describes the physical device, preferably in a persistent format such 
as /dev/disk/by-*/*.
+'vdev' is the domU device in vHOST:CHANNEL:TARGET:LUN notation, all integers.
+'options' lists additional flags which a backend may recognize.
+
+The supported values for "pdev" and "options" depend on the backend driver 
used:
+
+=over 4
+
+=item B
+
+=over 4
+
+=item C
+
+The backend driver in the pvops kernel is part of the Linux-IO Target framework
+(LIO). As such the SCSI devices have to be configured first with the tools
+provided by this framework, such as a xen-scsiback aware targetcli. The "pdev"
+in domU.cfg has to refer to a config item in that framework instead of the raw
+device. Usually this is a WWN in the form of "naa.WWN:LUN".
+
+=item C
+
+No options recognized.
+
+=back
+
+=item B
+
+=over 4
+
+=item C
+
+The dom0 device in either /dev/scsidev or pHOST:CHANNEL:TARGET:LUN notation.
+
+It's recommended to use persistent names "/dev/disk/by-*/*" to refer to a 
"pdev".
+The toolstack will translate this internally to "h:c:t:l" notation, which is 
how
+the backend driver will access the device. Using the "h:c:t:l" notation for
+"pdev" in domU.cfg is discouraged because this value will change across 
reboots,
+depending on the detection order in the OS.
+
+=item C
+
+Currently only the option value "feature-host" is recognized. SCSI command
+emulation in the backend driver is bypassed when "feature-host" is specified.
+
+=back
+
+=back
+
 =item B
 
 Specifies the paravirtual framebuffer devices which should be supplied
diff --git a/docs/man/xl.pod.1 b/docs/man/xl.pod.1
index 4279c7c..0674149 100644
--- a/docs/man/xl.pod.1
+++ b/docs/man/xl.pod.1
@@ -1293,6 +1293,24 @@ List virtual trusted platform modules for a domain.
 
 =back
 
+=head2 PVSCSI DEVICES
+
+=over 4
+
+=item B I I I,I<[feature-host]>
+
+Creates a new vscsi device in the domain specified by I.
+
+=item B I I
+
+Removes the vscsi device from domain specified by I.
+
+=item B I I<[domain-id] ...>
+
+List vscsi devices for the domain specified by I.
+
+=back
+
 =head1 PCI PASS-THROUGH
 
 =over 4
diff --git a/tools/libxl/Makefile b/tools/libxl/Makefile
index 620720e..a71abf2 100644
--- a/tools/libxl/Makefile
+++ b/tools/libxl/Makefile
@@ -99,6 +99,7 @@ endif
 LIBXL_LIBS += -lyajl
 
 LIBXL_OBJS = flexarray.o libxl.o libxl_create.o libxl_dm.o libxl_pci.o \
+   libxl_vscsi.o \
libxl_dom.o libxl_exec.o libxl_xshelp.o libxl_device.o \
libxl_internal.o libxl_utils.o libxl_uuid.o \
libxl_json.o libxl_aoutils.o libxl_numa.o libxl_vnuma.o 
\
@@ -141,6 +142,7 @@ AUTOINCS= libxlu_cfg_y.h libxlu_cfg_l.h _libxl_list.h 
_paths.h \
 AUTOSRCS= libxlu_cfg_y.c libxlu_cfg_l.c
 AUTOSRCS += _libxl_save_msgs_callout.c _libxl_save_msgs_helper.c
 LIBXLU_OBJS = libxlu_cfg_y.o libxlu_cfg_l.o libxlu_cfg.o \
+   libxlu_vscsi.o \
libxlu_disk_l.o libxlu_disk.o libxlu_vif.o libxlu_pci.o
 $(LIBXLU_OBJS): CFLAGS += $(CFLAGS_libxenctrl) # For xentoollog.h
 
diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 2bde0f5..ece7301 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -2059,6 +2059,572 @@ static int libxl__resolve_domid(libxl__gc *gc, const 
char *name,
 }
 
 
/**/
+
+static void libxl__device_vscsidev_backend_rm(libxl__gc *gc,
+  li

[Xen-devel] [PATCH v7 1/5] vscsiif.h: fix WWN notation for p-dev property

2016-02-05 Thread Olaf Hering
The pvops kernel expects either "naa.WWN:LUN" or "h:c:t:l" in the p-dev
property. Add the missing :LUN part to the comment.

Signed-off-by: Olaf Hering 
Acked-by: Ian Campbell 
Cc: Ian Campbell 
Cc: Ian Jackson 
Cc: Jan Beulich 
Cc: Keir Fraser 
Cc: Tim Deegan 
---
 xen/include/public/io/vscsiif.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xen/include/public/io/vscsiif.h b/xen/include/public/io/vscsiif.h
index 7a1db05..e8e38a9 100644
--- a/xen/include/public/io/vscsiif.h
+++ b/xen/include/public/io/vscsiif.h
@@ -60,7 +60,7 @@
  *
  *  A string specifying the backend device: either a 4-tuple "h:c:t:l"
  *  (host, controller, target, lun, all integers), or a WWN (e.g.
- *  "naa.60014054ac780582").
+ *  "naa.60014054ac780582:0").
  *
  * v-dev
  *  Values: string

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v7 0/5] libbxl: add support for pvscsi, iteration 7

2016-02-05 Thread Olaf Hering
Port vscsi=[] and scsi-{attach,detach,list} commands from xend to libxl.

libvirt uses its existing SCSI support:
http://lists.xenproject.org/archives/html/xen-devel/2015-04/msg02963.html

targetcli/rtslib has to be aware of xen-scsiback (upstream unresponsive):
http://article.gmane.org/gmane.linux.scsi.target.devel/8146

TODO:
 - check if "detach " can be done with single host/single device instead
   of the ->remove field
 - check if a transaction should be used in libxl__device_vscsi_add
 - maybe use events instead of polling for "state" changes in reconfigure
   (libxl__wait_for_backend vs. libxl__ev_devstate_wait)
 - comply with tools/libxl/CODING_STYLE everywhere
 - document pvscsi in xen wiki

Changes between v6 and v7:
 - rebase to 'staging' (3b971de)
 - Introduce libxl_device_vscsictrl_append_vscsidev
 - Add libxl__vscsi_collect_ctrls and used it in libxl_device_vscsictrl_list
 - Convert type of lun from u32 to u64 per SCSI spec
 - Use vscsi_saved in libxl__device_vscsictrl_add
 - Assign unique vscsidev_id per vscsictrl
 - Rename vscsi_dev to vscsidev in function names
 - Rename variables in libxl_vscsiinfo
 - Rename locals to refer to ctrl/dev
 - Rename various strings from host to ctrl
 - Rename local variables vscsi_ctrl to vscsictrl
 - Rename libxl_device_vscsictrl->vscsi_devs to vscsidevs
 - Remove libxl_device_vscsidev->remove, rework detach
 - Rename libxl_device_vscsidev->vscsi_dev_id to vscsidev_id
 - Rename local variables vscsi_host to vscsi_ctrl
 - Rename local variables v_hst to v_ctrl
 - Remove libxl_device_vscsictrl->v_hst
 - Rename libxl_vscsi_dev to libxl_device_vscsidev
 - Rename libxl_device_vscsi to libxl_device_vscsictrl

Changes between v5 and v6:
 - rebase to 'staging' (a7b39c8 and b7e7ad8)
 - Fix off-by-one in xlu__vscsi_compare_udev
 - xl.cfg: use options instead of option
 - xl.cfg: fix grammar for pdev/options
 - xl.cfg: fix Usually typo
 - Remove next_vscsi_dev_id from libxl_device_vscsi
 - Use XLU_WWN_LEN also in libxl_vscsi.c
http://lists.xenproject.org/archives/html/xen-devel/2015-11/msg01446.html

Changes between v4 and v5:
 - vscsiif.h: refer to backend_domid
 - Set update_json in libxl_device_vscsi_remove
 - Remove comment from libxl__device_vscsi_add
 - Remove debug LOG from libxl__device_vscsi_reconfigure
 - Move local nb variable in libxl__device_vscsi_reconfigure
 - Make be_path const in libxl__device_vscsi_reconfigure
 - Adjust libxl__device_vscsi_dev_backend_set to avoid long lines
 - Adjust libxl__device_vscsi_dev_backend_rm to avoid long lines
 - Use CTX in libxl__device_vscsi_dev_backend_rm
 - Make be_path const in libxl__device_vscsi_dev_backend_rm
 - xl.cfg: Its typo
 - xl.cfg: Use persistent instead of persistant
 - Rename feature_host to scsi_raw_cmds
 - target-create-xen-scsiback.sh: detect pvops and xenlinux
 - Wrap long lines in main_vscsilist
 - Call libxl_vscsiinfo_dispose unconditional
 - Let scsi-list print p-dev instead of p-devname
 - Handle broken vscsi device entry in xenstore
 - Split libxl__vscsi_fill_host from libxl_device_vscsi_list
 - Make xlu_vscsi_append_dev static
 - Remove reference to pvscsi.txt from xenstore-paths.markdown
 - xl.cfg: update Linux and xenlinux
 - xl.cfg: refer to backend domain instead of dom0
 - xl.cfg: be more verbose what persistant format is
 - return if libxl__device_vscsi_dev_backend_set fails in 
libxl__device_vscsi_new_backend
 - target-create-xen-scsiback.sh: set also alias for libvirt
http://lists.xenproject.org/archives/html/xen-devel/2015-05/msg00523.html

Changes between v3 and v4:
 - Use libxl__device_nextid in libxl__device_vscsi_add
 - Remove check for duplicate pdev assignment from libxl_device_vscsi_get_host
 - Caller provides libxl_device_vscsi to libxl_device_vscsi_get_host
 - Define LIBXL_HAVE_VSCSI
 - Remove init_val from libxl_vscsi_pdev_type
 - Move some functions from libxl to libxlu
 - Introduce libxl_device_vscsi->next_vscsi_dev_id to handle holes
 - Use Struct in KeyedUnion for ocaml idl
 - docs: Mention pvscsi in xl and xl.cfg
 - Turn feature_host into a defbool and add checking
 - Support pvops and /dev/ nodes in config
 - Wrap entire libxlu_vscsi.c with ifdef linux
 - Set remove flag in libxl_device_vscsi_list
 - Fix vscsiif path in xenstore-paths.markdown
 - vscsiif.h: add some notes about xenstore layout
 - Add copyright to libxlu_vscsi.c and libxl_vscsi.c
 - Scripts to create and delete xen-scsiback nodes in Linux target framework
 - Remove pvscsi.txt
http://lists.xenproject.org/archives/html/xen-devel/2015-04/msg01949.html

Changes between v2 and v3:
 - Adjust change for vscsiif.h
 - Support "naa.wwn:lun" notation in pvops kernel
 - Add example for pvops kernel using targetcli
   patch required for python-rtslib:
   http://article.gmane.org/gmane.linux.scsi.target.devel/8146
 - Use vdev variable in libxl_device_vscsi_parse
http://lists.xenproject.org/archives/html/xen-devel/2015-03/msg00734.html

Changes between v1 and v2:
 - ported to current staging
http://lists.xenproject.org/

[Xen-devel] [PATCH v7 4/5] vscsiif.h: add some notes about xenstore layout

2016-02-05 Thread Olaf Hering
Signed-off-by: Olaf Hering 
Acked-by: Ian Campbell 
Cc: Ian Campbell 
Cc: Ian Jackson 
Cc: Jan Beulich 
Cc: Keir Fraser 
Cc: Tim Deegan 
---
 xen/include/public/io/vscsiif.h | 68 +
 1 file changed, 68 insertions(+)

diff --git a/xen/include/public/io/vscsiif.h b/xen/include/public/io/vscsiif.h
index e8e38a9..2c5f04a 100644
--- a/xen/include/public/io/vscsiif.h
+++ b/xen/include/public/io/vscsiif.h
@@ -104,6 +104,74 @@
  *  response structures.
  */
 
+/*
+ * Xenstore format in practice
+ * ===
+ * 
+ * The backend driver uses a single_host:many_devices notation to manage domU
+ * devices. Everything is stored in 
+ * /local/domain/<backend_domid>/backend/vscsi/<domid>/<vhost>/.
+ * The xenstore layout looks like this (dom0 is assumed to be the 
backend_domid):
+ * 
+ * //feature-host = "0"
+ * //frontend = "/local/domain//device/vscsi/0"
+ * //frontend-id = ""
+ * //online = "1"
+ * //state = "4"
+ * //vscsi-devs/dev-0/p-dev = "8:0:2:1" or "naa.wwn:lun"
+ * //vscsi-devs/dev-0/state = "4"
+ * //vscsi-devs/dev-0/v-dev = "0:0:0:0"
+ * //vscsi-devs/dev-1/p-dev = "8:0:2:2"
+ * //vscsi-devs/dev-1/state = "4"
+ * //vscsi-devs/dev-1/v-dev = "0:0:1:0"
+ * 
+ * The frontend driver maintains its state in
+ * /local/domain//device/vscsi/.
+ * 
+ * /backend = "/local/domain/0/backend/vscsi//"
+ * /backend-id = "0"
+ * /event-channel = "20"
+ * /ring-ref = "43"
+ * /state = "4"
+ * /vscsi-devs/dev-0/state = "4"
+ * /vscsi-devs/dev-1/state = "4"
+ * 
+ * In addition to the entries for backend and frontend these flags are stored
+ * for the toolstack:
+ * 
+ * //vscsi-devs/dev-1/p-devname = "/dev/$device"
+ * 
+ * 
+ * Backend/frontend protocol
+ * =
+ * 
+ * To create a vhost along with a device:
+ * //feature-host = "0"
+ * //frontend = "/local/domain//device/vscsi/0"
+ * //frontend-id = ""
+ * //online = "1"
+ * //state = "1"
+ * //vscsi-devs/dev-0/p-dev = "8:0:2:1"
+ * //vscsi-devs/dev-0/state = "1"
+ * //vscsi-devs/dev-0/v-dev = "0:0:0:0"
+ * Wait for //state + //vscsi-devs/dev-0/state 
become 4
+ * 
+ * To add another device to a vhost:
+ * //state = "7"
+ * //vscsi-devs/dev-1/p-dev = "8:0:2:2"
+ * //vscsi-devs/dev-1/state = "1"
+ * //vscsi-devs/dev-1/v-dev = "0:0:1:0"
+ * Wait for //state + //vscsi-devs/dev-1/state 
become 4
+ * 
+ * To remove a device from a vhost:
+ * //state = "7"
+ * //vscsi-devs/dev-1/state = "5"
+ * Wait for //state to become 4
+ * Wait for //vscsi-devs/dev-1/state become 6
+ * Remove //vscsi-devs/dev-1/{state,p-dev,v-dev,p-devname}
+ * Remove //vscsi-devs/dev-1/
+ *
+ */
+
 /* Requests from the frontend to the backend */
 
 /*

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] Help in fixing an issue

2016-02-05 Thread PREETI MISHRA
I have a patch in Xen which stores some information of VM process. I have
another program running in Dom0 which intercept this information.

i) I want to configure my patch running in Xen to send the alert
notification to program running in Dom0 to read data, probably using event
channels. How to configure event channel?
ii) Which API or command will be used to read the data from patch buffer.

I am running xen 4.6 in ubuntu 14.04

regards,

Preeti
___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v5 6/7] VT-d: Refactor iommu_flush .iotlb()

2016-02-05 Thread Quan Xu
to pass down a flag indicating whether the lock is being held,
and check the way up the call trees.

Signed-off-by: Quan Xu 
---
 xen/arch/x86/mm/p2m-ept.c|  3 +-
 xen/drivers/passthrough/vtd/iommu.c  | 73 ++--
 xen/drivers/passthrough/vtd/iommu.h  |  3 +-
 xen/drivers/passthrough/vtd/qinval.c |  3 +-
 xen/include/asm-x86/iommu.h  |  3 +-
 5 files changed, 53 insertions(+), 32 deletions(-)

diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index ecf7e67..4c9bdfe 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -829,7 +829,8 @@ out:
  need_modify_vtd_table )
 {
 if ( iommu_hap_pt_share )
-rc = iommu_pte_flush(d, gfn, &ept_entry->epte, order, 
vtd_pte_present);
+rc = iommu_pte_flush(d, gfn, &ept_entry->epte, order,
+ vtd_pte_present, NONE_LOCK);
 else
 {
 if ( iommu_flags )
diff --git a/xen/drivers/passthrough/vtd/iommu.c 
b/xen/drivers/passthrough/vtd/iommu.c
index 6696b16..8d54c01 100644
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -410,7 +410,8 @@ static int iommu_flush_context_device(
 /* return value determine if we need a write buffer flush */
 static int flush_iotlb_reg(void *_iommu, u16 did,
u64 addr, unsigned int size_order, u64 type,
-   int flush_non_present_entry, int flush_dev_iotlb)
+   int flush_non_present_entry, int flush_dev_iotlb,
+   unsigned int lock)
 {
 struct iommu *iommu = (struct iommu *) _iommu;
 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
@@ -475,7 +476,8 @@ static int flush_iotlb_reg(void *_iommu, u16 did,
 }
 
 static int iommu_flush_iotlb_global(struct iommu *iommu,
-int flush_non_present_entry, int flush_dev_iotlb)
+int flush_non_present_entry, int flush_dev_iotlb,
+unsigned int lock)
 {
 struct iommu_flush *flush = iommu_get_flush(iommu);
 int status;
@@ -484,7 +486,8 @@ static int iommu_flush_iotlb_global(struct iommu *iommu,
 vtd_ops_preamble_quirk(iommu);
 
 status = flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
-flush_non_present_entry, flush_dev_iotlb);
+  flush_non_present_entry, flush_dev_iotlb,
+  lock);
 
 /* undo platform specific errata workarounds */
 vtd_ops_postamble_quirk(iommu);
@@ -493,7 +496,8 @@ static int iommu_flush_iotlb_global(struct iommu *iommu,
 }
 
 static int iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
-int flush_non_present_entry, int flush_dev_iotlb)
+int flush_non_present_entry, int flush_dev_iotlb,
+unsigned int lock)
 {
 struct iommu_flush *flush = iommu_get_flush(iommu);
 int status;
@@ -502,7 +506,8 @@ static int iommu_flush_iotlb_dsi(struct iommu *iommu, u16 
did,
 vtd_ops_preamble_quirk(iommu);
 
 status =  flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
-flush_non_present_entry, flush_dev_iotlb);
+   flush_non_present_entry, flush_dev_iotlb,
+   lock);
 
 /* undo platform specific errata workarounds */
 vtd_ops_postamble_quirk(iommu);
@@ -512,7 +517,8 @@ static int iommu_flush_iotlb_dsi(struct iommu *iommu, u16 
did,
 
 static int iommu_flush_iotlb_psi(
 struct iommu *iommu, u16 did, u64 addr, unsigned int order,
-int flush_non_present_entry, int flush_dev_iotlb)
+int flush_non_present_entry, int flush_dev_iotlb,
+unsigned int lock)
 {
 struct iommu_flush *flush = iommu_get_flush(iommu);
 int status;
@@ -521,11 +527,13 @@ static int iommu_flush_iotlb_psi(
 
 /* Fallback to domain selective flush if no PSI support */
 if ( !cap_pgsel_inv(iommu->cap) )
-return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, 
flush_dev_iotlb);
+return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry,
+ flush_dev_iotlb, lock);
 
 /* Fallback to domain selective flush if size is too big */
 if ( order > cap_max_amask_val(iommu->cap) )
-return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, 
flush_dev_iotlb);
+return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry,
+ flush_dev_iotlb, lock);
 
 addr >>= PAGE_SHIFT_4K + order;
 addr <<= PAGE_SHIFT_4K + order;
@@ -534,7 +542,8 @@ static int iommu_flush_iotlb_psi(
 vtd_ops_preamble_quirk(iommu);
 
 status = flush->iotlb(iommu, did, addr, order, DMA_TLB_PSI_FLUSH,
-flush_non_present_entry, flush_dev_iotlb);
+  flush_non_present_entry, flush_dev_iotlb,
+  lock);
 
 /* undo platform specific errata workarounds */
 vtd_ops_postamble_quirk(iommu);
@@ -542,7 +551,7 @@ stat

[Xen-devel] [PATCH v5 5/7] VT-d: Refactor iommu_ops .map_page() and unmap_page()

2016-02-05 Thread Quan Xu
to pass down a flag indicating whether the lock is being held,
and check the way up the call trees.

Signed-off-by: Quan Xu 
---
 xen/arch/x86/mm.c |  9 ++---
 xen/arch/x86/mm/p2m-ept.c |  7 ---
 xen/arch/x86/mm/p2m-pt.c  |  7 ---
 xen/arch/x86/mm/p2m.c | 24 +++-
 xen/arch/x86/x86_64/mm.c  |  5 +++--
 xen/common/grant_table.c  | 11 +++
 xen/drivers/passthrough/amd/iommu_map.c   |  7 ---
 xen/drivers/passthrough/amd/pci_amd_iommu.c   |  3 ++-
 xen/drivers/passthrough/arm/smmu.c|  2 +-
 xen/drivers/passthrough/iommu.c   | 11 ++-
 xen/drivers/passthrough/vtd/iommu.c   | 10 ++
 xen/drivers/passthrough/vtd/x86/vtd.c |  5 +++--
 xen/drivers/passthrough/x86/iommu.c   |  3 ++-
 xen/include/asm-x86/hvm/svm/amd-iommu-proto.h |  4 ++--
 xen/include/asm-x86/p2m.h |  6 --
 xen/include/xen/iommu.h   |  8 
 16 files changed, 73 insertions(+), 49 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 1e50b94..f9030e5 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2443,14 +2443,17 @@ static int __get_page_type(struct page_info *page, 
unsigned long type,
 if ( d && is_pv_domain(d) && unlikely(need_iommu(d)) )
 {
 if ( (x & PGT_type_mask) == PGT_writable_page )
-iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
+iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
+ NONE_LOCK);
 else if ( type == PGT_writable_page )
 {
 rc = iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
 page_to_mfn(page),
-IOMMUF_readable|IOMMUF_writable);
+IOMMUF_readable|IOMMUF_writable,
+NONE_LOCK);
 if ( rc )
-iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
+iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
+ NONE_LOCK);
 }
 }
 }
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 9e1f5c6..ecf7e67 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -835,16 +835,17 @@ out:
 if ( iommu_flags )
 for ( i = 0; i < (1 << order); i++ )
 {
-rc = iommu_map_page(d, gfn + i, mfn_x(mfn) + i, 
iommu_flags);
+rc = iommu_map_page(d, gfn + i, mfn_x(mfn) + i,
+iommu_flags, NONE_LOCK);
 if ( rc )
 {
 while ( i-- > 0 )
-iommu_unmap_page(d, gfn + i);
+iommu_unmap_page(d, gfn + i, NONE_LOCK);
 }
 }
 else
 for ( i = 0; i < (1 << order); i++ )
-iommu_unmap_page(d, gfn + i);
+iommu_unmap_page(d, gfn + i, NONE_LOCK);
 }
 }
 
diff --git a/xen/arch/x86/mm/p2m-pt.c b/xen/arch/x86/mm/p2m-pt.c
index 942a11c..e73c0e8 100644
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -677,16 +677,17 @@ p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long 
gfn, mfn_t mfn,
 for ( i = 0; i < (1UL << page_order); i++ )
 {
 rc = iommu_map_page(p2m->domain, gfn + i, mfn_x(mfn) + i,
-iommu_pte_flags);
+iommu_pte_flags, NONE_LOCK);
 if ( rc )
 {
 while ( i-- > 0 )
-iommu_unmap_page(p2m->domain, gfn + i);
+iommu_unmap_page(p2m->domain, gfn + i,
+ NONE_LOCK);
 }
 }
 else
 for ( i = 0; i < (1UL << page_order); i++ )
-iommu_unmap_page(p2m->domain, gfn + i);
+iommu_unmap_page(p2m->domain, gfn + i, NONE_LOCK);
 }
 
 /*
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index c6b883d..76748d4 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -610,7 +610,7 @@ p2m_remove_page(struct p2m_domain *p2m, unsigned long gfn, 
unsigned long mfn,
 {
 if ( need_iommu(p2m->domain) )
 for ( i = 0; i < (1 << page_order); i++ )
-iommu_unmap_page(p2m->domain, mfn + i);
+iommu_unmap_page(p2m->domain, mfn + i, NONE_LOCK);
 return 0;
 }
 
@@ -662,12 +662,13 @@ guest_physmap_add_entry(struct domain *d, unsigned long 
gfn,
 {
 for ( i = 0; i < (

[Xen-devel] [PATCH v5 4/7] VT-d: Refactor iommu_ops .iotlb_flush() and iotlb_flush_all()

2016-02-05 Thread Quan Xu
to pass down a flag indicating whether the lock is being held.

Signed-off-by: Quan Xu 
---
 xen/arch/arm/p2m.c  |  2 +-
 xen/common/memory.c |  4 ++--
 xen/drivers/passthrough/iommu.c |  9 +
 xen/drivers/passthrough/vtd/iommu.c |  5 +++--
 xen/drivers/passthrough/x86/iommu.c |  2 +-
 xen/include/xen/iommu.h | 17 +
 6 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index e396c40..6eec959 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -1100,7 +1100,7 @@ tlbflush:
 if ( flush )
 {
 flush_tlb_domain(d);
-iommu_iotlb_flush(d, sgfn, egfn - sgfn);
+iommu_iotlb_flush(d, sgfn, egfn - sgfn, NONE_LOCK);
 }
 
 out:
diff --git a/xen/common/memory.c b/xen/common/memory.c
index c228d9f..e68c3dd 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -631,9 +631,9 @@ static int xenmem_add_to_physmap(struct domain *d,
 if ( need_iommu(d) )
 {
 this_cpu(iommu_dont_flush_iotlb) = 0;
-rc = iommu_iotlb_flush(d, xatp->idx - done, done);
+rc = iommu_iotlb_flush(d, xatp->idx - done, done, NONE_LOCK);
 if ( !rc )
-rc = iommu_iotlb_flush(d, xatp->gpfn - done, done);
+rc = iommu_iotlb_flush(d, xatp->gpfn - done, done, NONE_LOCK);
 }
 #endif
 
diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
index cdf8e9a..ebd6d47 100644
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -270,24 +270,25 @@ static void iommu_free_pagetables(unsigned long unused)
 cpumask_cycle(smp_processor_id(), 
&cpu_online_map));
 }
 
-int iommu_iotlb_flush(struct domain *d, unsigned long gfn, unsigned int 
page_count)
+int iommu_iotlb_flush(struct domain *d, unsigned long gfn,
+  unsigned int page_count, unsigned int lock)
 {
 struct hvm_iommu *hd = domain_hvm_iommu(d);
 
 if ( !iommu_enabled || !hd->platform_ops || !hd->platform_ops->iotlb_flush 
)
 return 0;
 
-return hd->platform_ops->iotlb_flush(d, gfn, page_count);
+return hd->platform_ops->iotlb_flush(d, gfn, page_count, lock);
 }
 
-int iommu_iotlb_flush_all(struct domain *d)
+int iommu_iotlb_flush_all(struct domain *d, unsigned int lock)
 {
 struct hvm_iommu *hd = domain_hvm_iommu(d);
 
 if ( !iommu_enabled || !hd->platform_ops || 
!hd->platform_ops->iotlb_flush_all )
 return 0;
 
-return hd->platform_ops->iotlb_flush_all(d);
+return hd->platform_ops->iotlb_flush_all(d, lock);
 }
 
 int __init iommu_setup(void)
diff --git a/xen/drivers/passthrough/vtd/iommu.c 
b/xen/drivers/passthrough/vtd/iommu.c
index a780632..e8cbfdb 100644
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -601,12 +601,13 @@ static int __intel_iommu_iotlb_flush(struct domain *d, 
unsigned long gfn,
 return rc;
 }
 
-static int intel_iommu_iotlb_flush(struct domain *d, unsigned long gfn, 
unsigned int page_count)
+static int intel_iommu_iotlb_flush(struct domain *d, unsigned long gfn,
+   unsigned int page_count, unsigned int lock)
 {
 return __intel_iommu_iotlb_flush(d, gfn, 1, page_count);
 }
 
-static int intel_iommu_iotlb_flush_all(struct domain *d)
+static int intel_iommu_iotlb_flush_all(struct domain *d, unsigned int lock)
 {
 return __intel_iommu_iotlb_flush(d, 0, 0, 0);
 }
diff --git a/xen/drivers/passthrough/x86/iommu.c 
b/xen/drivers/passthrough/x86/iommu.c
index 6674fb0..4bbf5f8 100644
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -105,7 +105,7 @@ int arch_iommu_populate_page_table(struct domain *d)
 
 if ( !rc )
 {
-rc = iommu_iotlb_flush_all(d);
+rc = iommu_iotlb_flush_all(d, PCIDEVS_LOCK);
 if ( rc )
 return rc;
 }
diff --git a/xen/include/xen/iommu.h b/xen/include/xen/iommu.h
index f5b6f7e..f58e9d6 100644
--- a/xen/include/xen/iommu.h
+++ b/xen/include/xen/iommu.h
@@ -131,6 +131,13 @@ struct page_info;
  * callback pair.
  */
 typedef int iommu_grdm_t(xen_pfn_t start, xen_ulong_t nr, u32 id, void *ctxt);
+/*
+ * A flag indicates whether the lock is being held.
+ * NONE_LOCK - no lock is being held.
+ * PCIDEVS_LOCK - pcidevs_lock is being held.
+ */
+#define NONE_LOCK 0
+#define PCIDEVS_LOCK 1
 
 struct iommu_ops {
 int (*init)(struct domain *d);
@@ -161,8 +168,9 @@ struct iommu_ops {
 void (*resume)(void);
 void (*share_p2m)(struct domain *d);
 int (*crash_shutdown)(void);
-int (*iotlb_flush)(struct domain *d, unsigned long gfn, unsigned int 
page_count);
-int (*iotlb_flush_all)(struct domain *d);
+int (*iotlb_flush)(struct domain *d, unsigned long gfn, unsigned int 
page_count,
+   unsigned int lock);
+int (*iotlb_flush_all)(struct domain *d, unsigned int lock);
 int (*get_reserved_device_memory)(iommu

[Xen-devel] [PATCH v5 1/7] VT-d: Check VT-d Device-TLB flush error(IOMMU part).

2016-02-05 Thread Quan Xu
This patch checks all kinds of error and all the way up
the call trees of VT-d Device-TLB flush(IOMMU part).

Signed-off-by: Quan Xu 
---
 xen/drivers/passthrough/amd/iommu_init.c  |   4 +-
 xen/drivers/passthrough/amd/pci_amd_iommu.c   |   4 +-
 xen/drivers/passthrough/arm/smmu.c|  13 +--
 xen/drivers/passthrough/iommu.c   |  37 +---
 xen/drivers/passthrough/vtd/extern.h  |   4 +-
 xen/drivers/passthrough/vtd/iommu.c   | 125 --
 xen/drivers/passthrough/vtd/qinval.c  |   2 +-
 xen/drivers/passthrough/vtd/quirks.c  |  26 +++---
 xen/drivers/passthrough/vtd/x86/vtd.c |  17 +++-
 xen/drivers/passthrough/x86/iommu.c   |   6 +-
 xen/include/asm-x86/hvm/svm/amd-iommu-proto.h |   4 +-
 xen/include/asm-x86/iommu.h   |   2 +-
 xen/include/xen/iommu.h   |  20 ++---
 13 files changed, 166 insertions(+), 98 deletions(-)

diff --git a/xen/drivers/passthrough/amd/iommu_init.c 
b/xen/drivers/passthrough/amd/iommu_init.c
index d90a2d2..ec47e22 100644
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -1340,12 +1340,14 @@ static void invalidate_all_devices(void)
 iterate_ivrs_mappings(_invalidate_all_devices);
 }
 
-void amd_iommu_suspend(void)
+int amd_iommu_suspend(void)
 {
 struct amd_iommu *iommu;
 
 for_each_amd_iommu ( iommu )
 disable_iommu(iommu);
+
+return 0;
 }
 
 void amd_iommu_resume(void)
diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c 
b/xen/drivers/passthrough/amd/pci_amd_iommu.c
index c1c0b6b..449de13 100644
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -280,7 +280,7 @@ static int amd_iommu_domain_init(struct domain *d)
 return 0;
 }
 
-static void __hwdom_init amd_iommu_hwdom_init(struct domain *d)
+static int __hwdom_init amd_iommu_hwdom_init(struct domain *d)
 {
 unsigned long i; 
 const struct amd_iommu *iommu;
@@ -312,6 +312,8 @@ static void __hwdom_init amd_iommu_hwdom_init(struct domain 
*d)
 BUG();
 
 setup_hwdom_pci_devices(d, amd_iommu_setup_hwdom_device);
+
+return 0;
 }
 
 void amd_iommu_disable_domain_device(struct domain *domain,
diff --git a/xen/drivers/passthrough/arm/smmu.c 
b/xen/drivers/passthrough/arm/smmu.c
index bb08827..155b7f3 100644
--- a/xen/drivers/passthrough/arm/smmu.c
+++ b/xen/drivers/passthrough/arm/smmu.c
@@ -2544,7 +2544,7 @@ static int force_stage = 2;
  */
 static u32 platform_features = ARM_SMMU_FEAT_COHERENT_WALK;
 
-static void arm_smmu_iotlb_flush_all(struct domain *d)
+static int arm_smmu_iotlb_flush_all(struct domain *d)
 {
struct arm_smmu_xen_domain *smmu_domain = 
domain_hvm_iommu(d)->arch.priv;
struct iommu_domain *cfg;
@@ -2561,13 +2561,15 @@ static void arm_smmu_iotlb_flush_all(struct domain *d)
arm_smmu_tlb_inv_context(cfg->priv);
}
spin_unlock(&smmu_domain->lock);
+
+return 0;
 }
 
-static void arm_smmu_iotlb_flush(struct domain *d, unsigned long gfn,
- unsigned int page_count)
+static int arm_smmu_iotlb_flush(struct domain *d, unsigned long gfn,
+unsigned int page_count)
 {
 /* ARM SMMU v1 doesn't have flush by VMA and VMID */
-arm_smmu_iotlb_flush_all(d);
+return arm_smmu_iotlb_flush_all(d);
 }
 
 static struct iommu_domain *arm_smmu_get_domain(struct domain *d,
@@ -2737,8 +2739,9 @@ static int arm_smmu_iommu_domain_init(struct domain *d)
return 0;
 }
 
-static void __hwdom_init arm_smmu_iommu_hwdom_init(struct domain *d)
+static int __hwdom_init arm_smmu_iommu_hwdom_init(struct domain *d)
 {
+return 0;
 }
 
 static void arm_smmu_iommu_domain_teardown(struct domain *d)
diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
index d5137733..cdf8e9a 100644
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -146,14 +146,15 @@ static void __hwdom_init check_hwdom_reqs(struct domain 
*d)
 iommu_dom0_strict = 1;
 }
 
-void __hwdom_init iommu_hwdom_init(struct domain *d)
+int __hwdom_init iommu_hwdom_init(struct domain *d)
 {
 struct hvm_iommu *hd = domain_hvm_iommu(d);
+int rc = 0;
 
 check_hwdom_reqs(d);
 
 if ( !iommu_enabled )
-return;
+return 0;
 
 register_keyhandler('o', &iommu_p2m_table);
 d->need_iommu = !!iommu_dom0_strict;
@@ -171,7 +172,10 @@ void __hwdom_init iommu_hwdom_init(struct domain *d)
  ((page->u.inuse.type_info & PGT_type_mask)
   == PGT_writable_page) )
 mapping |= IOMMUF_writable;
-hd->platform_ops->map_page(d, gfn, mfn, mapping);
+rc = hd->platform_ops->map_page(d, gfn, mfn, mapping);
+if ( rc )
+return rc;
+
 if ( !(i++ & 0xf) )
 process_pending_softirqs();
 }

[Xen-devel] [PATCH v5 7/7] VT-d: Fix vt-d Device-TLB flush timeout issue.

2016-02-05 Thread Quan Xu
If a Device-TLB flush times out, we'll hide the target ATS
device and crash the domain owning this ATS device.

If the impacted domain is the hardware domain, we just print a warning.

The hidden device will be disallowed to be further assigned to
any domain.

Signed-off-by: Quan Xu 
---
 xen/drivers/passthrough/pci.c |  2 +-
 xen/drivers/passthrough/vtd/extern.h  |  8 +++-
 xen/drivers/passthrough/vtd/qinval.c  | 77 ++-
 xen/drivers/passthrough/vtd/x86/ats.c | 14 ++-
 4 files changed, 95 insertions(+), 6 deletions(-)

diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
index 27b3ca7..2d7dc59 100644
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -407,7 +407,7 @@ static void _pci_hide_device(struct pci_dev *pdev)
 list_add(&pdev->domain_list, &dom_xen->arch.pdev_list);
 }
 
-int __init pci_hide_device(int bus, int devfn)
+int pci_hide_device(int bus, int devfn)
 {
 struct pci_dev *pdev;
 int rc = -ENOMEM;
diff --git a/xen/drivers/passthrough/vtd/extern.h 
b/xen/drivers/passthrough/vtd/extern.h
index ec9c513..a129460 100644
--- a/xen/drivers/passthrough/vtd/extern.h
+++ b/xen/drivers/passthrough/vtd/extern.h
@@ -56,8 +56,12 @@ struct acpi_drhd_unit * find_ats_dev_drhd(struct iommu 
*iommu);
 
 int ats_device(const struct pci_dev *, const struct acpi_drhd_unit *);
 
-int dev_invalidate_iotlb(struct iommu *iommu, u16 did,
- u64 addr, unsigned int size_order, u64 type);
+int dev_invalidate_iotlb(struct iommu *iommu, u16 did, u64 addr,
+ unsigned int size_order, u64 type,
+ unsigned int lock);
+int dev_invalidate_iotlb_sync(struct iommu *iommu, u16 did,
+  u16 seg, u8 bus, u8 devfn,
+  unsigned int lock);
 
 int qinval_device_iotlb(struct iommu *iommu,
 u32 max_invs_pend, u16 sid, u16 size, u64 addr);
diff --git a/xen/drivers/passthrough/vtd/qinval.c 
b/xen/drivers/passthrough/vtd/qinval.c
index f2e7ffb..76046a7 100644
--- a/xen/drivers/passthrough/vtd/qinval.c
+++ b/xen/drivers/passthrough/vtd/qinval.c
@@ -229,6 +229,69 @@ int qinval_device_iotlb(struct iommu *iommu,
 return 0;
 }
 
+static void dev_invalidate_iotlb_timeout(struct iommu *iommu, u16 did,
+ u16 seg, u8 bus, u8 devfn,
+ unsigned int lock)
+{
+struct domain *d = NULL;
+struct pci_dev *pdev;
+
+if ( test_bit(did, iommu->domid_bitmap) )
+d = rcu_lock_domain_by_id(iommu->domid_map[did]);
+
+if ( d == NULL )
+return;
+
+for_each_pdev(d, pdev)
+{
+if ( (pdev->seg == seg) &&
+ (pdev->bus == bus) &&
+ (pdev->devfn == devfn) )
+{
+ASSERT ( pdev->domain );
+list_del(&pdev->domain_list);
+pdev->domain = NULL;
+
+if ( !(lock & PCIDEVS_LOCK) )
+spin_lock(&pcidevs_lock);
+
+if ( pci_hide_device(bus, devfn) )
+{
+printk(XENLOG_ERR
+   "IOMMU hide device %04x:%02x:%02x.%02x error.",
+   seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+}
+
+if ( !(lock & PCIDEVS_LOCK) )
+spin_unlock(&pcidevs_lock);
+
+break;
+}
+}
+
+if ( !is_hardware_domain(d) )
+domain_crash(d);
+rcu_unlock_domain(d);
+}
+
+int dev_invalidate_iotlb_sync(struct iommu *iommu, u16 did,
+  u16 seg, u8 bus, u8 devfn,
+  unsigned int lock)
+{
+struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
+int rc = 0;
+
+if ( qi_ctrl->qinval_maddr )
+{
+rc = queue_invalidate_wait(iommu, 0, 1, 1);
+if ( rc == -ETIMEDOUT )
+dev_invalidate_iotlb_timeout(iommu, did,
+ seg, bus, devfn, lock);
+}
+
+return rc;
+}
+
 static void queue_invalidate_iec(struct iommu *iommu, u8 granu, u8 im, u16 
iidx)
 {
 unsigned long flags;
@@ -350,9 +413,19 @@ static int flush_iotlb_qi(
 queue_invalidate_iotlb(iommu,
type >> DMA_TLB_FLUSH_GRANU_OFFSET, dr,
dw, did, size_order, 0, addr);
-if ( flush_dev_iotlb )
-ret = dev_invalidate_iotlb(iommu, did, addr, size_order, type);
+
+/*
+ * Before Device-TLB invalidation we need to synchronize
+ * invalidation completions with hardware.
+ */
 rc = invalidate_sync(iommu);
+if ( rc )
+ return rc;
+
+if ( flush_dev_iotlb )
+ret = dev_invalidate_iotlb(iommu, did, addr, size_order,
+   type, lock);
+
 if ( !ret )
 ret = rc;
 }
diff --git a/xen/drivers/passthrough/vtd/x86/ats.c 
b/xen/drivers/passthroug

[Xen-devel] [PATCH v5 2/7] VT-d: Check VT-d Device-TLB flush error(MMU part).

2016-02-05 Thread Quan Xu
This patch checks all kinds of errors all the way up
the call trees of the VT-d Device-TLB flush (MMU part).

Signed-off-by: Quan Xu 
---
 xen/arch/x86/acpi/power.c   |  6 +-
 xen/arch/x86/crash.c|  3 ++-
 xen/arch/x86/domain_build.c |  5 -
 xen/arch/x86/mm.c   | 10 +++---
 xen/arch/x86/mm/p2m-ept.c   | 11 +--
 xen/arch/x86/mm/p2m-pt.c| 11 +--
 xen/common/domain.c |  2 +-
 xen/common/grant_table.c|  5 +++--
 xen/common/memory.c |  5 +++--
 9 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/xen/arch/x86/acpi/power.c b/xen/arch/x86/acpi/power.c
index f41f0de..ff397c3 100644
--- a/xen/arch/x86/acpi/power.c
+++ b/xen/arch/x86/acpi/power.c
@@ -45,6 +45,8 @@ void do_suspend_lowlevel(void);
 
 static int device_power_down(void)
 {
+int rc;
+
 console_suspend();
 
 time_suspend();
@@ -53,7 +55,9 @@ static int device_power_down(void)
 
 ioapic_suspend();
 
-iommu_suspend();
+rc = iommu_suspend();
+if ( rc )
+return rc;
 
 lapic_suspend();
 
diff --git a/xen/arch/x86/crash.c b/xen/arch/x86/crash.c
index 888a214..59e1af6 100644
--- a/xen/arch/x86/crash.c
+++ b/xen/arch/x86/crash.c
@@ -170,7 +170,8 @@ static void nmi_shootdown_cpus(void)
 
 /* Crash shutdown any IOMMU functionality as the crashdump kernel is not
  * happy when booting if interrupt/dma remapping is still enabled */
-iommu_crash_shutdown();
+if ( iommu_crash_shutdown() )
+printk("Failed to shut down IOMMU.\n");
 
 __stop_this_cpu();
 
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index bca6fe7..d10321a 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -1627,7 +1627,10 @@ int __init construct_dom0(
 }
 
 if ( d->domain_id == hardware_domid )
-iommu_hwdom_init(d);
+{
+rc = iommu_hwdom_init(d);
+BUG_ON(rc != 0);
+}
 
 return 0;
 
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 202ff76..1e50b94 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2445,9 +2445,13 @@ static int __get_page_type(struct page_info *page, 
unsigned long type,
 if ( (x & PGT_type_mask) == PGT_writable_page )
 iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
 else if ( type == PGT_writable_page )
-iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
-   page_to_mfn(page),
-   IOMMUF_readable|IOMMUF_writable);
+{
+rc = iommu_map_page(d, mfn_to_gmfn(d, page_to_mfn(page)),
+page_to_mfn(page),
+IOMMUF_readable|IOMMUF_writable);
+if ( rc )
+iommu_unmap_page(d, mfn_to_gmfn(d, page_to_mfn(page)));
+}
 }
 }
 
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 9860c6c..9e1f5c6 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -829,12 +829,19 @@ out:
  need_modify_vtd_table )
 {
 if ( iommu_hap_pt_share )
-iommu_pte_flush(d, gfn, &ept_entry->epte, order, vtd_pte_present);
+rc = iommu_pte_flush(d, gfn, &ept_entry->epte, order, 
vtd_pte_present);
 else
 {
 if ( iommu_flags )
 for ( i = 0; i < (1 << order); i++ )
-iommu_map_page(d, gfn + i, mfn_x(mfn) + i, iommu_flags);
+{
+rc = iommu_map_page(d, gfn + i, mfn_x(mfn) + i, 
iommu_flags);
+if ( rc )
+{
+while ( i-- > 0 )
+iommu_unmap_page(d, gfn + i);
+}
+}
 else
 for ( i = 0; i < (1 << order); i++ )
 iommu_unmap_page(d, gfn + i);
diff --git a/xen/arch/x86/mm/p2m-pt.c b/xen/arch/x86/mm/p2m-pt.c
index 709920a..942a11c 100644
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -675,8 +675,15 @@ p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long 
gfn, mfn_t mfn,
 }
 else if ( iommu_pte_flags )
 for ( i = 0; i < (1UL << page_order); i++ )
-iommu_map_page(p2m->domain, gfn + i, mfn_x(mfn) + i,
-   iommu_pte_flags);
+{
+rc = iommu_map_page(p2m->domain, gfn + i, mfn_x(mfn) + i,
+iommu_pte_flags);
+if ( rc )
+{
+while ( i-- > 0 )
+iommu_unmap_page(p2m->domain, gfn + i);
+}
+}
 else
 for ( i = 0; i < (1UL << page_order); i++ )
 iommu_unmap_page(p2m->domain, gfn + i);
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 1b9fcfc..577eb3d 100644
--- a/xen/common/domain.c
+++ b/x

[Xen-devel] [PATCH v5 3/7] VT-d: Reduce spin timeout to 1ms, which can be boot-time changed.

2016-02-05 Thread Quan Xu
Signed-off-by: Quan Xu 
---
 docs/misc/xen-command-line.markdown  |  7 +++
 xen/drivers/passthrough/vtd/qinval.c | 11 +--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/docs/misc/xen-command-line.markdown 
b/docs/misc/xen-command-line.markdown
index a565c1b..6ed5cd8 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -968,6 +968,13 @@ Use this to work around firmware issues providing correct 
RMRR entries. Rather
 than only mapping RAM pages for IOMMU accesses for Dom0, with this option all
 pages not marked as unusable in the E820 table will get a mapping established.
 
+### vtd\_qi\_timeout (VT-d)
+> `= `
+
+> Default: `1`
+
+>> Specify the timeout of the VT-d Queued Invalidation in ms.
+
 ### irq\_ratelimit
 > `= `
 
diff --git a/xen/drivers/passthrough/vtd/qinval.c 
b/xen/drivers/passthrough/vtd/qinval.c
index 946e812..f9e752b 100644
--- a/xen/drivers/passthrough/vtd/qinval.c
+++ b/xen/drivers/passthrough/vtd/qinval.c
@@ -28,6 +28,11 @@
 #include "vtd.h"
 #include "extern.h"
 
+static unsigned int __read_mostly vtd_qi_timeout = 1;
+integer_param("vtd_qi_timeout", vtd_qi_timeout);
+
+#define IOMMU_QI_TIMEOUT (vtd_qi_timeout * MILLISECS(1))
+
 static void print_qi_regs(struct iommu *iommu)
 {
 u64 val;
@@ -167,10 +172,12 @@ static int queue_invalidate_wait(struct iommu *iommu,
 start_time = NOW();
 while ( poll_slot != QINVAL_STAT_DONE )
 {
-if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) )
+if ( NOW() > (start_time + IOMMU_QI_TIMEOUT) )
 {
 print_qi_regs(iommu);
-panic("queue invalidate wait descriptor was not executed");
+dprintk(XENLOG_WARNING VTDPREFIX,
+"Queue invalidate wait descriptor was timeout.\n");
+return -ETIMEDOUT;
 }
 cpu_relax();
 }
-- 
1.9.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v5 0/7] VT-d Device-TLB flush issue

2016-02-05 Thread Quan Xu
This patch series fixes the current timeout concern and also allows limited ATS support:

1. Check VT-d Device-TLB flush error.
   This patch checks all kinds of errors all the way up the call trees of the
VT-d Device-TLB flush.

2. Reduce spin timeout to 1ms, which can be boot-time changed with 
'vtd_qi_timeout'.
   For example:
   multiboot /boot/xen.gz ats=1 vtd_qi_timeout=100

3. Pass down a flag indicating whether the lock is being held.
 
4. Fix vt-d Device-TLB flush timeout issue.
   If a Device-TLB flush times out, we'll hide the target ATS device and crash
the domain owning this ATS device.
   If the impacted domain is the hardware domain, we just print a warning.
   The hidden device will be disallowed to be further assigned to any domain.

 

 * DMAR_OPERATION_TIMEOUT should be also chopped down to a low number of 
milliseconds.
   As Kevin Tian mentioned in 'Revisit VT-d asynchronous flush issue', We also 
confirmed with hardware team
   that 1ms is large enough for IOMMU internal flush. So I can change 
DMAR_OPERATION_TIMEOUT from 1000 ms to 1 ms.

   IOMMU_WAIT_OP() is only for VT-d registers read/write, and there is also a 
panic. We need a further discussion
   whether or how to remove this panic in next patch set.

 * The coming patch set will fix IOTLB/Context/IETC flush timeout.

-Changes in v5:
  * Split the 'check VT-d Device-TLB flush error' into MMU part and IOMMU part. 
(P1-P2)
  * Add a new standalone entry for new command 'vtd_qi_timeout' in 
docs/misc/xen-command-line.markdown.(P3)
  * Change the option name from 'iommu_qi_timeout_ms' to 'vtd_qi_timeout'.(P3)
  * Pass down a flag indicating whether the lock is being held.(P4-P6)
  * Fix multiple return points when this can be trivially avoided.(P7)
  * Enhance the print out message. (P7)
  * Enhance the comment.(P7)
  * Consult the bitmap along with the domain ID array.(P7)



Quan Xu (7):
  VT-d: Check VT-d Device-TLB flush error(IOMMU part).
  VT-d: Check VT-d Device-TLB flush error(MMU part).
  VT-d: Reduce spin timeout to 1ms, which can be boot-time changed.
  VT-d: Refactor iommu_ops .iotlb_flush() and iotlb_flush_all()
  VT-d: Refactor iommu_ops .map_page() and unmap_page()
  VT-d: Refactor iommu_flush .iotlb()
  VT-d: Fix vt-d Device-TLB flush timeout issue.

 docs/misc/xen-command-line.markdown   |   7 ++
 xen/arch/arm/p2m.c|   2 +-
 xen/arch/x86/acpi/power.c |   6 +-
 xen/arch/x86/crash.c  |   3 +-
 xen/arch/x86/domain_build.c   |   5 +-
 xen/arch/x86/mm.c |  15 ++-
 xen/arch/x86/mm/p2m-ept.c |  15 ++-
 xen/arch/x86/mm/p2m-pt.c  |  14 ++-
 xen/arch/x86/mm/p2m.c |  24 ++--
 xen/arch/x86/x86_64/mm.c  |   5 +-
 xen/common/domain.c   |   2 +-
 xen/common/grant_table.c  |  16 ++-
 xen/common/memory.c   |   5 +-
 xen/drivers/passthrough/amd/iommu_init.c  |   4 +-
 xen/drivers/passthrough/amd/iommu_map.c   |   7 +-
 xen/drivers/passthrough/amd/pci_amd_iommu.c   |   7 +-
 xen/drivers/passthrough/arm/smmu.c|  15 ++-
 xen/drivers/passthrough/iommu.c   |  47 ---
 xen/drivers/passthrough/pci.c |   2 +-
 xen/drivers/passthrough/vtd/extern.h  |  12 +-
 xen/drivers/passthrough/vtd/iommu.c   | 173 +-
 xen/drivers/passthrough/vtd/iommu.h   |   3 +-
 xen/drivers/passthrough/vtd/qinval.c  |  93 +-
 xen/drivers/passthrough/vtd/quirks.c  |  26 ++--
 xen/drivers/passthrough/vtd/x86/ats.c |  14 ++-
 xen/drivers/passthrough/vtd/x86/vtd.c |  18 ++-
 xen/drivers/passthrough/x86/iommu.c   |   9 +-
 xen/include/asm-x86/hvm/svm/amd-iommu-proto.h |   8 +-
 xen/include/asm-x86/iommu.h   |   3 +-
 xen/include/asm-x86/p2m.h |   6 +-
 xen/include/xen/iommu.h   |  37 +++---
 31 files changed, 429 insertions(+), 174 deletions(-)

-- 
1.9.1


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] HVMlite ABI specification DRAFT A

2016-02-05 Thread Ian Campbell
On Thu, 2016-02-04 at 18:48 +0100, Roger Pau Monné wrote:
> Hello,
> 
> I've Cced a bunch of people who have expressed interest in the HVMlite 
> design/implementation,

I think "HVMlite" has now reached the point where we should start the
transition from PVH (classic) to PVH (hvmlite) naming rather than
introducing yet another guest type terminology where end users are going to
see it (the 4.7 release, specifications in tree, etc).

So IMHO HVMlite should be referred to as "PVH" throughout, with the
original implementation retconned to be called "PVH (classic)" or
"Prototype-PVH" or something. A short paragraph explaining the background
might be appropriate.

Calling them PVHv1 and PVHv2 would also be tolerable.

This should extend to all the documentation etc as well as IMHO to patch
postings (in that case "PVH (hvmlite)" might be appropriate in places where
there might be confusion until "PVH (classic)" really goes away).

The point is that dmlite was always supposed to be a reimplementation of
the PVH concept using the lessons learned from the "come at it from the PV
end" attempt, it's not (from a user PoV) a new operating mode.

Ian.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v7 3/5] libxl: add support for vscsi

2016-02-05 Thread Olaf Hering
On Fri, Feb 05, Olaf Hering wrote:

> @@ -6799,6 +7375,8 @@ int libxl_retrieve_domain_configuration(libxl_ctx *ctx, 
> uint32_t domid,
>  
>  MERGE(nic, nics, COMPARE_DEVID, {});
>  
> +MERGE(vscsictrl, vscsictrls, COMPARE_VSCSI, {});

I think the actual "merge" should be like this, to copy the data
gathered from xenstore into the d_config which was built from JSON.
{
libxl_device_vscsictrl_dispose(dst);
libxl_device_vscsictrl_copy(CTX, dst, src);
}

Olaf

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] Nested virtualization off VMware vSphere 6.0 with EL6 guests crashes on Xen 4.6

2016-02-05 Thread Jan Beulich
>>> On 04.02.16 at 19:36,  wrote:
> (XEN) nvmx_handle_vmwrite 1: IO_BITMAP_A(2000)[0=]
> (XEN) nvmx_handle_vmwrite 0: IO_BITMAP_A(2000)[0=]
> (XEN) nvmx_handle_vmwrite 1: IO_BITMAP_B(2002)[0=]
> (XEN) nvmx_handle_vmwrite 2: IO_BITMAP_A(2000)[0=]
> (XEN) nvmx_handle_vmwrite 1: VIRTUAL_APIC_PAGE_ADDR(2012)[0=]
> (XEN) nvmx_handle_vmwrite 2: IO_BITMAP_B(2002)[0=]
> (XEN) nvmx_handle_vmwrite 1: (2006)[0=]
> (XEN) nvmx_handle_vmwrite 2: VIRTUAL_APIC_PAGE_ADDR(2012)[0=]
> (XEN) nvmx_handle_vmwrite 1: VM_EXIT_MSR_LOAD_ADDR(2008)[0=]
> (XEN) nvmx_handle_vmwrite 3: IO_BITMAP_A(2000)[0=]
> (XEN) nvmx_handle_vmwrite 3: IO_BITMAP_B(2002)[0=]
> (XEN) nvmx_handle_vmwrite 2: MSR_BITMAP(2004)[0=]
> (XEN) nvmx_handle_vmwrite 1: MSR_BITMAP(2004)[0=]
> (XEN) nvmx_handle_vmwrite 0: MSR_BITMAP(2004)[0=]
> (XEN) nvmx_handle_vmwrite 3: (2006)[0=]
> (XEN) nvmx_handle_vmwrite 3: VM_EXIT_MSR_LOAD_ADDR(2008)[0=]
> (XEN) nvmx_handle_vmwrite 3: MSR_BITMAP(2004)[0=]

So there's a whole lot of "interesting" writes of all ones, and indeed
VIRTUAL_APIC_PAGE_ADDR is among them, and the code doesn't
handle that case (nor the equivalent for APIC_ACCESS_ADDR).
What's odd though is that the writes are for vCPU 1 and 2, while
the crash is on vCPU 3 (it would of course help if the guest had as
few vCPU-s as possible without making the issue disappear). While
you have circumvented the ASSERT() you've originally hit, the log
messages you've added there don't appear anywhere, which is
clearly confusing, so I wonder what other unintended effects your
debugging code has (there's clearly an uninitialized variable issue
in your additions to vmx_vmexit_handler(), but that shouldn't
matter here, albeit it should have cause build failure, making me
suspect the patch to be stale).

Oddly enough the various bitmap field VMWRITEs above should all
fail, yet the guest appears to recover from (ignore?) these
failures. (From all I can tell we're prone to NULL dereferences due
to that at least in _shadow_io_bitmap().)

> (XEN) Failed vm entry (exit reason 0x8021) caused by invalid guest state 
> (4).

4 means invalid VMCS link pointer - interesting.

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] Memory Sharing

2016-02-05 Thread David Vrabel
On 04/02/16 21:04, hanji unit wrote:
> Hello, does Xen support sharing memory pages between multiple domains
> (such as as Dom0, DomU1, DomU2)? The Grant Table hypercalls seem
> limited to:
> 
> IOCTL_GNTALLOC_ALLOC_GREF
> IOCTL_GNTALLOC_DEALLOC_GREF
> IOCTL_GNTALLOC_SET_UNMAP_NOTIFY

These are the ioctls provided by the gntalloc driver.  The grant table
mechanism itself can share any page of guest memory.

If you need to share an already allocated page of userspace memory, the
gntalloc driver would need to be extended with a new ioctl.

David

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] HVMlite ABI specification DRAFT A

2016-02-05 Thread Jan Beulich
>>> On 05.02.16 at 10:50,  wrote:
> For legacy PCI interrupts, we can parse the MADT inside of Xen in order
> to properly setup the lines/overwrites and inject the interrupts that
> are not handled by Xen straight into the hardware domain. This will
> require us to be able to emulate the same topology as what is found in
> native (eg: if there are two IO APICs in the hardware we should also
> provide two emulated ones to the hw domain).

I don't think MADT contains all the needed information, or else we
wouldn't need PHYSDEVOP_setup_gsi.

> As for PCI config space accesses, don't we already do that? We trap on
> access to the 0xcf8 io port.

We intercept that, but iirc we do no translation (and for DomU
these get forwarded to qemu anyway).

  * `eflags`: bit 17 (VM) must be cleared. Bit 9 (IF) must be cleared.
Bit 8 (TF) must be cleared. Other bits are all unspecified.
>>>
>>> I would also specify that the direction flag shall be clear, to prevent
>>> all kernels needing to `cld` on entry.
>> 
>> In which case IOPL and AC state should perhaps also be nailed down?
>> Possibly even all of the control ones (leaving only the status flags
>> unspecified)?
> 
> Status flag? Why don't we just say that all user-settable bits in the
> status register will be set to 0 (or cleared)?

Would be an option too.

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v3 3/3] tools: introduce parameter max_wp_ram_ranges.

2016-02-05 Thread Jan Beulich
>>> On 05.02.16 at 10:24,  wrote:
> Utilizing the default server is a backwards step. GVT-g would have to use the 
> old HVM_PARAM mechanism to cause it's emulator to become default. I think a 
> more appropriate mechanism would be p2m_mmio_write_dm to become something 
> like 'p2m_ioreq_server_write' and then have a hypercall to allow it to be 
> mapped to a particular ioreq server.
> Obviously only one could claim it but, with a p2t, the bit could be 
> re-purposed to simply mean 'go look in the p2t' for more information and 
> then the p2t could be structured to allow emulations to be steered to one of 
> many ioreq servers (for read and/or write emulation).

Sounds reasonable.

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] Help in fixing a issue

2016-02-05 Thread Wei Liu
On Fri, Feb 05, 2016 at 09:15:52PM +1100, PREETI MISHRA wrote:
> I have a patch in Xen which stores some information of VM process. I have
> another program running in Dom0 which intercept this information.
> 

I'm not sure I can parse this sentence. It's too vague for what you want
to do. For one, I don't know what "VM process" is.

> i) I want to configure my patch running in Xen to send the alert
> notification to program running in Dom0 to read data, probably using event
> channels. How to configure event channel?
> ii) Which API or command will be used to read the data from patch buffer.
> 

If you're writing user space program, code in QEMU might be a good
reference.

Please search for files like "xen_nic.c" in QEMU source code and start
from there.

You can also explore various Xen driver code inside Linux if you're
keen.

Another thing that might be of interest to you is a program lives in
xen.git called xentrace. It extracts buffer from hypervisor, which might
be what you need.

Wei.

> I am running xen 4.6 in ubuntu 14.04
> 
> regards,
> 
> Preeti

> ___
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [xen-unstable test] 80434: regressions - FAIL

2016-02-05 Thread osstest service owner
flight 80434 xen-unstable real [real]
http://logs.test-lab.xenproject.org/osstest/logs/80434/

Regressions :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 9 debian-hvm-install fail 
REGR. vs. 79422
 test-armhf-armhf-xl-credit2  11 guest-start   fail REGR. vs. 79422

Regressions which are regarded as allowable (not blocking):
 test-amd64-amd64-xl-qemut-win7-amd64 16 guest-stopfail REGR. vs. 79422
 test-armhf-armhf-xl-rtds 11 guest-start  fail   like 79379
 build-amd64-rumpuserxen   6 xen-buildfail   like 79422
 build-i386-rumpuserxen6 xen-buildfail   like 79422
 test-amd64-i386-xl-qemut-win7-amd64 16 guest-stop  fail like 79422
 test-amd64-i386-xl-qemuu-win7-amd64 16 guest-stop  fail like 79422

Tests which did not succeed, but are not blocking:
 test-amd64-i386-rumpuserxen-i386  1 build-check(1)   blocked  n/a
 test-amd64-amd64-rumpuserxen-amd64  1 build-check(1)   blocked n/a
 test-amd64-amd64-xl-pvh-amd  11 guest-start  fail   never pass
 test-amd64-amd64-xl-pvh-intel 11 guest-start  fail  never pass
 test-armhf-armhf-libvirt 14 guest-saverestorefail   never pass
 test-armhf-armhf-libvirt 12 migrate-support-checkfail   never pass
 test-armhf-armhf-libvirt-raw 13 guest-saverestorefail   never pass
 test-armhf-armhf-libvirt-raw 11 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt 12 migrate-support-checkfail   never pass
 test-amd64-amd64-qemuu-nested-amd 16 debian-hvm-install/l1/l2  fail never pass
 test-amd64-i386-libvirt  12 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt-xsm  12 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 10 migrate-support-check 
fail never pass
 test-armhf-armhf-xl  12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  13 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck 12 migrate-support-checkfail never pass
 test-armhf-armhf-xl-cubietruck 13 saverestore-support-checkfail never pass
 test-amd64-amd64-libvirt-xsm 12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 13 saverestore-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 12 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-xsm  13 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-xsm  12 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-vhd 11 migrate-support-checkfail   never pass
 test-armhf-armhf-libvirt-xsm 12 migrate-support-checkfail   never pass
 test-armhf-armhf-libvirt-xsm 14 guest-saverestorefail   never pass
 test-armhf-armhf-libvirt-qcow2 11 migrate-support-checkfail never pass
 test-armhf-armhf-libvirt-qcow2 13 guest-saverestorefail never pass
 test-armhf-armhf-xl-vhd  11 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  12 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  13 saverestore-support-checkfail   never pass

version targeted for testing:
 xen  be05b5385fb261c1fa1cbb6b4bdc12a6e8676c4b
baseline version:
 xen  9937763265d9597e5f2439249b16d995842cdf0f

Last test of basis79422  2016-01-29 14:09:49 Z6 days
Failing since 79502  2016-01-30 20:16:40 Z5 days6 attempts
Testing same since80434  2016-02-04 10:25:38 Z1 days1 attempts


People who touched revisions under test:
  Alan.Robinson 
  Andrew Cooper 
  Boris Ostrovsky 
  Corneliu ZUZU 
  David Vrabel 
  Doug Goldstein 
  Graeme Gregory 
  Hanjun Guo 
  Ian Campbell 
  Ian Jackson 
  Jan Beulich 
  Jennifer Herbert 
  Juergen Gross 
  Kevin Tian 
  Konrad Rzeszutek Wilk 
  Parth Dixit 
  Rafael J. Wysocki 
  Razvan Cojocaru 
  Roger Pau Monne 
  Roger Pau Monné 
  Shannon Zhao 
  Shuai Ruan 
  Stefano Stabellini 
  Tamas K Lengyel 
  Tim Deegan 
  Vitaly Kuznetsov 
  Wei Liu 
  Yu Zhang 
  Zoltan Kiss 

jobs:
 build-amd64-xsm  pass
 build-armhf-xsm  pass
 build-i386-xsm   pass
 build-amd64  pass
 build-armhf  pass
 build-i386   pass
 build-amd64-libvirt  pass
 build-armhf-libvirt  pass
 build-i386-l

Re: [Xen-devel] HVMlite ABI specification DRAFT A

2016-02-05 Thread Andrew Cooper
On 05/02/16 10:40, Jan Beulich wrote:
 On 05.02.16 at 10:50,  wrote:
>> For legacy PCI interrupts, we can parse the MADT inside of Xen in order
>> to properly setup the lines/overwrites and inject the interrupts that
>> are not handled by Xen straight into the hardware domain. This will
>> require us to be able to emulate the same topology as what is found in
>> native (eg: if there are two IO APICs in the hardware we should also
>> provide two emulated ones to the hw domain).
> I don't think MADT contains all the needed information, or else we
> wouldn't need PHYSDEVOP_setup_gsi.
>
>> As for PCI config space accesses, don't we already do that? We trap on
>> access to the 0xcf8 io port.
> We intercept that, but iirc we do no translation (and for DomU
> these get forwarded to qemu anyway).

This is one aspect which will change with the proposed plans to have a
small host bridge/root complex in Xen.

Currently, cf8/cfc handling is already done partly in Xen because of
multiple ioreq server handling.  However, the current setup completely
fails if the guest attempts to renumber the PCI Buses, and requires each
ioreq server to coordinate with their introduced topology.

A small host bridge and root complex in Xen solves all of these problems
for us, reduces the number of broadcast ioreqs Xen needs to make, and
allows multiple ioreq servers to function completely without any
self-coordination.

>
>  * `eflags`: bit 17 (VM) must be cleared. Bit 9 (IF) must be cleared.
>Bit 8 (TF) must be cleared. Other bits are all unspecified.
 I would also specify that the direction flag shall be clear, to prevent
 all kernels needing to `cld` on entry.
>>> In which case IOPL and AC state should perhaps also be nailed down?
>>> Possibly even all of the control ones (leaving only the status flags
>>> unspecified)?
>> Status flag? Why don't we just say that all user-settable bits in the
>> status register will be set to 0 (or cleared)?
> Would be an option too.

What about the ID bit, which probably ought to be set?

~Andrew

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v3 3/3] tools: introduce parameter max_wp_ram_ranges.

2016-02-05 Thread George Dunlap
On Fri, Feb 5, 2016 at 3:44 AM, Tian, Kevin  wrote:
>> > So as long as the currently-in-use GTT tree contains no more than
>> > $LIMIT ranges, you can unshadow and reshadow; this will be slow, but
>> > strictly speaking correct.
>> >
>> > What do you do if the guest driver switches to a GTT such that the
>> > entire tree takes up more than $LIMIT entries?
>>
>> GPU has some special properties different from CPU, which make things
>> easier. The GPU page table is constructed by CPU and used by GPU
>> workloads. GPU workload itself will not change the page table.
>> Meanwhile, GPU workload submission, in virtualized environment, is
>> controled by our device model. Then we can reshadow the whole table
>> every time before we submit workload. That can reduce the total number
>> required for write protection but with performance impact, because GPU
>> has to have more idle time waiting for CPU. Hope the info helps.
>> Thanks!
>>
>
> Putting in another way, it's fully under mediation when a GPU page table
> (GTT) will be referenced by the GPU, so there're plenty of room to
> optimize existing shadowing (always shadowing all recognized GPU page
> tables), e.g. shadowing only active one when a VM is scheduled in. It's
> a performance matter but no correctness issue.
>
> This is why Yu mentioned earlier whether we can just set a default
> limit which is good for majority of use cases, while extending our
> device mode to drop/recreate some shadow tables upon the limitation
> is hit. I think this matches how today's CPU shadow page table is
> implemented, which also has a limitation of how many shadow pages
> are allowed per-VM.

I don't think you've understood my question (or maybe I still don't
understood the situation properly).

So in memory pagetables, there's a "tree" that contains a single
top-level page, which points to other pages, which defines one address
space (usually corresponding to one process or thread).   (This is
often just refered to as 'cr3', since it's the value you write into
the cr3 register on x86 processors.) I'm assuming that the structure
is similar for your GPU translation tables -- that a single GTT is
effectively a "tree" sort of like a process address space for an OS.

And it sounds like what you're saying is: suppose we have 10 different
GTTs (i.e., an entire tree / gpu thread), and each one require 1024
ranges to shadow.  In that case, a limit of 8192 ranges means we can
only keep 8 of the ten actually shadowed at any one time.  This is not
optimal, since it will occasionally mean unshadowing an entire GTT and
re-shadowing another one, but it will work, because we can always make
sure that the currently-active GTT is shadowed.

My question is, suppose a single GTT / gpu thread / tree has 9000
ranges.  It would be trivial for an attacker to break into the
operating system and *construct* such a tree, but it's entirely
possible that due to a combination of memory fragmentation and very
large usage, the normal driver might accidentally create such a GTT.
In that case, the device model will not be able to write-protect all
the pages in the single GTT, and thus will not be able to correctly
track changes to the currently-active GTT.  What does your device
model do in that case?

 -George

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] HVMlite ABI specification DRAFT A

2016-02-05 Thread Jan Beulich
>>> On 05.02.16 at 12:04,  wrote:
> On 05/02/16 10:40, Jan Beulich wrote:
> On 05.02.16 at 10:50,  wrote:
>>> Status flag? Why don't we just say that all user-settable bits in the
>>> status register will be set to 0 (or cleared)?
>> Would be an option too.
> 
> What about the ID bit, which probably ought to be set?

Why that? This flag exists solely to indicate presence of CPUID,
and does so by being modifiable (not by being set).

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] Help in fixing an issue

2016-02-05 Thread Wei Liu
Add back xen-devel, please use "reply-all" in the future.

And please don't top-post.

On Fri, Feb 05, 2016 at 10:01:57PM +1100, PREETI MISHRA wrote:
> Thanks for the reply,
> 
> actually, I have a virtual machine in which some processes are running. I
> want to analysis their behavior using VMI at xen.
> 

Have you checked out libvmi in this case?

> My tool has two components: i) xen patch running at hypervisor ii) analyzing
> component running at Dom0
> 
> 1. Xen patch is responsible for collecting the system call information of a
> monitored process. It can be syscall no and process id.
>   - how to take this information? //particularly i want to trap any
> execution of monitored programs.
>   - how to store this information in buffer?
>-how to send alert to my another daemon (analyzing component)
> running in Dom0.

See xentrace / xenanalyze in xen.git.

> 2. On receiving alert, analyzing component perform analysis over the
> collected data.
>   - how to send the buffer information (pid and system call no) to
> analyzing component?
> 

Anyway, I'm no expert on this. I will let other people have a look.

My gut feeling is that libvmi should be the way to go.

Wei.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v3 3/3] tools: introduce parameter max_wp_ram_ranges.

2016-02-05 Thread George Dunlap
On Fri, Feb 5, 2016 at 9:24 AM, Paul Durrant  wrote:
> Utilizing the default server is a backwards step. GVT-g would have to use the 
> old HVM_PARAM mechanism to cause its emulator to become default. I think a 
> more appropriate mechanism would be p2m_mmio_write_dm to become something 
> like 'p2m_ioreq_server_write' and then have a hypercall to allow it to be 
> mapped to a particular ioreq server.
> Obviously only one could claim it but, with a p2t, the bit could be 
> re-purposed to simply mean 'go look in the p2t' for more information and then 
> the p2t could be structured to allow emulations to be steered to one of many 
> ioreq servers (for read and/or write emulation).

Right; I had in mind that Xen would allow at any given time a max of N
ioreq servers to register for mmio_write_dm ranges, first-come
first-served; with 'N' being '1' to begin with.  If a second ioreq
server requested mmio_write_dm functionality, it would get -EBUSY.
This would allow their current setup (one qemu dm which doesn't do
mmio_write_dm, one xengt dm which does) to work without needing to
worry any more about how many pages might need to be tracked (either
for efficiency or correctness).

We could then extend this to some larger number (4 seems pretty
reasonable to me) either by adding an extra 3 types, or by some other
method (such as the one Paul suggests).

 -George

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] Help in fixing an issue

2016-02-05 Thread Wei Liu
BTW please check out:

 http://wiki.xen.org/wiki/Asking_Xen_Devel_Questions

Wei.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v3 3/3] tools: introduce parameter max_wp_ram_ranges.

2016-02-05 Thread Paul Durrant
> -Original Message-
> From: dunl...@gmail.com [mailto:dunl...@gmail.com] On Behalf Of
> George Dunlap
> Sent: 05 February 2016 11:14
> To: Paul Durrant
> Cc: Jan Beulich; George Dunlap; Kevin Tian; Wei Liu; Ian Campbell; Andrew
> Cooper; Zhang Yu; xen-devel@lists.xen.org; Stefano Stabellini;
> zhiyuan...@intel.com; Ian Jackson; Keir (Xen.org)
> Subject: Re: [Xen-devel] [PATCH v3 3/3] tools: introduce parameter
> max_wp_ram_ranges.
> 
> On Fri, Feb 5, 2016 at 9:24 AM, Paul Durrant 
> wrote:
> > Utilizing the default server is a backwards step. GVT-g would have to use
> the old HVM_PARAM mechanism to cause its emulator to become default. I
> think a more appropriate mechanism would be p2m_mmio_write_dm to
> become something like 'p2m_ioreq_server_write' and then have a hypercall
> to allow it to be mapped to a particular ioreq server.
> > Obviously only one could claim it but, with a p2t, the bit could be re-
> purposed to simply mean 'go look in the p2t' for more information and then
> the p2t could be structured to allow emulations to be steered to one of many
> ioreq servers (for read and/or write emulation).
> 
> Right; I had in mind that Xen would allow at any given time a max of N
> ioreq servers to register for mmio_write_dm ranges, first-come
> first-served; with 'N' being '1' to begin with.  If a second ioreq
> server requested mmio_write_dm functionality, it would get -EBUSY.
> This would allow their current setup (one qemu dm which doesn't do
> mmio_write_dm, one xengt dm which does) to work without needing to
> worry any more about how many pages might need to be tracked (either
> for efficiency or correctness).
> 
> We could then extend this to some larger number (4 seems pretty
> reasonable to me) either by adding an extra 3 types, or by some other
> method (such as the one Paul suggests).

I think it would be best to do away with the 'write dm' name though. I would 
like to see it be possible to steer reads+writes, as well as writes (and maybe 
just reads?) to a particular ioreq server based on type information. So maybe 
we just call the existing type 'p2m_ioreq_server' and then, in the absence of a 
p2t, hardcode this to go to whichever emulator makes the new TBD hypercall.
I think we need a proper design at this point. Given that it's Chinese New Year 
maybe I'll have a stab in Yu's absence.

  Paul

> 
>  -George
___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] HVMlite ABI specification DRAFT A

2016-02-05 Thread Roger Pau Monné
El 5/2/16 a les 11:40, Jan Beulich ha escrit:
 On 05.02.16 at 10:50,  wrote:
>> For legacy PCI interrupts, we can parse the MADT inside of Xen in order
>> to properly setup the lines/overwrites and inject the interrupts that
>> are not handled by Xen straight into the hardware domain. This will
>> require us to be able to emulate the same topology as what is found in
>> native (eg: if there are two IO APICs in the hardware we should also
>> provide two emulated ones to the hw domain).
> 
> I don't think MADT contains all the needed information, or else we
> wouldn't need PHYSDEVOP_setup_gsi.

AFAICT, I think we could do something like:

 - IRQs [0, 15]: edge-trigger, low-polarity.
 - IRQs [16, n]: level-triggered, high-polarity.

Unless there's an overwrite in the MADT. Then there are interrupts that
are handled by Xen, which would not be passed-through to the hardware
domain, the rest would be.

I expect that Xen will already have some code to deal with this, since
it's also used for regular PCI-passthrough.

>> As for PCI config space accesses, don't we already do that? We trap on
>> access to the 0xcf8 io port.
> 
> We intercept that, but iirc we do no translation (and for DomU
> these get forwarded to qemu anyway).
> 
>  * `eflags`: bit 17 (VM) must be cleared. Bit 9 (IF) must be cleared.
>Bit 8 (TF) must be cleared. Other bits are all unspecified.

 I would also specify that the direction flag shall be clear, to prevent
 all kernels needing to `cld` on entry.
>>>
>>> In which case IOPL and AC state should perhaps also be nailed down?
>>> Possibly even all of the control ones (leaving only the status flags
>>> unspecified)?
>>
>> Status flag? Why don't we just say that all user-settable bits in the
>> status register will be set to 0 (or cleared)?
> 
> Would be an option too.

AFAICT that's what we already do, so I will add it to the next iteration.


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v4 04/10] x86/hvm: Collect information of TSC scaling ratio

2016-02-05 Thread Jan Beulich
>>> On 17.01.16 at 22:58,  wrote:
> Both VMX TSC scaling and SVM TSC ratio use the 64-bit TSC scaling ratio,
> but the number of fractional bits of the ratio is different between VMX
> and SVM. This patch adds the architecture code to collect the number of
> fractional bits and other related information into fields of struct
> hvm_function_table so that they can be used in the common code.
> 
> Signed-off-by: Haozhong Zhang 
> Reviewed-by: Kevin Tian 
> Reviewed-by: Boris Ostrovsky 
> ---
> Changes in v4:
>  (addressing Jan Beulich's comments in v3 patch 12)
>  * Set TSC scaling parameters in hvm_funcs conditionally.
>  * Remove TSC scaling parameter tsc_scaling_supported in hvm_funcs which
>can be derived from other parameters.
>  (code cleanup)
>  * Merge with v3 patch 11 "x86/hvm: Detect TSC scaling through hvm_funcs"
>whose work can be done early in this patch.

I really think this the scope of these changes should have invalidated
all earlier tags.

> --- a/xen/arch/x86/hvm/svm/svm.c
> +++ b/xen/arch/x86/hvm/svm/svm.c
> @@ -1450,6 +1450,14 @@ const struct hvm_function_table * __init 
> start_svm(void)
>  if ( !cpu_has_svm_nrips )
>  clear_bit(SVM_FEATURE_DECODEASSISTS, &svm_feature_flags);
>  
> +if ( cpu_has_tsc_ratio )
> +{
> +svm_function_table.default_tsc_scaling_ratio = DEFAULT_TSC_RATIO;
> +svm_function_table.max_tsc_scaling_ratio = ~TSC_RATIO_RSVD_BITS;
> +svm_function_table.tsc_scaling_ratio_frac_bits = 32;
> +svm_function_table.scale_tsc = svm_scale_tsc;
> +}
> +
>  #define P(p,s) if ( p ) { printk(" - %s\n", s); printed = 1; }
>  P(cpu_has_svm_npt, "Nested Page Tables (NPT)");
>  P(cpu_has_svm_lbrv, "Last Branch Record (LBR) Virtualisation");
> @@ -2269,8 +2277,6 @@ static struct hvm_function_table __initdata 
> svm_function_table = {
>  .nhvm_vmcx_hap_enabled = nsvm_vmcb_hap_enabled,
>  .nhvm_intr_blocked = nsvm_intr_blocked,
>  .nhvm_hap_walk_L1_p2m = nsvm_hap_walk_L1_p2m,
> -
> -.scale_tsc= svm_scale_tsc,
>  };

>From at the first glance purely mechanical POV this change was
unnecessary with ...

> @@ -249,6 +261,8 @@ void hvm_set_guest_tsc_fixed(struct vcpu *v, u64 
> guest_tsc, u64 at_tsc);
>  u64 hvm_get_guest_tsc_fixed(struct vcpu *v, u64 at_tsc);
>  #define hvm_get_guest_tsc(v) hvm_get_guest_tsc_fixed(v, 0)
>  
> +#define hvm_tsc_scaling_supported (!!hvm_funcs.default_tsc_scaling_ratio)

... this, but considering our general aim to avoid having NULL
callback pointers wherever possible, I think this is more than just
a mechanical concern: I'd prefer if at least the callback pointer
always be statically initialized, and ideally also two of the other
fields. Only one field should be dynamically initialized (unless -
considering the VMX code to come - static initialization is
impossible), and ideally one which, if zero, would not have any
bad consequences if used by mistake (frac_bits maybe). And
perhaps an ASSERT() should be placed inside svm_scale_tsc()
making sure the dynamically initialized field actually is initialized.

The conditional here would then check _all_ fields which either
vendor's code leaves uninitialized (i.e. the VMX patch may then
add to the above).

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] HVMlite ABI specification DRAFT A

2016-02-05 Thread Jan Beulich
>>> On 05.02.16 at 12:30,  wrote:
> El 5/2/16 a les 11:40, Jan Beulich ha escrit:
> On 05.02.16 at 10:50,  wrote:
>>> For legacy PCI interrupts, we can parse the MADT inside of Xen in order
>>> to properly setup the lines/overwrites and inject the interrupts that
>>> are not handled by Xen straight into the hardware domain. This will
>>> require us to be able to emulate the same topology as what is found in
>>> native (eg: if there are two IO APICs in the hardware we should also
>>> provide two emulated ones to the hw domain).
>> 
>> I don't think MADT contains all the needed information, or else we
>> wouldn't need PHYSDEVOP_setup_gsi.
> 
> AFAICT, I think we could do something like:
> 
>  - IRQs [0, 15]: edge-trigger, low-polarity.
>  - IRQs [16, n]: level-triggered, high-polarity.

That's not a valid assumption - I've seen systems with other settings
on GSI >= 16 ...

> Unless there's an overwrite in the MADT.

... and iirc that was without any MADT override (but instead coming
from the DSDT/SSDT).

> I expect that Xen will already have some code to deal with this, since
> it's also used for regular PCI-passthrough.

This has little to do with pass-through - we first of all need to get
the host working correctly on its own.

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] HVMlite ABI specification DRAFT A

2016-02-05 Thread Roger Pau Monné
El 5/2/16 a les 12:45, Jan Beulich ha escrit:
 On 05.02.16 at 12:30,  wrote:
>> El 5/2/16 a les 11:40, Jan Beulich ha escrit:
>> On 05.02.16 at 10:50,  wrote:
 For legacy PCI interrupts, we can parse the MADT inside of Xen in order
 to properly setup the lines/overwrites and inject the interrupts that
 are not handled by Xen straight into the hardware domain. This will
 require us to be able to emulate the same topology as what is found in
 native (eg: if there are two IO APICs in the hardware we should also
 provide two emulated ones to the hw domain).
>>>
>>> I don't think MADT contains all the needed information, or else we
>>> wouldn't need PHYSDEVOP_setup_gsi.
>>
>> AFAICT, I think we could do something like:
>>
>>  - IRQs [0, 15]: edge-trigger, low-polarity.
>>  - IRQs [16, n]: level-triggered, high-polarity.
> 
> That's not a valid assumption - I've seen systems with other settings
> on GSI >= 16 ...

Then we just propagate how the emulated IO APIC pins are setup to the
real one, this should match reality, and is no different from using
PHYSDEVOP_setup_gsi. AFAICT it's just a different way of getting the
same information.

Roger.


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [qemu-mainline test] 80469: regressions - FAIL

2016-02-05 Thread Anthony PERARD
On Fri, Feb 05, 2016 at 06:30:25AM +, osstest service owner wrote:
> flight 80469 qemu-mainline real [real]
> http://logs.test-lab.xenproject.org/osstest/logs/80469/
> 
> Regressions :-(
> 
> Tests which did not succeed and are blocking,
> including tests which could not be run:
>  build-i3865 xen-build fail REGR. vs. 
> 79947
>  build-amd64   5 xen-build fail REGR. vs. 
> 79947
>  build-i386-xsm5 xen-build fail REGR. vs. 
> 79947
>  build-amd64-xsm   5 xen-build fail REGR. vs. 
> 79947
>  build-armhf   5 xen-build fail REGR. vs. 
> 79947
>  build-armhf-xsm   5 xen-build fail REGR. vs. 
> 79947

./configure of QEMU fail with:
"ERROR: invalid trace backends
Please choose supported trace backends."

They have removed the "stderr" trace backend, and replaced it by "log", which
also became the default. I have not looked at what to do yet, but the configure
option "--enable-trace-backend=stderr" has become invalid.

-- 
Anthony PERARD

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH] x86/HVM: rewrite the start info structure definition in binary form

2016-02-05 Thread Roger Pau Monne
This will prevent alignments from getting in the way. It's not safe to
define this memory structures using C anyway, since the ABI depends on the
bitness, while our protocol does not.

Also add a command line parameter to each module, and a reserved field in
order to have the layout aligned. Note that the current implementation in
libxc doesn't make use of the module command line at all.

Signed-off-by: Roger Pau Monné 
---
Cc: Samuel Thibault 
Cc: Ian Jackson 
Cc: Ian Campbell 
Cc: Wei Liu 
Cc: Jan Beulich 
Cc: Andrew Cooper 
---
 tools/libxc/include/xc_dom.h | 28 
 xen/include/public/xen.h | 42 --
 2 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/tools/libxc/include/xc_dom.h b/tools/libxc/include/xc_dom.h
index cac4698..e5ab56c 100644
--- a/tools/libxc/include/xc_dom.h
+++ b/tools/libxc/include/xc_dom.h
@@ -216,6 +216,34 @@ struct xc_dom_image {
 struct xc_hvm_firmware_module smbios_module;
 };
 
+#if defined(__i386__) || defined(__x86_64__)
+/* C representation of the x86/HVM start info layout.
+ *
+ * The canonical definition of this layout resides in public/xen.h, this
+ * is just a way to represent the layout described there using C types.
+ *
+ * NB: the packed attribute is not really needed, but it helps us enforce
+ * the fact that this is just a representation, and it might indeed
+ * be required in the future if there are alignment changes.
+ */
+struct hvm_start_info {
+uint32_t magic; /* Contains the magic value 0x336ec578   */
+/* ("xEn3" with the 0x80 bit of the "E" set).*/
+uint32_t flags; /* SIF_xxx flags.*/
+uint32_t cmdline_paddr; /* Physical address of the command line. */
+uint32_t nr_modules;/* Number of modules passed to the kernel.   */
+uint32_t modlist_paddr; /* Physical address of an array of   */
+/* hvm_modlist_entry.*/
+} __attribute__((packed));
+
+struct hvm_modlist_entry {
+uint32_t paddr; /* Physical address of the module.   */
+uint32_t size;  /* Size of the module in bytes.  */
+uint32_t cmdline_paddr; /* Physical address of the command line. */
+uint32_t reserved;
+} __attribute__((packed));
+#endif /* x86 */
+
 /* --- pluggable kernel loader - */
 
 struct xc_dom_loader {
diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h
index 7b629b1..e1350d0 100644
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -790,22 +790,36 @@ typedef struct start_info start_info_t;
  * NOTE: nothing will be loaded at physical address 0, so
  * a 0 value in any of the address fields should be treated
  * as not present.
+ *
+ *  0 ++
+ *| magic  | Contains the magic value HVM_START_MAGIC_VALUE
+ *|| ("xEn3" with the 0x80 bit of the "E" set).
+ *  4 ++
+ *| flags  | SIF_xxx flags.
+ *  8 ++
+ *| cmdline_paddr  | Physical address of the command line,
+ *|| a zero-terminated ASCII string.
+ * 12 ++
+ *| nr_modules | Number of modules passed to the kernel.
+ * 16 ++
+ *| modlist_paddr  | Physical address of an array of modules
+ *|| (layout of the structure below).
+ * 20 ++
+ *
+ * The layout of each entry in the module structure is the following:
+ *
+ *  0 ++
+ *| paddr  | Physical address of the module.
+ *  4 ++
+ *| size   | Size of the module in bytes.
+ *  8 ++
+ *| cmdline_paddr  | Physical address of the command line,
+ *|| a zero-terminated ASCII string.
+ * 12 ++
+ *| reserved   |
+ * 16 ++
  */
-struct hvm_start_info {
 #define HVM_START_MAGIC_VALUE 0x336ec578
-uint32_t magic; /* Contains the magic value 0x336ec578   */
-/* ("xEn3" with the 0x80 bit of the "E" set).*/
-uint32_t flags; /* SIF_xxx flags.*/
-uint32_t cmdline_paddr; /* Physical address of the command line. */
-uint32_t nr_modules;/* Number of modules passed to the kernel.   */
-uint32_t modlist_paddr; /* Physical address of an array of   */
-/* hvm_modlist_entry.*/
-};
-
-struct hvm_modlist_entry {
-uint32_t paddr; /* Physical address of the module.   */
-uint32_t size;  /* Size of the module in bytes.  */
-};
 
 /* New console union for dom0 introduced in 0x00030203. */
 #if __XEN_INTERFACE_VERSION__ < 0x00030203
-- 
2.5.4 (Apple Git-61)


___

Re: [Xen-devel] [PATCH] x86/HVM: rewrite the start info structure definition in binary form

2016-02-05 Thread Jan Beulich
>>> On 05.02.16 at 13:28,  wrote:
> This will prevent alignments from getting in the way. It's not safe to
> define this memory structures using C anyway, since the ABI depends on the
> bitness, while our protocol does not.
> 
> Also add a command line parameter to each module, and a reserved field in
> order to have the layout aligned. Note that the current implementation in
> libxc doesn't make use of the module command line at all.

Which would seem wrong then - what use is the field if it doesn't
get filled? Or is that because it has nowhere to come from? But
even then - wouldn't what I've read on the other thread mean
at least the filename should be put there (as kind of the first
command line element)?

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH 0/2] xen/scsiback: correct two issues

2016-02-05 Thread Juergen Gross
Correct two issues in the Xen pvscsi backend.

Juergen Gross (2):
  xen/scsiback: correct frontend counting
  xen/scsiback: avoid warnings when adding multiple LUNs to a domain

 drivers/xen/xen-scsiback.c | 75 ++
 1 file changed, 42 insertions(+), 33 deletions(-)

-- 
2.6.2


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH 1/2] xen/scsiback: correct frontend counting

2016-02-05 Thread Juergen Gross
When adding a new frontend to xen-scsiback don't decrement the number
of active frontends in case of no error. Not doing so results in a
failure when trying to remove the xen-pvscsi nexus even if no domain
is using it.

Signed-off-by: Juergen Gross 
Cc: sta...@vger.kernel.org
---
 drivers/xen/xen-scsiback.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c
index ad4eb10..51387d7 100644
--- a/drivers/xen/xen-scsiback.c
+++ b/drivers/xen/xen-scsiback.c
@@ -939,12 +939,12 @@ out:
spin_unlock_irqrestore(&info->v2p_lock, flags);
 
 out_free:
-   mutex_lock(&tpg->tv_tpg_mutex);
-   tpg->tv_tpg_fe_count--;
-   mutex_unlock(&tpg->tv_tpg_mutex);
-
-   if (err)
+   if (err) {
+   mutex_lock(&tpg->tv_tpg_mutex);
+   tpg->tv_tpg_fe_count--;
+   mutex_unlock(&tpg->tv_tpg_mutex);
kfree(new);
+   }
 
return err;
 }
-- 
2.6.2


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH 2/2] xen/scsiback: avoid warnings when adding multiple LUNs to a domain

2016-02-05 Thread Juergen Gross
When adding more than one LUN to a frontend a warning for a failed
assignment is issued in dom0 for each already existing LUN. Avoid this
warning.

Signed-off-by: Juergen Gross 
Cc: sta...@vger.kernel.org
---
 drivers/xen/xen-scsiback.c | 65 ++
 1 file changed, 37 insertions(+), 28 deletions(-)

diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c
index 51387d7..69de879 100644
--- a/drivers/xen/xen-scsiback.c
+++ b/drivers/xen/xen-scsiback.c
@@ -849,15 +849,31 @@ static int scsiback_map(struct vscsibk_info *info)
 }
 
 /*
+  Check for a translation entry being present
+*/
+static struct v2p_entry *scsiback_chk_translation_entry(
+   struct vscsibk_info *info, struct ids_tuple *v)
+{
+   struct list_head *head = &(info->v2p_entry_lists);
+   struct v2p_entry *entry;
+
+   list_for_each_entry(entry, head, l)
+   if ((entry->v.chn == v->chn) &&
+   (entry->v.tgt == v->tgt) &&
+   (entry->v.lun == v->lun))
+   return entry;
+
+   return NULL;
+}
+
+/*
   Add a new translation entry
 */
 static int scsiback_add_translation_entry(struct vscsibk_info *info,
  char *phy, struct ids_tuple *v)
 {
int err = 0;
-   struct v2p_entry *entry;
struct v2p_entry *new;
-   struct list_head *head = &(info->v2p_entry_lists);
unsigned long flags;
char *lunp;
unsigned long long unpacked_lun;
@@ -917,15 +933,10 @@ static int scsiback_add_translation_entry(struct 
vscsibk_info *info,
spin_lock_irqsave(&info->v2p_lock, flags);
 
/* Check double assignment to identical virtual ID */
-   list_for_each_entry(entry, head, l) {
-   if ((entry->v.chn == v->chn) &&
-   (entry->v.tgt == v->tgt) &&
-   (entry->v.lun == v->lun)) {
-   pr_warn("Virtual ID is already used. Assignment was not 
performed.\n");
-   err = -EEXIST;
-   goto out;
-   }
-
+   if (scsiback_chk_translation_entry(info, v)) {
+   pr_warn("Virtual ID is already used. Assignment was not 
performed.\n");
+   err = -EEXIST;
+   goto out;
}
 
/* Create a new translation entry and add to the list */
@@ -933,7 +944,7 @@ static int scsiback_add_translation_entry(struct 
vscsibk_info *info,
new->v = *v;
new->tpg = tpg;
new->lun = unpacked_lun;
-   list_add_tail(&new->l, head);
+   list_add_tail(&new->l, &info->v2p_entry_lists);
 
 out:
spin_unlock_irqrestore(&info->v2p_lock, flags);
@@ -962,33 +973,31 @@ static int scsiback_del_translation_entry(struct 
vscsibk_info *info,
  struct ids_tuple *v)
 {
struct v2p_entry *entry;
-   struct list_head *head = &(info->v2p_entry_lists);
unsigned long flags;
 
spin_lock_irqsave(&info->v2p_lock, flags);
/* Find out the translation entry specified */
-   list_for_each_entry(entry, head, l) {
-   if ((entry->v.chn == v->chn) &&
-   (entry->v.tgt == v->tgt) &&
-   (entry->v.lun == v->lun)) {
-   goto found;
-   }
-   }
-
-   spin_unlock_irqrestore(&info->v2p_lock, flags);
-   return 1;
-
-found:
-   /* Delete the translation entry specfied */
-   __scsiback_del_translation_entry(entry);
+   entry = scsiback_chk_translation_entry(info, v);
+   if (entry)
+   __scsiback_del_translation_entry(entry);
 
spin_unlock_irqrestore(&info->v2p_lock, flags);
-   return 0;
+   return entry == NULL;
 }
 
 static void scsiback_do_add_lun(struct vscsibk_info *info, const char *state,
char *phy, struct ids_tuple *vir, int try)
 {
+   struct v2p_entry *entry;
+   unsigned long flags;
+
+   if (try) {
+   spin_lock_irqsave(&info->v2p_lock, flags);
+   entry = scsiback_chk_translation_entry(info, vir);
+   spin_unlock_irqrestore(&info->v2p_lock, flags);
+   if (entry)
+   return;
+   }
if (!scsiback_add_translation_entry(info, phy, vir)) {
if (xenbus_printf(XBT_NIL, info->dev->nodename, state,
  "%d", XenbusStateInitialised)) {
-- 
2.6.2


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] HVMlite ABI specification DRAFT A

2016-02-05 Thread Jan Beulich
>>> On 05.02.16 at 12:50,  wrote:
> El 5/2/16 a les 12:45, Jan Beulich ha escrit:
> On 05.02.16 at 12:30,  wrote:
>>> El 5/2/16 a les 11:40, Jan Beulich ha escrit:
>>> On 05.02.16 at 10:50,  wrote:
> For legacy PCI interrupts, we can parse the MADT inside of Xen in order
> to properly setup the lines/overwrites and inject the interrupts that
> are not handled by Xen straight into the hardware domain. This will
> require us to be able to emulate the same topology as what is found in
> native (eg: if there are two IO APICs in the hardware we should also
> provide two emulated ones to the hw domain).

 I don't think MADT contains all the needed information, or else we
 wouldn't need PHYSDEVOP_setup_gsi.
>>>
>>> AFAICT, I think we could do something like:
>>>
>>>  - IRQs [0, 15]: edge-trigger, low-polarity.
>>>  - IRQs [16, n]: level-triggered, high-polarity.
>> 
>> That's not a valid assumption - I've seen systems with other settings
>> on GSI >= 16 ...
> 
> Then we just propagate how the emulated IO APIC pins are setup to the
> real one, this should match reality, and is no different from using
> PHYSDEVOP_setup_gsi. AFAICT it's just a different way of getting the
> same information.

That won't work either I'm afraid: For one, Dom0 may not even write
RTEs for interrupts it never enables. And even if it did, it would write
them masked, yet we mustn't derive information from masked RTEs -
see commit 669d4b85c4 ("x86/IO-APIC: don't create pIRQ mapping
from masked RTE"). Also consider e.g. the device IRQ which the
serial driver may be using: We specifically suppress modifications to
RTEs for in-use IRQs in current code and would of course need to
do so in the PVHv2 code too. That way there would be no proper
way to establish the two bits (short of grabbing the data from what
Dom0 tries to write despite us otherwise suppressing the write).

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v4 05/10] x86: Add functions for 64-bit integer arithmetic

2016-02-05 Thread Jan Beulich
>>> On 17.01.16 at 22:58,  wrote:
> This patch adds several functions to take multiplication, division and
> shifting involving 64-bit integers.
> 
> Signed-off-by: Haozhong Zhang 
> Reviewed-by: Boris Ostrovsky 
> ---
> Changes in v4:
>  (addressing Jan Beulich's comments)
>  * Rewrite mul_u64_u64_shr() in assembly.

Thanks, but it puzzles me that the other one didn't get converted
as well. Anyway, I'm not going to make this a requirement, since
at least it appears to match Linux'es variant.

> +static inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int n)
> +{
> +u64 hi, lo;
> +
> +asm volatile ( "mulq %2; shrdq %1,%0"
> +   : "=a" (lo), "=d" (hi)
> +   : "rm" (mul), "0" (a), "c" (n) );

SHRD formally is a 3-operand instruction, and the fact that gas'
AT&T syntax supports a 2-operand "alias" is, well, odd. Please
let's use the specification mandated 3-operand form properly,
to avoid surprises with e.g. clang.

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH RFC v2 00/30] x86: Improvements to cpuid handling for guests

2016-02-05 Thread Andrew Cooper
Presented here is v2 of my work to improve cpuid levelling for guests.

This series is available in git form at:
  http://xenbits.xen.org/git-http/people/andrewcoop/xen.git levelling-v2

Major changes from v1 include a rebase onto staging, reworking of the
automatic generation of cpu featureset information, and fixes to xsave
handling for PV guests on Intel.

There is still an outstanding issue with xsave handling for PV guests on AMD
which I am investigating, and is the cause for the series still being RFC.

The current cpuid code, both in the hypervisor and toolstack, has grown
organically for a very long time, and is flawed in many ways.  This series
focuses specifically on fixing the bits pertaining to the visible
features, and I will be fixing other areas in future work (e.g. per-core,
per-package values, auditing of incoming migration values, etc.)

These changes alter the workflow of cpuid handling as follows:

Xen boots and evaluates its current capabilities.  It uses this information to
calculate the maximum featuresets it can provide to guests, and provides this
information for toolstack consumption.  A toolstack may then calculate a safe
set of features (taking into account migratability), and sets a guests cpuid
policy.  Xen then takes care of context switching the levelling state.

In particular, this means that PV guests may have different levels while
running on the same host, an option which was not previously available.

Andrew Cooper (30):
  xen/x86: Drop X86_FEATURE_3DNOW_ALT
  xen/x86: Do not store VIA/Cyrix/Centaur CPU features
  xen/x86: Drop cpuinfo_x86.x86_power
  xen/x86: Improvements to pv_cpuid()
  xen/public: Export cpu featureset information in the public API
  xen/x86: Script to automatically process featureset information
  xen/x86: Collect more cpuid feature leaves
  xen/x86: Mask out unknown features from Xen's capabilities
  xen/x86: Store antifeatures inverted in a featureset
  xen/x86: Annotate VM applicability in featureset
  xen/x86: Calculate maximum host and guest featuresets
  xen/x86: Generate deep dependencies of features
  xen/x86: Clear dependent features when clearing a cpu cap
  xen/x86: Improve disabling of features which have dependencies
  xen/x86: Improvements to in-hypervisor cpuid sanity checks
  x86/cpu: Move set_cpumask() calls into c_early_init()
  x86/cpu: Common infrastructure for levelling context switching
  x86/cpu: Rework AMD masking MSR setup
  x86/cpu: Rework Intel masking/faulting setup
  x86/cpu: Context switch cpuid masks and faulting state in
context_switch()
  x86/pv: Provide custom cpumasks for PV domains
  x86/domctl: Update PV domain cpumasks when setting cpuid policy
  xen+tools: Export maximum host and guest cpu featuresets via SYSCTL
  tools/libxc: Modify bitmap operations to take void pointers
  tools/libxc: Use public/featureset.h for cpuid policy generation
  tools/libxc: Expose the automatically generated cpu featuremask
information
  tools: Utility for dealing with featuresets
  tools/libxc: Wire a featureset through to cpuid policy logic
  tools/libxc: Use featuresets rather than guesswork
  tools/libxc: Calculate xstate cpuid leaf from guest information

 .gitignore  |   2 +
 tools/libxc/Makefile|   9 +
 tools/libxc/include/xenctrl.h   |  21 +-
 tools/libxc/xc_bitops.h |  21 +-
 tools/libxc/xc_cpufeature.h | 147 
 tools/libxc/xc_cpuid_x86.c  | 550 
 tools/libxl/libxl_cpuid.c   |   2 +-
 tools/misc/Makefile |   4 +
 tools/misc/xen-cpuid.c  | 394 
 tools/ocaml/libs/xc/xenctrl.ml  |   3 +
 tools/ocaml/libs/xc/xenctrl.mli |   4 +
 tools/ocaml/libs/xc/xenctrl_stubs.c |  37 +-
 tools/python/xen/lowlevel/xc/xc.c   |   2 +-
 xen/arch/x86/Makefile   |   1 +
 xen/arch/x86/apic.c |   2 +-
 xen/arch/x86/cpu/amd.c  | 309 ++--
 xen/arch/x86/cpu/centaur.c  |   3 -
 xen/arch/x86/cpu/common.c   |  51 ++-
 xen/arch/x86/cpu/intel.c| 269 +-
 xen/arch/x86/cpuid.c| 227 
 xen/arch/x86/domain.c   |  17 +-
 xen/arch/x86/domctl.c   |  88 +
 xen/arch/x86/hvm/hvm.c  |  56 ++-
 xen/arch/x86/setup.c|   3 +
 xen/arch/x86/sysctl.c   |  66 
 xen/arch/x86/traps.c| 223 ++-
 xen/arch/x86/xstate.c   |   6 +-
 xen/include/Makefile|  10 +
 xen/include/asm-x86/cpufeature.h| 174 +
 xen/include/asm-x86/cpuid.h |  44 +++
 xen/include/asm-x86/domain.h|   2 +
 xen/include/asm-x86

[Xen-devel] [PATCH v2 02/30] xen/x86: Do not store VIA/Cyrix/Centaur CPU features

2016-02-05 Thread Andrew Cooper
Nothing uses them.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 

New in v2
---
 xen/arch/x86/cpu/centaur.c   |  3 ---
 xen/include/asm-x86/cpufeature.h | 12 +---
 2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/xen/arch/x86/cpu/centaur.c b/xen/arch/x86/cpu/centaur.c
index c0ac117..b137d55 100644
--- a/xen/arch/x86/cpu/centaur.c
+++ b/xen/arch/x86/cpu/centaur.c
@@ -38,9 +38,6 @@ static void init_c3(struct cpuinfo_x86 *c)
wrmsrl(MSR_VIA_RNG, msr_content | RNG_ENABLE);
printk(KERN_INFO "CPU: Enabled h/w RNG\n");
}
-
-   c->x86_capability[cpufeat_word(X86_FEATURE_XSTORE)]
-= cpuid_edx(0xC001);
}
 
if (c->x86 == 0x6 && c->x86_model >= 0xf) {
diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
index 6583039..e7e369b 100644
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -109,17 +109,7 @@
 #define X86_FEATURE_RDRAND (4*32+30) /* Digital Random Number Generator */
 #define X86_FEATURE_HYPERVISOR (4*32+31) /* Running under some hypervisor */
 
-/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC001, word 5 */
-#define X86_FEATURE_XSTORE (5*32+ 2) /* on-CPU RNG present (xstore insn) */
-#define X86_FEATURE_XSTORE_EN  (5*32+ 3) /* on-CPU RNG enabled */
-#define X86_FEATURE_XCRYPT (5*32+ 6) /* on-CPU crypto (xcrypt insn) */
-#define X86_FEATURE_XCRYPT_EN  (5*32+ 7) /* on-CPU crypto enabled */
-#define X86_FEATURE_ACE2   (5*32+ 8) /* Advanced Cryptography Engine v2 */
-#define X86_FEATURE_ACE2_EN(5*32+ 9) /* ACE v2 enabled */
-#define X86_FEATURE_PHE(5*32+ 10) /* PadLock Hash Engine */
-#define X86_FEATURE_PHE_EN (5*32+ 11) /* PHE enabled */
-#define X86_FEATURE_PMM(5*32+ 12) /* PadLock Montgomery 
Multiplier */
-#define X86_FEATURE_PMM_EN (5*32+ 13) /* PMM enabled */
+/* UNUSED, word 5 */
 
 /* More extended AMD flags: CPUID level 0x8001, ecx, word 6 */
 #define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */
-- 
2.1.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v2 08/30] xen/x86: Mask out unknown features from Xen's capabilities

2016-02-05 Thread Andrew Cooper
If Xen doesn't know about a feature, it is unsafe for use and should be
deliberately hidden from Xen's capabilities.

This doesn't make a practical difference yet, but will make a difference
later when the guest featuresets are seeded from the host featureset.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 

v2:
 * Reduced substantially from v1, by using the autogenerated information.
---
 xen/arch/x86/Makefile|  1 +
 xen/arch/x86/cpu/common.c|  3 +++
 xen/arch/x86/cpuid.c | 19 +++
 xen/include/asm-x86/cpufeature.h |  3 +--
 xen/include/asm-x86/cpuid.h  | 24 
 xen/tools/gen-cpuid.py   | 24 
 6 files changed, 72 insertions(+), 2 deletions(-)
 create mode 100644 xen/arch/x86/cpuid.c
 create mode 100644 xen/include/asm-x86/cpuid.h

diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
index 8e6e901..0e2b1d5 100644
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -12,6 +12,7 @@ obj-y += bitops.o
 obj-bin-y += bzimage.init.o
 obj-bin-y += clear_page.o
 obj-bin-y += copy_page.o
+obj-y += cpuid.o
 obj-y += compat.o x86_64/compat.o
 obj-$(CONFIG_KEXEC) += crash.o
 obj-y += debug.o
diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index a99cc7c..151dfe4 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -341,6 +341,9 @@ void identify_cpu(struct cpuinfo_x86 *c)
 * The vendor-specific functions might have changed features.  Now
 * we do "generic changes."
 */
+   for (i = 0; i < FSCAPINTS; ++i) {
+   c->x86_capability[i] &= known_features[i];
+   }
 
for (i = 0 ; i < NCAPINTS ; ++i)
c->x86_capability[i] &= ~cleared_caps[i];
diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
new file mode 100644
index 000..fb3a6ac
--- /dev/null
+++ b/xen/arch/x86/cpuid.c
@@ -0,0 +1,19 @@
+#include 
+#include 
+
+const uint32_t known_features[] = INIT_KNOWN_FEATURES;
+
+static void __maybe_unused build_assertions(void)
+{
+BUILD_BUG_ON(ARRAY_SIZE(known_features) != FSCAPINTS);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
index d069563..a984a81 100644
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -13,9 +13,8 @@
 
 #include 
 
-#include 
+#include 
 
-#define FSCAPINTS FEATURESET_NR_ENTRIES
 #define NCAPINTS (FSCAPINTS + 1) /* N 32-bit words worth of info */
 
 /* Other features, Linux-defined mapping, FSMAX+1 */
diff --git a/xen/include/asm-x86/cpuid.h b/xen/include/asm-x86/cpuid.h
new file mode 100644
index 000..6cca5ea
--- /dev/null
+++ b/xen/include/asm-x86/cpuid.h
@@ -0,0 +1,24 @@
+#ifndef __X86_CPUID_H__
+#define __X86_CPUID_H__
+
+#include 
+
+#define FSCAPINTS FEATURESET_NR_ENTRIES
+
+#ifndef __ASSEMBLY__
+#include 
+
+extern const uint32_t known_features[FSCAPINTS];
+
+#endif /* __ASSEMBLY__ */
+#endif /* !__X86_CPUID_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/tools/gen-cpuid.py b/xen/tools/gen-cpuid.py
index c8240c0..0843be6 100755
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -19,6 +19,8 @@ class State(object):
 
 # State calculated
 self.nr_entries = 0 # Number of words in a featureset
+self.common = 0 # Common features between 1d and e1d
+self.known = [] # All known features
 
 def parse_definitions(state):
 """
@@ -89,6 +91,22 @@ def crunch_numbers(state):
 # Size of bitmaps
 state.nr_entries = nr_entries = (max(state.names.keys()) >> 5) + 1
 
+# Features common between 1d and e1d.
+common_1d = (FPU, VME, DE, PSE, TSC, MSR, PAE, MCE, CX8, APIC,
+ MTRR, PGE, MCA, CMOV, PAT, PSE36, MMX, FXSR)
+
+# All known features.  Duplicate the common features in e1d
+e1d_base = (SYSCALL >> 5) << 5
+state.known = featureset_to_uint32s(
+state.names.keys() + [ e1d_base + (x % 32) for x in common_1d ],
+nr_entries)
+
+# Fold common back into names
+for f in common_1d:
+state.names[e1d_base + (f % 32)] = "E1D_" + state.names[f]
+
+state.common = featureset_to_uint32s(common_1d, 1)[0]
+
 
 def write_results(state):
 state.output.write(
@@ -103,7 +121,13 @@ def write_results(state):
 state.output.write(
 """
 #define FEATURESET_NR_ENTRIES %s
+
+#define INIT_COMMON_FEATURES %s
+
+#define INIT_KNOWN_FEATURES { \\\n%s\n}
 """ % (state.nr_entries,
+   state.common,
+   format_uint32s(state.known, 4),
))
 
 state.output.write(
-- 
2.1.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v2 09/30] xen/x86: Store antifeatures inverted in a featureset

2016-02-05 Thread Andrew Cooper
Awkwardly, some new feature bits mean "Feature $X no longer works".
Store these inverted in a featureset.

This permits safe zero-extending of a smaller featureset as part of a
comparison, and safe reasoning (subset?, superset?, compatible? etc.)
without specific knowledge of the meaning of each bit.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 

v2: Annotate inverted features using a magic comment and autogeneration.
---
 xen/arch/x86/cpu/common.c   |  1 +
 xen/arch/x86/cpuid.c|  2 ++
 xen/include/asm-x86/cpufeature.h|  2 +-
 xen/include/asm-x86/cpuid.h |  1 +
 xen/include/public/arch-x86/cpufeatureset.h | 18 +-
 xen/tools/gen-cpuid.py  | 15 ++-
 6 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index 151dfe4..39c340b 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -343,6 +343,7 @@ void identify_cpu(struct cpuinfo_x86 *c)
 */
for (i = 0; i < FSCAPINTS; ++i) {
c->x86_capability[i] &= known_features[i];
+   c->x86_capability[i] ^= inverted_features[i];
}
 
for (i = 0 ; i < NCAPINTS ; ++i)
diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
index fb3a6ac..30a3392 100644
--- a/xen/arch/x86/cpuid.c
+++ b/xen/arch/x86/cpuid.c
@@ -2,10 +2,12 @@
 #include 
 
 const uint32_t known_features[] = INIT_KNOWN_FEATURES;
+const uint32_t inverted_features[] = INIT_INVERTED_FEATURES;
 
 static void __maybe_unused build_assertions(void)
 {
 BUILD_BUG_ON(ARRAY_SIZE(known_features) != FSCAPINTS);
+BUILD_BUG_ON(ARRAY_SIZE(inverted_features) != FSCAPINTS);
 }
 
 /*
diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
index a984a81..f228fa2 100644
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -65,7 +65,7 @@
 
 #define cpu_has_smepboot_cpu_has(X86_FEATURE_SMEP)
 #define cpu_has_smapboot_cpu_has(X86_FEATURE_SMAP)
-#define cpu_has_fpu_sel (!boot_cpu_has(X86_FEATURE_NO_FPU_SEL))
+#define cpu_has_fpu_sel boot_cpu_has(X86_FEATURE_FPU_SEL)
 
 #define cpu_has_ffxsr   ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) \
  && boot_cpu_has(X86_FEATURE_FFXSR))
diff --git a/xen/include/asm-x86/cpuid.h b/xen/include/asm-x86/cpuid.h
index 6cca5ea..341dbc1 100644
--- a/xen/include/asm-x86/cpuid.h
+++ b/xen/include/asm-x86/cpuid.h
@@ -9,6 +9,7 @@
 #include 
 
 extern const uint32_t known_features[FSCAPINTS];
+extern const uint32_t inverted_features[FSCAPINTS];
 
 #endif /* __ASSEMBLY__ */
 #endif /* !__X86_CPUID_H__ */
diff --git a/xen/include/public/arch-x86/cpufeatureset.h 
b/xen/include/public/arch-x86/cpufeatureset.h
index 02d695d..2748cfd 100644
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -37,10 +37,26 @@
  * contain any synthesied values.  New words may be added to the end of
  * featureset.
  *
+ * "Anti" features have their representation inverted.  This permits safe
+ * zero-extending of a smaller featureset as part of a comparison, and safe
+ * reasoning (subset?, superset?, compatible? etc.) without specific knowldge
+ * of meaning of each bit.
+ *
  * All featureset words currently originate from leaves specified for the
  * CPUID instruction, but this is not preclude other sources of information.
  */
 
+/*
+ * Attribute syntax:
+ *
+ * Attributes for a particular feature are provided as characters before the
+ * first space in the comment immediately following the feature value.
+ *
+ * Inverted: '!'
+ *   This feature has its value in a featureset inverted, compared to how it
+ *   is specified by vendor architecture manuals.
+ */
+
 /* Intel-defined CPU features, CPUID level 0x0001.edx, word 0 */
 #define X86_FEATURE_FPU   ( 0*32+ 0) /*   Onboard FPU */
 #define X86_FEATURE_VME   ( 0*32+ 1) /*   Virtual Mode Extensions */
@@ -158,7 +174,7 @@
 #define X86_FEATURE_INVPCID   ( 5*32+10) /*   Invalidate Process Context 
ID */
 #define X86_FEATURE_RTM   ( 5*32+11) /*   Restricted Transactional 
Memory */
 #define X86_FEATURE_CMT   ( 5*32+12) /*   Cache Monitoring Technology 
*/
-#define X86_FEATURE_NO_FPU_SEL( 5*32+13) /*   FPU CS/DS stored as zero */
+#define X86_FEATURE_FPU_SEL   ( 5*32+13) /*!  FPU CS/DS stored as zero */
 #define X86_FEATURE_MPX   ( 5*32+14) /*   Memory Protection Extensions 
*/
 #define X86_FEATURE_CAT   ( 5*32+15) /*   Cache Allocation Technology 
*/
 #define X86_FEATURE_RDSEED( 5*32+18) /*   RDSEED instruction */
diff --git a/xen/tools/gen-cpuid.py b/xen/tools/gen-cpuid.py
index 0843be6..9e0cc34 100755
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -16,11 +16,13 @@ class State(object):
 
 # State parsed from input
 self.names = {} # Name =>

[Xen-devel] [PATCH v2 04/30] xen/x86: Improvements to pv_cpuid()

2016-02-05 Thread Andrew Cooper
pv_cpuid() has two completely separate paths inside it depending on whether
current is dom0 or a domU.  This causes unnecessary divergence, and
complicates future improvements.  Take steps to undo it.

Changes:
 * Create leaf and subleaf variables and use them consistently, instead of a
   mix of {a,c} and regs->e{a,c}x as the input parameters.
 * Combine the dom0 and domU hypervisor leaf handling, with an early exit.
 * Apply sanity checks to domU as well.  This brings PV domU cpuid handling in
   line with HVM domains and PV dom0.
 * Perform a real cpuid instruction for calculating CPUID.0xD[ECX=0].EBX.  The
   correct xcr0 is in context, and this avoids the O(M*N) loop over the domain
   cpuid policy list which exists currently.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 

New in v2
---
 xen/arch/x86/traps.c | 74 
 1 file changed, 29 insertions(+), 45 deletions(-)

diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index e105b95..6a181bb 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -824,51 +824,24 @@ int cpuid_hypervisor_leaves( uint32_t idx, uint32_t 
sub_idx,
 
 void pv_cpuid(struct cpu_user_regs *regs)
 {
-uint32_t a, b, c, d;
+uint32_t leaf, subleaf, a, b, c, d;
 struct vcpu *curr = current;
 struct domain *currd = curr->domain;
 
-a = regs->eax;
+leaf = a = regs->eax;
 b = regs->ebx;
-c = regs->ecx;
+subleaf = c = regs->ecx;
 d = regs->edx;
 
-if ( !is_control_domain(currd) && !is_hardware_domain(currd) )
-{
-unsigned int cpuid_leaf = a, sub_leaf = c;
-
-if ( !cpuid_hypervisor_leaves(a, c, &a, &b, &c, &d) )
-domain_cpuid(currd, a, c, &a, &b, &c, &d);
-
-switch ( cpuid_leaf )
-{
-case XSTATE_CPUID:
-{
-unsigned int _eax, _ebx, _ecx, _edx;
-/* EBX value of main leaf 0 depends on enabled xsave features */
-if ( sub_leaf == 0 && curr->arch.xcr0 )
-{
-/* reset EBX to default value first */
-b = XSTATE_AREA_MIN_SIZE;
-for ( sub_leaf = 2; sub_leaf < 63; sub_leaf++ )
-{
-if ( !(curr->arch.xcr0 & (1ULL << sub_leaf)) )
-continue;
-domain_cpuid(currd, cpuid_leaf, sub_leaf,
- &_eax, &_ebx, &_ecx, &_edx);
-if ( (_eax + _ebx) > b )
-b = _eax + _ebx;
-}
-}
-goto xstate;
-}
-}
+if ( cpuid_hypervisor_leaves(leaf, subleaf, &a, &b, &c, &d) )
 goto out;
-}
 
-cpuid_count(a, c, &a, &b, &c, &d);
+if ( !is_control_domain(currd) && !is_hardware_domain(currd) )
+domain_cpuid(currd, leaf, subleaf, &a, &b, &c, &d);
+else
+cpuid_count(leaf, subleaf, &a, &b, &c, &d);
 
-if ( (regs->eax & 0x7fff) == 0x0001 )
+if ( (leaf & 0x7fff) == 0x0001 )
 {
 /* Modify Feature Information. */
 if ( !cpu_has_apic )
@@ -883,7 +856,7 @@ void pv_cpuid(struct cpu_user_regs *regs)
 }
 }
 
-switch ( regs->_eax )
+switch ( leaf )
 {
 case 0x0001:
 /* Modify Feature Information. */
@@ -918,7 +891,7 @@ void pv_cpuid(struct cpu_user_regs *regs)
 break;
 
 case 0x0007:
-if ( regs->_ecx == 0 )
+if ( subleaf == 0 )
 b &= (cpufeat_mask(X86_FEATURE_BMI1) |
   cpufeat_mask(X86_FEATURE_HLE)  |
   cpufeat_mask(X86_FEATURE_AVX2) |
@@ -934,14 +907,29 @@ void pv_cpuid(struct cpu_user_regs *regs)
 break;
 
 case XSTATE_CPUID:
-xstate:
 if ( !cpu_has_xsave )
 goto unsupported;
-if ( regs->_ecx == 1 )
+switch ( subleaf )
+{
+case 0:
 {
+uint32_t tmp;
+
+/*
+ * Always read CPUID.0xD[ECX=0].EBX from hardware, rather than
+ * domain policy.  It varies with enabled xstate, and the correct
+ * xcr0 is in context.
+ */
+if ( !is_control_domain(currd) && !is_hardware_domain(currd) )
+cpuid_count(leaf, subleaf, &tmp, &b, &tmp, &tmp);
+break;
+}
+
+case 1:
 a &= 
(boot_cpu_data.x86_capability[cpufeat_word(X86_FEATURE_XSAVEOPT)] &
   ~cpufeat_mask(X86_FEATURE_XSAVES));
 b = c = d = 0;
+break;
 }
 break;
 
@@ -983,15 +971,11 @@ void pv_cpuid(struct cpu_user_regs *regs)
 unsupported:
 a = b = c = d = 0;
 break;
-
-default:
-(void)cpuid_hypervisor_leaves(regs->eax, 0, &a, &b, &c, &d);
-break;
 }
 
  out:
 /* VPMU may decide to modify some of the leaves */
-vpmu_do_cpuid(regs->eax, &a, &b, &c, &d);
+vpmu_do_cpuid(leaf, &a, &b, &c, &d);
 
 regs->eax = a;
 regs

[Xen-devel] [PATCH v2 06/30] xen/x86: Script to automatically process featureset information

2016-02-05 Thread Andrew Cooper
This script consumes include/public/arch-x86/cpufeatureset.h and generates a
single include/asm-x86/cpuid-autogen.h containing all the processed
information.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Tim Deegan 
CC: Ian Campbell 

For all intents and purposes, new in v2.  All generated information is now
expressed by #defines (using C structure initialisers for most) and contained
in a single header file.
---
 .gitignore   |   1 +
 xen/include/Makefile |  10 ++
 xen/include/asm-x86/cpufeature.h |   4 +-
 xen/tools/gen-cpuid.py   | 191 +++
 4 files changed, 205 insertions(+), 1 deletion(-)
 create mode 100755 xen/tools/gen-cpuid.py

diff --git a/.gitignore b/.gitignore
index 91f690c..b40453e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -252,6 +252,7 @@ xen/include/headers.chk
 xen/include/headers++.chk
 xen/include/asm
 xen/include/asm-*/asm-offsets.h
+xen/include/asm-x86/cpuid-autogen.h
 xen/include/compat/*
 xen/include/config/
 xen/include/generated/
diff --git a/xen/include/Makefile b/xen/include/Makefile
index 9c8188b..268bc9d 100644
--- a/xen/include/Makefile
+++ b/xen/include/Makefile
@@ -117,5 +117,15 @@ headers++.chk: $(PUBLIC_HEADERS) Makefile
 
 endif
 
+ifeq ($(XEN_TARGET_ARCH),x86_64)
+
+$(BASEDIR)/include/asm-x86/cpuid-autogen.h: 
$(BASEDIR)/include/public/arch-x86/cpufeatureset.h 
$(BASEDIR)/tools/gen-cpuid.py FORCE
+   $(PYTHON) $(BASEDIR)/tools/gen-cpuid.py -i $^ -o $@.new
+   $(call move-if-changed,$@.new,$@)
+
+all: $(BASEDIR)/include/asm-x86/cpuid-autogen.h
+endif
+
 clean::
rm -rf compat headers.chk headers++.chk
+   rm -f $(BASEDIR)/include/asm-x86/cpuid-autogen.h
diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
index eb6eb63..d069563 100644
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -13,7 +13,9 @@
 
 #include 
 
-#define FSCAPINTS 9
+#include 
+
+#define FSCAPINTS FEATURESET_NR_ENTRIES
 #define NCAPINTS (FSCAPINTS + 1) /* N 32-bit words worth of info */
 
 /* Other features, Linux-defined mapping, FSMAX+1 */
diff --git a/xen/tools/gen-cpuid.py b/xen/tools/gen-cpuid.py
new file mode 100755
index 000..c8240c0
--- /dev/null
+++ b/xen/tools/gen-cpuid.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys, os, re
+
+class Fail(Exception):
+pass
+
+class State(object):
+
+def __init__(self, input, output):
+
+self.source = input
+self.input  = open_file_or_fd(input, "r", 2)
+self.output = open_file_or_fd(output, "w", 2)
+
+# State parsed from input
+self.names = {} # Name => value mapping
+
+# State calculated
+self.nr_entries = 0 # Number of words in a featureset
+
+def parse_definitions(state):
+"""
+Parse featureset information from @param f and mutate the global
+namespace with symbols
+"""
+feat_regex = re.compile(
+r"^#define X86_FEATURE_([A-Z0-9_]+)"
+"\s+\(([\s\d]+\*[\s\d]+\+[\s\d]+)\).*$")
+
+this = sys.modules[__name__]
+
+for l in state.input.readlines():
+# Short circuit the regex...
+if not l.startswith("#define X86_FEATURE_"):
+continue
+
+res = feat_regex.match(l)
+
+if res is None:
+raise Fail("Failed to interpret '%s'" % (l.strip(), ))
+
+name = res.groups()[0]
+val = eval(res.groups()[1]) # Regex confines this to a very simple 
expression
+
+if hasattr(this, name):
+raise Fail("Duplicate symbol %s" % (name,))
+
+if val in state.names:
+raise Fail("Aliased value between %s and %s" %
+(name, state.names[val]))
+
+# Mutate the current namespace to insert a feature literal with its
+# bit index
+setattr(this, name, val)
+
+# Construct a reverse mapping of value to name
+state.names[val] = name
+
+
+def featureset_to_uint32s(fs, nr):
+""" Represent a featureset as a list of C-compatible uint32_t's """
+
+bitmap = 0L
+for f in fs:
+bitmap |= 1L << f
+
+words = []
+while bitmap:
+words.append(bitmap & ((1L << 32) - 1))
+bitmap >>= 32
+
+assert len(words) <= nr
+
+if len(words) < nr:
+words.extend([0] * (nr - len(words)))
+
+return [ "0x%08xU" % x for x in words ]
+
+def format_uint32s(words, indent):
+""" Format a list of uint32_t's sutable for a macro definition """
+spaces = " " * indent
+return spaces + (", \\\n" + spaces).join(words) + ", \\"
+
+
+def crunch_numbers(state):
+
+# Size of bitmaps
+state.nr_entries = nr_entries = (max(state.names.keys()) >> 5) + 1
+
+
+def write_results(state):
+state.output.write(
+"""/*
+ * Automatically generated by %s - Do not edit!
+ * Source data: %s
+ */
+#ifndef __XEN_X86__FEATURESET_DATA__
+#define __XEN_X86__FEATURESET_DATA__
+""" % (sys.argv[0], state.source)

[Xen-devel] [PATCH v2 05/30] xen/public: Export cpu featureset information in the public API

2016-02-05 Thread Andrew Cooper
For the featureset to be a useful object, it needs a stable interpretation, a
property which is missing from the current hw_caps interface.

Additionally, introduce TSC_ADJUST, SHA, PREFETCHWT1, ITSC, EFRO and CLZERO
which will be used by later changes.

To maintain compilation, FSCAPINTS is currently hardcoded at 9.  Future
changes will change this to being dynamically generated.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Tim Deegan 
CC: Ian Campbell 

v2:
 * Rebase over upstream changes
 * Collect all feature introductions from later in the series
 * Restrict API to Xen and toolstack
---
 xen/include/asm-x86/cpufeature.h| 159 +++
 xen/include/public/arch-x86/cpufeatureset.h | 195 
 2 files changed, 210 insertions(+), 144 deletions(-)
 create mode 100644 xen/include/public/arch-x86/cpufeatureset.h

diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
index e7e369b..eb6eb63 100644
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -11,151 +11,22 @@
 
 #include 
 
-#define NCAPINTS   9   /* N 32-bit words worth of info */
-
-/* Intel-defined CPU features, CPUID level 0x0001 (edx), word 0 */
-#define X86_FEATURE_FPU(0*32+ 0) /* Onboard FPU */
-#define X86_FEATURE_VME(0*32+ 1) /* Virtual Mode Extensions */
-#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */
-#define X86_FEATURE_PSE(0*32+ 3) /* Page Size Extensions */
-#define X86_FEATURE_TSC(0*32+ 4) /* Time Stamp Counter */
-#define X86_FEATURE_MSR(0*32+ 5) /* Model-Specific Registers, 
RDMSR, WRMSR */
-#define X86_FEATURE_PAE(0*32+ 6) /* Physical Address 
Extensions */
-#define X86_FEATURE_MCE(0*32+ 7) /* Machine Check Architecture 
*/
-#define X86_FEATURE_CX8(0*32+ 8) /* CMPXCHG8 instruction */
-#define X86_FEATURE_APIC   (0*32+ 9) /* Onboard APIC */
-#define X86_FEATURE_SEP(0*32+11) /* SYSENTER/SYSEXIT */
-#define X86_FEATURE_MTRR   (0*32+12) /* Memory Type Range Registers */
-#define X86_FEATURE_PGE(0*32+13) /* Page Global Enable */
-#define X86_FEATURE_MCA(0*32+14) /* Machine Check Architecture 
*/
-#define X86_FEATURE_CMOV   (0*32+15) /* CMOV instruction (FCMOVCC and 
FCOMI too if FPU present) */
-#define X86_FEATURE_PAT(0*32+16) /* Page Attribute Table */
-#define X86_FEATURE_PSE36  (0*32+17) /* 36-bit PSEs */
-#define X86_FEATURE_PN (0*32+18) /* Processor serial number */
-#define X86_FEATURE_CLFLSH (0*32+19) /* Supports the CLFLUSH instruction */
-#define X86_FEATURE_DS (0*32+21) /* Debug Store */
-#define X86_FEATURE_ACPI   (0*32+22) /* ACPI via MSR */
-#define X86_FEATURE_MMX(0*32+23) /* Multimedia Extensions */
-#define X86_FEATURE_FXSR   (0*32+24) /* FXSAVE and FXRSTOR instructions 
(fast save and restore */
- /* of FPU context), and CR4.OSFXSR 
available */
-#define X86_FEATURE_XMM(0*32+25) /* Streaming SIMD Extensions 
*/
-#define X86_FEATURE_XMM2   (0*32+26) /* Streaming SIMD Extensions-2 */
-#define X86_FEATURE_SELFSNOOP  (0*32+27) /* CPU self snoop */
-#define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */
-#define X86_FEATURE_ACC(0*32+29) /* Automatic clock control */
-#define X86_FEATURE_IA64   (0*32+30) /* IA-64 processor */
-#define X86_FEATURE_PBE(0*32+31) /* Pending Break Enable */
-
-/* AMD-defined CPU features, CPUID level 0x8001, word 1 */
-/* Don't duplicate feature flags which are redundant with Intel! */
-#define X86_FEATURE_SYSCALL(1*32+11) /* SYSCALL/SYSRET */
-#define X86_FEATURE_MP (1*32+19) /* MP Capable. */
-#define X86_FEATURE_NX (1*32+20) /* Execute Disable */
-#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */
-#define X86_FEATURE_FFXSR   (1*32+25) /* FFXSR instruction optimizations */
-#define X86_FEATURE_PAGE1GB(1*32+26) /* 1Gb large page support */
-#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */
-#define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */
-#define X86_FEATURE_3DNOWEXT   (1*32+30) /* AMD 3DNow! extensions */
-#define X86_FEATURE_3DNOW  (1*32+31) /* 3DNow! */
-
-/* Intel-defined CPU features, CPUID level 0x000D:1 (eax), word 2 */
-#define X86_FEATURE_XSAVEOPT   (2*32+ 0) /* XSAVEOPT instruction. */
-#define X86_FEATURE_XSAVEC (2*32+ 1) /* XSAVEC/XRSTORC instructions. */
-#define X86_FEATURE_XGETBV1(2*32+ 2) /* XGETBV with %ecx=1. */
-#define X86_FEATURE_XSAVES (2*32+ 3) /* XSAVES/XRSTORS instructions. */
-
-/* Other features, Linux-defined mapping, word 3 */
+#include 
+
+#define FSCAPINTS 9
+#define NCAPINTS (FSCAPINTS + 1) /* N 32-bit words worth of info */
+
+/* Other features, Linux-defined mapping, FSMAX+1 */
 /* This rang

[Xen-devel] [PATCH v2 03/30] xen/x86: Drop cpuinfo_x86.x86_power

2016-02-05 Thread Andrew Cooper
Nothing uses it.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 

New in v2
---
 xen/arch/x86/cpu/amd.c  | 3 +--
 xen/include/asm-x86/processor.h | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index 1ac44e0..c184f57 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -475,8 +475,7 @@ static void init_amd(struct cpuinfo_x86 *c)
}
 
if (c->extended_cpuid_level >= 0x8007) {
-   c->x86_power = cpuid_edx(0x8007);
-   if (c->x86_power & (1<<8)) {
+   if (cpuid_edx(0x8007) & (1<<8)) {
__set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
__set_bit(X86_FEATURE_NONSTOP_TSC, c->x86_capability);
if (c->x86 != 0x11)
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 26ba141..271340e 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -191,7 +191,6 @@ struct cpuinfo_x86 {
 char x86_model_id[64];
 int  x86_cache_size; /* in KB - valid for CPUS which support this call  */
 int  x86_cache_alignment;/* In bytes */
-int  x86_power;
 __u32 x86_max_cores; /* cpuid returned max cores value */
 __u32 booted_cores;  /* number of cores as seen by OS */
 __u32 x86_num_siblings; /* cpuid logical cpus per chip value */
-- 
2.1.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v2 01/30] xen/x86: Drop X86_FEATURE_3DNOW_ALT

2016-02-05 Thread Andrew Cooper
Introducing an X86_FEATURE aliased value turns out to complicate automatic
processing of the feature list.  Drop X86_FEATURE_3DNOW_ALT and use
X86_FEATURE_PBE, extending the comment accordingly.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 

New in v2
---
 xen/arch/x86/cpu/amd.c   | 9 ++---
 xen/include/asm-x86/cpufeature.h | 1 -
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index 8ec841b..1ac44e0 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -440,9 +440,12 @@ static void init_amd(struct cpuinfo_x86 *c)
wrmsrl(MSR_K7_HWCR, value);
}
 
-   /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
-  3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
-   __clear_bit(X86_FEATURE_3DNOW_ALT, c->x86_capability);
+   /*
+* Some AMD CPUs duplicate the 3DNow bit in base and extended CPUID
+* leaves.  Unfortunately, this aliases PBE on Intel CPUs. Clobber the
+* alias, leaving 3DNow in the extended leaf.
+*/
+   __clear_bit(X86_FEATURE_PBE, c->x86_capability);

if (c->x86 == 0xf && c->x86_model < 0x14
&& cpu_has(c, X86_FEATURE_LAHF_LM)) {
diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
index 23f9fb2..6583039 100644
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -45,7 +45,6 @@
 #define X86_FEATURE_ACC(0*32+29) /* Automatic clock control */
 #define X86_FEATURE_IA64   (0*32+30) /* IA-64 processor */
 #define X86_FEATURE_PBE(0*32+31) /* Pending Break Enable */
-#define X86_FEATURE_3DNOW_ALT  (0*32+31) /* AMD nonstandard 3DNow (Aliases 
PBE) */
 
 /* AMD-defined CPU features, CPUID level 0x8001, word 1 */
 /* Don't duplicate feature flags which are redundant with Intel! */
-- 
2.1.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v2 07/30] xen/x86: Collect more cpuid feature leaves

2016-02-05 Thread Andrew Cooper
New words are:
 * 0x8007.edx - Contains Invariant TSC
 * 0x8008.ebx - Newly used for AMD Zen processors

In addition, replace some open-coded ITSC and EFRO manipulation.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 

v2:
 * Rely on ordering of generic_identify() to simplify init_amd()
 * Remove opencoded EFRO manipulation as well
---
 xen/arch/x86/cpu/amd.c| 21 +++--
 xen/arch/x86/cpu/common.c |  6 ++
 xen/arch/x86/cpu/intel.c  |  2 +-
 xen/arch/x86/domain.c |  2 +-
 4 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index c184f57..f9dc532 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -294,21 +294,6 @@ int cpu_has_amd_erratum(const struct cpuinfo_x86 *cpu, int 
osvw_id, ...)
return 0;
 }
 
-/* Can this system suffer from TSC drift due to C1 clock ramping? */
-static int c1_ramping_may_cause_clock_drift(struct cpuinfo_x86 *c) 
-{ 
-   if (cpuid_edx(0x8007) & (1<<8)) {
-   /*
-* CPUID.AdvPowerMgmtInfo.TscInvariant
-* EDX bit 8, 8000_0007
-* Invariant TSC on 8th Gen or newer, use it
-* (assume all cores have invariant TSC)
-*/
-   return 0;
-   }
-   return 1;
-}
-
 /*
  * Disable C1-Clock ramping if enabled in PMM7.CpuLowPwrEnh on 8th-generation
  * cores only. Assume BIOS has setup all Northbridges equivalently.
@@ -475,7 +460,7 @@ static void init_amd(struct cpuinfo_x86 *c)
}
 
if (c->extended_cpuid_level >= 0x8007) {
-   if (cpuid_edx(0x8007) & (1<<8)) {
+   if (cpu_has(c, X86_FEATURE_ITSC)) {
__set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
__set_bit(X86_FEATURE_NONSTOP_TSC, c->x86_capability);
if (c->x86 != 0x11)
@@ -600,14 +585,14 @@ static void init_amd(struct cpuinfo_x86 *c)
wrmsrl(MSR_K7_PERFCTR3, 0);
}
 
-   if (cpuid_edx(0x8007) & (1 << 10)) {
+   if (cpu_has(c, X86_FEATURE_EFRO)) {
rdmsr(MSR_K7_HWCR, l, h);
l |= (1 << 27); /* Enable read-only APERF/MPERF bit */
wrmsr(MSR_K7_HWCR, l, h);
}
 
/* Prevent TSC drift in non single-processor, single-core platforms. */
-   if ((smp_processor_id() == 1) && c1_ramping_may_cause_clock_drift(c))
+   if ((smp_processor_id() == 1) && !cpu_has(c, X86_FEATURE_ITSC))
disable_c1_ramping();
 
set_cpuidmask(c);
diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index 335f044..a99cc7c 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -269,6 +269,12 @@ static void generic_identify(struct cpuinfo_x86 *c)
 
if (c->extended_cpuid_level >= 0x8004)
get_model_name(c); /* Default name */
+   if (c->extended_cpuid_level >= 0x8007)
+   c->x86_capability[cpufeat_word(X86_FEATURE_ITSC)]
+   = cpuid_edx(0x8007);
+   if (c->extended_cpuid_level >= 0x8008)
+   c->x86_capability[cpufeat_word(X86_FEATURE_CLZERO)]
+   = cpuid_ebx(0x8008);
 
/* Intel-defined flags: level 0x0007 */
if ( c->cpuid_level >= 0x0007 )
diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c
index d4f574b..bdf89f6 100644
--- a/xen/arch/x86/cpu/intel.c
+++ b/xen/arch/x86/cpu/intel.c
@@ -281,7 +281,7 @@ static void init_intel(struct cpuinfo_x86 *c)
if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
(c->x86 == 0x6 && c->x86_model >= 0x0e))
__set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
-   if (cpuid_edx(0x8007) & (1u<<8)) {
+   if (cpu_has(c, X86_FEATURE_ITSC)) {
__set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
__set_bit(X86_FEATURE_NONSTOP_TSC, c->x86_capability);
__set_bit(X86_FEATURE_TSC_RELIABLE, c->x86_capability);
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 9d43f7b..8f2c0b6 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2605,7 +2605,7 @@ void domain_cpuid(
  */
 if ( (input == 0x8007) && /* Advanced Power Management */
  !d->disable_migrate && !d->arch.vtsc )
-*edx &= ~(1u<<8); /* TSC Invariant */
+*edx &= ~cpufeat_mask(X86_FEATURE_ITSC);
 
 return;
 }
-- 
2.1.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v4 06/10] x86/hvm: Setup TSC scaling ratio

2016-02-05 Thread Jan Beulich
>>> On 17.01.16 at 22:58,  wrote:
> +u64 hvm_get_tsc_scaling_ratio(u32 gtsc_khz)
> +{
> +u64 ratio;
> +
> +if ( !hvm_tsc_scaling_supported )
> +return 0;
> +
> +/*
> + * The multiplication of the first two terms may overflow a 64-bit
> + * integer, so use mul_u64_u32_div() instead to keep precision.
> + */
> +ratio = mul_u64_u32_div(1ULL << hvm_funcs.tsc_scaling_ratio_frac_bits,
> +gtsc_khz, cpu_khz);

Is this the only use for this new math64 function? If so, I don't
see the point of adding that function, because (leaving limited
significant bits aside) the above simply is

(gtsc_khz << hvm_funcs.tsc_scaling_ratio_frac_bits) / cpu_khz

which can be had without any multiplication. Personally, if indeed
the only use I'd favor converting the above to inline assembly
here instead of adding that helper function (just like we have a
number of asm()-s in x86/time.c for similar reasons).

> +void hvm_setup_tsc_scaling(struct vcpu *v)
> +{
> +v->arch.hvm_vcpu.tsc_scaling_ratio =
> +hvm_get_tsc_scaling_ratio(v->domain->arch.tsc_khz);
> +}

So why again is this per-vCPU setup of per-vCPU state when it
only depends on a per-domain input? If this was per-domain, its
setup could be where it belongs - in arch_hvm_load().

> @@ -5504,6 +5536,9 @@ void hvm_vcpu_reset_state(struct vcpu *v, uint16_t cs, 
> uint16_t ip)
>  hvm_set_segment_register(v, x86_seg_gdtr, ®);
>  hvm_set_segment_register(v, x86_seg_idtr, ®);
>  
> +if ( hvm_tsc_scaling_supported && !d->arch.vtsc )
> +hvm_setup_tsc_scaling(v);

Could you remind me why this is needed? What state of the guest
would have changed making this necessary? Is this perhaps just
because it's per-vCPU instead of per-domain?

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v2 29/30] tools/libxc: Use featuresets rather than guesswork

2016-02-05 Thread Andrew Cooper
It is conceptually wrong to base a VM's featureset on the features visible to
the toolstack which happens to construct it.

Instead, the featureset used is either an explicit one passed by the
toolstack, or the default which Xen believes it can give to the guest.

Collect all the feature manipulation into a single function which adjusts the
featureset, and perform deep dependency removal.

Signed-off-by: Andrew Cooper 
---
CC: Ian Campbell 
CC: Ian Jackson 
CC: Wei Liu 

v2: Join several related patches together
---
 tools/libxc/xc_cpuid_x86.c | 331 -
 1 file changed, 119 insertions(+), 212 deletions(-)

diff --git a/tools/libxc/xc_cpuid_x86.c b/tools/libxc/xc_cpuid_x86.c
index e762d73..0e79812 100644
--- a/tools/libxc/xc_cpuid_x86.c
+++ b/tools/libxc/xc_cpuid_x86.c
@@ -21,18 +21,22 @@
 
 #include 
 #include 
+#include 
 #include "xc_private.h"
+#include "xc_bitops.h"
 #include "_xc_cpuid_autogen.h"
 #include 
 #include 
 
+#define featureword_of(idx) ((idx) >> 5)
 #define bitmaskof(idx)  (1u << ((idx) & 31))
-#define clear_bit(idx, dst) ((dst) &= ~bitmaskof(idx))
-#define set_bit(idx, dst)   ((dst) |=  bitmaskof(idx))
+#define clear_feature(idx, dst) ((dst) &= ~bitmaskof(idx))
+#define set_feature(idx, dst)   ((dst) |=  bitmaskof(idx))
 
 #define DEF_MAX_BASE 0x000du
 #define DEF_MAX_INTELEXT  0x8008u
 #define DEF_MAX_AMDEXT0x801cu
+#define COMMON_1D INIT_COMMON_FEATURES
 
 int xc_get_cpu_featureset(xc_interface *xch, uint32_t index,
   uint32_t *nr_features, uint32_t *featureset)
@@ -304,38 +308,6 @@ static void amd_xc_cpuid_policy(xc_interface *xch,
 regs[0] = DEF_MAX_AMDEXT;
 break;
 
-case 0x8001: {
-if ( !info->pae )
-clear_bit(X86_FEATURE_PAE, regs[3]);
-
-/* Filter all other features according to a whitelist. */
-regs[2] &= (bitmaskof(X86_FEATURE_LAHF_LM) |
-bitmaskof(X86_FEATURE_CMP_LEGACY) |
-(info->nestedhvm ? bitmaskof(X86_FEATURE_SVM) : 0) |
-bitmaskof(X86_FEATURE_CR8_LEGACY) |
-bitmaskof(X86_FEATURE_ABM) |
-bitmaskof(X86_FEATURE_SSE4A) |
-bitmaskof(X86_FEATURE_MISALIGNSSE) |
-bitmaskof(X86_FEATURE_3DNOWPREFETCH) |
-bitmaskof(X86_FEATURE_OSVW) |
-bitmaskof(X86_FEATURE_XOP) |
-bitmaskof(X86_FEATURE_LWP) |
-bitmaskof(X86_FEATURE_FMA4) |
-bitmaskof(X86_FEATURE_TBM) |
-bitmaskof(X86_FEATURE_DBEXT));
-regs[3] &= (0x0183f3ff | /* features shared with 0x0001:EDX */
-bitmaskof(X86_FEATURE_NX) |
-bitmaskof(X86_FEATURE_LM) |
-bitmaskof(X86_FEATURE_PAGE1GB) |
-bitmaskof(X86_FEATURE_SYSCALL) |
-bitmaskof(X86_FEATURE_MP) |
-bitmaskof(X86_FEATURE_MMXEXT) |
-bitmaskof(X86_FEATURE_FFXSR) |
-bitmaskof(X86_FEATURE_3DNOW) |
-bitmaskof(X86_FEATURE_3DNOWEXT));
-break;
-}
-
 case 0x8008:
 /*
  * ECX[15:12] is ApicIdCoreSize: ECX[7:0] is NumberOfCores (minus one).
@@ -382,12 +354,6 @@ static void intel_xc_cpuid_policy(xc_interface *xch,
 {
 switch ( input[0] )
 {
-case 0x0001:
-/* ECX[5] is availability of VMX */
-if ( info->nestedhvm )
-set_bit(X86_FEATURE_VMXE, regs[2]);
-break;
-
 case 0x0004:
 /*
  * EAX[31:26] is Maximum Cores Per Package (minus one).
@@ -403,19 +369,6 @@ static void intel_xc_cpuid_policy(xc_interface *xch,
 regs[0] = DEF_MAX_INTELEXT;
 break;
 
-case 0x8001: {
-/* Only a few features are advertised in Intel's 0x8001. */
-regs[2] &= (bitmaskof(X86_FEATURE_LAHF_LM) |
-bitmaskof(X86_FEATURE_3DNOWPREFETCH) |
-bitmaskof(X86_FEATURE_ABM));
-regs[3] &= (bitmaskof(X86_FEATURE_NX) |
-bitmaskof(X86_FEATURE_LM) |
-bitmaskof(X86_FEATURE_PAGE1GB) |
-bitmaskof(X86_FEATURE_SYSCALL) |
-bitmaskof(X86_FEATURE_RDTSCP));
-break;
-}
-
 case 0x8005:
 regs[0] = regs[1] = regs[2] = 0;
 break;
@@ -467,11 +420,8 @@ static void xc_cpuid_config_xsave(xc_interface *xch,
 regs[1] = 512 + 64; /* FP/SSE + XSAVE.HEADER */
 break;
 case 1: /* leaf 1 */
-regs[0] &= (XSAVEOPT | XSAVEC | XGETBV1 | XSAVES);
-if ( !info->hvm )
-regs[0] &= ~XSAVES;
-regs[2] &= info->xfeature_mask;
-regs[3] = 0;
+regs[0] = info->featureset[featureword_of(X86_FEATURE_XSAVEOPT)];
+regs[1] = regs[2] = regs[3] = 0;
 break;
 case 2 ... 63: /* sub-leaves *

[Xen-devel] [PATCH v2 23/30] xen+tools: Export maximum host and guest cpu featuresets via SYSCTL

2016-02-05 Thread Andrew Cooper
And provide stubs for toolstack use.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Tim Deegan 
CC: Ian Campbell 
CC: Wei Liu 
CC: David Scott 
CC: Rob Hoes 

v2:
 * Rebased to use libxencall
 * Improve hypercall documentation
---
 tools/libxc/include/xenctrl.h   |  3 ++
 tools/libxc/xc_cpuid_x86.c  | 27 +++
 tools/ocaml/libs/xc/xenctrl.ml  |  3 ++
 tools/ocaml/libs/xc/xenctrl.mli |  4 +++
 tools/ocaml/libs/xc/xenctrl_stubs.c | 35 
 xen/arch/x86/sysctl.c   | 66 +
 xen/include/public/sysctl.h | 25 ++
 7 files changed, 163 insertions(+)

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 1a5f4ec..5a7500a 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2571,6 +2571,9 @@ int xc_psr_cat_get_domain_data(xc_interface *xch, 
uint32_t domid,
 int xc_psr_cat_get_l3_info(xc_interface *xch, uint32_t socket,
uint32_t *cos_max, uint32_t *cbm_len,
bool *cdp_enabled);
+
+int xc_get_cpu_featureset(xc_interface *xch, uint32_t index,
+  uint32_t *nr_features, uint32_t *featureset);
 #endif
 
 /* Compat shims */
diff --git a/tools/libxc/xc_cpuid_x86.c b/tools/libxc/xc_cpuid_x86.c
index c142595..7b802da 100644
--- a/tools/libxc/xc_cpuid_x86.c
+++ b/tools/libxc/xc_cpuid_x86.c
@@ -33,6 +33,33 @@
 #define DEF_MAX_INTELEXT  0x8008u
 #define DEF_MAX_AMDEXT0x801cu
 
+int xc_get_cpu_featureset(xc_interface *xch, uint32_t index,
+  uint32_t *nr_features, uint32_t *featureset)
+{
+DECLARE_SYSCTL;
+DECLARE_HYPERCALL_BOUNCE(featureset,
+ *nr_features * sizeof(*featureset),
+ XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+int ret;
+
+if ( xc_hypercall_bounce_pre(xch, featureset) )
+return -1;
+
+sysctl.cmd = XEN_SYSCTL_get_cpu_featureset;
+sysctl.u.cpu_featureset.index = index;
+sysctl.u.cpu_featureset.nr_features = *nr_features;
+set_xen_guest_handle(sysctl.u.cpu_featureset.features, featureset);
+
+ret = do_sysctl(xch, &sysctl);
+
+xc_hypercall_bounce_post(xch, featureset);
+
+if ( !ret )
+*nr_features = sysctl.u.cpu_featureset.nr_features;
+
+return ret;
+}
+
 struct cpuid_domain_info
 {
 enum
diff --git a/tools/ocaml/libs/xc/xenctrl.ml b/tools/ocaml/libs/xc/xenctrl.ml
index 58a53a1..75006e7 100644
--- a/tools/ocaml/libs/xc/xenctrl.ml
+++ b/tools/ocaml/libs/xc/xenctrl.ml
@@ -242,6 +242,9 @@ external version_changeset: handle -> string = 
"stub_xc_version_changeset"
 external version_capabilities: handle -> string =
   "stub_xc_version_capabilities"
 
+type featureset_index = Featureset_raw | Featureset_host | Featureset_pv | 
Featureset_hvm
+external get_cpu_featureset : handle -> featureset_index -> int64 array = 
"stub_xc_get_cpu_featureset"
+
 external watchdog : handle -> int -> int32 -> int
   = "stub_xc_watchdog"
 
diff --git a/tools/ocaml/libs/xc/xenctrl.mli b/tools/ocaml/libs/xc/xenctrl.mli
index 16443df..720e4b2 100644
--- a/tools/ocaml/libs/xc/xenctrl.mli
+++ b/tools/ocaml/libs/xc/xenctrl.mli
@@ -147,6 +147,10 @@ external version_compile_info : handle -> compile_info
 external version_changeset : handle -> string = "stub_xc_version_changeset"
 external version_capabilities : handle -> string
   = "stub_xc_version_capabilities"
+
+type featureset_index = Featureset_raw | Featureset_host | Featureset_pv | 
Featureset_hvm
+external get_cpu_featureset : handle -> featureset_index -> int64 array = 
"stub_xc_get_cpu_featureset"
+
 type core_magic = Magic_hvm | Magic_pv
 type core_header = {
   xch_magic : core_magic;
diff --git a/tools/ocaml/libs/xc/xenctrl_stubs.c 
b/tools/ocaml/libs/xc/xenctrl_stubs.c
index 74928e9..e7adf37 100644
--- a/tools/ocaml/libs/xc/xenctrl_stubs.c
+++ b/tools/ocaml/libs/xc/xenctrl_stubs.c
@@ -1214,6 +1214,41 @@ CAMLprim value stub_xc_domain_deassign_device(value xch, 
value domid, value desc
CAMLreturn(Val_unit);
 }
 
+CAMLprim value stub_xc_get_cpu_featureset(value xch, value idx)
+{
+   CAMLparam2(xch, idx);
+   CAMLlocal1(bitmap_val);
+
+   /* Safe, because of the global ocaml lock. */
+   static uint32_t fs_len;
+
+   if (fs_len == 0)
+   {
+   int ret = xc_get_cpu_featureset(_H(xch), 0, &fs_len, NULL);
+
+   if (ret || (fs_len == 0))
+   failwith_xc(_H(xch));
+   }
+
+   {
+   /* To/from hypervisor to retrieve actual featureset */
+   uint32_t fs[fs_len], len = fs_len;
+   unsigned int i;
+
+   int ret = xc_get_cpu_featureset(_H(xch), Int_val(idx), &len, 
fs);
+
+   if (ret)
+   failwith_xc(_H(xch));
+
+   bitmap_val = caml_alloc(len, 0);
+
+   for (i = 0; i < len; ++i)
+   Store_fie

[Xen-devel] [PATCH v2 18/30] x86/cpu: Rework AMD masking MSR setup

2016-02-05 Thread Andrew Cooper
This patch is best reviewed as its end result rather than as a diff, as it
rewrites almost all of the setup.

On the BSP, cpuid information is used to evaluate the potential available set
of masking MSRs, and they are unconditionally probed, filling in the
availability information and hardware defaults.

The command line parameters are then combined with the hardware defaults to
further restrict the Xen default masking level.  Each cpu is then context
switched into the default levelling state.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 

v2:
 * Provide extra information if opt_cpu_info
 * Extra comment indicating the expected use of amd_ctxt_switch_levelling()
---
 xen/arch/x86/cpu/amd.c | 267 +++--
 1 file changed, 170 insertions(+), 97 deletions(-)

diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index 5908cba..1708dd9 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -80,6 +80,13 @@ static inline int wrmsr_amd_safe(unsigned int msr, unsigned 
int lo,
return err;
 }
 
+static void wrmsr_amd(unsigned int msr, uint64_t val)
+{
+   asm volatile("wrmsr" ::
+"c" (msr), "a" ((uint32_t)val),
+"d" (val >> 32), "D" (0x9c5a203a));
+}
+
 static const struct cpuidmask {
uint16_t fam;
char rev[2];
@@ -126,126 +133,189 @@ static const struct cpuidmask *__init noinline 
get_cpuidmask(const char *opt)
 }
 
 /*
+ * Sets caps in expected_levelling_cap, probes for the specified mask MSR, and
+ * set caps in levelling_caps if it is found.  Processors prior to Fam 10h
+ * required a 32-bit password for masking MSRs.  Reads the default value into
+ * msr_val.
+ */
+static void __init __probe_mask_msr(unsigned int msr, uint64_t caps,
+uint64_t *msr_val)
+{
+   unsigned int hi, lo;
+
+expected_levelling_cap |= caps;
+
+   if ((rdmsr_amd_safe(msr, &lo, &hi) == 0) &&
+   (wrmsr_amd_safe(msr, lo, hi) == 0))
+   levelling_caps |= caps;
+
+   *msr_val = ((uint64_t)hi << 32) | lo;
+}
+
+/*
+ * Probe for the existence of the expected masking MSRs.  They might easily
+ * not be available if Xen is running virtualised.
+ */
+static void __init noinline probe_masking_msrs(void)
+{
+   const struct cpuinfo_x86 *c = &boot_cpu_data;
+
+   /*
+* First, work out which masking MSRs we should have, based on
+* revision and cpuid.
+*/
+
+   /* Fam11 doesn't support masking at all. */
+   if (c->x86 == 0x11)
+   return;
+
+   __probe_mask_msr(MSR_K8_FEATURE_MASK, LCAP_1cd,
+&cpuidmask_defaults._1cd);
+   __probe_mask_msr(MSR_K8_EXT_FEATURE_MASK, LCAP_e1cd,
+&cpuidmask_defaults.e1cd);
+
+   if (c->cpuid_level >= 7)
+   __probe_mask_msr(MSR_AMD_L7S0_FEATURE_MASK, LCAP_7ab0,
+&cpuidmask_defaults._7ab0);
+
+   if (c->x86 == 0x15 && c->cpuid_level >= 6 && cpuid_ecx(6))
+   __probe_mask_msr(MSR_AMD_THRM_FEATURE_MASK, LCAP_6c,
+&cpuidmask_defaults._6c);
+
+   /*
+* Don't bother warning about a mismatch if virtualised.  These MSRs
+* are not architectural and almost never virtualised.
+*/
+   if ((expected_levelling_cap == levelling_caps) ||
+   cpu_has_hypervisor)
+   return;
+
+   printk(XENLOG_WARNING "Mismatch between expected (%#x) "
+  "and real (%#x) levelling caps: missing %#x\n",
+  expected_levelling_cap, levelling_caps,
+  (expected_levelling_cap ^ levelling_caps) & levelling_caps);
+   printk(XENLOG_WARNING "Fam %#x, model %#x level %#x\n",
+  c->x86, c->x86_model, c->cpuid_level);
+   printk(XENLOG_WARNING
+  "If not running virtualised, please report a bug\n");
+}
+
+/*
+ * Context switch levelling state to the next domain.  A parameter of NULL is
+ * used to context switch to the default host state, and is used by the BSP/AP
+ * startup code.
+ */
+static void amd_ctxt_switch_levelling(const struct domain *nextd)
+{
+   struct cpuidmasks *these_masks = &this_cpu(cpuidmasks);
+   const struct cpuidmasks *masks = &cpuidmask_defaults;
+
+#define LAZY(cap, msr, field)  \
+   ({  \
+   if (((levelling_caps & cap) == cap) &&  \
+   (these_masks->field != masks->field))   \
+   {   \
+   wrmsr_amd(msr, masks->field);   \
+   these_masks->field = masks->field;  \
+   }   \
+   })
+
+   LAZY(LCAP_1cd,  MSR_K8_FEATURE_MASK,   _1cd);
+  

[Xen-devel] [PATCH v2 11/30] xen/x86: Calculate maximum host and guest featuresets

2016-02-05 Thread Andrew Cooper
All of this information will be used by the toolstack to make informed
levelling decisions for VMs, and by Xen to sanity check toolstack-provided
information.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
---
 xen/arch/x86/cpuid.c| 152 
 xen/arch/x86/setup.c|   3 +
 xen/include/asm-x86/cpuid.h |  17 +
 3 files changed, 172 insertions(+)

diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
index 30a3392..1af0e6c 100644
--- a/xen/arch/x86/cpuid.c
+++ b/xen/arch/x86/cpuid.c
@@ -1,13 +1,165 @@
 #include 
 #include 
+#include 
+#include 
+#include 
+
+#define COMMON_1D INIT_COMMON_FEATURES
 
 const uint32_t known_features[] = INIT_KNOWN_FEATURES;
 const uint32_t inverted_features[] = INIT_INVERTED_FEATURES;
 
+static const uint32_t pv_featuremask[] = INIT_PV_FEATURES;
+static const uint32_t hvm_shadow_featuremask[] = INIT_HVM_SHADOW_FEATURES;
+static const uint32_t hvm_hap_featuremask[] = INIT_HVM_HAP_FEATURES;
+
+uint32_t __read_mostly raw_featureset[FSCAPINTS];
+uint32_t __read_mostly host_featureset[FSCAPINTS];
+uint32_t __read_mostly pv_featureset[FSCAPINTS];
+uint32_t __read_mostly hvm_featureset[FSCAPINTS];
+
+static void sanitise_featureset(uint32_t *fs)
+{
+unsigned int i;
+
+for ( i = 0; i < FSCAPINTS; ++i )
+{
+/* Clamp to known mask. */
+fs[i] &= known_features[i];
+}
+
+switch ( boot_cpu_data.x86_vendor )
+{
+case X86_VENDOR_INTEL:
+/* Intel clears the common bits in e1d. */
+fs[FEATURESET_e1d] &= ~COMMON_1D;
+break;
+
+case X86_VENDOR_AMD:
+/* AMD duplicates the common bits between 1d and e1d. */
+fs[FEATURESET_e1d] = ((fs[FEATURESET_1d]  &  COMMON_1D) |
+  (fs[FEATURESET_e1d] & ~COMMON_1D));
+break;
+}
+}
+
+static void calculate_raw_featureset(void)
+{
+unsigned int i, max, tmp;
+
+max = cpuid_eax(0);
+
+if ( max >= 1 )
+cpuid(0x1, &tmp, &tmp,
+  &raw_featureset[FEATURESET_1c],
+  &raw_featureset[FEATURESET_1d]);
+if ( max >= 7 )
+cpuid_count(0x7, 0, &tmp,
+&raw_featureset[FEATURESET_7b0],
+&raw_featureset[FEATURESET_7c0],
+&tmp);
+if ( max >= 0xd )
+cpuid_count(0xd, 1,
+&raw_featureset[FEATURESET_Da1],
+&tmp, &tmp, &tmp);
+
+max = cpuid_eax(0x8000);
+if ( max >= 0x8001 )
+cpuid(0x8001, &tmp, &tmp,
+  &raw_featureset[FEATURESET_e1c],
+  &raw_featureset[FEATURESET_e1d]);
+if ( max >= 0x8007 )
+cpuid(0x8007, &tmp, &tmp, &tmp,
+  &raw_featureset[FEATURESET_e7d]);
+if ( max >= 0x8008 )
+cpuid(0x8008, &tmp,
+  &raw_featureset[FEATURESET_e8b],
+  &tmp, &tmp);
+
+for ( i = 0; i < ARRAY_SIZE(raw_featureset); ++i )
+raw_featureset[i] ^= inverted_features[i];
+}
+
+static void calculate_host_featureset(void)
+{
+memcpy(host_featureset, boot_cpu_data.x86_capability,
+   sizeof(host_featureset));
+}
+
+static void calculate_pv_featureset(void)
+{
+unsigned int i;
+
+for ( i = 0; i < ARRAY_SIZE(pv_featureset); ++i )
+pv_featureset[i] = host_featureset[i] & pv_featuremask[i];
+
+/* Unconditionally claim to be able to set the hypervisor bit. */
+__set_bit(X86_FEATURE_HYPERVISOR, pv_featureset);
+
+sanitise_featureset(pv_featureset);
+}
+
+static void calculate_hvm_featureset(void)
+{
+unsigned int i;
+const uint32_t *hvm_featuremask;
+
+if ( !hvm_enabled )
+return;
+
+hvm_featuremask = hvm_funcs.hap_supported ?
+hvm_hap_featuremask : hvm_shadow_featuremask;
+
+for ( i = 0; i < ARRAY_SIZE(hvm_featureset); ++i )
+hvm_featureset[i] = host_featureset[i] & hvm_featuremask[i];
+
+/* Unconditionally claim to be able to set the hypervisor bit. */
+__set_bit(X86_FEATURE_HYPERVISOR, hvm_featureset);
+
+/*
+ * On AMD, PV guests are entirely unable to use 'sysenter' as Xen runs in
+ * long mode (and init_amd() has cleared it out of host capabilities), but
+ * HVM guests are able if running in protected mode.
+ */
+if ( (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+ test_bit(X86_FEATURE_SEP, raw_featureset) )
+__set_bit(X86_FEATURE_SEP, hvm_featureset);
+
+/*
+ * With VT-x, some features are only supported by Xen if dedicated
+ * hardware support is also available.
+ */
+if ( cpu_has_vmx )
+{
+if ( !(vmx_vmexit_control & VM_EXIT_CLEAR_BNDCFGS) ||
+ !(vmx_vmentry_control & VM_ENTRY_LOAD_BNDCFGS) )
+__clear_bit(X86_FEATURE_MPX, hvm_featureset);
+
+if ( !cpu_has_vmx_xsaves )
+__clear_bit(X86_FEATURE_XSAVES, hvm_featureset);
+
+if ( !cpu_has_vmx_pcommit )
+__clear_bit(X86_FEATURE_PCOMMIT, hvm_feature

[Xen-devel] [PATCH v2 15/30] xen/x86: Improvements to in-hypervisor cpuid sanity checks

2016-02-05 Thread Andrew Cooper
* Use the boot-generated pv and hvm featureset to clamp the visible features,
  rather than picking and choosing at individual features.  This subsumes the
  static feature manipulation.
* More use of compiler-visible &'s and |'s, rather than clear,set bit.
* Remove logic which hides PSE36 out of PAE mode.  This is not how real
  hardware behaves.
* Improve logic to set OSXSAVE.  The bit is cleared by virtue of not being
  valid in a featureset, and should be a strict fast-forward from %cr4.
  Provide a very big health warning for OXSAVE for PV guests, which is
  non-architectural.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 

v2:
 * Reinstate some of the dynamic checks for now.  Future development work will
   instate a complete per-domain policy.
 * Fix OSXSAVE handling for PV guests.
---
 xen/arch/x86/hvm/hvm.c |  56 +-
 xen/arch/x86/traps.c   | 151 -
 2 files changed, 100 insertions(+), 107 deletions(-)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 35ec6c9..03b3868 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -71,6 +71,7 @@
 #include 
 #include 
 #include 
+#include 
 
 bool_t __read_mostly hvm_enabled;
 
@@ -4617,50 +4618,39 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, 
unsigned int *ebx,
 /* Fix up VLAPIC details. */
 *ebx &= 0x00FFFFFFu;
 *ebx |= (v->vcpu_id * 2) << 24;
+
+*ecx &= hvm_featureset[FEATURESET_1c];
+*edx &= hvm_featureset[FEATURESET_1d];
+
 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
-__clear_bit(X86_FEATURE_APIC & 31, edx);
+*edx &= ~cpufeat_bit(X86_FEATURE_APIC);
 
 /* Fix up OSXSAVE. */
-if ( cpu_has_xsave )
-*ecx |= (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE) ?
- cpufeat_mask(X86_FEATURE_OSXSAVE) : 0;
+if ( v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE )
+*ecx |= cpufeat_mask(X86_FEATURE_OSXSAVE);
 
 /* Don't expose PCID to non-hap hvm. */
 if ( !hap_enabled(d) )
 *ecx &= ~cpufeat_mask(X86_FEATURE_PCID);
-
-/* Only provide PSE36 when guest runs in 32bit PAE or in long mode */
-if ( !(hvm_pae_enabled(v) || hvm_long_mode_enabled(v)) )
-*edx &= ~cpufeat_mask(X86_FEATURE_PSE36);
 break;
+
 case 0x7:
 if ( count == 0 )
 {
-if ( !cpu_has_smep )
-*ebx &= ~cpufeat_mask(X86_FEATURE_SMEP);
-
-if ( !cpu_has_smap )
-*ebx &= ~cpufeat_mask(X86_FEATURE_SMAP);
-
-/* Don't expose MPX to hvm when VMX support is not available */
-if ( !(vmx_vmexit_control & VM_EXIT_CLEAR_BNDCFGS) ||
- !(vmx_vmentry_control & VM_ENTRY_LOAD_BNDCFGS) )
-*ebx &= ~cpufeat_mask(X86_FEATURE_MPX);
+*ebx &= hvm_featureset[FEATURESET_7b0];
+*ecx &= hvm_featureset[FEATURESET_7c0];
 
 /* Don't expose INVPCID to non-hap hvm. */
 if ( !hap_enabled(d) )
 *ebx &= ~cpufeat_mask(X86_FEATURE_INVPCID);
-
-/* Don't expose PCOMMIT to hvm when VMX support is not available */
-if ( !cpu_has_vmx_pcommit )
-*ebx &= ~cpufeat_mask(X86_FEATURE_PCOMMIT);
 }
-
 break;
+
 case 0xb:
 /* Fix the x2APIC identifier. */
 *edx = v->vcpu_id * 2;
 break;
+
 case 0xd:
 /* EBX value of main leaf 0 depends on enabled xsave features */
 if ( count == 0 && v->arch.xcr0 ) 
@@ -4677,9 +4667,12 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, 
unsigned int *ebx,
 *ebx = _eax + _ebx;
 }
 }
+
 if ( count == 1 )
 {
-if ( cpu_has_xsaves && cpu_has_vmx_xsaves )
+*eax &= hvm_featureset[FEATURESET_Da1];
+
+if ( *eax & cpufeat_mask(X86_FEATURE_XSAVES) )
 {
 *ebx = XSTATE_AREA_MIN_SIZE;
 if ( v->arch.xcr0 | v->arch.hvm_vcpu.msr_xss )
@@ -4694,6 +4687,9 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, 
unsigned int *ebx,
 break;
 
 case 0x80000001:
+*ecx &= hvm_featureset[FEATURESET_e1c];
+*edx &= hvm_featureset[FEATURESET_e1d];
+
 /* We expose RDTSCP feature to guest only when
tsc_mode == TSC_MODE_DEFAULT and host_tsc_is_safe() returns 1 */
 if ( d->arch.tsc_mode != TSC_MODE_DEFAULT ||
@@ -4702,12 +4698,10 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, 
unsigned int *ebx,
 /* Hide 1GB-superpage feature if we can't emulate it. */
 if (!hvm_pse1gb_supported(d))
 *edx &= ~cpufeat_mask(X86_FEATURE_PAGE1GB);
-/* Only provide PSE36 when guest runs in 32bit PAE or in long mode */
-if ( !(hvm_pae_enabled(v) || hvm_long_mode_enabled(v)) )
-*edx &= ~cpufeat_mask(X86_FEATURE_PSE36);
-

[Xen-devel] [PATCH v2 17/30] x86/cpu: Common infrastructure for levelling context switching

2016-02-05 Thread Andrew Cooper
This change is purely scaffolding to reduce the complexity of the following
three patches.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 

v2: s/cpumasks/cpuidmasks/
---
 xen/arch/x86/cpu/common.c|  6 ++
 xen/include/asm-x86/cpufeature.h |  1 +
 xen/include/asm-x86/processor.h  | 28 
 3 files changed, 35 insertions(+)

diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index 46d93a6..3fdae96 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -36,6 +36,12 @@ integer_param("cpuid_mask_ext_ecx", opt_cpuid_mask_ext_ecx);
 unsigned int opt_cpuid_mask_ext_edx = ~0u;
 integer_param("cpuid_mask_ext_edx", opt_cpuid_mask_ext_edx);
 
+unsigned int __initdata expected_levelling_cap;
+unsigned int __read_mostly levelling_caps;
+
+DEFINE_PER_CPU(struct cpuidmasks, cpuidmasks);
+struct cpuidmasks __read_mostly cpuidmask_defaults;
+
 const struct cpu_dev *__read_mostly cpu_devs[X86_VENDOR_NUM] = {};
 
 unsigned int paddr_bits __read_mostly = 36;
diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
index f228fa2..8ac6b56 100644
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -95,6 +95,7 @@
 #define cpu_has_xsavec boot_cpu_has(X86_FEATURE_XSAVEC)
 #define cpu_has_xgetbv1boot_cpu_has(X86_FEATURE_XGETBV1)
 #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES)
+#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
 
 enum _cache_type {
 CACHE_TYPE_NULL = 0,
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 271340e..09e82d8 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -574,6 +574,34 @@ void microcode_set_module(unsigned int);
 int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void), unsigned long len);
 int microcode_resume_cpu(unsigned int cpu);
 
+#define LCAP_faulting (1U << 0)
+#define LCAP_1cd  (3U << 1)
+#define LCAP_e1cd (3U << 3)
+#define LCAP_Da1  (1U << 5)
+#define LCAP_6c   (1U << 6)
+#define LCAP_7ab0 (3U << 7)
+
+/*
+ * Expected levelling capabilities (given cpuid vendor/family information),
+ * and levelling capabilities actually available (given MSR probing).
+ */
+extern unsigned int expected_levelling_cap, levelling_caps;
+
+struct cpuidmasks
+{
+uint64_t _1cd;
+uint64_t e1cd;
+uint64_t Da1;
+uint64_t _6c;
+uint64_t _7ab0;
+};
+
+/* Per CPU shadows of masking MSR values, for lazy context switching. */
+DECLARE_PER_CPU(struct cpuidmasks, cpuidmasks);
+
+/* Default masking MSR values, calculated at boot. */
+extern struct cpuidmasks cpuidmask_defaults;
+
 enum get_cpu_vendor {
gcv_host_early,
gcv_host_late,
-- 
2.1.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v2 14/30] xen/x86: Improve disabling of features which have dependencies

2016-02-05 Thread Andrew Cooper
APIC and XSAVE have dependent features, which also need disabling if Xen
chooses to disable a feature.

Use setup_clear_cpu_cap() rather than clear_bit(), as it takes care of
dependent features as well.

Signed-off-by: Andrew Cooper 
Reviewed-by: Jan Beulich 
---
v2: Move boolean_param() adjacent to use_xsave in xstate_init()
---
 xen/arch/x86/apic.c   |  2 +-
 xen/arch/x86/cpu/common.c | 12 +++-
 xen/arch/x86/xstate.c |  6 +-
 3 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/xen/arch/x86/apic.c b/xen/arch/x86/apic.c
index b9601ad..8df5bd3 100644
--- a/xen/arch/x86/apic.c
+++ b/xen/arch/x86/apic.c
@@ -1349,7 +1349,7 @@ void pmu_apic_interrupt(struct cpu_user_regs *regs)
 int __init APIC_init_uniprocessor (void)
 {
 if (enable_local_apic < 0)
-__clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+setup_clear_cpu_cap(X86_FEATURE_APIC);
 
 if (!smp_found_config && !cpu_has_apic) {
 skip_ioapic_setup = 1;
diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index e205565..46d93a6 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -16,9 +16,6 @@
 
 #include "cpu.h"
 
-static bool_t use_xsave = 1;
-boolean_param("xsave", use_xsave);
-
 bool_t opt_arat = 1;
 boolean_param("arat", opt_arat);
 
@@ -343,12 +340,6 @@ void identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_init)
this_cpu->c_init(c);
 
-/* Initialize xsave/xrstor features */
-   if ( !use_xsave )
-   __clear_bit(X86_FEATURE_XSAVE, boot_cpu_data.x86_capability);
-
-   if ( cpu_has_xsave )
-   xstate_init(c);
 
if ( !opt_pku )
setup_clear_cpu_cap(X86_FEATURE_PKU);
@@ -374,6 +365,9 @@ void identify_cpu(struct cpuinfo_x86 *c)
 
/* Now the feature flags better reflect actual CPU features! */
 
+   if ( cpu_has_xsave )
+   xstate_init(c);
+
 #ifdef NOISY_CAPS
printk(KERN_DEBUG "CPU: After all inits, caps:");
for (i = 0; i < NCAPINTS; i++)
diff --git a/xen/arch/x86/xstate.c b/xen/arch/x86/xstate.c
index c5d17ff..56b5df2 100644
--- a/xen/arch/x86/xstate.c
+++ b/xen/arch/x86/xstate.c
@@ -505,11 +505,15 @@ unsigned int xstate_ctxt_size(u64 xcr0)
 /* Collect the information of processor's extended state */
 void xstate_init(struct cpuinfo_x86 *c)
 {
+static bool_t __initdata use_xsave = 1;
+boolean_param("xsave", use_xsave);
+
 bool_t bsp = c == &boot_cpu_data;
 u32 eax, ebx, ecx, edx;
 u64 feature_mask;
 
-if ( boot_cpu_data.cpuid_level < XSTATE_CPUID )
+if ( (bsp && !use_xsave) ||
+ boot_cpu_data.cpuid_level < XSTATE_CPUID )
 {
 BUG_ON(!bsp);
 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-- 
2.1.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v2 10/30] xen/x86: Annotate VM applicability in featureset

2016-02-05 Thread Andrew Cooper
Use attributes to specify whether a feature is applicable to be exposed to:
 1) All guests
 2) HVM guests
 3) HVM HAP guests

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 

v2: Annotate features using a magic comment and autogeneration.
---
 xen/include/public/arch-x86/cpufeatureset.h | 187 ++--
 xen/tools/gen-cpuid.py  |  32 -
 2 files changed, 127 insertions(+), 92 deletions(-)

diff --git a/xen/include/public/arch-x86/cpufeatureset.h 
b/xen/include/public/arch-x86/cpufeatureset.h
index 2748cfd..d10b725 100644
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -55,139 +55,144 @@
  * Inverted: '!'
  *   This feature has its value in a featureset inverted, compared to how it
  *   is specified by vendor architecture manuals.
+ *
+ * Applicability to guests: 'A', 'S' or 'H'
+ *   'A' = All guests.
+ *   'S' = All HVM guests (not PV guests).
+ *   'H' = HVM HAP guests (not PV or HVM Shadow guests).
  */
 
 /* Intel-defined CPU features, CPUID level 0x0001.edx, word 0 */
-#define X86_FEATURE_FPU   ( 0*32+ 0) /*   Onboard FPU */
-#define X86_FEATURE_VME   ( 0*32+ 1) /*   Virtual Mode Extensions */
-#define X86_FEATURE_DE( 0*32+ 2) /*   Debugging Extensions */
-#define X86_FEATURE_PSE   ( 0*32+ 3) /*   Page Size Extensions */
-#define X86_FEATURE_TSC   ( 0*32+ 4) /*   Time Stamp Counter */
-#define X86_FEATURE_MSR   ( 0*32+ 5) /*   Model-Specific Registers, 
RDMSR, WRMSR */
-#define X86_FEATURE_PAE   ( 0*32+ 6) /*   Physical Address Extensions 
*/
-#define X86_FEATURE_MCE   ( 0*32+ 7) /*   Machine Check Architecture */
-#define X86_FEATURE_CX8   ( 0*32+ 8) /*   CMPXCHG8 instruction */
-#define X86_FEATURE_APIC  ( 0*32+ 9) /*   Onboard APIC */
-#define X86_FEATURE_SEP   ( 0*32+11) /*   SYSENTER/SYSEXIT */
-#define X86_FEATURE_MTRR  ( 0*32+12) /*   Memory Type Range Registers 
*/
-#define X86_FEATURE_PGE   ( 0*32+13) /*   Page Global Enable */
-#define X86_FEATURE_MCA   ( 0*32+14) /*   Machine Check Architecture */
-#define X86_FEATURE_CMOV  ( 0*32+15) /*   CMOV instruction (FCMOVCC 
and FCOMI too if FPU present) */
-#define X86_FEATURE_PAT   ( 0*32+16) /*   Page Attribute Table */
-#define X86_FEATURE_PSE36 ( 0*32+17) /*   36-bit PSEs */
+#define X86_FEATURE_FPU   ( 0*32+ 0) /*A  Onboard FPU */
+#define X86_FEATURE_VME   ( 0*32+ 1) /*S  Virtual Mode Extensions */
+#define X86_FEATURE_DE( 0*32+ 2) /*A  Debugging Extensions */
+#define X86_FEATURE_PSE   ( 0*32+ 3) /*S  Page Size Extensions */
+#define X86_FEATURE_TSC   ( 0*32+ 4) /*A  Time Stamp Counter */
+#define X86_FEATURE_MSR   ( 0*32+ 5) /*A  Model-Specific Registers, 
RDMSR, WRMSR */
+#define X86_FEATURE_PAE   ( 0*32+ 6) /*A  Physical Address Extensions 
*/
+#define X86_FEATURE_MCE   ( 0*32+ 7) /*A  Machine Check Architecture */
+#define X86_FEATURE_CX8   ( 0*32+ 8) /*A  CMPXCHG8 instruction */
+#define X86_FEATURE_APIC  ( 0*32+ 9) /*A  Onboard APIC */
+#define X86_FEATURE_SEP   ( 0*32+11) /*A  SYSENTER/SYSEXIT */
+#define X86_FEATURE_MTRR  ( 0*32+12) /*S  Memory Type Range Registers 
*/
+#define X86_FEATURE_PGE   ( 0*32+13) /*S  Page Global Enable */
+#define X86_FEATURE_MCA   ( 0*32+14) /*A  Machine Check Architecture */
+#define X86_FEATURE_CMOV  ( 0*32+15) /*A  CMOV instruction (FCMOVCC 
and FCOMI too if FPU present) */
+#define X86_FEATURE_PAT   ( 0*32+16) /*A  Page Attribute Table */
+#define X86_FEATURE_PSE36 ( 0*32+17) /*S  36-bit PSEs */
 #define X86_FEATURE_PN( 0*32+18) /*   Processor serial number */
-#define X86_FEATURE_CLFLSH( 0*32+19) /*   CLFLUSH instruction */
+#define X86_FEATURE_CLFLSH( 0*32+19) /*A  CLFLUSH instruction */
 #define X86_FEATURE_DS( 0*32+21) /*   Debug Store */
-#define X86_FEATURE_ACPI  ( 0*32+22) /*   ACPI via MSR */
-#define X86_FEATURE_MMX   ( 0*32+23) /*   Multimedia Extensions */
-#define X86_FEATURE_FXSR  ( 0*32+24) /*   FXSAVE and FXRSTOR 
instructions */
-#define X86_FEATURE_XMM   ( 0*32+25) /*   Streaming SIMD Extensions */
-#define X86_FEATURE_XMM2  ( 0*32+26) /*   Streaming SIMD Extensions-2 
*/
+#define X86_FEATURE_ACPI  ( 0*32+22) /*A  ACPI via MSR */
+#define X86_FEATURE_MMX   ( 0*32+23) /*A  Multimedia Extensions */
+#define X86_FEATURE_FXSR  ( 0*32+24) /*A  FXSAVE and FXRSTOR 
instructions */
+#define X86_FEATURE_XMM   ( 0*32+25) /*A  Streaming SIMD Extensions */
+#define X86_FEATURE_XMM2  ( 0*32+26) /*A  Streaming SIMD Extensions-2 
*/
 #define X86_FEATURE_SELFSNOOP ( 0*32+27) /*   CPU self snoop */
-#define X86_FEATURE_HT( 0*32+28) /*   Hyper-Threading */
+#define X86_FEATURE_HT(

[Xen-devel] [PATCH v2 13/30] xen/x86: Clear dependent features when clearing a cpu cap

2016-02-05 Thread Andrew Cooper
When clearing a cpu cap, clear all dependent features.  This avoids having a
featureset with intermediate features disabled, but leaf features enabled.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
---
 xen/arch/x86/cpu/common.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index 39c340b..e205565 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -53,8 +53,24 @@ static unsigned int cleared_caps[NCAPINTS];
 
 void __init setup_clear_cpu_cap(unsigned int cap)
 {
+   const uint32_t *dfs;
+   unsigned int i;
+
+   if ( test_bit(cap, cleared_caps) )
+   return;
+
__clear_bit(cap, boot_cpu_data.x86_capability);
__set_bit(cap, cleared_caps);
+
+   dfs = lookup_deep_deps(cap);
+
+   if ( !dfs )
+   return;
+
+   for ( i = 0; i < FSCAPINTS; ++i ) {
+   cleared_caps[i] |= dfs[i];
+   boot_cpu_data.x86_capability[i] &= ~dfs[i];
+   }
 }
 
 static void default_init(struct cpuinfo_x86 * c)
-- 
2.1.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v2 19/30] x86/cpu: Rework Intel masking/faulting setup

2016-02-05 Thread Andrew Cooper
This patch is best reviewed as its end result rather than as a diff, as it
rewrites almost all of the setup.

On the BSP, cpuid information is used to evaluate the potential available set
of masking MSRs, and they are unconditionally probed, filling in the
availability information and hardware defaults.  A side effect of this is that
probe_intel_cpuid_faulting() can move to being __init.

The command line parameters are then combined with the hardware defaults to
further restrict the Xen default masking level.  Each cpu is then context
switched into the default levelling state.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 

v2:
 * Style fixes
 * Provide extra information if opt_cpu_info
 * Extra comment indicating the expected use of intel_ctxt_switch_levelling()
---
 xen/arch/x86/cpu/intel.c | 242 +--
 1 file changed, 150 insertions(+), 92 deletions(-)

diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c
index ad22375..143f497 100644
--- a/xen/arch/x86/cpu/intel.c
+++ b/xen/arch/x86/cpu/intel.c
@@ -18,11 +18,18 @@
 
 #define select_idle_routine(x) ((void)0)
 
-static unsigned int probe_intel_cpuid_faulting(void)
+static bool_t __init probe_intel_cpuid_faulting(void)
 {
uint64_t x;
-   return !rdmsr_safe(MSR_INTEL_PLATFORM_INFO, x) &&
-   (x & MSR_PLATFORM_INFO_CPUID_FAULTING);
+
+   if ( rdmsr_safe(MSR_INTEL_PLATFORM_INFO, x) ||
+!(x & MSR_PLATFORM_INFO_CPUID_FAULTING) )
+   return 0;
+
+   expected_levelling_cap |= LCAP_faulting;
+   levelling_caps |=  LCAP_faulting;
+   __set_bit(X86_FEATURE_CPUID_FAULTING, boot_cpu_data.x86_capability);
+   return 1;
 }
 
 static DEFINE_PER_CPU(bool_t, cpuid_faulting_enabled);
@@ -44,41 +51,46 @@ void set_cpuid_faulting(bool_t enable)
 }
 
 /*
- * opt_cpuid_mask_ecx/edx: cpuid.1[ecx, edx] feature mask.
- * For example, E8400[Intel Core 2 Duo Processor series] ecx = 0x0008E3FD,
- * edx = 0xBFEBFBFF when executing CPUID.EAX = 1 normally. If you want to
- * 'rev down' to E8400, you can set these values in these Xen boot parameters.
+ * Set caps in expected_levelling_cap, probe a specific masking MSR, and set
+ * caps in levelling_caps if it is found, or clobber the MSR index if missing.
+ * If present, reads the default value into msr_val.
  */
-static void set_cpuidmask(const struct cpuinfo_x86 *c)
+static void __init __probe_mask_msr(unsigned int *msr, uint64_t caps,
+   uint64_t *msr_val)
 {
-   static unsigned int msr_basic, msr_ext, msr_xsave;
-   static enum { not_parsed, no_mask, set_mask } status;
-   u64 msr_val;
+   uint64_t val;
 
-   if (status == no_mask)
-   return;
+   expected_levelling_cap |= caps;
 
-   if (status == set_mask)
-   goto setmask;
+   if (rdmsr_safe(*msr, val) || wrmsr_safe(*msr, val))
+   *msr = 0;
+   else
+   {
+   levelling_caps |= caps;
+   *msr_val = val;
+   }
+}
 
-   ASSERT((status == not_parsed) && (c == &boot_cpu_data));
-   status = no_mask;
+/* Indices of the masking MSRs, or 0 if unavailable. */
+static unsigned int __read_mostly msr_basic, msr_ext, msr_xsave;
 
-   if (!~(opt_cpuid_mask_ecx & opt_cpuid_mask_edx &
-  opt_cpuid_mask_ext_ecx & opt_cpuid_mask_ext_edx &
-  opt_cpuid_mask_xsave_eax))
-   return;
+/*
+ * Probe for the existence of the expected masking MSRs.  They might easily
+ * not be available if Xen is running virtualised.
+ */
+static void __init probe_masking_msrs(void)
+{
+   const struct cpuinfo_x86 *c = &boot_cpu_data;
+   unsigned int exp_msr_basic = 0, exp_msr_ext = 0, exp_msr_xsave = 0;
 
/* Only family 6 supports this feature. */
-   if (c->x86 != 6) {
-   printk("No CPUID feature masking support available\n");
+   if (c->x86 != 6)
return;
-   }
 
switch (c->x86_model) {
case 0x17: /* Yorkfield, Wolfdale, Penryn, Harpertown(DP) */
case 0x1d: /* Dunnington(MP) */
-   msr_basic = MSR_INTEL_MASK_V1_CPUID1;
+   exp_msr_basic = msr_basic = MSR_INTEL_MASK_V1_CPUID1;
break;
 
case 0x1a: /* Bloomfield, Nehalem-EP(Gainestown) */
@@ -88,71 +100,126 @@ static void set_cpuidmask(const struct cpuinfo_x86 *c)
case 0x2c: /* Gulftown, Westmere-EP */
case 0x2e: /* Nehalem-EX(Beckton) */
case 0x2f: /* Westmere-EX */
-   msr_basic = MSR_INTEL_MASK_V2_CPUID1;
-   msr_ext   = MSR_INTEL_MASK_V2_CPUID8001;
+   exp_msr_basic = msr_basic = MSR_INTEL_MASK_V2_CPUID1;
+   exp_msr_ext   = msr_ext   = MSR_INTEL_MASK_V2_CPUID8001;
break;
 
case 0x2a: /* SandyBridge */
case 0x2d: /* SandyBridge-E, SandyBridge-EN, SandyBridge-EP */
-   msr_basic = MSR_INTEL_MASK_V3_CPUID1;
- 

[Xen-devel] [PATCH v2 21/30] x86/pv: Provide custom cpumasks for PV domains

2016-02-05 Thread Andrew Cooper
And use them in preference to cpumask_defaults on context switch.  HVM domains
must not be masked (to avoid interfering with cpuid calls within the guest),
so always lazily context switch to the host default.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 

v2:
 * s/cpumasks/cpuidmasks/
 * Use structure assignment
 * Fix error path in arch_domain_create()
---
 xen/arch/x86/cpu/amd.c   |  4 +++-
 xen/arch/x86/cpu/intel.c |  5 -
 xen/arch/x86/domain.c| 11 +++
 xen/include/asm-x86/domain.h |  2 ++
 4 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index 9d162bc..deb98ea 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -208,7 +208,9 @@ static void __init noinline probe_masking_msrs(void)
 static void amd_ctxt_switch_levelling(const struct domain *nextd)
 {
struct cpuidmasks *these_masks = &this_cpu(cpuidmasks);
-   const struct cpuidmasks *masks = &cpuidmask_defaults;
+   const struct cpuidmasks *masks =
+(nextd && is_pv_domain(nextd) && nextd->arch.pv_domain.cpuidmasks)
+? nextd->arch.pv_domain.cpuidmasks : &cpuidmask_defaults;
 
 #define LAZY(cap, msr, field)  \
({  \
diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c
index 95d44dd..b403af4 100644
--- a/xen/arch/x86/cpu/intel.c
+++ b/xen/arch/x86/cpu/intel.c
@@ -151,13 +151,16 @@ static void __init probe_masking_msrs(void)
 static void intel_ctxt_switch_levelling(const struct domain *nextd)
 {
struct cpuidmasks *these_masks = &this_cpu(cpuidmasks);
-   const struct cpuidmasks *masks = &cpuidmask_defaults;
+   const struct cpuidmasks *masks;
 
if (cpu_has_cpuid_faulting) {
set_cpuid_faulting(nextd && is_pv_domain(nextd));
return;
}
 
+   masks = (nextd && is_pv_domain(nextd) && 
nextd->arch.pv_domain.cpuidmasks)
+   ? nextd->arch.pv_domain.cpuidmasks : &cpuidmask_defaults;
+
 #define LAZY(msr, field)   \
({  \
if (msr && (these_masks->field != masks->field))\
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index dbce90f..d7cd4d2 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -574,6 +574,11 @@ int arch_domain_create(struct domain *d, unsigned int 
domcr_flags,
 goto fail;
 clear_page(d->arch.pv_domain.gdt_ldt_l1tab);
 
+d->arch.pv_domain.cpuidmasks = xmalloc(struct cpuidmasks);
+if ( !d->arch.pv_domain.cpuidmasks )
+goto fail;
+*d->arch.pv_domain.cpuidmasks = cpuidmask_defaults;
+
 rc = create_perdomain_mapping(d, GDT_LDT_VIRT_START,
   GDT_LDT_MBYTES << (20 - PAGE_SHIFT),
   NULL, NULL);
@@ -663,7 +668,10 @@ int arch_domain_create(struct domain *d, unsigned int 
domcr_flags,
 paging_final_teardown(d);
 free_perdomain_mappings(d);
 if ( is_pv_domain(d) )
+{
+xfree(d->arch.pv_domain.cpuidmasks);
 free_xenheap_page(d->arch.pv_domain.gdt_ldt_l1tab);
+}
 psr_domain_free(d);
 return rc;
 }
@@ -683,7 +691,10 @@ void arch_domain_destroy(struct domain *d)
 
 free_perdomain_mappings(d);
 if ( is_pv_domain(d) )
+{
 free_xenheap_page(d->arch.pv_domain.gdt_ldt_l1tab);
+xfree(d->arch.pv_domain.cpuidmasks);
+}
 
 free_xenheap_page(d->shared_info);
 cleanup_domain_irq_mapping(d);
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 4072e27..c464932 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -252,6 +252,8 @@ struct pv_domain
 
 /* map_domain_page() mapping cache. */
 struct mapcache_domain mapcache;
+
+struct cpuidmasks *cpuidmasks;
 };
 
 struct monitor_write_data {
-- 
2.1.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v2 26/30] tools/libxc: Expose the automatically generated cpu featuremask information

2016-02-05 Thread Andrew Cooper
Signed-off-by: Andrew Cooper 
---
CC: Ian Campbell 
CC: Ian Jackson 
CC: Wei Liu 

New in v2
---
 tools/libxc/Makefile  |  9 ++
 tools/libxc/include/xenctrl.h | 14 
 tools/libxc/xc_cpuid_x86.c| 75 +++
 3 files changed, 98 insertions(+)

diff --git a/tools/libxc/Makefile b/tools/libxc/Makefile
index 0a8614c..30de3fe 100644
--- a/tools/libxc/Makefile
+++ b/tools/libxc/Makefile
@@ -145,6 +145,15 @@ $(eval $(genpath-target))
 
 xc_private.h: _paths.h
 
+ifeq ($(CONFIG_X86),y)
+
+_xc_cpuid_autogen.h: $(XEN_ROOT)/xen/include/public/arch-x86/cpufeatureset.h 
$(XEN_ROOT)/xen/tools/gen-cpuid.py
+   $(PYTHON) $(XEN_ROOT)/xen/tools/gen-cpuid.py -i $^ -o $@.new
+   $(call move-if-changed,$@.new,$@)
+
+build: _xc_cpuid_autogen.h
+endif
+
 $(CTRL_LIB_OBJS) $(GUEST_LIB_OBJS) \
 $(CTRL_PIC_OBJS) $(GUEST_PIC_OBJS): xc_private.h
 
diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 5a7500a..1da372d 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -2574,6 +2574,20 @@ int xc_psr_cat_get_l3_info(xc_interface *xch, uint32_t 
socket,
 
 int xc_get_cpu_featureset(xc_interface *xch, uint32_t index,
   uint32_t *nr_features, uint32_t *featureset);
+
+uint32_t xc_get_cpu_featureset_size(void);
+
+enum xc_static_cpu_featuremask {
+XC_FEATUREMASK_KNOWN,
+XC_FEATUREMASK_INVERTED,
+XC_FEATUREMASK_PV,
+XC_FEATUREMASK_HVM_SHADOW,
+XC_FEATUREMASK_HVM_HAP,
+XC_FEATUREMASK_DEEP_FEATURES,
+};
+const uint32_t *xc_get_static_cpu_featuremask(enum xc_static_cpu_featuremask);
+const uint32_t *xc_get_feature_deep_deps(uint32_t feature);
+
 #endif
 
 /* Compat shims */
diff --git a/tools/libxc/xc_cpuid_x86.c b/tools/libxc/xc_cpuid_x86.c
index 348cbdd..7ef37d2 100644
--- a/tools/libxc/xc_cpuid_x86.c
+++ b/tools/libxc/xc_cpuid_x86.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include "xc_private.h"
+#include "_xc_cpuid_autogen.h"
 #include 
 #include 
 
@@ -60,6 +61,80 @@ int xc_get_cpu_featureset(xc_interface *xch, uint32_t index,
 return ret;
 }
 
+uint32_t xc_get_cpu_featureset_size(void)
+{
+return FEATURESET_NR_ENTRIES;
+}
+
+const uint32_t *xc_get_static_cpu_featuremask(
+enum xc_static_cpu_featuremask mask)
+{
+const static uint32_t known[FEATURESET_NR_ENTRIES] = INIT_KNOWN_FEATURES,
+inverted[FEATURESET_NR_ENTRIES] = INIT_INVERTED_FEATURES,
+pv[FEATURESET_NR_ENTRIES] = INIT_PV_FEATURES,
+hvm_shadow[FEATURESET_NR_ENTRIES] = INIT_HVM_SHADOW_FEATURES,
+hvm_hap[FEATURESET_NR_ENTRIES] = INIT_HVM_HAP_FEATURES,
+deep_features[FEATURESET_NR_ENTRIES] = INIT_DEEP_FEATURES;
+
+XC_BUILD_BUG_ON(ARRAY_SIZE(known) != FEATURESET_NR_ENTRIES);
+XC_BUILD_BUG_ON(ARRAY_SIZE(inverted) != FEATURESET_NR_ENTRIES);
+XC_BUILD_BUG_ON(ARRAY_SIZE(pv) != FEATURESET_NR_ENTRIES);
+XC_BUILD_BUG_ON(ARRAY_SIZE(hvm_shadow) != FEATURESET_NR_ENTRIES);
+XC_BUILD_BUG_ON(ARRAY_SIZE(hvm_hap) != FEATURESET_NR_ENTRIES);
+XC_BUILD_BUG_ON(ARRAY_SIZE(deep_features) != FEATURESET_NR_ENTRIES);
+
+switch ( mask )
+{
+case XC_FEATUREMASK_KNOWN:
+return known;
+
+case XC_FEATUREMASK_INVERTED:
+return inverted;
+
+case XC_FEATUREMASK_PV:
+return pv;
+
+case XC_FEATUREMASK_HVM_SHADOW:
+return hvm_shadow;
+
+case XC_FEATUREMASK_HVM_HAP:
+return hvm_hap;
+
+case XC_FEATUREMASK_DEEP_FEATURES:
+return deep_features;
+
+default:
+return NULL;
+}
+}
+
+const uint32_t *xc_get_feature_deep_deps(uint32_t feature)
+{
+static const struct {
+uint32_t feature;
+uint32_t fs[FEATURESET_NR_ENTRIES];
+} deep_deps[] = INIT_DEEP_DEPS;
+
+unsigned int start = 0, end = ARRAY_SIZE(deep_deps);
+
+XC_BUILD_BUG_ON(ARRAY_SIZE(deep_deps) != NR_DEEP_DEPS);
+
+/* deep_deps[] is sorted.  Perform a binary search. */
+while ( start < end )
+{
+unsigned int mid = start + ((end - start) / 2);
+
+if ( deep_deps[mid].feature > feature )
+end = mid;
+else if ( deep_deps[mid].feature < feature )
+start = mid + 1;
+else
+return deep_deps[mid].fs;
+}
+
+return NULL;
+}
+
 struct cpuid_domain_info
 {
 enum
-- 
2.1.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v2 28/30] tools/libxc: Wire a featureset through to cpuid policy logic

2016-02-05 Thread Andrew Cooper
Later changes will cause the cpuid generation logic to seed their information
from a featureset.  This patch adds the infrastructure to specify a
featureset, and will obtain the appropriate default from Xen if omitted.

Signed-off-by: Andrew Cooper 
---
CC: Ian Campbell 
CC: Ian Jackson 
CC: Wei Liu 

v2:
 * Modify existing call rather than introducing a new one.
 * Fix up in-tree callsites.
---
 tools/libxc/include/xenctrl.h   |  4 ++-
 tools/libxc/xc_cpuid_x86.c  | 69 -
 tools/libxl/libxl_cpuid.c   |  2 +-
 tools/ocaml/libs/xc/xenctrl_stubs.c |  2 +-
 tools/python/xen/lowlevel/xc/xc.c   |  2 +-
 5 files changed, 66 insertions(+), 13 deletions(-)

diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
index 1da372d..230f834 100644
--- a/tools/libxc/include/xenctrl.h
+++ b/tools/libxc/include/xenctrl.h
@@ -1927,7 +1927,9 @@ int xc_cpuid_set(xc_interface *xch,
  const char **config,
  char **config_transformed);
 int xc_cpuid_apply_policy(xc_interface *xch,
-  domid_t domid);
+  domid_t domid,
+  uint32_t *featureset,
+  unsigned int nr_features);
 void xc_cpuid_to_str(const unsigned int *regs,
  char **strs); /* some strs[] may be NULL if ENOMEM */
 int xc_mca_op(xc_interface *xch, struct xen_mc *mc);
diff --git a/tools/libxc/xc_cpuid_x86.c b/tools/libxc/xc_cpuid_x86.c
index 7ef37d2..e762d73 100644
--- a/tools/libxc/xc_cpuid_x86.c
+++ b/tools/libxc/xc_cpuid_x86.c
@@ -148,6 +148,9 @@ struct cpuid_domain_info
 bool pvh;
 uint64_t xfeature_mask;
 
+uint32_t *featureset;
+unsigned int nr_features;
+
 /* PV-only information. */
 bool pv64;
 
@@ -179,11 +182,14 @@ static void cpuid(const unsigned int *input, unsigned int 
*regs)
 }
 
 static int get_cpuid_domain_info(xc_interface *xch, domid_t domid,
- struct cpuid_domain_info *info)
+ struct cpuid_domain_info *info,
+ uint32_t *featureset,
+ unsigned int nr_features)
 {
 struct xen_domctl domctl = {};
 xc_dominfo_t di;
 unsigned int in[2] = { 0, ~0U }, regs[4];
+unsigned int i, host_nr_features = xc_get_cpu_featureset_size();
 int rc;
 
 cpuid(in, regs);
@@ -205,6 +211,23 @@ static int get_cpuid_domain_info(xc_interface *xch, 
domid_t domid,
 info->hvm = di.hvm;
 info->pvh = di.pvh;
 
+info->featureset = calloc(host_nr_features, sizeof(*info->featureset));
+if ( !info->featureset )
+return -ENOMEM;
+
+info->nr_features = host_nr_features;
+
+if ( featureset )
+{
+memcpy(info->featureset, featureset,
+   min(host_nr_features, nr_features) * sizeof(*info->featureset));
+
+/* Check for truncated set bits. */
+for ( i = nr_features; i < host_nr_features; ++i )
+if ( featureset[i] != 0 )
+return -EOPNOTSUPP;
+}
+
 /* Get xstate information. */
 domctl.cmd = XEN_DOMCTL_getvcpuextstate;
 domctl.domain = domid;
@@ -229,6 +252,14 @@ static int get_cpuid_domain_info(xc_interface *xch, 
domid_t domid,
 return rc;
 
 info->nestedhvm = !!val;
+
+if ( !featureset )
+{
+rc = xc_get_cpu_featureset(xch, XEN_SYSCTL_cpu_featureset_hvm,
+   &host_nr_features, info->featureset);
+if ( rc )
+return rc;
+}
 }
 else
 {
@@ -239,11 +270,24 @@ static int get_cpuid_domain_info(xc_interface *xch, 
domid_t domid,
 return rc;
 
 info->pv64 = (width == 8);
+
+if ( !featureset )
+{
+rc = xc_get_cpu_featureset(xch, XEN_SYSCTL_cpu_featureset_pv,
+   &host_nr_features, info->featureset);
+if ( rc )
+return rc;
+}
 }
 
 return 0;
 }
 
+static void free_cpuid_domain_info(struct cpuid_domain_info *info)
+{
+free(info->featureset);
+}
+
 static void amd_xc_cpuid_policy(xc_interface *xch,
 const struct cpuid_domain_info *info,
 const unsigned int *input, unsigned int *regs)
@@ -764,16 +808,18 @@ void xc_cpuid_to_str(const unsigned int *regs, char 
**strs)
 }
 }
 
-int xc_cpuid_apply_policy(xc_interface *xch, domid_t domid)
+int xc_cpuid_apply_policy(xc_interface *xch, domid_t domid,
+  uint32_t *featureset,
+  unsigned int nr_features)
 {
 struct cpuid_domain_info info = {};
 unsigned int input[2] = { 0, 0 }, regs[4];
 unsigned int base_max, ext_max;
 int rc;
 
-rc = get_cpuid_domain_info(xch, domid, &info);
+rc = get_cpuid_domain_info(xch, domid, &info, featureset, nr_features);
 if ( rc )
-   

[Xen-devel] [PATCH v2 30/30] tools/libxc: Calculate xstate cpuid leaf from guest information

2016-02-05 Thread Andrew Cooper
It is unsafe to generate the guests xstate leaves from host information, as it
prevents the differences between hosts from being hidden.

Signed-off-by: Andrew Cooper 
---
CC: Ian Campbell 
CC: Ian Jackson 
CC: Wei Liu 
---
 tools/libxc/xc_cpuid_x86.c | 44 ++--
 1 file changed, 34 insertions(+), 10 deletions(-)

diff --git a/tools/libxc/xc_cpuid_x86.c b/tools/libxc/xc_cpuid_x86.c
index 0e79812..810377c 100644
--- a/tools/libxc/xc_cpuid_x86.c
+++ b/tools/libxc/xc_cpuid_x86.c
@@ -380,6 +380,11 @@ static void intel_xc_cpuid_policy(xc_interface *xch,
 }
 }
 
+#define X86_XCR0_X87 (1ULL <<  0)
+#define X86_XCR0_SSE (1ULL <<  1)
+#define X86_XCR0_AVX (1ULL <<  2)
+#define X86_XCR0_LWP (1ULL << 62)
+
+#define XSAVEOPT (1 << 0)
 #define XSAVEC  (1 << 1)
 #define XGETBV1 (1 << 2)
@@ -389,34 +394,53 @@ static void xc_cpuid_config_xsave(xc_interface *xch,
   const struct cpuid_domain_info *info,
   const unsigned int *input, unsigned int 
*regs)
 {
-if ( info->xfeature_mask == 0 )
+uint64_t guest_xfeature_mask;
+
+if ( info->xfeature_mask == 0 ||
+ !test_bit(X86_FEATURE_XSAVE, info->featureset) )
 {
 regs[0] = regs[1] = regs[2] = regs[3] = 0;
 return;
 }
 
+guest_xfeature_mask = X86_XCR0_SSE | X86_XCR0_X87;
+
+if ( test_bit(X86_FEATURE_AVX, info->featureset) )
+guest_xfeature_mask |= X86_XCR0_AVX;
+
+if ( test_bit(X86_FEATURE_LWP, info->featureset) )
+guest_xfeature_mask |= X86_XCR0_LWP;
+
+/*
+ * Clamp to host mask.  Should be no-op, as guest_xfeature_mask should not
+ * be able to be calculated as larger than info->xfeature_mask.
+ *
+ * TODO - see about making this a harder error.
+ */
+guest_xfeature_mask &= info->xfeature_mask;
+
 switch ( input[1] )
 {
-case 0: 
+case 0:
 /* EAX: low 32bits of xfeature_enabled_mask */
-regs[0] = info->xfeature_mask & 0xffffffff;
+regs[0] = guest_xfeature_mask & 0xffffffff;
 /* EDX: high 32bits of xfeature_enabled_mask */
-regs[3] = (info->xfeature_mask >> 32) & 0xffffffff;
+regs[3] = (guest_xfeature_mask >> 32) & 0xffffffff;
 /* ECX: max size required by all HW features */
 {
 unsigned int _input[2] = {0xd, 0x0}, _regs[4];
 regs[2] = 0;
-for ( _input[1] = 2; _input[1] < 64; _input[1]++ )
+for ( _input[1] = 2; _input[1] <= 62; _input[1]++ )
 {
 cpuid(_input, _regs);
 if ( (_regs[0] + _regs[1]) > regs[2] )
 regs[2] = _regs[0] + _regs[1];
 }
 }
-/* EBX: max size required by enabled features. 
- * This register contains a dynamic value, which varies when a guest 
- * enables or disables XSTATE features (via xsetbv). The default size 
- * after reset is 576. */ 
+/* EBX: max size required by enabled features.
+ * This register contains a dynamic value, which varies when a guest
+ * enables or disables XSTATE features (via xsetbv). The default size
+ * after reset is 576. */
 regs[1] = 512 + 64; /* FP/SSE + XSAVE.HEADER */
 break;
 case 1: /* leaf 1 */
@@ -424,7 +448,7 @@ static void xc_cpuid_config_xsave(xc_interface *xch,
 regs[1] = regs[2] = regs[3] = 0;
 break;
 case 2 ... 63: /* sub-leaves */
-if ( !(info->xfeature_mask & (1ULL << input[1])) )
+if ( !(guest_xfeature_mask & (1ULL << input[1])) )
 {
 regs[0] = regs[1] = regs[2] = regs[3] = 0;
 break;
-- 
2.1.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v2 27/30] tools: Utility for dealing with featuresets

2016-02-05 Thread Andrew Cooper
It is able to report the current featuresets (both the static masks and
dynamic featuresets from Xen), or to decode an arbitrary featureset into
`/proc/cpuinfo`-style strings.

Signed-off-by: Andrew Cooper 
---
CC: Ian Campbell 
CC: Ian Jackson 
CC: Wei Liu 

v2: No linking hackery
---
 .gitignore |   1 +
 tools/misc/Makefile|   4 +
 tools/misc/xen-cpuid.c | 394 +
 3 files changed, 399 insertions(+)
 create mode 100644 tools/misc/xen-cpuid.c

diff --git a/.gitignore b/.gitignore
index b40453e..20ffa2d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -179,6 +179,7 @@ tools/misc/cpuperf/cpuperf-perfcntr
 tools/misc/cpuperf/cpuperf-xen
 tools/misc/xc_shadow
 tools/misc/xen_cpuperf
+tools/misc/xen-cpuid
 tools/misc/xen-detect
 tools/misc/xen-tmem-list-parse
 tools/misc/xenperf
diff --git a/tools/misc/Makefile b/tools/misc/Makefile
index a2ef0ec..a94dad9 100644
--- a/tools/misc/Makefile
+++ b/tools/misc/Makefile
@@ -10,6 +10,7 @@ CFLAGS += $(CFLAGS_xeninclude)
 CFLAGS += $(CFLAGS_libxenstore)
 
 # Everything to be installed in regular bin/
+INSTALL_BIN-$(CONFIG_X86)  += xen-cpuid
 INSTALL_BIN-$(CONFIG_X86)  += xen-detect
 INSTALL_BIN+= xencons
 INSTALL_BIN+= xencov_split
@@ -68,6 +69,9 @@ clean:
 .PHONY: distclean
 distclean: clean
 
+xen-cpuid: xen-cpuid.o
+   $(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(LDLIBS_libxenguest) 
$(APPEND_LDFLAGS)
+
 xen-hvmctx: xen-hvmctx.o
$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
 
diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c
new file mode 100644
index 000..d0f2a5c
--- /dev/null
+++ b/tools/misc/xen-cpuid.c
@@ -0,0 +1,394 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#define ARRAY_SIZE(a) (sizeof a / sizeof *a)
+static uint32_t nr_features;
+
+static const char *str_1d[32] =
+{
+[ 0] = "fpu",  [ 1] = "vme",
+[ 2] = "de",   [ 3] = "pse",
+[ 4] = "tsc",  [ 5] = "msr",
+[ 6] = "pae",  [ 7] = "mce",
+[ 8] = "cx8",  [ 9] = "apic",
+[10] = "REZ",  [11] = "sysenter",
+[12] = "mtrr", [13] = "pge",
+[14] = "mca",  [15] = "cmov",
+[16] = "pat",  [17] = "pse36",
+[18] = "psn",  [19] = "clflsh",
+[20] = "REZ",  [21] = "ds",
+[22] = "acpi", [23] = "mmx",
+[24] = "fxsr", [25] = "sse",
+[26] = "sse2", [27] = "ss",
+[28] = "htt",  [29] = "tm",
+[30] = "ia64", [31] = "pbe",
+};
+
+static const char *str_1c[32] =
+{
+[ 0] = "sse3",[ 1] = "pclmulqdq",
+[ 2] = "dtes64",  [ 3] = "monitor",
+[ 4] = "ds-cpl",  [ 5] = "vmx",
+[ 6] = "smx", [ 7] = "est",
+[ 8] = "tm2", [ 9] = "ssse3",
+[10] = "cntx-id", [11] = "sdgb",
+[12] = "fma", [13] = "cx16",
+[14] = "xtpr",[15] = "pdcm",
+[16] = "REZ", [17] = "pcid",
+[18] = "dca", [19] = "sse41",
+[20] = "sse42",   [21] = "x2apic",
+[22] = "movebe",  [23] = "popcnt",
+[24] = "tsc-dl",  [25] = "aes",
+[26] = "xsave",   [27] = "osxsave",
+[28] = "avx", [29] = "f16c",
+[30] = "rdrnd",   [31] = "hyper",
+};
+
+static const char *str_e1d[32] =
+{
+[ 0] = "fpu",[ 1] = "vme",
+[ 2] = "de", [ 3] = "pse",
+[ 4] = "tsc",[ 5] = "msr",
+[ 6] = "pae",[ 7] = "mce",
+[ 8] = "cx8",[ 9] = "apic",
+[10] = "REZ",[11] = "syscall",
+[12] = "mtrr",   [13] = "pge",
+[14] = "mca",[15] = "cmov",
+[16] = "fcmov",  [17] = "pse36",
+[18] = "REZ",[19] = "mp",
+[20] = "nx", [21] = "REZ",
+[22] = "mmx+",   [23] = "mmx",
+[24] = "fxsr",   [25] = "fxsr+",
+[26] = "pg1g",   [27] = "rdtscp",
+[28] = "REZ",[29] = "lm",
+[30] = "3dnow+", [31] = "3dnow",
+};
+
+static const char *str_e1c[32] =
+{
+[ 0] = "lahf_lm",[ 1] = "cmp",
+[ 2] = "svm",[ 3] = "extapic",
+[ 4] = "cr8d",   [ 5] = "lzcnt",
+[ 6] = "sse4a",  [ 7] = "msse",
+[ 8] = "3dnowpf",[ 9] = "osvw",
+[10] = "ibs",[11] = "xop",
+[12] = "skinit", [13] = "wdt",
+[14] = "REZ",[15] = "lwp",
+[16] = "fma4",   [17] = "tce",
+[18] = "REZ",[19] = "nodeid",
+[20] = "REZ",[21] = "tbm",
+[22] = "topoext",[23] = "perfctr_core",
+[24] = "perfctr_nb", [25] = "REZ",
+[26] = "dbx",[27] = "perftsc",
+[28] = "pcx_l2i",[29] = "monitorx",
+
+[30 ... 31] = "REZ",
+};
+
+static const char *str_7b0[32] =
+{
+[ 0] = "fsgsbase", [ 1] = "tsc-adj",
+[ 2] = "sgx",  [ 3] = "bmi1",
+[ 4] = "hle",  [ 5] = "avx2",
+[ 6] = "REZ",  [ 7] = "smep",
+[ 8] = "bmi2", [ 9] = "erms",
+[10] = "invpcid",  [11] = "rtm",
+[12] = "pqm",  [13] = "depfpp",
+[14] = "mpx",  [15] = "pqe",
+[16] = "avx512f",  [17] = "avx512dq",
+[18] = "rdseed",   [19] = "adx",
+[20] = "smap", [21] = "avx512ifma",
+[22] = "pcomit",   [23] = "clfl

[Xen-devel] [PATCH v2 25/30] tools/libxc: Use public/featureset.h for cpuid policy generation

2016-02-05 Thread Andrew Cooper
Rather than having a different local copy of some of the feature
definitions.

Modify the xc_cpuid_x86.c cpumask helpers to appropriately truncate the
new values.

Signed-off-by: Andrew Cooper 
---
CC: Ian Campbell 
CC: Ian Jackson 
CC: Wei Liu 
---
 tools/libxc/xc_cpufeature.h | 147 
 tools/libxc/xc_cpuid_x86.c  |   8 +--
 2 files changed, 4 insertions(+), 151 deletions(-)
 delete mode 100644 tools/libxc/xc_cpufeature.h

diff --git a/tools/libxc/xc_cpufeature.h b/tools/libxc/xc_cpufeature.h
deleted file mode 100644
index ee53679..000
--- a/tools/libxc/xc_cpufeature.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation;
- * version 2.1 of the License.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __LIBXC_CPUFEATURE_H
-#define __LIBXC_CPUFEATURE_H
-
-/* Intel-defined CPU features, CPUID level 0x0001 (edx) */
-#define X86_FEATURE_FPU  0 /* Onboard FPU */
-#define X86_FEATURE_VME  1 /* Virtual Mode Extensions */
-#define X86_FEATURE_DE   2 /* Debugging Extensions */
-#define X86_FEATURE_PSE  3 /* Page Size Extensions */
-#define X86_FEATURE_TSC  4 /* Time Stamp Counter */
-#define X86_FEATURE_MSR  5 /* Model-Specific Registers, RDMSR, WRMSR */
-#define X86_FEATURE_PAE  6 /* Physical Address Extensions */
-#define X86_FEATURE_MCE  7 /* Machine Check Architecture */
-#define X86_FEATURE_CX8  8 /* CMPXCHG8 instruction */
-#define X86_FEATURE_APIC 9 /* Onboard APIC */
-#define X86_FEATURE_SEP 11 /* SYSENTER/SYSEXIT */
-#define X86_FEATURE_MTRR12 /* Memory Type Range Registers */
-#define X86_FEATURE_PGE 13 /* Page Global Enable */
-#define X86_FEATURE_MCA 14 /* Machine Check Architecture */
-#define X86_FEATURE_CMOV15 /* CMOV instruction */
-#define X86_FEATURE_PAT 16 /* Page Attribute Table */
-#define X86_FEATURE_PSE36   17 /* 36-bit PSEs */
-#define X86_FEATURE_PN  18 /* Processor serial number */
-#define X86_FEATURE_CLFLSH  19 /* Supports the CLFLUSH instruction */
-#define X86_FEATURE_DS  21 /* Debug Store */
-#define X86_FEATURE_ACPI22 /* ACPI via MSR */
-#define X86_FEATURE_MMX 23 /* Multimedia Extensions */
-#define X86_FEATURE_FXSR24 /* FXSAVE and FXRSTOR instructions */
-#define X86_FEATURE_XMM 25 /* Streaming SIMD Extensions */
-#define X86_FEATURE_XMM226 /* Streaming SIMD Extensions-2 */
-#define X86_FEATURE_SELFSNOOP   27 /* CPU self snoop */
-#define X86_FEATURE_HT  28 /* Hyper-Threading */
-#define X86_FEATURE_ACC 29 /* Automatic clock control */
-#define X86_FEATURE_IA6430 /* IA-64 processor */
-#define X86_FEATURE_PBE 31 /* Pending Break Enable */
-
-/* AMD-defined CPU features, CPUID level 0x8001 */
-/* Don't duplicate feature flags which are redundant with Intel! */
-#define X86_FEATURE_SYSCALL 11 /* SYSCALL/SYSRET */
-#define X86_FEATURE_MP  19 /* MP Capable. */
-#define X86_FEATURE_NX  20 /* Execute Disable */
-#define X86_FEATURE_MMXEXT  22 /* AMD MMX extensions */
-#define X86_FEATURE_FFXSR   25 /* FFXSR instruction optimizations */
-#define X86_FEATURE_PAGE1GB 26 /* 1Gb large page support */
-#define X86_FEATURE_RDTSCP  27 /* RDTSCP */
-#define X86_FEATURE_LM  29 /* Long Mode (x86-64) */
-#define X86_FEATURE_3DNOWEXT30 /* AMD 3DNow! extensions */
-#define X86_FEATURE_3DNOW   31 /* 3DNow! */
-
-/* Intel-defined CPU features, CPUID level 0x0001 (ecx) */
-#define X86_FEATURE_XMM3 0 /* Streaming SIMD Extensions-3 */
-#define X86_FEATURE_PCLMULQDQ1 /* Carry-less multiplication */
-#define X86_FEATURE_DTES64   2 /* 64-bit Debug Store */
-#define X86_FEATURE_MWAIT3 /* Monitor/Mwait support */
-#define X86_FEATURE_DSCPL4 /* CPL Qualified Debug Store */
-#define X86_FEATURE_VMXE 5 /* Virtual Machine Extensions */
-#define X86_FEATURE_SMXE 6 /* Safer Mode Extensions */
-#define X86_FEATURE_EST  7 /* Enhanced SpeedStep */
-#define X86_FEATURE_TM2  8 /* Thermal Monitor 2 */
-#define X86_FEATURE_SSSE39 /* Supplemental Streaming SIMD Exts-3 */
-#define X86_FEATURE_CID 10 /* Context ID */
-#define X86_FEATURE_FMA 12 /* Fused Multiply Add */
-#define X86_FEATURE_CX1613 /* CMPXCHG16B */
-#define X86_FEATURE_XTPR14 /* Send Task Prio

[Xen-devel] [PATCH v2 24/30] tools/libxc: Modify bitmap operations to take void pointers

2016-02-05 Thread Andrew Cooper
The type of the pointer to a bitmap is not interesting; it does not affect the
representation of the block of bits being pointed to.

Make the libxc functions consistent with those in Xen, so they can work just
as well with 'unsigned int *' based bitmaps.

Signed-off-by: Andrew Cooper 
---
CC: Ian Campbell 
CC: Ian Jackson 
CC: Wei Liu 

New in v2
---
 tools/libxc/xc_bitops.h | 21 +
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/tools/libxc/xc_bitops.h b/tools/libxc/xc_bitops.h
index cd749f4..2a1710f 100644
--- a/tools/libxc/xc_bitops.h
+++ b/tools/libxc/xc_bitops.h
@@ -26,48 +26,53 @@ static inline unsigned long *bitmap_alloc(int nr_bits)
 return calloc(1, bitmap_size(nr_bits));
 }
 
-static inline void bitmap_set(unsigned long *addr, int nr_bits)
+static inline void bitmap_set(void *addr, int nr_bits)
 {
 memset(addr, 0xff, bitmap_size(nr_bits));
 }
 
-static inline void bitmap_clear(unsigned long *addr, int nr_bits)
+static inline void bitmap_clear(void *addr, int nr_bits)
 {
 memset(addr, 0, bitmap_size(nr_bits));
 }
 
-static inline int test_bit(int nr, unsigned long *addr)
+static inline int test_bit(int nr, const void *_addr)
 {
+const unsigned long *addr = _addr;
 return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
 }
 
-static inline void clear_bit(int nr, unsigned long *addr)
+static inline void clear_bit(int nr, void *_addr)
 {
+unsigned long *addr = _addr;
 BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
 }
 
-static inline void set_bit(int nr, unsigned long *addr)
+static inline void set_bit(int nr, void *_addr)
 {
+unsigned long *addr = _addr;
 BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
 }
 
-static inline int test_and_clear_bit(int nr, unsigned long *addr)
+static inline int test_and_clear_bit(int nr, void *addr)
 {
 int oldbit = test_bit(nr, addr);
 clear_bit(nr, addr);
 return oldbit;
 }
 
-static inline int test_and_set_bit(int nr, unsigned long *addr)
+static inline int test_and_set_bit(int nr, void *addr)
 {
 int oldbit = test_bit(nr, addr);
 set_bit(nr, addr);
 return oldbit;
 }
 
-static inline void bitmap_or(unsigned long *dst, const unsigned long *other,
+static inline void bitmap_or(void *_dst, const void *_other,
  int nr_bits)
 {
+unsigned long *dst = _dst;
+const unsigned long *other = _other;
 int i, nr_longs = (bitmap_size(nr_bits) / sizeof(unsigned long));
 for ( i = 0; i < nr_longs; ++i )
 dst[i] |= other[i];
-- 
2.1.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v2 12/30] xen/x86: Generate deep dependencies of features

2016-02-05 Thread Andrew Cooper
Some features depend on other features.  Working out and maintaining the exact
dependency tree is complicated, so it is expressed in the automatic generation
script, and flattened for faster runtime use.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 

For all intents and purposes, new in v2.
---
 xen/arch/x86/cpuid.c| 54 +
 xen/include/asm-x86/cpuid.h |  2 ++
 xen/tools/gen-cpuid.py  | 73 -
 3 files changed, 128 insertions(+), 1 deletion(-)

diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
index 1af0e6c..25dcd0e 100644
--- a/xen/arch/x86/cpuid.c
+++ b/xen/arch/x86/cpuid.c
@@ -12,6 +12,7 @@ const uint32_t inverted_features[] = INIT_INVERTED_FEATURES;
 static const uint32_t pv_featuremask[] = INIT_PV_FEATURES;
 static const uint32_t hvm_shadow_featuremask[] = INIT_HVM_SHADOW_FEATURES;
 static const uint32_t hvm_hap_featuremask[] = INIT_HVM_HAP_FEATURES;
+static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
 
 uint32_t __read_mostly raw_featureset[FSCAPINTS];
 uint32_t __read_mostly host_featureset[FSCAPINTS];
@@ -20,12 +21,34 @@ uint32_t __read_mostly hvm_featureset[FSCAPINTS];
 
 static void sanitise_featureset(uint32_t *fs)
 {
+uint32_t disabled_features[FSCAPINTS];
 unsigned int i;
 
 for ( i = 0; i < FSCAPINTS; ++i )
 {
 /* Clamp to known mask. */
 fs[i] &= known_features[i];
+
+/*
+ * Identify which features with deep dependencies have been
+ * disabled.
+ */
+disabled_features[i] = ~fs[i] & deep_features[i];
+}
+
+for_each_set_bit(i, (void *)disabled_features,
+ sizeof(disabled_features) * 8)
+{
+const uint32_t *dfs = lookup_deep_deps(i);
+unsigned int j;
+
+ASSERT(dfs); /* deep_features[] should guarantee this. */
+
+for ( j = 0; j < FSCAPINTS; ++j )
+{
+fs[j] &= ~dfs[j];
+disabled_features[j] &= ~dfs[j];
+}
 }
 
 switch ( boot_cpu_data.x86_vendor )
@@ -153,6 +176,36 @@ void calculate_featuresets(void)
 calculate_hvm_featureset();
 }
 
+const uint32_t *lookup_deep_deps(uint32_t feature)
+{
+static const struct {
+uint32_t feature;
+uint32_t fs[FSCAPINTS];
+} deep_deps[] = INIT_DEEP_DEPS;
+unsigned int start = 0, end = ARRAY_SIZE(deep_deps);
+
+BUILD_BUG_ON(ARRAY_SIZE(deep_deps) != NR_DEEP_DEPS);
+
+/* Fast early exit. */
+if ( !test_bit(feature, deep_features) )
+return NULL;
+
+/* deep_deps[] is sorted.  Perform a binary search. */
+while ( start < end )
+{
+unsigned int mid = start + ((end - start) / 2);
+
+if ( deep_deps[mid].feature > feature )
+end = mid;
+else if ( deep_deps[mid].feature < feature )
+start = mid + 1;
+else
+return deep_deps[mid].fs;
+}
+
+return NULL;
+}
+
 static void __maybe_unused build_assertions(void)
 {
 BUILD_BUG_ON(ARRAY_SIZE(known_features) != FSCAPINTS);
@@ -160,6 +213,7 @@ static void __maybe_unused build_assertions(void)
 BUILD_BUG_ON(ARRAY_SIZE(pv_featuremask) != FSCAPINTS);
 BUILD_BUG_ON(ARRAY_SIZE(hvm_shadow_featuremask) != FSCAPINTS);
 BUILD_BUG_ON(ARRAY_SIZE(hvm_hap_featuremask) != FSCAPINTS);
+BUILD_BUG_ON(ARRAY_SIZE(deep_features) != FSCAPINTS);
 }
 
 /*
diff --git a/xen/include/asm-x86/cpuid.h b/xen/include/asm-x86/cpuid.h
index 18ba95b..cd7fa90 100644
--- a/xen/include/asm-x86/cpuid.h
+++ b/xen/include/asm-x86/cpuid.h
@@ -28,6 +28,8 @@ extern uint32_t hvm_featureset[FSCAPINTS];
 
 void calculate_featuresets(void);
 
+const uint32_t *lookup_deep_deps(uint32_t feature);
+
 #endif /* __ASSEMBLY__ */
 #endif /* !__X86_CPUID_H__ */
 
diff --git a/xen/tools/gen-cpuid.py b/xen/tools/gen-cpuid.py
index 5f0f892..c44f124 100755
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -138,6 +138,61 @@ def crunch_numbers(state):
 state.hvm_shadow = featureset_to_uint32s(state.raw_hvm_shadow, nr_entries)
 state.hvm_hap = featureset_to_uint32s(state.raw_hvm_hap, nr_entries)
 
+deps = {
+XSAVE:
+(XSAVEOPT, XSAVEC, XGETBV1, XSAVES, AVX, MPX),
+
+AVX:
+(FMA, FMA4, F16C, AVX2, XOP),
+
+PAE:
+(LM, ),
+
+LM:
+(CX16, LAHF_LM, PAGE1GB),
+
+XMM:
+(LM, ),
+
+XMM2:
+(LM, ),
+
+XMM3:
+(LM, ),
+
+APIC:
+(X2APIC, ),
+
+PSE:
+(PSE36, ),
+}
+
+deep_features = tuple(sorted(deps.keys()))
+state.deep_deps = {}
+
+for feat in deep_features:
+
+seen = [feat]
+to_process = list(deps[feat])
+
+while len(to_process):
+f = to_process.pop(0)
+
+if f in seen:
+raise Fail("ERROR: Cycle found with %s when processing %s"
+   % (state.names[f], state.names[feat]))
+
+seen.append(f)
+ 

[Xen-devel] [PATCH v2 16/30] x86/cpu: Move set_cpumask() calls into c_early_init()

2016-02-05 Thread Andrew Cooper
Before c/s 44e24f8567 "x86: don't call generic_identify() redundantly", the
commandline-provided masks would take effect in Xen's view of the features.

As the masks got applied after the query for features, the redundant call to
generic_identify() would clobber the pre-masking feature information with the
post-masking information.

Move the set_cpumask() calls into c_early_init() so their effects take place
before the main query for features in generic_identify().

The cpuid_mask_* command line parameters now limit the entire system, a
feature XenServer was relying on for testing purposes.  Subsequent changes
will cause the mask MSRs to be context switched per-domain, removing the need
to use the command line parameters for heterogeneous levelling purposes.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
---
 xen/arch/x86/cpu/amd.c   |  8 ++--
 xen/arch/x86/cpu/intel.c | 34 +-
 2 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index f9dc532..5908cba 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -407,6 +407,11 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
  c->cpu_core_id);
 }
 
+static void early_init_amd(struct cpuinfo_x86 *c)
+{
+   set_cpuidmask(c);
+}
+
 static void init_amd(struct cpuinfo_x86 *c)
 {
u32 l, h;
@@ -595,14 +600,13 @@ static void init_amd(struct cpuinfo_x86 *c)
if ((smp_processor_id() == 1) && !cpu_has(c, X86_FEATURE_ITSC))
disable_c1_ramping();
 
-   set_cpuidmask(c);
-
check_syscfg_dram_mod_en();
 }
 
 static const struct cpu_dev amd_cpu_dev = {
.c_vendor   = "AMD",
.c_ident= { "AuthenticAMD" },
+   .c_early_init   = early_init_amd,
.c_init = init_amd,
 };
 
diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c
index bdf89f6..ad22375 100644
--- a/xen/arch/x86/cpu/intel.c
+++ b/xen/arch/x86/cpu/intel.c
@@ -189,6 +189,23 @@ static void early_init_intel(struct cpuinfo_x86 *c)
if (boot_cpu_data.x86 == 0xF && boot_cpu_data.x86_model == 3 &&
(boot_cpu_data.x86_mask == 3 || boot_cpu_data.x86_mask == 4))
paddr_bits = 36;
+
+   if (c == &boot_cpu_data && c->x86 == 6) {
+   if (probe_intel_cpuid_faulting())
+   __set_bit(X86_FEATURE_CPUID_FAULTING,
+ c->x86_capability);
+   } else if (boot_cpu_has(X86_FEATURE_CPUID_FAULTING)) {
+   BUG_ON(!probe_intel_cpuid_faulting());
+   __set_bit(X86_FEATURE_CPUID_FAULTING, c->x86_capability);
+   }
+
+   if (!cpu_has_cpuid_faulting)
+   set_cpuidmask(c);
+   else if ((c == &boot_cpu_data) &&
+(~(opt_cpuid_mask_ecx & opt_cpuid_mask_edx &
+   opt_cpuid_mask_ext_ecx & opt_cpuid_mask_ext_edx &
+   opt_cpuid_mask_xsave_eax)))
+   printk("No CPUID feature masking support available\n");
 }
 
 /*
@@ -258,23 +275,6 @@ static void init_intel(struct cpuinfo_x86 *c)
detect_ht(c);
}
 
-   if (c == &boot_cpu_data && c->x86 == 6) {
-   if (probe_intel_cpuid_faulting())
-   __set_bit(X86_FEATURE_CPUID_FAULTING,
- c->x86_capability);
-   } else if (boot_cpu_has(X86_FEATURE_CPUID_FAULTING)) {
-   BUG_ON(!probe_intel_cpuid_faulting());
-   __set_bit(X86_FEATURE_CPUID_FAULTING, c->x86_capability);
-   }
-
-   if (!cpu_has_cpuid_faulting)
-   set_cpuidmask(c);
-   else if ((c == &boot_cpu_data) &&
-(~(opt_cpuid_mask_ecx & opt_cpuid_mask_edx &
-   opt_cpuid_mask_ext_ecx & opt_cpuid_mask_ext_edx &
-   opt_cpuid_mask_xsave_eax)))
-   printk("No CPUID feature masking support available\n");
-
/* Work around errata */
Intel_errata_workarounds(c);
 
-- 
2.1.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


[Xen-devel] [PATCH v2 20/30] x86/cpu: Context switch cpuid masks and faulting state in context_switch()

2016-02-05 Thread Andrew Cooper
A single ctxt_switch_levelling() function pointer is provided
(defaulting to an empty nop), which is overridden in the appropriate
$VENDOR_init_levelling().

set_cpuid_faulting() is made private and included within
intel_ctxt_switch_levelling().

One functional change is that the faulting configuration is no longer special
cased for dom0.  There was never any need to, and it will cause dom0 to
observe the same information through native and enlightened cpuid.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 

v2:
 * Style fixes
 * ASSERT() that faulting is available in set_cpuid_faulting()
---
 xen/arch/x86/cpu/amd.c  |  3 +++
 xen/arch/x86/cpu/common.c   |  7 +++
 xen/arch/x86/cpu/intel.c| 20 +++-
 xen/arch/x86/domain.c   |  4 +---
 xen/include/asm-x86/processor.h |  2 +-
 5 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
index 1708dd9..9d162bc 100644
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -317,6 +317,9 @@ static void __init noinline amd_init_levelling(void)
   (uint32_t)cpuidmask_defaults._7ab0,
   (uint32_t)cpuidmask_defaults._6c);
}
+
+   if (levelling_caps)
+   ctxt_switch_levelling = amd_ctxt_switch_levelling;
 }
 
 /*
diff --git a/xen/arch/x86/cpu/common.c b/xen/arch/x86/cpu/common.c
index 3fdae96..dc2442b 100644
--- a/xen/arch/x86/cpu/common.c
+++ b/xen/arch/x86/cpu/common.c
@@ -90,6 +90,13 @@ static const struct cpu_dev default_cpu = {
 };
 static const struct cpu_dev *this_cpu = &default_cpu;
 
+static void default_ctxt_switch_levelling(const struct domain *nextd)
+{
+   /* Nop */
+}
+void (* __read_mostly ctxt_switch_levelling)(const struct domain *nextd) =
+   default_ctxt_switch_levelling;
+
 bool_t opt_cpu_info;
 boolean_param("cpuinfo", opt_cpu_info);
 
diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c
index 143f497..95d44dd 100644
--- a/xen/arch/x86/cpu/intel.c
+++ b/xen/arch/x86/cpu/intel.c
@@ -32,13 +32,15 @@ static bool_t __init probe_intel_cpuid_faulting(void)
return 1;
 }
 
-static DEFINE_PER_CPU(bool_t, cpuid_faulting_enabled);
-void set_cpuid_faulting(bool_t enable)
+static void set_cpuid_faulting(bool_t enable)
 {
+   static DEFINE_PER_CPU(bool_t, cpuid_faulting_enabled);
+   bool_t *this_enabled = &this_cpu(cpuid_faulting_enabled);
uint32_t hi, lo;
 
-   if (!cpu_has_cpuid_faulting ||
-   this_cpu(cpuid_faulting_enabled) == enable )
+   ASSERT(cpu_has_cpuid_faulting);
+
+   if (*this_enabled == enable)
return;
 
rdmsr(MSR_INTEL_MISC_FEATURES_ENABLES, lo, hi);
@@ -47,7 +49,7 @@ void set_cpuid_faulting(bool_t enable)
lo |= MSR_MISC_FEATURES_CPUID_FAULTING;
wrmsr(MSR_INTEL_MISC_FEATURES_ENABLES, lo, hi);
 
-   this_cpu(cpuid_faulting_enabled) = enable;
+   *this_enabled = enable;
 }
 
 /*
@@ -151,6 +153,11 @@ static void intel_ctxt_switch_levelling(const struct 
domain *nextd)
struct cpuidmasks *these_masks = &this_cpu(cpuidmasks);
const struct cpuidmasks *masks = &cpuidmask_defaults;
 
+   if (cpu_has_cpuid_faulting) {
+   set_cpuid_faulting(nextd && is_pv_domain(nextd));
+   return;
+   }
+
 #define LAZY(msr, field)   \
({  \
if (msr && (these_masks->field != masks->field))\
@@ -221,6 +228,9 @@ static void __init noinline intel_init_levelling(void)
   (uint32_t)cpuidmask_defaults.e1cd,
   (uint32_t)cpuidmask_defaults.Da1);
}
+
+   if (levelling_caps)
+   ctxt_switch_levelling = intel_ctxt_switch_levelling;
 }
 
 static void early_init_intel(struct cpuinfo_x86 *c)
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 8f2c0b6..dbce90f 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2079,9 +2079,7 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
 load_segments(next);
 }
 
-set_cpuid_faulting(is_pv_domain(nextd) &&
-   !is_control_domain(nextd) &&
-   !is_hardware_domain(nextd));
+ctxt_switch_levelling(nextd);
 }
 
 context_saved(prev);
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 09e82d8..12b6e25 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -210,7 +210,7 @@ extern struct cpuinfo_x86 boot_cpu_data;
 extern struct cpuinfo_x86 cpu_data[];
 #define current_cpu_data cpu_data[smp_processor_id()]
 
-extern void set_cpuid_faulting(bool_t enable);
+extern void (*ctxt_switch_levelling)(const struct domain *nextd);
 
 extern u64 host_pat;
 extern bool_t opt_cpu_info;
-- 
2.1.4


___
X

[Xen-devel] [PATCH v2 22/30] x86/domctl: Update PV domain cpumasks when setting cpuid policy

2016-02-05 Thread Andrew Cooper
This allows PV domains with different featuresets to observe different values
from a native cpuid instruction, on supporting hardware.

Signed-off-by: Andrew Cooper 
Reviewed-by: Jan Beulich 
---
v2:
 * Use switch() rather than if/elseif chain
 * Clamp to static PV featuremask
---
 xen/arch/x86/domctl.c | 88 +++
 1 file changed, 88 insertions(+)

diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index 55aecdc..f06bc02 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static int gdbsx_guest_mem_io(domid_t domid, struct xen_domctl_gdbsx_memio 
*iop)
 {
@@ -87,6 +88,93 @@ static void update_domain_cpuid_info(struct domain *d,
 d->arch.x86_model = (ctl->eax >> 4) & 0xf;
 if ( d->arch.x86 >= 0x6 )
 d->arch.x86_model |= (ctl->eax >> 12) & 0xf0;
+
+if ( is_pv_domain(d) )
+{
+uint64_t mask = cpuidmask_defaults._1cd;
+uint32_t ecx = ctl->ecx & pv_featureset[FEATURESET_1c];
+uint32_t edx = ctl->edx & pv_featureset[FEATURESET_1d];
+
+switch ( boot_cpu_data.x86_vendor )
+{
+case X86_VENDOR_INTEL:
+mask &= ((uint64_t)edx << 32) | ecx;
+break;
+
+case X86_VENDOR_AMD:
+mask &= ((uint64_t)ecx << 32) | edx;
+break;
+}
+
+d->arch.pv_domain.cpuidmasks->_1cd = mask;
+}
+break;
+
+case 6:
+if ( is_pv_domain(d) )
+{
+uint64_t mask = cpuidmask_defaults._6c;
+
+if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
+mask &= (~0ULL << 32) | ctl->ecx;
+
+d->arch.pv_domain.cpuidmasks->_6c = mask;
+}
+break;
+
+case 7:
+if ( ctl->input[1] != 0 )
+break;
+
+if ( is_pv_domain(d) )
+{
+uint64_t mask = cpuidmask_defaults._7ab0;
+uint32_t eax = ctl->eax;
+uint32_t ebx = ctl->ebx & pv_featureset[FEATURESET_7b0];
+
+if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
+mask &= ((uint64_t)eax << 32) | ebx;
+
+d->arch.pv_domain.cpuidmasks->_7ab0 = mask;
+}
+break;
+
+case 0xd:
+if ( ctl->input[1] != 1 )
+break;
+
+if ( is_pv_domain(d) )
+{
+uint64_t mask = cpuidmask_defaults.Da1;
+uint32_t eax = ctl->eax & pv_featureset[FEATURESET_Da1];
+
+if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+mask &= (~0ULL << 32) | eax;
+
+d->arch.pv_domain.cpuidmasks->Da1 = mask;
+}
+break;
+
+case 0x80000001:
+if ( is_pv_domain(d) )
+{
+uint64_t mask = cpuidmask_defaults.e1cd;
+uint32_t ecx = ctl->ecx & pv_featureset[FEATURESET_e1c];
+uint32_t edx = ctl->edx & pv_featureset[FEATURESET_e1d];
+
+switch ( boot_cpu_data.x86_vendor )
+{
+case X86_VENDOR_INTEL:
+mask &= ((uint64_t)edx << 32) | ecx;
+break;
+
+case X86_VENDOR_AMD:
+mask &= ((uint64_t)ecx << 32) | edx;
+break;
+}
+
+d->arch.pv_domain.cpuidmasks->e1cd = mask;
+}
 break;
 }
 }
-- 
2.1.4


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [RESEND PATCH v4 09/10] vmx: Add VMX RDTSC(P) scaling support

2016-02-05 Thread Jan Beulich
>>> On 19.01.16 at 03:55,  wrote:
> @@ -2107,6 +2115,14 @@ const struct hvm_function_table * __init 
> start_vmx(void)
>   && cpu_has_vmx_secondary_exec_control )
>  vmx_function_table.pvh_supported = 1;
>  
> +if ( cpu_has_vmx_tsc_scaling )
> +{
> +vmx_function_table.default_tsc_scaling_ratio = 
> VMX_TSC_MULTIPLIER_DEFAULT;
> +vmx_function_table.max_tsc_scaling_ratio = VMX_TSC_MULTIPLIER_MAX;
> +vmx_function_table.tsc_scaling_ratio_frac_bits = 48;
> +vmx_function_table.setup_tsc_scaling = vmx_setup_tsc_scaling;
> +}

Same comments here as on the earlier patch - it indeed looks as if
tsc_scaling_ratio_frac_bits would be the ideal field to dynamically
initialize, as it being zero will not yield any bad behavior afaict.

Also please consider making all fields together a sub-structure
of struct hvm_function_table, such that the above would become

vmx_function_table.tsc_scaling.default_ratio = 
VMX_TSC_MULTIPLIER_DEFAULT;
vmx_function_table.tsc_scaling.max_ratio = VMX_TSC_MULTIPLIER_MAX;
vmx_function_table.tsc_scaling.ratio_frac_bits = 48;
vmx_function_table.tsc_scaling.setup = vmx_setup_tsc_scaling;

keeping everything nicely together.

> @@ -258,6 +259,9 @@ extern u64 vmx_ept_vpid_cap;
>  #define VMX_MISC_CR3_TARGET 0x01ff0000
>  #define VMX_MISC_VMWRITE_ALL0x2000
>  
> +#define VMX_TSC_MULTIPLIER_DEFAULT  0x0001000000000000ULL

Considering this and the respective SVM value - do we really
need the separate field in struct hvm_function_table? Both are
1ULL << tsc_scaling.ratio_frac_bits after all.

Jan


___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


Re: [Xen-devel] [PATCH v5 for Xen 4.7 2/4] libxc: enable per-VCPU parameter settings for RTDS scheduler

2016-02-05 Thread Wei Liu
On Thu, Feb 04, 2016 at 04:50:42PM -0600, Chong Li wrote:
> Add xc_sched_rtds_vcpu_get/set functions to interact with
> Xen to get/set a domain's per-VCPU parameters.
> 
> Signed-off-by: Chong Li 
> Signed-off-by: Meng Xu 
> Signed-off-by: Sisu Xi 

These looks like sensible wrappers. I will defer this patch to Dario. If
he's happy with this I will just ack it.

> ---
> Changes on PATCH v4:
> 1) Minor modifications on the function parameters.
> 
> Changes on PATCH v2:
> 1) Minor modifications due to the change of struct xen_domctl_scheduler_op.
> 
> CC: 
> CC: 
> CC: 
> CC: 
> CC: 
> CC: 
> ---
>  tools/libxc/include/xenctrl.h |  8 +++
>  tools/libxc/xc_rt.c   | 56 
> +++
>  2 files changed, 64 insertions(+)
> 
> diff --git a/tools/libxc/include/xenctrl.h b/tools/libxc/include/xenctrl.h
> index 01a6dda..db13434 100644
> --- a/tools/libxc/include/xenctrl.h
> +++ b/tools/libxc/include/xenctrl.h
> @@ -893,6 +893,14 @@ int xc_sched_rtds_domain_set(xc_interface *xch,
>  int xc_sched_rtds_domain_get(xc_interface *xch,
>  uint32_t domid,
>  struct xen_domctl_sched_rtds *sdom);
> +int xc_sched_rtds_vcpu_set(xc_interface *xch,
> +uint32_t domid,
> +struct xen_domctl_schedparam_vcpu *vcpus,
> +uint32_t num_vcpus);
> +int xc_sched_rtds_vcpu_get(xc_interface *xch,
> +uint32_t domid,
> +struct xen_domctl_schedparam_vcpu *vcpus,
> +uint32_t num_vcpus);
>  

Indentation looks wrong.

Wei.

___
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel


  1   2   3   >