On 03/31/2017 06:04 PM, Jan Beulich wrote:
>>>> On 31.03.17 at 17:01, <rcojoc...@bitdefender.com> wrote:
>> On 03/31/2017 05:46 PM, Jan Beulich wrote:
>>>>>> On 31.03.17 at 11:56, <rcojoc...@bitdefender.com> wrote:
>>>> On 03/31/2017 10:34 AM, Jan Beulich wrote:
>>>>>>>> On 31.03.17 at 08:17, <rcojoc...@bitdefender.com> wrote:
>>>>>> On 03/30/2017 06:47 PM, Jan Beulich wrote:
>>>>>>>> Speaking of emulated MMIO, I've got this when the guest was crashing
>>>>>>>> immediately (pre-RETRY-loop):
>>>>>>>>
>>>>>>>>  MMIO emulation failed: d3v8 32bit @ 0008:82679f3c -> f0 0f ba 30 00 72 07 8b cb e8 da 4b ff ff 8b 45
>>>>>>>
>>>>>>> That's a BTR, which we should be emulating fine. More information
>>>>>>> would need to be collected to have a chance to understand what
>>>>>>> might be going on (first of all, the virtual and physical memory
>>>>>>> address this was trying to act on).
>>>>>>
>>>>>> Right, the BTR part should be fine, but I think the LOCK part is what's
>>>>>> causing the issue. I've done a few more test runs to see what returns
>>>>>> RETRY (dumping the instruction with an "(r)" prefix to distinguish it
>>>>>> from the UNHANDLEABLE dump), and a couple of instructions return RETRY
>>>>>> (BTR and XADD, both LOCK-prefixed, which means they now go through the
>>>>>> CMPXCHG handler, which presumably fails - possibly simply because it's
>>>>>> always LOCKed in my patch):
>>>>>
>>>>> Well, all of that looks to be expected behavior. I'm afraid I don't see
>>>>> how this information helps understanding the MMIO emulation failure
>>>>> above.
>>>>
>>>> I've managed to obtain this log of emulation errors:
>>>> https://pastebin.com/Esy1SkHx 
>>>>
>>>> The "virtual address" lines that are not followed by any "Mem event"
>>>> line correspond to CMXCHG_FAILED return codes.
>>>>
>>>> The very last line is a MMIO emulation failed.
>>>>
>>>> It's probably important that this happens with the model where
>>>> hvm_emulate_one_vm_event() does _not_ re-try the emulation until it
>>>> succeeds. The other model allows me to go further with the guest, but
>>>> eventually I get timeout-related BSODs or the guest becomes unresponsive.
>>>
>>> Interesting. You didn't clarify what the printed "offset" values are,
>>> and it doesn't look like these have any correlation with the underlying
>>> (guest) physical address, which we would also want to see. And then
>>> it strikes me as odd that in these last lines
>>>
>>> (XEN) Mem event (RETRY) emulation failed: d5v8 32bit @ 0008:826bb861 -> f0 0f ba 30 00 72 07 8b cb e8 da 4b ff ff 8b 45
>>> (XEN) virtual address: 0xffd080f0, offset: 4291854576
>>> (XEN) MMIO emulation failed: d5v8 32bit @ 0008:82655f3c -> f0 0f ba 30 00 72 07 8b cb e8 da 4b ff ff 8b 45
>>>
>>> the instruction pointers and virtual addresses are different, but the
>>> code bytes are exactly the same. This doesn't seem very likely, so I
>>> wonder whether there's an issue with us wrongly re-using previously
>>> fetched insn bytes. (Of course I'd be happy to be proven wrong in this
>>> guessing, by you checking the involved binary/ies.)
>>
>> Offset is the actual value of the "offset" parameter of
>> hvmemul_cmpxchg().
> 
> That's not very useful then, as for flat segments "offset" ==
> "virtual address" (i.e. you merely re-print in decimal what you've
> already printed in hex).
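
(Indeed - in the log above 0xffd080f0 == 4291854576, so it's the same value
printed twice.)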

The attached patch (a combination of your patch and mine) produces the
following output when booting a Windows 7 32-bit guest with monitoring:
https://pastebin.com/ayiFmj1N

The failed MMIO emulation is caused by a mapping failure due to the
"!nestedhvm_vcpu_in_guestmode(curr) && hvm_mmio_internal(gpa)" condition
being true in hvmemul_vaddr_to_mfn(). I've lifted that check from
__hvm_copy(), but it looks like that might not be the right way to use it.
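
For reference, this is roughly how __hvm_copy() uses that check, as far as
I can tell (paraphrased from memory, so the exact return-code plumbing may
differ):

    /*
     * In __hvm_copy() the check only diverts internally-emulated MMIO
     * ranges (LAPIC, HPET) away from the RAM path; the callers then
     * fall back to the MMIO handlers.
     */
    if ( !nestedhvm_vcpu_in_guestmode(curr) && hvm_mmio_internal(gpa) )
        return HVMCOPY_bad_gfn_to_mfn;

In hvmemul_cmpxchg(), by contrast, returning X86EMUL_UNHANDLEABLE at that
point aborts the emulation outright instead of punting to the MMIO path,
which would explain the failure above.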


Thanks,
Razvan
diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index 2d92957..f6244af 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -20,6 +20,7 @@
 #include <asm/hvm/emulate.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/ioreq.h>
+#include <asm/hvm/nestedhvm.h>
 #include <asm/hvm/trace.h>
 #include <asm/hvm/support.h>
 #include <asm/hvm/svm/svm.h>
@@ -1029,6 +1030,86 @@ static int hvmemul_wbinvd_discard(
     return X86EMUL_OKAY;
 }
 
+static int hvmemul_vaddr_to_mfn(
+    unsigned long addr,
+    mfn_t *mfn,
+    uint32_t pfec,
+    struct x86_emulate_ctxt *ctxt)
+{
+    paddr_t gpa = addr & ~PAGE_MASK;
+    struct page_info *page;
+    p2m_type_t p2mt;
+    unsigned long gfn;
+    struct vcpu *curr = current;
+    struct hvm_emulate_ctxt *hvmemul_ctxt =
+        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+
+    gfn = paging_gva_to_gfn(curr, addr, &pfec);
+
+    printk("gfn: 0x%lx\n", gfn);
+
+    if ( gfn == gfn_x(INVALID_GFN) )
+    {
+        pagefault_info_t pfinfo = {};
+
+        if ( ( pfec & PFEC_page_paged ) || ( pfec & PFEC_page_shared ) )
+            return X86EMUL_RETRY;
+
+        pfinfo.linear = addr;
+        pfinfo.ec = pfec;
+
+        x86_emul_pagefault(pfinfo.ec, pfinfo.linear, &hvmemul_ctxt->ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    gpa |= (paddr_t)gfn << PAGE_SHIFT;
+
+    /*
+     * No need to do the P2M lookup for internally handled MMIO, benefiting
+     * - 32-bit WinXP (& older Windows) on AMD CPUs for LAPIC accesses,
+     * - newer Windows (like Server 2012) for HPET accesses.
+     */
+    if ( !nestedhvm_vcpu_in_guestmode(curr) && hvm_mmio_internal(gpa) )
+    {
+        printk("!nestedhvm_vcpu_in_guestmode(curr) && hvm_mmio_internal(gpa)\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    page = get_page_from_gfn(curr->domain, gfn, &p2mt, P2M_UNSHARE);
+
+    if ( !page )
+    {
+        printk("!page\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    if ( p2m_is_paging(p2mt) )
+    {
+        put_page(page);
+        p2m_mem_paging_populate(curr->domain, gfn);
+        return X86EMUL_RETRY;
+    }
+
+    if ( p2m_is_shared(p2mt) )
+    {
+        put_page(page);
+        return X86EMUL_RETRY;
+    }
+
+    if ( p2m_is_grant(p2mt) )
+    {
+        put_page(page);
+        printk("p2m_is_grant(p2mt)\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    *mfn = _mfn(page_to_mfn(page));
+
+    put_page(page);
+
+    return X86EMUL_OKAY;
+}
+
 static int hvmemul_cmpxchg(
     enum x86_segment seg,
     unsigned long offset,
@@ -1037,8 +1118,98 @@ static int hvmemul_cmpxchg(
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
-    /* Fix this in case the guest is really relying on r-m-w atomicity. */
-    return hvmemul_write(seg, offset, p_new, bytes, ctxt);
+    unsigned long addr = 0, reps = 1;
+    int rc = X86EMUL_OKAY;
+    unsigned long old = 0, new = 0;
+    uint32_t pfec = PFEC_page_present | PFEC_write_access;
+    struct hvm_emulate_ctxt *hvmemul_ctxt =
+        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+    mfn_t mfn[2];
+    void *map = NULL;
+    struct domain *currd = current->domain;
+    unsigned long ret;
+
+    if ( is_x86_system_segment(seg) )
+        pfec |= PFEC_implicit;
+    else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
+        pfec |= PFEC_user_mode;
+
+    rc = hvmemul_virtual_to_linear(
+        seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr);
+
+    if ( rc != X86EMUL_OKAY || !bytes )
+    {
+        printk("rc != X86EMUL_OKAY || !bytes\n");
+        goto out;
+    }
+
+    rc = hvmemul_vaddr_to_mfn(addr, &mfn[0], pfec, ctxt);
+
+    if ( rc != X86EMUL_OKAY )
+    {
+        printk("hvmemul_vaddr_to_mfn() fail\n");
+        goto out;
+    }
+
+    if ( likely(((addr + bytes - 1) & PAGE_MASK) == (addr & PAGE_MASK)) )
+    {
+        /* Whole write fits on a single page. */
+        mfn[1] = INVALID_MFN;
+        map = map_domain_page(mfn[0]);
+    }
+    else
+    {
+        rc = hvmemul_vaddr_to_mfn((addr + bytes - 1) & PAGE_MASK, &mfn[1],
+                                  pfec, ctxt);
+        if ( rc != X86EMUL_OKAY )
+        {
+            printk("hvmemul_vaddr_to_mfn(mfn[1]) fail\n");
+            goto out;
+        }
+
+        map = vmap(mfn, 2);
+    }
+
+    if ( !map )
+    {
+        printk("!map\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    map += (addr & ~PAGE_MASK);
+
+    memcpy(&old, p_old, bytes);
+    memcpy(&new, p_new, bytes);
+
+    ret = __cmpxchg(map, old, new, bytes);
+
+    if ( ret != old )
+    {
+        memcpy(p_old, &ret, bytes);
+        rc = X86EMUL_CMPXCHG_FAILED;
+    }
+
+    paging_mark_dirty(currd, mfn[0]);
+
+    if ( unlikely(mfn_valid(mfn[1])) )
+    {
+        paging_mark_dirty(currd, mfn[1]);
+        vunmap((void *)((unsigned long)map & PAGE_MASK));
+    }
+    else
+        unmap_domain_page(map);
+
+out:
+    if ( rc != X86EMUL_OKAY )
+    {
+        if ( rc != X86EMUL_CMPXCHG_FAILED )
+            rc = X86EMUL_UNHANDLEABLE;
+    }
+
+    printk("[%d] virtual address: 0x%lx, rc: %d\n",
+           current->vcpu_id, addr, rc);
+
+    return rc;
 }
 
 static int hvmemul_validate(
@@ -1961,59 +2132,64 @@ int hvm_emulate_one_mmio(unsigned long mfn, unsigned long gla)
 void hvm_emulate_one_vm_event(enum emul_kind kind, unsigned int trapnr,
     unsigned int errcode)
 {
-    struct hvm_emulate_ctxt ctx = {{ 0 }};
-    int rc;
+    int rc = X86EMUL_OKAY;
 
-    hvm_emulate_init_once(&ctx, NULL, guest_cpu_user_regs());
+    /* do { */
+        struct hvm_emulate_ctxt ctx = {{ 0 }};
 
-    switch ( kind )
-    {
-    case EMUL_KIND_NOWRITE:
-        rc = _hvm_emulate_one(&ctx, &hvm_emulate_ops_no_write);
-        break;
-    case EMUL_KIND_SET_CONTEXT_INSN: {
-        struct vcpu *curr = current;
-        struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
+        hvm_emulate_init_once(&ctx, NULL, guest_cpu_user_regs());
 
-        BUILD_BUG_ON(sizeof(vio->mmio_insn) !=
-                     sizeof(curr->arch.vm_event->emul.insn.data));
-        ASSERT(!vio->mmio_insn_bytes);
+        switch ( kind )
+        {
+        case EMUL_KIND_NOWRITE:
+            rc = _hvm_emulate_one(&ctx, &hvm_emulate_ops_no_write);
+            break;
+        case EMUL_KIND_SET_CONTEXT_INSN: {
+            struct vcpu *curr = current;
+            struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
+
+            BUILD_BUG_ON(sizeof(vio->mmio_insn) !=
+                         sizeof(curr->arch.vm_event->emul.insn.data));
+            ASSERT(!vio->mmio_insn_bytes);
+
+            /*
+             * Stash insn buffer into mmio buffer here instead of ctx
+             * to avoid having to add more logic to hvm_emulate_one.
+             */
+            vio->mmio_insn_bytes = sizeof(vio->mmio_insn);
+            memcpy(vio->mmio_insn, curr->arch.vm_event->emul.insn.data,
+                   vio->mmio_insn_bytes);
+        }
+        /* Fall-through */
+        default:
+            ctx.set_context = (kind == EMUL_KIND_SET_CONTEXT_DATA);
+            rc = hvm_emulate_one(&ctx);
+        }
 
-        /*
-         * Stash insn buffer into mmio buffer here instead of ctx
-         * to avoid having to add more logic to hvm_emulate_one.
-         */
-        vio->mmio_insn_bytes = sizeof(vio->mmio_insn);
-        memcpy(vio->mmio_insn, curr->arch.vm_event->emul.insn.data,
-               vio->mmio_insn_bytes);
-    }
-    /* Fall-through */
-    default:
-        ctx.set_context = (kind == EMUL_KIND_SET_CONTEXT_DATA);
-        rc = hvm_emulate_one(&ctx);
-    }
+        if ( rc != X86EMUL_OKAY )
+        {
+            printk("Dump follows for VCPU %d\n", current->vcpu_id);
+        }
 
-    switch ( rc )
-    {
-    case X86EMUL_RETRY:
-        /*
-         * This function is called when handling an EPT-related vm_event
-         * reply. As such, nothing else needs to be done here, since simply
-         * returning makes the current instruction cause a page fault again,
-         * consistent with X86EMUL_RETRY.
-         */
-        return;
-    case X86EMUL_UNHANDLEABLE:
-        hvm_dump_emulation_state(XENLOG_G_DEBUG "Mem event", &ctx);
-        hvm_inject_hw_exception(trapnr, errcode);
-        break;
-    case X86EMUL_EXCEPTION:
-        if ( ctx.ctxt.event_pending )
-            hvm_inject_event(&ctx.ctxt.event);
-        break;
-    }
+        switch ( rc )
+        {
+        case X86EMUL_RETRY:
+            /* break; */
+            hvm_dump_emulation_state(XENLOG_G_DEBUG "Mem event (RETRY)", &ctx);
+            return;
+        case X86EMUL_UNHANDLEABLE:
+            hvm_dump_emulation_state(XENLOG_G_DEBUG "Mem event", &ctx);
+            hvm_inject_hw_exception(trapnr, errcode);
+            break;
+        case X86EMUL_EXCEPTION:
+            if ( ctx.ctxt.event_pending )
+                hvm_inject_event(&ctx.ctxt.event);
+            break;
+        }
+
+        hvm_emulate_writeback(&ctx);
 
-    hvm_emulate_writeback(&ctx);
+    /* } while( rc == X86EMUL_RETRY ); */
 }
 
 void hvm_emulate_init_once(
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 4dbd24f..a67cd55 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -5176,16 +5176,17 @@ static int ptwr_emulated_read(
 
 static int ptwr_emulated_update(
     unsigned long addr,
-    paddr_t old,
-    paddr_t val,
+    intpte_t *p_old,
+    intpte_t val,
     unsigned int bytes,
-    unsigned int do_cmpxchg,
     struct ptwr_emulate_ctxt *ptwr_ctxt)
 {
     unsigned long mfn;
     unsigned long unaligned_addr = addr;
     struct page_info *page;
     l1_pgentry_t pte, ol1e, nl1e, *pl1e;
+    intpte_t old = p_old ? *p_old : 0;
+    unsigned int offset = 0;
     struct vcpu *v = current;
     struct domain *d = v->domain;
     int ret;
@@ -5199,28 +5200,30 @@ static int ptwr_emulated_update(
     }
 
     /* Turn a sub-word access into a full-word access. */
-    if ( bytes != sizeof(paddr_t) )
+    if ( bytes != sizeof(val) )
     {
-        paddr_t      full;
-        unsigned int rc, offset = addr & (sizeof(paddr_t)-1);
+        intpte_t full;
+        unsigned int rc;
+
+        offset = addr & (sizeof(full) - 1);
 
         /* Align address; read full word. */
-        addr &= ~(sizeof(paddr_t)-1);
-        if ( (rc = copy_from_user(&full, (void *)addr, sizeof(paddr_t))) != 0 )
+        addr &= ~(sizeof(full) - 1);
+        if ( (rc = copy_from_user(&full, (void *)addr, sizeof(full))) != 0 )
         {
             x86_emul_pagefault(0, /* Read fault. */
-                               addr + sizeof(paddr_t) - rc,
+                               addr + sizeof(full) - rc,
                                &ptwr_ctxt->ctxt);
             return X86EMUL_EXCEPTION;
         }
         /* Mask out bits provided by caller. */
-        full &= ~((((paddr_t)1 << (bytes*8)) - 1) << (offset*8));
+        full &= ~((((intpte_t)1 << (bytes * 8)) - 1) << (offset * 8));
         /* Shift the caller value and OR in the missing bits. */
-        val  &= (((paddr_t)1 << (bytes*8)) - 1);
+        val  &= (((intpte_t)1 << (bytes * 8)) - 1);
         val <<= (offset)*8;
         val  |= full;
         /* Also fill in missing parts of the cmpxchg old value. */
-        old  &= (((paddr_t)1 << (bytes*8)) - 1);
+        old  &= (((intpte_t)1 << (bytes * 8)) - 1);
         old <<= (offset)*8;
         old  |= full;
     }
@@ -5242,7 +5245,7 @@ static int ptwr_emulated_update(
     {
     default:
         if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
-             !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
+             !p_old && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
         {
             /*
              * If this is an upper-half write to a PAE PTE then we assume that
@@ -5273,21 +5276,26 @@ static int ptwr_emulated_update(
     /* Checked successfully: do the update (write or cmpxchg). */
     pl1e = map_domain_page(_mfn(mfn));
     pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
-    if ( do_cmpxchg )
+    if ( p_old )
     {
-        int okay;
-        intpte_t t = old;
         ol1e = l1e_from_intpte(old);
 
-        okay = paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
-                                          &t, l1e_get_intpte(nl1e), _mfn(mfn));
-        okay = (okay && t == old);
+        if ( !paging_cmpxchg_guest_entry(v, &l1e_get_intpte(*pl1e),
+                                         &old, l1e_get_intpte(nl1e), _mfn(mfn)) )
+            ret = X86EMUL_UNHANDLEABLE;
+        else if ( l1e_get_intpte(ol1e) == old )
+            ret = X86EMUL_OKAY;
+        else
+        {
+            *p_old = old >> (offset * 8);
+            ret = X86EMUL_CMPXCHG_FAILED;
+        }
 
-        if ( !okay )
+        if ( ret != X86EMUL_OKAY )
         {
             unmap_domain_page(pl1e);
             put_page_from_l1e(nl1e, d);
-            return X86EMUL_RETRY;
+            return ret;
         }
     }
     else
@@ -5314,9 +5322,9 @@ static int ptwr_emulated_write(
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
-    paddr_t val = 0;
+    intpte_t val = 0;
 
-    if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes - 1)) || !bytes )
+    if ( (bytes > sizeof(val)) || (bytes & (bytes - 1)) || !bytes )
     {
         MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
                 offset, bytes);
@@ -5325,9 +5333,9 @@ static int ptwr_emulated_write(
 
     memcpy(&val, p_data, bytes);
 
-    return ptwr_emulated_update(
-        offset, 0, val, bytes, 0,
-        container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
+    return ptwr_emulated_update(offset, NULL, val, bytes,
+                                container_of(ctxt, struct ptwr_emulate_ctxt,
+                                             ctxt));
 }
 
 static int ptwr_emulated_cmpxchg(
@@ -5338,21 +5346,20 @@ static int ptwr_emulated_cmpxchg(
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
-    paddr_t old = 0, new = 0;
+    intpte_t new = 0;
 
-    if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
+    if ( (bytes > sizeof(new)) || (bytes & (bytes -1)) )
     {
         MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
                 offset, bytes);
         return X86EMUL_UNHANDLEABLE;
     }
 
-    memcpy(&old, p_old, bytes);
     memcpy(&new, p_new, bytes);
 
-    return ptwr_emulated_update(
-        offset, old, new, bytes, 1,
-        container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
+    return ptwr_emulated_update(offset, p_old, new, bytes,
+                                container_of(ctxt, struct ptwr_emulate_ctxt,
+                                             ctxt));
 }
 
 static int pv_emul_is_mem_write(const struct x86_emulate_state *state,
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index d93f2ab..06dc9f6 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -285,7 +285,7 @@ hvm_emulate_cmpxchg(enum x86_segment seg,
     struct sh_emulate_ctxt *sh_ctxt =
         container_of(ctxt, struct sh_emulate_ctxt, ctxt);
     struct vcpu *v = current;
-    unsigned long addr, old, new;
+    unsigned long addr, new = 0;
     int rc;
 
     if ( bytes > sizeof(long) )
@@ -296,12 +296,10 @@ hvm_emulate_cmpxchg(enum x86_segment seg,
     if ( rc )
         return rc;
 
-    old = new = 0;
-    memcpy(&old, p_old, bytes);
     memcpy(&new, p_new, bytes);
 
     return v->arch.paging.mode->shadow.x86_emulate_cmpxchg(
-               v, addr, old, new, bytes, sh_ctxt);
+               v, addr, p_old, new, bytes, sh_ctxt);
 }
 
 static const struct x86_emulate_ops hvm_shadow_emulator_ops = {
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 4798c93..d8270db 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -4783,11 +4783,11 @@ sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
 
 static int
 sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
-                        unsigned long old, unsigned long new,
-                        unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
+                       unsigned long *p_old, unsigned long new,
+                       unsigned int bytes, struct sh_emulate_ctxt *sh_ctxt)
 {
     void *addr;
-    unsigned long prev;
+    unsigned long prev, old = *p_old;
     int rv = X86EMUL_OKAY;
 
     /* Unaligned writes are only acceptable on HVM */
@@ -4811,7 +4811,10 @@ sh_x86_emulate_cmpxchg(struct vcpu *v, unsigned long vaddr,
     }
 
     if ( prev != old )
-        rv = X86EMUL_RETRY;
+    {
+        *p_old = prev;
+        rv = X86EMUL_CMPXCHG_FAILED;
+    }
 
     SHADOW_DEBUG(EMULATE, "va %#lx was %#lx expected %#lx"
                   " wanted %#lx now %#lx bytes %u\n",
diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c
index bb67be6..444c84c 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1862,6 +1862,9 @@ protmode_load_seg(
 
         default:
             return rc;
+
+        case X86EMUL_CMPXCHG_FAILED:
+            return X86EMUL_RETRY;
         }
 
         /* Force the Accessed flag in our local copy. */
@@ -6680,6 +6683,7 @@ x86_emulate(
         break;
 
     case X86EMUL_OPC(0x0f, 0xb0): case X86EMUL_OPC(0x0f, 0xb1): /* cmpxchg */
+        fail_if(!ops->cmpxchg);
         /* Save real source value, then compare EAX against destination. */
         src.orig_val = src.val;
         src.val = _regs.r(ax);
@@ -6688,8 +6692,17 @@ x86_emulate(
         if ( _regs.eflags & X86_EFLAGS_ZF )
         {
             /* Success: write back to memory. */
-            dst.val = src.orig_val;
+            dst.val = src.val;
+            rc = ops->cmpxchg(dst.mem.seg, dst.mem.off, &dst.val,
+                              &src.orig_val, dst.bytes, ctxt);
+            if ( rc == X86EMUL_CMPXCHG_FAILED )
+            {
+               _regs.eflags &= ~X86_EFLAGS_ZF;
+               rc = X86EMUL_OKAY;
+            }
         }
+        if ( _regs.eflags & X86_EFLAGS_ZF )
+            dst.type = OP_NONE;
         else
         {
             /* Failure: write the value we saw to EAX. */
@@ -6994,6 +7007,7 @@ x86_emulate(
 
         if ( memcmp(old, aux, op_bytes) )
         {
+        cmpxchgNb_failed:
             /* Expected != actual: store actual to rDX:rAX and clear ZF. */
             _regs.r(ax) = !(rex_prefix & REX_W) ? old->u32[0] : old->u64[0];
             _regs.r(dx) = !(rex_prefix & REX_W) ? old->u32[1] : old->u64[1];
@@ -7003,7 +7017,7 @@ x86_emulate(
         {
             /*
              * Expected == actual: Get proposed value, attempt atomic cmpxchg
-             * and set ZF.
+             * and set ZF if successful.
              */
             if ( !(rex_prefix & REX_W) )
             {
@@ -7016,10 +7030,20 @@ x86_emulate(
                 aux->u64[1] = _regs.r(cx);
             }
 
-            if ( (rc = ops->cmpxchg(ea.mem.seg, ea.mem.off, old, aux,
-                                    op_bytes, ctxt)) != X86EMUL_OKAY )
+            switch ( rc = ops->cmpxchg(ea.mem.seg, ea.mem.off, old, aux,
+                                       op_bytes, ctxt) )
+            {
+            case X86EMUL_OKAY:
+                _regs.eflags |= X86_EFLAGS_ZF;
+                break;
+
+            case X86EMUL_CMPXCHG_FAILED:
+                rc = X86EMUL_OKAY;
+                goto cmpxchgNb_failed;
+
+            default:
                 goto done;
-            _regs.eflags |= X86_EFLAGS_ZF;
+            }
         }
         break;
     }
@@ -7942,6 +7966,8 @@ x86_emulate(
             rc = ops->cmpxchg(
                 dst.mem.seg, dst.mem.off, &dst.orig_val,
                 &dst.val, dst.bytes, ctxt);
+            if ( rc == X86EMUL_CMPXCHG_FAILED )
+                rc = X86EMUL_RETRY;
         }
         else
         {
diff --git a/xen/arch/x86/x86_emulate/x86_emulate.h b/xen/arch/x86/x86_emulate/x86_emulate.h
index 9c5fcde..6a8d6a0 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -153,6 +153,8 @@ struct x86_emul_fpu_aux {
 #define X86EMUL_EXCEPTION      2
  /* Retry the emulation for some reason. No state modified. */
 #define X86EMUL_RETRY          3
+ /* (cmpxchg accessor): CMPXCHG failed. */
+#define X86EMUL_CMPXCHG_FAILED 4
  /*
   * Operation fully done by one of the hooks:
   * - validate(): operation completed (except common insn retire logic)
@@ -160,7 +162,7 @@ struct x86_emul_fpu_aux {
   * - read_io() / write_io(): bypass GPR update (non-string insns only)
   * Undefined behavior when used anywhere else.
   */
-#define X86EMUL_DONE           4
+#define X86EMUL_DONE           5
 
 /* FPU sub-types which may be requested via ->get_fpu(). */
 enum x86_emulate_fpu_type {
@@ -250,6 +252,8 @@ struct x86_emulate_ops
     /*
      * cmpxchg: Emulate an atomic (LOCKed) CMPXCHG operation.
      *  @p_old: [IN ] Pointer to value expected to be current at @addr.
+     *          [OUT] Pointer to value found at @addr (may always be
+     *                updated, meaningful for X86EMUL_CMPXCHG_FAILED only).
      *  @p_new: [IN ] Pointer to value to write to @addr.
      *  @bytes: [IN ] Operation size (up to 8 (x86/32) or 16 (x86/64) bytes).
      */
diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
index f262c9e..3efcf6d 100644
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -89,7 +89,7 @@ struct shadow_paging_mode {
                                             void *src, u32 bytes,
                                             struct sh_emulate_ctxt *sh_ctxt);
     int           (*x86_emulate_cmpxchg   )(struct vcpu *v, unsigned long va,
-                                            unsigned long old, 
+                                            unsigned long *old,
                                             unsigned long new,
                                             unsigned int bytes,
                                             struct sh_emulate_ctxt *sh_ctxt);
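
(For clarity, below is a self-contained toy model of the contract the
reworked cmpxchg hook is expected to honour. The function name and the
plain-pointer destination are made up for illustration; this is not one of
the hooks above, which operate on guest mappings instead.)

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    #define X86EMUL_OKAY           0
    #define X86EMUL_CMPXCHG_FAILED 4

    /*
     * Toy model of the hook contract: attempt an atomic compare-and-swap
     * on the destination; on mismatch, report the value actually found
     * back through p_old and return X86EMUL_CMPXCHG_FAILED, so that
     * x86_emulate() can clear ZF (CMPXCHG/CMPXCHG8B) or retry (locked
     * RMW writeback) instead of doing a non-atomic write of stale data.
     */
    static int toy_cmpxchg(uint32_t *dest, void *p_old, const void *p_new)
    {
        uint32_t old, new;

        memcpy(&old, p_old, sizeof(old));
        memcpy(&new, p_new, sizeof(new));

        if ( __atomic_compare_exchange_n(dest, &old, new, false,
                                         __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) )
            return X86EMUL_OKAY;

        memcpy(p_old, &old, sizeof(old)); /* the value actually found */
        return X86EMUL_CMPXCHG_FAILED;
    }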