On 03/29/2017 05:00 PM, Razvan Cojocaru wrote:
> On 03/29/2017 04:55 PM, Jan Beulich wrote:
>>>>> On 28.03.17 at 12:50, <rcojoc...@bitdefender.com> wrote:
>>> On 03/28/2017 01:47 PM, Jan Beulich wrote:
>>>>>>> On 28.03.17 at 12:27, <rcojoc...@bitdefender.com> wrote:
>>>>> On 03/28/2017 01:03 PM, Jan Beulich wrote:
>>>>>>>>> On 28.03.17 at 11:14, <rcojoc...@bitdefender.com> wrote:
>>>>>>> I'm not sure that the RETRY model is what the guest OS expects. AFAIK, a
>>>>>>> failed CMPXCHG should be observed just once, with the proper register
>>>>>>> values and ZF updated accordingly. The guest surely expects neither that
>>>>>>> the instruction be resumed until it succeeds, nor that some hidden loop
>>>>>>> goes on for an indeterminate amount of time until a CMPXCHG succeeds.
>>>>>>
>>>>>> The guest doesn't observe the CMPXCHG failing - RETRY leads to
>>>>>> the instruction being restarted instead of completed.
>>>>>
>>>>> Indeed, but it works differently with hvm_emulate_one_vm_event() where
>>>>> RETRY currently would have the instruction be re-executed (properly
>>>>> re-executed, not just re-emulated) by the guest.
>>>>
>>>> Right - see my other reply to Andrew: The function likely would
>>>> need to tell apart guest CMPXCHG uses from us using the insn to
>>>> carry out the write by some other one. That may involve
>>>> adjustments to the memory write logic in x86_emulate() itself, as
>>>> the late failure of the comparison then would also need to be
>>>> communicated back (via ZF clear) to the guest.
>>>
>>> Exactly, it would require quite some reworking of x86_emulate().
>>
>> I had imagined it to be less intrusive (outside of x86_emulate()),
>> but I've now learned why Andrew was able to get rid of
>> X86EMUL_CMPXCHG_FAILED - the apparently intended behavior
>> was never implemented. Attached a first take at it, which has
>> seen smoke testing, but nothing more. The way it ends up being,
>> I don't think this can reasonably be considered for 4.9 at this
>> point in time. (Also Cc-ing Tim for the shadow code changes,
>> even if this isn't really a proper patch submission.)
> 
> Thanks! I'll give a spin with a modified version of my CMPXCHG patch as
> soon as possible.
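
(To make the semantics under discussion concrete: architecturally, a failed
CMPXCHG is visible to the guest exactly once, with ZF cleared and the
accumulator reloaded with the current memory value, so the guest itself
decides whether to retry. A rough C sketch of that behaviour, purely
illustrative and not actual Xen or guest code, atomicity ignored:)

static bool cmpxchg_arch_semantics(unsigned long *mem, unsigned long *acc,
                                   unsigned long new)
{
    if ( *mem == *acc )
    {
        *mem = new;     /* success: ZF would be set */
        return true;
    }
    /* Failure: ZF would be cleared and the accumulator reloaded with the
     * value currently in memory - the guest observes this exactly once. */
    *acc = *mem;
    return false;
}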

With the attached patch, with hvmemul_cmpxchg() now returning
X86EMUL_CMPXCHG_FAILED if __cmpxchg() fails, my (32-bit) Windows 7 guest
gets stuck at the "Starting Windows" screen. Its state appears to be (two
consecutive xenctx samples of the same VCPU):

# ./xenctx -a 3
cs:eip: 0008:8bcd85d6
flags: 00200246 cid i z p
ss:esp: 0010:82736b9c
eax: 00000000   ebx: 84f3a678   ecx: 84ee2610   edx: 001eb615
esi: 40008000   edi: 82739d20   ebp: 82736c20
 ds:     0023    es:     0023    fs:     0030    gs:     0000

cr0: 8001003b
cr2: 8fd94000
cr3: 00185000
cr4: 000406f9

dr0: 00000000
dr1: 00000000
dr2: 00000000
dr3: 00000000
dr6: fffe0ff0
dr7: 00000400
Code (instr addr 8bcd85d6)
47 fc 83 c7 14 4e 75 ef 5f 5e c3 cc cc cc cc cc cc 8b ff fb f4 <c3> cc
cc cc cc cc 8b ff 55 8b ec

# ./xenctx -a 3
cs:eip: 0008:8bcd85d6
flags: 00200246 cid i z p
ss:esp: 0010:82736b9c
eax: 00000000   ebx: 84f3a678   ecx: 84ee2610   edx: 002ca60d
esi: 40008000   edi: 82739d20   ebp: 82736c20
 ds:     0023    es:     0023    fs:     0030    gs:     0000

cr0: 8001003b
cr2: 8fd94000
cr3: 00185000
cr4: 000406f9

dr0: 00000000
dr1: 00000000
dr2: 00000000
dr3: 00000000
dr6: fffe0ff0
dr7: 00000400
Code (instr addr 8bcd85d6)
47 fc 83 c7 14 4e 75 ef 5f 5e c3 cc cc cc cc cc cc 8b ff fb f4 <c3> cc
cc cc cc cc 8b ff 55 8b ec

This only happens in SMP scenarios (my guest had 10 VCPUs for easy
reproduction). With a single VCPU, the guest booted fine. So something is
still not right when a CMPXCHG fails in a race-type situation (unless
something is obviously wrong with my patch, but I don't see it).
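
As a purely hypothetical illustration of the race-type situation I mean
(not claimed to be what Windows actually does): VCPUs contending on a lock
word via a LOCK CMPXCHG loop only make progress if every failed attempt is
either reported back to the guest (ZF clear, accumulator updated) or
transparently restarted until it succeeds; an outcome that is neither could
leave the guest waiting indefinitely.

/* Hypothetical guest-side acquire loop, for illustration only. */
static inline void guest_cpu_relax(void)
{
    asm volatile ( "pause" ::: "memory" );
}

static void guest_lock_acquire(volatile unsigned long *lock)
{
    /* Each failed compare-and-swap must be observable (or cleanly
     * restarted) for this loop to terminate once the lock is free. */
    while ( !__sync_bool_compare_and_swap(lock, 0UL, 1UL) )
        guest_cpu_relax();
}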


Thanks,
Razvan

diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index 2d92957..b946ef7 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -20,6 +20,7 @@
 #include <asm/hvm/emulate.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/ioreq.h>
+#include <asm/hvm/nestedhvm.h>
 #include <asm/hvm/trace.h>
 #include <asm/hvm/support.h>
 #include <asm/hvm/svm/svm.h>
@@ -1029,6 +1030,77 @@ static int hvmemul_wbinvd_discard(
     return X86EMUL_OKAY;
 }
 
+static int hvmemul_vaddr_to_mfn(
+    unsigned long addr,
+    mfn_t *mfn,
+    uint32_t pfec,
+    struct x86_emulate_ctxt *ctxt)
+{
+    paddr_t gpa = addr & ~PAGE_MASK;
+    struct page_info *page;
+    p2m_type_t p2mt;
+    unsigned long gfn;
+    struct vcpu *curr = current;
+    struct hvm_emulate_ctxt *hvmemul_ctxt =
+        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+
+    gfn = paging_gva_to_gfn(curr, addr, &pfec);
+
+    if ( gfn == gfn_x(INVALID_GFN) )
+    {
+        pagefault_info_t pfinfo = {};
+
+        if ( ( pfec & PFEC_page_paged ) || ( pfec & PFEC_page_shared ) )
+            return X86EMUL_RETRY;
+
+        pfinfo.linear = addr;
+        pfinfo.ec = pfec;
+
+        x86_emul_pagefault(pfinfo.ec, pfinfo.linear, &hvmemul_ctxt->ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    gpa |= (paddr_t)gfn << PAGE_SHIFT;
+
+    /*
+     * No need to do the P2M lookup for internally handled MMIO, benefiting
+     * - 32-bit WinXP (& older Windows) on AMD CPUs for LAPIC accesses,
+     * - newer Windows (like Server 2012) for HPET accesses.
+     */
+    if ( !nestedhvm_vcpu_in_guestmode(curr) && hvm_mmio_internal(gpa) )
+        return X86EMUL_UNHANDLEABLE;
+
+    page = get_page_from_gfn(curr->domain, gfn, &p2mt, P2M_UNSHARE);
+
+    if ( !page )
+        return X86EMUL_UNHANDLEABLE;
+
+    if ( p2m_is_paging(p2mt) )
+    {
+        put_page(page);
+        p2m_mem_paging_populate(curr->domain, gfn);
+        return X86EMUL_RETRY;
+    }
+
+    if ( p2m_is_shared(p2mt) )
+    {
+        put_page(page);
+        return X86EMUL_RETRY;
+    }
+
+    if ( p2m_is_grant(p2mt) )
+    {
+        put_page(page);
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    *mfn = _mfn(page_to_mfn(page));
+
+    put_page(page);
+
+    return X86EMUL_OKAY;
+}
+
 static int hvmemul_cmpxchg(
     enum x86_segment seg,
     unsigned long offset,
@@ -1037,8 +1109,70 @@ static int hvmemul_cmpxchg(
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
-    /* Fix this in case the guest is really relying on r-m-w atomicity. */
-    return hvmemul_write(seg, offset, p_new, bytes, ctxt);
+    unsigned long addr, reps = 1;
+    int rc = X86EMUL_OKAY;
+    unsigned long old = 0, new = 0;
+    uint32_t pfec = PFEC_page_present | PFEC_write_access;
+    struct hvm_emulate_ctxt *hvmemul_ctxt =
+        container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+    mfn_t mfn[2];
+    void *map = NULL;
+    struct domain *currd = current->domain;
+
+    if ( is_x86_system_segment(seg) )
+        pfec |= PFEC_implicit;
+    else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3 )
+        pfec |= PFEC_user_mode;
+
+    rc = hvmemul_virtual_to_linear(
+        seg, offset, bytes, &reps, hvm_access_write, hvmemul_ctxt, &addr);
+
+    if ( rc != X86EMUL_OKAY || !bytes )
+        return rc;
+
+    rc = hvmemul_vaddr_to_mfn(addr, &mfn[0], pfec, ctxt);
+
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( likely(((addr + bytes - 1) & PAGE_MASK) == (addr & PAGE_MASK)) )
+    {
+        /* Whole write fits on a single page. */
+        mfn[1] = INVALID_MFN;
+        map = map_domain_page(mfn[0]);
+    }
+    else
+    {
+        rc = hvmemul_vaddr_to_mfn((addr + bytes - 1) & PAGE_MASK, &mfn[1],
+                                  pfec, ctxt);
+        if ( rc != X86EMUL_OKAY )
+            return rc;
+
+        map = vmap(mfn, 2);
+    }
+
+    if ( !map )
+        return X86EMUL_UNHANDLEABLE;
+
+    map += (addr & ~PAGE_MASK);
+
+    memcpy(&old, p_old, bytes);
+    memcpy(&new, p_new, bytes);
+
+    if ( __cmpxchg(map, old, new, bytes) != old )
+        rc = X86EMUL_CMPXCHG_FAILED;
+
+    paging_mark_dirty(currd, mfn[0]);
+
+    if ( unlikely(mfn_valid(mfn[1])) )
+    {
+        paging_mark_dirty(currd, mfn[1]);
+        vunmap((void *)((unsigned long)map & PAGE_MASK));
+    }
+    else
+        unmap_domain_page(map);
+
+    return rc;
 }
 
 static int hvmemul_validate(
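
(For reference, the hvmemul_cmpxchg() hunk above relies on the usual Xen
convention that __cmpxchg() returns the value previously held at the
destination, so comparing the result against "old" is what detects a lost
race. A minimal usage sketch under that assumption, with a made-up counter
helper that is not part of the patch:)

/* Illustrative only: the swap took effect iff the value returned by
 * __cmpxchg() equals the expected "old" value. */
static bool try_bump(unsigned long *counter, unsigned long old)
{
    return __cmpxchg(counter, old, old + 1, sizeof(*counter)) == old;
}
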
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel
