On Fri, Jun 15, 2007 at 12:06:50PM +0300, Avi Kivity wrote:
> > After a bit of thinking: it's correct but removes an optimization;
> > furthermore it may miss other instructions that write to memory mapped
> > areas.
> > A more proper fix should be "force the writeback if dst.ptr is in some
> > kind of mmio area".
> >
>
> I think we can just disable the optimization, and (in a separate patch)
> add it back in emulator_write_phys(), where we know we're modifying
> memory and not hitting mmio.
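Right - the value compare is only a safe shortcut for plain RAM; for MMIO
the store itself is the side effect, so it has to be dispatched even when
the value is unchanged. A minimal standalone illustration (hypothetical
device model, not kvm code):

/*
 * Minimal standalone illustration (hypothetical device model, not kvm
 * code): for RAM, skipping a store of an unchanged value is harmless;
 * for an MMIO register the store itself is the event the device reacts
 * to, so it must always be dispatched.
 */
#include <stdio.h>

struct fake_device {
	unsigned int reg;	/* current register value */
	unsigned int kicks;	/* side effect: writes observed */
};

static void mmio_write(struct fake_device *dev, unsigned int val)
{
	dev->reg = val;
	dev->kicks++;		/* e.g. a doorbell the device acts on */
}

int main(void)
{
	struct fake_device dev = { .reg = 1, .kicks = 0 };
	unsigned int old = 1, new = 1;

	if (old != new)			/* the "optimization": wrong for MMIO */
		mmio_write(&dev, new);
	printf("with skip:    %u kicks\n", dev.kicks);	/* 0 - write lost */

	mmio_write(&dev, new);		/* always dispatch: correct for MMIO */
	printf("without skip: %u kicks\n", dev.kicks);	/* 1 */

	return 0;
}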
Problem is that in emulator_write_phys() we've already lost track of the
previous value (dst.orig_val). I moved the decision up into
cmpxchg_emulated; unfortunately this means that the non-locked write path
(write_emulated) can't do this optimization, unless I change its signature
to include the old value.

The first patch makes the writeback step unconditional whenever we have a
destination operand (the mov check (d & Mov) may be superfluous, yes?).
The write-to-register path still has the optimization that skips the write
when possible.

--- a/kernel/x86_emulate.c	2007-06-15 21:13:51.000000000 +0200
+++ b/kernel/x86_emulate.c	2007-06-15 22:12:15.000000000 +0200
@@ -1057,9 +1057,8 @@
 	}
 
 writeback:
-	if ((d & Mov) || (dst.orig_val != dst.val)) {
-		switch (dst.type) {
-		case OP_REG:
+	if ((d & Mov) || (d & DstMask) == DstMem || (d & DstMask) == DstReg) {
+		if (dst.type == OP_REG && dst.orig_val != dst.val) {
 			/* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
 			switch (dst.bytes) {
 			case 1:
@@ -1075,8 +1074,10 @@
 				*dst.ptr = dst.val;
 				break;
 			}
-			break;
-		case OP_MEM:
+		} else if (dst.type == OP_MEM) {
+			/* Always dispatch the write, since it may hit a
+			 * MMIO area.
+			 */
 			if (lock_prefix)
 				rc = ops->cmpxchg_emulated((unsigned long)dst.ptr,
 							   &dst.orig_val,
@@ -1088,8 +1089,6 @@
 						  ctxt);
 			if (rc != 0)
 				goto done;
-		default:
-			break;
 		}
 	}

Next one: I've split emulator_write_phys into emulator_write_phys_mem (for
normal RAM) and emulator_write_phys_mmio (for the rest). The cmpxchg code
path is roughly:

	if (!mapped)
		page_fault();
	page = gfn_to_page(...);
	if (page) {
		if (!memcmp(old, new))
			return X86EMUL_CONTINUE;
		write_mem();
		return X86EMUL_CONTINUE;
	}
	/* for mmio always do the write */
	write_mmio();

I'm a bit confused about this test, found in emulator_write_phys (original
code):

	if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
		return 0;

AFAICT it makes the emulator skip the write if the modified area spans two
different (physical) pages. When this happens, write_emulated does an MMIO
write instead. I'd expect the function to load the two pages and do the
memory write on both.
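For reference, what that test computes can be checked in isolation
(illustration only; PAGE_SHIFT is 12 on x86, i.e. 4 KiB pages):

/*
 * Illustration only: the test compares the page frame number of the
 * first and of the last byte of the write; they differ exactly when the
 * write straddles a page boundary (PAGE_SHIFT = 12, 4 KiB pages on x86).
 */
#include <stdio.h>

#define PAGE_SHIFT 12

static int crosses_page(unsigned long gpa, int bytes)
{
	return ((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT);
}

int main(void)
{
	/* 4-byte write ending at 0x1fff: stays within one page */
	printf("%d\n", crosses_page(0x1ffc, 4));	/* 0 */
	/* 4-byte write ending at 0x2000: spills into the next page */
	printf("%d\n", crosses_page(0x1ffd, 4));	/* 1 */
	return 0;
}

So any straddling write currently falls through to the MMIO path, even
when it targets plain RAM.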
I've preserved this logic in the following patch, but I don't understand
why it's correct :|

--- a/kernel/kvm_main.c	2007-06-15 21:18:08.000000000 +0200
+++ b/kernel/kvm_main.c	2007-06-15 23:34:13.000000000 +0200
@@ -1125,26 +1125,39 @@
 	}
 }
 
-static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
-			       const void *val, int bytes)
+static int emulator_write_phys_mem(struct kvm_vcpu *vcpu, gpa_t gpa,
+				   struct page *page, const void *val,
+				   int bytes)
 {
-	struct page *page;
 	void *virt;
-	unsigned offset = offset_in_page(gpa);
+	unsigned int offset;
+
+	offset = offset_in_page(gpa);
 
 	if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
 		return 0;
-	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-	if (!page)
-		return 0;
+
 	mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
 	virt = kmap_atomic(page, KM_USER0);
 	kvm_mmu_pte_write(vcpu, gpa, virt + offset, val, bytes);
 	memcpy(virt + offset_in_page(gpa), val, bytes);
 	kunmap_atomic(virt, KM_USER0);
+
 	return 1;
 }
 
+static int emulator_write_phys_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+				    const void *val, int bytes)
+{
+	vcpu->mmio_needed = 1;
+	vcpu->mmio_phys_addr = gpa;
+	vcpu->mmio_size = bytes;
+	vcpu->mmio_is_write = 1;
+	memcpy(vcpu->mmio_data, val, bytes);
+
+	return 0;
+}
+
 static int emulator_write_emulated(unsigned long addr,
 				   const void *val,
 				   unsigned int bytes,
@@ -1152,20 +1165,18 @@
 {
 	struct kvm_vcpu *vcpu = ctxt->vcpu;
 	gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+	struct page *page;
 
 	if (gpa == UNMAPPED_GVA) {
 		kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 
-	if (emulator_write_phys(vcpu, gpa, val, bytes))
+	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (page && emulator_write_phys_mem(vcpu, gpa, page, val, bytes))
 		return X86EMUL_CONTINUE;
 
-	vcpu->mmio_needed = 1;
-	vcpu->mmio_phys_addr = gpa;
-	vcpu->mmio_size = bytes;
-	vcpu->mmio_is_write = 1;
-	memcpy(vcpu->mmio_data, val, bytes);
+	emulator_write_phys_mmio(vcpu, gpa, val, bytes);
 
 	return X86EMUL_CONTINUE;
 }
@@ -1177,12 +1188,37 @@
 				     struct x86_emulate_ctxt *ctxt)
 {
 	static int reported;
+	struct kvm_vcpu *vcpu;
+	gpa_t gpa;
+	struct page *page;
 
 	if (!reported) {
 		reported = 1;
 		printk(KERN_WARNING "kvm: emulating exchange as write\n");
 	}
-	return emulator_write_emulated(addr, new, bytes, ctxt);
+
+	vcpu = ctxt->vcpu;
+	gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
+
+	if (gpa == UNMAPPED_GVA) {
+		kvm_arch_ops->inject_page_fault(vcpu, addr, 2);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+
+	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (page) {
+		/* Regular memory */
+		if (!memcmp(old, new, bytes))
+			return X86EMUL_CONTINUE;
+
+		if (emulator_write_phys_mem(vcpu, gpa, page, new, bytes))
+			return X86EMUL_CONTINUE;
+	}
+
+	/* MMIO */
+	emulator_write_phys_mmio(vcpu, gpa, new, bytes);
+
+	return X86EMUL_CONTINUE;
 }
 
 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)

Both patches apply to kvm-28 and have been run-time tested with a 32-bit
guest on a 32-bit host, on a VMX processor. If the patches look good I'll
resubmit with a proper changelog and Signed-off-by.

Luca
-- 
The trouble with computers is that they do what you tell them,
not what you want.
		D. Cohen