The TCG TLB keeps a TLB_NOTDIRTY bit that must be kept coherent with
the CPU physical memory dirty bitmaps. If any of a page's dirty bits
are clear, TLB_NOTDIRTY *must* be set in all CPUs' TLB entries that
map the page, so that the notdirty_write() slowpath is guaranteed to
be called on a write access.
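
To make the mechanism concrete, here is a self-contained toy model
(illustrative names only, simplified from how TCG folds flag bits into
the low bits of a TLB entry's addr_write so that any set flag makes
the fast-path comparison fail):

    /*
     * Toy model of the invariant, not QEMU code. A set TLB_NOTDIRTY
     * makes the fast-path page comparison fail, so the store is
     * diverted to the slow path, which does the dirty bookkeeping.
     */
    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_MASK    (~(uintptr_t)0xfff)
    #define TLB_NOTDIRTY ((uintptr_t)1 << 0)   /* illustrative bit */

    static bool page_dirty;   /* stands in for the dirty bitmap */

    static void store(uintptr_t *addr_write, uintptr_t vaddr)
    {
        if ((vaddr & PAGE_MASK) == *addr_write) {
            /* Fast path: all flags clear, plain host store. */
        } else {
            /* Slow path, as in notdirty_write(): set the dirty bits,
             * then drop TLB_NOTDIRTY so later stores go fast again. */
            page_dirty = true;
            *addr_write &= ~TLB_NOTDIRTY;
        }
    }

    int main(void)
    {
        uintptr_t vaddr = 0x1000;
        uintptr_t addr_write = vaddr | TLB_NOTDIRTY;  /* clean page */

        store(&addr_write, vaddr);  /* forced through the slow path */
        assert(page_dirty);         /* bookkeeping was not missed */
        store(&addr_write, vaddr);  /* now hits the fast path */
        return 0;
    }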
TLB_NOTDIRTY may be set even when none of the bits are clear; that
just results in a superfluous notdirty_write() call (which then
removes the bit).

There is a race between clearing physical dirty bits and setting
TLB_NOTDIRTY on one side, versus creating or updating TLB entries
without the TLB_NOTDIRTY bit on the other, which can cause the bit to
get lost and notdirty_write() updates to be missed:

notdirty_write()
  1. cpu_physical_memory_set_dirty_range()
  2. if (!cpu_physical_memory_is_clean())
  3.     tlb_set_dirty()

vs

cpu_physical_memory_test_and_clear_dirty()
  A. dirty = bitmap_test_and_clear_atomic()
     if (dirty)
  B.     tlb_reset_dirty_range_all()

Step 1 executes, then step 2 finds no bits clean, then A clears the
dirty bit and runs B to set TLB_NOTDIRTY on the TLB entries, then
step 3 clears the TLB_NOTDIRTY bits from the TLB entries again.

This leaves the physical dirty bitmap with a clear bit while a TLB
entry points to that page without TLB_NOTDIRTY, which means stores
through that TLB entry are performed without the notdirty_write()
call to set dirty bits (or invalidate TBs).

Fix this by checking the physical dirty bitmap while holding the TLB
lock that also covers TLB entry insertion / TLB_NOTDIRTY clearing; a
sketch of the resulting locking rule follows.
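
A minimal sketch of that locking rule (toy state, with a pthread
mutex standing in for the QEMU TLB spinlock; all names here are
illustrative, not the patched cputlb.c code):

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t tlb_lock = PTHREAD_MUTEX_INITIALIZER;
    static bool page_clean;    /* stands in for the dirty bitmap */
    static bool tlb_notdirty;  /* stands in for TLB_NOTDIRTY */

    /* tlb_try_set_dirty() analogue: the clean check and the flag
     * clearing form one critical section. */
    static void try_set_dirty(void)
    {
        pthread_mutex_lock(&tlb_lock);
        if (!page_clean) {
            tlb_notdirty = false;
        }
        pthread_mutex_unlock(&tlb_lock);
    }

    /* Dirty-clearing side: the bitmap may be cleared outside the
     * lock (step A), but TLB_NOTDIRTY is set under it (step B). */
    static void clear_dirty(void)
    {
        page_clean = true;                 /* A */
        pthread_mutex_lock(&tlb_lock);
        tlb_notdirty = true;               /* B */
        pthread_mutex_unlock(&tlb_lock);
    }

Whichever critical section runs second now wins consistently: if
try_set_dirty() runs after B it observes the page as clean and leaves
the bit alone; if it runs before B, then B re-sets the bit afterwards.
Either way a clean page cannot end up mapped without TLB_NOTDIRTY.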
Signed-off-by: Nicholas Piggin <npig...@gmail.com>
---
Hi Thomas,

This is the other dirty bitmap race I saw, if you are hunting
migration bugs...

Thanks,
Nick

 include/exec/exec-all.h |  1 -
 accel/stubs/tcg-stub.c  |  4 ----
 accel/tcg/cputlb.c      | 47 +++++++++++++++++++++++++++++------------
 3 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index ce36bb10d4..ade81b25f0 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -654,7 +654,6 @@ static inline void mmap_unlock(void) {}
 #define WITH_MMAP_LOCK_GUARD()
 
 void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length);
-void tlb_set_dirty(CPUState *cpu, vaddr addr);
 
 MemoryRegionSection *
 address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr,
diff --git a/accel/stubs/tcg-stub.c b/accel/stubs/tcg-stub.c
index 8a496a2a6f..dd890d6cf6 100644
--- a/accel/stubs/tcg-stub.c
+++ b/accel/stubs/tcg-stub.c
@@ -18,10 +18,6 @@ void tb_flush(CPUState *cpu)
 {
 }
 
-void tlb_set_dirty(CPUState *cpu, vaddr vaddr)
-{
-}
-
 int probe_access_flags(CPUArchState *env, vaddr addr, int size,
                        MMUAccessType access_type, int mmu_idx,
                        bool nonfault, void **phost, uintptr_t retaddr)
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 047cd2cc0a..078f4ef166 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -1001,10 +1001,17 @@ static inline void copy_tlb_helper_locked(CPUTLBEntry *d, const CPUTLBEntry *s)
     *d = *s;
 }
 
-/* This is a cross vCPU call (i.e. another vCPU resetting the flags of
+/*
+ * This is a cross vCPU call (i.e. another vCPU resetting the flags of
  * the target vCPU).
  * We must take tlb_c.lock to avoid racing with another vCPU update. The only
  * thing actually updated is the target TLB entry ->addr_write flags.
+ *
+ * This can race between the physical memory dirty bits becoming all set and
+ * tlb_set_page_full or tlb_try_set_dirty creating dirty entries: it is
+ * harmless to set TLB_NOTDIRTY on a !clean physical page, it might just
+ * cause a notdirty_write() slowpath on the next write which clears out the
+ * spurious TLB_NOTDIRTY.
  */
 void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
 {
@@ -1037,9 +1044,11 @@ static inline void tlb_set_dirty1_locked(CPUTLBEntry *tlb_entry,
     }
 }
 
-/* update the TLB corresponding to virtual page vaddr
-   so that it is no longer dirty */
-void tlb_set_dirty(CPUState *cpu, vaddr addr)
+/*
+ * Update the TLB corresponding to virtual page vaddr if it no longer
+ * requires the TLB_NOTDIRTY bit.
+ */
+static void tlb_try_set_dirty(CPUState *cpu, vaddr addr, ram_addr_t ram_addr)
 {
     int mmu_idx;
 
@@ -1047,6 +1056,12 @@ void tlb_set_dirty(CPUState *cpu, vaddr addr)
 
     addr &= TARGET_PAGE_MASK;
     qemu_spin_lock(&cpu->neg.tlb.c.lock);
+    if (cpu_physical_memory_is_clean(ram_addr)) {
+        /* Must be checked under lock, like tlb_set_page_full */
+        qemu_spin_unlock(&cpu->neg.tlb.c.lock);
+        return;
+    }
+
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
         tlb_set_dirty1_locked(tlb_entry(cpu, mmu_idx, addr), addr);
     }
@@ -1165,6 +1180,19 @@ void tlb_set_page_full(CPUState *cpu, int mmu_idx,
         addend = 0;
     }
 
+    /*
+     * Hold the TLB lock for the rest of the function. We could acquire/release
+     * the lock several times in the function, but it is faster to amortize the
+     * acquisition cost by acquiring it just once. Note that this leads to
+     * a longer critical section, but this is not a concern since the TLB lock
+     * is unlikely to be contended.
+     *
+     * The TLB lock must be held over checking cpu_physical_memory_is_clean
+     * and inserting the entry into the TLB, so clearing physical memory
+     * dirty status can find and set TLB_NOTDIRTY on the entries.
+     */
+    qemu_spin_lock(&tlb->c.lock);
+
     write_flags = read_flags;
     if (is_ram) {
         iotlb = memory_region_get_ram_addr(section->mr) + xlat;
@@ -1200,15 +1228,6 @@ void tlb_set_page_full(CPUState *cpu, int mmu_idx,
     index = tlb_index(cpu, mmu_idx, addr_page);
     te = tlb_entry(cpu, mmu_idx, addr_page);
 
-    /*
-     * Hold the TLB lock for the rest of the function. We could acquire/release
-     * the lock several times in the function, but it is faster to amortize the
-     * acquisition cost by acquiring it just once. Note that this leads to
-     * a longer critical section, but this is not a concern since the TLB lock
-     * is unlikely to be contended.
-     */
-    qemu_spin_lock(&tlb->c.lock);
-
     /* Note that the tlb is no longer clean. */
     tlb->c.dirty |= 1 << mmu_idx;
 
@@ -1409,7 +1428,7 @@ static void notdirty_write(CPUState *cpu, vaddr mem_vaddr, unsigned size,
     /* We remove the notdirty callback only if the code has been flushed. */
     if (!cpu_physical_memory_is_clean(ram_addr)) {
         trace_memory_notdirty_set_dirty(mem_vaddr);
-        tlb_set_dirty(cpu, mem_vaddr);
+        tlb_try_set_dirty(cpu, mem_vaddr, ram_addr);
     }
 }
 
-- 
2.42.0