When changing a kernel page from RO->RW, it's OK to leave stale TLB
entries around, since doing a global flush is expensive and they pose
no security problem.  They can, however, generate a spurious fault,
which we should catch and simply return from (which will have the
side-effect of reloading the TLB to the current PTE).

This can occur when running under Xen, because it frequently changes
kernel pages from RW->RO->RW to implement Xen's pagetable semantics.
It could also occur when using CONFIG_DEBUG_PAGEALLOC, since it avoids
doing a global TLB flush after changing page permissions.

Signed-off-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
Cc: Harvey Harrison <[EMAIL PROTECTED]>
---
arch/x86/mm/fault_32.c |   50 ++++++++++++++++++++++++++++++++++++++++++++++++
arch/x86/mm/fault_64.c |   50 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 100 insertions(+)

===================================================================
--- a/arch/x86/mm/fault_32.c
+++ b/arch/x86/mm/fault_32.c
@@ -324,6 +324,51 @@ static int is_f00f_bug(struct pt_regs *r
}

/*
+ * Handle a spurious fault caused by a stale TLB entry.  This allows
+ * us to lazily refresh the TLB when increasing the permissions of a
+ * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
+ * expensive since that implies doing a full cross-processor TLB
+ * flush, even if no stale TLB entries exist on other processors.
+ * There are no security implications to leaving a stale TLB when
+ * increasing the permissions on a page.
+ */
+static int spurious_fault(unsigned long address,
+                         unsigned long error_code)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       /* Reserved-bit violation or user access to kernel space? */
+       if (error_code & (PF_USER | PF_RSVD))
+               return 0;
+
+       pgd = init_mm.pgd + pgd_index(address);
+       if (!pgd_present(*pgd))
+               return 0;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return 0;
+
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd))
+               return 0;
+
+       pte = pte_offset_kernel(pmd, address);
+       if (!pte_present(*pte))
+               return 0;
+
+       if ((error_code & PF_WRITE) && !pte_write(*pte))
+               return 0;
+       if ((error_code & PF_INSTR) && !pte_exec(*pte))
+               return 0;
+
+       return 1;
+}
+
+/*
 * Handle a fault on the vmalloc or module mapping area
 *
 * This assumes no large pages in there.
@@ -446,6 +491,11 @@ void __kprobes do_page_fault(struct pt_r
                if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
                    vmalloc_fault(address) >= 0)
                        return;
+
+               /* Can handle a stale RO->RW TLB */
+               if (spurious_fault(address, error_code))
+                       return;
+
                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
===================================================================
--- a/arch/x86/mm/fault_64.c
+++ b/arch/x86/mm/fault_64.c
@@ -312,6 +312,51 @@ static noinline void pgtable_bad(unsigne
}

/*
+ * Handle a spurious fault caused by a stale TLB entry.  This allows
+ * us to lazily refresh the TLB when increasing the permissions of a
+ * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
+ * expensive since that implies doing a full cross-processor TLB
+ * flush, even if no stale TLB entries exist on other processors.
+ * There are no security implications to leaving a stale TLB when
+ * increasing the permissions on a page.
+ */
+static int spurious_fault(unsigned long address,
+                         unsigned long error_code)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       /* Reserved-bit violation or user access to kernel space? */
+       if (error_code & (PF_USER | PF_RSVD))
+               return 0;
+
+       pgd = init_mm.pgd + pgd_index(address);
+       if (!pgd_present(*pgd))
+               return 0;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return 0;
+
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd))
+               return 0;
+
+       pte = pte_offset_kernel(pmd, address);
+       if (!pte_present(*pte))
+               return 0;
+
+       if ((error_code & PF_WRITE) && !pte_write(*pte))
+               return 0;
+       if ((error_code & PF_INSTR) && !pte_exec(*pte))
+               return 0;
+
+       return 1;
+}
+
+/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
@@ -443,6 +488,11 @@ asmlinkage void __kprobes do_page_fault(
                        if (vmalloc_fault(address) >= 0)
                                return;
                }
+
+               /* Can handle a stale RO->RW TLB */
+               if (spurious_fault(address, error_code))
+                       return;
+
                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to