# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxxxx
# Node ID 533bad7c0883189e26c2a7f43011801c417b01fe
# Parent e1ae7b3cb5b73f11bed3a51a7f4ded85c30cffd8
[LINUX] Add spurious page-fault detection, intended primarily
for spurious write faults on mappings that have been
changed from read-only to writable. If a CPU has a stale
read-only entry in its TLB, it is allowed to fault on
the next write access without re-walking the page table.
Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
---
linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c | 51 ++++++++++++++++++++++
linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c | 54 ++++++++++++++++++++++--
2 files changed, 101 insertions(+), 4 deletions(-)
diff -r e1ae7b3cb5b7 -r 533bad7c0883
linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c Fri Jun 16 18:18:55
2006 +0100
+++ b/linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c Fri Jun 16 18:19:40
2006 +0100
@@ -273,6 +273,49 @@ static void dump_fault_path(unsigned lon
}
#endif
+static int spurious_fault(struct pt_regs *regs,
+ unsigned long address,
+ unsigned long error_code)
+{ /* Returns 1 if this is a spurious fault from a stale R/O TLB entry; safe to ignore. */
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+#ifdef CONFIG_XEN
+ /* Faults in hypervisor area are never spurious. */
+ if (address >= HYPERVISOR_VIRT_START)
+ return 0;
+#endif
+
+ /* Reserved-bit violation or user access to kernel space? */
+ if (error_code & 0x0c) /* bit 2 = user-mode access, bit 3 = reserved bit set */
+ return 0;
+
+ pgd = init_mm.pgd + pgd_index(address);
+ if (!pgd_present(*pgd))
+ return 0;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return 0;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return 0;
+
+ pte = pte_offset_kernel(pmd, address);
+ if (!pte_present(*pte))
+ return 0;
+ if ((error_code & 0x02) && !pte_write(*pte)) /* bit 1 = write: fault is real if PTE is still R/O */
+ return 0;
+#ifdef CONFIG_X86_PAE
+ if ((error_code & 0x10) && (pte_val(*pte) & _PAGE_NX)) /* bit 4 = ifetch: real if page is still NX */
+ return 0;
+#endif
+
+ return 1;
+}
/*
* This routine handles page faults. It determines the address,
@@ -327,8 +370,16 @@ fastcall void __kprobes do_page_fault(st
* protection error (error_code & 1) == 0.
*/
if (unlikely(address >= TASK_SIZE)) {
+#ifdef CONFIG_XEN
+ /* Faults in hypervisor area can never be patched up. */
+ if (address >= HYPERVISOR_VIRT_START)
+ goto bad_area_nosemaphore;
+#endif
if (!(error_code & 5))
goto vmalloc_fault;
+ /* Can take a spurious fault if mapping changes R/O -> R/W. */
+ if (spurious_fault(regs, address, error_code))
+ return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock.
diff -r e1ae7b3cb5b7 -r 533bad7c0883
linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c Fri Jun 16 18:18:55
2006 +0100
+++ b/linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c Fri Jun 16 18:19:40
2006 +0100
@@ -307,6 +307,49 @@ int exception_trace = 1;
#define MEM_LOG(_f, _a...) ((void)0)
#endif
+static int spurious_fault(struct pt_regs *regs,
+ unsigned long address,
+ unsigned long error_code)
+{ /* Returns 1 if this is a spurious fault from a stale R/O TLB entry; safe to ignore. */
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+#ifdef CONFIG_XEN
+ /* Faults in hypervisor area are never spurious. */
+ if ((address >= HYPERVISOR_VIRT_START) &&
+ (address < HYPERVISOR_VIRT_END))
+ return 0;
+#endif
+
+ /* Reserved-bit violation or user access to kernel space? */
+ if (error_code & (PF_RSVD|PF_USER)) /* parens required: '&' binds tighter than '|' */
+ return 0;
+
+ pgd = init_mm.pgd + pgd_index(address);
+ if (!pgd_present(*pgd))
+ return 0;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return 0;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return 0;
+
+ pte = pte_offset_kernel(pmd, address);
+ if (!pte_present(*pte))
+ return 0;
+ if ((error_code & PF_WRITE) && !pte_write(*pte)) /* write fault is real if PTE still R/O */
+ return 0;
+ if ((error_code & PF_INSTR) && (pte_val(*pte) & _PAGE_NX)) /* ifetch fault is real if page still NX */
+ return 0;
+
+ return 1;
+}
+
/*
* This routine handles page faults. It determines the address,
* and the problem, and then passes it off to one of the appropriate
@@ -361,16 +404,19 @@ asmlinkage void __kprobes do_page_fault(
*/
if (unlikely(address >= TASK_SIZE64)) {
/*
- * Must check for the entire kernel range here: with writable
- * page tables the hypervisor may temporarily clear PMD
- * entries.
+ * Don't check for the module range here: its PML4
+ * is always initialized because it's shared with the main
+ * kernel text. Only vmalloc may need PML4 syncups.
*/
if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
- address >= PAGE_OFFSET) {
+ ((address >= VMALLOC_START && address < VMALLOC_END))) {
if (vmalloc_fault(address) < 0)
goto bad_area_nosemaphore;
return;
}
+ /* Can take a spurious fault if mapping changes R/O -> R/W. */
+ if (spurious_fault(regs, address, error_code))
+ return;
/*
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock.
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|