# HG changeset patch
# User Keir Fraser <keir@xxxxxxxxxxxxx>
# Date 1191577746 -3600
# Node ID 6e26ffc60647bd7454d0a066a8ab63ef7f0123af
# Parent ac1f33f633ba158a5427f24dbc31a1ee573a02b7
linux: allow use of split page table locks

This fixes the race condition previously experienced between
(un)pinning and vmscan.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxxxx>
---
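Note on the locking scheme: the new _pin_lock() helper serializes the
(un)pin transition against concurrent pte updates by taking the mm-wide
page_table_lock plus every split pte lock of the mm, relying on the
invariant that no other code path ever nests two pte locks of one mm or
takes a pte lock while already holding page_table_lock. The standalone
pthreads program below is only a userspace analogy of that discipline,
not kernel code; the names (coarse, fine, pin_all, update_one, NFINE)
are illustrative.

/*
 * Userspace analogy (illustrative only): a coarse lock guards a set of
 * fine-grained locks.  Updaters take exactly one fine lock; the "pin"
 * path takes the coarse lock and then all fine locks, so it observes
 * every entry in a consistent state.  Since no thread ever holds two
 * fine locks, or a fine lock while acquiring the coarse one, taking
 * them all here cannot deadlock in any order.
 */
#include <pthread.h>
#include <stdio.h>

#define NFINE 8					/* stand-in for per-page pte locks */

static pthread_mutex_t coarse = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t fine[NFINE];
static int entry[NFINE];			/* stand-in for pte contents */

/* Updater path: like a pte update, holds exactly one fine lock. */
static void update_one(int i, int val)
{
	pthread_mutex_lock(&fine[i]);
	entry[i] = val;
	pthread_mutex_unlock(&fine[i]);
}

/* Pin path: the analogue of pin_lock(mm)/pin_unlock(mm) around a
 * state transition such as re-protecting the page tables. */
static void pin_all(void (*transition)(void))
{
	int i;

	pthread_mutex_lock(&coarse);
	for (i = 0; i < NFINE; i++)
		pthread_mutex_lock(&fine[i]);

	transition();

	for (i = 0; i < NFINE; i++)
		pthread_mutex_unlock(&fine[i]);
	pthread_mutex_unlock(&coarse);
}

static void dump(void)
{
	int i;

	for (i = 0; i < NFINE; i++)
		printf("entry[%d] = %d\n", i, entry[i]);
}

static void *updater(void *arg)
{
	int i;

	(void)arg;
	for (i = 0; i < NFINE; i++)
		update_one(i, i * 10);
	return NULL;
}

int main(void)
{
	pthread_t t;
	int i;

	for (i = 0; i < NFINE; i++)
		pthread_mutex_init(&fine[i], NULL);

	pthread_create(&t, NULL, updater, NULL);
	pin_all(dump);		/* each entry seen fully updated or untouched */
	pthread_join(t, NULL);
	return 0;
}

Build with "cc -pthread"; the point is merely that taking all fine
locks under the coarse lock cannot deadlock when no other path nests
them, which is the argument the comment in _pin_lock() makes.
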
arch/i386/mm/pgtable-xen.c | 66 +++++++++++++++++++++++++++++++++++++++---
arch/x86_64/mm/pageattr-xen.c | 66 +++++++++++++++++++++++++++++++++++++++---
mm/Kconfig | 3 -
3 files changed, 124 insertions(+), 11 deletions(-)
diff -r ac1f33f633ba -r 6e26ffc60647 arch/i386/mm/pgtable-xen.c
--- a/arch/i386/mm/pgtable-xen.c Wed Oct 03 15:02:54 2007 +0100
+++ b/arch/i386/mm/pgtable-xen.c Fri Oct 05 10:49:06 2007 +0100
@@ -494,6 +494,64 @@ void make_pages_writable(void *va, unsig
}
}
+static void _pin_lock(struct mm_struct *mm, int lock) {
+	if (lock)
+		spin_lock(&mm->page_table_lock);
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+	/* While mm->page_table_lock protects us against insertions and
+	 * removals of higher level page table pages, it doesn't protect
+	 * against updates of PTEs. Such updates, however, require the
+	 * pte pages to be in a consistent state (unpinned+writable or
+	 * pinned+readonly). The pinning and attribute changes, however,
+	 * cannot be done atomically, which is why such updates must be
+	 * prevented from happening concurrently.
+	 * Note that no pte lock can ever be acquired elsewhere nested
+	 * inside an already acquired one in the same mm, or nested
+	 * inside the mm's page_table_lock, as that would break in the
+	 * non-split case (where all of these resolve to the single
+	 * page_table_lock). Thus acquiring all of them here cannot
+	 * result in deadlocks, and the order of acquisition doesn't
+	 * matter.
+	 */
+	{
+		pgd_t *pgd = mm->pgd;
+		unsigned g;
+
+		for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
+			pud_t *pud;
+			unsigned u;
+
+			if (pgd_none(*pgd))
+				continue;
+			pud = pud_offset(pgd, 0);
+			for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+				pmd_t *pmd;
+				unsigned m;
+
+				if (pud_none(*pud))
+					continue;
+				pmd = pmd_offset(pud, 0);
+				for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+					spinlock_t *ptl;
+
+					if (pmd_none(*pmd))
+						continue;
+					ptl = pte_lockptr(0, pmd);
+					if (lock)
+						spin_lock(ptl);
+					else
+						spin_unlock(ptl);
+				}
+			}
+		}
+	}
+#endif
+	if (!lock)
+		spin_unlock(&mm->page_table_lock);
+}
+#define pin_lock(mm) _pin_lock(mm, 1)
+#define pin_unlock(mm) _pin_lock(mm, 0)
+
static inline void pgd_walk_set_prot(struct page *page, pgprot_t flags)
{
unsigned long pfn = page_to_pfn(page);
@@ -576,18 +634,18 @@ void mm_pin(struct mm_struct *mm)
{
if (xen_feature(XENFEAT_writable_page_tables))
return;
-	spin_lock(&mm->page_table_lock);
+	pin_lock(mm);
 	__pgd_pin(mm->pgd);
-	spin_unlock(&mm->page_table_lock);
+	pin_unlock(mm);
}
void mm_unpin(struct mm_struct *mm)
{
if (xen_feature(XENFEAT_writable_page_tables))
return;
-	spin_lock(&mm->page_table_lock);
+	pin_lock(mm);
 	__pgd_unpin(mm->pgd);
-	spin_unlock(&mm->page_table_lock);
+	pin_unlock(mm);
}
void mm_pin_all(void)
diff -r ac1f33f633ba -r 6e26ffc60647 arch/x86_64/mm/pageattr-xen.c
--- a/arch/x86_64/mm/pageattr-xen.c Wed Oct 03 15:02:54 2007 +0100
+++ b/arch/x86_64/mm/pageattr-xen.c Fri Oct 05 10:49:06 2007 +0100
@@ -19,6 +19,64 @@
LIST_HEAD(mm_unpinned);
DEFINE_SPINLOCK(mm_unpinned_lock);
+
+static void _pin_lock(struct mm_struct *mm, int lock) {
+	if (lock)
+		spin_lock(&mm->page_table_lock);
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+	/* While mm->page_table_lock protects us against insertions and
+	 * removals of higher level page table pages, it doesn't protect
+	 * against updates of PTEs. Such updates, however, require the
+	 * pte pages to be in a consistent state (unpinned+writable or
+	 * pinned+readonly). The pinning and attribute changes, however,
+	 * cannot be done atomically, which is why such updates must be
+	 * prevented from happening concurrently.
+	 * Note that no pte lock can ever be acquired elsewhere nested
+	 * inside an already acquired one in the same mm, or nested
+	 * inside the mm's page_table_lock, as that would break in the
+	 * non-split case (where all of these resolve to the single
+	 * page_table_lock). Thus acquiring all of them here cannot
+	 * result in deadlocks, and the order of acquisition doesn't
+	 * matter.
+	 */
+	{
+		pgd_t *pgd = mm->pgd;
+		unsigned g;
+
+		for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+			pud_t *pud;
+			unsigned u;
+
+			if (pgd_none(*pgd))
+				continue;
+			pud = pud_offset(pgd, 0);
+			for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+				pmd_t *pmd;
+				unsigned m;
+
+				if (pud_none(*pud))
+					continue;
+				pmd = pmd_offset(pud, 0);
+				for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+					spinlock_t *ptl;
+
+					if (pmd_none(*pmd))
+						continue;
+					ptl = pte_lockptr(0, pmd);
+					if (lock)
+						spin_lock(ptl);
+					else
+						spin_unlock(ptl);
+				}
+			}
+		}
+	}
+#endif
+	if (!lock)
+		spin_unlock(&mm->page_table_lock);
+}
+#define pin_lock(mm) _pin_lock(mm, 1)
+#define pin_unlock(mm) _pin_lock(mm, 0)
static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
{
@@ -76,7 +134,7 @@ void mm_pin(struct mm_struct *mm)
if (xen_feature(XENFEAT_writable_page_tables))
return;
-	spin_lock(&mm->page_table_lock);
+	pin_lock(mm);
mm_walk(mm, PAGE_KERNEL_RO);
if (HYPERVISOR_update_va_mapping(
@@ -97,7 +155,7 @@ void mm_pin(struct mm_struct *mm)
list_del(&mm->context.unpinned);
spin_unlock(&mm_unpinned_lock);
-	spin_unlock(&mm->page_table_lock);
+	pin_unlock(mm);
}
void mm_unpin(struct mm_struct *mm)
@@ -105,7 +163,7 @@ void mm_unpin(struct mm_struct *mm)
if (xen_feature(XENFEAT_writable_page_tables))
return;
-	spin_lock(&mm->page_table_lock);
+	pin_lock(mm);
xen_pgd_unpin(__pa(mm->pgd));
xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
@@ -125,7 +183,7 @@ void mm_unpin(struct mm_struct *mm)
list_add(&mm->context.unpinned, &mm_unpinned);
spin_unlock(&mm_unpinned_lock);
-	spin_unlock(&mm->page_table_lock);
+	pin_unlock(mm);
}
void mm_pin_all(void)
diff -r ac1f33f633ba -r 6e26ffc60647 mm/Kconfig
--- a/mm/Kconfig Wed Oct 03 15:02:54 2007 +0100
+++ b/mm/Kconfig Fri Oct 05 10:49:06 2007 +0100
@@ -127,14 +127,11 @@ comment "Memory hotplug is currently inc
# Default to 4 for wider testing, though 8 might be more appropriate.
# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
-# XEN on x86 architecture uses the mapping field on pagetable pages to store a
-# pointer to the destructor. This conflicts with pte_lock_deinit().
#
config SPLIT_PTLOCK_CPUS
 	int
 	default "4096" if ARM && !CPU_CACHE_VIPT
 	default "4096" if PARISC && !PA20
-	default "4096" if X86_XEN || X86_64_XEN
 	default "4"
#