WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH] linux: allow use of split page table locks

To: <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH] linux: allow use of split page table locks
From: "Jan Beulich" <jbeulich@xxxxxxxxxx>
Date: Mon, 26 Mar 2007 15:59:49 +0100
Delivery-date: Mon, 26 Mar 2007 07:58:16 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
This fixes the race condition previously experienced between (un)pinning
and vmscan.

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxxxx>

Index: head-2007-03-19/arch/i386/mm/pgtable-xen.c
===================================================================
--- head-2007-03-19.orig/arch/i386/mm/pgtable-xen.c     2007-03-21 
10:19:07.000000000 +0100
+++ head-2007-03-19/arch/i386/mm/pgtable-xen.c  2007-03-21 11:51:37.000000000 
+0100
@@ -574,6 +574,64 @@ void make_pages_writable(void *va, unsig
        }
 }
 
+static void _pin_lock(struct mm_struct *mm, int lock) {
+       if (lock)
+               spin_lock(&mm->page_table_lock);
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+       /* While mm->page_table_lock protects us against insertions and
+        * removals of higher level page table pages, it doesn't protect
+        * against updates of pte-s. Such updates, however, require the
+        * pte pages to be in consistent state (unpinned+writable or
+        * pinned+readonly). The pinning and attribute changes, however
+        * cannot be done atomically, which is why such updates must be
+        * prevented from happening concurrently.
+        * Note that no pte lock can ever elsewhere be acquired nesting
+        * with an already acquired one in the same mm, or with the mm's
+        * page_table_lock already acquired, as that would break in the
+        * non-split case (where all these are actually resolving to the
+        * one page_table_lock). Thus acquiring all of them here is not
+        * going to result in dead locks, and the order of acquires
+        * doesn't matter.
+        */
+       {
+               pgd_t *pgd = mm->pgd;
+               unsigned g;
+
+               for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
+                       pud_t *pud;
+                       unsigned u;
+
+                       if (pgd_none(*pgd))
+                               continue;
+                       pud = pud_offset(pgd, 0);
+                       for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+                               pmd_t *pmd;
+                               unsigned m;
+
+                               if (pud_none(*pud))
+                                       continue;
+                               pmd = pmd_offset(pud, 0);
+                               for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+                                       spinlock_t *ptl;
+
+                                       if (pmd_none(*pmd))
+                                               continue;
+                                       ptl = pte_lockptr(0, pmd);
+                                       if (lock)
+                                               spin_lock(ptl);
+                                       else
+                                               spin_unlock(ptl);
+                               }
+                       }
+               }
+       }
+#endif
+       if (!lock)
+               spin_unlock(&mm->page_table_lock);
+}
+#define pin_lock(mm) _pin_lock(mm, 1)
+#define pin_unlock(mm) _pin_lock(mm, 0)
+
 static inline void pgd_walk_set_prot(struct page *page, pgprot_t flags)
 {
        unsigned long pfn = page_to_pfn(page);
@@ -656,18 +714,18 @@ void mm_pin(struct mm_struct *mm)
 {
        if (xen_feature(XENFEAT_writable_page_tables))
                return;
-       spin_lock(&mm->page_table_lock);
+       pin_lock(mm);
        __pgd_pin(mm->pgd);
-       spin_unlock(&mm->page_table_lock);
+       pin_unlock(mm);
 }
 
 void mm_unpin(struct mm_struct *mm)
 {
        if (xen_feature(XENFEAT_writable_page_tables))
                return;
-       spin_lock(&mm->page_table_lock);
+       pin_lock(mm);
        __pgd_unpin(mm->pgd);
-       spin_unlock(&mm->page_table_lock);
+       pin_unlock(mm);
 }
 
 void mm_pin_all(void)
Index: head-2007-03-19/arch/x86_64/mm/pageattr-xen.c
===================================================================
--- head-2007-03-19.orig/arch/x86_64/mm/pageattr-xen.c  2007-03-21 
10:19:07.000000000 +0100
+++ head-2007-03-19/arch/x86_64/mm/pageattr-xen.c       2007-03-21 
11:50:13.000000000 +0100
@@ -20,6 +20,64 @@
 LIST_HEAD(mm_unpinned);
 DEFINE_SPINLOCK(mm_unpinned_lock);
 
+static void _pin_lock(struct mm_struct *mm, int lock) {
+       if (lock)
+               spin_lock(&mm->page_table_lock);
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+       /* While mm->page_table_lock protects us against insertions and
+        * removals of higher level page table pages, it doesn't protect
+        * against updates of pte-s. Such updates, however, require the
+        * pte pages to be in consistent state (unpinned+writable or
+        * pinned+readonly). The pinning and attribute changes, however
+        * cannot be done atomically, which is why such updates must be
+        * prevented from happening concurrently.
+        * Note that no pte lock can ever elsewhere be acquired nesting
+        * with an already acquired one in the same mm, or with the mm's
+        * page_table_lock already acquired, as that would break in the
+        * non-split case (where all these are actually resolving to the
+        * one page_table_lock). Thus acquiring all of them here is not
+        * going to result in dead locks, and the order of acquires
+        * doesn't matter.
+        */
+       {
+               pgd_t *pgd = mm->pgd;
+               unsigned g;
+
+               for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+                       pud_t *pud;
+                       unsigned u;
+
+                       if (pgd_none(*pgd))
+                               continue;
+                       pud = pud_offset(pgd, 0);
+                       for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+                               pmd_t *pmd;
+                               unsigned m;
+
+                               if (pud_none(*pud))
+                                       continue;
+                               pmd = pmd_offset(pud, 0);
+                               for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+                                       spinlock_t *ptl;
+
+                                       if (pmd_none(*pmd))
+                                               continue;
+                                       ptl = pte_lockptr(0, pmd);
+                                       if (lock)
+                                               spin_lock(ptl);
+                                       else
+                                               spin_unlock(ptl);
+                               }
+                       }
+               }
+       }
+#endif
+       if (!lock)
+               spin_unlock(&mm->page_table_lock);
+}
+#define pin_lock(mm) _pin_lock(mm, 1)
+#define pin_unlock(mm) _pin_lock(mm, 0)
+
 static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
 {
        struct page *page = virt_to_page(pt);
@@ -76,7 +134,7 @@ void mm_pin(struct mm_struct *mm)
        if (xen_feature(XENFEAT_writable_page_tables))
                return;
 
-       spin_lock(&mm->page_table_lock);
+       pin_lock(mm);
 
        mm_walk(mm, PAGE_KERNEL_RO);
        if (HYPERVISOR_update_va_mapping(
@@ -97,7 +155,7 @@ void mm_pin(struct mm_struct *mm)
        list_del(&mm->context.unpinned);
        spin_unlock(&mm_unpinned_lock);
 
-       spin_unlock(&mm->page_table_lock);
+       pin_unlock(mm);
 }
 
 void mm_unpin(struct mm_struct *mm)
@@ -105,7 +163,7 @@ void mm_unpin(struct mm_struct *mm)
        if (xen_feature(XENFEAT_writable_page_tables))
                return;
 
-       spin_lock(&mm->page_table_lock);
+       pin_lock(mm);
 
        xen_pgd_unpin(__pa(mm->pgd));
        xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
@@ -125,7 +183,7 @@ void mm_unpin(struct mm_struct *mm)
        list_add(&mm->context.unpinned, &mm_unpinned);
        spin_unlock(&mm_unpinned_lock);
 
-       spin_unlock(&mm->page_table_lock);
+       pin_unlock(mm);
 }
 
 void mm_pin_all(void)
Index: head-2007-03-19/mm/Kconfig
===================================================================
--- head-2007-03-19.orig/mm/Kconfig     2007-03-19 15:34:24.000000000 +0100
+++ head-2007-03-19/mm/Kconfig  2007-03-21 11:50:13.000000000 +0100
@@ -127,14 +127,11 @@ config MEMORY_HOTPLUG_SPARSE
 # Default to 4 for wider testing, though 8 might be more appropriate.
 # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
 # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
-# XEN on x86 architecture uses the mapping field on pagetable pages to store a
-# pointer to the destructor. This conflicts with pte_lock_deinit().
 #
 config SPLIT_PTLOCK_CPUS
        int
        default "4096" if ARM && !CPU_CACHE_VIPT
        default "4096" if PARISC && !PA20
-       default "4096" if X86_XEN || X86_64_XEN
        default "4"
 
 #



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

<Prev in Thread] Current Thread [Next in Thread>