This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
Home Products Support Community News


[Xen-devel] [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sy

To: "H. Peter Anvin" <hpa@xxxxxxxxx>
Subject: [Xen-devel] [PATCH] x86: hold mm->page_table_lock while doing vmalloc_sync
From: Jeremy Fitzhardinge <jeremy@xxxxxxxx>
Date: Thu, 14 Oct 2010 13:56:43 -0700
Cc: "Xen-devel@xxxxxxxxxxxxxxxxxxx" <Xen-devel@xxxxxxxxxxxxxxxxxxx>, the arch/x86 maintainers <x86@xxxxxxxxxx>, Linux Kernel Mailing List <linux-kernel@xxxxxxxxxxxxxxx>, Jan Beulich <JBeulich@xxxxxxxxxx>, Ian Campbell <Ian.Campbell@xxxxxxxxxx>
Delivery-date: Thu, 14 Oct 2010 13:57:47 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv: Gecko/20100921 Fedora/3.1.4-1.fc13 Lightning/1.0b3pre Thunderbird/3.1.4

Take mm->page_table_lock while syncing the vmalloc region.  This prevents
a race with the Xen pagetable pin/unpin code, which expects that the
page_table_lock is already held.  If this race occurs, then Xen can see
an inconsistent page type (a page can either be read/write or a pagetable
page, and pin/unpin converts it between them), which will cause either
the pin or the set_p[gm]d to fail; either will crash the kernel.

vmalloc_sync_all() should be called rarely, so this extra use of
page_table_lock should not interfere with its normal users.

The mm pointer is stashed in the pgd page's index field, as that won't
be otherwise used for pgd pages.

Bug reported by Ian Campbell <ian.cambell@xxxxxxxxxxxxx>
Derived from a patch by Jan Beulich <jbeulich@xxxxxxxxxx>

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@xxxxxxxxxx>

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a34c785..422b363 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / 
sizeof(unsigned long)];
 extern spinlock_t pgd_lock;
 extern struct list_head pgd_list;
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
 #include <asm/paravirt.h>
 #else  /* !CONFIG_PARAVIRT */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4c4508e..b7f9ae1 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -229,7 +229,16 @@ void vmalloc_sync_all(void)
                spin_lock_irqsave(&pgd_lock, flags);
                list_for_each_entry(page, &pgd_list, lru) {
-                       if (!vmalloc_sync_one(page_address(page), address))
+                       spinlock_t *pgt_lock;
+                       int ret;
+                       pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+                       spin_lock(pgt_lock);
+                       ret = vmalloc_sync_one(page_address(page), address);
+                       spin_unlock(pgt_lock);
+                       if (!ret)
                spin_unlock_irqrestore(&pgd_lock, flags);
@@ -341,11 +350,19 @@ void vmalloc_sync_all(void)
                spin_lock_irqsave(&pgd_lock, flags);
                list_for_each_entry(page, &pgd_list, lru) {
                        pgd_t *pgd;
+                       spinlock_t *pgt_lock;
                        pgd = (pgd_t *)page_address(page) + pgd_index(address);
+                       pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+                       spin_lock(pgt_lock);
                        if (pgd_none(*pgd))
                                set_pgd(pgd, *pgd_ref);
                                BUG_ON(pgd_page_vaddr(*pgd) != 
+                       spin_unlock(pgt_lock);
                spin_unlock_irqrestore(&pgd_lock, flags);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5c4ee42..c70e57d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
 #define UNSHARED_PTRS_PER_PGD                          \
-static void pgd_ctor(pgd_t *pgd)
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+       BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
+       virt_to_page(pgd)->index = (pgoff_t)mm;
+struct mm_struct *pgd_page_get_mm(struct page *page)
+       return (struct mm_struct *)page->index;
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
        /* If the pgd points to a shared pagetable level (either the
           ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -105,8 +117,10 @@ static void pgd_ctor(pgd_t *pgd)
        /* list required to sync kernel mapping updates */
-       if (!SHARED_KERNEL_PMD)
+       if (!SHARED_KERNEL_PMD) {
+               pgd_set_mm(pgd, mm);
+       }
 static void pgd_dtor(pgd_t *pgd)
@@ -272,7 +286,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
        spin_lock_irqsave(&pgd_lock, flags);
-       pgd_ctor(pgd);
+       pgd_ctor(mm, pgd);
        pgd_prepopulate_pmd(mm, pgd, pmds);
        spin_unlock_irqrestore(&pgd_lock, flags);

Attachment: x86-lock-page_table_lock-in-vmalloc_sync.patch
Description: Text document

Xen-devel mailing list
<Prev in Thread] Current Thread [Next in Thread>