To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] The patch attached enables x86_64 xenlinux with "late pin, early
From: Xen patchbot -unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Fri, 26 Aug 2005 11:02:16 +0000
Delivery-date: Fri, 26 Aug 2005 11:00:43 +0000
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID 5978be010beca73a6b88ae68d2e120c531bb0edd
# Parent  edeee85c90b1fe1431437338cb1645acb176b0bd
The patch attached enables x86_64 xenlinux with "late pin, early
unpin", which is already implemented for x86_32. Since we now pin only
the root page table rather than every level, overall performance
improves, especially for workloads with heavy memory management
operations.
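
To make the life cycle concrete, below is a minimal user-space C sketch
(illustrative only, not kernel code) of the state machine this patch
introduces: an mm is pinned lazily the first time it is switched to,
and unpinned early when its address space is torn down. The
mm_pin()/mm_unpin()/switch_mm()/arch_exit_mmap() names mirror the
patch; the struct and the printf tracing are assumptions made for the
sketch.

#include <stdio.h>
#include <stdbool.h>

struct mm {
        const char *name;
        bool pinned;            /* mirrors mm->context.pinned below */
};

/* In the patch these re-map the page tables read-only/writable and
 * issue the Xen pin/unpin hypercalls; here they only record state. */
static void mm_pin(struct mm *mm)   { mm->pinned = true;  printf("pin   %s\n", mm->name); }
static void mm_unpin(struct mm *mm) { mm->pinned = false; printf("unpin %s\n", mm->name); }

/* "Late pin": pin only when the mm is first switched to on a CPU. */
static void switch_mm(struct mm *next)
{
        if (!next->pinned)
                mm_pin(next);
        printf("run   %s\n", next->name);
}

/* "Early unpin": drop the pin as teardown starts, so the remaining
 * unmap work runs on writable page tables without hypercalls. */
static void arch_exit_mmap(struct mm *mm)
{
        if (mm->pinned)
                mm_unpin(mm);
}

int main(void)
{
        struct mm a = { "mm_a", false };

        switch_mm(&a);          /* first use: pinned now, not at fork */
        switch_mm(&a);          /* already pinned: nothing to do */
        arch_exit_mmap(&a);     /* teardown: unpinned early */
        return 0;
}

In the real patch the same transitions are driven by switch_mm() in
mmu_context.h and _arch_exit_mmap() in pageattr.c below.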

On an 8-way x86_64 xenlinux (dom0), the kernel build improved by about
10% (using make -j32). Even on a small setup such as a UP HT system, I
see about a 3% performance gain with a kernel build (make -j4).

Lmbench also shows improvements in fork/exec/sh:
Processor, Processes - times in microseconds - smaller is better
--------------------------------------------------------------------
Host    OS  Mhz   null null      open slct sig  sig  fork exec sh
                  call  I/O stat clos TCP  inst hndl proc proc proc
--------- ------------- ---- ---- ---- ---- ---- ---- ---- ---- ----
Linux 2.6.12- 3786 1.13 1.36 3.93 6.04 10.5 1.43 4.33 536. 1446 3614
Linux 2.6.12- 3786 1.13 1.36 3.91 6.03 10.4 1.44 4.38 346. 1050 2831

Signed-off-by: Jun Nakajima <jun.nakajima@xxxxxxxxx>

diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c Fri Aug 26 11:02:14 2005
@@ -105,13 +105,18 @@
        struct mm_struct * old_mm;
        int retval = 0;
 
+       memset(&mm->context, 0, sizeof(mm->context));
        init_MUTEX(&mm->context.sem);
-       mm->context.size = 0;
        old_mm = current->mm;
        if (old_mm && old_mm->context.size > 0) {
                down(&old_mm->context.sem);
                retval = copy_ldt(&mm->context, &old_mm->context);
                up(&old_mm->context.sem);
+       }
+       if (retval == 0) {
+               spin_lock(&mm_unpinned_lock);
+               list_add(&mm->context.unpinned, &mm_unpinned);
+               spin_unlock(&mm_unpinned_lock);
        }
        return retval;
 }
@@ -133,6 +138,11 @@
                else
                        kfree(mm->context.ldt);
                mm->context.size = 0;
+       }
+       if (!mm->context.pinned) {
+               spin_lock(&mm_unpinned_lock);
+               list_del(&mm->context.unpinned);
+               spin_unlock(&mm_unpinned_lock);
        }
 }
 
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c    Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c    Fri Aug 26 11:02:14 2005
@@ -712,6 +712,7 @@
         HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
 
         memset(empty_zero_page, 0, sizeof(empty_zero_page));
+       init_mm.context.pinned = 1;
 
 #ifdef CONFIG_XEN_PHYSDEV_ACCESS
        {
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c        Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c        Fri Aug 26 11:02:14 2005
@@ -12,19 +12,145 @@
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 #include <asm/tlbflush.h>
+#include <asm/io.h>
+
+#ifdef CONFIG_XEN
 #include <asm/pgalloc.h>
-#include <asm/io.h>
+#include <asm/mmu_context.h>
+
+LIST_HEAD(mm_unpinned);
+DEFINE_SPINLOCK(mm_unpinned_lock);
+
+static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
+{
+       struct page *page = virt_to_page(pt);
+       unsigned long pfn = page_to_pfn(page);
+
+       BUG_ON(HYPERVISOR_update_va_mapping(
+                      (unsigned long)__va(pfn << PAGE_SHIFT),
+                      pfn_pte(pfn, flags), 0));
+}
+
+static void mm_walk(struct mm_struct *mm, pgprot_t flags)
+{
+       pgd_t       *pgd;
+       pud_t       *pud;
+       pmd_t       *pmd;
+       pte_t       *pte;
+       int          g,u,m;
+
+       pgd = mm->pgd;
+       for (g = 0; g <= USER_PTRS_PER_PGD; g++, pgd++) {
+               if (pgd_none(*pgd))
+                       continue;
+               pud = pud_offset(pgd, 0);
+               if (PTRS_PER_PUD > 1) /* not folded */ 
+                       mm_walk_set_prot(pud,flags);
+               for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+                       if (pud_none(*pud))
+                               continue;
+                       pmd = pmd_offset(pud, 0);
+                       if (PTRS_PER_PMD > 1) /* not folded */ 
+                               mm_walk_set_prot(pmd,flags);
+                       for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+                               if (pmd_none(*pmd))
+                                       continue;
+                               pte = pte_offset_kernel(pmd,0);
+                               mm_walk_set_prot(pte,flags);
+                       }
+               }
+       }
+}
+
+void mm_pin(struct mm_struct *mm)
+{
+       spin_lock(&mm->page_table_lock);
+
+       mm_walk(mm, PAGE_KERNEL_RO);
+       BUG_ON(HYPERVISOR_update_va_mapping(
+                      (unsigned long)mm->pgd,
+                      pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
+                      UVMF_TLB_FLUSH));
+       BUG_ON(HYPERVISOR_update_va_mapping(
+                      (unsigned long)__user_pgd(mm->pgd),
+                      pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL_RO),
+                      UVMF_TLB_FLUSH));
+       xen_pgd_pin(__pa(mm->pgd)); /* kernel */
+       xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */
+       mm->context.pinned = 1;
+       spin_lock(&mm_unpinned_lock);
+       list_del(&mm->context.unpinned);
+       spin_unlock(&mm_unpinned_lock);
+
+       spin_unlock(&mm->page_table_lock);
+}
+
+void mm_unpin(struct mm_struct *mm)
+{
+       spin_lock(&mm->page_table_lock);
+
+       xen_pgd_unpin(__pa(mm->pgd));
+       xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
+       BUG_ON(HYPERVISOR_update_va_mapping(
+                      (unsigned long)mm->pgd,
+                      pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0));
+       BUG_ON(HYPERVISOR_update_va_mapping(
+                      (unsigned long)__user_pgd(mm->pgd),
+                      pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL), 0));
+       mm_walk(mm, PAGE_KERNEL);
+       xen_tlb_flush();
+       mm->context.pinned = 0;
+       spin_lock(&mm_unpinned_lock);
+       list_add(&mm->context.unpinned, &mm_unpinned);
+       spin_unlock(&mm_unpinned_lock);
+
+       spin_unlock(&mm->page_table_lock);
+}
+
+void mm_pin_all(void)
+{
+       while (!list_empty(&mm_unpinned))       
+               mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
+                                 context.unpinned));
+}
+
+void _arch_exit_mmap(struct mm_struct *mm)
+{
+    struct task_struct *tsk = current;
+
+    task_lock(tsk);
+
+    /*
+     * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
+     * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+     */
+    if ( tsk->active_mm == mm )
+    {
+        tsk->active_mm = &init_mm;
+        atomic_inc(&init_mm.mm_count);
+
+        switch_mm(mm, &init_mm, tsk);
+
+        atomic_dec(&mm->mm_count);
+        BUG_ON(atomic_read(&mm->mm_count) == 0);
+    }
+
+    task_unlock(tsk);
+
+    if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
+        mm_unpin(mm);
+}
 
 void pte_free(struct page *pte)
 {
-        pte_t *ptep;
-
-        ptep = pfn_to_kaddr(page_to_pfn(pte));
-
-        xen_pte_unpin(__pa(ptep));
-        make_page_writable(ptep);
-       __free_page(pte); 
-}
+       unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
+
+       if (!pte_write(*virt_to_ptep(va)))
+               BUG_ON(HYPERVISOR_update_va_mapping(
+                       va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));
+       __free_page(pte);
+}
+#endif /* CONFIG_XEN */
 
 static inline pte_t *lookup_address(unsigned long address) 
 { 
@@ -78,7 +204,7 @@
        } else
                asm volatile("wbinvd":::"memory"); 
        if (address)
-                __flush_tlb_one((unsigned long) address);
+               __flush_tlb_one(address);
        else
                __flush_tlb_all();
 }
@@ -166,14 +292,17 @@
                BUG();
 
        /* on x86-64 the direct mapping set at boot is not using 4k pages */
-//     BUG_ON(PageReserved(kpte_page));
        /*
         * ..., but the XEN guest kernels (currently) do:
         * If the pte was reserved, it means it was created at boot
         * time (not via split_large_page) and in turn we must not
         * replace it with a large page.
         */
-       if (!PageReserved(kpte_page)) {
+#ifndef CONFIG_XEN
+       BUG_ON(PageReserved(kpte_page));
+#else
+       if (!PageReserved(kpte_page))
+#endif
                switch (page_count(kpte_page)) {
                case 1:
                        save_page(address, kpte_page);               
@@ -182,7 +311,6 @@
                case 0:
                        BUG(); /* memleak and failed 2M page regeneration */
                }
-       }
        return 0;
 } 
 
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h     Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h     Fri Aug 26 11:02:14 2005
@@ -58,6 +58,9 @@
        }
 }
 
+extern void mm_pin(struct mm_struct *mm);
+extern void mm_unpin(struct mm_struct *mm);
+void mm_pin_all(void);
 
 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, 
                             struct task_struct *tsk)
@@ -66,6 +69,9 @@
        struct mmuext_op _op[3], *op = _op;
 
        if (likely(prev != next)) {
+               if (!next->context.pinned)
+                       mm_pin(next);
+
                /* stop flush ipis for the previous mm */
                clear_bit(cpu, &prev->cpu_vm_mask);
 #if 0  /* XEN: no lazy tlb */
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h Fri Aug 26 11:02:14 2005
@@ -21,12 +21,27 @@
 
 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
 {
-       set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
+       if (unlikely((mm)->context.pinned)) {
+               BUG_ON(HYPERVISOR_update_va_mapping(
+                              (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
+                              pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
+               set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
+       } else {
+               *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
+       }
 }
 
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 {
-       set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+       if (unlikely((mm)->context.pinned)) {
+               BUG_ON(HYPERVISOR_update_va_mapping(
+                              (unsigned long)pmd,
+                              pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, 
+                                      PAGE_KERNEL_RO), 0));
+               set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+       } else {
+               *(pud) =  __pud(_PAGE_TABLE | __pa(pmd));
+       }
 }
 
 /*
@@ -35,53 +50,54 @@
  */
 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
 {
-        set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
-        set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
-}
-
-extern __inline__ pmd_t *get_pmd(void)
-{
-        pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
-        if (!pmd)
-               return NULL;
-        make_page_readonly(pmd);
-        xen_pmd_pin(__pa(pmd));
-       return pmd;
+       if (unlikely((mm)->context.pinned)) {
+               BUG_ON(HYPERVISOR_update_va_mapping(
+                              (unsigned long)pud,
+                              pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, 
+                                      PAGE_KERNEL_RO), 0));
+               set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
+               set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
+       } else {
+               *(pgd) =  __pgd(_PAGE_TABLE | __pa(pud));
+               *(__user_pgd(pgd)) = *(pgd);
+       }
 }
 
 extern __inline__ void pmd_free(pmd_t *pmd)
 {
-       BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
-        xen_pmd_unpin(__pa(pmd));
-        make_page_writable(pmd);
+       pte_t *ptep = virt_to_ptep(pmd);
+
+       if (!pte_write(*ptep)) {
+               BUG_ON(HYPERVISOR_update_va_mapping(
+                       (unsigned long)pmd,
+                       pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, PAGE_KERNEL),
+                       0));
+       }
        free_page((unsigned long)pmd);
 }
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
         pmd_t *pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
-        if (!pmd)
-               return NULL;
-        make_page_readonly(pmd);
-        xen_pmd_pin(__pa(pmd)); 
         return pmd;
 }
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
         pud_t *pud = (pud_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
-        if (!pud)
-               return NULL;
-        make_page_readonly(pud);
-        xen_pud_pin(__pa(pud)); 
         return pud;
 }
 
 static inline void pud_free(pud_t *pud)
 {
-       BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
-        xen_pud_unpin(__pa(pud));
-        make_page_writable(pud);
+       pte_t *ptep = virt_to_ptep(pud);
+
+       if (!pte_write(*ptep)) {
+               BUG_ON(HYPERVISOR_update_va_mapping(
+                       (unsigned long)pud,
+                       pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, PAGE_KERNEL),
+                       0));
+       }
        free_page((unsigned long)pud);
 }
 
@@ -107,10 +123,6 @@
               (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
 
        memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
-        make_pages_readonly(pgd, 2);
-
-        xen_pgd_pin(__pa(pgd)); /* kernel */
-        xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
         /*
          * Set level3_user_pgt for vsyscall area
          */
@@ -121,31 +133,45 @@
 
 static inline void pgd_free(pgd_t *pgd)
 {
-       BUG_ON((unsigned long)pgd & (PAGE_SIZE-1));
-        xen_pgd_unpin(__pa(pgd));
-        xen_pgd_unpin(__pa(__user_pgd(pgd)));
-        make_pages_writable(pgd, 2);
+       pte_t *ptep = virt_to_ptep(pgd);
+
+       if (!pte_write(*ptep)) {
+               xen_pgd_unpin(__pa(pgd));
+               BUG_ON(HYPERVISOR_update_va_mapping(
+                              (unsigned long)pgd,
+                              pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
+                              0));
+       }
+
+       ptep = virt_to_ptep(__user_pgd(pgd));
+
+       if (!pte_write(*ptep)) {
+               xen_pgd_unpin(__pa(__user_pgd(pgd)));
+               BUG_ON(HYPERVISOR_update_va_mapping(
+                              (unsigned long)__user_pgd(pgd),
+                              pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT,
+                                      PAGE_KERNEL),
+                              0));
+       }
+
        free_pages((unsigned long)pgd, 1);
 }
 
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
         pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
-        if (!pte)
-               return NULL;
-        make_page_readonly(pte);
-        xen_pte_pin(__pa(pte));
+        if (pte)
+               make_page_readonly(pte);
+
        return pte;
 }
 
 static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
-       pte_t *pte = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
-       if (!pte)
-               return NULL;
-        make_page_readonly(pte);
-        xen_pte_pin(__pa(pte));
-       return virt_to_page((unsigned long)pte);
+       struct page *pte;
+
+       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+       return pte;
 }
 
 /* Should really implement gc for free page table pages. This could be
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h        Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h        Fri Aug 26 11:02:14 2005
@@ -18,7 +18,7 @@
 
 #define __flush_tlb_all() __flush_tlb_global()
 
-#define __flush_tlb_one(addr)  xen_invlpg(addr)
+#define __flush_tlb_one(addr)  xen_invlpg((unsigned long)addr)
 
 
 /*
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu.h
--- /dev/null   Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu.h     Fri Aug 26 11:02:14 2005
@@ -0,0 +1,33 @@
+#ifndef __x86_64_MMU_H
+#define __x86_64_MMU_H
+
+#include <linux/spinlock.h>
+#include <asm/semaphore.h>
+
+/*
+ * The x86_64 doesn't have a mmu context, but
+ * we put the segment information here.
+ *
+ * cpu_vm_mask is used to optimize ldt flushing.
+ */
+typedef struct { 
+       void *ldt;
+       rwlock_t ldtlock; 
+       int size;
+       struct semaphore sem; 
+#ifdef CONFIG_XEN
+       unsigned pinned:1;
+       struct list_head unpinned;
+#endif
+} mm_context_t;
+
+#ifdef CONFIG_XEN
+extern struct list_head mm_unpinned;
+extern spinlock_t mm_unpinned_lock;
+
+/* mm/memory.c:exit_mmap hook */
+extern void _arch_exit_mmap(struct mm_struct *mm);
+#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
+#endif
+
+#endif
