# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID 5978be010beca73a6b88ae68d2e120c531bb0edd
# Parent edeee85c90b1fe1431437338cb1645acb176b0bd
The attached patch enables "late pin, early unpin" for x86_64 xenlinux,
as already implemented for x86_32. Since we now pin only the root page
table rather than every level, overall performance improves, especially
for workloads with heavy memory-management activity.

On an 8-way x86_64 xenlinux (dom0), kernel build time improved by about
10% (using make -j32). Even on a small setup such as a UP HT system, I
see about a 3% performance gain on kernel builds (make -j4).
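
The core of the change is that pinning is deferred until an mm is first
switched to, and the pin is dropped as soon as the address space starts
being torn down. A condensed sketch of the two hooks (the complete
versions appear in the diff below):

    /* Late pin: pin the root page tables the first time we switch to an mm. */
    static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
                                 struct task_struct *tsk)
    {
            if (likely(prev != next)) {
                    if (!next->context.pinned)
                            mm_pin(next);  /* write-protects the tables, pins only the pgd pair */
                    /* ... the usual mm switch follows ... */
            }
    }

    /* Early unpin: called from exit_mmap() before the page tables are torn down. */
    void _arch_exit_mmap(struct mm_struct *mm)
    {
            /* ... switch current away from the dying mm first ... */
            if (mm->context.pinned && atomic_read(&mm->mm_count) == 1)
                    mm_unpin(mm);
    }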

Lmbench also shows improvements in fork/exec/sh:

Processor, Processes - times in microseconds - smaller is better
------------------------------------------------------------------------------
Host                 OS  Mhz null null      open slct sig  sig  fork exec sh
                             call  I/O stat clos TCP  inst hndl proc proc proc
--------- ------------- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----
          Linux 2.6.12- 3786 1.13 1.36 3.93 6.04 10.5 1.43 4.33 536. 1446 3614
          Linux 2.6.12- 3786 1.13 1.36 3.91 6.03 10.4 1.44 4.38 346. 1050 2831
Signed-off-by: Jun Nakajima <jun.nakajima@xxxxxxxxx>
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c Fri Aug 26 11:02:14 2005
@@ -105,13 +105,18 @@
struct mm_struct * old_mm;
int retval = 0;
+ memset(&mm->context, 0, sizeof(mm->context));
init_MUTEX(&mm->context.sem);
- mm->context.size = 0;
old_mm = current->mm;
if (old_mm && old_mm->context.size > 0) {
down(&old_mm->context.sem);
retval = copy_ldt(&mm->context, &old_mm->context);
up(&old_mm->context.sem);
+ }
+ if (retval == 0) {
+ spin_lock(&mm_unpinned_lock);
+ list_add(&mm->context.unpinned, &mm_unpinned);
+ spin_unlock(&mm_unpinned_lock);
}
return retval;
}
@@ -133,6 +138,11 @@
else
kfree(mm->context.ldt);
mm->context.size = 0;
+ }
+ if (!mm->context.pinned) {
+ spin_lock(&mm_unpinned_lock);
+ list_del(&mm->context.unpinned);
+ spin_unlock(&mm_unpinned_lock);
}
}
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c Fri Aug 26 11:02:14 2005
@@ -712,6 +712,7 @@
HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
memset(empty_zero_page, 0, sizeof(empty_zero_page));
+ init_mm.context.pinned = 1;
#ifdef CONFIG_XEN_PHYSDEV_ACCESS
{
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c Fri Aug 26 11:02:14 2005
@@ -12,19 +12,145 @@
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+#ifdef CONFIG_XEN
#include <asm/pgalloc.h>
-#include <asm/io.h>
+#include <asm/mmu_context.h>
+
+LIST_HEAD(mm_unpinned);
+DEFINE_SPINLOCK(mm_unpinned_lock);
+
+static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
+{
+ struct page *page = virt_to_page(pt);
+ unsigned long pfn = page_to_pfn(page);
+
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ pfn_pte(pfn, flags), 0));
+}
+
+static void mm_walk(struct mm_struct *mm, pgprot_t flags)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int g,u,m;
+
+ pgd = mm->pgd;
+ for (g = 0; g <= USER_PTRS_PER_PGD; g++, pgd++) {
+ if (pgd_none(*pgd))
+ continue;
+ pud = pud_offset(pgd, 0);
+ if (PTRS_PER_PUD > 1) /* not folded */
+ mm_walk_set_prot(pud,flags);
+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+ if (pud_none(*pud))
+ continue;
+ pmd = pmd_offset(pud, 0);
+ if (PTRS_PER_PMD > 1) /* not folded */
+ mm_walk_set_prot(pmd,flags);
+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+ if (pmd_none(*pmd))
+ continue;
+ pte = pte_offset_kernel(pmd,0);
+ mm_walk_set_prot(pte,flags);
+ }
+ }
+ }
+}
+
+void mm_pin(struct mm_struct *mm)
+{
+ spin_lock(&mm->page_table_lock);
+
+ mm_walk(mm, PAGE_KERNEL_RO);
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)mm->pgd,
+ pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
+ UVMF_TLB_FLUSH));
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)__user_pgd(mm->pgd),
+ pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL_RO),
+ UVMF_TLB_FLUSH));
+ xen_pgd_pin(__pa(mm->pgd)); /* kernel */
+ xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */
+ mm->context.pinned = 1;
+ spin_lock(&mm_unpinned_lock);
+ list_del(&mm->context.unpinned);
+ spin_unlock(&mm_unpinned_lock);
+
+ spin_unlock(&mm->page_table_lock);
+}
+
+void mm_unpin(struct mm_struct *mm)
+{
+ spin_lock(&mm->page_table_lock);
+
+ xen_pgd_unpin(__pa(mm->pgd));
+ xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)mm->pgd,
+ pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0));
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)__user_pgd(mm->pgd),
+ pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL), 0));
+ mm_walk(mm, PAGE_KERNEL);
+ xen_tlb_flush();
+ mm->context.pinned = 0;
+ spin_lock(&mm_unpinned_lock);
+ list_add(&mm->context.unpinned, &mm_unpinned);
+ spin_unlock(&mm_unpinned_lock);
+
+ spin_unlock(&mm->page_table_lock);
+}
+
+void mm_pin_all(void)
+{
+ while (!list_empty(&mm_unpinned))
+ mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
+ context.unpinned));
+}
+
+void _arch_exit_mmap(struct mm_struct *mm)
+{
+ struct task_struct *tsk = current;
+
+ task_lock(tsk);
+
+ /*
+ * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
+ * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+ */
+ if ( tsk->active_mm == mm )
+ {
+ tsk->active_mm = &init_mm;
+ atomic_inc(&init_mm.mm_count);
+
+ switch_mm(mm, &init_mm, tsk);
+
+ atomic_dec(&mm->mm_count);
+ BUG_ON(atomic_read(&mm->mm_count) == 0);
+ }
+
+ task_unlock(tsk);
+
+ if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
+ mm_unpin(mm);
+}
void pte_free(struct page *pte)
{
- pte_t *ptep;
-
- ptep = pfn_to_kaddr(page_to_pfn(pte));
-
- xen_pte_unpin(__pa(ptep));
- make_page_writable(ptep);
- __free_page(pte);
-}
+ unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
+
+ if (!pte_write(*virt_to_ptep(va)))
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));
+ __free_page(pte);
+}
+#endif /* CONFIG_XEN */
static inline pte_t *lookup_address(unsigned long address)
{
@@ -78,7 +204,7 @@
} else
asm volatile("wbinvd":::"memory");
if (address)
- __flush_tlb_one((unsigned long) address);
+ __flush_tlb_one(address);
else
__flush_tlb_all();
}
@@ -166,14 +292,17 @@
BUG();
/* on x86-64 the direct mapping set at boot is not using 4k pages */
-// BUG_ON(PageReserved(kpte_page));
/*
* ..., but the XEN guest kernels (currently) do:
* If the pte was reserved, it means it was created at boot
* time (not via split_large_page) and in turn we must not
* replace it with a large page.
*/
- if (!PageReserved(kpte_page)) {
+#ifndef CONFIG_XEN
+ BUG_ON(PageReserved(kpte_page));
+#else
+ if (!PageReserved(kpte_page))
+#endif
switch (page_count(kpte_page)) {
case 1:
save_page(address, kpte_page);
@@ -182,7 +311,6 @@
case 0:
BUG(); /* memleak and failed 2M page regeneration */
}
- }
return 0;
}
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h Fri Aug 26 11:02:14 2005
@@ -58,6 +58,9 @@
}
}
+extern void mm_pin(struct mm_struct *mm);
+extern void mm_unpin(struct mm_struct *mm);
+void mm_pin_all(void);
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
@@ -66,6 +69,9 @@
struct mmuext_op _op[3], *op = _op;
if (likely(prev != next)) {
+ if (!next->context.pinned)
+ mm_pin(next);
+
/* stop flush ipis for the previous mm */
clear_bit(cpu, &prev->cpu_vm_mask);
#if 0 /* XEN: no lazy tlb */
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h Fri Aug 26 11:02:14 2005
@@ -21,12 +21,27 @@
static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
{
- set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
+ if (unlikely((mm)->context.pinned)) {
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
+ pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
+ set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
+ } else {
+ *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
+ }
}
static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
- set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+ if (unlikely((mm)->context.pinned)) {
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)pmd,
+ pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
+ PAGE_KERNEL_RO), 0));
+ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+ } else {
+ *(pud) = __pud(_PAGE_TABLE | __pa(pmd));
+ }
}
/*
@@ -35,53 +50,54 @@
*/
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
{
- set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
- set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
-}
-
-extern __inline__ pmd_t *get_pmd(void)
-{
- pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
- if (!pmd)
- return NULL;
- make_page_readonly(pmd);
- xen_pmd_pin(__pa(pmd));
- return pmd;
+ if (unlikely((mm)->context.pinned)) {
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)pud,
+ pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
+ PAGE_KERNEL_RO), 0));
+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
+ set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
+ } else {
+ *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
+ *(__user_pgd(pgd)) = *(pgd);
+ }
}
extern __inline__ void pmd_free(pmd_t *pmd)
{
- BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
- xen_pmd_unpin(__pa(pmd));
- make_page_writable(pmd);
+ pte_t *ptep = virt_to_ptep(pmd);
+
+ if (!pte_write(*ptep)) {
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)pmd,
+ pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, PAGE_KERNEL),
+ 0));
+ }
free_page((unsigned long)pmd);
}
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
pmd_t *pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
- if (!pmd)
- return NULL;
- make_page_readonly(pmd);
- xen_pmd_pin(__pa(pmd));
return pmd;
}
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
{
pud_t *pud = (pud_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
- if (!pud)
- return NULL;
- make_page_readonly(pud);
- xen_pud_pin(__pa(pud));
return pud;
}
static inline void pud_free(pud_t *pud)
{
- BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
- xen_pud_unpin(__pa(pud));
- make_page_writable(pud);
+ pte_t *ptep = virt_to_ptep(pud);
+
+ if (!pte_write(*ptep)) {
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)pud,
+ pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, PAGE_KERNEL),
+ 0));
+ }
free_page((unsigned long)pud);
}
@@ -107,10 +123,6 @@
(PTRS_PER_PGD - boundary) * sizeof(pgd_t));
memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
- make_pages_readonly(pgd, 2);
-
- xen_pgd_pin(__pa(pgd)); /* kernel */
- xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
/*
* Set level3_user_pgt for vsyscall area
*/
@@ -121,31 +133,45 @@
static inline void pgd_free(pgd_t *pgd)
{
- BUG_ON((unsigned long)pgd & (PAGE_SIZE-1));
- xen_pgd_unpin(__pa(pgd));
- xen_pgd_unpin(__pa(__user_pgd(pgd)));
- make_pages_writable(pgd, 2);
+ pte_t *ptep = virt_to_ptep(pgd);
+
+ if (!pte_write(*ptep)) {
+ xen_pgd_unpin(__pa(pgd));
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)pgd,
+ pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
+ 0));
+ }
+
+ ptep = virt_to_ptep(__user_pgd(pgd));
+
+ if (!pte_write(*ptep)) {
+ xen_pgd_unpin(__pa(__user_pgd(pgd)));
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)__user_pgd(pgd),
+ pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT,
+ PAGE_KERNEL),
+ 0));
+ }
+
free_pages((unsigned long)pgd, 1);
}
static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
- if (!pte)
- return NULL;
- make_page_readonly(pte);
- xen_pte_pin(__pa(pte));
+ if (pte)
+ make_page_readonly(pte);
+
return pte;
}
static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
- pte_t *pte = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
- if (!pte)
- return NULL;
- make_page_readonly(pte);
- xen_pte_pin(__pa(pte));
- return virt_to_page((unsigned long)pte);
+ struct page *pte;
+
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+ return pte;
}
/* Should really implement gc for free page table pages. This could be
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h Fri Aug 26 11:02:14 2005
@@ -18,7 +18,7 @@
#define __flush_tlb_all() __flush_tlb_global()
-#define __flush_tlb_one(addr) xen_invlpg(addr)
+#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
/*
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu.h
--- /dev/null Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu.h Fri Aug 26 11:02:14 2005
@@ -0,0 +1,33 @@
+#ifndef __x86_64_MMU_H
+#define __x86_64_MMU_H
+
+#include <linux/spinlock.h>
+#include <asm/semaphore.h>
+
+/*
+ * The x86_64 doesn't have a mmu context, but
+ * we put the segment information here.
+ *
+ * cpu_vm_mask is used to optimize ldt flushing.
+ */
+typedef struct {
+ void *ldt;
+ rwlock_t ldtlock;
+ int size;
+ struct semaphore sem;
+#ifdef CONFIG_XEN
+ unsigned pinned:1;
+ struct list_head unpinned;
+#endif
+} mm_context_t;
+
+#ifdef CONFIG_XEN
+extern struct list_head mm_unpinned;
+extern spinlock_t mm_unpinned_lock;
+
+/* mm/memory.c:exit_mmap hook */
+extern void _arch_exit_mmap(struct mm_struct *mm);
+#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
+#endif
+
+#endif