# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID 5978be010beca73a6b88ae68d2e120c531bb0edd
# Parent edeee85c90b1fe1431437338cb1645acb176b0bd
The attached patch enables "late pin, early unpin" for x86_64 xenlinux,
as already implemented for x86_32. Since we now pin only the root page
table rather than every level, overall performance improves, especially
for workloads with heavy memory-management activity.

On an 8-way x86_64 xenlinux (dom0), kernel build time improved by about
10% (using make -j32). Even on a small setup such as a UP HT system, I
see about a 3% performance gain on kernel builds (make -j4).
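
The core of the change is that pinning is deferred until an mm is first
switched to, and the pin is dropped as soon as the address space starts
being torn down. A condensed sketch of the two hooks (the complete
versions appear in the diff below):

    /* Late pin: pin the root page tables the first time we switch to an mm. */
    static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
                                 struct task_struct *tsk)
    {
            if (likely(prev != next)) {
                    if (!next->context.pinned)
                            mm_pin(next);  /* write-protects the tables, pins only the pgd pair */
                    /* ... the usual mm switch follows ... */
            }
    }

    /* Early unpin: called from exit_mmap() before the page tables are torn down. */
    void _arch_exit_mmap(struct mm_struct *mm)
    {
            /* ... switch current away from the dying mm first ... */
            if (mm->context.pinned && atomic_read(&mm->mm_count) == 1)
                    mm_unpin(mm);
    }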

Lmbench also shows improvements in fork/exec/sh:

Processor, Processes - times in microseconds - smaller is better
------------------------------------------------------------------------------
Host                 OS  Mhz null null      open slct sig  sig  fork exec sh
                             call  I/O stat clos TCP  inst hndl proc proc proc
--------- ------------- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----
          Linux 2.6.12- 3786 1.13 1.36 3.93 6.04 10.5 1.43 4.33 536. 1446 3614
          Linux 2.6.12- 3786 1.13 1.36 3.91 6.03 10.4 1.44 4.38 346. 1050 2831
Signed-off-by: Jun Nakajima <jun.nakajima@xxxxxxxxx>
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c Fri Aug 26 11:02:14 2005
@@ -105,13 +105,18 @@
struct mm_struct * old_mm;
int retval = 0;
+ memset(&mm->context, 0, sizeof(mm->context));
init_MUTEX(&mm->context.sem);
- mm->context.size = 0;
old_mm = current->mm;
if (old_mm && old_mm->context.size > 0) {
down(&old_mm->context.sem);
retval = copy_ldt(&mm->context, &old_mm->context);
up(&old_mm->context.sem);
+ }
+ if (retval == 0) {
+ spin_lock(&mm_unpinned_lock);
+ list_add(&mm->context.unpinned, &mm_unpinned);
+ spin_unlock(&mm_unpinned_lock);
}
return retval;
}
@@ -133,6 +138,11 @@
else
kfree(mm->context.ldt);
mm->context.size = 0;
+ }
+ if (!mm->context.pinned) {
+ spin_lock(&mm_unpinned_lock);
+ list_del(&mm->context.unpinned);
+ spin_unlock(&mm_unpinned_lock);
}
}
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c Fri Aug 26 11:02:14 2005
@@ -712,6 +712,7 @@
HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
memset(empty_zero_page, 0, sizeof(empty_zero_page));
+ init_mm.context.pinned = 1;
#ifdef CONFIG_XEN_PHYSDEV_ACCESS
{
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c Fri Aug 26 11:02:14 2005
@@ -12,19 +12,145 @@
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+#ifdef CONFIG_XEN
#include <asm/pgalloc.h>
-#include <asm/io.h>
+#include <asm/mmu_context.h>
+
+LIST_HEAD(mm_unpinned);
+DEFINE_SPINLOCK(mm_unpinned_lock);
+
+static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
+{
+ struct page *page = virt_to_page(pt);
+ unsigned long pfn = page_to_pfn(page);
+
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ pfn_pte(pfn, flags), 0));
+}
+
+static void mm_walk(struct mm_struct *mm, pgprot_t flags)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int g,u,m;
+
+ pgd = mm->pgd;
+ for (g = 0; g <= USER_PTRS_PER_PGD; g++, pgd++) {
+ if (pgd_none(*pgd))
+ continue;
+ pud = pud_offset(pgd, 0);
+ if (PTRS_PER_PUD > 1) /* not folded */
+ mm_walk_set_prot(pud,flags);
+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+ if (pud_none(*pud))
+ continue;
+ pmd = pmd_offset(pud, 0);
+ if (PTRS_PER_PMD > 1) /* not folded */
+ mm_walk_set_prot(pmd,flags);
+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+ if (pmd_none(*pmd))
+ continue;
+ pte = pte_offset_kernel(pmd,0);
+ mm_walk_set_prot(pte,flags);
+ }
+ }
+ }
+}
+
+void mm_pin(struct mm_struct *mm)
+{
+ spin_lock(&mm->page_table_lock);
+
+ mm_walk(mm, PAGE_KERNEL_RO);
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)mm->pgd,
+ pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
+ UVMF_TLB_FLUSH));
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)__user_pgd(mm->pgd),
+ pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL_RO),
+ UVMF_TLB_FLUSH));
+ xen_pgd_pin(__pa(mm->pgd)); /* kernel */
+ xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */
+ mm->context.pinned = 1;
+ spin_lock(&mm_unpinned_lock);
+ list_del(&mm->context.unpinned);
+ spin_unlock(&mm_unpinned_lock);
+
+ spin_unlock(&mm->page_table_lock);
+}
+
+void mm_unpin(struct mm_struct *mm)
+{
+ spin_lock(&mm->page_table_lock);
+
+ xen_pgd_unpin(__pa(mm->pgd));
+ xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)mm->pgd,
+ pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0));
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)__user_pgd(mm->pgd),
+ pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL), 0));
+ mm_walk(mm, PAGE_KERNEL);
+ xen_tlb_flush();
+ mm->context.pinned = 0;
+ spin_lock(&mm_unpinned_lock);
+ list_add(&mm->context.unpinned, &mm_unpinned);
+ spin_unlock(&mm_unpinned_lock);
+
+ spin_unlock(&mm->page_table_lock);
+}
+
+void mm_pin_all(void)
+{
+ while (!list_empty(&mm_unpinned))
+ mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
+ context.unpinned));
+}
+
+void _arch_exit_mmap(struct mm_struct *mm)
+{
+ struct task_struct *tsk = current;
+
+ task_lock(tsk);
+
+ /*
+ * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
+ * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+ */
+ if ( tsk->active_mm == mm )
+ {
+ tsk->active_mm = &init_mm;
+ atomic_inc(&init_mm.mm_count);
+
+ switch_mm(mm, &init_mm, tsk);
+
+ atomic_dec(&mm->mm_count);
+ BUG_ON(atomic_read(&mm->mm_count) == 0);
+ }
+
+ task_unlock(tsk);
+
+ if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
+ mm_unpin(mm);
+}
void pte_free(struct page *pte)
{
- pte_t *ptep;
-
- ptep = pfn_to_kaddr(page_to_pfn(pte));
-
- xen_pte_unpin(__pa(ptep));
- make_page_writable(ptep);
- __free_page(pte);
-}
+ unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
+
+ if (!pte_write(*virt_to_ptep(va)))
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));
+ __free_page(pte);
+}
+#endif /* CONFIG_XEN */
static inline pte_t *lookup_address(unsigned long address)
{
@@ -78,7 +204,7 @@
} else
asm volatile("wbinvd":::"memory");
if (address)
- __flush_tlb_one((unsigned long) address);
+ __flush_tlb_one(address);
else
__flush_tlb_all();
}
@@ -166,14 +292,17 @@
BUG();
/* on x86-64 the direct mapping set at boot is not using 4k pages */
-// BUG_ON(PageReserved(kpte_page));
/*
* ..., but the XEN guest kernels (currently) do:
* If the pte was reserved, it means it was created at boot
* time (not via split_large_page) and in turn we must not
* replace it with a large page.
*/
- if (!PageReserved(kpte_page)) {
+#ifndef CONFIG_XEN
+ BUG_ON(PageReserved(kpte_page));
+#else
+ if (!PageReserved(kpte_page))
+#endif
switch (page_count(kpte_page)) {
case 1:
save_page(address, kpte_page);
@@ -182,7 +311,6 @@
case 0:
BUG(); /* memleak and failed 2M page regeneration */
}
- }
return 0;
}
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h Fri Aug 26 11:02:14 2005
@@ -58,6 +58,9 @@
}
}
+extern void mm_pin(struct mm_struct *mm);
+extern void mm_unpin(struct mm_struct *mm);
+void mm_pin_all(void);
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
@@ -66,6 +69,9 @@
struct mmuext_op _op[3], *op = _op;
if (likely(prev != next)) {
+ if (!next->context.pinned)
+ mm_pin(next);
+
/* stop flush ipis for the previous mm */
clear_bit(cpu, &prev->cpu_vm_mask);
#if 0 /* XEN: no lazy tlb */
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h Fri Aug 26 11:02:14 2005
@@ -21,12 +21,27 @@
static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
{
- set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
+ if (unlikely((mm)->context.pinned)) {
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
+ pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
+ set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
+ } else {
+ *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
+ }
}
static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
- set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+ if (unlikely((mm)->context.pinned)) {
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)pmd,
+ pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
+ PAGE_KERNEL_RO), 0));
+ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+ } else {
+ *(pud) = __pud(_PAGE_TABLE | __pa(pmd));
+ }
}
/*
@@ -35,53 +50,54 @@
*/
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
{
- set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
- set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
-}
-
-extern __inline__ pmd_t *get_pmd(void)
-{
- pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
- if (!pmd)
- return NULL;
- make_page_readonly(pmd);
- xen_pmd_pin(__pa(pmd));
- return pmd;
+ if (unlikely((mm)->context.pinned)) {
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)pud,
+ pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
+ PAGE_KERNEL_RO), 0));
+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
+ set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
+ } else {
+ *(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
+ *(__user_pgd(pgd)) = *(pgd);
+ }
}
extern __inline__ void pmd_free(pmd_t *pmd)
{
- BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
- xen_pmd_unpin(__pa(pmd));
- make_page_writable(pmd);
+ pte_t *ptep = virt_to_ptep(pmd);
+
+ if (!pte_write(*ptep)) {
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)pmd,
+ pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, PAGE_KERNEL),
+ 0));
+ }
free_page((unsigned long)pmd);
}
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
pmd_t *pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
- if (!pmd)
- return NULL;
- make_page_readonly(pmd);
- xen_pmd_pin(__pa(pmd));
return pmd;
}
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
{
pud_t *pud = (pud_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
- if (!pud)
- return NULL;
- make_page_readonly(pud);
- xen_pud_pin(__pa(pud));
return pud;
}
static inline void pud_free(pud_t *pud)
{
- BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
- xen_pud_unpin(__pa(pud));
- make_page_writable(pud);
+ pte_t *ptep = virt_to_ptep(pud);
+
+ if (!pte_write(*ptep)) {
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)pud,
+ pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, PAGE_KERNEL),
+ 0));
+ }
free_page((unsigned long)pud);
}
@@ -107,10 +123,6 @@
(PTRS_PER_PGD - boundary) * sizeof(pgd_t));
memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
- make_pages_readonly(pgd, 2);
-
- xen_pgd_pin(__pa(pgd)); /* kernel */
- xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
/*
* Set level3_user_pgt for vsyscall area
*/
@@ -121,31 +133,45 @@
static inline void pgd_free(pgd_t *pgd)
{
- BUG_ON((unsigned long)pgd & (PAGE_SIZE-1));
- xen_pgd_unpin(__pa(pgd));
- xen_pgd_unpin(__pa(__user_pgd(pgd)));
- make_pages_writable(pgd, 2);
+ pte_t *ptep = virt_to_ptep(pgd);
+
+ if (!pte_write(*ptep)) {
+ xen_pgd_unpin(__pa(pgd));
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)pgd,
+ pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
+ 0));
+ }
+
+ ptep = virt_to_ptep(__user_pgd(pgd));
+
+ if (!pte_write(*ptep)) {
+ xen_pgd_unpin(__pa(__user_pgd(pgd)));
+ BUG_ON(HYPERVISOR_update_va_mapping(
+ (unsigned long)__user_pgd(pgd),
+ pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT,
+ PAGE_KERNEL),
+ 0));
+ }
+
free_pages((unsigned long)pgd, 1);
}
static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
- if (!pte)
- return NULL;
- make_page_readonly(pte);
- xen_pte_pin(__pa(pte));
+ if (pte)
+ make_page_readonly(pte);
+
return pte;
}
static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
- pte_t *pte = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
- if (!pte)
- return NULL;
- make_page_readonly(pte);
- xen_pte_pin(__pa(pte));
- return virt_to_page((unsigned long)pte);
+ struct page *pte;
+
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+ return pte;
}
/* Should really implement gc for free page table pages. This could be
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h Fri Aug 26 11:02:14 2005
@@ -18,7 +18,7 @@
#define __flush_tlb_all() __flush_tlb_global()
-#define __flush_tlb_one(addr) xen_invlpg(addr)
+#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
/*
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu.h
--- /dev/null Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu.h Fri Aug 26 11:02:14 2005
@@ -0,0 +1,33 @@
+#ifndef __x86_64_MMU_H
+#define __x86_64_MMU_H
+
+#include <linux/spinlock.h>
+#include <asm/semaphore.h>
+
+/*
+ * The x86_64 doesn't have a mmu context, but
+ * we put the segment information here.
+ *
+ * cpu_vm_mask is used to optimize ldt flushing.
+ */
+typedef struct {
+ void *ldt;
+ rwlock_t ldtlock;
+ int size;
+ struct semaphore sem;
+#ifdef CONFIG_XEN
+ unsigned pinned:1;
+ struct list_head unpinned;
+#endif
+} mm_context_t;
+
+#ifdef CONFIG_XEN
+extern struct list_head mm_unpinned;
+extern spinlock_t mm_unpinned_lock;
+
+/* mm/memory.c:exit_mmap hook */
+extern void _arch_exit_mmap(struct mm_struct *mm);
+#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
+#endif
+
+#endif