The current version of superpage mapping takes a PGT_writable
reference to every page in a superpage each time it is mapped. This
is extremely slow, so slow that applications become unusable.
My solution for this is to introduce a superpage table in the
hypervisor, similar to the frametable structure for pages. Currently
this table only has a type_info element. There are three types a
superpage can have, SGT_mark, SGT_dynamic, or SGT_none.
In normal operation, the first time a superpage is mapped, a
PGT_writable reference is taken to each page in the superpage, and the
superpage is set to type SGT_dynamic and the superpage typecount is
incremented. On subsequent mappings and unmappings, only the
superpage typecount changes. On the last unmap, the PGT_writable
reference on each page is removed.
The SGT_mark type is set and cleared through two new MMUEXT
hypercalls, mark_super and unmark_super. When the hypercall is made,
the superpage's type is set to SGT_mark and a PGT_writable reference
is taken to its pages. On unmark, the type is cleared and the
reference removed.
If a superpage is already set to SGT_dynamic when mark_super is called,
the type is changed to SGT_mark and no additional PGT_writable reference
is taken. If there are still outstanding mappings of this superpage
when unmark_super is called, the type is set to SGT_dynamic and the
PGT_writable reference is not removed.
Fast superpage mapping is only supported on 64-bit hypervisors. For
32-bit hypervisors, superpage mapping is supported but will be
extremely slow.
Signed-off-by: Dave McCracken <dave.mccracken@xxxxxxxxxx>
--------
--- xen-staging/xen/include/asm-x86/mm.h 2010-05-18 09:45:53.000000000
-0500
+++ xen-staging-fs//xen/include/asm-x86/mm.h 2010-05-26 08:38:39.000000000
-0500
@@ -214,6 +214,23 @@ struct page_info
#define PGC_count_width PG_shift(9)
#define PGC_count_mask ((1UL<<PGC_count_width)-1)
+#ifdef __x86_64__
+/*
+ * Per-superpage bookkeeping entry, indexed through spage_table.
+ * type_info holds an SGT_* type in the bits selected by SGT_type_mask and
+ * a use count in the bits selected by SGT_count_mask.
+ */
+struct spage_info
+{
+    unsigned long type_info;
+};
+
+ /* The following page types are MUTUALLY EXCLUSIVE. */
+#define SGT_none PG_mask(0, 2) /* superpage not in use */
+#define SGT_mark PG_mask(1, 2) /* Marked as a superpage */
+#define SGT_dynamic PG_mask(2, 2) /* dynamically mapped as a superpage */
+#define SGT_type_mask PG_mask(3, 2) /* Bits 30-31 or 62-63. */
+
+ /* Count of uses of this superpage as its current type. */
+#define SGT_count_width PG_shift(3)
+#define SGT_count_mask ((1UL<<SGT_count_width)-1)
+#endif
+
#if defined(__i386__)
#define is_xen_heap_page(page) is_xen_heap_mfn(page_to_mfn(page))
#define is_xen_heap_mfn(mfn) ({ \
@@ -262,6 +279,9 @@ extern void share_xen_page_with_privileg
struct page_info *page, int readonly);
#define frame_table ((struct page_info *)FRAMETABLE_VIRT_START)
+#ifdef __x86_64__
+/* Base of the struct spage_info array, placed at SPAGETABLE_VIRT_START. */
+#define spage_table ((struct spage_info *)SPAGETABLE_VIRT_START)
+#endif
extern unsigned long max_page;
extern unsigned long total_pages;
void init_frametable(void);
@@ -305,6 +325,8 @@ void cleanup_page_cacheattr(struct page_
int is_iomem_page(unsigned long mfn);
+void clear_superpage_mark(struct page_info *page);
+
struct domain *page_get_owner_and_reference(struct page_info *page);
void put_page(struct page_info *page);
int get_page(struct page_info *page, struct domain *domain);
@@ -370,7 +392,7 @@ pae_copy_root(struct vcpu *v, l3_pgentry
int check_descriptor(const struct domain *, struct desc_struct *d);
-extern int opt_allow_hugepage;
+extern int opt_allow_superpage;
extern int mem_hotplug;
/******************************************************************************
--- xen-staging/xen/include/asm-x86/guest_pt.h 2010-05-18 09:45:53.000000000
-0500
+++ xen-staging-fs//xen/include/asm-x86/guest_pt.h 2010-05-24
09:00:42.000000000 -0500
@@ -187,7 +187,7 @@ guest_supports_superpages(struct vcpu *v
* CR4.PSE is set or the guest is in PAE or long mode.
* It's also used in the dummy PT for vcpus with CR4.PG cleared. */
return (!is_hvm_vcpu(v)
- ? opt_allow_hugepage
+ ? opt_allow_superpage
: (GUEST_PAGING_LEVELS != 2
|| !hvm_paging_enabled(v)
|| (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
--- xen-staging/xen/include/asm-x86/x86_32/page.h 2010-05-18
09:45:54.000000000 -0500
+++ xen-staging-fs//xen/include/asm-x86/x86_32/page.h 2010-05-24
09:00:43.000000000 -0500
@@ -6,6 +6,7 @@
#define L2_PAGETABLE_SHIFT 21
#define L3_PAGETABLE_SHIFT 30
#define PAGE_SHIFT L1_PAGETABLE_SHIFT
+#define SUPERPAGE_SHIFT L2_PAGETABLE_SHIFT
#define ROOT_PAGETABLE_SHIFT L3_PAGETABLE_SHIFT
#define PAGETABLE_ORDER 9
@@ -13,6 +14,7 @@
#define L2_PAGETABLE_ENTRIES (1<<PAGETABLE_ORDER)
#define L3_PAGETABLE_ENTRIES 4
#define ROOT_PAGETABLE_ENTRIES L3_PAGETABLE_ENTRIES
+#define SUPERPAGE_ORDER PAGETABLE_ORDER
/*
* Architecturally, physical addresses may be up to 52 bits. However, the
@@ -53,6 +55,9 @@
#define virt_to_pdx(va) virt_to_mfn(va)
#define pdx_to_virt(pdx) mfn_to_virt(pdx)
+/* Convert between a page frame number and its superpage index (sdx). */
+#define pfn_to_sdx(pfn) ((pfn)>>(SUPERPAGE_SHIFT-PAGE_SHIFT))
+#define sdx_to_pfn(sdx) ((sdx)<<(SUPERPAGE_SHIFT-PAGE_SHIFT))
+
static inline unsigned long __virt_to_maddr(unsigned long va)
{
ASSERT(va >= DIRECTMAP_VIRT_START && va < DIRECTMAP_VIRT_END);
--- xen-staging/xen/include/asm-x86/config.h 2010-05-18 09:45:53.000000000
-0500
+++ xen-staging-fs//xen/include/asm-x86/config.h 2010-05-26
08:38:03.000000000 -0500
@@ -225,6 +225,11 @@ extern unsigned int video_mode, video_fl
/* Slot 261: xen text, static data and bss (1GB). */
#define XEN_VIRT_START (HIRO_COMPAT_MPT_VIRT_END)
#define XEN_VIRT_END (XEN_VIRT_START + GB(1))
+/*
+ * Slot 261: superpage information array (20MB).
+ * Sized as one struct spage_info per SUPERPAGE_SHIFT-sized unit of the
+ * direct map; the region ends where the page-frame information array begins.
+ */
+#define SPAGETABLE_VIRT_END FRAMETABLE_VIRT_START
+#define SPAGETABLE_SIZE ((DIRECTMAP_SIZE >> SUPERPAGE_SHIFT) * \
+ sizeof(struct spage_info))
+#define SPAGETABLE_VIRT_START (SPAGETABLE_VIRT_END - SPAGETABLE_SIZE)
/* Slot 261: page-frame information array (40GB). */
#define FRAMETABLE_VIRT_END DIRECTMAP_VIRT_START
#define FRAMETABLE_SIZE ((DIRECTMAP_SIZE >> PAGE_SHIFT) * \
--- xen-staging/xen/include/asm-x86/page.h 2010-05-18 09:45:53.000000000
-0500
+++ xen-staging-fs//xen/include/asm-x86/page.h 2010-05-24 09:00:43.000000000
-0500
@@ -240,6 +240,14 @@ void copy_page_sse2(void *, const void *
#define __pfn_to_paddr(pfn) ((paddr_t)(pfn) << PAGE_SHIFT)
#define __paddr_to_pfn(pa) ((unsigned long)((pa) >> PAGE_SHIFT))
+/* Convert between machine frame numbers and spage-info structures. */
+#define __mfn_to_spage(mfn) (spage_table + pfn_to_sdx(mfn))
+#define __spage_to_mfn(pg) sdx_to_pfn((unsigned long)((pg) - spage_table))
+
+/*
+ * Convert between page-info structures and spage-info structures.
+ * spage_to_page() yields the page_info of the first page in the superpage.
+ */
+#define page_to_spage(page) \
+    (spage_table+(((page)-frame_table)>>(SUPERPAGE_SHIFT-PAGE_SHIFT)))
+#define spage_to_page(spage) \
+    (frame_table+(((spage)-spage_table)<<(SUPERPAGE_SHIFT-PAGE_SHIFT)))
+
/*
* We define non-underscored wrappers for above conversion functions. These are
* overridden in various source files while underscored versions remain intact.
@@ -251,6 +259,8 @@ void copy_page_sse2(void *, const void *
#define maddr_to_virt(ma) __maddr_to_virt((unsigned long)(ma))
#define mfn_to_page(mfn) __mfn_to_page(mfn)
#define page_to_mfn(pg) __page_to_mfn(pg)
+#define mfn_to_spage(mfn) __mfn_to_spage(mfn)
+#define spage_to_mfn(pg) __spage_to_mfn(pg)
#define maddr_to_page(ma) __maddr_to_page(ma)
#define page_to_maddr(pg) __page_to_maddr(pg)
#define virt_to_page(va) __virt_to_page(va)
--- xen-staging/xen/include/asm-x86/x86_64/page.h 2010-05-18
09:45:54.000000000 -0500
+++ xen-staging-fs//xen/include/asm-x86/x86_64/page.h 2010-05-24
09:00:43.000000000 -0500
@@ -7,6 +7,7 @@
#define L3_PAGETABLE_SHIFT 30
#define L4_PAGETABLE_SHIFT 39
#define PAGE_SHIFT L1_PAGETABLE_SHIFT
+#define SUPERPAGE_SHIFT L2_PAGETABLE_SHIFT
#define ROOT_PAGETABLE_SHIFT L4_PAGETABLE_SHIFT
#define PAGETABLE_ORDER 9
@@ -15,6 +16,7 @@
#define L3_PAGETABLE_ENTRIES (1<<PAGETABLE_ORDER)
#define L4_PAGETABLE_ENTRIES (1<<PAGETABLE_ORDER)
#define ROOT_PAGETABLE_ENTRIES L4_PAGETABLE_ENTRIES
+#define SUPERPAGE_ORDER PAGETABLE_ORDER
#define __PAGE_OFFSET DIRECTMAP_VIRT_START
#define __XEN_VIRT_START XEN_VIRT_START
@@ -41,6 +43,8 @@ extern void pfn_pdx_hole_setup(unsigned
#define page_to_pdx(pg) ((pg) - frame_table)
#define pdx_to_page(pdx) (frame_table + (pdx))
+/*
+ * Convert between an spage_info pointer and the pdx of the first page in
+ * that superpage.  The table index is (pdx >> (SUPERPAGE_SHIFT-PAGE_SHIFT)),
+ * matching pfn_to_sdx(); the pointer difference must be taken before shifting.
+ */
+#define spage_to_pdx(spg) (((spg) - spage_table) << (SUPERPAGE_SHIFT-PAGE_SHIFT))
+#define pdx_to_spage(pdx) (spage_table + ((pdx) >> (SUPERPAGE_SHIFT-PAGE_SHIFT)))
/*
* Note: These are solely for the use by page_{get,set}_owner(), and
* therefore don't need to handle the XEN_VIRT_{START,END} range.
@@ -64,6 +68,16 @@ static inline unsigned long pdx_to_pfn(u
((pdx << pfn_pdx_hole_shift) & pfn_top_mask);
}
+/* Superpage index of the superpage that contains frame @pfn. */
+static inline unsigned long pfn_to_sdx(unsigned long pfn)
+{
+    unsigned long pdx = pfn_to_pdx(pfn);
+
+    return pdx >> (SUPERPAGE_SHIFT - PAGE_SHIFT);
+}
+
+/* Frame number of the first page covered by superpage index @sdx. */
+static inline unsigned long sdx_to_pfn(unsigned long sdx)
+{
+    unsigned long pdx = sdx << (SUPERPAGE_SHIFT - PAGE_SHIFT);
+
+    return pdx_to_pfn(pdx);
+}
+
static inline unsigned long __virt_to_maddr(unsigned long va)
{
ASSERT(va >= XEN_VIRT_START);
--- xen-staging/xen/arch/x86/domain.c 2010-05-24 08:59:03.000000000 -0500
+++ xen-staging-fs//xen/arch/x86/domain.c 2010-05-26 08:37:11.000000000
-0500
@@ -1739,6 +1739,11 @@ static int relinquish_memory(
BUG();
}
+#ifdef __x86_64__
+ if (opt_allow_superpage)
+ clear_superpage_mark(page);
+#endif
+
if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
put_page(page);
--- xen-staging/xen/arch/x86/mm.c 2010-05-18 09:45:53.000000000 -0500
+++ xen-staging-fs//xen/arch/x86/mm.c 2010-05-26 08:43:33.000000000 -0500
@@ -151,8 +151,15 @@ unsigned long __read_mostly pdx_group_va
#define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
-int opt_allow_hugepage;
-boolean_param("allowhugepage", opt_allow_hugepage);
+int opt_allow_superpage;
+boolean_param("allowsuperpage", opt_allow_superpage);
+
+#ifdef __x86_64__
+static int get_superpage(struct spage_info *spage, struct domain *d);
+static void put_superpage(struct spage_info *spage);
+#endif
+static int get_spage_pages(struct page_info *page, struct domain *d);
+static void put_spage_pages(struct page_info *page);
#define l1_disallow_mask(d) \
((d != dom_io) && \
@@ -171,6 +178,30 @@ l2_pgentry_t *compat_idle_pg_table_l2 =
#define l3_disallow_mask(d) L3_DISALLOW_MASK
#endif
+#ifdef __x86_64__
+/*
+ * Allocate, map and zero the superpage table.
+ *
+ * The table needs one struct spage_info per superpage up to max_pdx; the
+ * mapped region is rounded up to a multiple of the superpage size and is
+ * populated in chunks of (1 << PAGETABLE_ORDER) pages.
+ */
+static void __init init_spagetable(void)
+{
+    unsigned long s, start = SPAGETABLE_VIRT_START;
+    unsigned long end, step, mfn;
+    /* Derived from max_pdx (unsigned long); keep the full width. */
+    unsigned long max_entries;
+
+    step = 1UL << PAGETABLE_ORDER;
+    max_entries = (max_pdx + ((1UL<<SUPERPAGE_ORDER)-1)) >> SUPERPAGE_ORDER;
+    end = start + (((max_entries * sizeof(*spage_table)) +
+                    ((1UL<<SUPERPAGE_SHIFT)-1)) &
+                   (~((1UL<<SUPERPAGE_SHIFT)-1)));
+
+    for (s = start; s < end; s += step << PAGE_SHIFT)
+    {
+        mfn = alloc_boot_pages(step, step);
+        if ( !mfn )
+            panic("Not enough memory for spage table");
+        map_pages_to_xen(s, mfn, step, PAGE_HYPERVISOR);
+    }
+    /* Every entry starts out as SGT_none with a zero count. */
+    memset((void *)start, 0, end - start);
+}
+#endif
+
static void __init init_frametable_chunk(void *start, void *end)
{
unsigned long s = (unsigned long)start;
@@ -232,6 +263,10 @@ void __init init_frametable(void)
(unsigned long)pdx_to_page(max_idx * PDX_GROUP_COUNT) -
(unsigned long)pdx_to_page(max_pdx));
}
+#ifdef __x86_64__
+ if (opt_allow_superpage)
+ init_spagetable();
+#endif
}
void __init arch_init_memory(void)
@@ -652,19 +687,7 @@ static int get_page_and_type_from_pagenr
return rc;
}
-static int get_data_page(
- struct page_info *page, struct domain *d, int writeable)
-{
- int rc;
-
- if ( writeable )
- rc = get_page_and_type(page, d, PGT_writable_page);
- else
- rc = get_page(page, d);
-
- return rc;
-}
-
+#ifdef __x86_64__
static void put_data_page(
struct page_info *page, int writeable)
{
@@ -673,6 +696,7 @@ static void put_data_page(
else
put_page(page);
}
+#endif
/*
* We allow root tables to map each other (a.k.a. linear page tables). It
@@ -887,30 +911,25 @@ get_page_from_l2e(
rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0);
if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
rc = 0;
+
+ return rc;
}
- else if ( !opt_allow_hugepage || (mfn & (L1_PAGETABLE_ENTRIES-1)) )
+ if ( !opt_allow_superpage )
{
- rc = -EINVAL;
+ MEM_LOG("Attempt to map superpage without allowsuperpage flag in hypervisor");
+ return -EINVAL;
}
- else
+ if ( mfn & (L1_PAGETABLE_ENTRIES-1) )
{
- unsigned long m = mfn;
- int writeable = !!(l2e_get_flags(l2e) & _PAGE_RW);
-
- do {
- if ( !mfn_valid(m) ||
- !get_data_page(mfn_to_page(m), d, writeable) )
- {
- while ( m-- > mfn )
- put_data_page(mfn_to_page(m), writeable);
- return -EINVAL;
- }
- } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
-
- rc = 1;
+ MEM_LOG("Unaligned superpage map attempt mfn %lx", mfn);
+ return -EINVAL;
}
+ /*
+ * The removed loop rejected frames failing mfn_valid(); keep doing so
+ * before the mfn is used to index spage_table / frame_table.
+ */
+ if ( !mfn_valid(mfn) || !mfn_valid(mfn | (L1_PAGETABLE_ENTRIES-1)) )
+ {
+ MEM_LOG("Superpage map attempt on invalid mfn %lx", mfn);
+ return -EINVAL;
+ }
+#ifdef __x86_64__
+ return get_superpage(mfn_to_spage(mfn), d);
+#else
+ /* get_spage_pages() returns 1/0; a 0 must be reported as an error. */
+ return get_spage_pages(mfn_to_page(mfn), d) ? 1 : -EINVAL;
+#endif
- return rc;
}
@@ -1101,13 +1120,11 @@ static int put_page_from_l2e(l2_pgentry_
if ( l2e_get_flags(l2e) & _PAGE_PSE )
{
- unsigned long mfn = l2e_get_pfn(l2e), m = mfn;
- int writeable = l2e_get_flags(l2e) & _PAGE_RW;
-
- ASSERT(!(mfn & (L1_PAGETABLE_ENTRIES-1)));
- do {
- put_data_page(mfn_to_page(m), writeable);
- } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
+#ifdef __x86_64__
+ put_superpage(mfn_to_spage(l2e_get_pfn(l2e)));
+#else
+ put_spage_pages(mfn_to_page(l2e_get_pfn(l2e)));
+#endif
}
else
{
@@ -2445,6 +2462,169 @@ int get_page_type_preemptible(struct pag
return __get_page_type(page, type, 1);
}
+/*
+ * Take a PGT_writable_page type reference on each of the
+ * (1 << PAGETABLE_ORDER) consecutive pages starting at @page.
+ * Returns 1 on success.  On failure every reference already taken is
+ * dropped again and 0 is returned.
+ */
+static int get_spage_pages(struct page_info *page, struct domain *d)
+{
+    int nr;
+
+    for ( nr = 0; nr < (1<<PAGETABLE_ORDER); nr++ )
+    {
+        if ( get_page_and_type(page + nr, d, PGT_writable_page) )
+            continue;
+
+        /* Unwind, most recently acquired page first. */
+        while ( nr-- > 0 )
+            put_page_and_type(page + nr);
+        return 0;
+    }
+
+    return 1;
+}
+
+/*
+ * Drop one page-and-type reference from each of the
+ * (1 << PAGETABLE_ORDER) consecutive pages starting at @page.
+ */
+static void put_spage_pages(struct page_info *page)
+{
+    struct page_info *end = page + (1<<PAGETABLE_ORDER);
+
+    while ( page < end )
+        put_page_and_type(page++);
+}
+
+#ifdef __x86_64__
+/*
+ * Set a superpage's type to SGT_mark and increment its use count.
+ *
+ * Page references are taken through get_spage_pages() only when the
+ * superpage is currently SGT_none; a superpage that is already SGT_dynamic
+ * keeps the references it has.
+ *
+ * Returns 0 on success, or -EINVAL if the superpage is already SGT_mark or
+ * the page references cannot be obtained.
+ */
+static int mark_superpage(struct spage_info *spage, struct domain *d)
+{
+    unsigned long x, nx, y = spage->type_info;
+    int pages_done = 0;
+
+    do {
+        x = y;
+        nx = x + 1;
+        if ((x & SGT_type_mask) == SGT_mark)
+        {
+            MEM_LOG("Duplicate superpage mark attempt mfn %lx",
+                    spage_to_mfn(spage));
+            /* Give back references taken on an earlier pass of the loop. */
+            if (pages_done)
+                put_spage_pages(spage_to_page(spage));
+
+            return -EINVAL;
+        }
+        if ((x & SGT_type_mask) == SGT_dynamic)
+        {
+            /* Type changed since an earlier pass: our references are surplus. */
+            if (pages_done)
+            {
+                put_spage_pages(spage_to_page(spage));
+                pages_done = 0;
+            }
+        }
+        else if (!pages_done)
+        {
+            if (!get_spage_pages(spage_to_page(spage), d))
+            {
+                MEM_LOG("Superpage type conflict in mark attempt mfn %lx",
+                        spage_to_mfn(spage));
+                return -EINVAL;
+            }
+            pages_done = 1;
+        }
+        nx = (nx & ~SGT_type_mask) | SGT_mark;
+
+    } while ((y = cmpxchg(&spage->type_info, x, nx)) != x);
+    return 0;
+}
+
+/*
+ * Remove the mark from a superpage and decrement its use count.
+ *
+ * If the count reaches zero the type becomes SGT_none and the page
+ * references are dropped; otherwise the type becomes SGT_dynamic and the
+ * references are kept.
+ *
+ * Returns 0 on success, -EINVAL if the superpage is not SGT_mark.
+ *
+ * NOTE(review): no domain is passed in, so nothing here ties the unmark to
+ * the domain that issued the mark -- confirm this is intended.
+ */
+static int unmark_superpage(struct spage_info *spage)
+{
+    unsigned long x, nx, y = spage->type_info;
+    unsigned long do_pages;
+
+    do {
+        x = y;
+        nx = x - 1;
+        /* Decide afresh on every pass; a failed cmpxchg voids the last choice. */
+        do_pages = 0;
+        if ((x & SGT_type_mask) != SGT_mark)
+        {
+            MEM_LOG("Attempt to unmark unmarked superpage mfn %lx",
+                    spage_to_mfn(spage));
+            return -EINVAL;
+        }
+        if ((nx & SGT_count_mask) == 0)
+        {
+            nx = (nx & ~SGT_type_mask) | SGT_none;
+            do_pages = 1;
+        }
+        else
+        {
+            nx = (nx & ~SGT_type_mask) | SGT_dynamic;
+        }
+    } while ((y = cmpxchg(&spage->type_info, x, nx)) != x);
+
+    if (do_pages)
+        put_spage_pages(spage_to_page(spage));
+
+    return 0;
+}
+
+/*
+ * If the superpage containing @page currently has type SGT_mark, unmark it.
+ * The result of unmark_superpage() is not reported to the caller.
+ */
+void clear_superpage_mark(struct page_info *page)
+{
+    struct spage_info *sp = page_to_spage(page);
+    unsigned long type = sp->type_info & SGT_type_mask;
+
+    if ( type != SGT_mark )
+        return;
+
+    unmark_superpage(sp);
+}
+/*
+ * Account for one more mapping of a superpage by incrementing its count.
+ *
+ * When the superpage is SGT_none, page references are taken through
+ * get_spage_pages() and the type becomes SGT_dynamic.  For any other type
+ * only the count changes.
+ *
+ * Returns 0 on success, -EINVAL if the page references cannot be obtained.
+ */
+static int get_superpage(struct spage_info *spage, struct domain *d)
+{
+    unsigned long x, nx, y = spage->type_info;
+    int pages_done = 0;
+
+    do {
+        x = y;
+        nx = x + 1;
+        if ((x & SGT_type_mask) != SGT_none)
+        {
+            /* Someone else typed the superpage meanwhile: drop our surplus. */
+            if (pages_done)
+            {
+                put_spage_pages(spage_to_page(spage));
+                pages_done = 0;
+            }
+        }
+        else
+        {
+            /* Take the references once only, even if the cmpxchg is retried. */
+            if (!pages_done)
+            {
+                if (!get_spage_pages(spage_to_page(spage), d))
+                {
+                    MEM_LOG("Type conflict on superpage mapping mfn %lx",
+                            spage_to_mfn(spage));
+                    return -EINVAL;
+                }
+                pages_done = 1;
+            }
+            nx = (nx & ~SGT_type_mask) | SGT_dynamic;
+        }
+    } while ((y = cmpxchg(&spage->type_info, x, nx)) != x);
+    return 0;
+}
+
+/*
+ * Account for the removal of one mapping of a superpage by decrementing
+ * its count.  When an SGT_dynamic superpage's count reaches zero the type
+ * becomes SGT_none and the page references are dropped.  For other types
+ * only the count changes.
+ */
+static void put_superpage(struct spage_info *spage)
+{
+    unsigned long x, nx, y = spage->type_info;
+    unsigned long do_pages;
+
+    do {
+        x = y;
+        nx = x - 1;
+        /* Decide afresh on every pass; a failed cmpxchg voids the last choice. */
+        do_pages = 0;
+        if ((x & SGT_type_mask) == SGT_dynamic)
+        {
+            if ((nx & SGT_count_mask) == 0)
+            {
+                nx = (nx & ~SGT_type_mask) | SGT_none;
+                do_pages = 1;
+            }
+        }
+
+    } while ((y = cmpxchg(&spage->type_info, x, nx)) != x);
+
+    if (do_pages)
+        put_spage_pages(spage_to_page(spage));
+
+    return;
+}
+#endif
+
void cleanup_page_cacheattr(struct page_info *page)
{
uint32_t cacheattr =
@@ -3002,6 +3182,47 @@ int do_mmuext_op(
break;
}
+#ifdef __x86_64__
+        /*
+         * Mark / unmark the superpage whose first frame is op.arg1.mfn.
+         * spage_table is only initialised when opt_allow_superpage is set,
+         * and the mfn comes from the guest, so both are checked before the
+         * table is indexed.
+         */
+        case MMUEXT_MARK_SUPER:
+        {
+            unsigned long mfn;
+            struct spage_info *spage;
+
+            mfn = op.arg1.mfn;
+            if ( !opt_allow_superpage )
+            {
+                MEM_LOG("Superpages disallowed");
+                okay = 0;
+                break;
+            }
+            if (mfn & (L1_PAGETABLE_ENTRIES-1))
+            {
+                MEM_LOG("Unaligned superpage reference mfn %lx", mfn);
+                okay = 0;
+                break;
+            }
+            if ( !mfn_valid(mfn) || !mfn_valid(mfn | (L1_PAGETABLE_ENTRIES-1)) )
+            {
+                MEM_LOG("Invalid superpage reference mfn %lx", mfn);
+                okay = 0;
+                break;
+            }
+
+            spage = mfn_to_spage(mfn);
+            if (mark_superpage(spage, d) < 0)
+                okay = 0;
+
+            break;
+        }
+
+        case MMUEXT_UNMARK_SUPER:
+        {
+            unsigned long mfn;
+            struct spage_info *spage;
+
+            mfn = op.arg1.mfn;
+            if ( !opt_allow_superpage )
+            {
+                MEM_LOG("Superpages disallowed");
+                okay = 0;
+                break;
+            }
+            if (mfn & (L1_PAGETABLE_ENTRIES-1))
+            {
+                MEM_LOG("Unaligned superpage reference mfn %lx", mfn);
+                okay = 0;
+                break;
+            }
+            if ( !mfn_valid(mfn) || !mfn_valid(mfn | (L1_PAGETABLE_ENTRIES-1)) )
+            {
+                MEM_LOG("Invalid superpage reference mfn %lx", mfn);
+                okay = 0;
+                break;
+            }
+
+            spage = mfn_to_spage(mfn);
+            if (unmark_superpage(spage) < 0)
+                okay = 0;
+
+            break;
+        }
+#endif
+
default:
MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
rc = -ENOSYS;
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|