# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID 954f4dea9da6336aaa35d0706aed55fde7909644
# Parent 3ed325fa395bb75f846b29bd3fa571ffc03eb5e4
[PAE] Allow pgdirs above 4GB for paravirt guests.
**NOTE**: This obviates the need for lowmem_emergency_pool.
Unprivileged guests no longer need to be able to allocate memory
below 4GB for PAE PDPTs.
Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
---
tools/libxc/xc_linux_build.c | 13 ----
tools/libxc/xc_linux_restore.c | 122 +++--------------------------------------
tools/libxc/xc_private.c | 22 -------
tools/libxc/xenctrl.h | 3 -
xen/arch/x86/domain_build.c | 5 +
xen/arch/x86/mm.c | 98 ++++++++++++++++++++++++++++----
xen/common/kernel.c | 5 -
xen/include/asm-x86/domain.h | 12 ++++
8 files changed, 113 insertions(+), 167 deletions(-)
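
For illustration: once the hypervisor advertises XENFEAT_pae_pgdir_above_4gb
(see the xen/common/kernel.c hunk below), a guest can make its low-memory
pgd constraint conditional. A minimal sketch, assuming Linux-style guest
helpers (xen_feature(), GFP_DMA32, __get_free_page) that are not part of
this patch:

    #include <linux/gfp.h>
    #include <xen/features.h>

    /* Hypothetical guest-side fallback: only force the PAE pgd below
     * 4GB when the hypervisor lacks XENFEAT_pae_pgdir_above_4gb. */
    static unsigned long alloc_pae_pgd(void)
    {
        gfp_t gfp = GFP_KERNEL;

        if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
            gfp |= GFP_DMA32;   /* old hypervisor: PDPT must be <4GB */

        return __get_free_page(gfp);
    }
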
diff -r 3ed325fa395b -r 954f4dea9da6 tools/libxc/xc_linux_build.c
--- a/tools/libxc/xc_linux_build.c Fri May 26 13:53:49 2006 +0100
+++ b/tools/libxc/xc_linux_build.c Fri May 26 17:22:30 2006 +0100
@@ -268,21 +268,10 @@ static int setup_pg_tables_pae(int xc_ha
l2_pgentry_64_t *vl2tab = NULL, *vl2e = NULL;
l3_pgentry_64_t *vl3tab = NULL, *vl3e = NULL;
uint64_t l1tab, l2tab, l3tab, pl1tab, pl2tab, pl3tab;
- unsigned long ppt_alloc, count, nmfn;
+ unsigned long ppt_alloc, count;
/* First allocate page for page dir. */
ppt_alloc = (vpt_start - dsi_v_start) >> PAGE_SHIFT;
-
- if ( page_array[ppt_alloc] > 0xfffff )
- {
- nmfn = xc_make_page_below_4G(xc_handle, dom, page_array[ppt_alloc]);
- if ( nmfn == 0 )
- {
- fprintf(stderr, "Couldn't get a page below 4GB :-(\n");
- goto error_out;
- }
- page_array[ppt_alloc] = nmfn;
- }
alloc_pt(l3tab, vl3tab, pl3tab);
vl3e = &vl3tab[l3_table_offset_pae(dsi_v_start)];
diff -r 3ed325fa395b -r 954f4dea9da6 tools/libxc/xc_linux_restore.c
--- a/tools/libxc/xc_linux_restore.c Fri May 26 13:53:49 2006 +0100
+++ b/tools/libxc/xc_linux_restore.c Fri May 26 17:22:30 2006 +0100
@@ -331,25 +331,17 @@ int xc_linux_restore(int xc_handle, int
** A page table page - need to 'uncanonicalize' it, i.e.
** replace all the references to pfns with the corresponding
** mfns for the new domain.
- **
- ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
- ** so we may need to update the p2m after the main loop.
- ** Hence we defer canonicalization of L1s until then.
*/
- if(pt_levels != 3 || pagetype != L1TAB) {
-
- if(!uncanonicalize_pagetable(pagetype, page)) {
- /*
- ** Failing to uncanonicalize a page table can be ok
- ** under live migration since the page's type may have
- ** changed by now (and we'll get an update later).
- */
- DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
- pagetype >> 28, pfn, mfn);
- nraces++;
- continue;
- }
-
+ if(!uncanonicalize_pagetable(pagetype, page)) {
+ /*
+ ** Failing to uncanonicalize a page table can be ok
+ ** under live migration since the page's type may have
+ ** changed by now (and we'll get an update later).
+ */
+ DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
+ pagetype >> 28, pfn, mfn);
+ nraces++;
+ continue;
}
} else if(pagetype != NOTAB) {
@@ -397,100 +389,6 @@ int xc_linux_restore(int xc_handle, int
}
DPRINTF("Received all pages (%d races)\n", nraces);
-
- if(pt_levels == 3) {
-
- /*
- ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
- ** is a little awkward and involves (a) finding all such PGDs and
- ** replacing them with 'lowmem' versions; (b) updating the p2m[]
- ** with the new info; and (c) canonicalizing all the L1s using the
- ** (potentially updated) p2m[].
- **
- ** This is relatively slow (and currently involves two passes through
- ** the pfn_type[] array), but at least seems to be correct. May wish
- ** to consider more complex approaches to optimize this later.
- */
-
- int j, k;
-
- /* First pass: find all L3TABs currently in > 4G mfns and get new mfns */
- for (i = 0; i < max_pfn; i++) {
-
- if (((pfn_type[i] & LTABTYPE_MASK)==L3TAB) && (p2m[i]>0xfffffUL)) {
-
- unsigned long new_mfn;
- uint64_t l3ptes[4];
- uint64_t *l3tab;
-
- l3tab = (uint64_t *)
- xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
- PROT_READ, p2m[i]);
-
- for(j = 0; j < 4; j++)
- l3ptes[j] = l3tab[j];
-
- munmap(l3tab, PAGE_SIZE);
-
- if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
- ERR("Couldn't get a page below 4GB :-(");
- goto out;
- }
-
- p2m[i] = new_mfn;
- if (xc_add_mmu_update(xc_handle, mmu,
- (((unsigned long long)new_mfn)
- << PAGE_SHIFT) |
- MMU_MACHPHYS_UPDATE, i)) {
- ERR("Couldn't m2p on PAE root pgdir");
- goto out;
- }
-
- l3tab = (uint64_t *)
- xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
- PROT_READ | PROT_WRITE, p2m[i]);
-
- for(j = 0; j < 4; j++)
- l3tab[j] = l3ptes[j];
-
- munmap(l3tab, PAGE_SIZE);
-
- }
- }
-
- /* Second pass: find all L1TABs and uncanonicalize them */
- j = 0;
-
- for(i = 0; i < max_pfn; i++) {
-
- if (((pfn_type[i] & LTABTYPE_MASK)==L1TAB)) {
- region_mfn[j] = p2m[i];
- j++;
- }
-
- if(i == (max_pfn-1) || j == MAX_BATCH_SIZE) {
-
- if (!(region_base = xc_map_foreign_batch(
- xc_handle, dom, PROT_READ | PROT_WRITE,
- region_mfn, j))) {
- ERR("map batch failed");
- goto out;
- }
-
- for(k = 0; k < j; k++) {
- if(!uncanonicalize_pagetable(L1TAB,
- region_base + k*PAGE_SIZE)) {
- ERR("failed uncanonicalize pt!");
- goto out;
- }
- }
-
- munmap(region_base, j*PAGE_SIZE);
- j = 0;
- }
- }
-
- }
if (xc_finish_mmu_updates(xc_handle, mmu)) {
diff -r 3ed325fa395b -r 954f4dea9da6 tools/libxc/xc_private.c
--- a/tools/libxc/xc_private.c Fri May 26 13:53:49 2006 +0100
+++ b/tools/libxc/xc_private.c Fri May 26 17:22:30 2006 +0100
@@ -430,28 +430,6 @@ int xc_version(int xc_handle, int cmd, v
return rc;
}
-unsigned long xc_make_page_below_4G(
- int xc_handle, uint32_t domid, unsigned long mfn)
-{
- unsigned long new_mfn;
-
- if ( xc_domain_memory_decrease_reservation(
- xc_handle, domid, 1, 0, &mfn) != 0 )
- {
- fprintf(stderr,"xc_make_page_below_4G decrease failed. mfn=%lx\n",mfn);
- return 0;
- }
-
- if ( xc_domain_memory_increase_reservation(
- xc_handle, domid, 1, 0, 32, &new_mfn) != 0 )
- {
- fprintf(stderr,"xc_make_page_below_4G increase failed. mfn=%lx\n",mfn);
- return 0;
- }
-
- return new_mfn;
-}
-
/*
* Local variables:
* mode: C
diff -r 3ed325fa395b -r 954f4dea9da6 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h Fri May 26 13:53:49 2006 +0100
+++ b/tools/libxc/xenctrl.h Fri May 26 17:22:30 2006 +0100
@@ -453,9 +453,6 @@ int xc_domain_iomem_permission(int xc_ha
unsigned long nr_mfns,
uint8_t allow_access);
-unsigned long xc_make_page_below_4G(int xc_handle, uint32_t domid,
- unsigned long mfn);
-
typedef dom0_perfc_desc_t xc_perfc_desc_t;
/* IMPORTANT: The caller is responsible for mlock()'ing the @desc array. */
int xc_perfc_control(int xc_handle,
diff -r 3ed325fa395b -r 954f4dea9da6 xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c Fri May 26 13:53:49 2006 +0100
+++ b/xen/arch/x86/domain_build.c Fri May 26 17:22:30 2006 +0100
@@ -367,7 +367,10 @@ int construct_dom0(struct domain *d,
if ( (1UL << order) > nr_pages )
panic("Domain 0 allocation is too small for kernel image.\n");
- /* Allocate from DMA pool: PAE L3 table must be below 4GB boundary. */
+ /*
+ * Allocate from DMA pool: on i386 this ensures that our low-memory 1:1
+ * mapping covers the allocation.
+ */
if ( (page = alloc_domheap_pages(d, order, ALLOC_DOM_DMA)) == NULL )
panic("Not enough RAM for domain 0 allocation.\n");
alloc_spfn = page_to_mfn(page);
diff -r 3ed325fa395b -r 954f4dea9da6 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Fri May 26 13:53:49 2006 +0100
+++ b/xen/arch/x86/mm.c Fri May 26 17:22:30 2006 +0100
@@ -260,9 +260,42 @@ void share_xen_page_with_privileged_gues
share_xen_page_with_guest(page, dom_xen, readonly);
}
+static void __write_ptbase(unsigned long mfn)
+{
+#ifdef CONFIG_X86_PAE
+ if ( mfn >= 0x100000 )
+ {
+ l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
+ struct vcpu *v = current;
+ unsigned long flags;
+
+ /* Protects against re-entry and against __pae_flush_pgd(). */
+ local_irq_save(flags);
+
+ /* Pick an unused low-memory L3 cache slot. */
+ v->arch.lowmem_l3tab_inuse ^= 1;
+ lowmem_l3tab = v->arch.lowmem_l3tab[v->arch.lowmem_l3tab_inuse];
+ v->arch.lowmem_l3tab_high_mfn[v->arch.lowmem_l3tab_inuse] = mfn;
+
+ /* Map the guest L3 table and copy to the chosen low-memory cache. */
+ highmem_l3tab = map_domain_page(mfn);
+ memcpy(lowmem_l3tab, highmem_l3tab, sizeof(v->arch.lowmem_l3tab[0]));
+ unmap_domain_page(highmem_l3tab);
+
+ /* Install the low-memory L3 table in CR3. */
+ write_cr3(__pa(lowmem_l3tab));
+
+ local_irq_restore(flags);
+ return;
+ }
+#endif
+
+ write_cr3(mfn << PAGE_SHIFT);
+}
+
void write_ptbase(struct vcpu *v)
{
- write_cr3(pagetable_get_paddr(v->arch.monitor_table));
+ __write_ptbase(pagetable_get_pfn(v->arch.monitor_table));
}
void invalidate_shadow_ldt(struct vcpu *v)
@@ -401,6 +434,7 @@ static int get_page_and_type_from_pagenr
return 1;
}
+#ifndef CONFIG_X86_PAE /* We do not support guest linear mappings on PAE. */
/*
* We allow root tables to map each other (a.k.a. linear page tables). It
* needs some special care with reference counts and access permissions:
@@ -456,6 +490,7 @@ get_linear_pagetable(
return 1;
}
+#endif /* !CONFIG_X86_PAE */
int
get_page_from_l1e(
@@ -564,10 +599,6 @@ get_page_from_l3e(
rc = get_page_and_type_from_pagenr(
l3e_get_pfn(l3e),
PGT_l2_page_table | vaddr, d);
-#if CONFIG_PAGING_LEVELS == 3
- if ( unlikely(!rc) )
- rc = get_linear_pagetable(l3e, pfn, d);
-#endif
return rc;
}
#endif /* 3 level */
@@ -773,6 +804,50 @@ static int create_pae_xen_mappings(l3_pg
return 1;
}
+struct pae_flush_pgd {
+ unsigned long l3tab_mfn;
+ unsigned int l3tab_idx;
+ l3_pgentry_t nl3e;
+};
+
+static void __pae_flush_pgd(void *data)
+{
+ struct pae_flush_pgd *args = data;
+ struct vcpu *v = this_cpu(curr_vcpu);
+ int i = v->arch.lowmem_l3tab_inuse;
+ intpte_t _ol3e, _nl3e, _pl3e;
+ l3_pgentry_t *l3tab_ptr;
+
+ ASSERT(!local_irq_is_enabled());
+
+ if ( v->arch.lowmem_l3tab_high_mfn[i] != args->l3tab_mfn )
+ return;
+
+ l3tab_ptr = &v->arch.lowmem_l3tab[i][args->l3tab_idx];
+
+ _ol3e = l3e_get_intpte(*l3tab_ptr);
+ _nl3e = l3e_get_intpte(args->nl3e);
+ _pl3e = cmpxchg((intpte_t *)l3tab_ptr, _ol3e, _nl3e);
+ BUG_ON(_pl3e != _ol3e);
+}
+
+/* Flush a pgdir update into low-memory caches. */
+static void pae_flush_pgd(
+ unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
+{
+ struct domain *d = page_get_owner(mfn_to_page(mfn));
+ struct pae_flush_pgd args = {
+ .l3tab_mfn = mfn,
+ .l3tab_idx = idx,
+ .nl3e = nl3e };
+
+ /* If below 4GB then the pgdir is not shadowed in low memory. */
+ if ( mfn < 0x100000 )
+ return;
+
+ on_selected_cpus(d->domain_dirty_cpumask, __pae_flush_pgd, &args, 1, 1);
+}
+
static inline int l1_backptr(
unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
{
@@ -787,6 +862,7 @@ static inline int l1_backptr(
#elif CONFIG_X86_64
# define create_pae_xen_mappings(pl3e) (1)
+# define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
static inline int l1_backptr(
unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
@@ -886,14 +962,6 @@ static int alloc_l3_table(struct page_in
ASSERT(!shadow_mode_refcounts(d));
-#ifdef CONFIG_X86_PAE
- if ( pfn >= 0x100000 )
- {
- MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
- return 0;
- }
-#endif
-
pl3e = map_domain_page(pfn);
for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
{
@@ -1240,6 +1308,8 @@ static int mod_l3_entry(l3_pgentry_t *pl
okay = create_pae_xen_mappings(pl3e);
BUG_ON(!okay);
+
+ pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
put_page_from_l3e(ol3e, pfn);
return 1;
@@ -3109,7 +3179,7 @@ void ptwr_flush(struct domain *d, const
if ( unlikely(d->arch.ptwr[which].vcpu != current) )
/* Don't use write_ptbase: it may switch to guest_user on x86/64! */
- write_cr3(pagetable_get_paddr(
+ __write_ptbase(pagetable_get_pfn(
d->arch.ptwr[which].vcpu->arch.guest_table));
else
TOGGLE_MODE();
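
The key mechanism above is the per-VCPU pair of low-memory shadow L3
tables: __write_ptbase() always copies the guest's high PDPT into the
slot *not* currently referenced by CR3, and pae_flush_pgd() uses an IPI
to keep live copies coherent when a guest L3 entry changes. A standalone
toy model of the slot rotation (plain C, illustration only, not Xen code):

    #include <stdint.h>
    #include <string.h>

    struct vcpu_model {
        uint64_t lowmem_l3tab[2][4];   /* two <4GB shadow PDPTs */
        unsigned long high_mfn[2];     /* >=4GB MFN each slot shadows */
        unsigned int inuse;            /* slot currently loaded in CR3 */
    };

    /* Stage a high PDPT into the idle slot; the live slot, which the
     * CPU may still be walking, is never the copy target. */
    static uint64_t *stage_high_pgdir(struct vcpu_model *v,
                                      unsigned long mfn,
                                      const uint64_t guest_l3[4])
    {
        v->inuse ^= 1;
        v->high_mfn[v->inuse] = mfn;
        memcpy(v->lowmem_l3tab[v->inuse], guest_l3,
               sizeof(v->lowmem_l3tab[v->inuse]));
        return v->lowmem_l3tab[v->inuse];  /* __pa() of this goes in CR3 */
    }
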
diff -r 3ed325fa395b -r 954f4dea9da6 xen/common/kernel.c
--- a/xen/common/kernel.c Fri May 26 13:53:49 2006 +0100
+++ b/xen/common/kernel.c Fri May 26 17:22:30 2006 +0100
@@ -191,12 +191,11 @@ long do_xen_version(int cmd, XEN_GUEST_H
switch ( fi.submap_idx )
{
case 0:
- fi.submap = 0;
+ fi.submap = (1U << XENFEAT_pae_pgdir_above_4gb);
if ( shadow_mode_translate(current->domain) )
fi.submap |=
(1U << XENFEAT_writable_page_tables) |
- (1U << XENFEAT_auto_translated_physmap) |
- (1U << XENFEAT_pae_pgdir_above_4gb);
+ (1U << XENFEAT_auto_translated_physmap);
if ( supervisor_mode_kernel )
fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
break;
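
A guest consumes the new feature bit via the XENVER_get_features
hypercall. A sketch, assuming the guest's usual HYPERVISOR_xen_version
wrapper and Linux-style header paths:

    #include <xen/interface/version.h>
    #include <xen/interface/features.h>

    /* Nonzero if this hypervisor allows PAE pgdirs above 4GB. */
    static int pae_pgdir_may_be_high(void)
    {
        xen_feature_info_t fi = { .submap_idx = 0 };

        if (HYPERVISOR_xen_version(XENVER_get_features, &fi) != 0)
            return 0;   /* old hypervisor: stay conservative */

        return !!(fi.submap & (1U << XENFEAT_pae_pgdir_above_4gb));
    }
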
diff -r 3ed325fa395b -r 954f4dea9da6 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h Fri May 26 13:53:49 2006 +0100
+++ b/xen/include/asm-x86/domain.h Fri May 26 17:22:30 2006 +0100
@@ -120,6 +120,18 @@ struct arch_vcpu
struct vcpu_guest_context guest_context
__attribute__((__aligned__(16)));
+#ifdef CONFIG_X86_PAE
+ /*
+ * Two low-memory (<4GB) PAE L3 tables, used as fallback when the guest
+ * supplies a >=4GB PAE L3 table. We need two because we cannot set up
+ * an L3 table while we are currently running on it (without using
+ * expensive atomic 64-bit operations).
+ */
+ l3_pgentry_t lowmem_l3tab[2][4] __attribute__((__aligned__(32)));
+ unsigned long lowmem_l3tab_high_mfn[2]; /* The >=4GB MFN being shadowed. */
+ unsigned int lowmem_l3tab_inuse; /* Which lowmem_l3tab is in use? */
+#endif
+
unsigned long flags; /* TF_ */
void (*schedule_tail) (struct vcpu *);