To: <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] RE: [Xen-changelog] [PAE] Allow pgdirs above 4GB for paravirt guests.
From: "Nakajima, Jun" <jun.nakajima@xxxxxxxxx>
Date: Fri, 26 May 2006 15:31:31 -0700
Delivery-date: Fri, 26 May 2006 15:32:01 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
Thread-index: AcaA9XOUcCUTGk4OTlCgDOS4M467JAAHkjUg
Thread-topic: [Xen-changelog] [PAE] Allow pgdirs above 4GB for paravirt guests.
Keir, 

Dom0 hangs with this patch when loading modules. If I back it out, it
boots fine. I configured dom0 as SMP (and the machine has >4GB).

Jun
---
Intel Open Source Technology Center 

-----Original Message-----
From: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx [mailto:xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx] On Behalf Of Xen patchbot-unstable
Sent: Friday, May 26, 2006 11:48 AM
To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [PAE] Allow pgdirs above 4GB for paravirt guests.

# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID 954f4dea9da6336aaa35d0706aed55fde7909644
# Parent  3ed325fa395bb75f846b29bd3fa571ffc03eb5e4
[PAE] Allow pgdirs above 4GB for paravirt guests.
**NOTE**: This obviates the need for lowmem_emergency_pool.
Unpriv guests no longer need to be able to allocate memory
below 4GB for PAE PDPTs.
Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
---
 tools/libxc/xc_linux_build.c   |   13 ----
 tools/libxc/xc_linux_restore.c |  122 +++--------------------------------------
 tools/libxc/xc_private.c       |   22 -------
 tools/libxc/xenctrl.h          |    3 -
 xen/arch/x86/domain_build.c    |    5 +
 xen/arch/x86/mm.c              |   98 ++++++++++++++++++++++++++++----
 xen/common/kernel.c            |    5 -
 xen/include/asm-x86/domain.h   |   12 ++++
 8 files changed, 113 insertions(+), 167 deletions(-)

diff -r 3ed325fa395b -r 954f4dea9da6 tools/libxc/xc_linux_build.c
--- a/tools/libxc/xc_linux_build.c      Fri May 26 13:53:49 2006 +0100
+++ b/tools/libxc/xc_linux_build.c      Fri May 26 17:22:30 2006 +0100
@@ -268,21 +268,10 @@ static int setup_pg_tables_pae(int xc_ha
     l2_pgentry_64_t *vl2tab = NULL, *vl2e = NULL;
     l3_pgentry_64_t *vl3tab = NULL, *vl3e = NULL;
     uint64_t l1tab, l2tab, l3tab, pl1tab, pl2tab, pl3tab;
-    unsigned long ppt_alloc, count, nmfn;
+    unsigned long ppt_alloc, count;
 
     /* First allocate page for page dir. */
     ppt_alloc = (vpt_start - dsi_v_start) >> PAGE_SHIFT;
-
-    if ( page_array[ppt_alloc] > 0xfffff )
-    {
-        nmfn = xc_make_page_below_4G(xc_handle, dom, page_array[ppt_alloc]);
-        if ( nmfn == 0 )
-        {
-            fprintf(stderr, "Couldn't get a page below 4GB :-(\n");
-            goto error_out;
-        }
-        page_array[ppt_alloc] = nmfn;
-    }
 
     alloc_pt(l3tab, vl3tab, pl3tab);
     vl3e = &vl3tab[l3_table_offset_pae(dsi_v_start)];
diff -r 3ed325fa395b -r 954f4dea9da6 tools/libxc/xc_linux_restore.c
--- a/tools/libxc/xc_linux_restore.c    Fri May 26 13:53:49 2006 +0100
+++ b/tools/libxc/xc_linux_restore.c    Fri May 26 17:22:30 2006 +0100
@@ -331,25 +331,17 @@ int xc_linux_restore(int xc_handle, int 
                ** A page table page - need to 'uncanonicalize' it, i.e.
                ** replace all the references to pfns with the corresponding
                ** mfns for the new domain.
-                **
-                ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
-                ** so we may need to update the p2m after the main loop.
-                ** Hence we defer canonicalization of L1s until then.
                 */
-                if(pt_levels != 3 || pagetype != L1TAB) {
-
-                    if(!uncanonicalize_pagetable(pagetype, page)) {
-                        /*
-                        ** Failing to uncanonicalize a page table can be ok
-                        ** under live migration since the pages type may have
-                        ** changed by now (and we'll get an update later).
-                        */
-                        DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
-                                pagetype >> 28, pfn, mfn);
-                        nraces++;
-                        continue;
-                    }
-
+                if(!uncanonicalize_pagetable(pagetype, page)) {
+                    /*
+                    ** Failing to uncanonicalize a page table can be ok
+                    ** under live migration since the pages type may have
+                    ** changed by now (and we'll get an update later).
+                    */
+                    DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
+                            pagetype >> 28, pfn, mfn);
+                    nraces++;
+                    continue;
                 }
 
             } else if(pagetype != NOTAB) {
@@ -397,100 +389,6 @@ int xc_linux_restore(int xc_handle, int 
     }
 
     DPRINTF("Received all pages (%d races)\n", nraces);
-
-    if(pt_levels == 3) {
-
-        /*
-        ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
-        ** is a little awkward and involves (a) finding all such PGDs and
-        ** replacing them with 'lowmem' versions; (b) upating the p2m[]
-        ** with the new info; and (c) canonicalizing all the L1s using the
-        ** (potentially updated) p2m[].
-        **
-        ** This is relatively slow (and currently involves two passes through
-        ** the pfn_type[] array), but at least seems to be correct. May wish
-        ** to consider more complex approaches to optimize this later.
-        */
-
-        int j, k;
-
-        /* First pass: find all L3TABs current in > 4G mfns and get new mfns */
-        for (i = 0; i < max_pfn; i++) {
-
-            if (((pfn_type[i] & LTABTYPE_MASK)==L3TAB) && (p2m[i]>0xfffffUL)) {
-
-                unsigned long new_mfn;
-                uint64_t l3ptes[4];
-                uint64_t *l3tab;
-
-                l3tab = (uint64_t *)
-                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
-                                         PROT_READ, p2m[i]);
-
-                for(j = 0; j < 4; j++)
-                    l3ptes[j] = l3tab[j];
-
-                munmap(l3tab, PAGE_SIZE);
-
-                if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
-                    ERR("Couldn't get a page below 4GB :-(");
-                    goto out;
-                }
-
-                p2m[i] = new_mfn;
-                if (xc_add_mmu_update(xc_handle, mmu,
-                                      (((unsigned long long)new_mfn)
-                                       << PAGE_SHIFT) |
-                                      MMU_MACHPHYS_UPDATE, i)) {
-                    ERR("Couldn't m2p on PAE root pgdir");
-                    goto out;
-                }
-
-                l3tab = (uint64_t *)
-                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
-                                         PROT_READ | PROT_WRITE, p2m[i]);
-
-                for(j = 0; j < 4; j++)
-                    l3tab[j] = l3ptes[j];
-
-                munmap(l3tab, PAGE_SIZE);
-
-            }
-        }
-
-        /* Second pass: find all L1TABs and uncanonicalize them */
-        j = 0;
-
-        for(i = 0; i < max_pfn; i++) {
-
-            if (((pfn_type[i] & LTABTYPE_MASK)==L1TAB)) {
-                region_mfn[j] = p2m[i];
-                j++;
-            }
-
-            if(i == (max_pfn-1) || j == MAX_BATCH_SIZE) {
-
-                if (!(region_base = xc_map_foreign_batch(
-                          xc_handle, dom, PROT_READ | PROT_WRITE,
-                          region_mfn, j))) {
-                    ERR("map batch failed");
-                    goto out;
-                }
-
-                for(k = 0; k < j; k++) {
-                    if(!uncanonicalize_pagetable(L1TAB,
-                                                 region_base + k*PAGE_SIZE)) {
-                        ERR("failed uncanonicalize pt!");
-                        goto out;
-                    }
-                }
-
-                munmap(region_base, j*PAGE_SIZE);
-                j = 0;
-            }
-        }
-
-    }
 
 
     if (xc_finish_mmu_updates(xc_handle, mmu)) {
diff -r 3ed325fa395b -r 954f4dea9da6 tools/libxc/xc_private.c
--- a/tools/libxc/xc_private.c  Fri May 26 13:53:49 2006 +0100
+++ b/tools/libxc/xc_private.c  Fri May 26 17:22:30 2006 +0100
@@ -430,28 +430,6 @@ int xc_version(int xc_handle, int cmd, v
     return rc;
 }
 
-unsigned long xc_make_page_below_4G(
-    int xc_handle, uint32_t domid, unsigned long mfn)
-{
-    unsigned long new_mfn;
-
-    if ( xc_domain_memory_decrease_reservation(
-        xc_handle, domid, 1, 0, &mfn) != 0 )
-    {
-        fprintf(stderr,"xc_make_page_below_4G decrease failed. mfn=%lx\n",mfn);
-        return 0;
-    }
-
-    if ( xc_domain_memory_increase_reservation(
-        xc_handle, domid, 1, 0, 32, &new_mfn) != 0 )
-    {
-        fprintf(stderr,"xc_make_page_below_4G increase failed. mfn=%lx\n",mfn);
-        return 0;
-    }
-
-    return new_mfn;
-}
-
 /*
  * Local variables:
  * mode: C
diff -r 3ed325fa395b -r 954f4dea9da6 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Fri May 26 13:53:49 2006 +0100
+++ b/tools/libxc/xenctrl.h     Fri May 26 17:22:30 2006 +0100
@@ -453,9 +453,6 @@ int xc_domain_iomem_permission(int xc_ha
                                unsigned long nr_mfns,
                                uint8_t allow_access);
 
-unsigned long xc_make_page_below_4G(int xc_handle, uint32_t domid,
-                                    unsigned long mfn);
-
 typedef dom0_perfc_desc_t xc_perfc_desc_t;
 /* IMPORTANT: The caller is responsible for mlock()'ing the @desc array. */
 int xc_perfc_control(int xc_handle,
diff -r 3ed325fa395b -r 954f4dea9da6 xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c       Fri May 26 13:53:49 2006 +0100
+++ b/xen/arch/x86/domain_build.c       Fri May 26 17:22:30 2006 +0100
@@ -367,7 +367,10 @@ int construct_dom0(struct domain *d,
     if ( (1UL << order) > nr_pages )
         panic("Domain 0 allocation is too small for kernel image.\n");
 
-    /* Allocate from DMA pool: PAE L3 table must be below 4GB boundary. */
+    /*
+     * Allocate from DMA pool: on i386 this ensures that our low-memory 1:1
+     * mapping covers the allocation.
+     */
     if ( (page = alloc_domheap_pages(d, order, ALLOC_DOM_DMA)) == NULL )
         panic("Not enough RAM for domain 0 allocation.\n");
     alloc_spfn = page_to_mfn(page);
diff -r 3ed325fa395b -r 954f4dea9da6 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Fri May 26 13:53:49 2006 +0100
+++ b/xen/arch/x86/mm.c Fri May 26 17:22:30 2006 +0100
@@ -260,9 +260,42 @@ void share_xen_page_with_privileged_gues
     share_xen_page_with_guest(page, dom_xen, readonly);
 }
 
+static void __write_ptbase(unsigned long mfn)
+{
+#ifdef CONFIG_X86_PAE
+    if ( mfn >= 0x100000 )
+    {
+        l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
+        struct vcpu *v = current;
+        unsigned long flags;
+
+        /* Protects against re-entry and against __pae_flush_pgd(). */
+        local_irq_save(flags);
+
+        /* Pick an unused low-memory L3 cache slot. */
+        v->arch.lowmem_l3tab_inuse ^= 1;
+        lowmem_l3tab = v->arch.lowmem_l3tab[v->arch.lowmem_l3tab_inuse];
+        v->arch.lowmem_l3tab_high_mfn[v->arch.lowmem_l3tab_inuse] = mfn;
+
+        /* Map the guest L3 table and copy to the chosen low-memory cache. */
+        highmem_l3tab = map_domain_page(mfn);
+        memcpy(lowmem_l3tab, highmem_l3tab, sizeof(v->arch.lowmem_l3tab));
+        unmap_domain_page(highmem_l3tab);
+
+        /* Install the low-memory L3 table in CR3. */
+        write_cr3(__pa(lowmem_l3tab));
+
+        local_irq_restore(flags);
+        return;
+    }
+#endif
+
+    write_cr3(mfn << PAGE_SHIFT);
+}
+
 void write_ptbase(struct vcpu *v)
 {
-    write_cr3(pagetable_get_paddr(v->arch.monitor_table));
+    __write_ptbase(pagetable_get_pfn(v->arch.monitor_table));
 }
 
 void invalidate_shadow_ldt(struct vcpu *v)
@@ -401,6 +434,7 @@ static int get_page_and_type_from_pagenr
     return 1;
 }
 
+#ifndef CONFIG_X86_PAE /* We do not support guest linear mappings on PAE. */
 /*
 * We allow root tables to map each other (a.k.a. linear page tables). It
 * needs some special care with reference counts and access permissions:
@@ -456,6 +490,7 @@ get_linear_pagetable(
 
     return 1;
 }
+#endif /* !CONFIG_X86_PAE */
 
 int
 get_page_from_l1e(
@@ -564,10 +599,6 @@ get_page_from_l3e(
     rc = get_page_and_type_from_pagenr(
         l3e_get_pfn(l3e),
         PGT_l2_page_table | vaddr, d);
-#if CONFIG_PAGING_LEVELS == 3
-    if ( unlikely(!rc) )
-        rc = get_linear_pagetable(l3e, pfn, d);
-#endif
     return rc;
 }
 #endif /* 3 level */
@@ -773,6 +804,50 @@ static int create_pae_xen_mappings(l3_pg
     return 1;
 }
 
+struct pae_flush_pgd {
+    unsigned long l3tab_mfn;
+    unsigned int  l3tab_idx;
+    l3_pgentry_t  nl3e;
+};
+
+static void __pae_flush_pgd(void *data)
+{
+    struct pae_flush_pgd *args = data;
+    struct vcpu *v = this_cpu(curr_vcpu);
+    int i = v->arch.lowmem_l3tab_inuse;
+    intpte_t _ol3e, _nl3e, _pl3e;
+    l3_pgentry_t *l3tab_ptr;
+
+    ASSERT(!local_irq_is_enabled());
+
+    if ( v->arch.lowmem_l3tab_high_mfn[i] != args->l3tab_mfn )
+        return;
+
+    l3tab_ptr = &v->arch.lowmem_l3tab[i][args->l3tab_idx];
+
+    _ol3e = l3e_get_intpte(*l3tab_ptr);
+    _nl3e = l3e_get_intpte(args->nl3e);
+    _pl3e = cmpxchg((intpte_t *)l3tab_ptr, _ol3e, _nl3e);
+    BUG_ON(_pl3e != _ol3e);
+}
+
+/* Flush a pgdir update into low-memory caches. */
+static void pae_flush_pgd(
+    unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
+{
+    struct domain *d = page_get_owner(mfn_to_page(mfn));
+    struct pae_flush_pgd args = {
+        .l3tab_mfn = mfn,
+        .l3tab_idx = idx,
+        .nl3e      = nl3e };
+
+    /* If below 4GB then the pgdir is not shadowed in low memory. */
+    if ( mfn < 0x100000 )
+        return;
+
+    on_selected_cpus(d->domain_dirty_cpumask, __pae_flush_pgd, &args, 1, 1);
+}
+
 static inline int l1_backptr(
     unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
 {
@@ -787,6 +862,7 @@ static inline int l1_backptr(
 
 #elif CONFIG_X86_64
 # define create_pae_xen_mappings(pl3e) (1)
+# define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
 
 static inline int l1_backptr(
     unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
@@ -886,14 +962,6 @@ static int alloc_l3_table(struct page_in
 
     ASSERT(!shadow_mode_refcounts(d));
 
-#ifdef CONFIG_X86_PAE
-    if ( pfn >= 0x100000 )
-    {
-        MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
-        return 0;
-    }
-#endif
-
     pl3e = map_domain_page(pfn);
     for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
     {
@@ -1240,6 +1308,8 @@ static int mod_l3_entry(l3_pgentry_t *pl
 
     okay = create_pae_xen_mappings(pl3e);
     BUG_ON(!okay);
+
+    pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
 
     put_page_from_l3e(ol3e, pfn);
     return 1;
@@ -3109,7 +3179,7 @@ void ptwr_flush(struct domain *d, const 
 
     if ( unlikely(d->arch.ptwr[which].vcpu != current) )
         /* Don't use write_ptbase: it may switch to guest_user on x86/64! */
-        write_cr3(pagetable_get_paddr(
+        __write_ptbase(pagetable_get_pfn(
             d->arch.ptwr[which].vcpu->arch.guest_table));
     else
         TOGGLE_MODE();
diff -r 3ed325fa395b -r 954f4dea9da6 xen/common/kernel.c
--- a/xen/common/kernel.c       Fri May 26 13:53:49 2006 +0100
+++ b/xen/common/kernel.c       Fri May 26 17:22:30 2006 +0100
@@ -191,12 +191,11 @@ long do_xen_version(int cmd, XEN_GUEST_H
         switch ( fi.submap_idx )
         {
         case 0:
-            fi.submap = 0;
+            fi.submap = (1U << XENFEAT_pae_pgdir_above_4gb);
             if ( shadow_mode_translate(current->domain) )
                 fi.submap |= 
                     (1U << XENFEAT_writable_page_tables) |
-                    (1U << XENFEAT_auto_translated_physmap) |
-                    (1U << XENFEAT_pae_pgdir_above_4gb);
+                    (1U << XENFEAT_auto_translated_physmap);
             if ( supervisor_mode_kernel )
                 fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
             break;
diff -r 3ed325fa395b -r 954f4dea9da6 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Fri May 26 13:53:49 2006 +0100
+++ b/xen/include/asm-x86/domain.h      Fri May 26 17:22:30 2006 +0100
@@ -120,6 +120,18 @@ struct arch_vcpu
     struct vcpu_guest_context guest_context
     __attribute__((__aligned__(16)));
 
+#ifdef CONFIG_X86_PAE
+    /*
+     * Two low-memory (<4GB) PAE L3 tables, used as fallback when the guest
+     * supplies a >=4GB PAE L3 table. We need two because we cannot set up
+     * an L3 table while we are currently running on it (without using
+     * expensive atomic 64-bit operations).
+     */
+    l3_pgentry_t  lowmem_l3tab[2][4] __attribute__((__aligned__(32)));
+    unsigned long lowmem_l3tab_high_mfn[2]; /* The >=4GB MFN being shadowed. */
+    unsigned int  lowmem_l3tab_inuse;       /* Which lowmem_l3tab is in use? */
+#endif
+
     unsigned long      flags; /* TF_ */
 
     void (*schedule_tail) (struct vcpu *);

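The mechanism at the core of the hypervisor side is the pair of per-VCPU
low-memory L3 tables the patch adds to arch_vcpu: when a guest pgdir lives
at or above 4GB, __write_ptbase() copies its four L3 entries into a <4GB
shadow and points CR3 at that, flipping between two slots because the slot
currently loaded in CR3 cannot safely be rewritten in place. A stand-alone
sketch of that slot-flipping step (plain C for illustration only, not the
hypervisor code; the struct and function names here are hypothetical):

#include <stdint.h>
#include <string.h>

#define L3_ENTRIES 4  /* a PAE L3 table holds four 64-bit entries */

struct lowmem_l3_cache {
    uint64_t tab[2][L3_ENTRIES];  /* two <4GB shadow copies */
    unsigned long high_mfn[2];    /* the >=4GB MFN each slot shadows */
    unsigned int inuse;           /* slot currently live in CR3 */
};

/*
 * Mirrors the __write_ptbase() slow path: flip to the unused slot,
 * record which guest pgdir it shadows, and copy the four entries in.
 * The caller would load the returned table's physical address into CR3.
 */
static uint64_t *shadow_high_pgdir(struct lowmem_l3_cache *c,
                                   unsigned long mfn,
                                   const uint64_t *guest_l3)
{
    c->inuse ^= 1;                 /* never rewrite the live slot */
    c->high_mfn[c->inuse] = mfn;
    memcpy(c->tab[c->inuse], guest_l3, sizeof(c->tab[c->inuse]));
    return c->tab[c->inuse];
}
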
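Once such shadows exist they must be kept coherent: when the guest updates
an L3 entry of a >=4GB pgdir, pae_flush_pgd() IPIs every CPU in the
domain's dirty mask via on_selected_cpus(), and each CPU patches its live
copy if it shadows the modified pgdir. A simplified sketch of the per-CPU
handler, reusing the hypothetical struct above (the real __pae_flush_pgd()
runs with IRQs off and uses cmpxchg(), BUG_ON()ing if the entry changed
underneath it):

struct pae_flush_args {
    unsigned long l3tab_mfn;  /* guest pgdir that was modified */
    unsigned int  idx;        /* which of the four L3 slots changed */
    uint64_t      nl3e;       /* new entry value */
};

/* Would run on each CPU in the domain's dirty mask, in IPI context. */
static void flush_one_cpu(struct lowmem_l3_cache *c,
                          const struct pae_flush_args *a)
{
    unsigned int i = c->inuse;

    /* Nothing to do unless the live slot shadows the modified pgdir. */
    if (c->high_mfn[i] != a->l3tab_mfn)
        return;

    /* A plain store stands in for the hypervisor's cmpxchg() here. */
    c->tab[i][a->idx] = a->nl3e;
}
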
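On the guest side, the visible change is that XENFEAT_pae_pgdir_above_4gb
is now set unconditionally in feature submap 0 (the xen/common/kernel.c
hunk). A minimal sketch of how a PAE guest kernel might probe for it
before letting its pgd be allocated anywhere; the header paths and the
HYPERVISOR_xen_version() wrapper follow the Linux guest port and are
assumptions, not part of this patch:

#include <xen/interface/version.h>   /* XENVER_get_features */
#include <xen/interface/features.h>  /* XENFEAT_pae_pgdir_above_4gb */

static int pae_pgdir_above_4gb_ok(void)
{
    xen_feature_info_t fi = { .submap_idx = 0 };

    if (HYPERVISOR_xen_version(XENVER_get_features, &fi) != 0)
        return 0; /* older hypervisor: keep the pgd below 4GB */

    return !!(fi.submap & (1U << XENFEAT_pae_pgdir_above_4gb));
}
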
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel