To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] x86: Move the guest pagetable walker out of shadow/multi.c
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Wed, 19 Nov 2008 08:10:28 -0800
Delivery-date: Wed, 19 Nov 2008 08:12:30 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1226581328 0
# Node ID 7fb33d15dc9bc5892e4708011beded66dd756be3
# Parent  b87cc4de3ca676e895f6374daed1d33a79849b9d
x86: Move the guest pagetable walker out of shadow/multi.c

Move the guest PT walker into its own file, and purge it of references
to the rest of the shadow code.

Signed-off-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
---
 xen/arch/x86/mm/Makefile         |    6 
 xen/arch/x86/mm/guest_walk.c     |  260 +++++++++++++++++++++++++++++
 xen/arch/x86/mm/shadow/multi.c   |  341 ++-------------------------------------
 xen/include/asm-x86/guest_pt.h   |   89 ++++++++++
 xen/include/asm-x86/perfc_defn.h |    2 
 5 files changed, 378 insertions(+), 320 deletions(-)

diff -r b87cc4de3ca6 -r 7fb33d15dc9b xen/arch/x86/mm/Makefile
--- a/xen/arch/x86/mm/Makefile  Thu Nov 13 13:01:22 2008 +0000
+++ b/xen/arch/x86/mm/Makefile  Thu Nov 13 13:02:08 2008 +0000
@@ -3,3 +3,9 @@ subdir-y += hap
 
 obj-y += paging.o
 obj-y += p2m.o
+obj-y += guest_walk_2.o
+obj-y += guest_walk_3.o
+obj-$(x86_64) += guest_walk_4.o
+
+guest_walk_%.o: guest_walk.c $(HDRS) Makefile
+       $(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=$* -c $< -o $@
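
The pattern rule above compiles guest_walk.c three times, once per guest paging width (2, 3 and 4 levels), with GUEST_PAGING_LEVELS supplied on the command line. Each object therefore needs its own symbol for the walker; the GPT_RENAME macros added to guest_pt.h further down take care of that. A minimal sketch of how the rename is expected to resolve (the expansion shown here is an illustration, not part of the patch):

    /* From guest_pt.h below: give each instantiation a distinct name. */
    #define GPT_RENAME2(_n, _l) _n ## _ ## _l ## _levels
    #define GPT_RENAME(_n, _l)  GPT_RENAME2(_n, _l)
    #define guest_walk_tables   GPT_RENAME(guest_walk_tables, GUEST_PAGING_LEVELS)

    /* Built with -DGUEST_PAGING_LEVELS=3 (guest_walk_3.o), a call such as
     *     guest_walk_tables(v, va, &gw, pfec, top_mfn, top_map);
     * resolves to the symbol guest_walk_tables_3_levels, so the 2-, 3- and
     * 4-level objects can be linked into the same hypervisor image. */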
diff -r b87cc4de3ca6 -r 7fb33d15dc9b xen/arch/x86/mm/guest_walk.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/guest_walk.c      Thu Nov 13 13:02:08 2008 +0000
@@ -0,0 +1,260 @@
+/******************************************************************************
+ * arch/x86/mm/guest_walk.c
+ *
+ * Pagetable walker for guest memory accesses.
+ *
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/paging.h>
+#include <xen/domain_page.h>
+#include <xen/sched.h>
+#include <asm/page.h>
+#include <asm/guest_pt.h>
+
+
+/* Flags that are needed in a pagetable entry, with the sense of NX inverted */
+static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec) 
+{
+    static uint32_t flags[] = {
+        /* I/F -  Usr Wr */
+        /* 0   0   0   0 */ _PAGE_PRESENT, 
+        /* 0   0   0   1 */ _PAGE_PRESENT|_PAGE_RW,
+        /* 0   0   1   0 */ _PAGE_PRESENT|_PAGE_USER,
+        /* 0   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
+        /* 0   1   0   0 */ _PAGE_PRESENT, 
+        /* 0   1   0   1 */ _PAGE_PRESENT|_PAGE_RW,
+        /* 0   1   1   0 */ _PAGE_PRESENT|_PAGE_USER,
+        /* 0   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
+        /* 1   0   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT, 
+        /* 1   0   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
+        /* 1   0   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
+        /* 1   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
+        /* 1   1   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT, 
+        /* 1   1   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
+        /* 1   1   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
+        /* 1   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
+    };
+
+    /* Don't demand not-NX if the CPU wouldn't enforce it. */
+    if ( !guest_supports_nx(v) )
+        pfec &= ~PFEC_insn_fetch;
+
+    /* Don't demand R/W if the CPU wouldn't enforce it. */
+    if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v)) 
+         && !(pfec & PFEC_user_mode) )
+        pfec &= ~PFEC_write_access;
+
+    return flags[(pfec & 0x1f) >> 1];
+}
+
+/* Modify a guest pagetable entry to set the Accessed and Dirty bits.
+ * Returns non-zero if it actually writes to guest memory. */
+static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
+{
+    guest_intpte_t old, new;
+
+    old = *(guest_intpte_t *)walk_p;
+    new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
+    if ( old != new ) 
+    {
+        /* Write the new entry into the walk, and try to write it back
+         * into the guest table as well.  If the guest table has changed
+         * under our feet then leave it alone. */
+        *(guest_intpte_t *)walk_p = new;
+        if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) 
+            return 1;
+    }
+    return 0;
+}
+
+
+/* Walk the guest pagetables, after the manner of a hardware walker. */
+uint32_t
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, 
+                  uint32_t pfec, mfn_t top_mfn, void *top_map)
+{
+    struct domain *d = v->domain;
+    p2m_type_t p2mt;
+    guest_l1e_t *l1p = NULL;
+    guest_l2e_t *l2p = NULL;
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    guest_l3e_t *l3p = NULL;
+    guest_l4e_t *l4p;
+#endif
+    uint32_t gflags, mflags, rc = 0;
+    int pse;
+
+    perfc_incr(guest_walk);
+    memset(gw, 0, sizeof(*gw));
+    gw->va = va;
+
+    /* Mandatory bits that must be set in every entry.  We invert NX, to
+     * calculate as if there were an "X" bit that allowed access. 
+     * We will accumulate, in rc, the set of flags that are missing. */
+    mflags = mandatory_flags(v, pfec);
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+
+    /* Get the l4e from the top level table and check its flags*/
+    gw->l4mfn = top_mfn;
+    l4p = (guest_l4e_t *) top_map;
+    gw->l4e = l4p[guest_l4_table_offset(va)];
+    gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
+    rc |= ((gflags & mflags) ^ mflags);
+    if ( rc & _PAGE_PRESENT ) goto out;
+
+    /* Map the l3 table */
+    gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
+    if ( !p2m_is_ram(p2mt) ) 
+    {
+        rc |= _PAGE_PRESENT;
+        goto out;
+    }
+    ASSERT(mfn_valid(mfn_x(gw->l3mfn)));
+
+    /* Get the l3e and check its flags*/
+    l3p = map_domain_page(mfn_x(gw->l3mfn));
+    gw->l3e = l3p[guest_l3_table_offset(va)];
+    gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
+    rc |= ((gflags & mflags) ^ mflags);
+    if ( rc & _PAGE_PRESENT )
+        goto out;
+
+#else /* PAE only... */
+
+    /* Get the l3e and check its flag */
+    gw->l3e = ((guest_l3e_t *) top_map)[guest_l3_table_offset(va)];
+    if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) 
+    {
+        rc |= _PAGE_PRESENT;
+        goto out;
+    }
+
+#endif /* PAE or 64... */
+
+    /* Map the l2 table */
+    gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
+    if ( !p2m_is_ram(p2mt) )
+    {
+        rc |= _PAGE_PRESENT;
+        goto out;
+    }
+    ASSERT(mfn_valid(mfn_x(gw->l2mfn)));
+
+    /* Get the l2e */
+    l2p = map_domain_page(mfn_x(gw->l2mfn));
+    gw->l2e = l2p[guest_l2_table_offset(va)];
+
+#else /* 32-bit only... */
+
+    /* Get l2e from the top level table */
+    gw->l2mfn = top_mfn;
+    l2p = (guest_l2e_t *) top_map;
+    gw->l2e = l2p[guest_l2_table_offset(va)];
+
+#endif /* All levels... */
+
+    gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
+    rc |= ((gflags & mflags) ^ mflags);
+    if ( rc & _PAGE_PRESENT )
+        goto out;
+
+    pse = (guest_supports_superpages(v) && 
+           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)); 
+
+    if ( pse )
+    {
+        /* Special case: this guest VA is in a PSE superpage, so there's
+         * no guest l1e.  We make one up so that the propagation code
+         * can generate a shadow l1 table.  Start with the gfn of the 
+         * first 4k-page of the superpage. */
+        gfn_t start = guest_l2e_get_gfn(gw->l2e);
+        /* Grant full access in the l1e, since all the guest entry's 
+         * access controls are enforced in the shadow l2e. */
+        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
+                     _PAGE_ACCESSED|_PAGE_DIRTY);
+        /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
+         * of the level 1. */
+        if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) ) 
+            flags |= _PAGE_PAT;
+        /* Copy the cache-control bits to the l1 as well, because we
+         * can't represent PAT in the (non-PSE) shadow l2e. :(
+         * This could cause problems if a guest ever maps an area of
+         * memory with superpages using more than one caching mode. */
+        flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
+        /* Increment the pfn by the right number of 4k pages.  
+         * The ~0x1 is to mask out the PAT bit mentioned above. */
+        start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
+        gw->l1e = guest_l1e_from_gfn(start, flags);
+        gw->l1mfn = _mfn(INVALID_MFN);
+    } 
+    else 
+    {
+        /* Not a superpage: carry on and find the l1e. */
+        gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
+        if ( !p2m_is_ram(p2mt) )
+        {
+            rc |= _PAGE_PRESENT;
+            goto out;
+        }
+        ASSERT(mfn_valid(mfn_x(gw->l1mfn)));
+        l1p = map_domain_page(mfn_x(gw->l1mfn));
+        gw->l1e = l1p[guest_l1_table_offset(va)];
+        gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
+        rc |= ((gflags & mflags) ^ mflags);
+    }
+
+    /* Go back and set accessed and dirty bits only if the walk was a
+     * success.  Although the PRMs say higher-level _PAGE_ACCESSED bits
+     * get set whenever a lower-level PT is used, at least some hardware
+     * walkers behave this way. */
+    if ( rc == 0 ) 
+    {
+#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
+        if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
+            paging_mark_dirty(d, mfn_x(gw->l4mfn));
+        if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
+            paging_mark_dirty(d, mfn_x(gw->l3mfn));
+#endif
+        if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
+                         (pse && (pfec & PFEC_write_access))) )
+            paging_mark_dirty(d, mfn_x(gw->l2mfn));            
+        if ( !pse ) 
+        {
+            if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, 
+                             (pfec & PFEC_write_access)) )
+                paging_mark_dirty(d, mfn_x(gw->l1mfn));
+        }
+    }
+
+ out:
+#if GUEST_PAGING_LEVELS == 4
+    if ( l3p ) unmap_domain_page(l3p);
+#endif
+#if GUEST_PAGING_LEVELS >= 3
+    if ( l2p ) unmap_domain_page(l2p);
+#endif
+    if ( l1p ) unmap_domain_page(l1p);
+
+    return rc;
+}
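
Two details of the new walker are worth spelling out: the page-fault error code indexes the mandatory_flags table directly, and rc accumulates whichever required bits each level of the walk is missing. A worked sketch, assuming the usual x86 PFEC bit layout (illustrative only, not code from the patch):

    /* A user-mode write fault has pfec == PFEC_user_mode | PFEC_write_access,
     * so the table index is (pfec & 0x1f) >> 1 == 3, and mandatory_flags()
     * returns _PAGE_PRESENT | _PAGE_RW | _PAGE_USER (the "0 0 1 1" row). */

    /* At each level the walker then records which of those bits the guest
     * entry is missing (with NX inverted so it behaves like an "X" bit): */
    gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
    rc    |= ((gflags & mflags) ^ mflags);
    /* e.g. a present, user-accessible but read-only l2e leaves _PAGE_RW set
     * in rc, which guest_walk_tables() eventually returns to the caller. */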
diff -r b87cc4de3ca6 -r 7fb33d15dc9b xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Thu Nov 13 13:01:22 2008 +0000
+++ b/xen/arch/x86/mm/shadow/multi.c    Thu Nov 13 13:02:08 2008 +0000
@@ -157,95 +157,23 @@ delete_shadow_status(struct vcpu *v, mfn
         put_page(mfn_to_page(gmfn));
 }
 
-/**************************************************************************/
-/* CPU feature support querying */
-
-static inline int
-guest_supports_superpages(struct vcpu *v)
-{
-    /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
-     * CR4.PSE is set or the guest is in PAE or long mode. 
-     * It's also used in the dummy PT for vcpus with CR4.PG cleared. */
-    return (is_hvm_vcpu(v) && 
-            (GUEST_PAGING_LEVELS != 2 
-             || !hvm_paging_enabled(v)
-             || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
-}
-
-static inline int
-guest_supports_nx(struct vcpu *v)
-{
-    if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
-        return 0;
-    if ( !is_hvm_vcpu(v) )
-        return cpu_has_nx;
-    return hvm_nx_enabled(v);
-}
-
 
 /**************************************************************************/
 /* Functions for walking the guest page tables */
 
-/* Flags that are needed in a pagetable entry, with the sense of NX inverted */
-static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec) 
-{
-    static uint32_t flags[] = {
-        /* I/F -  Usr Wr */
-        /* 0   0   0   0 */ _PAGE_PRESENT, 
-        /* 0   0   0   1 */ _PAGE_PRESENT|_PAGE_RW,
-        /* 0   0   1   0 */ _PAGE_PRESENT|_PAGE_USER,
-        /* 0   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
-        /* 0   1   0   0 */ _PAGE_PRESENT, 
-        /* 0   1   0   1 */ _PAGE_PRESENT|_PAGE_RW,
-        /* 0   1   1   0 */ _PAGE_PRESENT|_PAGE_USER,
-        /* 0   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
-        /* 1   0   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT, 
-        /* 1   0   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
-        /* 1   0   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
-        /* 1   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
-        /* 1   1   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT, 
-        /* 1   1   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
-        /* 1   1   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
-        /* 1   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
-    };
-
-    /* Don't demand not-NX if the CPU wouldn't enforce it. */
-    if ( !guest_supports_nx(v) )
-        pfec &= ~PFEC_insn_fetch;
-
-    /* Don't demand R/W if the CPU wouldn't enforce it. */
-    if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v)) 
-         && !(pfec & PFEC_user_mode) )
-        pfec &= ~PFEC_write_access;
-
-    return flags[(pfec & 0x1f) >> 1];
-}
-
-/* Modify a guest pagetable entry to set the Accessed and Dirty bits.
- * Returns non-zero if it actually writes to guest memory. */
-static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
-{
-    guest_intpte_t old, new;
-    int ret = 0;
-
-    old = *(guest_intpte_t *)walk_p;
-    new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
-    if ( old != new ) 
-    {
-        /* Write the new entry into the walk, and try to write it back
-         * into the guest table as well.  If the guest table has changed
-         * under out feet then leave it alone. */
-        *(guest_intpte_t *)walk_p = new;
-        if( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) 
-            ret = 1;
-
-        /* FIXME -- this code is longer than necessary */
-        if(set_dirty)
-            TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_AD);
-        else
-            TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_A);
-    }
-    return ret;
+static inline uint32_t
+sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw, 
+                     uint32_t pfec)
+{
+    return guest_walk_tables(v, va, gw, pfec, 
+#if GUEST_PAGING_LEVELS == 3 /* PAE */
+                             _mfn(INVALID_MFN),
+                             v->arch.paging.shadow.gl3e
+#else /* 32 or 64 */
+                             pagetable_get_mfn(v->arch.guest_table),
+                             v->arch.paging.shadow.guest_vtable
+#endif
+                             );
 }
 
 /* This validation is called with lock held, and after write permission
@@ -364,236 +292,6 @@ gw_remove_write_accesses(struct vcpu *v,
     return rc;
 }
 
-/* Walk the guest pagetables, after the manner of a hardware walker. 
- *
- * Inputs: a vcpu, a virtual address, a walk_t to fill, a 
- *         pointer to a pagefault code
- * 
- * We walk the vcpu's guest pagetables, filling the walk_t with what we
- * see and adding any Accessed and Dirty bits that are needed in the
- * guest entries.  Using the pagefault code, we check the permissions as
- * we go.  For the purposes of reading pagetables we treat all non-RAM
- * memory as contining zeroes.
- * 
- * The walk is done in a lock-free style, with some sanity check postponed
- * after grabbing shadow lock later. Those delayed checks will make sure
- * no inconsistent mapping being translated into shadow page table.
- * 
- * Returns 0 for success, or the set of permission bits that we failed on 
- * if the walk did not complete.
- * N.B. This is different from the old return code but almost no callers
- * checked the old return code anyway.
- */
-static uint32_t
-guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec)
-{
-    struct domain *d = v->domain;
-    p2m_type_t p2mt;
-    guest_l1e_t *l1p = NULL;
-    guest_l2e_t *l2p = NULL;
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-    guest_l3e_t *l3p = NULL;
-    guest_l4e_t *l4p;
-#endif
-    uint32_t gflags, mflags, rc = 0;
-    int pse;
-
-    perfc_incr(shadow_guest_walk);
-    memset(gw, 0, sizeof(*gw));
-    gw->va = va;
-
-    /* Mandatory bits that must be set in every entry.  We invert NX, to
-     * calculate as if there were an "X" bit that allowed access. 
-     * We will accumulate, in rc, the set of flags that are missing. */
-    mflags = mandatory_flags(v, pfec);
-
-#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-
-    /* Get the l4e from the top level table and check its flags*/
-    gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
-    l4p = ((guest_l4e_t *)v->arch.paging.shadow.guest_vtable);
-    gw->l4e = l4p[guest_l4_table_offset(va)];
-    gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
-    rc |= ((gflags & mflags) ^ mflags);
-    if ( rc & _PAGE_PRESENT ) goto out;
-
-    /* Map the l3 table */
-    gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
-    if ( !p2m_is_ram(p2mt) ) 
-    {
-        rc |= _PAGE_PRESENT;
-        goto out;
-    }
-    ASSERT(mfn_valid(gw->l3mfn));
-
-    /* Get the l3e and check its flags*/
-    l3p = sh_map_domain_page(gw->l3mfn);
-    gw->l3e = l3p[guest_l3_table_offset(va)];
-    gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
-    rc |= ((gflags & mflags) ^ mflags);
-    if ( rc & _PAGE_PRESENT )
-        goto out;
-
-#else /* PAE only... */
-
-    /* Get l3e from the cache of the top level table and check its flag */
-    gw->l3e = v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
-    if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) 
-    {
-        rc |= _PAGE_PRESENT;
-        goto out;
-    }
-
-#endif /* PAE or 64... */
-
-    /* Map the l2 table */
-    gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
-    if ( !p2m_is_ram(p2mt) )
-    {
-        rc |= _PAGE_PRESENT;
-        goto out;
-    }
-    ASSERT(mfn_valid(gw->l2mfn));
-
-    /* Get the l2e */
-    l2p = sh_map_domain_page(gw->l2mfn);
-    gw->l2e = l2p[guest_l2_table_offset(va)];
-
-#else /* 32-bit only... */
-
-    /* Get l2e from the top level table */
-    gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
-    l2p = ((guest_l2e_t *)v->arch.paging.shadow.guest_vtable);
-    gw->l2e = l2p[guest_l2_table_offset(va)];
-
-#endif /* All levels... */
-
-    gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
-    rc |= ((gflags & mflags) ^ mflags);
-    if ( rc & _PAGE_PRESENT )
-        goto out;
-
-    pse = (guest_supports_superpages(v) && 
-           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)); 
-
-    if ( pse )
-    {
-        /* Special case: this guest VA is in a PSE superpage, so there's
-         * no guest l1e.  We make one up so that the propagation code
-         * can generate a shadow l1 table.  Start with the gfn of the 
-         * first 4k-page of the superpage. */
-        gfn_t start = guest_l2e_get_gfn(gw->l2e);
-        /* Grant full access in the l1e, since all the guest entry's 
-         * access controls are enforced in the shadow l2e. */
-        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
-                     _PAGE_ACCESSED|_PAGE_DIRTY);
-        /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
-         * of the level 1. */
-        if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) ) 
-            flags |= _PAGE_PAT;
-        /* Copy the cache-control bits to the l1 as well, because we
-         * can't represent PAT in the (non-PSE) shadow l2e. :(
-         * This could cause problems if a guest ever maps an area of
-         * memory with superpages using more than one caching mode. */
-        flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
-        /* Increment the pfn by the right number of 4k pages.  
-         * The ~0x1 is to mask out the PAT bit mentioned above. */
-        start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
-        gw->l1e = guest_l1e_from_gfn(start, flags);
-        gw->l1mfn = _mfn(INVALID_MFN);
-    } 
-    else 
-    {
-        /* Not a superpage: carry on and find the l1e. */
-        gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
-        if ( !p2m_is_ram(p2mt) )
-        {
-            rc |= _PAGE_PRESENT;
-            goto out;
-        }
-        ASSERT(mfn_valid(gw->l1mfn));
-        l1p = sh_map_domain_page(gw->l1mfn);
-        gw->l1e = l1p[guest_l1_table_offset(va)];
-        gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
-        rc |= ((gflags & mflags) ^ mflags);
-    }
-
-    /* Go back and set accessed and dirty bits only if the walk was a
-     * success.  Although the PRMs say higher-level _PAGE_ACCESSED bits
-     * get set whenever a lower-level PT is used, at least some hardware
-     * walkers behave this way. */
-    if ( rc == 0 ) 
-    {
-#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
-        if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
-            paging_mark_dirty(d, mfn_x(gw->l4mfn));
-        if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
-            paging_mark_dirty(d, mfn_x(gw->l3mfn));
-#endif
-        if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
-                         (pse && (pfec & PFEC_write_access))) )
-            paging_mark_dirty(d, mfn_x(gw->l2mfn));            
-        if ( !pse ) 
-        {
-            if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, 
-                             (pfec & PFEC_write_access)) )
-                paging_mark_dirty(d, mfn_x(gw->l1mfn));
-        }
-    }
-
- out:
-#if GUEST_PAGING_LEVELS == 4
-    if ( l3p ) sh_unmap_domain_page(l3p);
-#endif
-#if GUEST_PAGING_LEVELS >= 3
-    if ( l2p ) sh_unmap_domain_page(l2p);
-#endif
-    if ( l1p ) sh_unmap_domain_page(l1p);
-
-    return rc;
-}
-
-/* Given a walk_t, translate the gw->va into the guest's notion of the
- * corresponding frame number. */
-static inline gfn_t
-guest_walk_to_gfn(walk_t *gw)
-{
-    if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
-        return _gfn(INVALID_GFN);
-    return guest_l1e_get_gfn(gw->l1e);
-}
-
-/* Given a walk_t, translate the gw->va into the guest's notion of the
- * corresponding physical address. */
-static inline paddr_t
-guest_walk_to_gpa(walk_t *gw)
-{
-    if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
-        return 0;
-    return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
-}
-
-#if 0 /* Keep for debugging */
-/* Pretty-print the contents of a guest-walk */
-static inline void print_gw(walk_t *gw)
-{
-    SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
-#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-    SHADOW_PRINTK("   l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
-    SHADOW_PRINTK("   l4e=%" SH_PRI_gpte "\n", gw->l4e.l4);
-    SHADOW_PRINTK("   l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
-#endif /* PAE or 64... */
-    SHADOW_PRINTK("   l3e=%" SH_PRI_gpte "\n", gw->l3e.l3);
-#endif /* All levels... */
-    SHADOW_PRINTK("   l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
-    SHADOW_PRINTK("   l2e=%" SH_PRI_gpte "\n", gw->l2e.l2);
-    SHADOW_PRINTK("   l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
-    SHADOW_PRINTK("   l1e=%" SH_PRI_gpte "\n", gw->l1e.l1);
-}
-#endif /* 0 */
-
 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
 /* Lightweight audit: pass all the shadows associated with this guest walk
  * through the audit mechanisms */
@@ -654,7 +352,7 @@ sh_guest_map_l1e(struct vcpu *v, unsigne
     // XXX -- this is expensive, but it's easy to cobble together...
     // FIXME!
 
-    if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0 
+    if ( sh_walk_guest_tables(v, addr, &gw, PFEC_page_present) == 0 
          && mfn_valid(gw.l1mfn) )
     {
         if ( gl1mfn )
@@ -676,7 +374,7 @@ sh_guest_get_eff_l1e(struct vcpu *v, uns
     // XXX -- this is expensive, but it's easy to cobble together...
     // FIXME!
 
-    (void) guest_walk_tables(v, addr, &gw, PFEC_page_present);
+    (void) sh_walk_guest_tables(v, addr, &gw, PFEC_page_present);
     *(guest_l1e_t *)eff_l1e = gw.l1e;
 }
 #endif /* CONFIG == GUEST (== SHADOW) */
@@ -3314,9 +3012,14 @@ static int sh_page_fault(struct vcpu *v,
     }
 
  rewalk:
+
+    /* The walk is done in a lock-free style, with some sanity checks
+     * postponed until after the shadow lock is taken.  Those delayed
+     * checks make sure that no inconsistent mapping is translated
+     * into the shadow page tables. */
     version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
     rmb();
-    rc = guest_walk_tables(v, va, &gw, regs->error_code);
+    rc = sh_walk_guest_tables(v, va, &gw, regs->error_code);
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
     regs->error_code &= ~PFEC_page_present;
@@ -3869,7 +3572,7 @@ sh_gva_to_gfn(struct vcpu *v, unsigned l
         return vtlb_gfn;
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
 
-    if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 )
+    if ( sh_walk_guest_tables(v, va, &gw, pfec[0]) != 0 )
     {
         if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
             pfec[0] &= ~PFEC_page_present;
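 
The comment added to sh_page_fault() spells out why the walk can run without the shadow lock: the result is re-validated later. A rough sketch of the pattern around the rewalk label; the re-check step is paraphrased from existing shadow code (cf. the shadow_check_gwalk perf counter below), not from this patch:

    /* Sketch only: version-checked lock-free walk in sh_page_fault(). */
 rewalk:
    version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
    rmb();                     /* read the version before walking */
    rc = sh_walk_guest_tables(v, va, &gw, regs->error_code);

    /* ... later, with the shadow lock held, the walk is re-validated
     * against the guest tables and the version read above; if a guest
     * pagetable changed in the meantime, control jumps back to rewalk
     * rather than shadowing a stale mapping. */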
diff -r b87cc4de3ca6 -r 7fb33d15dc9b xen/include/asm-x86/guest_pt.h
--- a/xen/include/asm-x86/guest_pt.h    Thu Nov 13 13:01:22 2008 +0000
+++ b/xen/include/asm-x86/guest_pt.h    Thu Nov 13 13:02:08 2008 +0000
@@ -174,6 +174,32 @@ static inline guest_l4e_t guest_l4e_from
 #endif /* GUEST_PAGING_LEVELS != 2 */
 
 
+/* Which pagetable features are supported on this vcpu? */
+
+static inline int
+guest_supports_superpages(struct vcpu *v)
+{
+    /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
+     * CR4.PSE is set or the guest is in PAE or long mode. 
+     * It's also used in the dummy PT for vcpus with CR4.PG cleared. */
+    return (is_hvm_vcpu(v) && 
+            (GUEST_PAGING_LEVELS != 2 
+             || !hvm_paging_enabled(v)
+             || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
+}
+
+static inline int
+guest_supports_nx(struct vcpu *v)
+{
+    if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
+        return 0;
+    if ( !is_hvm_vcpu(v) )
+        return cpu_has_nx;
+    return hvm_nx_enabled(v);
+}
+
+
+
 /* Type used for recording a walk through guest pagetables.  It is
  * filled in by the pagetable walk function, and also used as a cache
  * for later walks.  When we encounter a superpage l2e, we fabricate an
@@ -199,4 +225,67 @@ struct guest_pagetable_walk
     mfn_t l1mfn;                /* MFN that the level 1 entry was in */
 };
 
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding frame number. */
+static inline gfn_t
+guest_walk_to_gfn(walk_t *gw)
+{
+    if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
+        return _gfn(INVALID_GFN);
+    return guest_l1e_get_gfn(gw->l1e);
+}
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding physical address. */
+static inline paddr_t
+guest_walk_to_gpa(walk_t *gw)
+{
+    if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
+        return 0;
+    return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
+}
+
+/* Walk the guest pagetables, after the manner of a hardware walker. 
+ *
+ * Inputs: a vcpu, a virtual address, a walk_t to fill, a 
+ *         pointer to a pagefault code, the MFN of the guest's 
+ *         top-level pagetable, and a mapping of the 
+ *         guest's top-level pagetable.
+ * 
+ * We walk the vcpu's guest pagetables, filling the walk_t with what we
+ * see and adding any Accessed and Dirty bits that are needed in the
+ * guest entries.  Using the pagefault code, we check the permissions as
+ * we go.  For the purposes of reading pagetables we treat all non-RAM
+ * memory as containing zeroes.
+ * 
+ * Returns 0 for success, or the set of permission bits that we failed on 
+ * if the walk did not complete. */
+
+/* Macro-fu so you can call guest_walk_tables() and get the right one. */
+#define GPT_RENAME2(_n, _l) _n ## _ ## _l ## _levels
+#define GPT_RENAME(_n, _l) GPT_RENAME2(_n, _l)
+#define guest_walk_tables GPT_RENAME(guest_walk_tables, GUEST_PAGING_LEVELS)
+
+extern uint32_t 
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, 
+                  uint32_t pfec, mfn_t top_mfn, void *top_map);
+
+/* Pretty-print the contents of a guest-walk */
+static inline void print_gw(walk_t *gw)
+{
+    gdprintk(XENLOG_INFO, "GUEST WALK TO %#lx:\n", gw->va);
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+    gdprintk(XENLOG_INFO, "   l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
+    gdprintk(XENLOG_INFO, "   l4e=%" PRI_gpte "\n", gw->l4e.l4);
+    gdprintk(XENLOG_INFO, "   l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
+#endif /* PAE or 64... */
+    gdprintk(XENLOG_INFO, "   l3e=%" PRI_gpte "\n", gw->l3e.l3);
+#endif /* All levels... */
+    gdprintk(XENLOG_INFO, "   l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
+    gdprintk(XENLOG_INFO, "   l2e=%" PRI_gpte "\n", gw->l2e.l2);
+    gdprintk(XENLOG_INFO, "   l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
+    gdprintk(XENLOG_INFO, "   l1e=%" PRI_gpte "\n", gw->l1e.l1);
+}
+
 #endif /* _XEN_ASM_GUEST_PT_H */
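
With the walker exported from guest_pt.h, code outside the shadow implementation can translate a guest virtual address itself. A minimal sketch of a caller, assuming the caller already has the guest's top-level table mapped (top_map is a placeholder for that mapping; error handling is elided):

    /* Hypothetical caller, based on the declarations added above. */
    walk_t   gw;
    uint32_t missing;

    missing = guest_walk_tables(v, va, &gw, PFEC_page_present,
                                pagetable_get_mfn(v->arch.guest_table),
                                top_map /* mapping of that top-level table */);
    if ( missing == 0 )
    {
        gfn_t   gfn = guest_walk_to_gfn(&gw);   /* guest frame backing va */
        paddr_t gpa = guest_walk_to_gpa(&gw);   /* full guest physical address */
        /* ... use gfn / gpa ... */
    }
    else
    {
        /* 'missing' is the set of required permission bits the walk failed
         * on, e.g. _PAGE_PRESENT if some level was not present. */
    }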
diff -r b87cc4de3ca6 -r 7fb33d15dc9b xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h  Thu Nov 13 13:01:22 2008 +0000
+++ b/xen/include/asm-x86/perfc_defn.h  Thu Nov 13 13:02:08 2008 +0000
@@ -33,6 +33,7 @@ PERFCOUNTER(ptwr_emulations,        "wri
 
 PERFCOUNTER(exception_fixed,        "pre-exception fixed")
 
+PERFCOUNTER(guest_walk,            "guest pagetable walks")
 
 /* Shadow counters */
 PERFCOUNTER(shadow_alloc,          "calls to shadow_alloc")
@@ -92,7 +93,6 @@ PERFCOUNTER(shadow_up_pointer,     "shad
 PERFCOUNTER(shadow_up_pointer,     "shadow unshadow by up-pointer")
 PERFCOUNTER(shadow_unshadow_bf,    "shadow unshadow brute-force")
 PERFCOUNTER(shadow_get_page_fail,  "shadow_get_page_from_l1e failed")
-PERFCOUNTER(shadow_guest_walk,     "shadow walks guest tables")
 PERFCOUNTER(shadow_check_gwalk,    "shadow checks gwalk")
 PERFCOUNTER(shadow_inconsistent_gwalk, "shadow check inconsistent gwalk")
 PERFCOUNTER(shadow_rm_write_flush_tlb,

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
