# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1226581328 0
# Node ID 7fb33d15dc9bc5892e4708011beded66dd756be3
# Parent b87cc4de3ca676e895f6374daed1d33a79849b9d
x86: Move the guest pagetable walker out of shadow/multi.c
Move the guest PT walker into its own file, and purge it of references
to the rest of the shadow code.
Signed-off-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
---
xen/arch/x86/mm/Makefile | 6
xen/arch/x86/mm/guest_walk.c | 260 +++++++++++++++++++++++++++++
xen/arch/x86/mm/shadow/multi.c | 341 ++-------------------------------------
xen/include/asm-x86/guest_pt.h | 89 ++++++++++
xen/include/asm-x86/perfc_defn.h | 2
5 files changed, 378 insertions(+), 320 deletions(-)
diff -r b87cc4de3ca6 -r 7fb33d15dc9b xen/arch/x86/mm/Makefile
--- a/xen/arch/x86/mm/Makefile Thu Nov 13 13:01:22 2008 +0000
+++ b/xen/arch/x86/mm/Makefile Thu Nov 13 13:02:08 2008 +0000
@@ -3,3 +3,9 @@ subdir-y += hap
obj-y += paging.o
obj-y += p2m.o
+obj-y += guest_walk_2.o
+obj-y += guest_walk_3.o
+obj-$(x86_64) += guest_walk_4.o
+
+guest_walk_%.o: guest_walk.c $(HDRS) Makefile
+ $(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=$* -c $< -o $@
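The rule above compiles guest_walk.c three times, once per guest paging depth, by passing -DGUEST_PAGING_LEVELS=2, 3 or 4. Combined with the GPT_RENAME macro added to guest_pt.h later in this patch, each object exports a differently named walker (guest_walk_tables_2_levels and so on), so all three can be linked into the hypervisor without symbol clashes. As a standalone illustration of that token-pasting trick -- not code from this patch, with made-up names LEVELS and walk -- consider:

    /* sketch.c: compile with e.g. gcc -DLEVELS=4 sketch.c */
    #include <stdio.h>

    #ifndef LEVELS
    #define LEVELS 3                    /* default for the sketch only */
    #endif

    /* Same pattern as GPT_RENAME in guest_pt.h: paste the level into
     * the function name, so several compilations of one source file
     * yield distinct symbols. */
    #define RENAME2(n, l) n ## _ ## l ## _levels
    #define RENAME(n, l)  RENAME2(n, l)
    #define walk RENAME(walk, LEVELS)

    int walk(unsigned long va)
    {
        printf("walk_%d_levels(%#lx)\n", LEVELS, va);
        return 0;
    }

    int main(void)
    {
        return walk(0x1000UL);
    }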
diff -r b87cc4de3ca6 -r 7fb33d15dc9b xen/arch/x86/mm/guest_walk.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/guest_walk.c Thu Nov 13 13:02:08 2008 +0000
@@ -0,0 +1,260 @@
+/******************************************************************************
+ * arch/x86/mm/guest_walk.c
+ *
+ * Pagetable walker for guest memory accesses.
+ *
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/paging.h>
+#include <xen/domain_page.h>
+#include <xen/sched.h>
+#include <asm/page.h>
+#include <asm/guest_pt.h>
+
+
+/* Flags that are needed in a pagetable entry, with the sense of NX inverted */
+static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec)
+{
+ static uint32_t flags[] = {
+ /* I/F - Usr Wr */
+ /* 0 0 0 0 */ _PAGE_PRESENT,
+ /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW,
+ /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER,
+ /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
+ /* 0 1 0 0 */ _PAGE_PRESENT,
+ /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW,
+ /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER,
+ /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
+ /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
+ /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
+ /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
+ /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
+ /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
+ /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
+ /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
+ /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
+ };
+
+ /* Don't demand not-NX if the CPU wouldn't enforce it. */
+ if ( !guest_supports_nx(v) )
+ pfec &= ~PFEC_insn_fetch;
+
+ /* Don't demand R/W if the CPU wouldn't enforce it. */
+ if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v))
+ && !(pfec & PFEC_user_mode) )
+ pfec &= ~PFEC_write_access;
+
+ return flags[(pfec & 0x1f) >> 1];
+}
+
+/* Modify a guest pagetable entry to set the Accessed and Dirty bits.
+ * Returns non-zero if it actually writes to guest memory. */
+static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
+{
+ guest_intpte_t old, new;
+
+ old = *(guest_intpte_t *)walk_p;
+ new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
+ if ( old != new )
+ {
+ /* Write the new entry into the walk, and try to write it back
+ * into the guest table as well. If the guest table has changed
+ * under our feet then leave it alone. */
+ *(guest_intpte_t *)walk_p = new;
+ if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old )
+ return 1;
+ }
+ return 0;
+}
+
+
+/* Walk the guest pagetables, after the manner of a hardware walker. */
+uint32_t
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw,
+ uint32_t pfec, mfn_t top_mfn, void *top_map)
+{
+ struct domain *d = v->domain;
+ p2m_type_t p2mt;
+ guest_l1e_t *l1p = NULL;
+ guest_l2e_t *l2p = NULL;
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+ guest_l3e_t *l3p = NULL;
+ guest_l4e_t *l4p;
+#endif
+ uint32_t gflags, mflags, rc = 0;
+ int pse;
+
+ perfc_incr(guest_walk);
+ memset(gw, 0, sizeof(*gw));
+ gw->va = va;
+
+ /* Mandatory bits that must be set in every entry. We invert NX, to
+ * calculate as if there were an "X" bit that allowed access.
+ * We will accumulate, in rc, the set of flags that are missing. */
+ mflags = mandatory_flags(v, pfec);
+
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+
+ /* Get the l4e from the top level table and check its flags*/
+ gw->l4mfn = top_mfn;
+ l4p = (guest_l4e_t *) top_map;
+ gw->l4e = l4p[guest_l4_table_offset(va)];
+ gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
+ rc |= ((gflags & mflags) ^ mflags);
+ if ( rc & _PAGE_PRESENT ) goto out;
+
+ /* Map the l3 table */
+ gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
+ if ( !p2m_is_ram(p2mt) )
+ {
+ rc |= _PAGE_PRESENT;
+ goto out;
+ }
+ ASSERT(mfn_valid(mfn_x(gw->l3mfn)));
+
+ /* Get the l3e and check its flags*/
+ l3p = map_domain_page(mfn_x(gw->l3mfn));
+ gw->l3e = l3p[guest_l3_table_offset(va)];
+ gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
+ rc |= ((gflags & mflags) ^ mflags);
+ if ( rc & _PAGE_PRESENT )
+ goto out;
+
+#else /* PAE only... */
+
+ /* Get the l3e and check its flag */
+ gw->l3e = ((guest_l3e_t *) top_map)[guest_l3_table_offset(va)];
+ if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) )
+ {
+ rc |= _PAGE_PRESENT;
+ goto out;
+ }
+
+#endif /* PAE or 64... */
+
+ /* Map the l2 table */
+ gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
+ if ( !p2m_is_ram(p2mt) )
+ {
+ rc |= _PAGE_PRESENT;
+ goto out;
+ }
+ ASSERT(mfn_valid(mfn_x(gw->l2mfn)));
+
+ /* Get the l2e */
+ l2p = map_domain_page(mfn_x(gw->l2mfn));
+ gw->l2e = l2p[guest_l2_table_offset(va)];
+
+#else /* 32-bit only... */
+
+ /* Get l2e from the top level table */
+ gw->l2mfn = top_mfn;
+ l2p = (guest_l2e_t *) top_map;
+ gw->l2e = l2p[guest_l2_table_offset(va)];
+
+#endif /* All levels... */
+
+ gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
+ rc |= ((gflags & mflags) ^ mflags);
+ if ( rc & _PAGE_PRESENT )
+ goto out;
+
+ pse = (guest_supports_superpages(v) &&
+ (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE));
+
+ if ( pse )
+ {
+ /* Special case: this guest VA is in a PSE superpage, so there's
+ * no guest l1e. We make one up so that the propagation code
+ * can generate a shadow l1 table. Start with the gfn of the
+ * first 4k-page of the superpage. */
+ gfn_t start = guest_l2e_get_gfn(gw->l2e);
+ /* Grant full access in the l1e, since all the guest entry's
+ * access controls are enforced in the shadow l2e. */
+ int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
+ _PAGE_ACCESSED|_PAGE_DIRTY);
+ /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
+ * of the level 1. */
+ if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) )
+ flags |= _PAGE_PAT;
+ /* Copy the cache-control bits to the l1 as well, because we
+ * can't represent PAT in the (non-PSE) shadow l2e. :(
+ * This could cause problems if a guest ever maps an area of
+ * memory with superpages using more than one caching mode. */
+ flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
+ /* Increment the pfn by the right number of 4k pages.
+ * The ~0x1 is to mask out the PAT bit mentioned above. */
+ start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
+ gw->l1e = guest_l1e_from_gfn(start, flags);
+ gw->l1mfn = _mfn(INVALID_MFN);
+ }
+ else
+ {
+ /* Not a superpage: carry on and find the l1e. */
+ gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
+ if ( !p2m_is_ram(p2mt) )
+ {
+ rc |= _PAGE_PRESENT;
+ goto out;
+ }
+ ASSERT(mfn_valid(mfn_x(gw->l1mfn)));
+ l1p = map_domain_page(mfn_x(gw->l1mfn));
+ gw->l1e = l1p[guest_l1_table_offset(va)];
+ gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
+ rc |= ((gflags & mflags) ^ mflags);
+ }
+
+ /* Go back and set accessed and dirty bits only if the walk was a
+ * success. Although the PRMs say higher-level _PAGE_ACCESSED bits
+ * get set whenever a lower-level PT is used, at least some hardware
+ * walkers behave this way. */
+ if ( rc == 0 )
+ {
+#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
+ if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
+ paging_mark_dirty(d, mfn_x(gw->l4mfn));
+ if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
+ paging_mark_dirty(d, mfn_x(gw->l3mfn));
+#endif
+ if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
+ (pse && (pfec & PFEC_write_access))) )
+ paging_mark_dirty(d, mfn_x(gw->l2mfn));
+ if ( !pse )
+ {
+ if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e,
+ (pfec & PFEC_write_access)) )
+ paging_mark_dirty(d, mfn_x(gw->l1mfn));
+ }
+ }
+
+ out:
+#if GUEST_PAGING_LEVELS == 4
+ if ( l3p ) unmap_domain_page(l3p);
+#endif
+#if GUEST_PAGING_LEVELS >= 3
+ if ( l2p ) unmap_domain_page(l2p);
+#endif
+ if ( l1p ) unmap_domain_page(l1p);
+
+ return rc;
+}
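One detail of the new file worth spelling out is how mandatory_flags() turns a pagefault error code into an index into its 16-entry table. Assuming the usual x86 fault-code bit layout (present=1, write=2, user=4, reserved=8, instruction fetch=16), masking with 0x1f and shifting right by one drops the present bit and leaves a 4-bit index of (fetch, reserved, user, write), matching the "I/F - Usr Wr" column comments. A small worked example, not part of the patch:

    /* Sketch only: reproduce the index computation for two faults. */
    #include <stdio.h>

    #define PFEC_write_access 0x02      /* assumed x86 error-code values */
    #define PFEC_user_mode    0x04
    #define PFEC_insn_fetch   0x10

    int main(void)
    {
        unsigned int user_write = PFEC_user_mode | PFEC_write_access;
        unsigned int kern_fetch = PFEC_insn_fetch;

        /* 0x6 -> index 3: _PAGE_PRESENT|_PAGE_RW|_PAGE_USER required */
        printf("user write   -> index %u\n", (user_write & 0x1f) >> 1);
        /* 0x10 -> index 8: _PAGE_PRESENT|_PAGE_NX_BIT ("X") required */
        printf("kernel fetch -> index %u\n", (kern_fetch & 0x1f) >> 1);
        return 0;
    }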
diff -r b87cc4de3ca6 -r 7fb33d15dc9b xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c Thu Nov 13 13:01:22 2008 +0000
+++ b/xen/arch/x86/mm/shadow/multi.c Thu Nov 13 13:02:08 2008 +0000
@@ -157,95 +157,23 @@ delete_shadow_status(struct vcpu *v, mfn
put_page(mfn_to_page(gmfn));
}
-/**************************************************************************/
-/* CPU feature support querying */
-
-static inline int
-guest_supports_superpages(struct vcpu *v)
-{
- /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
- * CR4.PSE is set or the guest is in PAE or long mode.
- * It's also used in the dummy PT for vcpus with CR4.PG cleared. */
- return (is_hvm_vcpu(v) &&
- (GUEST_PAGING_LEVELS != 2
- || !hvm_paging_enabled(v)
- || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
-}
-
-static inline int
-guest_supports_nx(struct vcpu *v)
-{
- if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
- return 0;
- if ( !is_hvm_vcpu(v) )
- return cpu_has_nx;
- return hvm_nx_enabled(v);
-}
-
/**************************************************************************/
/* Functions for walking the guest page tables */
-/* Flags that are needed in a pagetable entry, with the sense of NX inverted */
-static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec)
-{
- static uint32_t flags[] = {
- /* I/F - Usr Wr */
- /* 0 0 0 0 */ _PAGE_PRESENT,
- /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW,
- /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER,
- /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
- /* 0 1 0 0 */ _PAGE_PRESENT,
- /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW,
- /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER,
- /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
- /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
- /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
- /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
- /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
- /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
- /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
- /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
- /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
- };
-
- /* Don't demand not-NX if the CPU wouldn't enforce it. */
- if ( !guest_supports_nx(v) )
- pfec &= ~PFEC_insn_fetch;
-
- /* Don't demand R/W if the CPU wouldn't enforce it. */
- if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v))
- && !(pfec & PFEC_user_mode) )
- pfec &= ~PFEC_write_access;
-
- return flags[(pfec & 0x1f) >> 1];
-}
-
-/* Modify a guest pagetable entry to set the Accessed and Dirty bits.
- * Returns non-zero if it actually writes to guest memory. */
-static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
-{
- guest_intpte_t old, new;
- int ret = 0;
-
- old = *(guest_intpte_t *)walk_p;
- new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
- if ( old != new )
- {
- /* Write the new entry into the walk, and try to write it back
- * into the guest table as well. If the guest table has changed
- * under out feet then leave it alone. */
- *(guest_intpte_t *)walk_p = new;
- if( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old )
- ret = 1;
-
- /* FIXME -- this code is longer than necessary */
- if(set_dirty)
- TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_AD);
- else
- TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_A);
- }
- return ret;
+static inline uint32_t
+sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
+ uint32_t pfec)
+{
+ return guest_walk_tables(v, va, gw, pfec,
+#if GUEST_PAGING_LEVELS == 3 /* PAE */
+ _mfn(INVALID_MFN),
+ v->arch.paging.shadow.gl3e
+#else /* 32 or 64 */
+ pagetable_get_mfn(v->arch.guest_table),
+ v->arch.paging.shadow.guest_vtable
+#endif
+ );
}
/* This validation is called with lock held, and after write permission
@@ -364,236 +292,6 @@ gw_remove_write_accesses(struct vcpu *v,
return rc;
}
-/* Walk the guest pagetables, after the manner of a hardware walker.
- *
- * Inputs: a vcpu, a virtual address, a walk_t to fill, a
- * pointer to a pagefault code
- *
- * We walk the vcpu's guest pagetables, filling the walk_t with what we
- * see and adding any Accessed and Dirty bits that are needed in the
- * guest entries. Using the pagefault code, we check the permissions as
- * we go. For the purposes of reading pagetables we treat all non-RAM
- * memory as contining zeroes.
- *
- * The walk is done in a lock-free style, with some sanity check postponed
- * after grabbing shadow lock later. Those delayed checks will make sure
- * no inconsistent mapping being translated into shadow page table.
- *
- * Returns 0 for success, or the set of permission bits that we failed on
- * if the walk did not complete.
- * N.B. This is different from the old return code but almost no callers
- * checked the old return code anyway.
- */
-static uint32_t
-guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec)
-{
- struct domain *d = v->domain;
- p2m_type_t p2mt;
- guest_l1e_t *l1p = NULL;
- guest_l2e_t *l2p = NULL;
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
- guest_l3e_t *l3p = NULL;
- guest_l4e_t *l4p;
-#endif
- uint32_t gflags, mflags, rc = 0;
- int pse;
-
- perfc_incr(shadow_guest_walk);
- memset(gw, 0, sizeof(*gw));
- gw->va = va;
-
- /* Mandatory bits that must be set in every entry. We invert NX, to
- * calculate as if there were an "X" bit that allowed access.
- * We will accumulate, in rc, the set of flags that are missing. */
- mflags = mandatory_flags(v, pfec);
-
-#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-
- /* Get the l4e from the top level table and check its flags*/
- gw->l4mfn = pagetable_get_mfn(v->arch.guest_table);
- l4p = ((guest_l4e_t *)v->arch.paging.shadow.guest_vtable);
- gw->l4e = l4p[guest_l4_table_offset(va)];
- gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT;
- rc |= ((gflags & mflags) ^ mflags);
- if ( rc & _PAGE_PRESENT ) goto out;
-
- /* Map the l3 table */
- gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt);
- if ( !p2m_is_ram(p2mt) )
- {
- rc |= _PAGE_PRESENT;
- goto out;
- }
- ASSERT(mfn_valid(gw->l3mfn));
-
- /* Get the l3e and check its flags*/
- l3p = sh_map_domain_page(gw->l3mfn);
- gw->l3e = l3p[guest_l3_table_offset(va)];
- gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT;
- rc |= ((gflags & mflags) ^ mflags);
- if ( rc & _PAGE_PRESENT )
- goto out;
-
-#else /* PAE only... */
-
- /* Get l3e from the cache of the top level table and check its flag */
- gw->l3e = v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)];
- if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) )
- {
- rc |= _PAGE_PRESENT;
- goto out;
- }
-
-#endif /* PAE or 64... */
-
- /* Map the l2 table */
- gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt);
- if ( !p2m_is_ram(p2mt) )
- {
- rc |= _PAGE_PRESENT;
- goto out;
- }
- ASSERT(mfn_valid(gw->l2mfn));
-
- /* Get the l2e */
- l2p = sh_map_domain_page(gw->l2mfn);
- gw->l2e = l2p[guest_l2_table_offset(va)];
-
-#else /* 32-bit only... */
-
- /* Get l2e from the top level table */
- gw->l2mfn = pagetable_get_mfn(v->arch.guest_table);
- l2p = ((guest_l2e_t *)v->arch.paging.shadow.guest_vtable);
- gw->l2e = l2p[guest_l2_table_offset(va)];
-
-#endif /* All levels... */
-
- gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT;
- rc |= ((gflags & mflags) ^ mflags);
- if ( rc & _PAGE_PRESENT )
- goto out;
-
- pse = (guest_supports_superpages(v) &&
- (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE));
-
- if ( pse )
- {
- /* Special case: this guest VA is in a PSE superpage, so there's
- * no guest l1e. We make one up so that the propagation code
- * can generate a shadow l1 table. Start with the gfn of the
- * first 4k-page of the superpage. */
- gfn_t start = guest_l2e_get_gfn(gw->l2e);
- /* Grant full access in the l1e, since all the guest entry's
- * access controls are enforced in the shadow l2e. */
- int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
- _PAGE_ACCESSED|_PAGE_DIRTY);
- /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
- * of the level 1. */
- if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) )
- flags |= _PAGE_PAT;
- /* Copy the cache-control bits to the l1 as well, because we
- * can't represent PAT in the (non-PSE) shadow l2e. :(
- * This could cause problems if a guest ever maps an area of
- * memory with superpages using more than one caching mode. */
- flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD);
- /* Increment the pfn by the right number of 4k pages.
- * The ~0x1 is to mask out the PAT bit mentioned above. */
- start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va));
- gw->l1e = guest_l1e_from_gfn(start, flags);
- gw->l1mfn = _mfn(INVALID_MFN);
- }
- else
- {
- /* Not a superpage: carry on and find the l1e. */
- gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt);
- if ( !p2m_is_ram(p2mt) )
- {
- rc |= _PAGE_PRESENT;
- goto out;
- }
- ASSERT(mfn_valid(gw->l1mfn));
- l1p = sh_map_domain_page(gw->l1mfn);
- gw->l1e = l1p[guest_l1_table_offset(va)];
- gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT;
- rc |= ((gflags & mflags) ^ mflags);
- }
-
- /* Go back and set accessed and dirty bits only if the walk was a
- * success. Although the PRMs say higher-level _PAGE_ACCESSED bits
- * get set whenever a lower-level PT is used, at least some hardware
- * walkers behave this way. */
- if ( rc == 0 )
- {
-#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
- if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
- paging_mark_dirty(d, mfn_x(gw->l4mfn));
- if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) )
- paging_mark_dirty(d, mfn_x(gw->l3mfn));
-#endif
- if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
- (pse && (pfec & PFEC_write_access))) )
- paging_mark_dirty(d, mfn_x(gw->l2mfn));
- if ( !pse )
- {
- if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e,
- (pfec & PFEC_write_access)) )
- paging_mark_dirty(d, mfn_x(gw->l1mfn));
- }
- }
-
- out:
-#if GUEST_PAGING_LEVELS == 4
- if ( l3p ) sh_unmap_domain_page(l3p);
-#endif
-#if GUEST_PAGING_LEVELS >= 3
- if ( l2p ) sh_unmap_domain_page(l2p);
-#endif
- if ( l1p ) sh_unmap_domain_page(l1p);
-
- return rc;
-}
-
-/* Given a walk_t, translate the gw->va into the guest's notion of the
- * corresponding frame number. */
-static inline gfn_t
-guest_walk_to_gfn(walk_t *gw)
-{
- if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
- return _gfn(INVALID_GFN);
- return guest_l1e_get_gfn(gw->l1e);
-}
-
-/* Given a walk_t, translate the gw->va into the guest's notion of the
- * corresponding physical address. */
-static inline paddr_t
-guest_walk_to_gpa(walk_t *gw)
-{
- if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
- return 0;
- return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
-}
-
-#if 0 /* Keep for debugging */
-/* Pretty-print the contents of a guest-walk */
-static inline void print_gw(walk_t *gw)
-{
- SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va);
-#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
-#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
- SHADOW_PRINTK(" l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
- SHADOW_PRINTK(" l4e=%" SH_PRI_gpte "\n", gw->l4e.l4);
- SHADOW_PRINTK(" l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
-#endif /* PAE or 64... */
- SHADOW_PRINTK(" l3e=%" SH_PRI_gpte "\n", gw->l3e.l3);
-#endif /* All levels... */
- SHADOW_PRINTK(" l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
- SHADOW_PRINTK(" l2e=%" SH_PRI_gpte "\n", gw->l2e.l2);
- SHADOW_PRINTK(" l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
- SHADOW_PRINTK(" l1e=%" SH_PRI_gpte "\n", gw->l1e.l1);
-}
-#endif /* 0 */
-
#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
/* Lightweight audit: pass all the shadows associated with this guest walk
* through the audit mechanisms */
@@ -654,7 +352,7 @@ sh_guest_map_l1e(struct vcpu *v, unsigne
// XXX -- this is expensive, but it's easy to cobble together...
// FIXME!
- if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0
+ if ( sh_walk_guest_tables(v, addr, &gw, PFEC_page_present) == 0
&& mfn_valid(gw.l1mfn) )
{
if ( gl1mfn )
@@ -676,7 +374,7 @@ sh_guest_get_eff_l1e(struct vcpu *v, uns
// XXX -- this is expensive, but it's easy to cobble together...
// FIXME!
- (void) guest_walk_tables(v, addr, &gw, PFEC_page_present);
+ (void) sh_walk_guest_tables(v, addr, &gw, PFEC_page_present);
*(guest_l1e_t *)eff_l1e = gw.l1e;
}
#endif /* CONFIG == GUEST (== SHADOW) */
@@ -3314,9 +3012,14 @@ static int sh_page_fault(struct vcpu *v,
}
rewalk:
+
+ /* The walk is done in a lock-free style, with some sanity checks
+ * postponed until after the shadow lock is taken later. Those delayed
+ * checks make sure no inconsistent mapping is translated into the
+ * shadow page tables. */
version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
rmb();
- rc = guest_walk_tables(v, va, &gw, regs->error_code);
+ rc = sh_walk_guest_tables(v, va, &gw, regs->error_code);
#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
regs->error_code &= ~PFEC_page_present;
@@ -3869,7 +3572,7 @@ sh_gva_to_gfn(struct vcpu *v, unsigned l
return vtlb_gfn;
#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
- if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 )
+ if ( sh_walk_guest_tables(v, va, &gw, pfec[0]) != 0 )
{
if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) )
pfec[0] &= ~PFEC_page_present;
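The multi.c changes above reduce the shadow code's involvement to sh_walk_guest_tables(), a thin wrapper that only supplies the shadow-held top-level mapping (the cached gl3e array for PAE, guest_vtable otherwise). With the walker itself now generic, a caller outside the shadow code could in principle map the guest's top level itself and invoke guest_walk_tables() directly. A rough sketch under that assumption -- example_walk is a made-up helper, and PAE's special top-level handling is glossed over:

    static uint32_t example_walk(struct vcpu *v, unsigned long va,
                                 walk_t *gw, uint32_t pfec)
    {
        /* Map the guest's top-level pagetable and hand both the MFN
         * and the mapping to the generic walker. */
        mfn_t top_mfn = pagetable_get_mfn(v->arch.guest_table);
        void *top_map = map_domain_page(mfn_x(top_mfn));
        uint32_t rc = guest_walk_tables(v, va, gw, pfec, top_mfn, top_map);

        unmap_domain_page(top_map);
        return rc;
    }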
diff -r b87cc4de3ca6 -r 7fb33d15dc9b xen/include/asm-x86/guest_pt.h
--- a/xen/include/asm-x86/guest_pt.h Thu Nov 13 13:01:22 2008 +0000
+++ b/xen/include/asm-x86/guest_pt.h Thu Nov 13 13:02:08 2008 +0000
@@ -174,6 +174,32 @@ static inline guest_l4e_t guest_l4e_from
#endif /* GUEST_PAGING_LEVELS != 2 */
+/* Which pagetable features are supported on this vcpu? */
+
+static inline int
+guest_supports_superpages(struct vcpu *v)
+{
+ /* The _PAGE_PSE bit must be honoured in HVM guests, whenever
+ * CR4.PSE is set or the guest is in PAE or long mode.
+ * It's also used in the dummy PT for vcpus with CR4.PG cleared. */
+ return (is_hvm_vcpu(v) &&
+ (GUEST_PAGING_LEVELS != 2
+ || !hvm_paging_enabled(v)
+ || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE)));
+}
+
+static inline int
+guest_supports_nx(struct vcpu *v)
+{
+ if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx )
+ return 0;
+ if ( !is_hvm_vcpu(v) )
+ return cpu_has_nx;
+ return hvm_nx_enabled(v);
+}
+
+
+
/* Type used for recording a walk through guest pagetables. It is
* filled in by the pagetable walk function, and also used as a cache
* for later walks. When we encounter a superpage l2e, we fabricate an
@@ -199,4 +225,67 @@ struct guest_pagetable_walk
mfn_t l1mfn; /* MFN that the level 1 entry was in */
};
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding frame number. */
+static inline gfn_t
+guest_walk_to_gfn(walk_t *gw)
+{
+ if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
+ return _gfn(INVALID_GFN);
+ return guest_l1e_get_gfn(gw->l1e);
+}
+
+/* Given a walk_t, translate the gw->va into the guest's notion of the
+ * corresponding physical address. */
+static inline paddr_t
+guest_walk_to_gpa(walk_t *gw)
+{
+ if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
+ return 0;
+ return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK);
+}
+
+/* Walk the guest pagetables, after the manner of a hardware walker.
+ *
+ * Inputs: a vcpu, a virtual address, a walk_t to fill, a
+ * pointer to a pagefault code, the MFN of the guest's
+ * top-level pagetable, and a mapping of the
+ * guest's top-level pagetable.
+ *
+ * We walk the vcpu's guest pagetables, filling the walk_t with what we
+ * see and adding any Accessed and Dirty bits that are needed in the
+ * guest entries. Using the pagefault code, we check the permissions as
+ * we go. For the purposes of reading pagetables we treat all non-RAM
+ * memory as containing zeroes.
+ *
+ * Returns 0 for success, or the set of permission bits that we failed on
+ * if the walk did not complete. */
+
+/* Macro-fu so you can call guest_walk_tables() and get the right one. */
+#define GPT_RENAME2(_n, _l) _n ## _ ## _l ## _levels
+#define GPT_RENAME(_n, _l) GPT_RENAME2(_n, _l)
+#define guest_walk_tables GPT_RENAME(guest_walk_tables, GUEST_PAGING_LEVELS)
+
+extern uint32_t
+guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw,
+ uint32_t pfec, mfn_t top_mfn, void *top_map);
+
+/* Pretty-print the contents of a guest-walk */
+static inline void print_gw(walk_t *gw)
+{
+ gdprintk(XENLOG_INFO, "GUEST WALK TO %#lx:\n", gw->va);
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
+ gdprintk(XENLOG_INFO, " l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn));
+ gdprintk(XENLOG_INFO, " l4e=%" PRI_gpte "\n", gw->l4e.l4);
+ gdprintk(XENLOG_INFO, " l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn));
+#endif /* PAE or 64... */
+ gdprintk(XENLOG_INFO, " l3e=%" PRI_gpte "\n", gw->l3e.l3);
+#endif /* All levels... */
+ gdprintk(XENLOG_INFO, " l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn));
+ gdprintk(XENLOG_INFO, " l2e=%" PRI_gpte "\n", gw->l2e.l2);
+ gdprintk(XENLOG_INFO, " l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn));
+ gdprintk(XENLOG_INFO, " l1e=%" PRI_gpte "\n", gw->l1e.l1);
+}
+
#endif /* _XEN_ASM_GUEST_PT_H */
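The helpers moved into guest_pt.h above operate on a walk_t that guest_walk_tables() has just filled in; the gfn or gpa they return is only meaningful when the level-1 entry is present, which a zero return code from the walk guarantees. A brief usage sketch, not taken from the patch (example_gva_to_gpa is a made-up name):

    static paddr_t example_gva_to_gpa(struct vcpu *v, unsigned long va,
                                      mfn_t top_mfn, void *top_map)
    {
        walk_t gw;

        if ( guest_walk_tables(v, va, &gw, PFEC_page_present,
                               top_mfn, top_map) != 0 )
            return 0;                   /* walk failed: some flag missing */

        /* Successful walk: the (possibly fabricated) l1e gives the frame. */
        return guest_walk_to_gpa(&gw);
    }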
diff -r b87cc4de3ca6 -r 7fb33d15dc9b xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h Thu Nov 13 13:01:22 2008 +0000
+++ b/xen/include/asm-x86/perfc_defn.h Thu Nov 13 13:02:08 2008 +0000
@@ -33,6 +33,7 @@ PERFCOUNTER(ptwr_emulations, "wri
PERFCOUNTER(exception_fixed, "pre-exception fixed")
+PERFCOUNTER(guest_walk, "guest pagetable walks")
/* Shadow counters */
PERFCOUNTER(shadow_alloc, "calls to shadow_alloc")
@@ -92,7 +93,6 @@ PERFCOUNTER(shadow_up_pointer, "shad
PERFCOUNTER(shadow_up_pointer, "shadow unshadow by up-pointer")
PERFCOUNTER(shadow_unshadow_bf, "shadow unshadow brute-force")
PERFCOUNTER(shadow_get_page_fail, "shadow_get_page_from_l1e failed")
-PERFCOUNTER(shadow_guest_walk, "shadow walks guest tables")
PERFCOUNTER(shadow_check_gwalk, "shadow checks gwalk")
PERFCOUNTER(shadow_inconsistent_gwalk, "shadow check inconsistent gwalk")
PERFCOUNTER(shadow_rm_write_flush_tlb,